Anasazi Version of the Day
Tsqr_SequentialCholeskyQR.hpp
00001 // @HEADER
00002 // ***********************************************************************
00003 //
00004 //                 Anasazi: Block Eigensolvers Package
00005 //                 Copyright (2010) Sandia Corporation
00006 //
00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
00008 // license for use of this work by or on behalf of the U.S. Government.
00009 //
00010 // This library is free software; you can redistribute it and/or modify
00011 // it under the terms of the GNU Lesser General Public License as
00012 // published by the Free Software Foundation; either version 2.1 of the
00013 // License, or (at your option) any later version.
00014 //
00015 // This library is distributed in the hope that it will be useful, but
00016 // WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 // Lesser General Public License for more details.
00019 //
00020 // You should have received a copy of the GNU Lesser General Public
00021 // License along with this library; if not, write to the Free Software
00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00023 // USA
00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
00025 //
00026 // ***********************************************************************
00027 // @HEADER
00028 
00029 #ifndef __TSQR_Tsqr_SequentialCholeskyQR_hpp
00030 #define __TSQR_Tsqr_SequentialCholeskyQR_hpp
00031 
00032 #include <Tsqr_MatView.hpp>
00033 #include <Tsqr_Blas.hpp>
00034 #include <Tsqr_Lapack.hpp>
00035 #include <Tsqr_CacheBlockingStrategy.hpp>
00036 #include <Tsqr_CacheBlocker.hpp>
00037 #include <Tsqr_ScalarTraits.hpp>
00038 #include <Tsqr_Util.hpp>
00039 
00040 #include <string>
00041 #include <utility>
00042 #include <vector>
00043 
00046 
00047 namespace TSQR {
00048 
00049   template< class LocalOrdinal, class Scalar >
00050   class SequentialCholeskyQR {
00051   private:
00052     typedef MatView< LocalOrdinal, Scalar > mat_view;
00053     typedef ConstMatView< LocalOrdinal, Scalar > const_mat_view;
00054 
00055   public:
00056     typedef Scalar scalar_type;
00057     typedef LocalOrdinal ordinal_type;
00058     // Here, FactorOutput is just a minimal object whose value is
00059     // irrelevant, so that the static interface looks like that of
00060     // SequentialTSQR.
00061     typedef int FactorOutput;
00062 
00064     size_t
00065     cache_block_size () const { return strategy_.cache_block_size(); }
00066 
00073     SequentialCholeskyQR (const size_t cache_block_size = 0) :
00074       strategy_ (cache_block_size)
00075     {}
00076 
00080     bool QR_produces_R_factor_with_nonnegative_diagonal () const {
00081       return true;
00082     }
00083 
00088     FactorOutput
00089     factor (const LocalOrdinal nrows,
00090       const LocalOrdinal ncols,
00091       const Scalar A[],
00092       const LocalOrdinal lda, 
00093       Scalar R[],
00094       const LocalOrdinal ldr,
00095       const bool contiguous_cache_blocks = false)
00096     {
00097       CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_);
00098       LAPACK< LocalOrdinal, Scalar > lapack;
00099       BLAS< LocalOrdinal, Scalar > blas;
00100       std::vector< Scalar > work (ncols);
00101       Matrix< LocalOrdinal, Scalar > ATA (ncols, ncols, Scalar(0));
00102       FactorOutput retval (0);
00103 
00104       if (contiguous_cache_blocks)
00105   {
00106     // Compute ATA := A^T * A, by iterating through the cache
00107     // blocks of A from top to bottom.
00108     //
00109     // We say "A_rest" because it points to the remaining part of
00110     // the matrix left to process; at the beginning, the "remaining"
00111     // part is the whole matrix, but that will change as the
00112     // algorithm progresses.
00113     mat_view A_rest (nrows, ncols, A, lda);
00114     // This call modifies A_rest (but not the actual matrix
00115     // entries; just the dimensions and current position).
00116     mat_view A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks);
00117     // Process the first cache block: ATA := A_cur^T * A_cur
00118     blas.GEMM ("T", "N", ncols, ncols, A_cur.nrows(), 
00119          Scalar(1), A_cur.get(), A_cur.lda(), A_cur.get(), A_cur.lda(),
00120          Scalar(0), ATA.get(), ATA.lda());
00121     // Process the remaining cache blocks in order.
00122     while (! A_rest.empty())
00123       {
00124         A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks);
00125         // ATA := ATA + A_cur^T * A_cur
00126         blas.GEMM ("T", "N", ncols, ncols, A_cur.nrows(), 
00127        Scalar(1), A_cur.get(), A_cur.lda(), A_cur.get(), A_cur.lda(),
00128        Scalar(1), ATA.get(), ATA.lda());
00129       }
00130   }
00131       else
00132   // Compute ATA := A^T * A, using a single BLAS call.
00133   blas.GEMM ("T", "N", ncols, ncols, nrows, 
00134        Scalar(1), A, lda, A, lda,
00135        Scalar(0), ATA.get(), ATA.lda());
00136 
00137       // Compute the Cholesky factorization of ATA in place, so that
00138       // A^T * A = R^T * R, where R is ncols by ncols upper
00139       // triangular.
00140       int info = 0;
00141       lapack.POTRF ("U", ncols, ATA.get(), ATA.lda(), &info);
00142       // FIXME (mfh 22 June 2010) The right thing to do here would be
00143       // to resort to a rank-revealing factorization, as Stathopoulos
00144       // and Wu (2002) do with their CholeskyQR + symmetric
00145       // eigensolver factorization.
00146       if (info != 0)
00147   throw std::runtime_error("Cholesky factorization failed");
00148 
00149       // Copy out the R factor
00150       fill_matrix (ncols, ncols, R, ldr, Scalar(0));
00151       copy_upper_triangle (ncols, ncols, R, ldr, ATA.get(), ATA.lda());
00152 
00153       // Compute A := A * R^{-1}.  We do this in place in A, using
00154       // BLAS' TRSM with the R factor (form POTRF) stored in the upper
00155       // triangle of ATA.
00156       {
00157   mat_view A_rest (nrows, ncols, A, lda);
00158   // This call modifies A_rest.
00159   mat_view A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks);
00160 
00161   // Compute A_cur / R (Matlab notation for A_cur * R^{-1}) in place.
00162   blas.TRSM ("R", "U", "N", "N", A_cur.nrows(), ncols, 
00163        Scalar(1), ATA.get(), ATA.lda(), A_cur.get(), A_cur.lda());
00164 
00165   // Process the remaining cache blocks in order.
00166   while (! A_rest.empty())
00167     {
00168       A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks);
00169       blas.TRSM ("R", "U", "N", "N", A_cur.nrows(), ncols, 
00170            Scalar(1), ATA.get(), ATA.lda(), A_cur.get(), A_cur.lda());
00171     }
00172       }
00173 
00174       return retval;
00175     }
00176 
00177 
00180     void
00181     explicit_Q (const LocalOrdinal nrows,
00182     const LocalOrdinal ncols_Q,
00183     const Scalar Q[],
00184     const LocalOrdinal ldq,
00185     const FactorOutput& factor_output,
00186     const LocalOrdinal ncols_C,
00187     Scalar C[],
00188     const LocalOrdinal ldc,
00189     const bool contiguous_cache_blocks = false)
00190     {
00191       if (ncols_Q != ncols_C)
00192   throw std::logic_error("SequentialCholeskyQR::explicit_Q() "
00193              "does not work if ncols_C != ncols_Q");
00194       const LocalOrdinal ncols = ncols_Q;
00195 
00196       if (contiguous_cache_blocks)
00197   {
00198     CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_);
00199     mat_view C_rest (nrows, ncols, C, ldc);
00200     const_mat_view Q_rest (nrows, ncols, Q, ldq);
00201 
00202     mat_view C_cur = blocker.split_top_block (C_rest, contiguous_cache_blocks);
00203     const_mat_view Q_cur = blocker.split_top_block (Q_rest, contiguous_cache_blocks);
00204 
00205     while (! C_rest.empty())
00206       Q_cur.copy (C_cur);
00207   }
00208       else
00209   {
00210     mat_view C_view (nrows, ncols, C, ldc);
00211     C_view.copy (const_mat_view (nrows, ncols, Q, ldq));
00212   }
00213     }
00214 
00215 
00217     void
00218     cache_block (const LocalOrdinal nrows,
00219      const LocalOrdinal ncols,
00220      Scalar A_out[],
00221      const Scalar A_in[],
00222      const LocalOrdinal lda_in) const
00223     {
00224       CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_);
00225       blocker.cache_block (nrows, ncols, A_out, A_in, lda_in);
00226     }
00227 
00228 
00230     void
00231     un_cache_block (const LocalOrdinal nrows,
00232         const LocalOrdinal ncols,
00233         Scalar A_out[],
00234         const LocalOrdinal lda_out,       
00235         const Scalar A_in[]) const
00236     {
00237       CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_);
00238       blocker.un_cache_block (nrows, ncols, A_out, lda_out, A_in);
00239     }
00240 
00242     void
00243     fill_with_zeros (const LocalOrdinal nrows,
00244          const LocalOrdinal ncols,
00245          Scalar A[],
00246          const LocalOrdinal lda, 
00247          const bool contiguous_cache_blocks = false)
00248     {
00249       CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_);
00250       blocker.fill_with_zeros (nrows, ncols, A, lda, contiguous_cache_blocks);
00251     }
00252 
00260     template< class MatrixViewType >
00261     MatrixViewType
00262     top_block (const MatrixViewType& C, 
00263          const bool contiguous_cache_blocks = false) const 
00264     {
00265       // The CacheBlocker object knows how to construct a view of the
00266       // top cache block of C.  This is complicated because cache
00267       // blocks (in C) may or may not be stored contiguously.  If they
00268       // are stored contiguously, the CacheBlocker knows the right
00269       // layout, based on the cache blocking strategy.
00270       CacheBlocker< LocalOrdinal, Scalar > blocker (C.nrows(), C.ncols(), strategy_);
00271 
00272       // C_top_block is a view of the topmost cache block of C.
00273       // C_top_block should have >= ncols rows, otherwise either cache
00274       // blocking is broken or the input matrix C itself had fewer
00275       // rows than columns.
00276       MatrixViewType C_top_block = blocker.top_block (C, contiguous_cache_blocks);
00277       if (C_top_block.nrows() < C_top_block.ncols())
00278   throw std::logic_error ("C\'s topmost cache block has fewer rows than "
00279         "columns");
00280       return C_top_block;
00281     }
00282 
00283   private:
00284     CacheBlockingStrategy< LocalOrdinal, Scalar > strategy_;
00285   };
00286   
00287 } // namespace TSQR
00288 
00289 #endif // __TSQR_Tsqr_SequentialCholeskyQR_hpp
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends