Kokkos Node API and Local Linear Algebra Kernels Version of the Day
Tsqr_SequentialCholeskyQR.hpp
00001 //@HEADER
00002 // ************************************************************************
00003 // 
00004 //          Kokkos: Node API and Parallel Node Kernels
00005 //              Copyright (2009) Sandia Corporation
00006 // 
00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
00008 // license for use of this work by or on behalf of the U.S. Government.
00009 // 
00010 // This library is free software; you can redistribute it and/or modify
00011 // it under the terms of the GNU Lesser General Public License as
00012 // published by the Free Software Foundation; either version 2.1 of the
00013 // License, or (at your option) any later version.
00014 //  
00015 // This library is distributed in the hope that it will be useful, but
00016 // WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 // Lesser General Public License for more details.
00019 //  
00020 // You should have received a copy of the GNU Lesser General Public
00021 // License along with this library; if not, write to the Free Software
00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00023 // USA
00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
00025 // 
00026 // ************************************************************************
00027 //@HEADER
00028 
00029 #ifndef __TSQR_Tsqr_SequentialCholeskyQR_hpp
00030 #define __TSQR_Tsqr_SequentialCholeskyQR_hpp
00031 
00032 #include <Tsqr_MatView.hpp>
00033 #include <Tsqr_Blas.hpp>
00034 #include <Tsqr_Lapack.hpp>
00035 #include <Tsqr_CacheBlockingStrategy.hpp>
00036 #include <Tsqr_CacheBlocker.hpp>
00037 #include <Tsqr_ScalarTraits.hpp>
00038 #include <Tsqr_Util.hpp>
00039 
00040 #include <string>
00041 #include <utility>
00042 #include <vector>
00043 
00046 
00047 namespace TSQR {
00048 
00058   template<class LocalOrdinal, class Scalar>
00059   class SequentialCholeskyQR {
00060   private:
00061     typedef MatView< LocalOrdinal, Scalar > mat_view;
00062     typedef ConstMatView< LocalOrdinal, Scalar > const_mat_view;
00063 
00064   public:
00065     typedef Scalar scalar_type;
00066     typedef LocalOrdinal ordinal_type;
00067 
00074     typedef int FactorOutput;
00075 
00080     size_t TEUCHOS_DEPRECATED cache_block_size () const { 
00081       return strategy_.cache_size_hint(); 
00082     }
00083 
00085     size_t cache_size_hint () const { return strategy_.cache_size_hint(); }
00086 
00092     SequentialCholeskyQR (const size_t theCacheSizeHint = 0) :
00093       strategy_ (theCacheSizeHint)
00094     {}
00095 
00103     bool QR_produces_R_factor_with_nonnegative_diagonal () const {
00104       return true;
00105     }
00106 
00113     FactorOutput
00114     factor (const LocalOrdinal nrows,
00115       const LocalOrdinal ncols,
00116       const Scalar A[],
00117       const LocalOrdinal lda, 
00118       Scalar R[],
00119       const LocalOrdinal ldr,
00120       const bool contiguous_cache_blocks = false)
00121     {
00122       CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_);
00123       LAPACK< LocalOrdinal, Scalar > lapack;
00124       BLAS< LocalOrdinal, Scalar > blas;
00125       std::vector< Scalar > work (ncols);
00126       Matrix< LocalOrdinal, Scalar > ATA (ncols, ncols, Scalar(0));
00127       FactorOutput retval (0);
00128 
00129       if (contiguous_cache_blocks)
00130   {
00131     // Compute ATA := A^T * A, by iterating through the cache
00132     // blocks of A from top to bottom.
00133     //
00134     // We say "A_rest" because it points to the remaining part of
00135     // the matrix left to process; at the beginning, the "remaining"
00136     // part is the whole matrix, but that will change as the
00137     // algorithm progresses.
00138     mat_view A_rest (nrows, ncols, A, lda);
00139     // This call modifies A_rest (but not the actual matrix
00140     // entries; just the dimensions and current position).
00141     mat_view A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks);
00142     // Process the first cache block: ATA := A_cur^T * A_cur
00143     blas.GEMM ("T", "N", ncols, ncols, A_cur.nrows(), 
00144          Scalar(1), A_cur.get(), A_cur.lda(), A_cur.get(), A_cur.lda(),
00145          Scalar(0), ATA.get(), ATA.lda());
00146     // Process the remaining cache blocks in order.
00147     while (! A_rest.empty())
00148       {
00149         A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks);
00150         // ATA := ATA + A_cur^T * A_cur
00151         blas.GEMM ("T", "N", ncols, ncols, A_cur.nrows(), 
00152        Scalar(1), A_cur.get(), A_cur.lda(), A_cur.get(), A_cur.lda(),
00153        Scalar(1), ATA.get(), ATA.lda());
00154       }
00155   }
00156       else
00157   // Compute ATA := A^T * A, using a single BLAS call.
00158   blas.GEMM ("T", "N", ncols, ncols, nrows, 
00159        Scalar(1), A, lda, A, lda,
00160        Scalar(0), ATA.get(), ATA.lda());
00161 
00162       // Compute the Cholesky factorization of ATA in place, so that
00163       // A^T * A = R^T * R, where R is ncols by ncols upper
00164       // triangular.
00165       int info = 0;
00166       lapack.POTRF ("U", ncols, ATA.get(), ATA.lda(), &info);
00167       // FIXME (mfh 22 June 2010) The right thing to do here would be
00168       // to resort to a rank-revealing factorization, as Stathopoulos
00169       // and Wu (2002) do with their CholeskyQR + symmetric
00170       // eigensolver factorization.
00171       if (info != 0)
00172   throw std::runtime_error("Cholesky factorization failed");
00173 
00174       // Copy out the R factor
00175       fill_matrix (ncols, ncols, R, ldr, Scalar(0));
00176       copy_upper_triangle (ncols, ncols, R, ldr, ATA.get(), ATA.lda());
00177 
00178       // Compute A := A * R^{-1}.  We do this in place in A, using
00179       // BLAS' TRSM with the R factor (form POTRF) stored in the upper
00180       // triangle of ATA.
00181       {
00182   mat_view A_rest (nrows, ncols, A, lda);
00183   // This call modifies A_rest.
00184   mat_view A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks);
00185 
00186   // Compute A_cur / R (Matlab notation for A_cur * R^{-1}) in place.
00187   blas.TRSM ("R", "U", "N", "N", A_cur.nrows(), ncols, 
00188        Scalar(1), ATA.get(), ATA.lda(), A_cur.get(), A_cur.lda());
00189 
00190   // Process the remaining cache blocks in order.
00191   while (! A_rest.empty())
00192     {
00193       A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks);
00194       blas.TRSM ("R", "U", "N", "N", A_cur.nrows(), ncols, 
00195            Scalar(1), ATA.get(), ATA.lda(), A_cur.get(), A_cur.lda());
00196     }
00197       }
00198 
00199       return retval;
00200     }
00201 
00204     void
00205     explicit_Q (const LocalOrdinal nrows,
00206     const LocalOrdinal ncols_Q,
00207     const Scalar Q[],
00208     const LocalOrdinal ldq,
00209     const FactorOutput& factor_output,
00210     const LocalOrdinal ncols_C,
00211     Scalar C[],
00212     const LocalOrdinal ldc,
00213     const bool contiguous_cache_blocks = false)
00214     {
00215       if (ncols_Q != ncols_C)
00216   throw std::logic_error("SequentialCholeskyQR::explicit_Q() "
00217              "does not work if ncols_C != ncols_Q");
00218       const LocalOrdinal ncols = ncols_Q;
00219 
00220       if (contiguous_cache_blocks)
00221   {
00222     CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_);
00223     mat_view C_rest (nrows, ncols, C, ldc);
00224     const_mat_view Q_rest (nrows, ncols, Q, ldq);
00225 
00226     mat_view C_cur = blocker.split_top_block (C_rest, contiguous_cache_blocks);
00227     const_mat_view Q_cur = blocker.split_top_block (Q_rest, contiguous_cache_blocks);
00228 
00229     while (! C_rest.empty())
00230       Q_cur.copy (C_cur);
00231   }
00232       else
00233   {
00234     mat_view C_view (nrows, ncols, C, ldc);
00235     C_view.copy (const_mat_view (nrows, ncols, Q, ldq));
00236   }
00237     }
00238 
00239 
00241     void
00242     cache_block (const LocalOrdinal nrows,
00243      const LocalOrdinal ncols,
00244      Scalar A_out[],
00245      const Scalar A_in[],
00246      const LocalOrdinal lda_in) const
00247     {
00248       CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_);
00249       blocker.cache_block (nrows, ncols, A_out, A_in, lda_in);
00250     }
00251 
00252 
00254     void
00255     un_cache_block (const LocalOrdinal nrows,
00256         const LocalOrdinal ncols,
00257         Scalar A_out[],
00258         const LocalOrdinal lda_out,       
00259         const Scalar A_in[]) const
00260     {
00261       CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_);
00262       blocker.un_cache_block (nrows, ncols, A_out, lda_out, A_in);
00263     }
00264 
00266     void
00267     fill_with_zeros (const LocalOrdinal nrows,
00268          const LocalOrdinal ncols,
00269          Scalar A[],
00270          const LocalOrdinal lda, 
00271          const bool contiguous_cache_blocks = false)
00272     {
00273       CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_);
00274       blocker.fill_with_zeros (nrows, ncols, A, lda, contiguous_cache_blocks);
00275     }
00276 
00284     template< class MatrixViewType >
00285     MatrixViewType
00286     top_block (const MatrixViewType& C, 
00287          const bool contiguous_cache_blocks = false) const 
00288     {
00289       // The CacheBlocker object knows how to construct a view of the
00290       // top cache block of C.  This is complicated because cache
00291       // blocks (in C) may or may not be stored contiguously.  If they
00292       // are stored contiguously, the CacheBlocker knows the right
00293       // layout, based on the cache blocking strategy.
00294       CacheBlocker< LocalOrdinal, Scalar > blocker (C.nrows(), C.ncols(), strategy_);
00295 
00296       // C_top_block is a view of the topmost cache block of C.
00297       // C_top_block should have >= ncols rows, otherwise either cache
00298       // blocking is broken or the input matrix C itself had fewer
00299       // rows than columns.
00300       MatrixViewType C_top_block = blocker.top_block (C, contiguous_cache_blocks);
00301       if (C_top_block.nrows() < C_top_block.ncols())
00302   throw std::logic_error ("C\'s topmost cache block has fewer rows than "
00303         "columns");
00304       return C_top_block;
00305     }
00306 
00307   private:
00308     CacheBlockingStrategy< LocalOrdinal, Scalar > strategy_;
00309   };
00310   
00311 } // namespace TSQR
00312 
00313 #endif // __TSQR_Tsqr_SequentialCholeskyQR_hpp
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends