Tsqr_SequentialTsqr.hpp
//@HEADER
// ************************************************************************
//
//          Kokkos: Node API and Parallel Node Kernels
//              Copyright (2009) Sandia Corporation
//
// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
// license for use of this work by or on behalf of the U.S. Government.
//
// This library is free software; you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as
// published by the Free Software Foundation; either version 2.1 of the
// License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
// USA
// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
//
// ************************************************************************
//@HEADER

#ifndef __TSQR_Tsqr_SequentialTsqr_hpp
#define __TSQR_Tsqr_SequentialTsqr_hpp

#include <Tsqr_ApplyType.hpp>
#include <Tsqr_Matrix.hpp>
#include <Tsqr_CacheBlockingStrategy.hpp>
#include <Tsqr_CacheBlocker.hpp>
#include <Tsqr_Combine.hpp>
#include <Tsqr_LocalVerify.hpp>
#include <Tsqr_NodeTsqr.hpp>
#include <Tsqr_Util.hpp>

#include <Teuchos_Describable.hpp>
#include <Teuchos_ScalarTraits.hpp>

#include <algorithm>
#include <limits>
#include <sstream>
#include <string>
#include <utility> // std::pair
#include <vector>

namespace TSQR {

  template<class LocalOrdinal, class Scalar>
  class SequentialTsqr :
    public NodeTsqr<LocalOrdinal, Scalar, std::vector<std::vector<Scalar> > >
  {
  public:
    typedef LocalOrdinal ordinal_type;
    typedef Scalar scalar_type;
    typedef typename Teuchos::ScalarTraits<Scalar>::magnitudeType magnitude_type;
    typedef typename NodeTsqr<LocalOrdinal, Scalar, std::vector<std::vector<Scalar> > >::factor_output_type FactorOutput;

  private:
    typedef typename FactorOutput::const_iterator FactorOutputIter;
    typedef typename FactorOutput::const_reverse_iterator FactorOutputReverseIter;
    typedef MatView<LocalOrdinal, Scalar> mat_view;
    typedef ConstMatView<LocalOrdinal, Scalar> const_mat_view;
    typedef std::pair<mat_view, mat_view> block_pair_type;
    typedef std::pair<const_mat_view, const_mat_view> const_block_pair_type;

    mat_view
    factor_first_block (Combine<LocalOrdinal, Scalar>& combine,
                        mat_view& A_top,
                        std::vector<Scalar>& tau,
                        std::vector<Scalar>& work) const
    {
      const LocalOrdinal ncols = A_top.ncols();
      combine.factor_first (A_top.nrows(), ncols, A_top.get(), A_top.lda(),
                            &tau[0], &work[0]);
      return mat_view(ncols, ncols, A_top.get(), A_top.lda());
    }

    void
    apply_first_block (Combine<LocalOrdinal, Scalar>& combine,
                       const ApplyType& applyType,
                       const const_mat_view& Q_first,
                       const std::vector<Scalar>& tau,
                       mat_view& C_first,
                       std::vector<Scalar>& work) const
    {
      const LocalOrdinal nrowsLocal = Q_first.nrows();
      combine.apply_first (applyType, nrowsLocal, C_first.ncols(),
                           Q_first.ncols(), Q_first.get(), Q_first.lda(),
                           &tau[0], C_first.get(), C_first.lda(), &work[0]);
    }

    void
    combine_apply (Combine<LocalOrdinal, Scalar>& combine,
                   const ApplyType& apply_type,
                   const const_mat_view& Q_cur,
                   const std::vector<Scalar>& tau,
                   mat_view& C_top,
                   mat_view& C_cur,
                   std::vector<Scalar>& work) const
    {
      const LocalOrdinal nrows_local = Q_cur.nrows();
      const LocalOrdinal ncols_Q = Q_cur.ncols();
      const LocalOrdinal ncols_C = C_cur.ncols();

      // Pass Q_cur's leading dimension (not C_cur's) for the Q block.
      combine.apply_inner (apply_type,
                           nrows_local, ncols_C, ncols_Q,
                           Q_cur.get(), Q_cur.lda(), &tau[0],
                           C_top.get(), C_top.lda(),
                           C_cur.get(), C_cur.lda(), &work[0]);
    }

    void
    combine_factor (Combine<LocalOrdinal, Scalar>& combine,
                    mat_view& R,
                    mat_view& A_cur,
                    std::vector<Scalar>& tau,
                    std::vector<Scalar>& work) const
    {
      const LocalOrdinal nrows_local = A_cur.nrows();
      const LocalOrdinal ncols = A_cur.ncols();

      combine.factor_inner (nrows_local, ncols, R.get(), R.lda(),
                            A_cur.get(), A_cur.lda(), &tau[0],
                            &work[0]);
    }

  public:

    SequentialTsqr (const size_t cacheSizeHint = 0,
                    const size_t sizeOfScalar = sizeof(Scalar)) :
      strategy_ (cacheSizeHint, sizeOfScalar)
    {}

    SequentialTsqr (const CacheBlockingStrategy<LocalOrdinal, Scalar>& strategy) :
      strategy_ (strategy)
    {}

    std::string description () const {
      std::ostringstream os;
      os << "Intranode Tall Skinny QR (TSQR): sequential cache-blocked "
            "implementation with cache size hint " << this->cache_size_hint()
         << " bytes.";
      return os.str();
    }

    bool QR_produces_R_factor_with_nonnegative_diagonal () const {
      typedef Combine<LocalOrdinal, Scalar> combine_type;
      return combine_type::QR_produces_R_factor_with_nonnegative_diagonal();
    }

    size_t cache_size_hint () const {
      return strategy_.cache_size_hint();
    }

    size_t TEUCHOS_DEPRECATED cache_block_size () const {
      return strategy_.cache_size_hint();
    }

    FactorOutput
    factor (const LocalOrdinal nrows,
            const LocalOrdinal ncols,
            Scalar A[],
            const LocalOrdinal lda,
            const bool contiguous_cache_blocks) const
    {
      CacheBlocker<LocalOrdinal, Scalar> blocker (nrows, ncols, strategy_);
      Combine<LocalOrdinal, Scalar> combine;
      std::vector<Scalar> work (ncols);
      FactorOutput tau_arrays;

      // We say "A_rest" because it points to the remaining part of
      // the matrix left to factor; at the beginning, the "remaining"
      // part is the whole matrix, but that will change as the
      // algorithm progresses.
      //
      // Note: if the cache blocks are stored contiguously, lda won't
      // be the correct leading dimension of A, but it won't matter:
      // we only ever operate on A_cur here, and A_cur's leading
      // dimension is set correctly by A_rest.split_top().
      mat_view A_rest (nrows, ncols, A, lda);
      // This call modifies A_rest.
      mat_view A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks);

      // Factor the topmost block of A.
      std::vector<Scalar> tau_first (ncols);
      mat_view R_view = factor_first_block (combine, A_cur, tau_first, work);
      tau_arrays.push_back (tau_first);

      while (! A_rest.empty())
        {
          A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks);
          std::vector<Scalar> tau (ncols);
          combine_factor (combine, R_view, A_cur, tau, work);
          tau_arrays.push_back (tau);
        }
      return tau_arrays;
    }

    void
    extract_R (const LocalOrdinal nrows,
               const LocalOrdinal ncols,
               const Scalar A[],
               const LocalOrdinal lda,
               Scalar R[],
               const LocalOrdinal ldr,
               const bool contiguous_cache_blocks) const
    {
      const_mat_view A_view (nrows, ncols, A, lda);

      // Identify top cache block of A.
      const_mat_view A_top = top_block (A_view, contiguous_cache_blocks);

      // Fill R (including lower triangle) with zeros.
      fill_matrix (ncols, ncols, R, ldr, Teuchos::ScalarTraits<Scalar>::zero());

      // Copy out the upper triangle of the R factor from A into R.
      copy_upper_triangle (ncols, ncols, R, ldr, A_top.get(), A_top.lda());
    }

    FactorOutput
    factor (const LocalOrdinal nrows,
            const LocalOrdinal ncols,
            Scalar A[],
            const LocalOrdinal lda,
            Scalar R[],
            const LocalOrdinal ldr,
            const bool contiguous_cache_blocks) const
    {
      CacheBlocker<LocalOrdinal, Scalar> blocker (nrows, ncols, strategy_);
      Combine<LocalOrdinal, Scalar> combine;
      std::vector<Scalar> work (ncols);
      FactorOutput tau_arrays;

      // We say "A_rest" because it points to the remaining part of
      // the matrix left to factor; at the beginning, the "remaining"
      // part is the whole matrix, but that will change as the
      // algorithm progresses.
      //
      // Note: if the cache blocks are stored contiguously, lda won't
      // be the correct leading dimension of A, but it won't matter:
      // we only ever operate on A_cur here, and A_cur's leading
      // dimension is set correctly by A_rest.split_top().
      mat_view A_rest (nrows, ncols, A, lda);
      // This call modifies A_rest.
      mat_view A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks);

      // Factor the topmost block of A.
      std::vector<Scalar> tau_first (ncols);
      mat_view R_view = factor_first_block (combine, A_cur, tau_first, work);
      tau_arrays.push_back (tau_first);

      while (! A_rest.empty())
        {
          A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks);
          std::vector<Scalar> tau (ncols);
          combine_factor (combine, R_view, A_cur, tau, work);
          tau_arrays.push_back (tau);
        }

      // Copy the R factor resulting from the factorization out of
      // R_view (a view of the topmost cache block of A) into the R
      // output argument.
      fill_matrix (ncols, ncols, R, ldr, Scalar(0));
      copy_upper_triangle (ncols, ncols, R, ldr, R_view.get(), R_view.lda());
      return tau_arrays;
    }

    LocalOrdinal
    factor_num_cache_blocks (const LocalOrdinal nrows,
                             const LocalOrdinal ncols,
                             const Scalar A[],
                             const LocalOrdinal lda,
                             const bool contiguous_cache_blocks) const
    {
      CacheBlocker<LocalOrdinal, Scalar> blocker (nrows, ncols, strategy_);
      LocalOrdinal count = 0;

      const_mat_view A_rest (nrows, ncols, A, lda);
      if (A_rest.empty())
        return count;

      const_mat_view A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks);
      ++count; // first factor step

      while (! A_rest.empty())
        {
          A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks);
          ++count; // next factor step
        }
      return count;
    }

    void
    apply (const ApplyType& apply_type,
           const LocalOrdinal nrows,
           const LocalOrdinal ncols_Q,
           const Scalar Q[],
           const LocalOrdinal ldq,
           const FactorOutput& factor_output,
           const LocalOrdinal ncols_C,
           Scalar C[],
           const LocalOrdinal ldc,
           const bool contiguous_cache_blocks) const
    {
      // Quick exit and error tests
      if (ncols_Q == 0 || ncols_C == 0 || nrows == 0)
        return;
      else if (ldc < nrows)
        {
          std::ostringstream os;
          os << "SequentialTsqr::apply: ldc (= " << ldc << ") < nrows (= " << nrows << ")";
          throw std::invalid_argument (os.str());
        }
      else if (ldq < nrows)
        {
          std::ostringstream os;
          os << "SequentialTsqr::apply: ldq (= " << ldq << ") < nrows (= " << nrows << ")";
          throw std::invalid_argument (os.str());
        }

      // If contiguous cache blocks are used, then we have to use the
      // same convention as we did for factor().  Otherwise, we are
      // free to choose the cache block dimensions as we wish in
      // apply(), independently of what we did in factor().
      CacheBlocker<LocalOrdinal, Scalar> blocker (nrows, ncols_Q, strategy_);
      LAPACK<LocalOrdinal, Scalar> lapack;
      Combine<LocalOrdinal, Scalar> combine;

      const bool transposed = apply_type.transposed();
      const FactorOutput& tau_arrays = factor_output; // rename for encapsulation
      std::vector<Scalar> work (ncols_C);

      // We say "*_rest" because it points to the remaining part of
      // the matrix left to factor; at the beginning, the "remaining"
      // part is the whole matrix, but that will change as the
      // algorithm progresses.
      //
      // Note: if the cache blocks are stored contiguously, ldq
      // resp. ldc won't be the correct leading dimension, but it
      // won't matter, since we only read the leading dimension of
      // return values of split_top_block() / split_bottom_block(),
      // which are set correctly (based e.g., on whether or not we are
      // using contiguous cache blocks).
      const_mat_view Q_rest (nrows, ncols_Q, Q, ldq);
      mat_view C_rest (nrows, ncols_C, C, ldc);

      // Identify the top ncols_C by ncols_C block of C.  C_rest is
      // not modified.
      mat_view C_top = blocker.top_block (C_rest, contiguous_cache_blocks);

      if (transposed)
        {
          const_mat_view Q_cur = blocker.split_top_block (Q_rest, contiguous_cache_blocks);
          mat_view C_cur = blocker.split_top_block (C_rest, contiguous_cache_blocks);

          // Apply the topmost block of Q.
          FactorOutputIter tau_iter = tau_arrays.begin();
          const std::vector<Scalar>& tau = *tau_iter++;
          apply_first_block (combine, apply_type, Q_cur, tau, C_cur, work);

          while (! Q_rest.empty())
            {
              Q_cur = blocker.split_top_block (Q_rest, contiguous_cache_blocks);
              C_cur = blocker.split_top_block (C_rest, contiguous_cache_blocks);
              combine_apply (combine, apply_type, Q_cur, *tau_iter++, C_top, C_cur, work);
            }
        }
      else
        {
          // Start with the last local Q factor and work backwards up the matrix.
          FactorOutputReverseIter tau_iter = tau_arrays.rbegin();

          const_mat_view Q_cur = blocker.split_bottom_block (Q_rest, contiguous_cache_blocks);
          mat_view C_cur = blocker.split_bottom_block (C_rest, contiguous_cache_blocks);

          while (! Q_rest.empty())
            {
              combine_apply (combine, apply_type, Q_cur, *tau_iter++, C_top, C_cur, work);
              Q_cur = blocker.split_bottom_block (Q_rest, contiguous_cache_blocks);
              C_cur = blocker.split_bottom_block (C_rest, contiguous_cache_blocks);
            }
          // Apply to last (topmost) cache block.
          apply_first_block (combine, apply_type, Q_cur, *tau_iter++, C_cur, work);
        }
    }

    void
    explicit_Q (const LocalOrdinal nrows,
                const LocalOrdinal ncols_Q,
                const Scalar Q[],
                const LocalOrdinal ldq,
                const FactorOutput& factor_output,
                const LocalOrdinal ncols_C,
                Scalar C[],
                const LocalOrdinal ldc,
                const bool contiguous_cache_blocks) const
    {
      // Identify top ncols_C by ncols_C block of C.  C_view is not
      // modified.  top_block() will set C_top to have the correct
      // leading dimension, whether or not cache blocks are stored
      // contiguously.
      mat_view C_view (nrows, ncols_C, C, ldc);
      mat_view C_top = top_block (C_view, contiguous_cache_blocks);

      // Fill C with zeros, and then fill the topmost block of C with
      // the first ncols_C columns of the identity matrix, so that C
      // itself contains the first ncols_C columns of the identity
      // matrix.
      fill_with_zeros (nrows, ncols_C, C, ldc, contiguous_cache_blocks);
      for (LocalOrdinal j = 0; j < ncols_C; ++j)
        C_top(j, j) = Scalar(1);

      // Apply the Q factor to C, to extract the first ncols_C columns
      // of Q in explicit form.
      apply (ApplyType::NoTranspose,
             nrows, ncols_Q, Q, ldq, factor_output,
             ncols_C, C, ldc, contiguous_cache_blocks);
    }

    void
    Q_times_B (const LocalOrdinal nrows,
               const LocalOrdinal ncols,
               Scalar Q[],
               const LocalOrdinal ldq,
               const Scalar B[],
               const LocalOrdinal ldb,
               const bool contiguous_cache_blocks) const
    {
      // We don't do any other error checking here (e.g., matrix
      // dimensions), though it would be a good idea to do so.

      // Take the easy exit if available.
      if (ncols == 0 || nrows == 0)
        return;

      // Compute Q := Q*B by iterating through cache blocks of Q.
      // This iteration works much like iteration through cache blocks
      // of A in factor() (which see).  Here, though, each cache block
      // computation is completely independent of the others; a slight
      // restructuring of this code would parallelize nicely using
      // OpenMP.
      CacheBlocker<LocalOrdinal, Scalar> blocker (nrows, ncols, strategy_);
      BLAS<LocalOrdinal, Scalar> blas;
      mat_view Q_rest (nrows, ncols, Q, ldq);
      Matrix<LocalOrdinal, Scalar>
        Q_cur_copy (LocalOrdinal(0), LocalOrdinal(0)); // will be resized
      while (! Q_rest.empty())
        {
          mat_view Q_cur =
            blocker.split_top_block (Q_rest, contiguous_cache_blocks);

          // GEMM doesn't like aliased arguments, so we use a copy.
          // We only copy the current cache block, rather than all of
          // Q; this saves memory.
          Q_cur_copy.reshape (Q_cur.nrows(), ncols);
          Q_cur_copy.copy (Q_cur);
          // Q_cur := Q_cur_copy * B.
          blas.GEMM ("N", "N", Q_cur.nrows(), ncols, ncols, Scalar(1),
                     Q_cur_copy.get(), Q_cur_copy.lda(), B, ldb,
                     Scalar(0), Q_cur.get(), Q_cur.lda());
        }
    }

    void
    cache_block (const LocalOrdinal nrows,
                 const LocalOrdinal ncols,
                 Scalar A_out[],
                 const Scalar A_in[],
                 const LocalOrdinal lda_in) const
    {
      CacheBlocker<LocalOrdinal, Scalar> blocker (nrows, ncols, strategy_);
      blocker.cache_block (nrows, ncols, A_out, A_in, lda_in);
    }

    void
    un_cache_block (const LocalOrdinal nrows,
                    const LocalOrdinal ncols,
                    Scalar A_out[],
                    const LocalOrdinal lda_out,
                    const Scalar A_in[]) const
    {
      CacheBlocker<LocalOrdinal, Scalar> blocker (nrows, ncols, strategy_);
      blocker.un_cache_block (nrows, ncols, A_out, lda_out, A_in);
    }

    void
    fill_with_zeros (const LocalOrdinal nrows,
                     const LocalOrdinal ncols,
                     Scalar A[],
                     const LocalOrdinal lda,
                     const bool contiguous_cache_blocks) const
    {
      CacheBlocker<LocalOrdinal, Scalar> blocker (nrows, ncols, strategy_);
      blocker.fill_with_zeros (nrows, ncols, A, lda, contiguous_cache_blocks);
    }

  protected:

    ConstMatView<LocalOrdinal, Scalar>
    const_top_block (const ConstMatView<LocalOrdinal, Scalar>& C,
                     const bool contiguous_cache_blocks) const
    {
      // The CacheBlocker object knows how to construct a view of the
      // top cache block of C.  This is complicated because cache
      // blocks (in C) may or may not be stored contiguously.  If they
      // are stored contiguously, the CacheBlocker knows the right
      // layout, based on the cache blocking strategy.
      typedef CacheBlocker<LocalOrdinal, Scalar> blocker_type;
      blocker_type blocker (C.nrows(), C.ncols(), strategy_);

      // C_top_block is a view of the topmost cache block of C.
      // C_top_block should have >= ncols rows, otherwise either cache
      // blocking is broken or the input matrix C itself had fewer
      // rows than columns.
      const_mat_view C_top_block =
        blocker.top_block (C, contiguous_cache_blocks);
      return C_top_block;
    }

  private:
    CacheBlockingStrategy<LocalOrdinal, Scalar> strategy_;
  };
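
  // Example (illustrative sketch only; not part of the original interface):
  // a minimal outline of how SequentialTsqr might be used to factor a dense,
  // column-major matrix A and form the explicit Q factor.  The helper name
  // and its arguments are hypothetical; it assumes A, Q_out, and R point to
  // caller-allocated column-major storage and that cache blocks are not
  // stored contiguously.
  template<class LocalOrdinal, class Scalar>
  void
  exampleSequentialTsqrUsage (const LocalOrdinal nrows,
                              const LocalOrdinal ncols,
                              Scalar A[],         // nrows by ncols; overwritten by factor()
                              const LocalOrdinal lda,
                              Scalar Q_out[],     // nrows by ncols; explicit Q factor on output
                              const LocalOrdinal ldq,
                              Scalar R[],         // ncols by ncols; R factor on output
                              const LocalOrdinal ldr)
  {
    typedef SequentialTsqr<LocalOrdinal, Scalar> tsqr_type;
    typedef typename tsqr_type::FactorOutput factor_output_type;

    // Default cache size hint; a nonzero byte count may be supplied instead
    // to tune the cache blocking strategy.
    tsqr_type tsqr;
    const bool contiguousCacheBlocks = false;

    // Factor A in place.  The Householder reflectors overwrite A, the TAU
    // arrays come back in factorOutput, and R receives the R factor.
    factor_output_type factorOutput =
      tsqr.factor (nrows, ncols, A, lda, R, ldr, contiguousCacheBlocks);

    // Form the first ncols columns of Q explicitly in Q_out, by applying the
    // implicitly stored Q factor (in A and factorOutput) to the first ncols
    // columns of the identity matrix.
    tsqr.explicit_Q (nrows, ncols, A, lda, factorOutput,
                     ncols, Q_out, ldq, contiguousCacheBlocks);
  }
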
} // namespace TSQR
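
// Example (illustrative sketch only): a hypothetical helper showing the
// contiguous cache block workflow with SequentialTsqr.  A_in is an ordinary
// column-major matrix; cache_block() copies it into A_blocked with cache
// blocks stored contiguously, and factor() then operates on the blocked
// copy.  A_blocked is assumed to hold at least nrows*ncols entries; per the
// comments in factor(), the leading dimension argument is not meaningful for
// contiguously stored cache blocks, so nrows is passed as a placeholder.
// un_cache_block() would convert blocked data back to column-major storage.
template<class LocalOrdinal, class Scalar>
typename TSQR::SequentialTsqr<LocalOrdinal, Scalar>::FactorOutput
exampleFactorWithContiguousCacheBlocks (const TSQR::SequentialTsqr<LocalOrdinal, Scalar>& tsqr,
                                        const LocalOrdinal nrows,
                                        const LocalOrdinal ncols,
                                        const Scalar A_in[],   // nrows by ncols, column-major
                                        const LocalOrdinal lda_in,
                                        Scalar A_blocked[],    // nrows*ncols workspace
                                        Scalar R[],            // ncols by ncols, column-major
                                        const LocalOrdinal ldr)
{
  // Copy A_in into contiguously stored cache blocks.
  tsqr.cache_block (nrows, ncols, A_blocked, A_in, lda_in);

  // Factor the blocked copy in place and return the TAU arrays; R receives
  // the ncols by ncols R factor.
  const bool contiguousCacheBlocks = true;
  return tsqr.factor (nrows, ncols, A_blocked, nrows, R, ldr,
                      contiguousCacheBlocks);
}
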
#endif // __TSQR_Tsqr_SequentialTsqr_hpp