Kokkos Node API and Local Linear Algebra Kernels Version of the Day
Tsqr_SequentialTsqr.hpp
Go to the documentation of this file.
//@HEADER
// ************************************************************************
// 
//          Kokkos: Node API and Parallel Node Kernels
//              Copyright (2008) Sandia Corporation
// 
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
// 
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
// 
// ************************************************************************
//@HEADER

00045 #ifndef __TSQR_Tsqr_SequentialTsqr_hpp
00046 #define __TSQR_Tsqr_SequentialTsqr_hpp
00047 
00048 #include <Tsqr_ApplyType.hpp>
00049 #include <Tsqr_Matrix.hpp>
00050 #include <Tsqr_CacheBlockingStrategy.hpp>
00051 #include <Tsqr_CacheBlocker.hpp>
00052 #include <Tsqr_Combine.hpp>
00053 #include <Tsqr_LocalVerify.hpp>
00054 #include <Tsqr_NodeTsqr.hpp>
00055 #include <Tsqr_Util.hpp>
00056 
00057 #include <Teuchos_Describable.hpp>
00058 #include <Teuchos_ParameterList.hpp>
00059 #include <Teuchos_ParameterListExceptions.hpp>
00060 #include <Teuchos_ScalarTraits.hpp>
00061 
00062 #include <algorithm>
00063 #include <limits>
00064 #include <sstream>
00065 #include <string>
00066 #include <utility> // std::pair
00067 #include <vector>
00068 
00069 
00070 namespace TSQR {
00071 
00120   template<class LocalOrdinal, class Scalar>
00121   class SequentialTsqr : 
00122     public NodeTsqr<LocalOrdinal, Scalar, std::vector<std::vector<Scalar> > >
00123   {
00124   public:
00125     typedef LocalOrdinal ordinal_type;
00126     typedef Scalar scalar_type;
00127     typedef typename Teuchos::ScalarTraits<Scalar>::magnitudeType magnitude_type;
00128     typedef typename NodeTsqr<LocalOrdinal, Scalar, std::vector<std::vector<Scalar> > >::factor_output_type FactorOutput;
00129 
00130   private:
00131     typedef typename FactorOutput::const_iterator FactorOutputIter;
00132     typedef typename FactorOutput::const_reverse_iterator FactorOutputReverseIter;
00133     typedef MatView<LocalOrdinal, Scalar> mat_view;
00134     typedef ConstMatView<LocalOrdinal, Scalar> const_mat_view;
00135     typedef std::pair<mat_view, mat_view> block_pair_type;
00136     typedef std::pair<const_mat_view, const_mat_view> const_block_pair_type;
00137 
00165     mat_view
00166     factor_first_block (Combine<LocalOrdinal, Scalar>& combine,
00167       mat_view& A_top,
00168       std::vector<Scalar>& tau,
00169       std::vector<Scalar>& work) const
00170     {
00171       const LocalOrdinal ncols = A_top.ncols();
00172       combine.factor_first (A_top.nrows(), ncols, A_top.get(), A_top.lda(), 
00173           &tau[0], &work[0]);
00174       return mat_view(ncols, ncols, A_top.get(), A_top.lda());
00175     }
00176 
00181     void 
00182     apply_first_block (Combine<LocalOrdinal, Scalar>& combine,
00183            const ApplyType& applyType,
00184            const const_mat_view& Q_first,
00185            const std::vector<Scalar>& tau,
00186            mat_view& C_first,
00187            std::vector<Scalar>& work) const
00188     {
00189       const LocalOrdinal nrowsLocal = Q_first.nrows();
00190       combine.apply_first (applyType, nrowsLocal, C_first.ncols(), 
00191          Q_first.ncols(), Q_first.get(), Q_first.lda(),
00192          &tau[0], C_first.get(), C_first.lda(), &work[0]);
00193     }
00194 
00195     void
00196     combine_apply (Combine<LocalOrdinal, Scalar>& combine,
00197        const ApplyType& apply_type,
00198        const const_mat_view& Q_cur,
00199        const std::vector<Scalar>& tau,
00200        mat_view& C_top,
00201        mat_view& C_cur,
00202        std::vector<Scalar>& work) const
00203     {
00204       const LocalOrdinal nrows_local = Q_cur.nrows();
00205       const LocalOrdinal ncols_Q = Q_cur.ncols();
00206       const LocalOrdinal ncols_C = C_cur.ncols();
00207 
00208       combine.apply_inner (apply_type, 
00209          nrows_local, ncols_C, ncols_Q, 
00210          Q_cur.get(), C_cur.lda(), &tau[0],
00211          C_top.get(), C_top.lda(),
00212          C_cur.get(), C_cur.lda(), &work[0]);
00213     }
00214 
00215     void
00216     combine_factor (Combine<LocalOrdinal, Scalar>& combine,
00217         mat_view& R,
00218         mat_view& A_cur,
00219         std::vector<Scalar>& tau,
00220         std::vector<Scalar>& work) const
00221     {
00222       const LocalOrdinal nrows_local = A_cur.nrows();
00223       const LocalOrdinal ncols = A_cur.ncols();
00224 
00225       combine.factor_inner (nrows_local, ncols, R.get(), R.lda(), 
00226           A_cur.get(), A_cur.lda(), &tau[0], 
00227           &work[0]);
00228     }
00229 
00230   public:
00266     SequentialTsqr (const size_t cacheSizeHint = 0,
00267         const size_t sizeOfScalar = sizeof(Scalar)) :
00268       strategy_ (cacheSizeHint, sizeOfScalar)
00269     {}
00270 
00279     SequentialTsqr (const CacheBlockingStrategy<LocalOrdinal, Scalar>& strategy) :
00280       strategy_ (strategy) 
00281     {}
00282 
00292     SequentialTsqr (const Teuchos::RCP<Teuchos::ParameterList>& params)
00293     {
00294       setParameterList (params);
00295     }
00296 
00303     Teuchos::RCP<const Teuchos::ParameterList> 
00304     getValidParameters () const
00305     {
00306       using Teuchos::ParameterList;
00307       using Teuchos::parameterList;
00308       using Teuchos::RCP;
00309 
00310       const size_t cacheSizeHint = 0;
00311       const size_t sizeOfScalar = sizeof(Scalar);
00312 
00313       RCP<ParameterList> plist = parameterList ("NodeTsqr");
00314       plist->set ("Cache Size Hint", cacheSizeHint, 
00315       "Cache size hint in bytes (as a size_t) to use for intranode"
00316       "TSQR.  If zero, TSQR will pick a reasonable default.  "
00317       "The size should correspond to that of the largest cache that "
00318       "is private to each CPU core, if such a private cache exists; "
00319       "otherwise, it should correspond to the amount of shared "
00320       "cache, divided by the number of cores sharing that cache.");
00321       plist->set ("Size of Scalar", sizeOfScalar, "Size of the Scalar type.  "
00322       "Default is sizeof(Scalar).  Only set if sizeof(Scalar) does "
00323       "not describe how much memory a Scalar type takes.");
00324       return plist;
00325     }
00326 
00335     void 
00336     setParameterList (const Teuchos::RCP<Teuchos::ParameterList>& plist)
00337     {
00338       using Teuchos::Exceptions::InvalidParameter;
00339       using Teuchos::ParameterList;
00340       using Teuchos::parameterList;
00341       using Teuchos::RCP;
00342       
00343       RCP<ParameterList> params = plist.is_null() ? 
00344   parameterList (*getValidParameters()) : plist;
00345 
00346       const std::string cacheSizeHintName ("Cache Size Hint");
00347       const std::string sizeOfScalarName ("Size of Scalar");
00348       // In order to avoid calling getValidParameters() and
00349       // constructing a default list, we set missing values here to
00350       // their defaults.  This duplicates default values set in
00351       // getValidParameters(), so if you change those, be careful to
00352       // change them here.
00353       size_t cacheSizeHint = 0;
00354       size_t sizeOfScalar = sizeof(Scalar);
00355       
00356       try {
00357   cacheSizeHint = params->get<size_t> (cacheSizeHintName);
00358       } catch (InvalidParameter&) {
00359   params->set (cacheSizeHintName, cacheSizeHint);
00360       }
00361       try {
00362   sizeOfScalar = params->get<size_t> (sizeOfScalarName);
00363       } catch (InvalidParameter&) {
00364   params->set (sizeOfScalarName, sizeOfScalar);
00365       }
00366 
00367       // Reconstruct the cache blocking strategy, since we may have
00368       // changed parameters.
00369       strategy_ = CacheBlockingStrategy<LocalOrdinal, Scalar> (cacheSizeHint, 
00370                      sizeOfScalar);
00371     }
00372 
00378     std::string description () const {
00379       std::ostringstream os;
00380       os << "Intranode Tall Skinny QR (TSQR): sequential cache-blocked "
00381   "implementation with cache size hint " << this->cache_size_hint() 
00382    << " bytes.";
00383       return os.str();
00384     }
00385 
00387     bool ready() const {
00388       return true;
00389     }
00390 
00394     bool QR_produces_R_factor_with_nonnegative_diagonal () const {
00395       typedef Combine<LocalOrdinal, Scalar> combine_type;
00396       return combine_type::QR_produces_R_factor_with_nonnegative_diagonal();
00397     }
00398 
00404     size_t cache_size_hint () const { 
00405       return strategy_.cache_size_hint(); 
00406     }
00407 
00412     size_t TEUCHOS_DEPRECATED cache_block_size () const { 
00413       return strategy_.cache_size_hint(); 
00414     }
00415 
00441     FactorOutput
00442     factor (const LocalOrdinal nrows,
00443       const LocalOrdinal ncols,
00444       Scalar A[],
00445       const LocalOrdinal lda, 
00446       const bool contiguous_cache_blocks) const
00447     {
00448       CacheBlocker<LocalOrdinal, Scalar> blocker (nrows, ncols, strategy_);
00449       Combine<LocalOrdinal, Scalar> combine;
00450       std::vector<Scalar> work (ncols);
00451       FactorOutput tau_arrays;
00452 
00453       // We say "A_rest" because it points to the remaining part of
00454       // the matrix left to factor; at the beginning, the "remaining"
00455       // part is the whole matrix, but that will change as the
00456       // algorithm progresses.
00457       //
00458       // Note: if the cache blocks are stored contiguously, lda won't
00459       // be the correct leading dimension of A, but it won't matter:
00460       // we only ever operate on A_cur here, and A_cur's leading
00461       // dimension is set correctly by A_rest.split_top().
00462       mat_view A_rest (nrows, ncols, A, lda);
00463       // This call modifies A_rest.
00464       mat_view A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks);
00465 
00466       // Factor the topmost block of A.
00467       std::vector<Scalar> tau_first (ncols);
00468       mat_view R_view = factor_first_block (combine, A_cur, tau_first, work);
00469       tau_arrays.push_back (tau_first);
00470 
00471       while (! A_rest.empty())
00472   {
00473     A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks);
00474     std::vector<Scalar> tau (ncols);
00475     combine_factor (combine, R_view, A_cur, tau, work);
00476     tau_arrays.push_back (tau);
00477   }
00478       return tau_arrays;
00479     }
00480 
00488     void
00489     extract_R (const LocalOrdinal nrows,
00490          const LocalOrdinal ncols,
00491          const Scalar A[],
00492          const LocalOrdinal lda,
00493          Scalar R[],
00494          const LocalOrdinal ldr,
00495          const bool contiguous_cache_blocks) const
00496     {
00497       const_mat_view A_view (nrows, ncols, A, lda);
00498 
00499       // Identify top cache block of A
00500       const_mat_view A_top = top_block (A_view, contiguous_cache_blocks);
00501 
00502       // Fill R (including lower triangle) with zeros.
00503       fill_matrix (ncols, ncols, R, ldr, Teuchos::ScalarTraits<Scalar>::zero());
00504 
00505       // Copy out the upper triangle of the R factor from A into R.
00506       copy_upper_triangle (ncols, ncols, R, ldr, A_top.get(), A_top.lda());
00507     }
00508 
00517     FactorOutput
00518     factor (const LocalOrdinal nrows,
00519       const LocalOrdinal ncols,
00520       Scalar A[],
00521       const LocalOrdinal lda, 
00522       Scalar R[],
00523       const LocalOrdinal ldr,
00524       const bool contiguous_cache_blocks) const
00525     {
00526       CacheBlocker<LocalOrdinal, Scalar> blocker (nrows, ncols, strategy_);
00527       Combine<LocalOrdinal, Scalar> combine;
00528       std::vector<Scalar> work (ncols);
00529       FactorOutput tau_arrays;
00530 
00531       // We say "A_rest" because it points to the remaining part of
00532       // the matrix left to factor; at the beginning, the "remaining"
00533       // part is the whole matrix, but that will change as the
00534       // algorithm progresses.
00535       //
00536       // Note: if the cache blocks are stored contiguously, lda won't
00537       // be the correct leading dimension of A, but it won't matter:
00538       // we only ever operate on A_cur here, and A_cur's leading
00539       // dimension is set correctly by A_rest.split_top().
00540       mat_view A_rest (nrows, ncols, A, lda);
00541       // This call modifies A_rest.
00542       mat_view A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks);
00543 
00544       // Factor the topmost block of A.
00545       std::vector<Scalar> tau_first (ncols);
00546       mat_view R_view = factor_first_block (combine, A_cur, tau_first, work);
00547       tau_arrays.push_back (tau_first);
00548 
00549       while (! A_rest.empty())
00550   {
00551     A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks);
00552     std::vector< Scalar > tau (ncols);
00553     combine_factor (combine, R_view, A_cur, tau, work);
00554     tau_arrays.push_back (tau);
00555   }
00556       
00557       // Copy the R factor resulting from the factorization out of
00558       // R_view (a view of the topmost cache block of A) into the R
00559       // output argument.
00560       fill_matrix (ncols, ncols, R, ldr, Scalar(0));
00561       copy_upper_triangle (ncols, ncols, R, ldr, R_view.get(), R_view.lda());
00562       return tau_arrays;
00563     }
00564 
00565 
00585     LocalOrdinal
00586     factor_num_cache_blocks (const LocalOrdinal nrows,
00587            const LocalOrdinal ncols,
00588            const Scalar A[],
00589            const LocalOrdinal lda, 
00590            const bool contiguous_cache_blocks) const
00591     {
00592       CacheBlocker<LocalOrdinal, Scalar> blocker (nrows, ncols, strategy_);
00593       LocalOrdinal count = 0;
00594 
00595       const_mat_view A_rest (nrows, ncols, A, lda);
00596       if (A_rest.empty())
00597   return count;
00598 
00599       const_mat_view A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks);
00600       ++count; // first factor step
00601 
00602       while (! A_rest.empty())
00603   {
00604     A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks);
00605     ++count; // next factor step
00606   }
00607       return count;
00608     }
00609 
00613     void
00614     apply (const ApplyType& apply_type,
00615      const LocalOrdinal nrows,
00616      const LocalOrdinal ncols_Q,
00617      const Scalar Q[],
00618      const LocalOrdinal ldq,
00619      const FactorOutput& factor_output,
00620      const LocalOrdinal ncols_C,
00621      Scalar C[],
00622      const LocalOrdinal ldc,
00623      const bool contiguous_cache_blocks) const
00624     {
00625       // Quick exit and error tests
00626       if (ncols_Q == 0 || ncols_C == 0 || nrows == 0)
00627   return;
00628       else if (ldc < nrows)
00629   {
00630     std::ostringstream os;
00631     os << "SequentialTsqr::apply: ldc (= " << ldc << ") < nrows (= " << nrows << ")";
00632     throw std::invalid_argument (os.str());
00633   }
00634       else if (ldq < nrows)
00635   {
00636     std::ostringstream os;
00637     os << "SequentialTsqr::apply: ldq (= " << ldq << ") < nrows (= " << nrows << ")";
00638     throw std::invalid_argument (os.str());
00639   }
00640 
00641       // If contiguous cache blocks are used, then we have to use the
00642       // same convention as we did for factor().  Otherwise, we are
00643       // free to choose the cache block dimensions as we wish in
00644       // apply(), independently of what we did in factor().
00645       CacheBlocker<LocalOrdinal, Scalar> blocker (nrows, ncols_Q, strategy_);
00646       LAPACK<LocalOrdinal, Scalar> lapack;
00647       Combine<LocalOrdinal, Scalar> combine;
00648 
00649       const bool transposed = apply_type.transposed();
00650       const FactorOutput& tau_arrays = factor_output; // rename for encapsulation
00651       std::vector<Scalar> work (ncols_C);
00652       
00653       // We say "*_rest" because it points to the remaining part of
00654       // the matrix left to factor; at the beginning, the "remaining"
00655       // part is the whole matrix, but that will change as the
00656       // algorithm progresses.
00657       //
00658       // Note: if the cache blocks are stored contiguously, ldq
00659       // resp. ldc won't be the correct leading dimension, but it
00660       // won't matter, since we only read the leading dimension of
00661       // return values of split_top_block() / split_bottom_block(),
00662       // which are set correctly (based e.g., on whether or not we are
00663       // using contiguous cache blocks).
00664       const_mat_view Q_rest (nrows, ncols_Q, Q, ldq);
00665       mat_view C_rest (nrows, ncols_C, C, ldc);
00666 
00667       // Identify the top ncols_C by ncols_C block of C.  C_rest is
00668       // not modified.
00669       mat_view C_top = blocker.top_block (C_rest, contiguous_cache_blocks);
00670 
00671       if (transposed)
00672   {
00673     const_mat_view Q_cur = blocker.split_top_block (Q_rest, contiguous_cache_blocks);
00674     mat_view C_cur = blocker.split_top_block (C_rest, contiguous_cache_blocks);
00675 
00676     // Apply the topmost block of Q.
00677     FactorOutputIter tau_iter = tau_arrays.begin();
00678     const std::vector<Scalar>& tau = *tau_iter++;
00679     apply_first_block (combine, apply_type, Q_cur, tau, C_cur, work);
00680 
00681     while (! Q_rest.empty())
00682       {
00683         Q_cur = blocker.split_top_block (Q_rest, contiguous_cache_blocks);
00684         C_cur = blocker.split_top_block (C_rest, contiguous_cache_blocks);
00685         combine_apply (combine, apply_type, Q_cur, *tau_iter++, C_top, C_cur, work);
00686       }
00687   }
00688       else
00689   {
00690     // Start with the last local Q factor and work backwards up the matrix.
00691     FactorOutputReverseIter tau_iter = tau_arrays.rbegin();
00692 
00693     const_mat_view Q_cur = blocker.split_bottom_block (Q_rest, contiguous_cache_blocks);
00694     mat_view C_cur = blocker.split_bottom_block (C_rest, contiguous_cache_blocks);
00695 
00696     while (! Q_rest.empty())
00697       {
00698         combine_apply (combine, apply_type, Q_cur, *tau_iter++, C_top, C_cur, work);
00699         Q_cur = blocker.split_bottom_block (Q_rest, contiguous_cache_blocks);
00700         C_cur = blocker.split_bottom_block (C_rest, contiguous_cache_blocks);
00701       }
00702     // Apply to last (topmost) cache block.
00703     apply_first_block (combine, apply_type, Q_cur, *tau_iter++, C_cur, work);
00704   }
00705     }
00706 
00710     void
00711     explicit_Q (const LocalOrdinal nrows,
00712     const LocalOrdinal ncols_Q,
00713     const Scalar Q[],
00714     const LocalOrdinal ldq,
00715     const FactorOutput& factor_output,
00716     const LocalOrdinal ncols_C,
00717     Scalar C[],
00718     const LocalOrdinal ldc,
00719     const bool contiguous_cache_blocks) const
00720     {
00721       // Identify top ncols_C by ncols_C block of C.  C_view is not
00722       // modified.  top_block() will set C_top to have the correct
00723       // leading dimension, whether or not cache blocks are stored
00724       // contiguously.
00725       mat_view C_view (nrows, ncols_C, C, ldc);
00726       mat_view C_top = this->top_block (C_view, contiguous_cache_blocks);
00727 
00728       // Fill C with zeros, and then fill the topmost block of C with
00729       // the first ncols_C columns of the identity matrix, so that C
00730       // itself contains the first ncols_C columns of the identity
00731       // matrix.
00732       fill_with_zeros (nrows, ncols_C, C, ldc, contiguous_cache_blocks);
00733       for (LocalOrdinal j = 0; j < ncols_C; ++j)
00734         C_top(j, j) = Scalar(1);
00735 
00736       // Apply the Q factor to C, to extract the first ncols_C columns
00737       // of Q in explicit form.
00738       apply (ApplyType::NoTranspose, 
00739        nrows, ncols_Q, Q, ldq, factor_output, 
00740        ncols_C, C, ldc, contiguous_cache_blocks);
00741     }
00742 
00746     void
00747     Q_times_B (const LocalOrdinal nrows,
00748          const LocalOrdinal ncols,
00749          Scalar Q[],
00750          const LocalOrdinal ldq,
00751          const Scalar B[],
00752          const LocalOrdinal ldb,
00753          const bool contiguous_cache_blocks) const
00754     {
00755       // We don't do any other error checking here (e.g., matrix
00756       // dimensions), though it would be a good idea to do so.
00757 
00758       // Take the easy exit if available.
00759       if (ncols == 0 || nrows == 0)
00760   return;
00761 
00762       // Compute Q := Q*B by iterating through cache blocks of Q.
00763       // This iteration works much like iteration through cache blocks
00764       // of A in factor() (which see).  Here, though, each cache block
00765       // computation is completely independent of the others; a slight
00766       // restructuring of this code would parallelize nicely using
00767       // OpenMP.
00768       CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_);
00769       BLAS< LocalOrdinal, Scalar > blas;
00770       mat_view Q_rest (nrows, ncols, Q, ldq);
00771       Matrix< LocalOrdinal, Scalar > 
00772   Q_cur_copy (LocalOrdinal(0), LocalOrdinal(0)); // will be resized
00773       while (! Q_rest.empty())
00774   {
00775     mat_view Q_cur = 
00776       blocker.split_top_block (Q_rest, contiguous_cache_blocks);
00777 
00778     // GEMM doesn't like aliased arguments, so we use a copy.
00779     // We only copy the current cache block, rather than all of
00780     // Q; this saves memory.
00781     Q_cur_copy.reshape (Q_cur.nrows(), ncols);
00782     Q_cur_copy.copy (Q_cur);
00783     // Q_cur := Q_cur_copy * B.
00784     blas.GEMM ("N", "N", Q_cur.nrows(), ncols, ncols, Scalar(1),
00785          Q_cur_copy.get(), Q_cur_copy.lda(), B, ldb,
00786          Scalar(0), Q_cur.get(), Q_cur.lda());
00787   }
00788     }
00789 
00800     void
00801     cache_block (const LocalOrdinal nrows,
00802      const LocalOrdinal ncols,
00803      Scalar A_out[],
00804      const Scalar A_in[],
00805      const LocalOrdinal lda_in) const
00806     {
00807       CacheBlocker<LocalOrdinal, Scalar> blocker (nrows, ncols, strategy_);
00808       blocker.cache_block (nrows, ncols, A_out, A_in, lda_in);
00809     }
00810 
00827     void
00828     un_cache_block (const LocalOrdinal nrows,
00829         const LocalOrdinal ncols,
00830         Scalar A_out[],
00831         const LocalOrdinal lda_out,       
00832         const Scalar A_in[]) const
00833     {
00834       CacheBlocker<LocalOrdinal, Scalar> blocker (nrows, ncols, strategy_);
00835       blocker.un_cache_block (nrows, ncols, A_out, lda_out, A_in);
00836     }
00837 
00850     void
00851     fill_with_zeros (const LocalOrdinal nrows,
00852          const LocalOrdinal ncols,
00853          Scalar A[],
00854          const LocalOrdinal lda, 
00855          const bool contiguous_cache_blocks) const
00856     {
00857       CacheBlocker<LocalOrdinal, Scalar> blocker (nrows, ncols, strategy_);
00858       blocker.fill_with_zeros (nrows, ncols, A, lda, contiguous_cache_blocks);
00859     }
00860 
00861   protected:
00862 
00875     ConstMatView<LocalOrdinal, Scalar>
00876     const_top_block (const ConstMatView<LocalOrdinal, Scalar>& C, 
00877          const bool contiguous_cache_blocks) const 
00878     {
00879       // The CacheBlocker object knows how to construct a view of the
00880       // top cache block of C.  This is complicated because cache
00881       // blocks (in C) may or may not be stored contiguously.  If they
00882       // are stored contiguously, the CacheBlocker knows the right
00883       // layout, based on the cache blocking strategy.
00884       typedef CacheBlocker<LocalOrdinal, Scalar> blocker_type;
00885       blocker_type blocker (C.nrows(), C.ncols(), strategy_);
00886 
00887       // C_top_block is a view of the topmost cache block of C.
00888       // C_top_block should have >= ncols rows, otherwise either cache
00889       // blocking is broken or the input matrix C itself had fewer
00890       // rows than columns.
00891       const_mat_view C_top_block = 
00892   blocker.top_block (C, contiguous_cache_blocks);
00893       return C_top_block;
00894     }
00895 
00896   private:
00898     CacheBlockingStrategy<LocalOrdinal, Scalar> strategy_;
00899   };
00900   
00901 } // namespace TSQR
00902 
00903 #endif // __TSQR_Tsqr_SequentialTsqr_hpp
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends