Kokkos Node API and Local Linear Algebra Kernels Version of the Day
Tsqr_KokkosNodeTsqr.hpp
Go to the documentation of this file.
00001 //@HEADER
00002 // ************************************************************************
00003 //
00004 //          Kokkos: Node API and Parallel Node Kernels
00005 //              Copyright (2008) Sandia Corporation
00006 //
00007 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
00008 // the U.S. Government retains certain rights in this software.
00009 //
00010 // Redistribution and use in source and binary forms, with or without
00011 // modification, are permitted provided that the following conditions are
00012 // met:
00013 //
00014 // 1. Redistributions of source code must retain the above copyright
00015 // notice, this list of conditions and the following disclaimer.
00016 //
00017 // 2. Redistributions in binary form must reproduce the above copyright
00018 // notice, this list of conditions and the following disclaimer in the
00019 // documentation and/or other materials provided with the distribution.
00020 //
00021 // 3. Neither the name of the Corporation nor the names of the
00022 // contributors may be used to endorse or promote products derived from
00023 // this software without specific prior written permission.
00024 //
00025 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
00026 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00027 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00028 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
00029 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00030 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00031 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00032 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00033 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00034 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00035 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00036 //
00037 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
00038 //
00039 // ************************************************************************
00040 //@HEADER
00041 
00045 #ifndef __TSQR_KokkosNodeTsqr_hpp
00046 #define __TSQR_KokkosNodeTsqr_hpp
00047 
00048 #include <Tsqr_CacheBlocker.hpp>
00049 #include <Tsqr_Combine.hpp>
00050 #include <Tsqr_NodeTsqr.hpp>
00051 
00052 #include <Teuchos_ParameterListAcceptorDefaultBase.hpp>
00053 #include <Teuchos_ScalarTraits.hpp>
00054 
00055 //#define KNR_DEBUG 1
00056 #ifdef KNR_DEBUG
00057 #  include <iostream>
00058 #endif // KNR_DEBUG
00059 
00060 namespace TSQR {
00061 
00062   namespace details {
00063 
00089     template<class LocalOrdinal, class Scalar>
00090     std::pair<LocalOrdinal, LocalOrdinal>
00091     cacheBlockIndexRange (const LocalOrdinal numRows,
00092                           const LocalOrdinal numCols,
00093                           const int partitionIndex,
00094                           const int numPartitions,
00095                           const CacheBlockingStrategy<LocalOrdinal, Scalar>& strategy)
00096     {
00097 #ifdef KNR_DEBUG
00098       using std::cerr;
00099       using std::endl;
00100       // cerr << "cacheBlockIndexRange(numRows=" << numRows
00101       //           << ", numCols=" << numCols
00102       //           << ", partitionIndex=" << partitionIndex
00103       //           << ", numPartitions=" << numPartitions
00104       //           << ", strategy)" << endl;
00105 #endif // KNR_DEBUG
00106 
00107       // The input index is a zero-based index of the current
00108       // partition (not the "current cache block" -- a partition
00109       // contains zero or more cache blocks).  If the input index is
00110       // out of range, then return, since there is nothing to do.
00111       //
00112       // The nice thing about partitioning over cache blocks is that
00113       // the cache blocking strategy guarantees that exactly one of
00114       // the following is true:
00115       //
00116       // 1. The partition is empty (contains zero cache blocks)
00117       // 2. All cache blocks in the partition are valid (none
00118       //    contains more columns than rows)
00119 
00120       // Return an empty partition (an empty cache block range) if
00121       // the partition index is out of range.
00122       if (partitionIndex >= numPartitions)
00123         return std::make_pair (LocalOrdinal(0), LocalOrdinal(0));
00124 
00125       const LocalOrdinal numRowsCacheBlock =
00126         strategy.cache_block_num_rows (numCols);
00127       const LocalOrdinal numCacheBlocks =
00128         strategy.num_cache_blocks (numRows, numCols, numRowsCacheBlock);
00129 
00130 #ifdef KNR_DEBUG
00131       // cerr << "numRowsCacheBlock=" << numRowsCacheBlock
00132       //           << ", numCacheBlocks=" << numCacheBlocks
00133       //           << endl;
00134 #endif // KNR_DEBUG
00135 
00136       // Figure out how many cache blocks my partition contains.  If
00137       // the number of partitions doesn't evenly divide the number
00138       // of cache blocks, we spread out the remainder among the
00139       // first few threads.
00140       const LocalOrdinal quotient = numCacheBlocks / numPartitions;
00141       const LocalOrdinal remainder = numCacheBlocks - quotient * numPartitions;
00142       const LocalOrdinal myNumCacheBlocks =
00143         (partitionIndex < remainder) ? (quotient + 1) : quotient;
00144 
00145 #ifdef KNR_DEBUG
00146       // cerr << "Partition " << partitionIndex << ": quotient=" << quotient
00147       //           << ", remainder=" << remainder << ", myNumCacheBlocks="
00148       //           << myNumCacheBlocks << endl;
00149 #endif // KNR_DEBUG
00150 
00151       // If there are no cache blocks, there is nothing to factor.
00152       // Return an empty cache block range to indicate this.
00153       if (myNumCacheBlocks == 0)
00154         return std::make_pair (LocalOrdinal(0), LocalOrdinal(0));
00155 
00156       // Index of my first cache block (inclusive).
00157       const LocalOrdinal myFirstCacheBlockIndex =
00158         (partitionIndex < remainder) ?
00159         partitionIndex * (quotient+1) :
00160         remainder * (quotient+1) + (partitionIndex - remainder) * quotient;
00161       // Index of my last cache block (exclusive).
00162       const LocalOrdinal myLastCacheBlockIndex =
00163         (partitionIndex+1 < remainder) ?
00164         (partitionIndex+1) * (quotient+1) :
00165         remainder * (quotient+1) + (partitionIndex+1 - remainder) * quotient;
00166       // Sanity check.
00167       if (myLastCacheBlockIndex <= myFirstCacheBlockIndex)
00168         {
00169           std::ostringstream os;
00170           os << "Partition " << (partitionIndex+1) << " of "
00171              << numPartitions << ":  My range of cache block indices ["
00172              << myFirstCacheBlockIndex << ", " << myLastCacheBlockIndex
00173              << ") is empty.";
00174           throw std::logic_error(os.str());
00175         }
00176       return std::make_pair (myFirstCacheBlockIndex, myLastCacheBlockIndex);
00177     }
00178 
00179 
00183     template<class LocalOrdinal, class Scalar>
00184     class FactorFirstPass {
00185     private:
00186       MatView<LocalOrdinal, Scalar> A_;
00187       // While tauArrays_ is shared among tasks (i.e., partitions),
00188       // there are no race conditions among entries, since each
00189       // partition writes its own entry.  Ditto for topBlocks_.
00190       std::vector<std::vector<Scalar> >& tauArrays_;
00191       std::vector<MatView<LocalOrdinal, Scalar> >& topBlocks_;
00192       CacheBlockingStrategy<LocalOrdinal, Scalar> strategy_;
00193       int numPartitions_;
00194       bool contiguousCacheBlocks_;
00195 
00196       std::vector<Scalar>
00197       factorFirstCacheBlock (Combine<LocalOrdinal, Scalar>& combine,
00198                              const MatView<LocalOrdinal, Scalar>& A_top,
00199                              std::vector<Scalar>& work)
00200       {
00201         std::vector<Scalar> tau (A_top.ncols());
00202 
00203         // We should only call this if A_top.ncols() > 0 and therefore
00204         // work.size() > 0, but we've already checked for that, so we
00205         // don't have to check again.
00206         combine.factor_first (A_top.nrows(), A_top.ncols(), A_top.get(),
00207                               A_top.lda(), &tau[0], &work[0]);
00208         return tau;
00209       }
00210 
00211       std::vector<Scalar>
00212       factorCacheBlock (Combine<LocalOrdinal, Scalar>& combine,
00213                         const MatView<LocalOrdinal, Scalar>& A_top,
00214                         const MatView<LocalOrdinal, Scalar>& A_cur,
00215                         std::vector<Scalar>& work)
00216       {
00217         std::vector<Scalar> tau (A_top.ncols());
00218 
00219         // We should only call this if A_top.ncols() > 0 and therefore
00220         // tau.size() > 0 and work.size() > 0, but we've already
00221         // checked for that, so we don't have to check again.
00222         combine.factor_inner (A_cur.nrows(), A_top.ncols(),
00223                               A_top.get(), A_top.lda(),
00224                               A_cur.get(), A_cur.lda(),
00225                               &tau[0], &work[0]);
00226         return tau;
00227       }
00228 
00235       MatView<LocalOrdinal, Scalar>
00236       factor (const std::pair<LocalOrdinal, LocalOrdinal> cbIndices,
00237               const int partitionIndex)
00238       {
00239 #ifdef KNR_DEBUG
00240         using std::cerr;
00241         using std::endl;
00242 #endif // KNR_DEBUG
00243 
00244         typedef MatView<LocalOrdinal, Scalar> view_type;
00245         typedef CacheBlockRange<view_type> range_type;
00246 
00247         // Workspace is created here, because it must not be shared
00248         // among threads.
00249         std::vector<Scalar> work (A_.ncols());
00250 
00251         // Range of cache blocks to factor.
00252         range_type cbRange (A_, strategy_,
00253                             cbIndices.first,
00254                             cbIndices.second,
00255                             contiguousCacheBlocks_);
00256         // Iterator in the forward direction over the range of cache
00257         // blocks to factor.
00258         typedef typename CacheBlockRange<view_type>::iterator range_iter_type;
00259         range_iter_type cbIter = cbRange.begin();
00260 
00261         // Remember the top (first) block.
00262         view_type A_top = *cbIter;
00263         if (A_top.empty())
00264           return A_top;
00265         TEUCHOS_TEST_FOR_EXCEPTION(cbIndices.first >= cbIndices.second,
00266                            std::logic_error,
00267                            "FactorFirstPass::factor: A_top is not empty, but "
00268                            "the cache block index range " << cbIndices.first
00269                            << "," << cbIndices.second << " is empty.  Please "
00270                            "report this bug to the Kokkos developers.");
00271 
00272         // Current cache block index.
00273         LocalOrdinal curTauIdx = cbIndices.first;
00274 
00275         // Factor the first cache block.
00276         Combine<LocalOrdinal, Scalar> combine;
00277         tauArrays_[curTauIdx++] = factorFirstCacheBlock (combine, A_top, work);
00278 
00279         // Move past the first cache block.
00280         ++cbIter;
00281 
00282         // Number of cache block(s) we have factored thus far.
00283         LocalOrdinal count = 1;
00284 
00285         // Factor the remaining cache block(s).
00286         range_iter_type cbEnd = cbRange.end();
00287         while (cbIter != cbEnd)
00288           {
00289             view_type A_cur = *cbIter;
00290             // Iteration over cache blocks of a partition should
00291             // always result in nonempty cache blocks.
00292             TEUCHOS_TEST_FOR_EXCEPTION(A_cur.empty(), std::logic_error,
00293                                "FactorFirstPass::factor: The current cache bloc"
00294                                "k (the " << count << "-th to factor in the rang"
00295                                "e [" << cbIndices.first << ","
00296                                << cbIndices.second << ") of cache block indices"
00297                                ") in partition " << (partitionIndex+1) << " (ou"
00298                                "t of " << numPartitions_ << " partitions) is em"
00299                                "pty.  Please report this bug to the Kokkos deve"
00300                                "lopers.");
00301             TEUCHOS_TEST_FOR_EXCEPTION(static_cast<size_t>(curTauIdx) >= tauArrays_.size(),
00302                                std::logic_error,
00303                                "FactorFirstPass::factor: curTauIdx (= "
00304                                << curTauIdx << ") >= tauArrays_.size() (= "
00305                                << tauArrays_.size() << ").  Please report this "
00306                                "bug to the Kokkos developers.");
00307             tauArrays_[curTauIdx++] =
00308               factorCacheBlock (combine, A_top, A_cur, work);
00309             ++count;
00310             ++cbIter;
00311           }
00312 #ifdef KNR_DEBUG
00313         cerr << "Factored " << count << " cache blocks" << endl;
00314 #endif // KNR_DEBUG
00315         return A_top;
00316       }
00317 
00318     public:
00339       FactorFirstPass (const MatView<LocalOrdinal,Scalar>& A,
00340                        std::vector<std::vector<Scalar> >& tauArrays,
00341                        std::vector<MatView<LocalOrdinal, Scalar> >& topBlocks,
00342                        const CacheBlockingStrategy<LocalOrdinal, Scalar>& strategy,
00343                        const int numPartitions,
00344                        const bool contiguousCacheBlocks = false) :
00345         A_ (A),
00346         tauArrays_ (tauArrays),
00347         topBlocks_ (topBlocks),
00348         strategy_ (strategy),
00349         numPartitions_ (numPartitions),
00350         contiguousCacheBlocks_ (contiguousCacheBlocks)
00351       {
00352         TEUCHOS_TEST_FOR_EXCEPTION(A_.empty(), std::logic_error,
00353                            "TSQR::FactorFirstPass constructor: A is empty.  "
00354                            "Please report this bug to the Kokkos developers.");
00355         TEUCHOS_TEST_FOR_EXCEPTION(numPartitions < 1, std::logic_error,
00356                            "TSQR::FactorFirstPass constructor: numPartitions "
00357                            "must be positive, but numPartitions = "
00358                            << numPartitions << ".  Please report this bug to "
00359                            "the Kokkos developers.");
00360       }
00361 
00392       void
00393       execute (const int partitionIndex)
00394       {
00395 #ifdef KNR_DEBUG
00396         using std::cerr;
00397         using std::endl;
00398         // cerr << "FactorFirstPass::execute (" << partitionIndex << ")" << endl;
00399 #endif // KNR_DEBUG
00400 
00401         if (partitionIndex < 0 || partitionIndex >= numPartitions_)
00402           return;
00403         else if (A_.empty())
00404           return;
00405         else
00406           {
00407             const std::pair<LocalOrdinal, LocalOrdinal> cbIndices =
00408               cacheBlockIndexRange (A_.nrows(), A_.ncols(), partitionIndex,
00409                                     numPartitions_, strategy_);
00410 #ifdef KNR_DEBUG
00411             cerr << "Partition " << partitionIndex
00412                  << ": Factoring cache block indices ["
00413                  << cbIndices.first << ", " << cbIndices.second << ")"
00414                  << endl;
00415 #endif // KNR_DEBUG
00416             // It's legitimate, though suboptimal, for some partitions
00417             // not to get any work to do (in this case, not to get any
00418             // cache blocks to factor).
00419             if (cbIndices.second <= cbIndices.first)
00420               return;
00421             else
00422               topBlocks_[partitionIndex] = factor (cbIndices, partitionIndex);
00423           }
00424       }
00425     };
00426 
00427 
00436     template<class LocalOrdinal, class Scalar>
00437     class ApplyFirstPass {
00438     private:
00439       ApplyType applyType_;
00440       ConstMatView<LocalOrdinal, Scalar> Q_;
00441       const std::vector<std::vector<Scalar> >& tauArrays_;
00442       const std::vector<MatView<LocalOrdinal, Scalar> >& topBlocks_;
00443       MatView<LocalOrdinal, Scalar> C_;
00444       CacheBlockingStrategy<LocalOrdinal, Scalar> strategy_;
00445       int numPartitions_;
00446       bool explicitQ_, contiguousCacheBlocks_;
00447 
00448       void
00449       applyFirstCacheBlock (Combine<LocalOrdinal, Scalar>& combine,
00450                             const ApplyType& applyType,
00451                             const ConstMatView<LocalOrdinal, Scalar>& Q_top,
00452                             const std::vector<Scalar>& tau,
00453                             const MatView<LocalOrdinal, Scalar>& C_top,
00454                             std::vector<Scalar>& work)
00455       {
00456         TEUCHOS_TEST_FOR_EXCEPTION(tau.size() < static_cast<size_t> (Q_top.ncols()),
00457                            std::logic_error,
00458                            "ApplyFirstPass::applyFirstCacheBlock: tau.size() "
00459                            "(= " << tau.size() << ") < number of columns "
00460                            << Q_top.ncols() << " in the Q factor.  Please "
00461                            "report this bug to the Kokkos developers.");
00462 
00463         // If we get this far, it's fair to assume that we have
00464         // checked whether tau and work have nonzero lengths.
00465         combine.apply_first (applyType, C_top.nrows(), C_top.ncols(),
00466                              Q_top.ncols(), Q_top.get(), Q_top.lda(),
00467                              &tau[0], C_top.get(), C_top.lda(), &work[0]);
00468       }
00469 
00470       void
00471       applyCacheBlock (Combine<LocalOrdinal, Scalar>& combine,
00472                        const ApplyType& applyType,
00473                        const ConstMatView<LocalOrdinal, Scalar>& Q_cur,
00474                        const std::vector<Scalar>& tau,
00475                        const MatView<LocalOrdinal, Scalar>& C_top,
00476                        const MatView<LocalOrdinal, Scalar>& C_cur,
00477                        std::vector<Scalar>& work)
00478       {
00479         TEUCHOS_TEST_FOR_EXCEPTION(tau.size() < static_cast<size_t> (Q_cur.ncols()),
00480                            std::logic_error,
00481                            "ApplyFirstPass::applyCacheBlock: tau.size() "
00482                            "(= " << tau.size() << ") < number of columns "
00483                            << Q_cur.ncols() << " in the Q factor.  Please "
00484                            "report this bug to the Kokkos developers.");
00485 
00486         // If we get this far, it's fair to assume that we have
00487         // checked whether tau and work have nonzero lengths.
00488         combine.apply_inner (applyType, C_cur.nrows(), C_cur.ncols(),
00489                              Q_cur.ncols(), Q_cur.get(), Q_cur.lda(),
00490                              &tau[0],
00491                              C_top.get(), C_top.lda(),
00492                              C_cur.get(), C_cur.lda(),
00493                              &work[0]);
00494       }
00495 
00505       void
00506       apply (const ApplyType& applyType,
00507              const std::pair<LocalOrdinal, LocalOrdinal> cbIndices,
00508              const int partitionIndex)
00509       {
00510 #ifdef KNR_DEBUG
00511         using std::cerr;
00512         using std::endl;
00513 #endif // KNR_DEBUG
00514         typedef ConstMatView<LocalOrdinal, Scalar> const_view_type;
00515         typedef MatView<LocalOrdinal, Scalar> view_type;
00516         typedef CacheBlockRange<const_view_type> const_range_type;
00517         typedef CacheBlockRange<view_type> range_type;
00518         typedef CacheBlocker<LocalOrdinal, Scalar> blocker_type;
00519 
00520         if (cbIndices.first >= cbIndices.second)
00521           return; // My range of cache blocks is empty; nothing to do
00522 
00523         // Q_range: Range of cache blocks in the Q factor.
00524         // C_range: Range of cache blocks in the matrix C.
00525         const_range_type Q_range (Q_, strategy_,
00526                                   cbIndices.first, cbIndices.second,
00527                                   contiguousCacheBlocks_);
00528         range_type C_range (C_, strategy_,
00529                             cbIndices.first, cbIndices.second,
00530                             contiguousCacheBlocks_);
00531         TEUCHOS_TEST_FOR_EXCEPTION(Q_range.empty(), std::logic_error,
00532                            "Q_range is empty, but the range of cache block "
00533                            "indices [" << cbIndices.first << ", "
00534                            << cbIndices.second << ") is not empty.  Please "
00535                            "report this bug to the Kokkos developers.");
00536         TEUCHOS_TEST_FOR_EXCEPTION(C_range.empty(), std::logic_error,
00537                            "C_range is empty, but the range of cache block "
00538                            "indices [" << cbIndices.first << ", "
00539                            << cbIndices.second << ") is not empty.  Please "
00540                            "report this bug to the Kokkos developers.");
00541 
00542         // Task-local workspace array of length C_.ncols().  Workspace
00543         // must be per task, else there will be race conditions as
00544         // different tasks attempt to write to and read from the same
00545         // workspace simultaneously.
00546         std::vector<Scalar> work (C_.ncols());
00547 
00548         Combine<LocalOrdinal, Scalar> combine;
00549         if (applyType.transposed())
00550           {
00551             typename const_range_type::iterator Q_rangeIter = Q_range.begin();
00552             typename range_type::iterator C_rangeIter = C_range.begin();
00553             TEUCHOS_TEST_FOR_EXCEPTION(Q_rangeIter == Q_range.end(), std::logic_error,
00554                                "The Q cache block range claims to be nonempty, "
00555                                "but the iterator range is empty.  Please report"
00556                                " this bug to the Kokkos developers.");
00557             TEUCHOS_TEST_FOR_EXCEPTION(C_rangeIter == C_range.end(), std::logic_error,
00558                                "The C cache block range claims to be nonempty, "
00559                                "but the iterator range is empty.  Please report"
00560                                " this bug to the Kokkos developers.");
00561 
00562             // Q_top: Topmost cache block in the cache block range of Q.
00563             // C_top: Topmost cache block in the cache block range of C.
00564             const_view_type Q_top = *Q_rangeIter;
00565             view_type C_top = *C_rangeIter;
00566             if (explicitQ_)
00567               {
00568                 C_top.fill (Teuchos::ScalarTraits<Scalar>::zero());
00569                 if (partitionIndex == 0)
00570                   for (LocalOrdinal j = 0; j < C_top.ncols(); ++j)
00571                     C_top(j,j) = Teuchos::ScalarTraits<Scalar>::one();
00572               }
00573             LocalOrdinal curTauIndex = cbIndices.first;
00574 
00575             // Apply the first block.
00576             applyFirstCacheBlock (combine, applyType, Q_top,
00577                                   tauArrays_[curTauIndex++], C_top, work);
00578 
00579             // Apply the rest of the blocks, if any.
00580             ++Q_rangeIter;
00581             ++C_rangeIter;
00582             while (Q_rangeIter != Q_range.end())
00583               {
00584                 TEUCHOS_TEST_FOR_EXCEPTION(C_rangeIter == C_range.end(),
00585                                    std::logic_error,
00586                                    "When applying Q^T or Q^H to C: The Q cache "
00587                                    "block iterator is not yet at the end, but "
00588                                    "the C cache block iterator is.  Please "
00589                                    "report this bug to the Kokkos developers.");
00590                 const_view_type Q_cur = *Q_rangeIter;
00591                 view_type C_cur = *C_rangeIter;
00592                 ++Q_rangeIter;
00593                 ++C_rangeIter;
00594                 if (explicitQ_)
00595                   C_cur.fill (Teuchos::ScalarTraits<Scalar>::zero());
00596                 applyCacheBlock (combine, applyType, Q_cur,
00597                                  tauArrays_[curTauIndex++],
00598                                  C_top, C_cur, work);
00599               }
00600           }
00601         else
00602           {
00603             // Q_top: Topmost cache block in the cache block range of Q.
00604             // C_top: Topmost cache block in the cache block range of C.
00605             const_view_type Q_top = *(Q_range.begin());
00606             view_type C_top = *(C_range.begin());
00607 
00608             if (explicitQ_)
00609               {
00610                 // We've already filled the top ncols x ncols block of
00611                 // C_top with data (that's the result of applying the
00612                 // internode part of the Q factor via DistTsqr).
00613                 // However, we still need to fill the rest of C_top
00614                 // (everything but the top ncols rows of C_top) with
00615                 // zeros.
00616                 view_type C_top_rest (C_top.nrows() - C_top.ncols(), C_top.ncols(),
00617                                       C_top.get() + C_top.ncols(), C_top.lda());
00618                 C_top_rest.fill (Teuchos::ScalarTraits<Scalar>::zero());
00619               }
00620             LocalOrdinal curTauIndex = cbIndices.second-1;
00621 
00622             // When applying Q (rather than Q^T or Q^H), we apply the
00623             // cache blocks in reverse order.
00624             typename const_range_type::iterator Q_rangeIter = Q_range.rbegin();
00625             typename range_type::iterator C_rangeIter = C_range.rbegin();
00626             TEUCHOS_TEST_FOR_EXCEPTION(Q_rangeIter == Q_range.rend(), std::logic_error,
00627                                "The Q cache block range claims to be nonempty, "
00628                                "but the iterator range is empty.  Please report"
00629                                " this bug to the Kokkos developers.");
00630             TEUCHOS_TEST_FOR_EXCEPTION(C_rangeIter == C_range.rend(), std::logic_error,
00631                                "The C cache block range claims to be nonempty, "
00632                                "but the iterator range is empty.  Please report"
00633                                " this bug to the Kokkos developers.");
00634 
00635             // Equality of cache block range iterators only tests the
00636             // cache block index, not reverse-ness.  This means we can
00637             // compare a reverse-direction iterator (Q_rangeIter) with
00638             // a forward-direction iterator (Q_range.begin()).
00639             //
00640             // We do this because we need to handle the topmost block
00641             // of Q_range separately (applyFirstCacheBlock(), rather
00642             // than applyCacheBlock()).
00643             while (Q_rangeIter != Q_range.begin())
00644               {
00645                 const_view_type Q_cur = *Q_rangeIter;
00646                 view_type C_cur = *C_rangeIter;
00647 
00648                 if (explicitQ_)
00649                   C_cur.fill (Teuchos::ScalarTraits<Scalar>::zero());
00650 #ifdef KNR_DEBUG
00651                 cerr << "tauArrays_[curTauIndex=" << curTauIndex << "].size() = "
00652                      << tauArrays_[curTauIndex].size() << endl;
00653 #endif // KNR_DEBUG
00654                 TEUCHOS_TEST_FOR_EXCEPTION(curTauIndex < cbIndices.first, std::logic_error,
00655                                    "curTauIndex=" << curTauIndex << " out of valid "
00656                                    "range [" << cbIndices.first << ","
00657                                    << cbIndices.second << ").  Please report this "
00658                                    "bug to the Kokkos developers.");
00659                 applyCacheBlock (combine, applyType, Q_cur,
00660                                  tauArrays_[curTauIndex--],
00661                                  C_top, C_cur, work);
00662                 ++Q_rangeIter;
00663                 ++C_rangeIter;
00664               }
00665             TEUCHOS_TEST_FOR_EXCEPTION(curTauIndex < cbIndices.first, std::logic_error,
00666                                "curTauIndex=" << curTauIndex << " out of valid "
00667                                "range [" << cbIndices.first << ","
00668                                << cbIndices.second << ").  Please report this "
00669                                "bug to the Kokkos developers.");
00670 #ifdef KNR_DEBUG
00671             cerr << "tauArrays_[curTauIndex=" << curTauIndex << "].size() = "
00672                  << tauArrays_[curTauIndex].size() << endl;
00673 #endif // KNR_DEBUG
00674             // Apply the first block.
00675             applyFirstCacheBlock (combine, applyType, Q_top,
00676                                   tauArrays_[curTauIndex--], C_top, work);
00677           }
00678       }
00679 
00680     public:
00702       ApplyFirstPass (const ApplyType& applyType,
00703                       const ConstMatView<LocalOrdinal,Scalar>& Q,
00704                       const std::vector<std::vector<Scalar> >& tauArrays,
00705                       const std::vector<MatView<LocalOrdinal, Scalar> >& topBlocks,
00706                       const MatView<LocalOrdinal,Scalar>& C,
00707                       const CacheBlockingStrategy<LocalOrdinal, Scalar>& strategy,
00708                       const int numPartitions,
00709                       const bool explicitQ = false,
00710                       const bool contiguousCacheBlocks = false) :
00711         applyType_ (applyType),
00712         Q_ (Q),
00713         tauArrays_ (tauArrays),
00714         topBlocks_ (topBlocks),
00715         C_ (C),
00716         strategy_ (strategy),
00717         numPartitions_ (numPartitions),
00718         explicitQ_ (explicitQ),
00719         contiguousCacheBlocks_ (contiguousCacheBlocks)
00720       {}
00721 
00747       void
00748       execute (const int partitionIndex)
00749       {
00750         if (partitionIndex < 0 || partitionIndex >= numPartitions_)
00751           return;
00752         else if (Q_.empty())
00753           return;
00754         else if (C_.empty())
00755           return;
00756 
00757         // We use the same cache block indices for Q and for C.
00758         std::pair<LocalOrdinal, LocalOrdinal> cbIndices =
00759           cacheBlockIndexRange (Q_.nrows(), Q_.ncols(), partitionIndex,
00760                                 numPartitions_, strategy_);
00761         if (cbIndices.second <= cbIndices.first)
00762           return;
00763         {
00764           std::pair<size_t, size_t> cbInds (static_cast<size_t> (cbIndices.first),
00765                                             static_cast<size_t> (cbIndices.second));
00766           TEUCHOS_TEST_FOR_EXCEPTION(cbIndices.first < static_cast<LocalOrdinal>(0),
00767                              std::logic_error,
00768                              "TSQR::ApplyFirstPass::execute: cacheBlockIndexRa"
00769                              "nge(" << Q_.nrows() << ", " << Q_.ncols() << ", "
00770                              << partitionIndex << ", " << numPartitions_ << ","
00771                              " strategy) returned a cache block range "
00772                              << cbIndices.first << "," << cbIndices.second
00773                              << " with negative starting index.  Please report"
00774                              " this bug to the Kokkos developers.");
00775           TEUCHOS_TEST_FOR_EXCEPTION(cbInds.second > tauArrays_.size(),
00776                              std::logic_error,
00777                              "TSQR::ApplyFirstPass::execute: cacheBlockIndexRa"
00778                              "nge(" << Q_.nrows() << ", " << Q_.ncols() << ", "
00779                              << partitionIndex << ", " << numPartitions_ << ","
00780                              " strategy) returned a cache block range "
00781                              << cbIndices.first << "," << cbIndices.second
00782                              << " with starting index larger than the number o"
00783                              "f tau arrays " << tauArrays_.size() << ".  Pleas"
00784                              "e report this bug to the Kokkos developers.");
00785         }
00786 
00787         apply (applyType_, cbIndices, partitionIndex);
00788       }
00789 
00790     };
00791 
00792 
00796     template<class LocalOrdinal, class Scalar>
00797     class CacheBlockWDP {
00798     private:
00799       typedef ConstMatView<LocalOrdinal, Scalar> const_view_type;
00800       typedef MatView<LocalOrdinal, Scalar> view_type;
00801       typedef CacheBlockRange<const_view_type> const_range_type;
00802       typedef CacheBlockRange<view_type> range_type;
00803 
00804       const_view_type A_in_;
00805       view_type A_out_;
00806       CacheBlockingStrategy<LocalOrdinal, Scalar> strategy_;
00807       int numPartitions_;
00808       bool unblock_;
00809 
00814       void
00815       copyRange (const_range_type& cbInputRange, range_type& cbOutputRange)
00816       {
00817         typedef typename const_range_type::iterator input_iter_type;
00818         typedef typename range_type::iterator output_iter_type;
00819 
00820         input_iter_type inputIter = cbInputRange.begin();
00821         output_iter_type outputIter = cbOutputRange.begin();
00822 
00823         input_iter_type inputEnd = cbInputRange.end();
00824         // TODO (mfh 29 Jun 2012) In a debug build, check in the loop
00825         // below whether outputIter == cbOutputRange.end().  If so,
00826         // throw std::logic_error.  Don't declare outputEnd unless
00827         // we're in a debug build, because otherwise the compiler may
00828         // report warnings (gcc 4.5 doesn't; gcc 4.6 does).
00829         // output_iter_type outputEnd = cbOutputRange.end();
00830 
00831         while (inputIter != inputEnd) {
00832           const_view_type A_in_cur = *inputIter;
00833           view_type A_out_cur = *outputIter;
00834           A_out_cur.copy (A_in_cur);
00835           ++inputIter;
00836           ++outputIter;
00837         }
00838       }
00839 
00840     public:
00852       CacheBlockWDP (const ConstMatView<LocalOrdinal, Scalar> A_in,
00853                      const MatView<LocalOrdinal, Scalar> A_out,
00854                      const CacheBlockingStrategy<LocalOrdinal, Scalar>& strategy,
00855                      const int numPartitions,
00856                      const bool unblock) :
00857         A_in_ (A_in),
00858         A_out_ (A_out),
00859         strategy_ (strategy),
00860         numPartitions_ (numPartitions),
00861         unblock_ (unblock)
00862       {
00863         TEUCHOS_TEST_FOR_EXCEPTION(A_in_.nrows() != A_out_.nrows() ||
00864                            A_in_.ncols() != A_out_.ncols(),
00865                            std::invalid_argument,
00866                            "A_in and A_out do not have the same dimensions: "
00867                            "A_in is " << A_in_.nrows() << " by "
00868                            << A_in_.ncols() << ", but A_out is "
00869                            << A_out_.nrows() << " by "
00870                            << A_out_.ncols() << ".");
00871         TEUCHOS_TEST_FOR_EXCEPTION(numPartitions_ < 1,
00872                            std::invalid_argument,
00873                            "The number of partitions " << numPartitions_
00874                            << " is not a positive integer.");
00875       }
00876 
00882       void
00883       execute (const int partitionIndex)
00884       {
00885         if (partitionIndex < 0 || partitionIndex >= numPartitions_ ||
00886             A_in_.empty())
00887           return;
00888         else
00889           {
00890             typedef std::pair<LocalOrdinal, LocalOrdinal> index_range_type;
00891             const index_range_type cbIndices =
00892               cacheBlockIndexRange (A_in_.nrows(), A_in_.ncols(),
00893                                     partitionIndex, numPartitions_, strategy_);
00894             // It's perfectly legal for a partitioning to assign zero
00895             // cache block indices to a particular partition.  In that
00896             // case, this task has nothing to do.
00897             if (cbIndices.first >= cbIndices.second)
00898               return;
00899             else
00900               {
00901                 // If unblock_ is false, then A_in_ is in column-major
00902                 // order, and we want to cache-block it into A_out_.  If
00903                 // unblock_ is true, then A_in_ is cache-blocked, and we
00904                 // want to un-cache-block it into A_out_ (a matrix in
00905                 // column-major order).
00906                 const_range_type inputRange (A_in_, strategy_, cbIndices.first,
00907                                              cbIndices.second, unblock_);
00908                 range_type outputRange (A_out_, strategy_, cbIndices.first,
00909                                         cbIndices.second, ! unblock_);
00910                 copyRange (inputRange, outputRange);
00911               }
00912           }
00913       }
00914     };
00915 
00919     template<class LocalOrdinal, class Scalar>
00920     class MultWDP {
00921     private:
00922       typedef ConstMatView<LocalOrdinal, Scalar> const_view_type;
00923       typedef MatView<LocalOrdinal, Scalar> view_type;
00924       typedef CacheBlockRange<view_type> range_type;
00925 
00926       view_type Q_;
00927       const_view_type B_;
00928       CacheBlockingStrategy<LocalOrdinal, Scalar> strategy_;
00929       int numPartitions_;
00930       bool contiguousCacheBlocks_;
00931 
00932       void
00933       multBlock (BLAS<LocalOrdinal, Scalar>& blas,
00934                  const view_type& Q_cur,
00935                  Matrix<LocalOrdinal, Scalar>& Q_temp)
00936       {
00937         const LocalOrdinal numCols = Q_cur.ncols();
00938 
00939         // GEMM doesn't like aliased arguments, so we use a copy.  We
00940         // only copy the current cache block, rather than all of Q;
00941         // this saves memory.
00942         Q_temp.reshape (Q_cur.nrows(), numCols);
00943         Q_temp.copy (Q_cur);
00944 
00945         // Q_cur := Q_temp * B.
00946         blas.GEMM ("N", "N", Q_cur.nrows(), numCols, numCols,
00947                    Teuchos::ScalarTraits<Scalar>::one(),
00948                    Q_temp.get(), Q_temp.lda(), B_.get(), B_.lda(),
00949                    Scalar(0), Q_cur.get(), Q_cur.lda());
00950       }
00951 
00955       void
00956       multRange (range_type& cbRange)
00957       {
00958         typedef typename range_type::iterator iter_type;
00959         iter_type iter = cbRange.begin();
00960         iter_type end = cbRange.end();
00961 
00962         // Temporary storage for the BLAS' matrix-matrix multiply
00963         // routine (which forbids aliasing of any input argument and
00964         // the output argument).
00965         Matrix<LocalOrdinal, Scalar> Q_temp;
00966         BLAS<LocalOrdinal, Scalar> blas;
00967         while (iter != end)
00968           {
00969             view_type Q_cur = *iter;
00970             multBlock (blas, Q_cur, Q_temp);
00971             ++iter;
00972           }
00973       }
00974 
00975     public:
00985       MultWDP (const MatView<LocalOrdinal, Scalar> Q,
00986                const ConstMatView<LocalOrdinal, Scalar> B,
00987                const CacheBlockingStrategy<LocalOrdinal, Scalar>& strategy,
00988                const int numPartitions,
00989                const bool contiguousCacheBlocks) :
00990         Q_ (Q),
00991         B_ (B),
00992         strategy_ (strategy),
00993         numPartitions_ (numPartitions),
00994         contiguousCacheBlocks_ (contiguousCacheBlocks)
00995       {}
00996 
01002       void
01003       execute (const int partitionIndex)
01004       {
01005         if (partitionIndex < 0 || partitionIndex >= numPartitions_ ||
01006             Q_.empty())
01007           return;
01008         else
01009           {
01010             typedef std::pair<LocalOrdinal, LocalOrdinal> index_range_type;
01011             const index_range_type cbIndices =
01012               cacheBlockIndexRange (Q_.nrows(), Q_.ncols(), partitionIndex,
01013                                     numPartitions_, strategy_);
01014             if (cbIndices.first >= cbIndices.second)
01015               return;
01016             else
01017               {
01018                 range_type range (Q_, strategy_, cbIndices.first,
01019                                   cbIndices.second, contiguousCacheBlocks_);
01020                 multRange (range);
01021               }
01022           }
01023       }
01024     };
01025 
01026 
01030     template<class LocalOrdinal, class Scalar>
01031     class FillWDP {
01032     private:
01033       typedef MatView<LocalOrdinal, Scalar> view_type;
01034       typedef CacheBlockRange<view_type> range_type;
01035 
01036       view_type A_;
01037       CacheBlockingStrategy<LocalOrdinal, Scalar> strategy_;
01038       const Scalar value_;
01039       int numPartitions_;
01040       bool contiguousCacheBlocks_;
01041 
01043       void
01044       fillRange (range_type& cbRange, const Scalar value)
01045       {
01046         typedef typename range_type::iterator iter_type;
01047         iter_type iter = cbRange.begin();
01048         iter_type end = cbRange.end();
01049         while (iter != end)
01050           {
01051             view_type A_cur = *iter;
01052             A_cur.fill (value);
01053             ++iter;
01054           }
01055       }
01056 
01057     public:
01067       FillWDP (const MatView<LocalOrdinal, Scalar> A,
01068                const CacheBlockingStrategy<LocalOrdinal, Scalar>& strategy,
01069                const Scalar value,
01070                const int numPartitions,
01071                const bool contiguousCacheBlocks) :
01072         A_ (A),
01073         strategy_ (strategy),
01074         value_ (value),
01075         numPartitions_ (numPartitions),
01076         contiguousCacheBlocks_ (contiguousCacheBlocks)
01077       {}
01078 
01084       void
01085       execute (const int partitionIndex)
01086       {
01087         if (partitionIndex < 0 || partitionIndex >= numPartitions_ ||
01088             A_.empty())
01089           return;
01090         else
01091           {
01092             typedef std::pair<LocalOrdinal, LocalOrdinal> index_range_type;
01093             const index_range_type cbIndices =
01094               cacheBlockIndexRange (A_.nrows(), A_.ncols(), partitionIndex,
01095                                     numPartitions_, strategy_);
01096             if (cbIndices.first >= cbIndices.second)
01097               return;
01098             else
01099               {
01100                 range_type range (A_, strategy_, cbIndices.first,
01101                                   cbIndices.second, contiguousCacheBlocks_);
01102                 fillRange (range, value_);
01103               }
01104           }
01105       }
01106     };
01107   } // namespace details
01108 
01119   template<class LocalOrdinal, class Scalar>
01120   struct KokkosNodeTsqrFactorOutput {
01129     KokkosNodeTsqrFactorOutput (const size_t theNumCacheBlocks,
01130                                 const int theNumPartitions) :
01131       firstPassTauArrays (theNumCacheBlocks)
01132     {
01133       // Protect the cast to size_t from a negative number of
01134       // partitions.
01135       TEUCHOS_TEST_FOR_EXCEPTION(theNumPartitions < 1, std::invalid_argument,
01136                          "TSQR::KokkosNodeTsqrFactorOutput: Invalid number of "
01137                          "partitions " << theNumPartitions << "; number of "
01138                          "partitions must be a positive integer.");
01139       // If there's only one partition, we don't even need a second
01140       // pass (it's just sequential TSQR), and we don't need a TAU
01141       // array for the top partition.
01142       secondPassTauArrays.resize (static_cast<size_t> (theNumPartitions-1));
01143       topBlocks.resize (static_cast<size_t> (theNumPartitions));
01144     }
01145 
01147     int numCacheBlocks() const { return firstPassTauArrays.size(); }
01148 
01150     int numPartitions() const { return topBlocks.size(); }
01151 
01153     std::vector<std::vector<Scalar> > firstPassTauArrays;
01154 
01168     std::vector<std::vector<Scalar> > secondPassTauArrays;
01169 
01173     std::vector<MatView<LocalOrdinal, Scalar> > topBlocks;
01174   };
01175 
01201   template<class LocalOrdinal, class Scalar, class NodeType>
01202   class KokkosNodeTsqr :
01203     public NodeTsqr<LocalOrdinal, Scalar, KokkosNodeTsqrFactorOutput<LocalOrdinal, Scalar> >,
01204     public Teuchos::ParameterListAcceptorDefaultBase
01205   {
01206   public:
01207     typedef LocalOrdinal local_ordinal_type;
01208     typedef Scalar scalar_type;
01209     typedef NodeType node_type;
01210 
01213     typedef typename NodeTsqr<LocalOrdinal, Scalar, KokkosNodeTsqrFactorOutput<LocalOrdinal, Scalar> >::factor_output_type FactorOutput;
01214 
01226     KokkosNodeTsqr (const Teuchos::RCP<node_type>& node,
01227                     const Teuchos::RCP<Teuchos::ParameterList>& params) :
01228       node_ (node)
01229     {
01230       setParameterList (params);
01231     }
01232 
01242     KokkosNodeTsqr (const Teuchos::RCP<Teuchos::ParameterList>& params) :
01243       node_ (Teuchos::null)
01244     {
01245       setParameterList (params);
01246     }
01247 
01253     KokkosNodeTsqr (const Teuchos::RCP<node_type>& node) :
01254       node_ (node)
01255     {
01256       setParameterList (Teuchos::null);
01257     }
01258 
01265     KokkosNodeTsqr () : node_ (Teuchos::null)
01266     {
01267       setParameterList (Teuchos::null);
01268     }
01269 
01286     void setNode (const Teuchos::RCP<node_type>& node) {
01287       node_ = node;
01288     }
01289 
01294     bool ready() const {
01295       return ! getNode().is_null();
01296     }
01297 
01301     std::string description () const {
01302       using Teuchos::TypeNameTraits;
01303       std::ostringstream os;
01304       os << "KokkosNodeTsqr<LocalOrdinal="
01305          << TypeNameTraits<LocalOrdinal>::name()
01306          << ", Scalar="
01307          << TypeNameTraits<Scalar>::name()
01308          << ", NodeType="
01309          << TypeNameTraits<node_type>::name()
01310          << ">: \"Cache Size Hint\"=" << strategy_.cache_size_hint()
01311          << ", \"Size of Scalar\"=" << strategy_.size_of_scalar()
01312          << ", \"Num Tasks\"=" << numPartitions_;
01313       return os.str();
01314     }
01315 
01323     void
01324     setParameterList (const Teuchos::RCP<Teuchos::ParameterList>& paramList)
01325     {
01326       using Teuchos::ParameterList;
01327       using Teuchos::parameterList;
01328       using Teuchos::RCP;
01329       using Teuchos::rcp;
01330 
01331       RCP<ParameterList> plist;
01332       if (paramList.is_null()) {
01333         plist = rcp (new ParameterList (*getValidParameters ()));
01334       } else {
01335         plist = paramList;
01336         plist->validateParametersAndSetDefaults (*getValidParameters ());
01337       }
01338       // Get values of parameters.  We do this "transactionally" so
01339       // that (except for validation and filling in defaults above)
01340       // this method has the strong exception guarantee (it either
01341       // returns, or throws an exception with no externally visible
01342       // side effects).
01343       size_t cacheSizeHint, sizeOfScalar;
01344       int numPartitions;
01345       try {
01346         cacheSizeHint = plist->get<size_t> ("Cache Size Hint");
01347         sizeOfScalar = plist->get<size_t> ("Size of Scalar");
01348         numPartitions = plist->get<int> ("Num Tasks");
01349       } catch (Teuchos::Exceptions::InvalidParameter& e) {
01350         std::ostringstream os;
01351         os << "Failed to read default parameters after setting defaults.  Pleas"
01352           "e report this bug to the Kokkos developers.  Original exception mess"
01353           "age: " << e.what();
01354         throw std::logic_error (os.str());
01355       }
01356       numPartitions_ = numPartitions;
01357 
01358       // Recreate the cache blocking strategy.
01359       typedef CacheBlockingStrategy<LocalOrdinal, Scalar> strategy_type;
01360       strategy_ = strategy_type (cacheSizeHint, sizeOfScalar);
01361 
01362       // Save the input parameter list.
01363       setMyParamList (plist);
01364     }
01365 
01370     Teuchos::RCP<const Teuchos::ParameterList>
01371     getValidParameters() const
01372     {
01373       using Teuchos::ParameterList;
01374       using Teuchos::parameterList;
01375       using Teuchos::RCP;
01376 
01377       if (defaultParams_.is_null()) {
01378         RCP<ParameterList> params = parameterList ("Intranode TSQR");
01379         params->set ("Cache Size Hint",
01380                      static_cast<size_t>(0),
01381                      std::string("Cache size in bytes; a hint for TSQR.  Set to t"
01382                                  "he size of the largest private cache per CPU co"
01383                                  "re, or the fraction of shared cache per core.  "
01384                                  "If zero, we pick a reasonable default."));
01385         params->set ("Size of Scalar",
01386                      sizeof(Scalar),
01387                      std::string ("Size in bytes of the Scalar type.  In most "
01388                                   "cases, the default sizeof(Scalar) is fine.  "
01389                                   "Set a non-default value only when Scalar's "
01390                                   "data is dynamically allocated (such as for a "
01391                                   "type with precision variable at run time)."));
01392 
01393         // The number of partitions is an int rather than a
01394         // LocalOrdinal, to ensure that it is always stored with the
01395         // same type, despite the type of LocalOrdinal.  Besides, Kokkos
01396         // wants an int anyway.
01397         params->set ("Num Tasks",
01398                      defaultNumPartitions (),
01399                      std::string ("Number of partitions; the maximum available pa"
01400                                   "rallelelism in intranode TSQR.  Slight oversub"
01401                                   "scription is OK; undersubscription may have a "
01402                                   "performance cost."));
01403         defaultParams_ = params;
01404       }
01405       return defaultParams_;
01406     }
01407 
01408     FactorOutput
01409     factor (const LocalOrdinal numRows,
01410             const LocalOrdinal numCols,
01411             Scalar A[],
01412             const LocalOrdinal lda,
01413             Scalar R[],
01414             const LocalOrdinal ldr,
01415             const bool contiguousCacheBlocks) const
01416     {
01417       MatView<LocalOrdinal, Scalar> A_view (numRows, numCols, A, lda);
01418       MatView<LocalOrdinal, Scalar> R_view (numCols, numCols, R, ldr);
01419       return factorImpl (A_view, R_view, contiguousCacheBlocks);
01420     }
01421 
01422     void
01423     apply (const ApplyType& applyType,
01424            const LocalOrdinal nrows,
01425            const LocalOrdinal ncols_Q,
01426            const Scalar Q[],
01427            const LocalOrdinal ldq,
01428            const FactorOutput& factorOutput,
01429            const LocalOrdinal ncols_C,
01430            Scalar C[],
01431            const LocalOrdinal ldc,
01432            const bool contiguousCacheBlocks) const
01433     {
01434       typedef ConstMatView<LocalOrdinal, Scalar> const_view_type;
01435       typedef MatView<LocalOrdinal, Scalar> view_type;
01436 
01437       const_view_type Q_view (nrows, ncols_Q, Q, ldq);
01438       view_type C_view (nrows, ncols_C, C, ldc);
01439       applyImpl (applyType, Q_view, factorOutput, C_view,
01440                  false, contiguousCacheBlocks);
01441     }
01442 
01443     void
01444     explicit_Q (const LocalOrdinal nrows,
01445                 const LocalOrdinal ncols_Q,
01446                 const Scalar Q[],
01447                 const LocalOrdinal ldq,
01448                 const FactorOutput& factorOutput,
01449                 const LocalOrdinal ncols_C,
01450                 Scalar C[],
01451                 const LocalOrdinal ldc,
01452                 const bool contiguousCacheBlocks) const
01453     {
01454       typedef ConstMatView<LocalOrdinal, Scalar> const_view_type;
01455       typedef MatView<LocalOrdinal, Scalar> view_type;
01456 
01457       const_view_type Q_view (nrows, ncols_Q, Q, ldq);
01458       view_type C_view (nrows, ncols_C, C, ldc);
01459       applyImpl (ApplyType::NoTranspose, Q_view, factorOutput,
01460                  C_view, true, contiguousCacheBlocks);
01461     }
01462 
01463     bool QR_produces_R_factor_with_nonnegative_diagonal () const {
01464       return combine_.QR_produces_R_factor_with_nonnegative_diagonal ();
01465     }
01466 
01467     size_t TEUCHOS_DEPRECATED cache_block_size() const {
01468       return strategy_.cache_size_hint();
01469     }
01470 
01471     size_t cache_size_hint() const {
01472       return strategy_.cache_size_hint();
01473     }
01474 
01475     void
01476     fill_with_zeros (const LocalOrdinal nrows,
01477                      const LocalOrdinal ncols,
01478                      Scalar A[],
01479                      const LocalOrdinal lda,
01480                      const bool contiguousCacheBlocks) const
01481     {
01482       Teuchos::RCP<node_type> node = getNode ();
01483       TEUCHOS_TEST_FOR_EXCEPTION(node.is_null(), std::runtime_error,
01484                          "The Kokkos Node instance has not yet been set.  "
01485                          "KokkosNodeTsqr needs a Kokkos Node instance in order "
01486                          "to perform computations.");
01487 
01488       typedef MatView<LocalOrdinal, Scalar> view_type;
01489       view_type A_view (nrows, ncols, A, lda);
01490 
01491       typedef details::FillWDP<LocalOrdinal, Scalar> fill_wdp_type;
01492       typedef Teuchos::ScalarTraits<Scalar> STS;
01493       fill_wdp_type filler (A_view, strategy_, STS::zero(),
01494                             numPartitions_, contiguousCacheBlocks);
01495       node->parallel_for (0, numPartitions_, filler);
01496     }
01497 
01498     void
01499     cache_block (const LocalOrdinal nrows,
01500                  const LocalOrdinal ncols,
01501                  Scalar A_out[],
01502                  const Scalar A_in[],
01503                  const LocalOrdinal lda_in) const
01504     {
01505       Teuchos::RCP<node_type> node = getNode ();
01506       TEUCHOS_TEST_FOR_EXCEPTION(node.is_null(), std::runtime_error,
01507                          "The Kokkos Node instance has not yet been set.  "
01508                          "KokkosNodeTsqr needs a Kokkos Node instance in order "
01509                          "to perform computations.");
01510 
01511       typedef ConstMatView<LocalOrdinal, Scalar> const_view_type;
01512       const_view_type A_in_view (nrows, ncols, A_in, lda_in);
01513 
01514       // The leading dimension of A_out doesn't matter here, since its
01515       // cache blocks are to be stored contiguously.  We set it
01516       // arbitrarily to a sensible value.
01517       typedef MatView<LocalOrdinal, Scalar> view_type;
01518       view_type A_out_view (nrows, ncols, A_out, nrows);
01519 
01520       typedef details::CacheBlockWDP<LocalOrdinal, Scalar> cb_wdp_type;
01521       cb_wdp_type cacheBlocker (A_in_view, A_out_view, strategy_,
01522                                 numPartitions_, false);
01523       node->parallel_for (0, numPartitions_, cacheBlocker);
01524     }
01525 
01526     void
01527     un_cache_block (const LocalOrdinal nrows,
01528                     const LocalOrdinal ncols,
01529                     Scalar A_out[],
01530                     const LocalOrdinal lda_out,
01531                     const Scalar A_in[]) const
01532     {
01533       Teuchos::RCP<node_type> node = getNode ();
01534       TEUCHOS_TEST_FOR_EXCEPTION(node.is_null(), std::runtime_error,
01535                          "The Kokkos Node instance has not yet been set.  "
01536                          "KokkosNodeTsqr needs a Kokkos Node instance in order "
01537                          "to perform computations.");
01538 
01539       // The leading dimension of A_in doesn't matter here, since its
01540       // cache blocks are contiguously stored.  We set it arbitrarily
01541       // to a sensible value.
01542       typedef ConstMatView<LocalOrdinal, Scalar> const_view_type;
01543       const_view_type A_in_view (nrows, ncols, A_in, nrows);
01544 
01545       typedef MatView<LocalOrdinal, Scalar> view_type;
01546       view_type A_out_view (nrows, ncols, A_out, lda_out);
01547 
01548       typedef details::CacheBlockWDP<LocalOrdinal, Scalar> cb_wdp_type;
01549       cb_wdp_type cacheBlocker (A_in_view, A_out_view, strategy_,
01550                                 numPartitions_, true);
01551       node->parallel_for (0, numPartitions_, cacheBlocker);
01552     }
01553 
01554     void
01555     Q_times_B (const LocalOrdinal nrows,
01556                const LocalOrdinal ncols,
01557                Scalar Q[],
01558                const LocalOrdinal ldq,
01559                const Scalar B[],
01560                const LocalOrdinal ldb,
01561                const bool contiguousCacheBlocks) const
01562     {
01563       Teuchos::RCP<node_type> node = getNode ();
01564       TEUCHOS_TEST_FOR_EXCEPTION(node.is_null(), std::runtime_error,
01565                          "The Kokkos Node instance has not yet been set.  "
01566                          "KokkosNodeTsqr needs a Kokkos Node instance in order "
01567                          "to perform computations.");
01568 
01569       typedef MatView<LocalOrdinal, Scalar> view_type;
01570       view_type Q_view (nrows, ncols, Q, ldq);
01571 
01572       typedef ConstMatView<LocalOrdinal, Scalar> const_view_type;
01573       const_view_type B_view (ncols, ncols, B, ldb);
01574 
01575       typedef details::MultWDP<LocalOrdinal, Scalar> mult_wdp_type;
01576       mult_wdp_type mult (Q_view, B_view, strategy_, numPartitions_,
01577                           contiguousCacheBlocks);
01578       node->parallel_for (0, numPartitions_, mult);
01579     }
01580 
01581   private:
01583     Teuchos::RCP<node_type> getNode () const {
01584       return node_;
01585     }
01586 
01588     Combine<LocalOrdinal, Scalar> combine_;
01589 
01591     mutable std::vector<Scalar> work_;
01592 
01594     Teuchos::RCP<node_type> node_;
01595 
01597     CacheBlockingStrategy<LocalOrdinal, Scalar> strategy_;
01598 
01605     int numPartitions_;
01606 
01608     mutable Teuchos::RCP<const Teuchos::ParameterList> defaultParams_;
01609 
01622     int
01623     defaultNumPartitions () const
01624     {
01625       // Currently the Kokkos Node API does not give us access to the
01626       // amount of available parallelism, so we return a constant.
01627       // Mild oversubscription is OK.
01628       return 16;
01629     }
01630 
01631     FactorOutput
01632     factorImpl (MatView<LocalOrdinal, Scalar> A,
01633                 MatView<LocalOrdinal, Scalar> R,
01634                 const bool contiguousCacheBlocks) const
01635     {
01636       if (A.empty()) {
01637         TEUCHOS_TEST_FOR_EXCEPTION(! R.empty(), std::logic_error,
01638                                    "KokkosNodeTsqr::factorImpl: A is empty, but R "
01639                                    "is not.  Please report this bug to the Kokkos "
01640                                    "developers.");
01641         return FactorOutput (0, 0);
01642       }
01643       Teuchos::RCP<node_type> node = getNode ();
01644       TEUCHOS_TEST_FOR_EXCEPTION(node.is_null(), std::runtime_error,
01645                          "The Kokkos Node instance has not yet been set.  "
01646                          "KokkosNodeTsqr needs a Kokkos Node instance in order "
01647                          "to perform computations.");
01648 
01649       const LocalOrdinal numRowsPerCacheBlock =
01650         strategy_.cache_block_num_rows (A.ncols());
01651       const LocalOrdinal numCacheBlocks =
01652         strategy_.num_cache_blocks (A.nrows(), A.ncols(), numRowsPerCacheBlock);
01653       //
01654       // Compute the first factorization pass (over partitions).
01655       //
01656       FactorOutput result (numCacheBlocks, numPartitions_);
01657       typedef details::FactorFirstPass<LocalOrdinal, Scalar> first_pass_type;
01658       first_pass_type firstPass (A, result.firstPassTauArrays,
01659                                  result.topBlocks, strategy_,
01660                                  numPartitions_, contiguousCacheBlocks);
01661       // parallel_for wants an exclusive range.
01662       node->parallel_for (0, numPartitions_, firstPass);
01663 
01664       // Each partition collected a view of its top block, where that
01665       // partition's R factor is stored.  The second pass reduces
01666       // those R factors.  We do this on one thread to avoid the
01667       // overhead of parallelizing it.  If the typical use case is
01668       // oversubscription, you should parallelize this step with
01669       // multiple passes.  Note that we can't use parallel_reduce,
01670       // because the tree topology matters.
01671       factorSecondPass (result.topBlocks, result.secondPassTauArrays,
01672                         numPartitions_);
01673 
01674       // The "topmost top block" contains the resulting R factor.
01675       typedef MatView<LocalOrdinal, Scalar> view_type;
01676       const view_type& R_top = result.topBlocks[0];
01677       TEUCHOS_TEST_FOR_EXCEPTION(R_top.empty(), std::logic_error,
01678                          "After factorSecondPass: result.topBlocks[0] is an "
01679                          "empty MatView.  Please report this bug to the Kokkos "
01680                          "developers.");
01681       view_type R_top_square (R_top.ncols(), R_top.ncols(),
01682                               R_top.get(), R_top.lda());
01683       R.fill (Teuchos::ScalarTraits<Scalar>::zero());
01684       // Only copy the upper triangle of R_top into R.
01685       copy_upper_triangle (R.ncols(), R.ncols(), R.get(), R.lda(),
01686                            R_top.get(), R_top.lda());
01687       return result;
01688     }
01689 
01690     void
01691     applyImpl (const ApplyType& applyType,
01692                const ConstMatView<LocalOrdinal, Scalar>& Q,
01693                const FactorOutput& factorOutput,
01694                const MatView<LocalOrdinal, Scalar>& C,
01695                const bool explicitQ,
01696                const bool contiguousCacheBlocks) const
01697     {
01698       using details::cacheBlockIndexRange;
01699       typedef details::ApplyFirstPass<LocalOrdinal, Scalar> first_pass_type;
01700       typedef CacheBlockingStrategy<LocalOrdinal, Scalar> strategy_type;
01701       typedef MatView<LocalOrdinal, Scalar> view_type;
01702 
01703       Teuchos::RCP<node_type> node = getNode ();
01704       TEUCHOS_TEST_FOR_EXCEPTION(node.is_null(), std::runtime_error,
01705                          "The Kokkos Node instance has not yet been set.  "
01706                          "KokkosNodeTsqr needs a Kokkos Node instance in order "
01707                          "to perform computations.");
01708       TEUCHOS_TEST_FOR_EXCEPTION(numPartitions_ != factorOutput.numPartitions(),
01709                          std::invalid_argument,
01710                          "applyImpl: KokkosNodeTsqr's number of partitions "
01711                          << numPartitions_ << " does not match the given "
01712                          "factorOutput's number of partitions "
01713                          << factorOutput.numPartitions() << ".  This likely "
01714                          "means that the given factorOutput object comes from "
01715                          "a different instance of KokkosNodeTsqr.  Please "
01716                          "report this bug to the Kokkos developers.");
01717       const int numParts = numPartitions_;
01718       first_pass_type firstPass (applyType, Q, factorOutput.firstPassTauArrays,
01719                                  factorOutput.topBlocks, C, strategy_,
01720                                  numParts, explicitQ, contiguousCacheBlocks);
01721       // Get a view of each partition's top block of the C matrix.
01722       std::vector<view_type> topBlocksOfC (numParts);
01723       {
01724         typedef std::pair<LocalOrdinal, LocalOrdinal> index_range_type;
01725         typedef CacheBlocker<LocalOrdinal, Scalar> blocker_type;
01726         blocker_type C_blocker (C.nrows(), C.ncols(), strategy_);
01727 
01728         // For each partition, collect its top block of C.
01729         for (int partIdx = 0; partIdx < numParts; ++partIdx) {
01730           const index_range_type cbIndices =
01731             cacheBlockIndexRange (C.nrows(), C.ncols(), partIdx,
01732                                   numParts, strategy_);
01733           if (cbIndices.first >= cbIndices.second) {
01734             topBlocksOfC[partIdx] = view_type (0, 0, NULL, 0);
01735           } else {
01736             topBlocksOfC[partIdx] =
01737               C_blocker.get_cache_block (C, cbIndices.first,
01738                                          contiguousCacheBlocks);
01739           }
01740         }
01741       }
01742 
01743       if (applyType.transposed()) {
01744         // parallel_for wants an exclusive range.
01745         node->parallel_for (0, numPartitions_, firstPass);
01746         applySecondPass (applyType, factorOutput, topBlocksOfC,
01747                          strategy_, explicitQ);
01748       } else {
01749         applySecondPass (applyType, factorOutput, topBlocksOfC,
01750                          strategy_, explicitQ);
01751         // parallel_for wants an exclusive range.
01752         node->parallel_for (0, numPartitions_, firstPass);
01753       }
01754     }
01755 
01756     std::vector<Scalar>
01757     factorPair (const MatView<LocalOrdinal, Scalar>& R_top,
01758                 const MatView<LocalOrdinal, Scalar>& R_bot) const
01759     {
01760       TEUCHOS_TEST_FOR_EXCEPTION(R_top.empty(), std::logic_error,
01761                          "R_top is empty!");
01762       TEUCHOS_TEST_FOR_EXCEPTION(R_bot.empty(), std::logic_error,
01763                          "R_bot is empty!");
01764       TEUCHOS_TEST_FOR_EXCEPTION(work_.size() == 0, std::logic_error,
01765                          "Workspace array work_ has length zero.");
01766       TEUCHOS_TEST_FOR_EXCEPTION(work_.size() < static_cast<size_t> (R_top.ncols()),
01767                          std::logic_error,
01768                          "Workspace array work_ has length = "
01769                          << work_.size() << " < R_top.ncols() = "
01770                          << R_top.ncols() << ".");
01771 
01772       std::vector<Scalar> tau (R_top.ncols());
01773 
01774       // Our convention for such helper methods is for the immediate
01775       // parent to allocate workspace (the work_ array in this case).
01776       //
01777       // The statement below only works if R_top and R_bot have a
01778       // nonzero (and the same) number of columns, but we have already
01779       // checked that above.
01780       combine_.factor_pair (R_top.ncols(), R_top.get(), R_top.lda(),
01781                             R_bot.get(), R_bot.lda(), &tau[0], &work_[0]);
01782       return tau;
01783     }
01784 
01785     void
01786     factorSecondPass (std::vector<MatView<LocalOrdinal, Scalar> >& topBlocks,
01787                       std::vector<std::vector<Scalar> >& tauArrays,
01788                       const int numPartitions) const
01789     {
01790       if (numPartitions <= 1)
01791         return; // Done!
01792       TEUCHOS_TEST_FOR_EXCEPTION (topBlocks.size() < static_cast<size_t>(numPartitions),
01793                           std::logic_error,
01794                           "KokkosNodeTsqr::factorSecondPass: topBlocks.size() "
01795                           "(= " << topBlocks.size() << ") < numPartitions (= "
01796                           << numPartitions << ").  Please report this bug to "
01797                           "the Kokkos developers.");
01798       TEUCHOS_TEST_FOR_EXCEPTION (tauArrays.size() < static_cast<size_t>(numPartitions-1),
01799                           std::logic_error,
01800                           "KokkosNodeTsqr::factorSecondPass: topBlocks.size() "
01801                           "(= " << topBlocks.size() << ") < numPartitions-1 (= "
01802                           << (numPartitions-1) << ").  Please report this bug "
01803                           "to the Kokkos developers.");
01804       // The top partition (partition index zero) should always be
01805       // nonempty if we get this far, so its top block should also be
01806       // nonempty.
01807       TEUCHOS_TEST_FOR_EXCEPTION(topBlocks[0].empty(), std::logic_error,
01808                          "KokkosNodeTsqr::factorSecondPass: topBlocks[0] is "
01809                          "empty.  Please report this bug to the Kokkos "
01810                          "developers.");
01811       // However, other partitions besides the top one might be empty,
01812       // in which case their top blocks will be empty.  We skip over
01813       // the empty partitions in the loop below.
01814       work_.resize (static_cast<size_t> (topBlocks[0].ncols()));
01815       for (int partIdx = 1; partIdx < numPartitions; ++partIdx)
01816         if (! topBlocks[partIdx].empty())
01817           tauArrays[partIdx-1] = factorPair (topBlocks[0], topBlocks[partIdx]);
01818     }
01819 
01820     void
01821     applyPair (const ApplyType& applyType,
01822                const MatView<LocalOrdinal, Scalar>& R_bot,
01823                const std::vector<Scalar>& tau,
01824                const MatView<LocalOrdinal, Scalar>& C_top,
01825                const MatView<LocalOrdinal, Scalar>& C_bot) const
01826     {
01827       // Our convention for such helper methods is for the immediate
01828       // parent to allocate workspace (the work_ array in this case).
01829       //
01830       // The statement below only works if C_top, R_bot, and C_bot
01831       // have a nonzero (and the same) number of columns, but we have
01832       // already checked that above.
01833       combine_.apply_pair (applyType, C_top.ncols(), R_bot.ncols(),
01834                            R_bot.get(), R_bot.lda(), &tau[0],
01835                            C_top.get(), C_top.lda(),
01836                            C_bot.get(), C_bot.lda(), &work_[0]);
01837     }
01838 
01839     void
01840     applySecondPass (const ApplyType& applyType,
01841                      const FactorOutput& factorOutput,
01842                      std::vector<MatView<LocalOrdinal, Scalar> >& topBlocksOfC,
01843                      const CacheBlockingStrategy<LocalOrdinal, Scalar>& strategy,
01844                      const bool explicitQ) const
01845     {
01846       const int numParts = factorOutput.numPartitions();
01847       if (numParts <= 1)
01848         return; // Done!
01849       TEUCHOS_TEST_FOR_EXCEPTION(topBlocksOfC.size() != static_cast<size_t>(numParts),
01850                          std::logic_error,
01851                          "KokkosNodeTsqr:applySecondPass: topBlocksOfC.size() ("
01852                          "= " << topBlocksOfC.size() << ") != number of partiti"
01853                          "ons (= " << numParts << ").  Please report this bug t"
01854                          "o the Kokkos developers.");
01855       TEUCHOS_TEST_FOR_EXCEPTION(factorOutput.secondPassTauArrays.size() !=
01856                                  static_cast<size_t>(numParts-1),
01857                                  std::logic_error,
01858                                  "KokkosNodeTsqr:applySecondPass: factorOutput"
01859                                  ".secondPassTauArrays.size() (= "
01860                                  << factorOutput.secondPassTauArrays.size()
01861                                  << ") != number of partitions minus 1 (= "
01862                                  << (numParts-1) << ").  Please report this bug"
01863                                  " to the Kokkos developers.");
01864       const LocalOrdinal numCols = topBlocksOfC[0].ncols();
01865       work_.resize (static_cast<size_t> (numCols));
01866 
01867       typedef MatView<LocalOrdinal, Scalar> view_type;
01868       typedef typename std::vector<view_type>::size_type size_type;
01869 
01870       // Top blocks of C are the whole cache blocks.  We only want to
01871       // affect the top ncols x ncols part of each of those blocks in
01872       // this method.
01873       view_type C_top_square (numCols, numCols, topBlocksOfC[0].get(),
01874                               topBlocksOfC[0].lda());
01875       if (applyType.transposed()) {
01876         // Don't include the topmost (index 0) partition in the
01877         // iteration; that corresponds to C_top_square.
01878         for (int partIdx = 1; partIdx < numParts; ++partIdx) {
01879           // It's legitimate for some partitions not to have any
01880           // cache blocks.  In that case, their top block will be
01881           // empty, and we can skip over them.
01882           const view_type& C_cur = topBlocksOfC[partIdx];
01883           if (! C_cur.empty()) {
01884             view_type C_cur_square (numCols, numCols, C_cur.get(),
01885                                     C_cur.lda());
01886             // If explicitQ: We've already done the first pass and
01887             // filled the top blocks of C.
01888             applyPair (applyType, factorOutput.topBlocks[partIdx],
01889                        factorOutput.secondPassTauArrays[partIdx-1],
01890                        C_top_square, C_cur_square);
01891           }
01892         }
01893       } else {
01894         // In non-transposed mode, when computing the first
01895         // C.ncols() columns of the explicit Q factor, intranode
01896         // TSQR would run after internode TSQR (i.e., DistTsqr)
01897         // (even if only running on a single node in non-MPI mode).
01898         // Therefore, internode TSQR is responsible for filling the
01899         // top block of this node's part of the C matrix.
01900         //
01901         // Don't include the topmost partition in the iteration;
01902         // that corresponds to C_top_square.
01903         for (int partIdx = numParts - 1; partIdx > 0; --partIdx) {
01904           // It's legitimate for some partitions not to have any
01905           // cache blocks.  In that case, their top block will be
01906           // empty, and we can skip over them.
01907           const view_type& C_cur = topBlocksOfC[partIdx];
01908           if (! C_cur.empty()) {
01909             view_type C_cur_square (numCols, numCols,
01910                                     C_cur.get(), C_cur.lda());
01911             // The "first" pass (actually the last, only named
01912             // "first" by analogy with factorFirstPass()) will
01913             // fill the rest of these top blocks.  For now, we
01914             // just fill the top n x n part of the top blocks
01915             // with zeros.
01916             if (explicitQ) {
01917               C_cur_square.fill (Teuchos::ScalarTraits<Scalar>::zero());
01918             }
01919             applyPair (applyType, factorOutput.topBlocks[partIdx],
01920                        factorOutput.secondPassTauArrays[partIdx-1],
01921                        C_top_square, C_cur_square);
01922           }
01923         }
01924       }
01925     }
01926 
01927   protected:
01928 
01942     ConstMatView<LocalOrdinal, Scalar>
01943     const_top_block (const ConstMatView<LocalOrdinal, Scalar>& C,
01944                      const bool contiguous_cache_blocks) const
01945     {
01946       typedef CacheBlocker<LocalOrdinal, Scalar> blocker_type;
01947       blocker_type blocker (C.nrows(), C.ncols(), strategy_);
01948 
01949       // C_top_block is a view of the topmost cache block of C.
01950       // C_top_block should have >= ncols rows, otherwise either cache
01951       // blocking is broken or the input matrix C itself had fewer
01952       // rows than columns.
01953       typedef ConstMatView<LocalOrdinal, Scalar> const_view_type;
01954       const_view_type C_top = blocker.top_block (C, contiguous_cache_blocks);
01955       return C_top;
01956     }
01957   };
01958 } // namespace TSQR
01959 
01960 #endif // __TSQR_KokkosNodeTsqr_hpp
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends