Kokkos Node API and Local Linear Algebra Kernels
Tsqr_KokkosNodeTsqr.hpp
00001 //@HEADER
00002 // ************************************************************************
00003 // 
00004 //          Kokkos: Node API and Parallel Node Kernels
00005 //              Copyright (2009) Sandia Corporation
00006 // 
00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
00008 // license for use of this work by or on behalf of the U.S. Government.
00009 // 
00010 // This library is free software; you can redistribute it and/or modify
00011 // it under the terms of the GNU Lesser General Public License as
00012 // published by the Free Software Foundation; either version 2.1 of the
00013 // License, or (at your option) any later version.
00014 //  
00015 // This library is distributed in the hope that it will be useful, but
00016 // WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 // Lesser General Public License for more details.
00019 //  
00020 // You should have received a copy of the GNU Lesser General Public
00021 // License along with this library; if not, write to the Free Software
00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00023 // USA
00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
00025 // 
00026 // ************************************************************************
00027 //@HEADER
00028 
00032 #ifndef __TSQR_KokkosNodeTsqr_hpp
00033 #define __TSQR_KokkosNodeTsqr_hpp
00034 
00035 #include <Tsqr_CacheBlocker.hpp>
00036 #include <Tsqr_Combine.hpp>
00037 #include <Tsqr_NodeTsqr.hpp>
00038 
00039 #include <Teuchos_ParameterListAcceptorDefaultBase.hpp>
00040 #include <Teuchos_ScalarTraits.hpp>
00041 
00042 //#define KNR_DEBUG 1
00043 #ifdef KNR_DEBUG
00044 #  include <iostream>
00045 #endif // KNR_DEBUG
00046 
00047 namespace TSQR {
00048 
00049   namespace details {
00050 
00076     template<class LocalOrdinal, class Scalar>
00077     std::pair<LocalOrdinal, LocalOrdinal>
00078     cacheBlockIndexRange (const LocalOrdinal numRows,
00079         const LocalOrdinal numCols,
00080         const int partitionIndex,
00081         const int numPartitions,
00082         const CacheBlockingStrategy<LocalOrdinal, Scalar>& strategy)
00083     {
00084 #ifdef KNR_DEBUG
00085       using std::cerr;
00086       using std::endl;
00087       // cerr << "cacheBlockIndexRange(numRows=" << numRows 
00088       //     << ", numCols=" << numCols
00089       //     << ", partitionIndex=" << partitionIndex 
00090       //     << ", numPartitions=" << numPartitions
00091       //     << ", strategy)" << endl;
00092 #endif // KNR_DEBUG
00093 
00094       // The input index is a zero-based index of the current
00095       // partition (not the "current cache block" -- a partition
00096       // contains zero or more cache blocks).  If the input index is
00097       // out of range, then return, since there is nothing to do.
00098       //
00099       // The nice thing about partitioning over cache blocks is that
00100       // the cache blocking strategy guarantees that exactly one of
00101       // the following is true:
00102       // 
00103       // 1. The partition is empty (contains zero cache blocks)
00104       // 2. All cache blocks in the partition are valid (none 
00105       //    contains more columns than rows)
00106 
00107       // Return an empty partition (an empty cache block range) if
00108       // the partition index is out of range.
00109       if (partitionIndex >= numPartitions)
00110   return std::make_pair (LocalOrdinal(0), LocalOrdinal(0));
00111 
00112       const LocalOrdinal numRowsCacheBlock = 
00113   strategy.cache_block_num_rows (numCols);
00114       const LocalOrdinal numCacheBlocks = 
00115   strategy.num_cache_blocks (numRows, numCols, numRowsCacheBlock);
00116 
00117 #ifdef KNR_DEBUG
00118       // cerr << "numRowsCacheBlock=" << numRowsCacheBlock
00119       //     << ", numCacheBlocks=" << numCacheBlocks
00120       //     << endl;
00121 #endif // KNR_DEBUG
00122 
00123       // Figure out how many cache blocks my partition contains.  If
00124       // the number of partitions doesn't evenly divide the number
00125       // of cache blocks, we spread out the remainder among the
00126       // first few threads.
00127       const LocalOrdinal quotient = numCacheBlocks / numPartitions;
00128       const LocalOrdinal remainder = numCacheBlocks - quotient * numPartitions;
00129       const LocalOrdinal myNumCacheBlocks = 
00130   (partitionIndex < remainder) ? (quotient + 1) : quotient;
00131 
00132 #ifdef KNR_DEBUG
00133       // cerr << "Partition " << partitionIndex << ": quotient=" << quotient 
00134       //     << ", remainder=" << remainder << ", myNumCacheBlocks=" 
00135       //     << myNumCacheBlocks << endl;
00136 #endif // KNR_DEBUG
00137 
00138       // If there are no cache blocks, there is nothing to factor.
00139       // Return an empty cache block range to indicate this.
00140       if (myNumCacheBlocks == 0)
00141   return std::make_pair (LocalOrdinal(0), LocalOrdinal(0));
00142 
00143       // Index of my first cache block (inclusive).
00144       const LocalOrdinal myFirstCacheBlockIndex = 
00145   (partitionIndex < remainder) ? 
00146   partitionIndex * (quotient+1) :
00147   remainder * (quotient+1) + (partitionIndex - remainder) * quotient;
00148       // Index of my last cache block (exclusive).
00149       const LocalOrdinal myLastCacheBlockIndex = 
00150   (partitionIndex+1 < remainder) ? 
00151   (partitionIndex+1) * (quotient+1) :
00152   remainder * (quotient+1) + (partitionIndex+1 - remainder) * quotient;
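      // Worked example (illustrative values only): if numCacheBlocks = 10
      // and numPartitions = 4, then quotient = 2 and remainder = 2.
      // Partitions 0 and 1 each get 3 cache blocks, partitions 2 and 3
      // each get 2, and the resulting half-open index ranges are
      // [0,3), [3,6), [6,8), and [8,10).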
00153       // Sanity check.
00154       if (myLastCacheBlockIndex <= myFirstCacheBlockIndex)
00155   {
00156     std::ostringstream os;
00157     os << "Partition " << (partitionIndex+1) << " of " 
00158        << numPartitions << ":  My range of cache block indices [" 
00159        << myFirstCacheBlockIndex << ", " << myLastCacheBlockIndex 
00160        << ") is empty.";
00161     throw std::logic_error(os.str());
00162   }
00163       return std::make_pair (myFirstCacheBlockIndex, myLastCacheBlockIndex);
00164     }
00165 
00166 
00170     template<class LocalOrdinal, class Scalar>
00171     class FactorFirstPass {
00172     private:
00173       MatView<LocalOrdinal, Scalar> A_;
00174       // While tauArrays_ is shared among tasks (i.e., partitions),
00175       // there are no race conditions among entries, since each
00176       // partition writes its own entry.  Ditto for topBlocks_.
00177       std::vector<std::vector<Scalar> >& tauArrays_;
00178       std::vector<MatView<LocalOrdinal, Scalar> >& topBlocks_;
00179       CacheBlockingStrategy<LocalOrdinal, Scalar> strategy_;
00180       int numPartitions_;
00181       bool contiguousCacheBlocks_;
00182 
00183       std::vector<Scalar>
00184       factorFirstCacheBlock (Combine<LocalOrdinal, Scalar>& combine,
00185            const MatView<LocalOrdinal, Scalar>& A_top,
00186            std::vector<Scalar>& work)
00187       {
00188   std::vector<Scalar> tau (A_top.ncols());
00189 
00190   // We should only call this if A_top.ncols() > 0 and therefore
00191   // work.size() > 0, but we've already checked for that, so we
00192   // don't have to check again.
00193   combine.factor_first (A_top.nrows(), A_top.ncols(), A_top.get(), 
00194             A_top.lda(), &tau[0], &work[0]);
00195   return tau;
00196       }
00197 
00198       std::vector<Scalar>
00199       factorCacheBlock (Combine<LocalOrdinal, Scalar>& combine,
00200       const MatView<LocalOrdinal, Scalar>& A_top, 
00201       const MatView<LocalOrdinal, Scalar>& A_cur,
00202       std::vector<Scalar>& work)
00203       {
00204   std::vector<Scalar> tau (A_top.ncols());
00205 
00206   // We should only call this if A_top.ncols() > 0 and therefore
00207   // tau.size() > 0 and work.size() > 0, but we've already
00208   // checked for that, so we don't have to check again.
00209   combine.factor_inner (A_cur.nrows(), A_top.ncols(), 
00210             A_top.get(), A_top.lda(),
00211             A_cur.get(), A_cur.lda(),
00212             &tau[0], &work[0]);
00213   return tau;
00214       }
00215 
00222       MatView<LocalOrdinal, Scalar>
00223       factor (const std::pair<LocalOrdinal, LocalOrdinal> cbIndices,
00224         const int partitionIndex)
00225       {
00226 #ifdef KNR_DEBUG
00227   using std::cerr;
00228   using std::endl;
00229 #endif // KNR_DEBUG
00230 
00231   typedef MatView<LocalOrdinal, Scalar> view_type;
00232   typedef CacheBlockRange<view_type> range_type;
00233 
00234   // Workspace is created here, because it must not be shared
00235   // among threads.
00236   std::vector<Scalar> work (A_.ncols());
00237 
00238   // Range of cache blocks to factor.
00239   range_type cbRange (A_, strategy_, 
00240           cbIndices.first, 
00241           cbIndices.second, 
00242           contiguousCacheBlocks_);
00243   // Iterator in the forward direction over the range of cache
00244   // blocks to factor.
00245   typedef typename CacheBlockRange<view_type>::iterator range_iter_type;
00246   range_iter_type cbIter = cbRange.begin();
00247 
00248   // Remember the top (first) block.
00249   view_type A_top = *cbIter;
00250   if (A_top.empty())
00251     return A_top;
00252   TEST_FOR_EXCEPTION(cbIndices.first >= cbIndices.second,
00253          std::logic_error,
00254          "FactorFirstPass::factor: A_top is not empty, but "
00255          "the cache block index range " << cbIndices.first 
00256          << "," << cbIndices.second << " is empty.  Please "
00257          "report this bug to the Kokkos developers.");
00258 
00259   // Current cache block index.
00260   LocalOrdinal curTauIdx = cbIndices.first;
00261 
00262   // Factor the first cache block.
00263   Combine<LocalOrdinal, Scalar> combine;
00264   tauArrays_[curTauIdx++] = factorFirstCacheBlock (combine, A_top, work);
00265 
00266   // Move past the first cache block.
00267   ++cbIter;
00268 
00269   // Number of cache block(s) we have factored thus far.
00270   LocalOrdinal count = 1;
00271 
00272   // Factor the remaining cache block(s).
00273   range_iter_type cbEnd = cbRange.end();
00274   while (cbIter != cbEnd)
00275     {
00276       view_type A_cur = *cbIter;
00277       // Iteration over cache blocks of a partition should
00278       // always result in nonempty cache blocks.
00279       TEST_FOR_EXCEPTION(A_cur.empty(), std::logic_error,
00280              "FactorFirstPass::factor: The current cache bloc"
00281              "k (the " << count << "-th to factor in the rang"
00282              "e [" << cbIndices.first << ","
00283              << cbIndices.second << ") of cache block indices"
00284              ") in partition " << (partitionIndex+1) << " (ou"
00285              "t of " << numPartitions_ << " partitions) is em"
00286              "pty.  Please report this bug to the Kokkos deve"
00287              "lopers.");
00288       TEST_FOR_EXCEPTION(static_cast<size_t>(curTauIdx) >= tauArrays_.size(),
00289              std::logic_error,
00290              "FactorFirstPass::factor: curTauIdx (= " 
00291              << curTauIdx << ") >= tauArrays_.size() (= " 
00292              << tauArrays_.size() << ").  Please report this "
00293              "bug to the Kokkos developers.");
00294       tauArrays_[curTauIdx++] = 
00295         factorCacheBlock (combine, A_top, A_cur, work);
00296       ++count;
00297       ++cbIter;
00298     }
00299 #ifdef KNR_DEBUG
00300   cerr << "Factored " << count << " cache blocks" << endl;
00301 #endif // KNR_DEBUG
00302   return A_top;
00303       }
00304 
00305     public:
00326       FactorFirstPass (const MatView<LocalOrdinal,Scalar>& A, 
00327            std::vector<std::vector<Scalar> >& tauArrays,
00328            std::vector<MatView<LocalOrdinal, Scalar> >& topBlocks,
00329            const CacheBlockingStrategy<LocalOrdinal, Scalar>& strategy,
00330            const int numPartitions,
00331            const bool contiguousCacheBlocks = false) :
00332   A_ (A), 
00333   tauArrays_ (tauArrays), 
00334   topBlocks_ (topBlocks),
00335   strategy_ (strategy), 
00336   numPartitions_ (numPartitions),
00337   contiguousCacheBlocks_ (contiguousCacheBlocks)
00338       {
00339   TEST_FOR_EXCEPTION(A_.empty(), std::logic_error,
00340          "TSQR::FactorFirstPass constructor: A is empty.  "
00341          "Please report this bug to the Kokkos developers.");
00342   TEST_FOR_EXCEPTION(numPartitions < 1, std::logic_error,
00343          "TSQR::FactorFirstPass constructor: numPartitions "
00344          "must be positive, but numPartitions = " 
00345          << numPartitions << ".  Please report this bug to "
00346          "the Kokkos developers.");
00347       }
00348 
00379       void 
00380       execute (const int partitionIndex) 
00381       {
00382 #ifdef KNR_DEBUG
00383   using std::cerr;
00384   using std::endl;
00385   // cerr << "FactorFirstPass::execute (" << partitionIndex << ")" << endl;
00386 #endif // KNR_DEBUG
00387 
00388   if (partitionIndex < 0 || partitionIndex >= numPartitions_)
00389     return;
00390   else if (A_.empty())
00391     return;
00392   else 
00393     {
00394       const std::pair<LocalOrdinal, LocalOrdinal> cbIndices = 
00395         cacheBlockIndexRange (A_.nrows(), A_.ncols(), partitionIndex, 
00396             numPartitions_, strategy_);
00397 #ifdef KNR_DEBUG
00398       cerr << "Partition " << partitionIndex 
00399      << ": Factoring cache block indices ["
00400      << cbIndices.first << ", " << cbIndices.second << ")"
00401      << endl;
00402 #endif // KNR_DEBUG
00403       // It's legitimate, though suboptimal, for some partitions
00404       // not to get any work to do (in this case, not to get any
00405       // cache blocks to factor).
00406       if (cbIndices.second <= cbIndices.first)
00407         return;
00408       else
00409         topBlocks_[partitionIndex] = factor (cbIndices, partitionIndex);
00410     }
00411       }
00412     };
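    // Usage sketch (illustrative; the actual call site is
    // KokkosNodeTsqr::factorImpl below).  FactorFirstPass is a
    // "work-data pair" in the Kokkos Node API sense: it is handed to
    // the Node's parallel_for, which invokes execute(partitionIndex)
    // once per partition:
    //
    //   FactorFirstPass<LO, S> firstPass (A, tauArrays, topBlocks,
    //                                     strategy, numPartitions);
    //   node->parallel_for (0, numPartitions, firstPass);
    //
    // Here LO, S, A, tauArrays, topBlocks, strategy, numPartitions,
    // and node stand for caller-supplied values.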
00413 
00414 
00423     template<class LocalOrdinal, class Scalar>
00424     class ApplyFirstPass {
00425     private:
00426       ApplyType applyType_;
00427       ConstMatView<LocalOrdinal, Scalar> Q_;
00428       const std::vector<std::vector<Scalar> >& tauArrays_;
00429       const std::vector<MatView<LocalOrdinal, Scalar> >& topBlocks_;
00430       MatView<LocalOrdinal, Scalar> C_;
00431       CacheBlockingStrategy<LocalOrdinal, Scalar> strategy_;
00432       int numPartitions_;
00433       bool explicitQ_, contiguousCacheBlocks_;
00434 
00435       void
00436       applyFirstCacheBlock (Combine<LocalOrdinal, Scalar>& combine,
00437           const ApplyType& applyType,
00438           const ConstMatView<LocalOrdinal, Scalar>& Q_top,
00439           const std::vector<Scalar>& tau,
00440           const MatView<LocalOrdinal, Scalar>& C_top,
00441           std::vector<Scalar>& work)
00442       {
00443   TEST_FOR_EXCEPTION(tau.size() < static_cast<size_t> (Q_top.ncols()), 
00444          std::logic_error,
00445          "ApplyFirstPass::applyFirstCacheBlock: tau.size() "
00446          "(= " << tau.size() << ") < number of columns " 
00447          << Q_top.ncols() << " in the Q factor.  Please "
00448          "report this bug to the Kokkos developers.");
00449 
00450   // If we get this far, it's fair to assume that we have
00451   // checked whether tau and work have nonzero lengths.
00452   combine.apply_first (applyType, C_top.nrows(), C_top.ncols(),
00453            Q_top.ncols(), Q_top.get(), Q_top.lda(),
00454            &tau[0], C_top.get(), C_top.lda(), &work[0]);
00455       }
00456 
00457       void
00458       applyCacheBlock (Combine<LocalOrdinal, Scalar>& combine,
00459            const ApplyType& applyType,
00460            const ConstMatView<LocalOrdinal, Scalar>& Q_cur,
00461            const std::vector<Scalar>& tau,
00462            const MatView<LocalOrdinal, Scalar>& C_top,
00463            const MatView<LocalOrdinal, Scalar>& C_cur,
00464            std::vector<Scalar>& work)
00465       {
00466   TEST_FOR_EXCEPTION(tau.size() < static_cast<size_t> (Q_cur.ncols()), 
00467          std::logic_error,
00468          "ApplyFirstPass::applyCacheBlock: tau.size() "
00469          "(= " << tau.size() << ") < number of columns " 
00470          << Q_cur.ncols() << " in the Q factor.  Please "
00471          "report this bug to the Kokkos developers.");
00472 
00473   // If we get this far, it's fair to assume that we have
00474   // checked whether tau and work have nonzero lengths.
00475   combine.apply_inner (applyType, C_cur.nrows(), C_cur.ncols(), 
00476            Q_cur.ncols(), Q_cur.get(), Q_cur.lda(), 
00477            &tau[0],
00478            C_top.get(), C_top.lda(),
00479            C_cur.get(), C_cur.lda(),
00480            &work[0]);
00481       }
00482 
00492       void
00493       apply (const ApplyType& applyType,
00494        const std::pair<LocalOrdinal, LocalOrdinal> cbIndices,
00495        const int partitionIndex)
00496       {
00497 #ifdef KNR_DEBUG
00498   using std::cerr;
00499   using std::endl;
00500 #endif // KNR_DEBUG
00501   typedef ConstMatView<LocalOrdinal, Scalar> const_view_type;
00502   typedef MatView<LocalOrdinal, Scalar> view_type;
00503   typedef CacheBlockRange<const_view_type> const_range_type;
00504   typedef CacheBlockRange<view_type> range_type;
00505   typedef CacheBlocker<LocalOrdinal, Scalar> blocker_type;
00506 
00507   if (cbIndices.first >= cbIndices.second)
00508     return; // My range of cache blocks is empty; nothing to do
00509 
00510   // Q_range: Range of cache blocks in the Q factor.
00511   // C_range: Range of cache blocks in the matrix C.
00512   const_range_type Q_range (Q_, strategy_, 
00513           cbIndices.first, cbIndices.second,
00514           contiguousCacheBlocks_);
00515   range_type C_range (C_, strategy_, 
00516           cbIndices.first, cbIndices.second,
00517           contiguousCacheBlocks_);
00518   TEST_FOR_EXCEPTION(Q_range.empty(), std::logic_error,
00519          "Q_range is empty, but the range of cache block "
00520          "indices [" << cbIndices.first << ", " 
00521          << cbIndices.second << ") is not empty.  Please "
00522          "report this bug to the Kokkos developers.");
00523   TEST_FOR_EXCEPTION(C_range.empty(), std::logic_error,
00524          "C_range is empty, but the range of cache block "
00525          "indices [" << cbIndices.first << ", " 
00526          << cbIndices.second << ") is not empty.  Please "
00527          "report this bug to the Kokkos developers.");
00528 
00529   // Task-local workspace array of length C_.ncols().  Workspace
00530   // must be per task, else there will be race conditions as
00531   // different tasks attempt to write to and read from the same
00532   // workspace simultaneously.
00533   std::vector<Scalar> work (C_.ncols());
00534 
00535   Combine<LocalOrdinal, Scalar> combine;
00536   if (applyType.transposed())
00537     {
00538       typename const_range_type::iterator Q_rangeIter = Q_range.begin();
00539       typename range_type::iterator C_rangeIter = C_range.begin();
00540       TEST_FOR_EXCEPTION(Q_rangeIter == Q_range.end(), std::logic_error,
00541              "The Q cache block range claims to be nonempty, "
00542              "but the iterator range is empty.  Please report"
00543              " this bug to the Kokkos developers.");
00544       TEST_FOR_EXCEPTION(C_rangeIter == C_range.end(), std::logic_error,
00545              "The C cache block range claims to be nonempty, "
00546              "but the iterator range is empty.  Please report"
00547              " this bug to the Kokkos developers.");
00548 
00549       // Q_top: Topmost cache block in the cache block range of Q.
00550       // C_top: Topmost cache block in the cache block range of C.
00551       const_view_type Q_top = *Q_rangeIter;
00552       view_type C_top = *C_rangeIter;
00553       if (explicitQ_)
00554         { 
00555     C_top.fill (Teuchos::ScalarTraits<Scalar>::zero());
00556     if (partitionIndex == 0)
00557       for (LocalOrdinal j = 0; j < C_top.ncols(); ++j)
00558         C_top(j,j) = Teuchos::ScalarTraits<Scalar>::one();
00559         }
00560       LocalOrdinal curTauIndex = cbIndices.first;
00561 
00562       // Apply the first block.
00563       applyFirstCacheBlock (combine, applyType, Q_top, 
00564           tauArrays_[curTauIndex++], C_top, work);
00565 
00566       // Apply the rest of the blocks, if any.
00567       ++Q_rangeIter;
00568       ++C_rangeIter;
00569       while (Q_rangeIter != Q_range.end())
00570         {
00571     TEST_FOR_EXCEPTION(C_rangeIter == C_range.end(),
00572            std::logic_error,
00573            "When applying Q^T or Q^H to C: The Q cache "
00574            "block iterator is not yet at the end, but "
00575            "the C cache block iterator is.  Please "
00576            "report this bug to the Kokkos developers.");
00577     const_view_type Q_cur = *Q_rangeIter;
00578     view_type C_cur = *C_rangeIter;
00579     ++Q_rangeIter;
00580     ++C_rangeIter;
00581     if (explicitQ_)
00582       C_cur.fill (Teuchos::ScalarTraits<Scalar>::zero());
00583     applyCacheBlock (combine, applyType, Q_cur, 
00584          tauArrays_[curTauIndex++], 
00585          C_top, C_cur, work);
00586         }
00587     }
00588   else
00589     {
00590       // Q_top: Topmost cache block in the cache block range of Q.
00591       // C_top: Topmost cache block in the cache block range of C.
00592       const_view_type Q_top = *(Q_range.begin());
00593       view_type C_top = *(C_range.begin());
00594 
00595       if (explicitQ_)
00596         {
00597     // We've already filled the top ncols x ncols block of
00598     // C_top with data (that's the result of applying the
00599     // internode part of the Q factor via DistTsqr).
00600     // However, we still need to fill the rest of C_top
00601     // (everything but the top ncols rows of C_top) with
00602     // zeros.
00603     view_type C_top_rest (C_top.nrows() - C_top.ncols(), C_top.ncols(),
00604               C_top.get() + C_top.ncols(), C_top.lda());
00605     C_top_rest.fill (Teuchos::ScalarTraits<Scalar>::zero());
00606         }
00607       LocalOrdinal curTauIndex = cbIndices.second-1;
00608 
00609       // When applying Q (rather than Q^T or Q^H), we apply the
00610       // cache blocks in reverse order.
00611       typename const_range_type::iterator Q_rangeIter = Q_range.rbegin();
00612       typename range_type::iterator C_rangeIter = C_range.rbegin();
00613       TEST_FOR_EXCEPTION(Q_rangeIter == Q_range.rend(), std::logic_error,
00614              "The Q cache block range claims to be nonempty, "
00615              "but the iterator range is empty.  Please report"
00616              " this bug to the Kokkos developers.");
00617       TEST_FOR_EXCEPTION(C_rangeIter == C_range.rend(), std::logic_error,
00618              "The C cache block range claims to be nonempty, "
00619              "but the iterator range is empty.  Please report"
00620              " this bug to the Kokkos developers.");
00621 
00622       // Equality of cache block range iterators only tests the
00623       // cache block index, not reverse-ness.  This means we can
00624       // compare a reverse-direction iterator (Q_rangeIter) with
00625       // a forward-direction iterator (Q_range.begin()).
00626       //
00627       // We do this because we need to handle the topmost block
00628       // of Q_range separately (applyFirstCacheBlock(), rather
00629       // than applyCacheBlock()).
00630       while (Q_rangeIter != Q_range.begin())
00631         {
00632     const_view_type Q_cur = *Q_rangeIter;
00633     view_type C_cur = *C_rangeIter;
00634 
00635     if (explicitQ_)
00636       C_cur.fill (Teuchos::ScalarTraits<Scalar>::zero());
00637 #ifdef KNR_DEBUG
00638     cerr << "tauArrays_[curTauIndex=" << curTauIndex << "].size() = " 
00639          << tauArrays_[curTauIndex].size() << endl;
00640 #endif // KNR_DEBUG
00641     TEST_FOR_EXCEPTION(curTauIndex < cbIndices.first, std::logic_error,
00642            "curTauIndex=" << curTauIndex << " out of valid "
00643            "range [" << cbIndices.first << "," 
00644            << cbIndices.second << ").  Please report this "
00645            "bug to the Kokkos developers.");
00646     applyCacheBlock (combine, applyType, Q_cur, 
00647          tauArrays_[curTauIndex--], 
00648          C_top, C_cur, work);
00649     ++Q_rangeIter;
00650     ++C_rangeIter;
00651         }
00652       TEST_FOR_EXCEPTION(curTauIndex < cbIndices.first, std::logic_error,
00653              "curTauIndex=" << curTauIndex << " out of valid "
00654              "range [" << cbIndices.first << "," 
00655              << cbIndices.second << ").  Please report this "
00656              "bug to the Kokkos developers.");
00657 #ifdef KNR_DEBUG
00658       cerr << "tauArrays_[curTauIndex=" << curTauIndex << "].size() = " 
00659      << tauArrays_[curTauIndex].size() << endl;
00660 #endif // KNR_DEBUG
00661       // Apply the first block.
00662       applyFirstCacheBlock (combine, applyType, Q_top, 
00663           tauArrays_[curTauIndex--], C_top, work);
00664     }
00665       }
00666 
00667     public:
00689       ApplyFirstPass (const ApplyType& applyType,
00690           const ConstMatView<LocalOrdinal,Scalar>& Q, 
00691           const std::vector<std::vector<Scalar> >& tauArrays,
00692           const std::vector<MatView<LocalOrdinal, Scalar> >& topBlocks,
00693           const MatView<LocalOrdinal,Scalar>& C,
00694           const CacheBlockingStrategy<LocalOrdinal, Scalar>& strategy,
00695           const int numPartitions,
00696           const bool explicitQ = false,
00697           const bool contiguousCacheBlocks = false) :
00698   applyType_ (applyType),
00699   Q_ (Q),
00700   tauArrays_ (tauArrays), 
00701   topBlocks_ (topBlocks),
00702   C_ (C),
00703   strategy_ (strategy), 
00704   numPartitions_ (numPartitions),
00705   explicitQ_ (explicitQ),
00706   contiguousCacheBlocks_ (contiguousCacheBlocks)
00707       {}
00708 
00734       void 
00735       execute (const int partitionIndex) 
00736       {
00737   if (partitionIndex < 0 || partitionIndex >= numPartitions_)
00738     return;
00739   else if (Q_.empty())
00740     return;
00741   else if (C_.empty())
00742     return;
00743 
00744   // We use the same cache block indices for Q and for C.
00745   std::pair<LocalOrdinal, LocalOrdinal> cbIndices = 
00746     cacheBlockIndexRange (Q_.nrows(), Q_.ncols(), partitionIndex, 
00747         numPartitions_, strategy_);
00748   if (cbIndices.second <= cbIndices.first)
00749     return;
00750   {
00751     std::pair<size_t, size_t> cbInds (static_cast<size_t> (cbIndices.first),
00752               static_cast<size_t> (cbIndices.second));
00753     TEST_FOR_EXCEPTION(cbIndices.first < static_cast<LocalOrdinal>(0), 
00754            std::logic_error,
00755            "TSQR::ApplyFirstPass::execute: cacheBlockIndexRa"
00756            "nge(" << Q_.nrows() << ", " << Q_.ncols() << ", "
00757            << partitionIndex << ", " << numPartitions_ << ","
00758            " strategy) returned a cache block range " 
00759            << cbIndices.first << "," << cbIndices.second 
00760            << " with negative starting index.  Please report"
00761            " this bug to the Kokkos developers.");
00762     TEST_FOR_EXCEPTION(cbInds.second > tauArrays_.size(),
00763            std::logic_error,
00764            "TSQR::ApplyFirstPass::execute: cacheBlockIndexRa"
00765            "nge(" << Q_.nrows() << ", " << Q_.ncols() << ", " 
00766            << partitionIndex << ", " << numPartitions_ << ","
00767            " strategy) returned a cache block range "
00768            << cbIndices.first << "," << cbIndices.second 
00769            << " with starting index larger than the number o"
00770            "f tau arrays " << tauArrays_.size() << ".  Pleas"
00771            "e report this bug to the Kokkos developers.");
00772   }
00773 
00774   apply (applyType_, cbIndices, partitionIndex);
00775       }
00776 
00777     };
00778 
00779 
00783     template<class LocalOrdinal, class Scalar>
00784     class CacheBlockWDP {
00785     private:
00786       typedef ConstMatView<LocalOrdinal, Scalar> const_view_type;
00787       typedef MatView<LocalOrdinal, Scalar> view_type;
00788       typedef CacheBlockRange<const_view_type> const_range_type;
00789       typedef CacheBlockRange<view_type> range_type;
00790       
00791       const_view_type A_in_;
00792       view_type A_out_;
00793       CacheBlockingStrategy<LocalOrdinal, Scalar> strategy_;
00794       int numPartitions_;
00795       bool unblock_;
00796 
00801       void
00802       copyRange (const_range_type& cbInputRange, range_type& cbOutputRange)
00803       {
00804   typedef typename const_range_type::iterator input_iter_type;
00805   typedef typename range_type::iterator output_iter_type;
00806   
00807   input_iter_type inputIter = cbInputRange.begin();
00808   output_iter_type outputIter = cbOutputRange.begin();
00809 
00810   input_iter_type inputEnd = cbInputRange.end();
00811   output_iter_type outputEnd = cbOutputRange.end();
00812   
00813   while (inputIter != inputEnd)
00814     {
00815       const_view_type A_in_cur = *inputIter;
00816       view_type A_out_cur = *outputIter;
00817       A_out_cur.copy (A_in_cur);
00818       ++inputIter;
00819       ++outputIter;
00820     }
00821       }
00822 
00823     public:
00835       CacheBlockWDP (const ConstMatView<LocalOrdinal, Scalar> A_in,
00836          const MatView<LocalOrdinal, Scalar> A_out,
00837          const CacheBlockingStrategy<LocalOrdinal, Scalar>& strategy,
00838          const int numPartitions,
00839          const bool unblock) :
00840   A_in_ (A_in), 
00841   A_out_ (A_out), 
00842   strategy_ (strategy), 
00843   numPartitions_ (numPartitions),
00844   unblock_ (unblock)
00845       {
00846   TEST_FOR_EXCEPTION(A_in_.nrows() != A_out_.nrows() || 
00847          A_in_.ncols() != A_out_.ncols(), 
00848          std::invalid_argument,
00849          "A_in and A_out do not have the same dimensions: "
00850          "A_in is " << A_in_.nrows() << " by " 
00851          << A_in_.ncols() << ", but A_out is " 
00852          << A_out_.nrows() << " by " 
00853          << A_out_.ncols() << ".");
00854   TEST_FOR_EXCEPTION(numPartitions_ < 1, 
00855          std::invalid_argument,
00856          "The number of partitions " << numPartitions_ 
00857          << " is not a positive integer.");
00858       }
00859       
00865       void 
00866       execute (const int partitionIndex)
00867       {
00868   if (partitionIndex < 0 || partitionIndex >= numPartitions_ || 
00869       A_in_.empty())
00870     return;
00871   else
00872     {
00873       typedef std::pair<LocalOrdinal, LocalOrdinal> index_range_type;
00874       const index_range_type cbIndices = 
00875         cacheBlockIndexRange (A_in_.nrows(), A_in_.ncols(), 
00876             partitionIndex, numPartitions_, strategy_);
00877       // It's perfectly legal for a partitioning to assign zero
00878       // cache block indices to a particular partition.  In that
00879       // case, this task has nothing to do.
00880       if (cbIndices.first >= cbIndices.second)
00881         return;
00882       else
00883         {
00884     // If unblock_ is false, then A_in_ is in column-major
00885     // order, and we want to cache-block it into A_out_.  If
00886     // unblock_ is true, then A_in_ is cache-blocked, and we
00887     // want to un-cache-block it into A_out_ (a matrix in
00888     // column-major order).
00889     const_range_type inputRange (A_in_, strategy_, cbIndices.first,
00890                cbIndices.second, unblock_);
00891     range_type outputRange (A_out_, strategy_, cbIndices.first, 
00892           cbIndices.second, ! unblock_);
00893     copyRange (inputRange, outputRange);
00894         }
00895     }
00896       }
00897     };
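    // Usage sketch (illustrative; the actual call sites are
    // KokkosNodeTsqr::cache_block and un_cache_block below).  With
    // unblock = false, CacheBlockWDP copies a column-major A_in into
    // contiguously stored cache blocks in A_out; with unblock = true,
    // it performs the reverse copy:
    //
    //   CacheBlockWDP<LO, S> blocker (A_in, A_out, strategy,
    //                                 numPartitions, false);
    //   node->parallel_for (0, numPartitions, blocker);
    //
    // LO, S, A_in, A_out, strategy, numPartitions, and node stand for
    // caller-supplied values.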
00898 
00902     template<class LocalOrdinal, class Scalar>
00903     class MultWDP {
00904     private:
00905       typedef ConstMatView<LocalOrdinal, Scalar> const_view_type;
00906       typedef MatView<LocalOrdinal, Scalar> view_type;
00907       typedef CacheBlockRange<view_type> range_type;
00908 
00909       view_type Q_;
00910       const_view_type B_;
00911       CacheBlockingStrategy<LocalOrdinal, Scalar> strategy_;
00912       int numPartitions_;
00913       bool contiguousCacheBlocks_;
00914 
00915       void
00916       multBlock (BLAS<LocalOrdinal, Scalar>& blas,
00917      const view_type& Q_cur,
00918      Matrix<LocalOrdinal, Scalar>& Q_temp)
00919       {
00920   const LocalOrdinal numCols = Q_cur.ncols();
00921 
00922   // GEMM doesn't like aliased arguments, so we use a copy.  We
00923   // only copy the current cache block, rather than all of Q;
00924   // this saves memory.
00925   Q_temp.reshape (Q_cur.nrows(), numCols);
00926   Q_temp.copy (Q_cur);
00927 
00928   // Q_cur := Q_temp * B.
00929   blas.GEMM ("N", "N", Q_cur.nrows(), numCols, numCols, 
00930        Teuchos::ScalarTraits<Scalar>::one(),
00931        Q_temp.get(), Q_temp.lda(), B_.get(), B_.lda(),
00932        Teuchos::ScalarTraits<Scalar>::zero(), Q_cur.get(), Q_cur.lda());
00933       }
00934 
00938       void
00939       multRange (range_type& cbRange)
00940       {
00941   typedef typename range_type::iterator iter_type;
00942   iter_type iter = cbRange.begin();
00943   iter_type end = cbRange.end();
00944 
00945   // Temporary storage for the BLAS' matrix-matrix multiply
00946   // routine (which forbids aliasing of any input argument and
00947   // the output argument).
00948   Matrix<LocalOrdinal, Scalar> Q_temp;
00949   BLAS<LocalOrdinal, Scalar> blas;
00950   while (iter != end)
00951     {
00952       view_type Q_cur = *iter;
00953       multBlock (blas, Q_cur, Q_temp);
00954       ++iter;
00955     }
00956       }
00957 
00958     public:
00968       MultWDP (const MatView<LocalOrdinal, Scalar> Q,
00969          const ConstMatView<LocalOrdinal, Scalar> B,
00970          const CacheBlockingStrategy<LocalOrdinal, Scalar>& strategy,
00971          const int numPartitions,
00972          const bool contiguousCacheBlocks) :
00973   Q_ (Q), 
00974   B_ (B),
00975   strategy_ (strategy), 
00976   numPartitions_ (numPartitions),
00977   contiguousCacheBlocks_ (contiguousCacheBlocks)
00978       {}
00979 
00985       void
00986       execute (const int partitionIndex) 
00987       {
00988   if (partitionIndex < 0 || partitionIndex >= numPartitions_ || 
00989       Q_.empty())
00990     return;
00991   else
00992     {
00993       typedef std::pair<LocalOrdinal, LocalOrdinal> index_range_type;
00994       const index_range_type cbIndices = 
00995         cacheBlockIndexRange (Q_.nrows(), Q_.ncols(), partitionIndex, 
00996             numPartitions_, strategy_);
00997       if (cbIndices.first >= cbIndices.second)
00998         return;
00999       else
01000         {
01001     range_type range (Q_, strategy_, cbIndices.first, 
01002           cbIndices.second, contiguousCacheBlocks_);
01003     multRange (range);
01004         }
01005     }
01006       }
01007     };
01008 
01009 
01013     template<class LocalOrdinal, class Scalar>
01014     class FillWDP {
01015     private:
01016       typedef MatView<LocalOrdinal, Scalar> view_type;
01017       typedef CacheBlockRange<view_type> range_type;
01018 
01019       view_type A_;
01020       CacheBlockingStrategy<LocalOrdinal, Scalar> strategy_;
01021       const Scalar value_;
01022       int numPartitions_;
01023       bool contiguousCacheBlocks_;
01024 
01026       void
01027       fillRange (range_type& cbRange, const Scalar value)
01028       {
01029   typedef typename range_type::iterator iter_type;
01030   iter_type iter = cbRange.begin();
01031   iter_type end = cbRange.end();
01032   while (iter != end)
01033     {
01034       view_type A_cur = *iter;
01035       A_cur.fill (value);
01036       ++iter;
01037     }
01038       }
01039 
01040     public:
01050       FillWDP (const MatView<LocalOrdinal, Scalar> A,
01051          const CacheBlockingStrategy<LocalOrdinal, Scalar>& strategy,
01052          const Scalar value,
01053          const int numPartitions,
01054          const bool contiguousCacheBlocks) :
01055   A_ (A),
01056   strategy_ (strategy), 
01057   value_ (value),
01058   numPartitions_ (numPartitions),
01059   contiguousCacheBlocks_ (contiguousCacheBlocks)
01060       {}
01061 
01067       void
01068       execute (const int partitionIndex) 
01069       {
01070   if (partitionIndex < 0 || partitionIndex >= numPartitions_ || 
01071       A_.empty())
01072     return;
01073   else
01074     {
01075       typedef std::pair<LocalOrdinal, LocalOrdinal> index_range_type;
01076       const index_range_type cbIndices = 
01077         cacheBlockIndexRange (A_.nrows(), A_.ncols(), partitionIndex, 
01078             numPartitions_, strategy_);
01079       if (cbIndices.first >= cbIndices.second)
01080         return;
01081       else 
01082         {
01083     range_type range (A_, strategy_, cbIndices.first, 
01084           cbIndices.second, contiguousCacheBlocks_);
01085     fillRange (range, value_);
01086         }
01087     }
01088       }
01089     };
01090   } // namespace details
01091 
01102   template<class LocalOrdinal, class Scalar>
01103   struct KokkosNodeTsqrFactorOutput {
01112     KokkosNodeTsqrFactorOutput (const size_t theNumCacheBlocks, 
01113         const int theNumPartitions) :
01114       firstPassTauArrays (theNumCacheBlocks)
01115     {
01116       // Protect the cast to size_t from a negative number of
01117       // partitions.
01118       TEST_FOR_EXCEPTION(theNumPartitions < 1, std::invalid_argument,
01119        "TSQR::KokkosNodeTsqrFactorOutput: Invalid number of "
01120        "partitions " << theNumPartitions << "; number of "
01121        "partitions must be a positive integer.");
01122       // If there's only one partition, we don't even need a second
01123       // pass (it's just sequential TSQR), and we don't need a TAU
01124       // array for the top partition.
01125       secondPassTauArrays.resize (static_cast<size_t> (theNumPartitions-1));
01126       topBlocks.resize (static_cast<size_t> (theNumPartitions));
01127     }
01128 
01130     int numCacheBlocks() const { return firstPassTauArrays.size(); }
01131 
01133     int numPartitions() const { return topBlocks.size(); }
01134 
01136     std::vector<std::vector<Scalar> > firstPassTauArrays;
01137 
01151     std::vector<std::vector<Scalar> > secondPassTauArrays;
01152 
01156     std::vector<MatView<LocalOrdinal, Scalar> > topBlocks;
01157   };
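  // Illustrative sizing example: for a factorization with 10 cache
  // blocks spread over 4 partitions, firstPassTauArrays has 10 entries
  // (one per cache block), secondPassTauArrays has 3 entries (one per
  // partition other than the first), and topBlocks has 4 entries (one
  // view per partition).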
01158 
01175   template<class LocalOrdinal, class Scalar, class NodeType>    
01176   class KokkosNodeTsqr : 
01177     public NodeTsqr<LocalOrdinal, Scalar, KokkosNodeTsqrFactorOutput<LocalOrdinal, Scalar> >,
01178     public Teuchos::ParameterListAcceptorDefaultBase
01179   {
01180   public:
01181     typedef LocalOrdinal local_ordinal_type;
01182     typedef Scalar scalar_type;
01183     typedef NodeType node_type;
01184 
01187     typedef typename NodeTsqr<LocalOrdinal, Scalar, KokkosNodeTsqrFactorOutput<LocalOrdinal, Scalar> >::factor_output_type FactorOutput;
01188 
01189   private:
01191     Combine<LocalOrdinal, Scalar> combine_;
01192 
01194     mutable std::vector<Scalar> work_;
01195 
01197     Teuchos::RCP<const node_type> node_;
01198 
01200     CacheBlockingStrategy<LocalOrdinal, Scalar> strategy_;
01201 
01208     int numPartitions_;
01209 
01224     int 
01225     defaultNumPartitions () const
01226     {
01227       // Currently the Kokkos Node API does not give us access to the
01228       // amount of available parallelism, so we return a constant.
01229       return 8;
01230     }
01231 
01232   public:
01238     KokkosNodeTsqr (const Teuchos::RCP<const node_type>& node,
01239         const Teuchos::RCP<Teuchos::ParameterList>& params) :
01240       node_ (node)
01241     {
01242       setParameterList (params);
01243     }
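    // Construction sketch (illustrative values; the parameter names
    // come from getValidParameters() below).  Assuming a Kokkos Node
    // type node_type and a Teuchos::RCP<const node_type> node:
    //
    //   Teuchos::RCP<Teuchos::ParameterList> plist =
    //     Teuchos::parameterList ("Intranode TSQR");
    //   plist->set ("Cache Size Hint", static_cast<size_t> (65536));
    //   plist->set ("Num Partitions", 16);
    //   KokkosNodeTsqr<int, double, node_type> tsqr (node, plist);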
01244 
01248     std::string description () const {
01249       using Teuchos::TypeNameTraits;
01250       std::ostringstream os;
01251       os << "KokkosNodeTsqr<LocalOrdinal=" 
01252    << TypeNameTraits<LocalOrdinal>::name()
01253    << ", Scalar=" 
01254    << TypeNameTraits<Scalar>::name()
01255    << ", NodeType="
01256    << TypeNameTraits<node_type>::name()
01257    << ">: \"Cache Size Hint\"=" << strategy_.cache_size_hint()
01258    << ", \"Size of Scalar\"=" << strategy_.size_of_scalar()
01259    << ", \"Num Partitions\"=" << numPartitions_;
01260       return os.str();
01261     }
01262 
01270     void
01271     setParameterList (const Teuchos::RCP<Teuchos::ParameterList>& paramList) 
01272     {
01273       using Teuchos::ParameterList;
01274       using Teuchos::parameterList;
01275       using Teuchos::RCP;
01276       using Teuchos::rcp;
01277 
01278       RCP<ParameterList> plist;
01279       if (paramList.is_null())
01280   plist = rcp (new ParameterList (*getValidParameters ()));
01281       else
01282   {
01283     plist = paramList;
01284     plist->validateParametersAndSetDefaults (*getValidParameters ());
01285   }
01286       // Get values of parameters.  We do this "transactionally" so
01287       // that (except for validation and filling in defaults above)
01288       // this method has the strong exception guarantee (it either
01289       // returns, or throws an exception with no externally visible
01290       // side effects).
01291       size_t cacheSizeHint, sizeOfScalar;
01292       int numPartitions;
01293       try {
01294   cacheSizeHint = plist->get<size_t> ("Cache Size Hint");
01295   sizeOfScalar = plist->get<size_t> ("Size of Scalar");
01296   numPartitions = plist->get<int> ("Num Partitions");
01297       } catch (Teuchos::Exceptions::InvalidParameter& e) {
01298   std::ostringstream os;
01299   os << "Failed to read default parameters after setting defaults.  Pleas"
01300     "e report this bug to the Kokkos developers.  Original exception mess"
01301     "age: " << e.what();
01302   throw std::logic_error(os.str());
01303       }
01304       numPartitions_ = numPartitions;
01305 
01306       // Recreate the cache blocking strategy.
01307       typedef CacheBlockingStrategy<LocalOrdinal, Scalar> strategy_type;
01308       strategy_ = strategy_type (cacheSizeHint, sizeOfScalar);
01309 
01310       // Save the input parameter list.
01311       setMyParamList (plist);
01312     }
01313 
01330     Teuchos::RCP<const Teuchos::ParameterList> 
01331     getValidParameters() const
01332     {
01333       using Teuchos::ParameterList;
01334       using Teuchos::parameterList;
01335       using Teuchos::RCP;
01336 
01337       RCP<ParameterList> params = parameterList ("Intranode TSQR");
01338       params->set ("Cache Size Hint", 
01339        static_cast<size_t>(0), 
01340        std::string("Cache size in bytes; a hint for TSQR.  Set to t"
01341              "he size of the largest private cache per CPU co"
01342              "re, or the fraction of shared cache per core.  "
01343              "If zero, we pick a reasonable default."));
01344       params->set ("Size of Scalar", 
01345        sizeof(Scalar),
01346        std::string ("Size in bytes of the Scalar type.  In most "
01347         "cases, the default sizeof(Scalar) is fine.  "
01348         "Set a non-default value only when Scalar's "
01349         "data is dynamically allocated (such as for a "
01350         "type with precision variable at run time)."));
01351 
01352       // The number of partitions is an int rather than a
01353       // LocalOrdinal, to ensure that it is always stored with the
01354       // same type, despite the type of LocalOrdinal.  Besides, Kokkos
01355       // wants an int anyway.
01356       params->set ("Num Partitions",
01357        defaultNumPartitions (),
01358        std::string ("Number of partitions; the maximum available pa"
01359         "rallelelism in intranode TSQR.  Slight oversub"
01360         "scription is OK; undersubscription may have a "
01361         "performance cost."));
01362       return params;
01363     }
01364 
01366     FactorOutput
01367     factor (const LocalOrdinal numRows, 
01368       const LocalOrdinal numCols,
01369       Scalar A[],
01370       const LocalOrdinal lda,
01371       Scalar R[],
01372       const LocalOrdinal ldr,
01373       const bool contiguousCacheBlocks) const
01374     {
01375       MatView<LocalOrdinal, Scalar> A_view (numRows, numCols, A, lda);
01376       MatView<LocalOrdinal, Scalar> R_view (numCols, numCols, R, ldr);
01377       return factorImpl (A_view, R_view, contiguousCacheBlocks);
01378     }
01379     
01380 
01382     void
01383     apply (const ApplyType& applyType,
01384      const LocalOrdinal nrows,
01385      const LocalOrdinal ncols_Q,
01386      const Scalar Q[],
01387      const LocalOrdinal ldq,
01388      const FactorOutput& factorOutput,
01389      const LocalOrdinal ncols_C,
01390      Scalar C[],
01391      const LocalOrdinal ldc,
01392      const bool contiguousCacheBlocks) const
01393     {
01394       typedef ConstMatView<LocalOrdinal, Scalar> const_view_type;
01395       typedef MatView<LocalOrdinal, Scalar> view_type;
01396 
01397       const_view_type Q_view (nrows, ncols_Q, Q, ldq);
01398       view_type C_view (nrows, ncols_C, C, ldc);
01399       applyImpl (applyType, Q_view, factorOutput, C_view,
01400      false, contiguousCacheBlocks);
01401     }
01402 
01404     void
01405     explicit_Q (const LocalOrdinal nrows,
01406     const LocalOrdinal ncols_Q,
01407     const Scalar Q[],
01408     const LocalOrdinal ldq,
01409     const FactorOutput& factorOutput,
01410     const LocalOrdinal ncols_C,
01411     Scalar C[],
01412     const LocalOrdinal ldc,
01413     const bool contiguousCacheBlocks) const
01414     {
01415       typedef ConstMatView<LocalOrdinal, Scalar> const_view_type;
01416       typedef MatView<LocalOrdinal, Scalar> view_type;
01417 
01418       const_view_type Q_view (nrows, ncols_Q, Q, ldq);
01419       view_type C_view (nrows, ncols_C, C, ldc);
01420       applyImpl (ApplyType::NoTranspose, Q_view, factorOutput,
01421      C_view, true, contiguousCacheBlocks);
01422     }
01423 
01427     bool QR_produces_R_factor_with_nonnegative_diagonal () const {
01428       return combine_.QR_produces_R_factor_with_nonnegative_diagonal ();
01429     }
01430 
01440     size_t TEUCHOS_DEPRECATED cache_block_size() const {
01441       return strategy_.cache_size_hint();
01442     }
01443 
01447     size_t cache_size_hint() const {
01448       return strategy_.cache_size_hint();
01449     }
01450 
01452     void
01453     fill_with_zeros (const LocalOrdinal nrows,
01454          const LocalOrdinal ncols,
01455          Scalar A[],
01456          const LocalOrdinal lda, 
01457          const bool contiguousCacheBlocks) const
01458     {
01459       typedef MatView<LocalOrdinal, Scalar> view_type;
01460       view_type A_view (nrows, ncols, A, lda);
01461 
01462       typedef details::FillWDP<LocalOrdinal, Scalar> fill_wdp_type;
01463       typedef Teuchos::ScalarTraits<Scalar> STS;
01464       fill_wdp_type filler (A_view, strategy_, STS::zero(), 
01465           numPartitions_, contiguousCacheBlocks);
01466       node_->parallel_for (0, numPartitions_, filler);
01467     }
01468 
01470     void
01471     cache_block (const LocalOrdinal nrows,
01472      const LocalOrdinal ncols, 
01473      Scalar A_out[],
01474      const Scalar A_in[],
01475      const LocalOrdinal lda_in) const
01476     {
01477       typedef ConstMatView<LocalOrdinal, Scalar> const_view_type;
01478       const_view_type A_in_view (nrows, ncols, A_in, lda_in);
01479 
01480       // The leading dimension of A_out doesn't matter here, since its
01481       // cache blocks are to be stored contiguously.  We set it
01482       // arbitrarily to a sensible value.
01483       typedef MatView<LocalOrdinal, Scalar> view_type;
01484       view_type A_out_view (nrows, ncols, A_out, nrows);
01485 
01486       typedef details::CacheBlockWDP<LocalOrdinal, Scalar> cb_wdp_type;
01487       cb_wdp_type cacheBlocker (A_in_view, A_out_view, strategy_, 
01488         numPartitions_, false);
01489       node_->parallel_for (0, numPartitions_, cacheBlocker);
01490     }
01491 
01493     void
01494     un_cache_block (const LocalOrdinal nrows,
01495         const LocalOrdinal ncols,
01496         Scalar A_out[],
01497         const LocalOrdinal lda_out,       
01498         const Scalar A_in[]) const
01499     {
01500       // The leading dimension of A_in doesn't matter here, since its
01501       // cache blocks are contiguously stored.  We set it arbitrarily
01502       // to a sensible value.
01503       typedef ConstMatView<LocalOrdinal, Scalar> const_view_type;
01504       const_view_type A_in_view (nrows, ncols, A_in, nrows);
01505 
01506       typedef MatView<LocalOrdinal, Scalar> view_type;
01507       view_type A_out_view (nrows, ncols, A_out, lda_out);
01508 
01509       typedef details::CacheBlockWDP<LocalOrdinal, Scalar> cb_wdp_type;
01510       cb_wdp_type cacheBlocker (A_in_view, A_out_view, strategy_, 
01511         numPartitions_, true);
01512       node_->parallel_for (0, numPartitions_, cacheBlocker);
01513     }
01514 
01516     void
01517     Q_times_B (const LocalOrdinal nrows,
01518          const LocalOrdinal ncols,
01519          Scalar Q[],
01520          const LocalOrdinal ldq,
01521          const Scalar B[],
01522          const LocalOrdinal ldb,
01523          const bool contiguousCacheBlocks) const
01524     {
01525       typedef MatView<LocalOrdinal, Scalar> view_type;
01526       view_type Q_view (nrows, ncols, Q, ldq);
01527 
01528       typedef ConstMatView<LocalOrdinal, Scalar> const_view_type;
01529       const_view_type B_view (ncols, ncols, B, ldb);
01530 
01531       typedef details::MultWDP<LocalOrdinal, Scalar> mult_wdp_type;
01532       mult_wdp_type mult (Q_view, B_view, strategy_, numPartitions_, 
01533         contiguousCacheBlocks);
01534       node_->parallel_for (0, numPartitions_, mult);
01535     }
01536 
01537   private:
01538 
01539     FactorOutput
01540     factorImpl (MatView<LocalOrdinal, Scalar> A,
01541     MatView<LocalOrdinal, Scalar> R,
01542     const bool contiguousCacheBlocks) const
01543     {
01544       if (A.empty())
01545   {
01546     TEST_FOR_EXCEPTION(! R.empty(), std::logic_error,
01547            "KokkosNodeTsqr::factorImpl: A is empty, but R "
01548            "is not.  Please report this bug to the Kokkos "
01549            "developers.");
01550     return FactorOutput (0, numPartitions_); // The FactorOutput constructor rejects a partition count < 1.
01551   }
01552       const LocalOrdinal numRowsPerCacheBlock = 
01553   strategy_.cache_block_num_rows (A.ncols());
01554       const LocalOrdinal numCacheBlocks = 
01555   strategy_.num_cache_blocks (A.nrows(), A.ncols(), numRowsPerCacheBlock);
01556       //
01557       // Compute the first factorization pass (over partitions).
01558       //
01559       FactorOutput result (numCacheBlocks, numPartitions_);
01560       typedef details::FactorFirstPass<LocalOrdinal, Scalar> first_pass_type;
01561       first_pass_type firstPass (A, result.firstPassTauArrays, 
01562          result.topBlocks, strategy_, 
01563          numPartitions_, contiguousCacheBlocks);
01564       // parallel_for wants an exclusive range.
01565       node_->parallel_for (0, numPartitions_, firstPass);
01566 
01567       // Each partition collected a view of its top block, where that
01568       // partition's R factor is stored.  The second pass reduces
01569       // those R factors.  We do this on one thread to avoid the
01570       // overhead of parallelizing it.  If the typical use case is
01571       // oversubscription, you should parallelize this step with
01572       // multiple passes.  Note that we can't use parallel_reduce,
01573       // because the tree topology matters.
01574       factorSecondPass (result.topBlocks, result.secondPassTauArrays, 
01575       numPartitions_);
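      // Illustrative trace: with 4 partitions, factorSecondPass calls
      // factorPair(topBlocks[0], topBlocks[1]), then
      // factorPair(topBlocks[0], topBlocks[2]), then
      // factorPair(topBlocks[0], topBlocks[3]), accumulating the final
      // R factor in topBlocks[0].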
01576 
01577       // The "topmost top block" contains the resulting R factor.
01578       typedef MatView<LocalOrdinal, Scalar> view_type;
01579       const view_type& R_top = result.topBlocks[0];
01580       TEST_FOR_EXCEPTION(R_top.empty(), std::logic_error,
01581        "After factorSecondPass: result.topBlocks[0] is an "
01582        "empty MatView.  Please report this bug to the Kokkos "
01583        "developers.");
01584       view_type R_top_square (R_top.ncols(), R_top.ncols(), 
01585             R_top.get(), R_top.lda());
01586       R.fill (Teuchos::ScalarTraits<Scalar>::zero());
01587       // Only copy the upper triangle of R_top into R.
01588       copy_upper_triangle (R.ncols(), R.ncols(), R.get(), R.lda(),
01589          R_top.get(), R_top.lda());
01590       return result;
01591     }
01592 
01593     void
01594     applyImpl (const ApplyType& applyType, 
01595          const ConstMatView<LocalOrdinal, Scalar>& Q, 
01596          const FactorOutput& factorOutput,
01597          const MatView<LocalOrdinal, Scalar>& C, 
01598          const bool explicitQ,
01599          const bool contiguousCacheBlocks) const
01600     {
01601       using details::cacheBlockIndexRange;
01602       typedef details::ApplyFirstPass<LocalOrdinal, Scalar> first_pass_type;
01603       typedef CacheBlockingStrategy<LocalOrdinal, Scalar> strategy_type;
01604       typedef MatView<LocalOrdinal, Scalar> view_type;
01605 
01606       TEST_FOR_EXCEPTION(numPartitions_ != factorOutput.numPartitions(),
01607        std::invalid_argument,
01608        "applyImpl: KokkosNodeTsqr's number of partitions " 
01609        << numPartitions_ << " does not match the given "
01610        "factorOutput's number of partitions " 
01611        << factorOutput.numPartitions() << ".  This likely "
01612        "means that the given factorOutput object comes from "
01613        "a different instance of KokkosNodeTsqr.  Please "
01614        "report this bug to the Kokkos developers.");
01615       const int numParts = numPartitions_;
01616       first_pass_type firstPass (applyType, Q, factorOutput.firstPassTauArrays,
01617          factorOutput.topBlocks, C, strategy_,
01618          numParts, explicitQ, contiguousCacheBlocks);
01619       // Get a view of each partition's top block of the C matrix.
01620       std::vector<view_type> topBlocksOfC (numParts);
01621       {
01622   typedef std::pair<LocalOrdinal, LocalOrdinal> index_range_type;
01623   typedef CacheBlocker<LocalOrdinal, Scalar> blocker_type;
01624   blocker_type C_blocker (C.nrows(), C.ncols(), strategy_);
01625 
01626   // For each partition, collect its top block of C.
01627   for (int partIdx = 0; partIdx < numParts; ++partIdx)
01628     {
01629       const index_range_type cbIndices = 
01630         cacheBlockIndexRange (C.nrows(), C.ncols(), partIdx, 
01631             numParts, strategy_);
01632       if (cbIndices.first >= cbIndices.second)
01633         topBlocksOfC[partIdx] = view_type (0, 0, NULL, 0);
01634       else
01635         topBlocksOfC[partIdx] = 
01636     C_blocker.get_cache_block (C, cbIndices.first, 
01637              contiguousCacheBlocks);
01638     }
01639       }
01640 
01641       if (applyType.transposed())
01642   {
01643     // parallel_for wants an exclusive range.
01644     node_->parallel_for (0, numPartitions_, firstPass);
01645     applySecondPass (applyType, factorOutput, topBlocksOfC, 
01646          strategy_, explicitQ);
01647   }
01648       else
01649   {
01650     applySecondPass (applyType, factorOutput, topBlocksOfC, 
01651          strategy_, explicitQ);
01652     // parallel_for wants an exclusive range.
01653     node_->parallel_for (0, numPartitions_, firstPass);
01654   }
01655     }
01656 
01657     std::vector<Scalar>
01658     factorPair (const MatView<LocalOrdinal, Scalar>& R_top,
01659     const MatView<LocalOrdinal, Scalar>& R_bot) const
01660     {
01661       TEST_FOR_EXCEPTION(R_top.empty(), std::logic_error,
01662        "R_top is empty!");
01663       TEST_FOR_EXCEPTION(R_bot.empty(), std::logic_error,
01664        "R_bot is empty!");
01665       TEST_FOR_EXCEPTION(work_.size() == 0, std::logic_error,
01666        "Workspace array work_ has length zero.");
01667       TEST_FOR_EXCEPTION(work_.size() < static_cast<size_t> (R_top.ncols()),
01668        std::logic_error,
01669        "Workspace array work_ has length = " 
01670        << work_.size() << " < R_top.ncols() = " 
01671        << R_top.ncols() << ".");
01672 
01673       std::vector<Scalar> tau (R_top.ncols());
01674 
01675       // Our convention for such helper methods is for the immediate
01676       // parent to allocate workspace (the work_ array in this case).
01677       //
01678       // The statement below only works if R_top and R_bot have a
01679       // nonzero (and the same) number of columns, but we have already
01680       // checked that above.
01681       combine_.factor_pair (R_top.ncols(), R_top.get(), R_top.lda(),
01682           R_bot.get(), R_bot.lda(), &tau[0], &work_[0]);
01683       return tau;
01684     }
01685 
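    // Second pass of the factorization: combine each nonempty
    // partition's top block (partition indices 1, ..., numPartitions-1)
    // with the top partition's block topBlocks[0], in order, via
    // factorPair().  Each combination overwrites topBlocks[0] and
    // stores its tau array in tauArrays[partIdx-1]; empty partitions
    // are skipped.  This pass runs sequentially, since every step
    // reads and writes topBlocks[0].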
01686     void
01687     factorSecondPass (std::vector<MatView<LocalOrdinal, Scalar> >& topBlocks,
01688           std::vector<std::vector<Scalar> >& tauArrays,
01689           const int numPartitions) const
01690     {
01691       if (numPartitions <= 1)
01692         return; // Done!
01693       TEST_FOR_EXCEPTION (topBlocks.size() < static_cast<size_t>(numPartitions), 
01694         std::logic_error,
01695         "KokkosNodeTsqr::factorSecondPass: topBlocks.size() "
01696         "(= " << topBlocks.size() << ") < numPartitions (= " 
01697         << numPartitions << ").  Please report this bug to "
01698         "the Kokkos developers.");
01699       TEST_FOR_EXCEPTION (tauArrays.size() < static_cast<size_t>(numPartitions-1), 
01700         std::logic_error,
01701         "KokkosNodeTsqr::factorSecondPass: tauArrays.size() "
01702         "(= " << tauArrays.size() << ") < numPartitions-1 (= " 
01703         << (numPartitions-1) << ").  Please report this bug "
01704         "to the Kokkos developers.");
01705       // The top partition (partition index zero) should always be
01706       // nonempty if we get this far, so its top block should also be
01707       // nonempty.
01708       TEST_FOR_EXCEPTION(topBlocks[0].empty(), std::logic_error,
01709        "KokkosNodeTsqr::factorSecondPass: topBlocks[0] is "
01710        "empty.  Please report this bug to the Kokkos "
01711        "developers.");
01712       // However, other partitions besides the top one might be empty,
01713       // in which case their top blocks will be empty.  We skip over
01714       // the empty partitions in the loop below.
01715       work_.resize (static_cast<size_t> (topBlocks[0].ncols()));
01716       for (int partIdx = 1; partIdx < numPartitions; ++partIdx)
01717         if (! topBlocks[partIdx].empty())
01718           tauArrays[partIdx-1] = factorPair (topBlocks[0], topBlocks[partIdx]);
01719     }
01720 
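    // Apply Q, Q^T, or Q^H (as selected by applyType) to the stacked
    // matrix [C_top; C_bot] in place, where Q is the implicit factor
    // produced by the corresponding factorPair() call.  R_bot and tau
    // hold that call's implicit Householder representation; the caller
    // is responsible for sizing work_ appropriately.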
01721     void
01722     applyPair (const ApplyType& applyType,
01723          const MatView<LocalOrdinal, Scalar>& R_bot,
01724          const std::vector<Scalar>& tau,
01725          const MatView<LocalOrdinal, Scalar>& C_top,
01726          const MatView<LocalOrdinal, Scalar>& C_bot) const
01727     {
01728       // Our convention for such helper methods is for the immediate
01729       // parent to allocate workspace (the work_ array in this case).
01730       //
01731       // The statement below only works if C_top, R_bot, and C_bot
01732       // have a nonzero (and the same) number of columns, but we have
01733       // already checked that above.
01734       combine_.apply_pair (applyType, C_top.ncols(), R_bot.ncols(), 
01735          R_bot.get(), R_bot.lda(), &tau[0], 
01736          C_top.get(), C_top.lda(),
01737          C_bot.get(), C_bot.lda(), &work_[0]);
01738     }
01739 
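    // Apply the second-pass factor to the partitions' top blocks of C.
    // Schematically, factorSecondPass() builds that factor as the
    // ordered product Q_1 * Q_2 * ... * Q_{p-1}, where Q_k combines
    // partition k's top block with partition 0's.  Applying the factor
    // itself therefore visits partitions in reverse order (the
    // non-transposed branch below), while applying its (conjugate)
    // transpose visits them in forward order (the transposed branch).
    // Empty partitions are skipped in both cases.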
01740     void
01741     applySecondPass (const ApplyType& applyType,
01742          const FactorOutput& factorOutput,
01743          std::vector<MatView<LocalOrdinal, Scalar> >& topBlocksOfC,
01744          const CacheBlockingStrategy<LocalOrdinal, Scalar>& strategy,
01745          const bool explicitQ) const
01746     {
01747       const int numParts = factorOutput.numPartitions();
01748       if (numParts <= 1)
01749         return; // Done!
01750       TEST_FOR_EXCEPTION(topBlocksOfC.size() != static_cast<size_t>(numParts),
01751        std::logic_error,
01752        "KokkosNodeTsqr::applySecondPass: topBlocksOfC.size() "
01753        "(= " << topBlocksOfC.size() << ") != number of "
01754        "partitions (= " << numParts << ").  Please report "
01755        "this bug to the Kokkos developers.");
01756       TEST_FOR_EXCEPTION(factorOutput.secondPassTauArrays.size() != static_cast<size_t>(numParts-1),
01757        std::logic_error,
01758        "KokkosNodeTsqr::applySecondPass: factorOutput.secondPassTauArrays.size() "
01759        "(= " << factorOutput.secondPassTauArrays.size() << ") != number "
01760        "of partitions minus 1 (= " << (numParts-1) << ").  Please report "
01761        "this bug to the Kokkos developers.");
01762 
01763       const LocalOrdinal numCols = topBlocksOfC[0].ncols();
01764       work_.resize (static_cast<size_t> (numCols));
01765 
01766       typedef MatView<LocalOrdinal, Scalar> view_type;
01767       typedef typename std::vector<view_type>::size_type size_type;
01768 
01769       // Top blocks of C are the whole cache blocks.  We only want to
01770       // affect the top ncols x ncols part of each of those blocks in
01771       // this method.
01772       view_type C_top_square (numCols, numCols, topBlocksOfC[0].get(), 
01773             topBlocksOfC[0].lda());
01774       if (applyType.transposed())
01775         {
01776           // Don't include the topmost (index 0) partition in the
01777           // iteration; that corresponds to C_top_square.
01778           for (int partIdx = 1; partIdx < numParts; ++partIdx)
01779             {
01780               // It's legitimate for some partitions not to have any
01781               // cache blocks.  In that case, their top block will be
01782               // empty, and we can skip over them.
01783               const view_type& C_cur = topBlocksOfC[partIdx];
01784               if (! C_cur.empty())
01785                 {
01786                   view_type C_cur_square (numCols, numCols, C_cur.get(), 
01787                                           C_cur.lda());
01788                   // If explicitQ: We've already done the first pass and
01789                   // filled the top blocks of C.
01790                   applyPair (applyType, factorOutput.topBlocks[partIdx],
01791                              factorOutput.secondPassTauArrays[partIdx-1], 
01792                              C_top_square, C_cur_square);
01793                 }
01794             }
01795         }
01796       else
01797         {
01798           // In non-transposed mode, when computing the first
01799           // C.ncols() columns of the explicit Q factor, intranode
01800           // TSQR would run after internode TSQR (i.e., DistTsqr)
01801           // (even if only running on a single node in non-MPI mode).
01802           // Therefore, internode TSQR is responsible for filling the
01803           // top block of this node's part of the C matrix.
01804           //
01805           // Don't include the topmost partition in the iteration;
01806           // that corresponds to C_top_square.
01807           for (int partIdx = numParts - 1; partIdx > 0; --partIdx)
01808             {
01809               // It's legitimate for some partitions not to have any
01810               // cache blocks.  In that case, their top block will be
01811               // empty, and we can skip over them.
01812               const view_type& C_cur = topBlocksOfC[partIdx];
01813               if (! C_cur.empty())
01814                 {
01815                   view_type C_cur_square (numCols, numCols, 
01816                                           C_cur.get(), C_cur.lda());
01817                   // The "first" pass (actually the last, only named
01818                   // "first" by analogy with factorFirstPass()) will
01819                   // fill the rest of these top blocks.  For now, we
01820                   // just fill the top n x n part of the top blocks
01821                   // with zeros.
01822                   if (explicitQ)
01823                     C_cur_square.fill (Teuchos::ScalarTraits<Scalar>::zero());
01824                   applyPair (applyType, factorOutput.topBlocks[partIdx],
01825                              factorOutput.secondPassTauArrays[partIdx-1],
01826                              C_top_square, C_cur_square);
01827                 }
01828             }
01829         }
01830     }
01831 
01832   protected:
01833 
01847     ConstMatView<LocalOrdinal, Scalar>
01848     const_top_block (const ConstMatView<LocalOrdinal, Scalar>& C, 
01849          const bool contiguous_cache_blocks) const 
01850     {
01851       typedef CacheBlocker<LocalOrdinal, Scalar> blocker_type;
01852       blocker_type blocker (C.nrows(), C.ncols(), strategy_);
01853 
01854       // C_top_block is a view of the topmost cache block of C.
01855       // C_top_block should have >= ncols rows, otherwise either cache
01856       // blocking is broken or the input matrix C itself had fewer
01857       // rows than columns.
01858       typedef ConstMatView<LocalOrdinal, Scalar> const_view_type;
01859       const_view_type C_top = blocker.top_block (C, contiguous_cache_blocks);
01860       return C_top;
01861     }
01862   };
01863 } // namespace TSQR
01864 
01865 #endif // __TSQR_KokkosNodeTsqr_hpp