Tsqr.hpp
//@HEADER
// ************************************************************************
//
//          Kokkos: Node API and Parallel Node Kernels
//              Copyright (2008) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
//
// ************************************************************************
//@HEADER

#ifndef __TSQR_Tsqr_hpp
#define __TSQR_Tsqr_hpp

#include <Tsqr_ApplyType.hpp>
#include <Tsqr_Matrix.hpp>
#include <Tsqr_MessengerBase.hpp>
#include <Tsqr_DistTsqr.hpp>
#include <Tsqr_SequentialTsqr.hpp>
#include <Tsqr_Util.hpp>

#include <Kokkos_MultiVector.hpp>
#include <Teuchos_as.hpp>
#include <Teuchos_ScalarTraits.hpp>
#include <Teuchos_SerialDenseMatrix.hpp>


namespace TSQR {

  template<class LocalOrdinal,
           class Scalar,
           class NodeTsqrType = SequentialTsqr<LocalOrdinal, Scalar> >
  class Tsqr {
  public:
    typedef MatView<LocalOrdinal, Scalar> matview_type;
    typedef ConstMatView<LocalOrdinal, Scalar> const_matview_type;
    typedef Matrix<LocalOrdinal, Scalar> matrix_type;

    typedef Scalar scalar_type;
    typedef LocalOrdinal ordinal_type;
    typedef Teuchos::ScalarTraits<Scalar> STS;
    typedef typename STS::magnitudeType magnitude_type;

    typedef NodeTsqrType node_tsqr_type;
    typedef DistTsqr<LocalOrdinal, Scalar> dist_tsqr_type;
    typedef typename Teuchos::RCP<node_tsqr_type> node_tsqr_ptr;
    typedef typename Teuchos::RCP<dist_tsqr_type> dist_tsqr_ptr;
    typedef typename dist_tsqr_type::rank_type rank_type;

    typedef typename node_tsqr_type::FactorOutput NodeOutput;
    typedef typename dist_tsqr_type::FactorOutput DistOutput;

    typedef std::pair<NodeOutput, DistOutput> FactorOutput;

    Tsqr (const node_tsqr_ptr& nodeTsqr,
          const dist_tsqr_ptr& distTsqr) :
      nodeTsqr_ (nodeTsqr),
      distTsqr_ (distTsqr)
    {}

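    // A minimal construction sketch (an editor's addition, not part of the
    // original interface).  It assumes the caller has already built an
    // intranode TSQR object ("nodeTsqr") and an internode TSQR object
    // ("distTsqr") and wrapped them in Teuchos::RCPs; only the constructor
    // shown above is taken from this header.
    //
    //   typedef TSQR::Tsqr<int, double> tsqr_type;
    //   Teuchos::RCP<tsqr_type::node_tsqr_type> nodeTsqr = ...; // intranode TSQR
    //   Teuchos::RCP<tsqr_type::dist_tsqr_type> distTsqr = ...; // internode TSQR
    //   tsqr_type tsqr (nodeTsqr, distTsqr);
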
    Teuchos::RCP<node_tsqr_type> getNodeTsqr () {
      return nodeTsqr_;
    }

    size_t cache_size_hint() const { return nodeTsqr_->cache_size_hint(); }

    size_t TEUCHOS_DEPRECATED cache_block_size() const {
      return nodeTsqr_->cache_size_hint();
    }

    bool QR_produces_R_factor_with_nonnegative_diagonal () const {
      // Tsqr computes an R factor with nonnegative diagonal, if and
      // only if all QR factorization steps (both intranode and
      // internode) produce an R factor with a nonnegative diagonal.
      return nodeTsqr_->QR_produces_R_factor_with_nonnegative_diagonal() &&
        distTsqr_->QR_produces_R_factor_with_nonnegative_diagonal();
    }

    template<class NodeType>
    void
    factorExplicit (Kokkos::MultiVector<Scalar, NodeType>& A,
                    Kokkos::MultiVector<Scalar, NodeType>& Q,
                    Teuchos::SerialDenseMatrix<LocalOrdinal, Scalar>& R,
                    const bool contiguousCacheBlocks,
                    const bool forceNonnegativeDiagonal=false)
    {
      using Teuchos::asSafe;
      typedef Kokkos::MultiVector<Scalar, NodeType> KMV;

      // Tsqr currently expects LocalOrdinal ordinals, but
      // Kokkos::MultiVector uses size_t ordinals.  Do the conversions
      // here.
      //
      // Teuchos::asSafe() can do safe conversion (e.g., checking for
      // overflow when casting to a narrower integer type), if a
      // custom specialization is defined for
      // Teuchos::ValueTypeConversionTraits<size_t, LocalOrdinal>.
      // Otherwise, this has the same (potentially) unsafe effect as
      // static_cast<LocalOrdinal>(...) would have.
      const LocalOrdinal A_numRows = asSafe<LocalOrdinal> (A.getNumRows());
      const LocalOrdinal A_numCols = asSafe<LocalOrdinal> (A.getNumCols());
      const LocalOrdinal A_stride = asSafe<LocalOrdinal> (A.getStride());
      const LocalOrdinal Q_numRows = asSafe<LocalOrdinal> (Q.getNumRows());
      const LocalOrdinal Q_numCols = asSafe<LocalOrdinal> (Q.getNumCols());
      const LocalOrdinal Q_stride = asSafe<LocalOrdinal> (Q.getStride());

      // Sanity checks for matrix dimensions
      if (A_numRows < A_numCols) {
        std::ostringstream os;
        os << "In Tsqr::factorExplicit: input matrix A has " << A_numRows
           << " local rows and " << A_numCols << " columns.  The input "
          "matrix must have at least as many rows on each processor as "
          "there are columns.";
        throw std::invalid_argument (os.str());
      } else if (A_numRows != Q_numRows) {
        std::ostringstream os;
        os << "In Tsqr::factorExplicit: input matrix A and output matrix Q "
          "must have the same number of rows.  A has " << A_numRows << " rows"
          " and Q has " << Q_numRows << " rows.";
        throw std::invalid_argument (os.str());
      } else if (R.numRows() < R.numCols()) {
        std::ostringstream os;
        os << "In Tsqr::factorExplicit: output matrix R must have at least "
          "as many rows as columns.  R has " << R.numRows() << " rows and "
           << R.numCols() << " columns.";
        throw std::invalid_argument (os.str());
      } else if (A_numCols != R.numCols()) {
        std::ostringstream os;
        os << "In Tsqr::factorExplicit: input matrix A and output matrix R "
          "must have the same number of columns.  A has " << A_numCols
           << " columns and R has " << R.numCols() << " columns.";
        throw std::invalid_argument (os.str());
      }

      // Check for quick exit, based on matrix dimensions
      if (Q_numCols == 0)
        return;

      // Hold on to nonconst views of A and Q.  This will make TSQR
      // correct (if perhaps inefficient) for all possible Kokkos Node
      // types, even GPU nodes.
      Teuchos::ArrayRCP<scalar_type> A_ptr = A.getValuesNonConst();
      Teuchos::ArrayRCP<scalar_type> Q_ptr = Q.getValuesNonConst();

      R.putScalar (STS::zero());
      NodeOutput nodeResults =
        nodeTsqr_->factor (A_numRows, A_numCols, A_ptr.getRawPtr(), A_stride,
                           R.values(), R.stride(), contiguousCacheBlocks);
      // FIXME (mfh 19 Oct 2010) Replace actions on raw pointers with
      // actions on the Kokkos::MultiVector or at least the ArrayRCP.
      nodeTsqr_->fill_with_zeros (Q_numRows, Q_numCols,
                                  Q_ptr.getRawPtr(), Q_stride,
                                  contiguousCacheBlocks);
      matview_type Q_rawView (Q_numRows, Q_numCols,
                              Q_ptr.getRawPtr(), Q_stride);
      matview_type Q_top_block =
        nodeTsqr_->top_block (Q_rawView, contiguousCacheBlocks);
      if (Q_top_block.nrows() < R.numCols()) {
        std::ostringstream os;
        os << "The top block of Q has too few rows.  This means that the "
           << "intranode TSQR implementation has a bug in its top_block"
           << "() method.  The top block should have at least " << R.numCols()
           << " rows, but instead has only " << Q_top_block.nrows()
           << " rows.";
        throw std::logic_error (os.str());
      }
      {
        matview_type Q_top (R.numCols(), Q_numCols, Q_top_block.get(),
                            Q_top_block.lda());
        matview_type R_view (R.numRows(), R.numCols(), R.values(), R.stride());
        distTsqr_->factorExplicit (R_view, Q_top, forceNonnegativeDiagonal);
      }
      nodeTsqr_->apply (ApplyType::NoTranspose,
                        A_numRows, A_numCols, A_ptr.getRawPtr(), A_stride,
                        nodeResults, Q_numCols, Q_ptr.getRawPtr(), Q_stride,
                        contiguousCacheBlocks);

      // If necessary, force the R factor to have a nonnegative diagonal.
      if (forceNonnegativeDiagonal &&
          ! QR_produces_R_factor_with_nonnegative_diagonal()) {
        details::NonnegDiagForcer<LocalOrdinal, Scalar, STS::isComplex> forcer;
        matview_type Q_mine (Q_numRows, Q_numCols, Q_ptr.getRawPtr(), Q_stride);
        matview_type R_mine (R.numRows(), R.numCols(), R.values(), R.stride());
        forcer.force (Q_mine, R_mine);
      }

      // "Commit" the changes to the multivector.
      A_ptr = Teuchos::null;
      Q_ptr = Teuchos::null;
    }

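    // A minimal usage sketch for factorExplicit() (an editor's addition,
    // not part of the original interface).  It assumes A and Q are
    // Kokkos::MultiVector objects with the same node type and the same
    // number of rows, and that R is an ncols-by-ncols
    // Teuchos::SerialDenseMatrix; how A, Q, and R were allocated is left
    // to the caller.
    //
    //   // Factor A (overwriting it) and form the explicit Q factor and
    //   // the ncols-by-ncols R factor.  Cache blocks of A and Q are in
    //   // the usual column-major (non-contiguous) layout.
    //   tsqr.factorExplicit (A, Q, R, false);
    //
    //   // The same call, additionally forcing R's diagonal to be
    //   // nonnegative.
    //   tsqr.factorExplicit (A, Q, R, false, true);
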
    FactorOutput
    factor (const LocalOrdinal nrows_local,
            const LocalOrdinal ncols,
            Scalar A_local[],
            const LocalOrdinal lda_local,
            Scalar R[],
            const LocalOrdinal ldr,
            const bool contiguousCacheBlocks = false)
    {
      MatView<LocalOrdinal, Scalar> R_view (ncols, ncols, R, ldr);
      R_view.fill (STS::zero());
      NodeOutput nodeResults =
        nodeTsqr_->factor (nrows_local, ncols, A_local, lda_local,
                           R_view.get(), R_view.lda(),
                           contiguousCacheBlocks);
      DistOutput distResults = distTsqr_->factor (R_view);
      return std::make_pair (nodeResults, distResults);
    }

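    // A minimal usage sketch for the raw-pointer factor() interface (an
    // editor's addition, not part of the original interface).  It assumes
    // A_local is this process' nrows_local-by-ncols block of the matrix,
    // stored in column-major order with leading dimension lda_local, and
    // that R is a caller-allocated ncols-by-ncols buffer with leading
    // dimension ldr.
    //
    //   // Factor the distributed matrix.  A_local is overwritten with an
    //   // implicit representation of the local part of Q; R receives the
    //   // ncols-by-ncols R factor.
    //   FactorOutput out = tsqr.factor (nrows_local, ncols, A_local,
    //                                   lda_local, R, ldr);
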
    void
    apply (const std::string& op,
           const LocalOrdinal nrows_local,
           const LocalOrdinal ncols_Q,
           const Scalar Q_local[],
           const LocalOrdinal ldq_local,
           const FactorOutput& factor_output,
           const LocalOrdinal ncols_C,
           Scalar C_local[],
           const LocalOrdinal ldc_local,
           const bool contiguousCacheBlocks = false)
    {
      ApplyType applyType (op);

      // This determines the order in which we apply the intranode
      // part of the Q factor vs. the internode part of the Q factor.
      const bool transposed = applyType.transposed();

      // View of this node's local part of the matrix C.
      matview_type C_view (nrows_local, ncols_C, C_local, ldc_local);

      // Identify top ncols_C by ncols_C block of C_local.
      // top_block() will set C_top_view to have the correct leading
      // dimension, whether or not cache blocks are stored
      // contiguously.
      //
      // C_top_view is the topmost cache block of C_local.  It has at
      // least as many rows as columns, but it likely has more rows
      // than columns.
      matview_type C_view_top_block =
        nodeTsqr_->top_block (C_view, contiguousCacheBlocks);

      // View of the topmost ncols_C by ncols_C block of C.
      matview_type C_top_view (ncols_C, ncols_C, C_view_top_block.get(),
                               C_view_top_block.lda());

      if (! transposed)
        {
          // C_top (small compact storage) gets a deep copy of the top
          // ncols_C by ncols_C block of C_local.
          matrix_type C_top (C_top_view);

          // Compute in place on all processors' C_top blocks.
          distTsqr_->apply (applyType, C_top.ncols(), ncols_Q, C_top.get(),
                            C_top.lda(), factor_output.second);

          // Copy the result from C_top back into the top ncols_C by
          // ncols_C block of C_local.
          C_top_view.copy (C_top);

          // Apply the local Q factor (in Q_local and
          // factor_output.first) to C_local.
          nodeTsqr_->apply (applyType, nrows_local, ncols_Q,
                            Q_local, ldq_local, factor_output.first,
                            ncols_C, C_local, ldc_local,
                            contiguousCacheBlocks);
        }
      else
        {
          // Apply the (transpose of the) local Q factor (in Q_local
          // and factor_output.first) to C_local.
          nodeTsqr_->apply (applyType, nrows_local, ncols_Q,
                            Q_local, ldq_local, factor_output.first,
                            ncols_C, C_local, ldc_local,
                            contiguousCacheBlocks);

          // C_top (small compact storage) gets a deep copy of the top
          // ncols_C by ncols_C block of C_local.
          matrix_type C_top (C_top_view);

          // Compute in place on all processors' C_top blocks.
          distTsqr_->apply (applyType, ncols_C, ncols_Q, C_top.get(),
                            C_top.lda(), factor_output.second);

          // Copy the result from C_top back into the top ncols_C by
          // ncols_C block of C_local.
          C_top_view.copy (C_top);
        }
    }

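    // A minimal usage sketch for apply() (an editor's addition, not part
    // of the original interface).  It assumes a previous factor() call, as
    // in the sketch above, produced "out" and left the implicit Q
    // representation in A_local, and that C_local is this process'
    // nrows_local-by-ncols_C block of a matrix C, allocated by the caller.
    //
    //   // Compute C := Q^T * C ("N" would apply Q itself instead of its
    //   // transpose); the op string is interpreted by TSQR::ApplyType.
    //   tsqr.apply ("T", nrows_local, ncols, A_local, lda_local,
    //               out, ncols_C, C_local, ldc_local);
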
    void
    explicit_Q (const LocalOrdinal nrows_local,
                const LocalOrdinal ncols_Q_in,
                const Scalar Q_local_in[],
                const LocalOrdinal ldq_local_in,
                const FactorOutput& factorOutput,
                const LocalOrdinal ncols_Q_out,
                Scalar Q_local_out[],
                const LocalOrdinal ldq_local_out,
                const bool contiguousCacheBlocks = false)
    {
      nodeTsqr_->fill_with_zeros (nrows_local, ncols_Q_out, Q_local_out,
                                  ldq_local_out, contiguousCacheBlocks);
      // "Rank" here means MPI rank, not linear algebra rank.
      const rank_type myRank = distTsqr_->rank();
      if (myRank == 0)
        {
          // View of this node's local part of the matrix Q_out.
          matview_type Q_out_view (nrows_local, ncols_Q_out,
                                   Q_local_out, ldq_local_out);

          // View of the topmost cache block of Q_out.  It is
          // guaranteed to have at least as many rows as columns.
          matview_type Q_out_top =
            nodeTsqr_->top_block (Q_out_view, contiguousCacheBlocks);

          // Fill (the topmost cache block of) Q_out with the first
          // ncols_Q_out columns of the identity matrix.
          for (ordinal_type j = 0; j < ncols_Q_out; ++j)
            Q_out_top(j, j) = Scalar(1);
        }
      apply ("N", nrows_local,
             ncols_Q_in, Q_local_in, ldq_local_in, factorOutput,
             ncols_Q_out, Q_local_out, ldq_local_out,
             contiguousCacheBlocks);
    }

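    // A minimal usage sketch for explicit_Q() (an editor's addition, not
    // part of the original interface).  It assumes factor() was called as
    // in the sketch above, and that Q_local_out is a caller-allocated
    // nrows_local-by-ncols buffer with leading dimension ldq_local_out.
    //
    //   // Expand the implicit Q representation (in A_local and "out")
    //   // into an explicit nrows_local-by-ncols Q factor.
    //   tsqr.explicit_Q (nrows_local, ncols, A_local, lda_local, out,
    //                    ncols, Q_local_out, ldq_local_out);
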
    void
    Q_times_B (const LocalOrdinal nrows,
               const LocalOrdinal ncols,
               Scalar Q[],
               const LocalOrdinal ldq,
               const Scalar B[],
               const LocalOrdinal ldb,
               const bool contiguousCacheBlocks = false) const
    {
      // This requires no internode communication.  However, the work
      // is not redundant, since each MPI process has a different Q.
      nodeTsqr_->Q_times_B (nrows, ncols, Q, ldq, B, ldb,
                            contiguousCacheBlocks);
      // We don't need a barrier after this method, unless users are
      // doing mean MPI_Get() things.
    }

    LocalOrdinal
    reveal_R_rank (const LocalOrdinal ncols,
                   Scalar R[],
                   const LocalOrdinal ldr,
                   Scalar U[],
                   const LocalOrdinal ldu,
                   const magnitude_type& tol) const
    {
      // Forward the request to the intranode TSQR implementation.
      // Currently, this work is performed redundantly on all MPI
      // processes, without communication or agreement.
      //
      // FIXME (mfh 26 Aug 2010) This could be a problem if your
      // cluster is heterogeneous, because then you might obtain
      // different integer rank results.  This is because
      // heterogeneous nodes might each compute the rank-revealing
      // decomposition with slightly different rounding error.
      return nodeTsqr_->reveal_R_rank (ncols, R, ldr, U, ldu, tol);
    }

    template<class NodeType>
    LocalOrdinal
    revealRank (Kokkos::MultiVector<Scalar, NodeType>& Q,
                Teuchos::SerialDenseMatrix<LocalOrdinal, Scalar>& R,
                const magnitude_type& tol,
                const bool contiguousCacheBlocks = false) const
    {
      typedef Kokkos::MultiVector<Scalar, NodeType> KMV;

      const LocalOrdinal nrows = static_cast<LocalOrdinal> (Q.getNumRows());
      const LocalOrdinal ncols = static_cast<LocalOrdinal> (Q.getNumCols());
      const LocalOrdinal ldq = static_cast<LocalOrdinal> (Q.getStride());
      Teuchos::ArrayRCP<Scalar> Q_ptr = Q.getValuesNonConst();

      // Take the easy exit if available.
      if (ncols == 0)
        return 0;

      //
      // FIXME (mfh 16 Jul 2010) We _should_ compute the SVD of R (as
      // the copy B) on Proc 0 only.  This would ensure that all
      // processors get the same SVD and rank (esp. in a heterogeneous
      // computing environment).  For now, we just do this computation
      // redundantly, and hope that all the returned rank values are
      // the same.
      //
      matrix_type U (ncols, ncols, STS::zero());
      const ordinal_type rank =
        reveal_R_rank (ncols, R.values(), R.stride(),
                       U.get(), U.lda(), tol);
      if (rank < ncols)
        {
          // cerr << ">>> Rank of R: " << rank << " < ncols=" << ncols << endl;
          // cerr << ">>> Resulting U:" << endl;
          // print_local_matrix (cerr, ncols, ncols, R, ldr);
          // cerr << endl;

          // If R is not full rank: reveal_R_rank() already computed
          // the SVD \f$R = U \Sigma V^*\f$ of (the input) R, and
          // overwrote R with \f$\Sigma V^*\f$.  Now, we compute \f$Q
          // := Q \cdot U\f$, respecting cache blocks of Q.
          Q_times_B (nrows, ncols, Q_ptr.getRawPtr(), ldq,
                     U.get(), U.lda(), contiguousCacheBlocks);
        }
      return rank;
    }

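    // A minimal usage sketch for revealRank() (an editor's addition, not
    // part of the original interface).  It assumes Q and R come from a
    // previous factorExplicit() call as in the sketch above, with Scalar =
    // double, and that "tol" is the caller's relative rank tolerance.
    //
    //   const double tol = 10 * Teuchos::ScalarTraits<double>::eps();
    //   const int rank = tsqr.revealRank (Q, R, tol);
    //   // If rank < ncols, Q has been replaced by Q*U and R by Sigma*V^*,
    //   // where U * Sigma * V^* is the SVD of the original R.
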
    void
    cache_block (const LocalOrdinal nrows_local,
                 const LocalOrdinal ncols,
                 Scalar A_local_out[],
                 const Scalar A_local_in[],
                 const LocalOrdinal lda_local_in) const
    {
      nodeTsqr_->cache_block (nrows_local, ncols,
                              A_local_out,
                              A_local_in, lda_local_in);
    }

    void
    un_cache_block (const LocalOrdinal nrows_local,
                    const LocalOrdinal ncols,
                    Scalar A_local_out[],
                    const LocalOrdinal lda_local_out,
                    const Scalar A_local_in[]) const
    {
      nodeTsqr_->un_cache_block (nrows_local, ncols,
                                 A_local_out, lda_local_out,
                                 A_local_in);
    }

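    // A minimal cache-blocking workflow sketch (an editor's addition, not
    // part of the original interface).  It assumes A_in holds this
    // process' nrows_local-by-ncols block in ordinary column-major order,
    // and that A_blocked, Q_blocked, Q_out, and R are caller-allocated
    // buffers of matching sizes; how strides are interpreted when cache
    // blocks are stored contiguously is up to the intranode TSQR
    // implementation.
    //
    //   // Copy A_in into A_blocked with contiguously stored cache blocks,
    //   // factor in that layout, form an explicit Q in the same layout,
    //   // then copy Q back out into column-major storage.
    //   tsqr.cache_block (nrows_local, ncols, A_blocked, A_in, lda_local);
    //   FactorOutput out = tsqr.factor (nrows_local, ncols, A_blocked,
    //                                   lda_local, R, ldr, true);
    //   tsqr.explicit_Q (nrows_local, ncols, A_blocked, lda_local, out,
    //                    ncols, Q_blocked, ldq_local, true);
    //   tsqr.un_cache_block (nrows_local, ncols, Q_out, ldq_local, Q_blocked);
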
  private:
    node_tsqr_ptr nodeTsqr_;
    dist_tsqr_ptr distTsqr_;
  }; // class Tsqr

} // namespace TSQR

#endif // __TSQR_Tsqr_hpp