Kokkos Node API and Local Linear Algebra Kernels Version of the Day
Tsqr.hpp
Go to the documentation of this file.
00001 //@HEADER
00002 // ************************************************************************
00003 // 
00004 //          Kokkos: Node API and Parallel Node Kernels
00005 //              Copyright (2009) Sandia Corporation
00006 // 
00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
00008 // license for use of this work by or on behalf of the U.S. Government.
00009 // 
00010 // This library is free software; you can redistribute it and/or modify
00011 // it under the terms of the GNU Lesser General Public License as
00012 // published by the Free Software Foundation; either version 2.1 of the
00013 // License, or (at your option) any later version.
00014 //  
00015 // This library is distributed in the hope that it will be useful, but
00016 // WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 // Lesser General Public License for more details.
00019 //  
00020 // You should have received a copy of the GNU Lesser General Public
00021 // License along with this library; if not, write to the Free Software
00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00023 // USA
00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
00025 // 
00026 // ************************************************************************
00027 //@HEADER
00028 
00032 #ifndef __TSQR_Tsqr_hpp
00033 #define __TSQR_Tsqr_hpp
00034 
00035 #include <Tsqr_ApplyType.hpp>
00036 #include <Tsqr_Matrix.hpp>
00037 #include <Tsqr_MessengerBase.hpp>
00038 #include <Tsqr_DistTsqr.hpp>
00039 #include <Tsqr_SequentialTsqr.hpp>
00040 #include <Tsqr_Util.hpp>
00041 
00042 #include <Kokkos_MultiVector.hpp>
00043 #include <Teuchos_as.hpp>
00044 #include <Teuchos_ScalarTraits.hpp>
00045 #include <Teuchos_SerialDenseMatrix.hpp>
00046 
00047 
00048 namespace TSQR {
00049 
00085   template<class LocalOrdinal, 
00086      class Scalar, 
00087      class NodeTsqrType = SequentialTsqr<LocalOrdinal, Scalar> >
00088   class Tsqr {
00089   public:
00090     typedef MatView<LocalOrdinal, Scalar> matview_type;
00091     typedef ConstMatView<LocalOrdinal, Scalar> const_matview_type;
00092     typedef Matrix<LocalOrdinal, Scalar> matrix_type;
00093 
00094     typedef Scalar scalar_type;
00095     typedef LocalOrdinal ordinal_type;
00096     typedef Teuchos::ScalarTraits<Scalar> STS;
00097     typedef typename STS::magnitudeType magnitude_type;
00098 
00099     typedef NodeTsqrType node_tsqr_type;
00100     typedef DistTsqr<LocalOrdinal, Scalar> dist_tsqr_type;
00101     typedef typename Teuchos::RCP<node_tsqr_type> node_tsqr_ptr;
00102     typedef typename Teuchos::RCP<dist_tsqr_type> dist_tsqr_ptr;
00105     typedef typename dist_tsqr_type::rank_type rank_type;
00106 
00107     typedef typename node_tsqr_type::FactorOutput NodeOutput;
00108     typedef typename dist_tsqr_type::FactorOutput DistOutput;
00109 
00116     typedef std::pair<NodeOutput, DistOutput> FactorOutput;
00117 
00125     Tsqr (const node_tsqr_ptr& nodeTsqr, 
00126     const dist_tsqr_ptr& distTsqr) :
00127       nodeTsqr_ (nodeTsqr), 
00128       distTsqr_ (distTsqr)
00129     {}
00130 
00136     size_t cache_size_hint() const { return nodeTsqr_->cache_size_hint(); }
00137 
00142     size_t TEUCHOS_DEPRECATED cache_block_size() const { 
00143       return nodeTsqr_->cache_size_hint(); 
00144     }
00145 
00154     bool QR_produces_R_factor_with_nonnegative_diagonal () const {
00155       // Tsqr computes an R factor with nonnegative diagonal, if and
00156       // only if all QR factorization steps (both intranode and
00157       // internode) produce an R factor with a nonnegative diagonal.
00158       return nodeTsqr_->QR_produces_R_factor_with_nonnegative_diagonal() &&
00159   distTsqr_->QR_produces_R_factor_with_nonnegative_diagonal();
00160     }
00161 
00199     template<class NodeType>
00200     void
00201     factorExplicit (Kokkos::MultiVector<Scalar, NodeType>& A,
00202         Kokkos::MultiVector<Scalar, NodeType>& Q,
00203         Teuchos::SerialDenseMatrix<LocalOrdinal, Scalar>& R,
00204         const bool contiguousCacheBlocks)
00205     {
00206       using Teuchos::asSafe;
00207       typedef Kokkos::MultiVector<Scalar, NodeType> KMV;
00208 
00209       // Tsqr currently likes LocalOrdinal ordinals, but
00210       // Kokkos::MultiVector has size_t ordinals.  Do conversions
00211       // here.  
00212       //
00213       // Teuchos::asSafe() can do safe conversion (e.g., checking for
00214       // overflow when casting to a narrower integer type), if a
00215       // custom specialization is defined for
00216       // Teuchos::ValueTypeConversionTraits<size_t, LocalOrdinal>.
00217       // Otherwise, this has the same (potentially) unsafe effect as
00218       // static_cast<LocalOrdinal>(...) would have.
00219       const LocalOrdinal A_numRows = asSafe<LocalOrdinal> (A.getNumRows());
00220       const LocalOrdinal A_numCols = asSafe<LocalOrdinal> (A.getNumCols());
00221       const LocalOrdinal A_stride = asSafe<LocalOrdinal> (A.getStride());
00222       const LocalOrdinal Q_numRows = asSafe<LocalOrdinal> (Q.getNumRows());
00223       const LocalOrdinal Q_numCols = asSafe<LocalOrdinal> (Q.getNumCols());
00224       const LocalOrdinal Q_stride = asSafe<LocalOrdinal> (Q.getStride());
00225 
00226       // Sanity checks for matrix dimensions
00227       if (A_numRows < A_numCols)
00228   {
00229     std::ostringstream os;
00230     os << "In Tsqr::factorExplicit: input matrix A has " << A_numRows 
00231        << " local rows, and " << A_numCols << " columns.  The input "
00232       "matrix must have at least as many rows on each processor as "
00233       "there are columns.";
00234     throw std::invalid_argument(os.str());
00235   }
00236       else if (A_numRows != Q_numRows)
00237   {
00238     std::ostringstream os;
00239     os << "In Tsqr::factorExplicit: input matrix A and output matrix Q "
00240       "must have the same number of rows.  A has " << A_numRows << " rows"
00241       " and Q has " << Q_numRows << " rows.";
00242     throw std::invalid_argument(os.str());
00243   }
00244       else if (R.numRows() < R.numCols())
00245   {
00246     std::ostringstream os;
00247     os << "In Tsqr::factorExplicit: output matrix R must have at least "
00248       "as many rows as columns.  R has " << R.numRows() << " rows and "
00249        << R.numCols() << " columns.";
00250     throw std::invalid_argument(os.str());
00251   }
00252       else if (A_numCols != R.numCols())
00253   {
00254     std::ostringstream os;
00255     os << "In Tsqr::factorExplicit: input matrix A and output matrix R "
00256       "must have the same number of columns.  A has " << A_numCols 
00257        << " columns and R has " << R.numCols() << " columns.";
00258     throw std::invalid_argument(os.str());
00259   }
00260 
00261       // Check for quick exit, based on matrix dimensions
00262       if (Q_numCols == 0)
00263   return;
00264 
00265       // Hold on to nonconst views of A and Q.  This will make TSQR
00266       // correct (if perhaps inefficient) for all possible Kokkos Node
00267       // types, even GPU nodes.
00268       Teuchos::ArrayRCP< scalar_type > A_ptr = A.getValuesNonConst();
00269       Teuchos::ArrayRCP< scalar_type > Q_ptr = Q.getValuesNonConst();
00270 
00271       R.putScalar (Scalar(0));
00272       NodeOutput nodeResults = 
00273   nodeTsqr_->factor (A_numRows, A_numCols, A_ptr.getRawPtr(), A_stride,
00274          R.values(), R.stride(), contiguousCacheBlocks);
00275       // FIXME (mfh 19 Oct 2010) Replace actions on raw pointer with
00276       // actions on the Kokkos::MultiVector or at least the ArrayRCP.
00277       nodeTsqr_->fill_with_zeros (Q_numRows, Q_numCols, 
00278           Q_ptr.getRawPtr(), Q_stride,
00279           contiguousCacheBlocks);
00280       matview_type Q_rawView (Q_numRows, Q_numCols, 
00281             Q_ptr.getRawPtr(), Q_stride);
00282       matview_type Q_top_block = 
00283   nodeTsqr_->top_block (Q_rawView, contiguousCacheBlocks);
00284       if (Q_top_block.nrows() < R.numCols())
00285   {
00286     std::ostringstream os;
00287     os << "The top block of Q has too few rows.  This means that the "
00288        << "the intranode TSQR implementation has a bug in its top_block"
00289        << "() method.  The top block should have at least " << R.numCols()
00290        << " rows, but instead has only " << Q_top_block.ncols() 
00291        << " rows.";
00292     throw std::logic_error (os.str());
00293   }
00294       {
00295   matview_type Q_top (R.numCols(), Q_numCols, Q_top_block.get(), 
00296           Q_top_block.lda());
00297   matview_type R_view (R.numRows(), R.numCols(), R.values(), R.stride());
00298   distTsqr_->factorExplicit (R_view, Q_top);
00299       }
00300       nodeTsqr_->apply (ApplyType::NoTranspose, 
00301       A_numRows, A_numCols, A_ptr.getRawPtr(), A_stride,
00302       nodeResults, Q_numCols, Q_ptr.getRawPtr(), Q_stride,
00303       contiguousCacheBlocks);
00304       // "Commit" the changes to the multivector.
00305       A_ptr = Teuchos::null;
00306       Q_ptr = Teuchos::null;
00307     }
00308 
00309 
00353     FactorOutput
00354     factor (const LocalOrdinal nrows_local,
00355       const LocalOrdinal ncols, 
00356       Scalar A_local[],
00357       const LocalOrdinal lda_local,
00358       Scalar R[],
00359       const LocalOrdinal ldr,
00360       const bool contiguousCacheBlocks = false)
00361     {
00362       MatView< LocalOrdinal, Scalar > R_view (ncols, ncols, R, ldr);
00363       R_view.fill (Scalar(0));
00364       NodeOutput nodeResults = 
00365   nodeTsqr_->factor (nrows_local, ncols, A_local, lda_local, 
00366         R_view.get(), R_view.lda(), 
00367         contiguousCacheBlocks);
00368       DistOutput distResults = distTsqr_->factor (R_view);
00369       return std::make_pair (nodeResults, distResults);
00370     }
00371 
00410     void
00411     apply (const std::string& op,
00412      const LocalOrdinal nrows_local,
00413      const LocalOrdinal ncols_Q,
00414      const Scalar Q_local[],
00415      const LocalOrdinal ldq_local,
00416      const FactorOutput& factor_output,
00417      const LocalOrdinal ncols_C,
00418      Scalar C_local[],
00419      const LocalOrdinal ldc_local,
00420      const bool contiguousCacheBlocks = false)
00421     {
00422       ApplyType applyType (op);
00423 
00424       // This determines the order in which we apply the intranode
00425       // part of the Q factor vs. the internode part of the Q factor.
00426       const bool transposed = applyType.transposed();
00427 
00428       // View of this node's local part of the matrix C.
00429       matview_type C_view (nrows_local, ncols_C, C_local, ldc_local);
00430 
00431       // Identify top ncols_C by ncols_C block of C_local.
00432       // top_block() will set C_top_view to have the correct leading
00433       // dimension, whether or not cache blocks are stored
00434       // contiguously.
00435       //
00436       // C_top_view is the topmost cache block of C_local.  It has at
00437       // least as many rows as columns, but it likely has more rows
00438       // than columns.
00439       matview_type C_view_top_block = 
00440   nodeTsqr_->top_block (C_view, contiguousCacheBlocks);
00441 
00442       // View of the topmost ncols_C by ncols_C block of C.
00443       matview_type C_top_view (ncols_C, ncols_C, C_view_top_block.get(), 
00444              C_view_top_block.lda());
00445 
00446       if (! transposed) 
00447   {
00448     // C_top (small compact storage) gets a deep copy of the top
00449     // ncols_C by ncols_C block of C_local.
00450           matrix_type C_top (C_top_view);
00451 
00452     // Compute in place on all processors' C_top blocks.
00453     distTsqr_->apply (applyType, C_top.ncols(), ncols_Q, C_top.get(), 
00454           C_top.lda(), factor_output.second);
00455 
00456     // Copy the result from C_top back into the top ncols_C by
00457     // ncols_C block of C_local.
00458     C_top_view.copy (C_top);
00459 
00460     // Apply the local Q factor (in Q_local and
00461     // factor_output.first) to C_local.
00462     nodeTsqr_->apply (applyType, nrows_local, ncols_Q, 
00463           Q_local, ldq_local, factor_output.first, 
00464           ncols_C, C_local, ldc_local,
00465           contiguousCacheBlocks);
00466   }
00467       else
00468   {
00469     // Apply the (transpose of the) local Q factor (in Q_local
00470     // and factor_output.first) to C_local.
00471     nodeTsqr_->apply (applyType, nrows_local, ncols_Q, 
00472           Q_local, ldq_local, factor_output.first, 
00473           ncols_C, C_local, ldc_local,
00474           contiguousCacheBlocks);
00475 
00476     // C_top (small compact storage) gets a deep copy of the top
00477     // ncols_C by ncols_C block of C_local.
00478           matrix_type C_top (C_top_view);
00479 
00480     // Compute in place on all processors' C_top blocks.
00481     distTsqr_->apply (applyType, ncols_C, ncols_Q, C_top.get(), 
00482           C_top.lda(), factor_output.second);
00483 
00484     // Copy the result from C_top back into the top ncols_C by
00485     // ncols_C block of C_local.
00486     C_top_view.copy (C_top);
00487   }
00488     }
00489 
00523     void
00524     explicit_Q (const LocalOrdinal nrows_local,
00525     const LocalOrdinal ncols_Q_in,
00526     const Scalar Q_local_in[],
00527     const LocalOrdinal ldq_local_in,
00528     const FactorOutput& factorOutput,
00529     const LocalOrdinal ncols_Q_out,
00530     Scalar Q_local_out[],
00531     const LocalOrdinal ldq_local_out,
00532     const bool contiguousCacheBlocks = false)
00533     {
00534       nodeTsqr_->fill_with_zeros (nrows_local, ncols_Q_out, Q_local_out,
00535           ldq_local_out, contiguousCacheBlocks);
00536       // "Rank" here means MPI rank, not linear algebra rank.
00537       const rank_type myRank = distTsqr_->rank();
00538       if (myRank == 0)
00539   {
00540     // View of this node's local part of the matrix Q_out.
00541     matview_type Q_out_view (nrows_local, ncols_Q_out, 
00542            Q_local_out, ldq_local_out);
00543 
00544     // View of the topmost cache block of Q_out.  It is
00545     // guaranteed to have at least as many rows as columns.
00546     matview_type Q_out_top = 
00547       nodeTsqr_->top_block (Q_out_view, contiguousCacheBlocks);
00548 
00549     // Fill (topmost cache block of) Q_out with the first
00550     // ncols_Q_out columns of the identity matrix.
00551     for (ordinal_type j = 0; j < ncols_Q_out; ++j)
00552       Q_out_top(j, j) = Scalar(1);
00553   }
00554       apply ("N", nrows_local, 
00555        ncols_Q_in, Q_local_in, ldq_local_in, factorOutput,
00556        ncols_Q_out, Q_local_out, ldq_local_out,
00557        contiguousCacheBlocks);
00558     }
00559 
00564     void
00565     Q_times_B (const LocalOrdinal nrows,
00566          const LocalOrdinal ncols,
00567          Scalar Q[],
00568          const LocalOrdinal ldq,
00569          const Scalar B[],
00570          const LocalOrdinal ldb,
00571          const bool contiguousCacheBlocks = false) const
00572     {
00573       // This requires no internode communication.  However, the work
00574       // is not redundant, since each MPI process has a different Q.
00575       nodeTsqr_->Q_times_B (nrows, ncols, Q, ldq, B, ldb, 
00576          contiguousCacheBlocks);
00577       // We don't need a barrier after this method, unless users are
00578       // doing mean MPI_Get() things.
00579     }
00580 
00591     LocalOrdinal
00592     reveal_R_rank (const LocalOrdinal ncols,
00593        Scalar R[],
00594        const LocalOrdinal ldr,
00595        Scalar U[],
00596        const LocalOrdinal ldu,
00597        const magnitude_type& tol) const 
00598     {
00599       // Forward the request to the intranode TSQR implementation.
00600       // Currently, this work is performed redundantly on all MPI
00601       // processes, without communication or agreement.
00602       //
00603       // FIXME (mfh 26 Aug 2010) This be a problem if your cluster is
00604       // heterogeneous, because then you might obtain different
00605       // integer rank results.  This is because heterogeneous nodes
00606       // might each compute the rank-revealing decomposition with
00607       // slightly different rounding error.
00608       return nodeTsqr_->reveal_R_rank (ncols, R, ldr, U, ldu, tol);
00609     }
00610 
00647     template<class NodeType>
00648     LocalOrdinal
00649     revealRank (Kokkos::MultiVector<Scalar, NodeType>& Q,
00650     Teuchos::SerialDenseMatrix<LocalOrdinal, Scalar>& R,
00651     const magnitude_type& tol,
00652     const bool contiguousCacheBlocks = false) const
00653     {
00654       typedef Kokkos::MultiVector<Scalar, NodeType> KMV;
00655 
00656       const LocalOrdinal nrows = static_cast<LocalOrdinal> (Q.getNumRows());
00657       const LocalOrdinal ncols = static_cast<LocalOrdinal> (Q.getNumCols());
00658       const LocalOrdinal ldq = static_cast<LocalOrdinal> (Q.getStride());
00659       Teuchos::ArrayRCP<Scalar> Q_ptr = Q.getValuesNonConst();
00660 
00661       // Take the easy exit if available.
00662       if (ncols == 0)
00663   return 0;
00664 
00665       //
00666       // FIXME (mfh 16 Jul 2010) We _should_ compute the SVD of R (as
00667       // the copy B) on Proc 0 only.  This would ensure that all
00668       // processors get the same SVD and rank (esp. in a heterogeneous
00669       // computing environment).  For now, we just do this computation
00670       // redundantly, and hope that all the returned rank values are
00671       // the same.
00672       //
00673       matrix_type U (ncols, ncols, STS::zero());
00674       const ordinal_type rank = 
00675   reveal_R_rank (ncols, R.values(), R.stride(), 
00676            U.get(), U.lda(), tol);
00677       if (rank < ncols)
00678   {
00679     // cerr << ">>> Rank of R: " << rank << " < ncols=" << ncols << endl;
00680     // cerr << ">>> Resulting U:" << endl;
00681     // print_local_matrix (cerr, ncols, ncols, R, ldr);
00682     // cerr << endl;
00683 
00684     // If R is not full rank: reveal_R_rank() already computed
00685     // the SVD \f$R = U \Sigma V^*\f$ of (the input) R, and
00686     // overwrote R with \f$\Sigma V^*\f$.  Now, we compute \f$Q
00687     // := Q \cdot U\f$, respecting cache blocks of Q.
00688     Q_times_B (nrows, ncols, Q_ptr.getRawPtr(), ldq, 
00689          U.get(), U.lda(), contiguousCacheBlocks);
00690   }
00691       return rank;
00692     }
00693 
00698     void
00699     cache_block (const LocalOrdinal nrows_local,
00700      const LocalOrdinal ncols,
00701      Scalar A_local_out[],
00702      const Scalar A_local_in[],
00703      const LocalOrdinal lda_local_in) const
00704     {
00705       nodeTsqr_->cache_block (nrows_local, ncols, 
00706             A_local_out, 
00707             A_local_in, lda_local_in);
00708     }
00709 
00714     void
00715     un_cache_block (const LocalOrdinal nrows_local,
00716         const LocalOrdinal ncols,
00717         Scalar A_local_out[],
00718         const LocalOrdinal lda_local_out,       
00719         const Scalar A_local_in[]) const
00720     {
00721       nodeTsqr_->un_cache_block (nrows_local, ncols, 
00722          A_local_out, lda_local_out, 
00723          A_local_in);
00724     }
00725 
00726   private:
00727     node_tsqr_ptr nodeTsqr_;
00728     dist_tsqr_ptr distTsqr_;
00729   }; // class Tsqr
00730 
00731 } // namespace TSQR
00732 
00733 #endif // __TSQR_Tsqr_hpp
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends