Anasazi Version of the Day
Tsqr.hpp
00001 // @HEADER
00002 // ***********************************************************************
00003 //
00004 //                 Anasazi: Block Eigensolvers Package
00005 //                 Copyright (2010) Sandia Corporation
00006 //
00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
00008 // license for use of this work by or on behalf of the U.S. Government.
00009 //
00010 // This library is free software; you can redistribute it and/or modify
00011 // it under the terms of the GNU Lesser General Public License as
00012 // published by the Free Software Foundation; either version 2.1 of the
00013 // License, or (at your option) any later version.
00014 //
00015 // This library is distributed in the hope that it will be useful, but
00016 // WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 // Lesser General Public License for more details.
00019 //
00020 // You should have received a copy of the GNU Lesser General Public
00021 // License along with this library; if not, write to the Free Software
00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00023 // USA
00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
00025 //
00026 // ***********************************************************************
00027 // @HEADER
00028 
00029 #ifndef __TSQR_Tsqr_hpp
00030 #define __TSQR_Tsqr_hpp
00031 
00032 #include <Tsqr_ApplyType.hpp>
00033 #include <Tsqr_Matrix.hpp>
00034 #include <Tsqr_MessengerBase.hpp>
00035 #include <Tsqr_DistTsqr.hpp>
00036 #include <Tsqr_SequentialTsqr.hpp>
00037 #include <Tsqr_ScalarTraits.hpp>
00038 #include <Tsqr_Util.hpp>
00039 
00042 
00043 namespace TSQR {
00044 
00077   template< class LocalOrdinal, 
00078       class Scalar, 
00079       class NodeTsqrType = SequentialTsqr< LocalOrdinal, Scalar >,
00080       class DistTsqrType = DistTsqr< LocalOrdinal, Scalar > >
00081   class Tsqr {
00082   public:
00083     typedef MatView< LocalOrdinal, Scalar > matview_type;
00084     typedef ConstMatView< LocalOrdinal, Scalar > const_matview_type;
00085     typedef Matrix< LocalOrdinal, Scalar > matrix_type;
00086 
00087     typedef Scalar scalar_type;
00088     typedef LocalOrdinal ordinal_type;
00089     typedef typename ScalarTraits< Scalar >::magnitude_type magnitude_type;
00090 
00091     typedef NodeTsqrType node_tsqr_type;
00092     typedef DistTsqrType dist_tsqr_type;
00093     typedef typename Teuchos::RCP< node_tsqr_type > node_tsqr_ptr;
00094     typedef typename Teuchos::RCP< dist_tsqr_type > dist_tsqr_ptr;
00095     typedef typename dist_tsqr_type::rank_type rank_type;
00096 
00097     typedef typename node_tsqr_type::FactorOutput NodeOutput;
00098     typedef typename dist_tsqr_type::FactorOutput DistOutput;
00099     typedef std::pair< NodeOutput, DistOutput > FactorOutput;
00100 
00108     Tsqr (const node_tsqr_ptr& nodeTsqr, const dist_tsqr_ptr& distTsqr) :
00109       nodeTsqr_ (nodeTsqr), distTsqr_ (distTsqr)
00110     {}
00111 
00120     size_t cache_block_size() const { return nodeTsqr_->cache_block_size(); }
00121 
00128     bool QR_produces_R_factor_with_nonnegative_diagonal () const {
00129       return nodeTsqr_->QR_produces_R_factor_with_nonnegative_diagonal() &&
00130   distTsqr_->QR_produces_R_factor_with_nonnegative_diagonal();
00131     }
00132 
00133 
00134     void
00135     factorExplicit (matview_type& A, 
00136         matview_type& Q, 
00137         matview_type& R, 
00138         const bool contiguousCacheBlocks = false)
00139     {
00140       // Sanity checks for matrix dimensions
00141       if (A.nrows() < A.ncols())
00142   throw std::logic_error ("A must have at least as many rows as columns");
00143       else if (A.nrows() != Q.nrows())
00144   throw std::logic_error ("A and Q must have same number of rows");
00145       else if (R.nrows() < R.ncols())
00146   throw std::logic_error ("R must have at least as many rows as columns");
00147       else if (A.ncols() != R.ncols())
00148   throw std::logic_error ("A and R must have the same number of columns");
00149 
00150       // Checks for quick exit, based on matrix dimensions
00151       if (Q.ncols() == ordinal_type(0))
00152   return;
00153 
00154       R.fill (scalar_type (0));
00155       NodeOutput nodeResults = 
00156   nodeTsqr_->factor (A.nrows(), A.ncols(), A.get(), A.lda(), 
00157          R.get(), R.lda(), contiguousCacheBlocks);
00158       nodeTsqr_->fill_with_zeros (Q.nrows(), Q.ncols(), Q.get(), Q.lda(), 
00159           contiguousCacheBlocks);
00160       matview_type Q_top_block = 
00161   nodeTsqr_->top_block (Q, contiguousCacheBlocks);
00162       if (Q_top_block.nrows() < R.ncols())
00163   throw std::logic_error("Q has too few rows");
00164       matview_type Q_top (R.ncols(), Q.ncols(), 
00165         Q_top_block.get(), Q_top_block.lda());
00166       distTsqr_->factorExplicit (R, Q_top);
00167       nodeTsqr_->apply (ApplyType::NoTranspose, 
00168       A.nrows(), A.ncols(), A.get(), A.lda(), nodeResults,
00169       Q.ncols(), Q.get(), Q.lda(), contiguousCacheBlocks);
00170     }
00171 
00172 
00216     FactorOutput
00217     factor (const LocalOrdinal nrows_local,
00218       const LocalOrdinal ncols, 
00219       Scalar A_local[],
00220       const LocalOrdinal lda_local,
00221       Scalar R[],
00222       const LocalOrdinal ldr,
00223       const bool contiguousCacheBlocks = false)
00224     {
00225       MatView< LocalOrdinal, Scalar > R_view (ncols, ncols, R, ldr);
00226       R_view.fill (Scalar(0));
00227       NodeOutput nodeResults = 
00228   nodeTsqr_->factor (nrows_local, ncols, A_local, lda_local, 
00229         R_view.get(), R_view.lda(), 
00230         contiguousCacheBlocks);
00231       DistOutput distResults = distTsqr_->factor (R_view);
00232       return std::make_pair (nodeResults, distResults);
00233     }
00234 
00273     void
00274     apply (const std::string& op,
00275      const LocalOrdinal nrows_local,
00276      const LocalOrdinal ncols_Q,
00277      const Scalar Q_local[],
00278      const LocalOrdinal ldq_local,
00279      const FactorOutput& factor_output,
00280      const LocalOrdinal ncols_C,
00281      Scalar C_local[],
00282      const LocalOrdinal ldc_local,
00283      const bool contiguousCacheBlocks = false)
00284     {
00285       ApplyType applyType (op);
00286 
00287       // This determines the order in which we apply the intranode
00288       // part of the Q factor vs. the internode part of the Q factor.
00289       const bool transposed = applyType.transposed();
00290 
00291       // View of this node's local part of the matrix C.
00292       matview_type C_view (nrows_local, ncols_C, C_local, ldc_local);
00293 
00294       // Identify top ncols_C by ncols_C block of C_local.
00295       // top_block() will set C_top_view to have the correct leading
00296       // dimension, whether or not cache blocks are stored
00297       // contiguously.
00298       //
00299       // C_top_view is the topmost cache block of C_local.  It has at
00300       // least as many rows as columns, but it likely has more rows
00301       // than columns.
00302       matview_type C_view_top_block = 
00303   nodeTsqr_->top_block (C_view, contiguousCacheBlocks);
00304 
00305       // View of the topmost ncols_C by ncols_C block of C.
00306       matview_type C_top_view (ncols_C, ncols_C, C_view_top_block.get(), 
00307              C_view_top_block.lda());
00308 
00309       if (! transposed) 
00310   {
00311     // C_top (small compact storage) gets a deep copy of the top
00312     // ncols_C by ncols_C block of C_local.
00313           matrix_type C_top (C_top_view);
00314 
00315     // Compute in place on all processors' C_top blocks.
00316     distTsqr_->apply (applyType, C_top.ncols(), ncols_Q, C_top.get(), 
00317           C_top.lda(), factor_output.second);
00318 
00319     // Copy the result from C_top back into the top ncols_C by
00320     // ncols_C block of C_local.
00321     C_top_view.copy (C_top);
00322 
00323     // Apply the local Q factor (in Q_local and
00324     // factor_output.first) to C_local.
00325     nodeTsqr_->apply (applyType, nrows_local, ncols_Q, 
00326           Q_local, ldq_local, factor_output.first, 
00327           ncols_C, C_local, ldc_local,
00328           contiguousCacheBlocks);
00329   }
00330       else
00331   {
00332     // Apply the (transpose of the) local Q factor (in Q_local
00333     // and factor_output.first) to C_local.
00334     nodeTsqr_->apply (applyType, nrows_local, ncols_Q, 
00335           Q_local, ldq_local, factor_output.first, 
00336           ncols_C, C_local, ldc_local,
00337           contiguousCacheBlocks);
00338 
00339     // C_top (small compact storage) gets a deep copy of the top
00340     // ncols_C by ncols_C block of C_local.
00341           matrix_type C_top (C_top_view);
00342 
00343     // Compute in place on all processors' C_top blocks.
00344     distTsqr_->apply (applyType, ncols_C, ncols_Q, C_top.get(), 
00345           C_top.lda(), factor_output.second);
00346 
00347     // Copy the result from C_top back into the top ncols_C by
00348     // ncols_C block of C_local.
00349     C_top_view.copy (C_top);
00350   }
00351     }
00352 
00386     void
00387     explicit_Q (const LocalOrdinal nrows_local,
00388     const LocalOrdinal ncols_Q_in,
00389     const Scalar Q_local_in[],
00390     const LocalOrdinal ldq_local_in,
00391     const FactorOutput& factorOutput,
00392     const LocalOrdinal ncols_Q_out,
00393     Scalar Q_local_out[],
00394     const LocalOrdinal ldq_local_out,
00395     const bool contiguousCacheBlocks = false)
00396     {
00397       nodeTsqr_->fill_with_zeros (nrows_local, ncols_Q_out, Q_local_out,
00398          ldq_local_out, contiguousCacheBlocks);
00399       const rank_type myRank = distTsqr_->rank();
00400       if (myRank == 0)
00401   {
00402     // View of this node's local part of the matrix Q_out.
00403     matview_type Q_out_view (nrows_local, ncols_Q_out, 
00404            Q_local_out, ldq_local_out);
00405 
00406     // View of the topmost cache block of Q_out.  It is
00407     // guaranteed to have at least as many rows as columns.
00408     matview_type Q_out_top = 
00409       nodeTsqr_->top_block (Q_out_view, contiguousCacheBlocks);
00410 
00411     // Fill (topmost cache block of) Q_out with the first
00412     // ncols_Q_out columns of the identity matrix.
00413     for (ordinal_type j = 0; j < ncols_Q_out; ++j)
00414       Q_out_top(j, j) = Scalar(1);
00415   }
00416       apply ("N", nrows_local, 
00417        ncols_Q_in, Q_local_in, ldq_local_in, factorOutput,
00418        ncols_Q_out, Q_local_out, ldq_local_out,
00419        contiguousCacheBlocks);
00420     }
00421 
00426     void
00427     Q_times_B (const LocalOrdinal nrows,
00428          const LocalOrdinal ncols,
00429          Scalar Q[],
00430          const LocalOrdinal ldq,
00431          const Scalar B[],
00432          const LocalOrdinal ldb,
00433          const bool contiguousCacheBlocks = false) const
00434     {
00435       // This requires no internode communication.  However, the work
00436       // is not redundant, since each MPI process has a different Q.
00437       nodeTsqr_->Q_times_B (nrows, ncols, Q, ldq, B, ldb, 
00438          contiguousCacheBlocks);
00439       // We don't need a barrier after this method, unless users are
00440       // doing mean MPI_Get() things.
00441     }
00442 
00450     LocalOrdinal
00451     reveal_R_rank (const LocalOrdinal ncols,
00452        Scalar R[],
00453        const LocalOrdinal ldr,
00454        Scalar U[],
00455        const LocalOrdinal ldu,
00456        const magnitude_type tol) const 
00457     {
00458       // Forward the request to the intranode TSQR implementation.
00459       // This work is performed redundantly on all MPI processes.
00460       //
00461       // FIXME (mfh 26 Aug 2010) Could be a problem if your cluster is
00462       // heterogeneous, because then you might obtain different
00463       // answers (due to rounding error) on different processors.
00464       return nodeTsqr_->reveal_R_rank (ncols, R, ldr, U, ldu, tol);
00465     }
00466 
00485     LocalOrdinal
00486     reveal_rank (const LocalOrdinal nrows,
00487      const LocalOrdinal ncols,
00488      Scalar Q[],
00489      const LocalOrdinal ldq,
00490      Scalar R[],
00491      const LocalOrdinal ldr,
00492      const magnitude_type tol,
00493      const bool contiguousCacheBlocks = false) const
00494     {
00495       // Take the easy exit if available.
00496       if (ncols == 0)
00497   return 0;
00498 
00499       //
00500       // FIXME (mfh 16 Jul 2010) We _should_ compute the SVD of R (as
00501       // the copy B) on Proc 0 only.  This would ensure that all
00502       // processors get the same SVD and rank (esp. in a heterogeneous
00503       // computing environment).  For now, we just do this computation
00504       // redundantly, and hope that all the returned rank values are
00505       // the same.
00506       //
00507       matrix_type U (ncols, ncols, Scalar(0));
00508       const ordinal_type rank = 
00509   reveal_R_rank (ncols, R, ldr, U.get(), U.lda(), tol);
00510       
00511       if (rank < ncols)
00512   {
00513     // cerr << ">>> Rank of R: " << rank << " < ncols=" << ncols << endl;
00514     // cerr << ">>> Resulting U:" << endl;
00515     // print_local_matrix (cerr, ncols, ncols, R, ldr);
00516     // cerr << endl;
00517 
00518     // If R is not full rank: reveal_R_rank() already computed
00519     // the SVD \f$R = U \Sigma V^*\f$ of (the input) R, and
00520     // overwrote R with \f$\Sigma V^*\f$.  Now, we compute \f$Q
00521     // := Q \cdot U\f$, respecting cache blocks of Q.
00522     Q_times_B (nrows, ncols, Q, ldq, U.get(), U.lda(), 
00523          contiguousCacheBlocks);
00524   }
00525       return rank;
00526     }
00527 
00532     void
00533     cache_block (const LocalOrdinal nrows_local,
00534      const LocalOrdinal ncols,
00535      Scalar A_local_out[],
00536      const Scalar A_local_in[],
00537      const LocalOrdinal lda_local_in) const
00538     {
00539       nodeTsqr_->cache_block (nrows_local, ncols, 
00540             A_local_out, 
00541             A_local_in, lda_local_in);
00542     }
00543 
00548     void
00549     un_cache_block (const LocalOrdinal nrows_local,
00550         const LocalOrdinal ncols,
00551         Scalar A_local_out[],
00552         const LocalOrdinal lda_local_out,       
00553         const Scalar A_local_in[]) const
00554     {
00555       nodeTsqr_->un_cache_block (nrows_local, ncols, 
00556          A_local_out, lda_local_out, 
00557          A_local_in);
00558     }
00559 
00560   private:
00561     node_tsqr_ptr nodeTsqr_;
00562     dist_tsqr_ptr distTsqr_;
00563   }; // class Tsqr
00564 
00565 } // namespace TSQR
00566 
00567 #endif // __TSQR_Tsqr_hpp
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends