Anasazi Version of the Day
TbbTsqr_TbbParallelTsqr.hpp
00001 // @HEADER
00002 // ***********************************************************************
00003 //
00004 //                 Anasazi: Block Eigensolvers Package
00005 //                 Copyright (2010) Sandia Corporation
00006 //
00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
00008 // license for use of this work by or on behalf of the U.S. Government.
00009 //
00010 // This library is free software; you can redistribute it and/or modify
00011 // it under the terms of the GNU Lesser General Public License as
00012 // published by the Free Software Foundation; either version 2.1 of the
00013 // License, or (at your option) any later version.
00014 //
00015 // This library is distributed in the hope that it will be useful, but
00016 // WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 // Lesser General Public License for more details.
00019 //
00020 // You should have received a copy of the GNU Lesser General Public
00021 // License along with this library; if not, write to the Free Software
00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00023 // USA
00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
00025 //
00026 // ***********************************************************************
00027 // @HEADER
00028 
00029 #ifndef __TSQR_TBB_TbbParallelTsqr_hpp
00030 #define __TSQR_TBB_TbbParallelTsqr_hpp
00031 
00032 #include <tbb/tbb.h>
00033 #include <TbbTsqr_FactorTask.hpp>
00034 #include <TbbTsqr_ApplyTask.hpp>
00035 #include <TbbTsqr_ExplicitQTask.hpp>
00036 #include <TbbTsqr_RevealRankTask.hpp>
00037 #include <TbbTsqr_CacheBlockTask.hpp>
00038 #include <TbbTsqr_UnCacheBlockTask.hpp>
00039 #include <TbbTsqr_FillWithZerosTask.hpp>
00040 
00041 #include <Tsqr_ApplyType.hpp>
00042 #include <Tsqr_ScalarTraits.hpp>
00043 
00044 #include <algorithm>
00045 #include <limits>
00046 
00049 
00050 namespace TSQR {
00051   namespace TBB {
00052     
00053     template< class LocalOrdinal, class Scalar, class TimerType >
00054     class TbbParallelTsqr {
00055     private:
00056       typedef MatView< LocalOrdinal, Scalar > mat_view;
00057       typedef ConstMatView< LocalOrdinal, Scalar > const_mat_view;
00058       typedef std::pair< mat_view, mat_view > split_t;
00059       typedef std::pair< const_mat_view, const_mat_view > const_split_t;
00060       typedef std::pair< const_mat_view, mat_view > top_blocks_t;
00061       typedef std::vector< top_blocks_t > array_top_blocks_t;
00062 
00063       template< class MatrixViewType >
00064       MatrixViewType
00065       top_block_helper (const size_t P_first,
00066       const size_t P_last,
00067       const MatrixViewType& C, 
00068       const bool contiguous_cache_blocks = false) const
00069       {
00070   if (P_first > P_last)
00071     throw std::logic_error ("P_first > P_last");
00072   else if (P_first == P_last)
00073     return seq_.top_block (C, contiguous_cache_blocks);
00074   else
00075     {
00076       typedef std::pair< MatrixViewType, MatrixViewType > split_type;
00077 
00078       // Divide [P_first, P_last] into two intervals: [P_first,
00079       // P_mid] and [P_mid+1, P_last].  Recurse on the first
00080       // interval [P_first, P_mid].
00081       const size_t P_mid = (P_first + P_last) / 2;
00082       split_type C_split = partitioner_.split (C, P_first, P_mid, P_last,
00083                  contiguous_cache_blocks);
00084       return top_block_helper (P_first, P_mid, C_split.first, 
00085              contiguous_cache_blocks);
00086     }
00087       }
00088 
00089     public:
00090       typedef Scalar scalar_type;
00091       typedef typename ScalarTraits< Scalar >::magnitude_type magnitude_type;
00092       typedef LocalOrdinal ordinal_type;
00093       
00096       static bool QR_produces_R_factor_with_nonnegative_diagonal() {
00097   typedef Combine< LocalOrdinal, Scalar > combine_type;
00098   typedef LAPACK< LocalOrdinal, Scalar > lapack_type;
00099       
00100   return combine_type::QR_produces_R_factor_with_nonnegative_diagonal() &&
00101     lapack_type::QR_produces_R_factor_with_nonnegative_diagonal();
00102       }
00103 
00106       typedef typename SequentialTsqr< LocalOrdinal, Scalar >::FactorOutput SeqOutput;
00109       typedef std::vector< std::vector< Scalar > > ParOutput;
00113       typedef typename std::pair< std::vector< SeqOutput >, ParOutput > FactorOutput;
00114 
00122       TbbParallelTsqr (const size_t numCores = 1,
00123            const size_t cacheBlockSize = 0) :
00124   seq_ (cacheBlockSize),
00125   min_seq_factor_timing_ (std::numeric_limits<double>::max()),
00126   max_seq_factor_timing_ (std::numeric_limits<double>::min()),
00127   min_seq_apply_timing_ (std::numeric_limits<double>::max()),
00128   max_seq_apply_timing_ (std::numeric_limits<double>::min())
00129       {
00130   if (numCores < 1)
00131     ncores_ = 1; // default is no parallelism
00132   else
00133     ncores_ = numCores;
00134       }
00135       
00139       size_t ncores() const { return ncores_; }
00140 
00145       size_t cache_block_size() const { return seq_.cache_block_size(); }
00146 
00147 
00148       double
00149       min_seq_factor_timing () const { return min_seq_factor_timing_; }
00150       double
00151       max_seq_factor_timing () const { return max_seq_factor_timing_; }
00152       double
00153       min_seq_apply_timing () const { return min_seq_apply_timing_; }
00154       double
00155       max_seq_apply_timing () const { return max_seq_apply_timing_; }
00156 
00157       FactorOutput
00158       factor (const LocalOrdinal nrows,
00159         const LocalOrdinal ncols, 
00160         Scalar A[],
00161         const LocalOrdinal lda,
00162         Scalar R[],
00163         const LocalOrdinal ldr,
00164         const bool contiguous_cache_blocks = false)
00165       {
00166   using tbb::task;
00167 
00168   mat_view A_view (nrows, ncols, A, lda);
00169   // A_top will be modified in place by exactly one task, to
00170   // indicate the partition from which we may extract the R
00171   // factor after finishing the factorization.
00172   mat_view A_top;
00173 
00174   std::vector< SeqOutput > seq_output (ncores());
00175   ParOutput par_output (ncores(), std::vector< Scalar >(ncols));
00176   if (ncores() < 1)
00177     {
00178       if (! A_view.empty())
00179         throw std::logic_error("Zero subproblems, but A not empty!");
00180       else // Return empty results
00181         return std::make_pair (seq_output, par_output);
00182     }
00183   
00184   double my_seq_timing = double(0);
00185   double min_seq_timing = double(0);
00186   double max_seq_timing = double(0);
00187   try {
00188     typedef FactorTask< LocalOrdinal, Scalar, TimerType > factor_task_t;
00189 
00190     // When the root task completes, A_top will be set to the
00191     // topmost partition of A.  We can then extract the R factor
00192     // from A_top.
00193     factor_task_t& root_task = *new( task::allocate_root() ) 
00194       factor_task_t(0, ncores()-1, A_view, &A_top, seq_output, 
00195         par_output, seq_, my_seq_timing, min_seq_timing,
00196         max_seq_timing, contiguous_cache_blocks);
00197     task::spawn_root_and_wait (root_task);
00198   } catch (tbb::captured_exception& ex) {
00199     // TBB can't guarantee on all systems that an exception
00200     // thrown in another thread will have its type correctly
00201     // propagated to this thread.  If it can't, then it captures
00202     // the exception as a tbb:captured_exception, and propagates
00203     // it to here.  It may be able to propagate the exception,
00204     // though, so be prepared for that.  We deal with the latter
00205     // case by allowing the exception to propagate.
00206     std::ostringstream os;
00207     os << "Intel TBB caught an exception, while computing the QR factor"
00208       "ization of a matrix A.  Unfortunately, its type information was "
00209       "lost, because the exception was thrown in another thread.  Its "
00210       "\"what()\" function returns the following string: " << ex.what();
00211     throw std::runtime_error (os.str());
00212   } 
00213 
00214   // Copy the R factor out of A_top into R.
00215   seq_.extract_R (A_top.nrows(), A_top.ncols(), A_top.get(), 
00216       A_top.lda(), R, ldr, contiguous_cache_blocks);
00217 
00218   // Save the timings for future reference
00219   if (min_seq_timing < min_seq_factor_timing_)
00220     min_seq_factor_timing_ = min_seq_timing;
00221   if (max_seq_timing > max_seq_factor_timing_)
00222     max_seq_factor_timing_ = max_seq_timing;
00223 
00224   return std::make_pair (seq_output, par_output);
00225       }
00226 
00227       void
00228       apply (const ApplyType& apply_type,
00229        const LocalOrdinal nrows,
00230        const LocalOrdinal ncols_Q,
00231        const Scalar Q[],
00232        const LocalOrdinal ldq,
00233        const FactorOutput& factor_output,
00234        const LocalOrdinal ncols_C,
00235        Scalar C[],
00236        const LocalOrdinal ldc,
00237        const bool contiguous_cache_blocks = false)
00238       {
00239   using tbb::task;
00240 
00241   if (apply_type.transposed())
00242     throw std::logic_error ("Applying Q^T and Q^H not implemented");
00243 
00244   const_mat_view Q_view (nrows, ncols_Q, Q, ldq);
00245   mat_view C_view (nrows, ncols_C, C, ldc);
00246   if (! apply_type.transposed())
00247     {
00248       array_top_blocks_t top_blocks (ncores());
00249       build_partition_array (0, ncores()-1, top_blocks, Q_view, 
00250            C_view, contiguous_cache_blocks);
00251       double my_seq_timing = 0.0;
00252       double min_seq_timing = 0.0;
00253       double max_seq_timing = 0.0;
00254       try {
00255         typedef ApplyTask< LocalOrdinal, Scalar, TimerType > apply_task_t;
00256         apply_task_t& root_task = 
00257     *new( task::allocate_root() )
00258     apply_task_t (0, ncores()-1, Q_view, C_view, top_blocks,
00259             factor_output, seq_, my_seq_timing, 
00260             min_seq_timing, max_seq_timing,
00261             contiguous_cache_blocks);
00262         task::spawn_root_and_wait (root_task);
00263       } catch (tbb::captured_exception& ex) {
00264         std::ostringstream os;
00265         os << "Intel TBB caught an exception, while applying a Q factor "
00266     "computed previously by factor() to the matrix C.  Unfortunate"
00267     "ly, its type information was lost, because the exception was "
00268     "thrown in another thread.  Its \"what()\" function returns th"
00269     "e following string: " << ex.what();
00270         throw std::runtime_error (os.str());
00271       }
00272 
00273       // Save the timings for future reference
00274       if (min_seq_timing < min_seq_apply_timing_)
00275         min_seq_apply_timing_ = min_seq_timing;
00276       if (max_seq_timing > max_seq_apply_timing_)
00277         max_seq_apply_timing_ = max_seq_timing;
00278     }
00279       }
00280 
00281 
00282       void
00283       explicit_Q (const LocalOrdinal nrows,
00284       const LocalOrdinal ncols_Q_in,
00285       const Scalar Q_in[],
00286       const LocalOrdinal ldq_in,
00287       const FactorOutput& factor_output,
00288       const LocalOrdinal ncols_Q_out,
00289       Scalar Q_out[],
00290       const LocalOrdinal ldq_out,
00291       const bool contiguous_cache_blocks = false)
00292       {
00293   using tbb::task;
00294 
00295   mat_view Q_out_view (nrows, ncols_Q_out, Q_out, ldq_out);
00296   try {
00297     typedef ExplicitQTask< LocalOrdinal, Scalar > explicit_Q_task_t;    
00298     explicit_Q_task_t& root_task = *new( task::allocate_root() )
00299       explicit_Q_task_t (0, ncores()-1, Q_out_view, seq_, 
00300              contiguous_cache_blocks);
00301     task::spawn_root_and_wait (root_task);
00302   } catch (tbb::captured_exception& ex) {
00303     std::ostringstream os;
00304     os << "Intel TBB caught an exception, while preparing to compute"
00305       " the explicit Q factor from a QR factorization computed previ"
00306       "ously by factor().  Unfortunately, its type information was l"
00307       "ost, because the exception was thrown in another thread.  Its"
00308       " \"what()\" function returns the following string: " 
00309        << ex.what();
00310     throw std::runtime_error (os.str());
00311   }
00312   apply (ApplyType::NoTranspose, 
00313          nrows, ncols_Q_in, Q_in, ldq_in, factor_output, 
00314          ncols_Q_out, Q_out, ldq_out, 
00315          contiguous_cache_blocks);
00316       }
00317 
00322       void
00323       Q_times_B (const LocalOrdinal nrows,
00324      const LocalOrdinal ncols,
00325      Scalar Q[],
00326      const LocalOrdinal ldq,
00327      const Scalar B[],
00328      const LocalOrdinal ldb,
00329      const bool contiguous_cache_blocks = false) const
00330       {
00331   // Compute Q := Q*B in parallel.  This works much like
00332   // cache_block() (which see), in that each thread's instance
00333   // does not need to communicate with the others.
00334   try {
00335     using tbb::task;
00336     typedef RevealRankTask< LocalOrdinal, Scalar > rrtask_type;
00337 
00338     mat_view Q_view (nrows, ncols, Q, ldq);
00339     const_mat_view B_view (ncols, ncols, B, ldb);
00340 
00341     rrtask_type& root_task = *new( task::allocate_root() )
00342       rrtask_type (0, ncores()-1, Q_view, B_view, seq_, 
00343        contiguous_cache_blocks);
00344     task::spawn_root_and_wait (root_task);
00345   } catch (tbb::captured_exception& ex) {
00346     std::ostringstream os;
00347     os << "Intel TBB caught an exception, while computing Q := Q*U.  "
00348       "Unfortunately, its type information was lost, because the "
00349       "exception was thrown in another thread.  Its \"what()\" function "
00350       "returns the following string: " << ex.what();
00351     throw std::runtime_error (os.str());
00352   }
00353       }
00354 
00355 
00363       LocalOrdinal
00364       reveal_R_rank (const LocalOrdinal ncols,
00365          Scalar R[],
00366          const LocalOrdinal ldr,
00367          Scalar U[],
00368          const LocalOrdinal ldu,
00369          const magnitude_type tol) const 
00370       {
00371   return seq_.reveal_R_rank (ncols, R, ldr, U, ldu, tol);
00372       }
00373 
00385       LocalOrdinal
00386       reveal_rank (const LocalOrdinal nrows,
00387        const LocalOrdinal ncols,
00388        Scalar Q[],
00389        const LocalOrdinal ldq,
00390        Scalar R[],
00391        const LocalOrdinal ldr,
00392        const magnitude_type tol,
00393        const bool contiguous_cache_blocks = false) const
00394       {
00395   // Take the easy exit if available.
00396   if (ncols == 0)
00397     return 0;
00398 
00399   Matrix< LocalOrdinal, Scalar > U (ncols, ncols, Scalar(0));
00400   const LocalOrdinal rank = 
00401     reveal_R_rank (ncols, R, ldr, U.get(), U.ldu(), tol);
00402       
00403   if (rank < ncols)
00404     {
00405       // If R is not full rank: reveal_R_rank() already computed
00406       // the SVD \f$R = U \Sigma V^*\f$ of (the input) R, and
00407       // overwrote R with \f$\Sigma V^*\f$.  Now, we compute \f$Q
00408       // := Q \cdot U\f$, respecting cache blocks of Q.
00409       Q_times_B (nrows, ncols, Q, ldq, U.get(), U.lda(), 
00410            contiguous_cache_blocks);
00411     }
00412   return rank;
00413       }
00414 
00415       void
00416       cache_block (const LocalOrdinal nrows,
00417        const LocalOrdinal ncols, 
00418        Scalar A_out[],
00419        const Scalar A_in[],
00420        const LocalOrdinal lda_in) const 
00421       {
00422   using tbb::task;
00423 
00424   const_mat_view A_in_view (nrows, ncols, A_in, lda_in);
00425   // A_out won't have leading dimension lda_in, but that's OK,
00426   // as long as all the routines are told that A_out is
00427   // cache-blocked.
00428   mat_view A_out_view (nrows, ncols, A_out, lda_in);
00429   try {
00430     typedef CacheBlockTask< LocalOrdinal, Scalar > cache_block_task_t;
00431     cache_block_task_t& root_task = *new( task::allocate_root() )
00432       cache_block_task_t (0, ncores()-1, A_out_view, A_in_view, seq_);
00433     task::spawn_root_and_wait (root_task);
00434   } catch (tbb::captured_exception& ex) {
00435     std::ostringstream os;
00436     os << "Intel TBB caught an exception, while cache-blocking a mat"
00437       "rix.  Unfortunately, its type information was lost, because t"
00438       "he exception was thrown in another thread.  Its \"what()\" fu"
00439       "nction returns the following string: " << ex.what();
00440     throw std::runtime_error (os.str());
00441   }
00442       }
00443 
00444       void
00445       un_cache_block (const LocalOrdinal nrows,
00446           const LocalOrdinal ncols,
00447           Scalar A_out[],
00448           const LocalOrdinal lda_out,       
00449           const Scalar A_in[]) const
00450       {
00451   using tbb::task;
00452 
00453   // A_in doesn't have leading dimension lda_out, but that's OK,
00454   // as long as all the routines are told that A_in is cache-
00455   // blocked.
00456   const_mat_view A_in_view (nrows, ncols, A_in, lda_out);
00457   mat_view A_out_view (nrows, ncols, A_out, lda_out);
00458   try {
00459     typedef UnCacheBlockTask< LocalOrdinal, Scalar > un_cache_block_task_t;
00460     un_cache_block_task_t& root_task = *new( task::allocate_root() )
00461       un_cache_block_task_t (0, ncores()-1, A_out_view, A_in_view, seq_);
00462     task::spawn_root_and_wait (root_task);
00463   } catch (tbb::captured_exception& ex) {
00464     std::ostringstream os;
00465     os << "Intel TBB caught an exception, while un-cache-blocking a "
00466       "matrix.  Unfortunately, its type information was lost, becaus"
00467       "e the exception was thrown in another thread.  Its \"what()\""
00468       " function returns the following string: " << ex.what();
00469     throw std::runtime_error (os.str());
00470   }
00471       }
00472 
00473       template< class MatrixViewType >
00474       MatrixViewType
00475       top_block (const MatrixViewType& C, 
00476      const bool contiguous_cache_blocks = false) const
00477       {
00478   return top_block_helper (0, ncores()-1, C, contiguous_cache_blocks);
00479       }
00480 
00481       void
00482       fill_with_zeros (const LocalOrdinal nrows,
00483            const LocalOrdinal ncols,
00484            Scalar C[],
00485            const LocalOrdinal ldc, 
00486            const bool contiguous_cache_blocks = false) const
00487       {
00488   using tbb::task;
00489   mat_view C_view (nrows, ncols, C, ldc);
00490 
00491   try {
00492     typedef FillWithZerosTask< LocalOrdinal, Scalar > fill_task_t;
00493     fill_task_t& root_task = *new( task::allocate_root() )
00494       fill_task_t (0, ncores()-1, C_view, seq_, contiguous_cache_blocks);
00495     task::spawn_root_and_wait (root_task);
00496   } catch (tbb::captured_exception& ex) {
00497     std::ostringstream os;
00498     os << "Intel TBB caught an exception, while un-cache-blocking a "
00499       "matrix.  Unfortunately, its type information was lost, becaus"
00500       "e the exception was thrown in another thread.  Its \"what()\""
00501       " function returns the following string: " << ex.what();
00502     throw std::runtime_error (os.str());
00503   }
00504       }
00505 
00506     private:
00507       size_t ncores_;
00508       TSQR::SequentialTsqr< LocalOrdinal, Scalar > seq_;
00509       TSQR::Combine< LocalOrdinal, Scalar > combine_;
00510       Partitioner< LocalOrdinal, Scalar > partitioner_;
00511 
00512       double min_seq_factor_timing_;
00513       double max_seq_factor_timing_;
00514       double min_seq_apply_timing_;
00515       double max_seq_apply_timing_;
00516 
00517       void
00518       build_partition_array (const size_t P_first,
00519            const size_t P_last,
00520            array_top_blocks_t& top_blocks,
00521            const_mat_view& Q,
00522            mat_view& C,
00523            const bool contiguous_cache_blocks = false) const
00524       {
00525   if (P_first > P_last)
00526     return;
00527   else if (P_first == P_last)
00528     {
00529       const_mat_view Q_top = seq_.top_block (Q, contiguous_cache_blocks);
00530       mat_view C_top = seq_.top_block (C, contiguous_cache_blocks);
00531       top_blocks[P_first] = 
00532         std::make_pair (const_mat_view (Q_top.ncols(), Q_top.ncols(), Q_top.get(), Q_top.lda()), 
00533             mat_view (C_top.ncols(), C_top.ncols(), C_top.get(), C_top.lda()));
00534     }
00535   else
00536     {
00537       // Recurse on two intervals: [P_first, P_mid] and [P_mid+1, P_last]
00538       const size_t P_mid = (P_first + P_last) / 2;
00539       const_split_t Q_split = 
00540         partitioner_.split (Q, P_first, P_mid, P_last,
00541           contiguous_cache_blocks);
00542       split_t C_split = 
00543         partitioner_.split (C, P_first, P_mid, P_last,
00544           contiguous_cache_blocks);
00545       build_partition_array (P_first, P_mid, top_blocks, Q_split.first, 
00546            C_split.first, contiguous_cache_blocks);
00547       build_partition_array (P_mid+1, P_last, top_blocks, Q_split.second, 
00548            C_split.second, contiguous_cache_blocks);
00549     }
00550       }
00551 
00552 
00553     };
00554 
00555   } // namespace TBB
00556 } // namespace TSQR
00557 
00558 #endif // __TSQR_TBB_TbbParallelTsqr_hpp
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends