Kokkos Node API and Local Linear Algebra Kernels Version of the Day
TbbTsqr_TbbParallelTsqr.hpp
00001 //@HEADER
00002 // ************************************************************************
00003 // 
00004 //          Kokkos: Node API and Parallel Node Kernels
00005 //              Copyright (2009) Sandia Corporation
00006 // 
00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
00008 // license for use of this work by or on behalf of the U.S. Government.
00009 // 
00010 // This library is free software; you can redistribute it and/or modify
00011 // it under the terms of the GNU Lesser General Public License as
00012 // published by the Free Software Foundation; either version 2.1 of the
00013 // License, or (at your option) any later version.
00014 //  
00015 // This library is distributed in the hope that it will be useful, but
00016 // WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 // Lesser General Public License for more details.
00019 //  
00020 // You should have received a copy of the GNU Lesser General Public
00021 // License along with this library; if not, write to the Free Software
00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00023 // USA
00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
00025 // 
00026 // ************************************************************************
00027 //@HEADER
00028 
00029 #ifndef __TSQR_TBB_TbbParallelTsqr_hpp
00030 #define __TSQR_TBB_TbbParallelTsqr_hpp
00031 
00032 #include <tbb/tbb.h>
00033 #include <TbbTsqr_FactorTask.hpp>
00034 #include <TbbTsqr_ApplyTask.hpp>
00035 #include <TbbTsqr_ExplicitQTask.hpp>
00036 #include <TbbTsqr_RevealRankTask.hpp>
00037 #include <TbbTsqr_CacheBlockTask.hpp>
00038 #include <TbbTsqr_UnCacheBlockTask.hpp>
00039 #include <TbbTsqr_FillWithZerosTask.hpp>
00040 
00041 #include <Tsqr_ApplyType.hpp>
00042 #include <Teuchos_ScalarTraits.hpp>
00043 
00044 #include <algorithm>
00045 #include <limits>
00046 
00049 
00050 namespace TSQR {
00051   namespace TBB {
00052     
00065     template<class LocalOrdinal, class Scalar, class TimerType>
00066     class TbbParallelTsqr {
00067     private:
00068       typedef MatView<LocalOrdinal, Scalar> mat_view;
00069       typedef ConstMatView<LocalOrdinal, Scalar> const_mat_view;
00070       typedef std::pair<mat_view, mat_view> split_t;
00071       typedef std::pair<const_mat_view, const_mat_view> const_split_t;
00072       typedef std::pair<const_mat_view, mat_view> top_blocks_t;
00073       typedef std::vector<top_blocks_t> array_top_blocks_t;
00074 
00075       template<class MatrixViewType>
00076       MatrixViewType
00077       top_block_helper (const size_t P_first,
00078       const size_t P_last,
00079       const MatrixViewType& C, 
00080       const bool contiguous_cache_blocks) const
00081       {
00082   if (P_first > P_last)
00083     throw std::logic_error ("P_first > P_last");
00084   else if (P_first == P_last)
00085     return seq_.top_block (C, contiguous_cache_blocks);
00086   else
00087     {
00088       typedef std::pair< MatrixViewType, MatrixViewType > split_type;
00089 
00090       // Divide [P_first, P_last] into two intervals: [P_first,
00091       // P_mid] and [P_mid+1, P_last].  Recurse on the first
00092       // interval [P_first, P_mid].
00093       const size_t P_mid = (P_first + P_last) / 2;
00094       split_type C_split = partitioner_.split (C, P_first, P_mid, P_last,
00095                  contiguous_cache_blocks);
00096       // The partitioner may decide that the current block C has
00097       // too few rows to be worth splitting.  In that case,
00098       // C_split.first should be the same block as C, and
00099       // C_split.second (the bottom block) will be empty.  We
00100       // deal with this in the same way as the base case
00101       // (P_first == P_last) above.
00102       if (C_split.second.empty() || C_split.second.nrows() == 0)
00103         return seq_.top_block (C_split.first, contiguous_cache_blocks);
00104       else
00105         return top_block_helper (P_first, P_mid, C_split.first, 
00106                contiguous_cache_blocks);
00107     }
00108       }
00109 
00110     public:
00111       typedef Scalar scalar_type;
00112       typedef typename Teuchos::ScalarTraits< Scalar >::magnitudeType magnitude_type;
00113       typedef LocalOrdinal ordinal_type;
00114 
00117       static bool QR_produces_R_factor_with_nonnegative_diagonal() {
00118   typedef Combine<LocalOrdinal, Scalar> combine_type;
00119   typedef LAPACK<LocalOrdinal, Scalar> lapack_type;
00120       
00121   return combine_type::QR_produces_R_factor_with_nonnegative_diagonal() &&
00122     lapack_type::QR_produces_R_factor_with_nonnegative_diagonal();
00123       }
00124       
00127       typedef typename SequentialTsqr<LocalOrdinal, Scalar>::FactorOutput SeqOutput;
00132       typedef std::vector<std::vector<Scalar> > ParOutput;
00139       typedef typename std::pair<std::vector<SeqOutput>, ParOutput> FactorOutput;
00140 
00148       TbbParallelTsqr (const size_t numCores = 1,
00149            const size_t cacheSizeHint = 0) :
00150   seq_ (cacheSizeHint),
00151   min_seq_factor_timing_ (std::numeric_limits<double>::max()),
00152   max_seq_factor_timing_ (std::numeric_limits<double>::min()),
00153   min_seq_apply_timing_ (std::numeric_limits<double>::max()),
00154   max_seq_apply_timing_ (std::numeric_limits<double>::min())
00155       {
00156   if (numCores < 1)
00157     ncores_ = 1; // default is no parallelism
00158   else
00159     ncores_ = numCores;
00160       }
00161       
00166       size_t ncores() const { return ncores_; }
00167 
00172       size_t TEUCHOS_DEPRECATED cache_block_size() const { 
00173   return seq_.cache_size_hint(); 
00174       }
00175 
00181       size_t cache_size_hint() const { return seq_.cache_size_hint(); }
00182 
00184       double
00185       min_seq_factor_timing () const { return min_seq_factor_timing_; }
00187       double
00188       max_seq_factor_timing () const { return max_seq_factor_timing_; }
00190       double
00191       min_seq_apply_timing () const { return min_seq_apply_timing_; }
00193       double
00194       max_seq_apply_timing () const { return max_seq_apply_timing_; }
00195 
00196       FactorOutput
00197       factor (const LocalOrdinal nrows,
00198         const LocalOrdinal ncols, 
00199         Scalar A[],
00200         const LocalOrdinal lda,
00201         Scalar R[],
00202         const LocalOrdinal ldr,
00203         const bool contiguous_cache_blocks) const
00204       {
00205   using tbb::task;
00206 
00207   mat_view A_view (nrows, ncols, A, lda);
00208   // A_top will be modified in place by exactly one task, to
00209   // indicate the partition from which we may extract the R
00210   // factor after finishing the factorization.
00211   mat_view A_top;
00212 
00213   std::vector<SeqOutput> seq_output (ncores());
00214   ParOutput par_output (ncores(), std::vector<Scalar>(ncols));
00215   if (ncores() < 1)
00216     {
00217       if (! A_view.empty())
00218         throw std::logic_error("Zero subproblems, but A not empty!");
00219       else // Return empty results
00220         return std::make_pair (seq_output, par_output);
00221     }
00222   
00223   double my_seq_timing = double(0);
00224   double min_seq_timing = double(0);
00225   double max_seq_timing = double(0);
00226   try {
00227     typedef FactorTask<LocalOrdinal, Scalar, TimerType> factor_task_t;
00228 
00229     // When the root task completes, A_top will be set to the
00230     // topmost partition of A.  We can then extract the R factor
00231     // from A_top.
00232     factor_task_t& root_task = *new( task::allocate_root() ) 
00233       factor_task_t(0, ncores()-1, A_view, &A_top, seq_output, 
00234         par_output, seq_, my_seq_timing, min_seq_timing,
00235         max_seq_timing, contiguous_cache_blocks);
00236     task::spawn_root_and_wait (root_task);
00237   } catch (tbb::captured_exception& ex) {
00238     // TBB can't guarantee on all systems that an exception
00239     // thrown in another thread will have its type correctly
00240     // propagated to this thread.  If it can't, then it captures
00241     // the exception as a tbb:captured_exception, and propagates
00242     // it to here.  It may be able to propagate the exception,
00243     // though, so be prepared for that.  We deal with the latter
00244     // case by allowing the exception to propagate.
00245     std::ostringstream os;
00246     os << "Intel TBB caught an exception, while computing the QR factor"
00247       "ization of a matrix A.  Unfortunately, its type information was "
00248       "lost, because the exception was thrown in another thread.  Its "
00249       "\"what()\" function returns the following string: " << ex.what();
00250     throw std::runtime_error (os.str());
00251   } 
00252 
00253   // Copy the R factor out of A_top into R.
00254   seq_.extract_R (A_top.nrows(), A_top.ncols(), A_top.get(), 
00255       A_top.lda(), R, ldr, contiguous_cache_blocks);
00256 
00257   // Save the timings for future reference
00258   if (min_seq_timing < min_seq_factor_timing_)
00259     min_seq_factor_timing_ = min_seq_timing;
00260   if (max_seq_timing > max_seq_factor_timing_)
00261     max_seq_factor_timing_ = max_seq_timing;
00262 
00263   return std::make_pair (seq_output, par_output);
00264       }
00265 
00266       void
00267       apply (const ApplyType& apply_type,
00268        const LocalOrdinal nrows,
00269        const LocalOrdinal ncols_Q,
00270        const Scalar Q[],
00271        const LocalOrdinal ldq,
00272        const FactorOutput& factor_output,
00273        const LocalOrdinal ncols_C,
00274        Scalar C[],
00275        const LocalOrdinal ldc,
00276        const bool contiguous_cache_blocks) const
00277       {
00278   using tbb::task;
00279 
00280   if (apply_type.transposed())
00281     throw std::logic_error ("Applying Q^T and Q^H not implemented");
00282 
00283   const_mat_view Q_view (nrows, ncols_Q, Q, ldq);
00284   mat_view C_view (nrows, ncols_C, C, ldc);
00285   if (! apply_type.transposed())
00286     {
00287       array_top_blocks_t top_blocks (ncores());
00288       build_partition_array (0, ncores()-1, top_blocks, Q_view, 
00289            C_view, contiguous_cache_blocks);
00290       double my_seq_timing = 0.0;
00291       double min_seq_timing = 0.0;
00292       double max_seq_timing = 0.0;
00293       try {
00294         typedef ApplyTask<LocalOrdinal, Scalar, TimerType> apply_task_t;
00295         apply_task_t& root_task = 
00296     *new( task::allocate_root() )
00297     apply_task_t (0, ncores()-1, Q_view, C_view, top_blocks,
00298             factor_output, seq_, my_seq_timing, 
00299             min_seq_timing, max_seq_timing,
00300             contiguous_cache_blocks);
00301         task::spawn_root_and_wait (root_task);
00302       } catch (tbb::captured_exception& ex) {
00303         std::ostringstream os;
00304         os << "Intel TBB caught an exception, while applying a Q factor "
00305     "computed previously by factor() to the matrix C.  Unfortunate"
00306     "ly, its type information was lost, because the exception was "
00307     "thrown in another thread.  Its \"what()\" function returns th"
00308     "e following string: " << ex.what();
00309         throw std::runtime_error (os.str());
00310       }
00311 
00312       // Save the timings for future reference
00313       if (min_seq_timing < min_seq_apply_timing_)
00314         min_seq_apply_timing_ = min_seq_timing;
00315       if (max_seq_timing > max_seq_apply_timing_)
00316         max_seq_apply_timing_ = max_seq_timing;
00317     }
00318       }
00319 
00320 
00321       void
00322       explicit_Q (const LocalOrdinal nrows,
00323       const LocalOrdinal ncols_Q_in,
00324       const Scalar Q_in[],
00325       const LocalOrdinal ldq_in,
00326       const FactorOutput& factor_output,
00327       const LocalOrdinal ncols_Q_out,
00328       Scalar Q_out[],
00329       const LocalOrdinal ldq_out,
00330       const bool contiguous_cache_blocks) const
00331       {
00332   using tbb::task;
00333 
00334   mat_view Q_out_view (nrows, ncols_Q_out, Q_out, ldq_out);
00335   try {
00336     typedef ExplicitQTask< LocalOrdinal, Scalar > explicit_Q_task_t;    
00337     explicit_Q_task_t& root_task = *new( task::allocate_root() )
00338       explicit_Q_task_t (0, ncores()-1, Q_out_view, seq_, 
00339              contiguous_cache_blocks);
00340     task::spawn_root_and_wait (root_task);
00341   } catch (tbb::captured_exception& ex) {
00342     std::ostringstream os;
00343     os << "Intel TBB caught an exception, while preparing to compute"
00344       " the explicit Q factor from a QR factorization computed previ"
00345       "ously by factor().  Unfortunately, its type information was l"
00346       "ost, because the exception was thrown in another thread.  Its"
00347       " \"what()\" function returns the following string: " 
00348        << ex.what();
00349     throw std::runtime_error (os.str());
00350   }
00351   apply (ApplyType::NoTranspose, 
00352          nrows, ncols_Q_in, Q_in, ldq_in, factor_output, 
00353          ncols_Q_out, Q_out, ldq_out, 
00354          contiguous_cache_blocks);
00355       }
00356 
00361       void
00362       Q_times_B (const LocalOrdinal nrows,
00363      const LocalOrdinal ncols,
00364      Scalar Q[],
00365      const LocalOrdinal ldq,
00366      const Scalar B[],
00367      const LocalOrdinal ldb,
00368      const bool contiguous_cache_blocks) const
00369       {
00370   // Compute Q := Q*B in parallel.  This works much like
00371   // cache_block() (which see), in that each thread's instance
00372   // does not need to communicate with the others.
00373   try {
00374     using tbb::task;
00375     typedef RevealRankTask<LocalOrdinal, Scalar> rrtask_type;
00376 
00377     mat_view Q_view (nrows, ncols, Q, ldq);
00378     const_mat_view B_view (ncols, ncols, B, ldb);
00379 
00380     rrtask_type& root_task = *new( task::allocate_root() )
00381       rrtask_type (0, ncores()-1, Q_view, B_view, seq_, 
00382        contiguous_cache_blocks);
00383     task::spawn_root_and_wait (root_task);
00384   } catch (tbb::captured_exception& ex) {
00385     std::ostringstream os;
00386     os << "Intel TBB caught an exception, while computing Q := Q*U.  "
00387       "Unfortunately, its type information was lost, because the "
00388       "exception was thrown in another thread.  Its \"what()\" function "
00389       "returns the following string: " << ex.what();
00390     throw std::runtime_error (os.str());
00391   }
00392       }
00393 
00394 
00402       LocalOrdinal
00403       reveal_R_rank (const LocalOrdinal ncols,
00404          Scalar R[],
00405          const LocalOrdinal ldr,
00406          Scalar U[],
00407          const LocalOrdinal ldu,
00408          const magnitude_type tol) const 
00409       {
00410   return seq_.reveal_R_rank (ncols, R, ldr, U, ldu, tol);
00411       }
00412 
00424       LocalOrdinal
00425       reveal_rank (const LocalOrdinal nrows,
00426        const LocalOrdinal ncols,
00427        Scalar Q[],
00428        const LocalOrdinal ldq,
00429        Scalar R[],
00430        const LocalOrdinal ldr,
00431        const magnitude_type tol,
00432        const bool contiguous_cache_blocks = false) const
00433       {
00434   // Take the easy exit if available.
00435   if (ncols == 0)
00436     return 0;
00437 
00438   Matrix<LocalOrdinal, Scalar> U (ncols, ncols, Scalar(0));
00439   const LocalOrdinal rank = 
00440     reveal_R_rank (ncols, R, ldr, U.get(), U.ldu(), tol);
00441       
00442   if (rank < ncols)
00443     {
00444       // If R is not full rank: reveal_R_rank() already computed
00445       // the SVD \f$R = U \Sigma V^*\f$ of (the input) R, and
00446       // overwrote R with \f$\Sigma V^*\f$.  Now, we compute \f$Q
00447       // := Q \cdot U\f$, respecting cache blocks of Q.
00448       Q_times_B (nrows, ncols, Q, ldq, U.get(), U.lda(), 
00449            contiguous_cache_blocks);
00450     }
00451   return rank;
00452       }
00453 
00454       void
00455       cache_block (const LocalOrdinal nrows,
00456        const LocalOrdinal ncols, 
00457        Scalar A_out[],
00458        const Scalar A_in[],
00459        const LocalOrdinal lda_in) const 
00460       {
00461   using tbb::task;
00462 
00463   const_mat_view A_in_view (nrows, ncols, A_in, lda_in);
00464   // A_out won't have leading dimension lda_in, but that's OK,
00465   // as long as all the routines are told that A_out is
00466   // cache-blocked.
00467   mat_view A_out_view (nrows, ncols, A_out, lda_in);
00468   try {
00469     typedef CacheBlockTask< LocalOrdinal, Scalar > cache_block_task_t;
00470     cache_block_task_t& root_task = *new( task::allocate_root() )
00471       cache_block_task_t (0, ncores()-1, A_out_view, A_in_view, seq_);
00472     task::spawn_root_and_wait (root_task);
00473   } catch (tbb::captured_exception& ex) {
00474     std::ostringstream os;
00475     os << "Intel TBB caught an exception, while cache-blocking a mat"
00476       "rix.  Unfortunately, its type information was lost, because t"
00477       "he exception was thrown in another thread.  Its \"what()\" fu"
00478       "nction returns the following string: " << ex.what();
00479     throw std::runtime_error (os.str());
00480   }
00481       }
00482 
00483       void
00484       un_cache_block (const LocalOrdinal nrows,
00485           const LocalOrdinal ncols,
00486           Scalar A_out[],
00487           const LocalOrdinal lda_out,       
00488           const Scalar A_in[]) const
00489       {
00490   using tbb::task;
00491 
00492   // A_in doesn't have leading dimension lda_out, but that's OK,
00493   // as long as all the routines are told that A_in is cache-
00494   // blocked.
00495   const_mat_view A_in_view (nrows, ncols, A_in, lda_out);
00496   mat_view A_out_view (nrows, ncols, A_out, lda_out);
00497   try {
00498     typedef UnCacheBlockTask< LocalOrdinal, Scalar > un_cache_block_task_t;
00499     un_cache_block_task_t& root_task = *new( task::allocate_root() )
00500       un_cache_block_task_t (0, ncores()-1, A_out_view, A_in_view, seq_);
00501     task::spawn_root_and_wait (root_task);
00502   } catch (tbb::captured_exception& ex) {
00503     std::ostringstream os;
00504     os << "Intel TBB caught an exception, while un-cache-blocking a "
00505       "matrix.  Unfortunately, its type information was lost, becaus"
00506       "e the exception was thrown in another thread.  Its \"what()\""
00507       " function returns the following string: " << ex.what();
00508     throw std::runtime_error (os.str());
00509   }
00510       }
00511 
00512       template< class MatrixViewType >
00513       MatrixViewType
00514       top_block (const MatrixViewType& C, 
00515      const bool contiguous_cache_blocks = false) const
00516       {
00517   return top_block_helper (0, ncores()-1, C, contiguous_cache_blocks);
00518       }
00519 
00520       void
00521       fill_with_zeros (const LocalOrdinal nrows,
00522            const LocalOrdinal ncols,
00523            Scalar C[],
00524            const LocalOrdinal ldc, 
00525            const bool contiguous_cache_blocks) const
00526       {
00527   using tbb::task;
00528   mat_view C_view (nrows, ncols, C, ldc);
00529 
00530   try {
00531     typedef FillWithZerosTask< LocalOrdinal, Scalar > fill_task_t;
00532     fill_task_t& root_task = *new( task::allocate_root() )
00533       fill_task_t (0, ncores()-1, C_view, seq_, contiguous_cache_blocks);
00534     task::spawn_root_and_wait (root_task);
00535   } catch (tbb::captured_exception& ex) {
00536     std::ostringstream os;
00537     os << "Intel TBB caught an exception, while un-cache-blocking a "
00538       "matrix.  Unfortunately, its type information was lost, becaus"
00539       "e the exception was thrown in another thread.  Its \"what()\""
00540       " function returns the following string: " << ex.what();
00541     throw std::runtime_error (os.str());
00542   }
00543       }
00544 
00545     private:
00546       size_t ncores_;
00547       TSQR::SequentialTsqr<LocalOrdinal, Scalar> seq_;
00548       TSQR::Combine<LocalOrdinal, Scalar> combine_;
00549       Partitioner<LocalOrdinal, Scalar> partitioner_;
00550 
00551       mutable double min_seq_factor_timing_;
00552       mutable double max_seq_factor_timing_;
00553       mutable double min_seq_apply_timing_;
00554       mutable double max_seq_apply_timing_;
00555 
00556       void
00557       build_partition_array (const size_t P_first,
00558            const size_t P_last,
00559            array_top_blocks_t& top_blocks,
00560            const_mat_view& Q,
00561            mat_view& C,
00562            const bool contiguous_cache_blocks = false) const
00563       {
00564   if (P_first > P_last)
00565     return;
00566   else if (P_first == P_last)
00567     {
00568       const_mat_view Q_top = seq_.top_block (Q, contiguous_cache_blocks);
00569       mat_view C_top = seq_.top_block (C, contiguous_cache_blocks);
00570       top_blocks[P_first] = 
00571         std::make_pair (const_mat_view (Q_top.ncols(), Q_top.ncols(), 
00572                 Q_top.get(), Q_top.lda()), 
00573             mat_view (C_top.ncols(), C_top.ncols(), 
00574           C_top.get(), C_top.lda()));
00575     }
00576   else
00577     {
00578       // Recurse on two intervals: [P_first, P_mid] and [P_mid+1, P_last]
00579       const size_t P_mid = (P_first + P_last) / 2;
00580       const_split_t Q_split = 
00581         partitioner_.split (Q, P_first, P_mid, P_last,
00582           contiguous_cache_blocks);
00583       split_t C_split = 
00584         partitioner_.split (C, P_first, P_mid, P_last,
00585           contiguous_cache_blocks);
00586       // The partitioner may decide that the current blocks Q
00587       // and C have too few rows to be worth splitting.  (The
00588       // partitioner should split both Q and C in the same way.)
00589       // In that case, Q_split.first should be the same block as
00590       // Q, and Q_split.second (the bottom block) will be empty.
00591       // Ditto for C_split.  We deal with this in the same way
00592       // as the base case (P_first == P_last) above.
00593       if (Q_split.second.empty() || Q_split.second.nrows() == 0)
00594         {
00595     const_mat_view Q_top = 
00596       seq_.top_block (Q, contiguous_cache_blocks);
00597     mat_view C_top = seq_.top_block (C, contiguous_cache_blocks);
00598     top_blocks[P_first] = 
00599       std::make_pair (const_mat_view (Q_top.ncols(), Q_top.ncols(), 
00600               Q_top.get(), Q_top.lda()), 
00601           mat_view (C_top.ncols(), C_top.ncols(), 
00602               C_top.get(), C_top.lda()));
00603         }
00604       else
00605         {
00606     build_partition_array (P_first, P_mid, top_blocks, 
00607                Q_split.first, C_split.first, 
00608                contiguous_cache_blocks);
00609     build_partition_array (P_mid+1, P_last, top_blocks, 
00610                Q_split.second, C_split.second, 
00611                contiguous_cache_blocks);
00612         }
00613     }
00614       }
00615 
00616 
00617     };
00618 
00619   } // namespace TBB
00620 } // namespace TSQR
00621 
00622 #endif // __TSQR_TBB_TbbParallelTsqr_hpp
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends