TbbTsqr_TbbParallelTsqr.hpp
//@HEADER
// ************************************************************************
//
//          Kokkos: Node API and Parallel Node Kernels
//              Copyright (2008) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
//
// ************************************************************************
//@HEADER

#ifndef __TSQR_TBB_TbbParallelTsqr_hpp
#define __TSQR_TBB_TbbParallelTsqr_hpp

#include <tbb/tbb.h>
#include <tbb/task_scheduler_init.h>

#include <TbbTsqr_FactorTask.hpp>
#include <TbbTsqr_ApplyTask.hpp>
#include <TbbTsqr_ExplicitQTask.hpp>
#include <TbbTsqr_RevealRankTask.hpp>
#include <TbbTsqr_CacheBlockTask.hpp>
#include <TbbTsqr_UnCacheBlockTask.hpp>
#include <TbbTsqr_FillWithZerosTask.hpp>

#include <Tsqr_ApplyType.hpp>
#include <Teuchos_ScalarTraits.hpp>

#include <algorithm>
#include <limits>
#include <sstream>   // std::ostringstream, used in the error messages below
#include <stdexcept> // std::logic_error, std::runtime_error


namespace TSQR {
  namespace TBB {

    /// Intranode parallel TSQR factorization, implemented with Intel
    /// TBB tasks.  The tall skinny matrix is divided recursively into
    /// block rows, and each task runs SequentialTsqr on its block row.
    template<class LocalOrdinal, class Scalar, class TimerType>
    class TbbParallelTsqr {
    private:
      typedef MatView<LocalOrdinal, Scalar> mat_view;
      typedef ConstMatView<LocalOrdinal, Scalar> const_mat_view;
      typedef std::pair<mat_view, mat_view> split_t;
      typedef std::pair<const_mat_view, const_mat_view> const_split_t;
      typedef std::pair<const_mat_view, mat_view> top_blocks_t;
      typedef std::vector<top_blocks_t> array_top_blocks_t;

      template<class MatrixViewType>
      MatrixViewType
      top_block_helper (const size_t P_first,
                        const size_t P_last,
                        const MatrixViewType& C,
                        const bool contiguous_cache_blocks) const
      {
        if (P_first > P_last)
          throw std::logic_error ("P_first > P_last");
        else if (P_first == P_last)
          return seq_.top_block (C, contiguous_cache_blocks);
        else
          {
            typedef std::pair<MatrixViewType, MatrixViewType> split_type;

            // Divide [P_first, P_last] into two intervals: [P_first,
            // P_mid] and [P_mid+1, P_last].  Recurse on the first
            // interval [P_first, P_mid].
            const size_t P_mid = (P_first + P_last) / 2;
            split_type C_split = partitioner_.split (C, P_first, P_mid, P_last,
                                                     contiguous_cache_blocks);
            // The partitioner may decide that the current block C has
            // too few rows to be worth splitting.  In that case,
            // C_split.first should be the same block as C, and
            // C_split.second (the bottom block) will be empty.  We
            // deal with this in the same way as the base case
            // (P_first == P_last) above.
            if (C_split.second.empty() || C_split.second.nrows() == 0)
              return seq_.top_block (C_split.first, contiguous_cache_blocks);
            else
              return top_block_helper (P_first, P_mid, C_split.first,
                                       contiguous_cache_blocks);
          }
      }

    public:
      typedef Scalar scalar_type;
      typedef typename Teuchos::ScalarTraits<Scalar>::magnitudeType magnitude_type;
      typedef LocalOrdinal ordinal_type;

      /// Whether the R factor computed by this class always has a
      /// nonnegative diagonal.
      static bool QR_produces_R_factor_with_nonnegative_diagonal() {
        typedef Combine<LocalOrdinal, Scalar> combine_type;
        typedef LAPACK<LocalOrdinal, Scalar> lapack_type;

        return combine_type::QR_produces_R_factor_with_nonnegative_diagonal() &&
          lapack_type::QR_produces_R_factor_with_nonnegative_diagonal();
      }

      /// Results of SequentialTsqr's factorization of one task's block row.
      typedef typename SequentialTsqr<LocalOrdinal, Scalar>::FactorOutput SeqOutput;

      /// Results of the intertask ("parallel") part of the
      /// factorization: one vector of Scalar data per task.
      typedef std::vector<std::vector<Scalar> > ParOutput;

      /// Return type of factor(): the per-task sequential results,
      /// plus the intertask results.
      typedef typename std::pair<std::vector<SeqOutput>, ParOutput> FactorOutput;

      /// Constructor.  Use numTasks tasks and the given cache size
      /// hint in bytes (zero means let TSQR pick a reasonable default).
      TbbParallelTsqr (const size_t numTasks = 1,
                       const size_t cacheSizeHint = 0) :
        seq_ (cacheSizeHint),
        min_seq_factor_timing_ (std::numeric_limits<double>::max()),
        max_seq_factor_timing_ (std::numeric_limits<double>::min()),
        min_seq_apply_timing_ (std::numeric_limits<double>::max()),
        max_seq_apply_timing_ (std::numeric_limits<double>::min())
      {
        if (numTasks < 1)
          numTasks_ = 1; // default is no parallelism
        else
          numTasks_ = numTasks;
      }

      /// Constructor.  Read "Num Tasks" and the cache size hint from
      /// the given parameter list; see getValidParameters() for the
      /// accepted parameters.
      TbbParallelTsqr (const Teuchos::RCP<Teuchos::ParameterList>& plist) :
        seq_ (plist), // SequentialTsqr has a plist-accepting constructor.
        numTasks_ (1), // Set a safe default for now.
        min_seq_factor_timing_ (std::numeric_limits<double>::max()),
        max_seq_factor_timing_ (std::numeric_limits<double>::min()),
        min_seq_apply_timing_ (std::numeric_limits<double>::max()),
        max_seq_apply_timing_ (std::numeric_limits<double>::min())
      {
        if (! plist.is_null()) {
          const int defaultNumTasks = 1; // A reasonable safe default value.
          int numTasks = plist->get ("Num Tasks", defaultNumTasks);
          if (numTasks < 1) { // Default is no parallelism.
            numTasks = defaultNumTasks;
            plist->set ("Num Tasks", defaultNumTasks);
          }
          numTasks_ = numTasks;
        }
      }
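
      // Example of constructing a TbbParallelTsqr from a parameter
      // list (an illustrative sketch only, not part of this file;
      // "MyTimerType" is a hypothetical timer class satisfying the
      // TimerType requirements):
      //
      //   Teuchos::RCP<Teuchos::ParameterList> params =
      //     Teuchos::parameterList ("NodeTsqr");
      //   params->set ("Num Tasks", 4);
      //   params->set ("Cache Size Hint", size_t (0));
      //   TbbParallelTsqr<int, double, MyTimerType> tsqr (params);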

      Teuchos::RCP<const Teuchos::ParameterList>
      getValidParameters () const
      {
        using Teuchos::ParameterList;
        using Teuchos::parameterList;
        using Teuchos::RCP;

        // TbbTsqr recursively divides the tall skinny matrix on the
        // node into TBB tasks.  Each task works on a block row.  The
        // TBB task scheduler ensures that oversubscribing TBB tasks
        // won't oversubscribe cores, so it's OK if
        // default_num_threads() is too many.  For example, TBB might
        // say default_num_threads() is the number of cores on the
        // node, but the TBB task scheduler might have been
        // initialized with the number of cores per NUMA region, for
        // hybrid MPI + TBB parallelism.
        const int numTasks =
          tbb::task_scheduler_init::default_num_threads();
        const size_t cacheSizeHint = 0;
        const size_t sizeOfScalar = sizeof(Scalar);

        RCP<ParameterList> params = parameterList ("NodeTsqr");
        params->set ("Num Tasks", numTasks,
                     "Number of tasks to use in the intranode parallel part of "
                     "TSQR.  There is little or no performance penalty for mild "
                     "oversubscription, but a potential performance penalty "
                     "for undersubscription.");
        params->set ("Cache Size Hint", cacheSizeHint,
                     "Cache size hint in bytes (as a size_t) to use for "
                     "intranode TSQR.  If zero, TSQR will pick a reasonable "
                     "default.  See the documentation of SequentialTsqr for "
                     "a discussion of how to tune this parameter.");
        params->set ("Size of Scalar", sizeOfScalar);

        return params;
      }

      void
      setParameterList (const Teuchos::RCP<Teuchos::ParameterList>& plist)
      {
        seq_.setParameterList (plist);

        if (! plist.is_null()) {
          const int defaultNumTasks = 1; // A reasonable safe default value.
          int numTasks = plist->get ("Num Tasks", defaultNumTasks);
          if (numTasks < 1) { // Default is no parallelism.
            numTasks = defaultNumTasks;
            plist->set ("Num Tasks", defaultNumTasks);
          }
          numTasks_ = numTasks;
        }
      }

      /// Number of tasks that TSQR will use to solve the problem.
      size_t ntasks() const { return numTasks_; }

      /// Number of tasks that TSQR will use to solve the problem.
      /// Deprecated; call ntasks() instead.
      size_t TEUCHOS_DEPRECATED ncores() const { return numTasks_; }

      /// Cache size hint in bytes.  Deprecated; call cache_size_hint()
      /// instead.
      size_t TEUCHOS_DEPRECATED cache_block_size() const {
        return seq_.cache_size_hint();
      }

      /// Cache size hint in bytes used by the underlying sequential TSQR.
      size_t cache_size_hint() const { return seq_.cache_size_hint(); }

      /// Fastest task's time for the sequential part of factor().
      double
      min_seq_factor_timing () const { return min_seq_factor_timing_; }
      /// Slowest task's time for the sequential part of factor().
      double
      max_seq_factor_timing () const { return max_seq_factor_timing_; }
      /// Fastest task's time for the sequential part of apply().
      double
      min_seq_apply_timing () const { return min_seq_apply_timing_; }
      /// Slowest task's time for the sequential part of apply().
      double
      max_seq_apply_timing () const { return max_seq_apply_timing_; }

      /// Compute the QR factorization of the nrows by ncols matrix A
      /// (with leading dimension lda).  The R factor is copied into R
      /// (ncols by ncols, with leading dimension ldr); the Q factor is
      /// stored implicitly in A and in the returned FactorOutput.
      FactorOutput
      factor (const LocalOrdinal nrows,
              const LocalOrdinal ncols,
              Scalar A[],
              const LocalOrdinal lda,
              Scalar R[],
              const LocalOrdinal ldr,
              const bool contiguous_cache_blocks) const
      {
        using tbb::task;

        mat_view A_view (nrows, ncols, A, lda);
        // A_top will be modified in place by exactly one task, to
        // indicate the partition from which we may extract the R
        // factor after finishing the factorization.
        mat_view A_top;

        std::vector<SeqOutput> seq_output (ntasks());
        ParOutput par_output (ntasks(), std::vector<Scalar>(ncols));
        if (ntasks() < 1)
          {
            if (! A_view.empty())
              throw std::logic_error("Zero subproblems, but A not empty!");
            else // Return empty results
              return std::make_pair (seq_output, par_output);
          }

        double my_seq_timing = double(0);
        double min_seq_timing = double(0);
        double max_seq_timing = double(0);
        try {
          typedef FactorTask<LocalOrdinal, Scalar, TimerType> factor_task_t;

          // When the root task completes, A_top will be set to the
          // topmost partition of A.  We can then extract the R factor
          // from A_top.
          factor_task_t& root_task = *new( task::allocate_root() )
            factor_task_t(0, ntasks()-1, A_view, &A_top, seq_output,
                          par_output, seq_, my_seq_timing, min_seq_timing,
                          max_seq_timing, contiguous_cache_blocks);
          task::spawn_root_and_wait (root_task);
        } catch (tbb::captured_exception& ex) {
          // TBB can't guarantee on all systems that an exception
          // thrown in another thread will have its type correctly
          // propagated to this thread.  If it can't, then it captures
          // the exception as a tbb::captured_exception, and propagates
          // it to here.  It may be able to propagate the exception,
          // though, so be prepared for that.  We deal with the latter
          // case by allowing the exception to propagate.
          std::ostringstream os;
          os << "Intel TBB caught an exception, while computing the QR "
            "factorization of a matrix A.  Unfortunately, its type "
            "information was lost, because the exception was thrown in "
            "another thread.  Its \"what()\" function returns the "
            "following string: " << ex.what();
          throw std::runtime_error (os.str());
        }

        // Copy the R factor out of A_top into R.
        seq_.extract_R (A_top.nrows(), A_top.ncols(), A_top.get(),
                        A_top.lda(), R, ldr, contiguous_cache_blocks);

        // Save the timings for future reference.
        if (min_seq_timing < min_seq_factor_timing_)
          min_seq_factor_timing_ = min_seq_timing;
        if (max_seq_timing > max_seq_factor_timing_)
          max_seq_factor_timing_ = max_seq_timing;

        return std::make_pair (seq_output, par_output);
      }
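
      // Example of calling factor() (an illustrative sketch only, not
      // part of this file; nrows, ncols, A, lda, R, ldr, and
      // MyTimerType are hypothetical caller-side names):
      //
      //   typedef TbbParallelTsqr<int, double, MyTimerType> tsqr_type;
      //   tsqr_type tsqr (4); // use 4 TBB tasks
      //   // Factor A in place.  On return, A holds part of the
      //   // implicit Q representation and R holds the R factor.
      //   tsqr_type::FactorOutput result =
      //     tsqr.factor (nrows, ncols, A, lda, R, ldr, false);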

      /// Apply the implicit Q factor (stored in Q and factor_output by
      /// factor()) to the nrows by ncols_C matrix C, in place.  Only
      /// the non-transposed apply is implemented.
      void
      apply (const ApplyType& apply_type,
             const LocalOrdinal nrows,
             const LocalOrdinal ncols_Q,
             const Scalar Q[],
             const LocalOrdinal ldq,
             const FactorOutput& factor_output,
             const LocalOrdinal ncols_C,
             Scalar C[],
             const LocalOrdinal ldc,
             const bool contiguous_cache_blocks) const
      {
        using tbb::task;

        if (apply_type.transposed())
          throw std::logic_error ("Applying Q^T and Q^H not implemented");

        const_mat_view Q_view (nrows, ncols_Q, Q, ldq);
        mat_view C_view (nrows, ncols_C, C, ldc);
        if (! apply_type.transposed())
          {
            array_top_blocks_t top_blocks (ntasks());
            build_partition_array (0, ntasks()-1, top_blocks, Q_view,
                                   C_view, contiguous_cache_blocks);
            double my_seq_timing = 0.0;
            double min_seq_timing = 0.0;
            double max_seq_timing = 0.0;
            try {
              typedef ApplyTask<LocalOrdinal, Scalar, TimerType> apply_task_t;
              apply_task_t& root_task =
                *new( task::allocate_root() )
                apply_task_t (0, ntasks()-1, Q_view, C_view, top_blocks,
                              factor_output, seq_, my_seq_timing,
                              min_seq_timing, max_seq_timing,
                              contiguous_cache_blocks);
              task::spawn_root_and_wait (root_task);
            } catch (tbb::captured_exception& ex) {
              std::ostringstream os;
              os << "Intel TBB caught an exception, while applying a Q "
                "factor computed previously by factor() to the matrix C.  "
                "Unfortunately, its type information was lost, because the "
                "exception was thrown in another thread.  Its \"what()\" "
                "function returns the following string: " << ex.what();
              throw std::runtime_error (os.str());
            }

            // Save the timings for future reference.
            if (min_seq_timing < min_seq_apply_timing_)
              min_seq_apply_timing_ = min_seq_timing;
            if (max_seq_timing > max_seq_apply_timing_)
              max_seq_apply_timing_ = max_seq_timing;
          }
      }

      /// Compute the explicit Q factor from the implicit
      /// representation (Q_in, factor_output) returned by factor(),
      /// storing the result in the nrows by ncols_Q_out matrix Q_out.
      void
      explicit_Q (const LocalOrdinal nrows,
                  const LocalOrdinal ncols_Q_in,
                  const Scalar Q_in[],
                  const LocalOrdinal ldq_in,
                  const FactorOutput& factor_output,
                  const LocalOrdinal ncols_Q_out,
                  Scalar Q_out[],
                  const LocalOrdinal ldq_out,
                  const bool contiguous_cache_blocks) const
      {
        using tbb::task;

        mat_view Q_out_view (nrows, ncols_Q_out, Q_out, ldq_out);
        try {
          typedef ExplicitQTask<LocalOrdinal, Scalar> explicit_Q_task_t;
          explicit_Q_task_t& root_task = *new( task::allocate_root() )
            explicit_Q_task_t (0, ntasks()-1, Q_out_view, seq_,
                               contiguous_cache_blocks);
          task::spawn_root_and_wait (root_task);
        } catch (tbb::captured_exception& ex) {
          std::ostringstream os;
          os << "Intel TBB caught an exception, while preparing to compute "
            "the explicit Q factor from a QR factorization computed "
            "previously by factor().  Unfortunately, its type information "
            "was lost, because the exception was thrown in another thread.  "
            "Its \"what()\" function returns the following string: "
             << ex.what();
          throw std::runtime_error (os.str());
        }
        apply (ApplyType::NoTranspose,
               nrows, ncols_Q_in, Q_in, ldq_in, factor_output,
               ncols_Q_out, Q_out, ldq_out,
               contiguous_cache_blocks);
      }
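
      // Example of computing the explicit Q factor (an illustrative
      // sketch only, continuing the factor() sketch above; Q_out and
      // ldq_out are hypothetical caller-side names):
      //
      //   // Expand the implicit representation (A, result) into an
      //   // explicit nrows x ncols orthogonal factor Q_out.
      //   tsqr.explicit_Q (nrows, ncols, A, lda, result,
      //                    ncols, Q_out, ldq_out, false);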

      /// Compute Q := Q*B in place, where Q is nrows by ncols and B is
      /// ncols by ncols, respecting Q's cache block layout.
      void
      Q_times_B (const LocalOrdinal nrows,
                 const LocalOrdinal ncols,
                 Scalar Q[],
                 const LocalOrdinal ldq,
                 const Scalar B[],
                 const LocalOrdinal ldb,
                 const bool contiguous_cache_blocks) const
      {
        // Compute Q := Q*B in parallel.  This works much like
        // cache_block() (see below), in that each thread's instance
        // does not need to communicate with the others.
        try {
          using tbb::task;
          typedef RevealRankTask<LocalOrdinal, Scalar> rrtask_type;

          mat_view Q_view (nrows, ncols, Q, ldq);
          const_mat_view B_view (ncols, ncols, B, ldb);

          rrtask_type& root_task = *new( task::allocate_root() )
            rrtask_type (0, ntasks()-1, Q_view, B_view, seq_,
                         contiguous_cache_blocks);
          task::spawn_root_and_wait (root_task);
        } catch (tbb::captured_exception& ex) {
          std::ostringstream os;
          os << "Intel TBB caught an exception, while computing Q := Q*U.  "
            "Unfortunately, its type information was lost, because the "
            "exception was thrown in another thread.  Its \"what()\" function "
            "returns the following string: " << ex.what();
          throw std::runtime_error (os.str());
        }
      }

      /// Compute the numerical rank of the ncols by ncols matrix R,
      /// with respect to the relative tolerance tol.  This just calls
      /// the sequential implementation, which also modifies R and
      /// fills in U (the left singular vectors of R).
      LocalOrdinal
      reveal_R_rank (const LocalOrdinal ncols,
                     Scalar R[],
                     const LocalOrdinal ldr,
                     Scalar U[],
                     const LocalOrdinal ldu,
                     const magnitude_type tol) const
      {
        return seq_.reveal_R_rank (ncols, R, ldr, U, ldu, tol);
      }

      /// Compute the numerical rank of R with respect to the relative
      /// tolerance tol.  If R is rank deficient, also overwrite Q with
      /// Q*U, where U holds the left singular vectors of R.
      LocalOrdinal
      reveal_rank (const LocalOrdinal nrows,
                   const LocalOrdinal ncols,
                   Scalar Q[],
                   const LocalOrdinal ldq,
                   Scalar R[],
                   const LocalOrdinal ldr,
                   const magnitude_type tol,
                   const bool contiguous_cache_blocks = false) const
      {
        // Take the easy exit if available.
        if (ncols == 0)
          return 0;

        Matrix<LocalOrdinal, Scalar> U (ncols, ncols, Scalar(0));
        const LocalOrdinal rank =
          reveal_R_rank (ncols, R, ldr, U.get(), U.lda(), tol);

        if (rank < ncols)
          {
            // If R is not full rank: reveal_R_rank() already computed
            // the SVD \f$R = U \Sigma V^*\f$ of (the input) R, and
            // overwrote R with \f$\Sigma V^*\f$.  Now, we compute \f$Q
            // := Q \cdot U\f$, respecting cache blocks of Q.
            Q_times_B (nrows, ncols, Q, ldq, U.get(), U.lda(),
                       contiguous_cache_blocks);
          }
        return rank;
      }
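
      // Example of a rank check after the factorization (an
      // illustrative sketch only, continuing the sketches above; tol
      // is a hypothetical caller-chosen relative tolerance):
      //
      //   const int rank = tsqr.reveal_rank (nrows, ncols, Q_out, ldq_out,
      //                                      R, ldr, tol, false);
      //   // If rank < ncols, Q_out has been replaced by Q_out*U, where
      //   // U holds the left singular vectors of R.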

      void
      cache_block (const LocalOrdinal nrows,
                   const LocalOrdinal ncols,
                   Scalar A_out[],
                   const Scalar A_in[],
                   const LocalOrdinal lda_in) const
      {
        using tbb::task;

        const_mat_view A_in_view (nrows, ncols, A_in, lda_in);
        // A_out won't have leading dimension lda_in, but that's OK,
        // as long as all the routines are told that A_out is
        // cache-blocked.
        mat_view A_out_view (nrows, ncols, A_out, lda_in);
        try {
          typedef CacheBlockTask<LocalOrdinal, Scalar> cache_block_task_t;
          cache_block_task_t& root_task = *new( task::allocate_root() )
            cache_block_task_t (0, ntasks()-1, A_out_view, A_in_view, seq_);
          task::spawn_root_and_wait (root_task);
        } catch (tbb::captured_exception& ex) {
          std::ostringstream os;
          os << "Intel TBB caught an exception, while cache-blocking a "
            "matrix.  Unfortunately, its type information was lost, "
            "because the exception was thrown in another thread.  Its "
            "\"what()\" function returns the following string: " << ex.what();
          throw std::runtime_error (os.str());
        }
      }

      void
      un_cache_block (const LocalOrdinal nrows,
                      const LocalOrdinal ncols,
                      Scalar A_out[],
                      const LocalOrdinal lda_out,
                      const Scalar A_in[]) const
      {
        using tbb::task;

        // A_in doesn't have leading dimension lda_out, but that's OK,
        // as long as all the routines are told that A_in is
        // cache-blocked.
        const_mat_view A_in_view (nrows, ncols, A_in, lda_out);
        mat_view A_out_view (nrows, ncols, A_out, lda_out);
        try {
          typedef UnCacheBlockTask<LocalOrdinal, Scalar> un_cache_block_task_t;
          un_cache_block_task_t& root_task = *new( task::allocate_root() )
            un_cache_block_task_t (0, ntasks()-1, A_out_view, A_in_view, seq_);
          task::spawn_root_and_wait (root_task);
        } catch (tbb::captured_exception& ex) {
          std::ostringstream os;
          os << "Intel TBB caught an exception, while un-cache-blocking a "
            "matrix.  Unfortunately, its type information was lost, "
            "because the exception was thrown in another thread.  Its "
            "\"what()\" function returns the following string: " << ex.what();
          throw std::runtime_error (os.str());
        }
      }

      template<class MatrixViewType>
      MatrixViewType
      top_block (const MatrixViewType& C,
                 const bool contiguous_cache_blocks = false) const
      {
        return top_block_helper (0, ntasks()-1, C, contiguous_cache_blocks);
      }

      void
      fill_with_zeros (const LocalOrdinal nrows,
                       const LocalOrdinal ncols,
                       Scalar C[],
                       const LocalOrdinal ldc,
                       const bool contiguous_cache_blocks) const
      {
        using tbb::task;
        mat_view C_view (nrows, ncols, C, ldc);

        try {
          typedef FillWithZerosTask<LocalOrdinal, Scalar> fill_task_t;
          fill_task_t& root_task = *new( task::allocate_root() )
            fill_task_t (0, ntasks()-1, C_view, seq_, contiguous_cache_blocks);
          task::spawn_root_and_wait (root_task);
        } catch (tbb::captured_exception& ex) {
          std::ostringstream os;
          os << "Intel TBB caught an exception, while filling a matrix "
            "with zeros.  Unfortunately, its type information was lost, "
            "because the exception was thrown in another thread.  Its "
            "\"what()\" function returns the following string: " << ex.what();
          throw std::runtime_error (os.str());
        }
      }

    private:
      size_t numTasks_;
      TSQR::SequentialTsqr<LocalOrdinal, Scalar> seq_;
      TSQR::Combine<LocalOrdinal, Scalar> combine_;
      Partitioner<LocalOrdinal, Scalar> partitioner_;

      mutable double min_seq_factor_timing_;
      mutable double max_seq_factor_timing_;
      mutable double min_seq_apply_timing_;
      mutable double max_seq_apply_timing_;

      void
      build_partition_array (const size_t P_first,
                             const size_t P_last,
                             array_top_blocks_t& top_blocks,
                             const_mat_view& Q,
                             mat_view& C,
                             const bool contiguous_cache_blocks = false) const
      {
        if (P_first > P_last)
          return;
        else if (P_first == P_last)
          {
            const_mat_view Q_top = seq_.top_block (Q, contiguous_cache_blocks);
            mat_view C_top = seq_.top_block (C, contiguous_cache_blocks);
            top_blocks[P_first] =
              std::make_pair (const_mat_view (Q_top.ncols(), Q_top.ncols(),
                                              Q_top.get(), Q_top.lda()),
                              mat_view (C_top.ncols(), C_top.ncols(),
                                        C_top.get(), C_top.lda()));
          }
        else
          {
            // Recurse on two intervals: [P_first, P_mid] and [P_mid+1, P_last].
            const size_t P_mid = (P_first + P_last) / 2;
            const_split_t Q_split =
              partitioner_.split (Q, P_first, P_mid, P_last,
                                  contiguous_cache_blocks);
            split_t C_split =
              partitioner_.split (C, P_first, P_mid, P_last,
                                  contiguous_cache_blocks);
            // The partitioner may decide that the current blocks Q
            // and C have too few rows to be worth splitting.  (The
            // partitioner should split both Q and C in the same way.)
            // In that case, Q_split.first should be the same block as
            // Q, and Q_split.second (the bottom block) will be empty.
            // Ditto for C_split.  We deal with this in the same way
            // as the base case (P_first == P_last) above.
            if (Q_split.second.empty() || Q_split.second.nrows() == 0)
              {
                const_mat_view Q_top =
                  seq_.top_block (Q, contiguous_cache_blocks);
                mat_view C_top = seq_.top_block (C, contiguous_cache_blocks);
                top_blocks[P_first] =
                  std::make_pair (const_mat_view (Q_top.ncols(), Q_top.ncols(),
                                                  Q_top.get(), Q_top.lda()),
                                  mat_view (C_top.ncols(), C_top.ncols(),
                                            C_top.get(), C_top.lda()));
              }
            else
              {
                build_partition_array (P_first, P_mid, top_blocks,
                                       Q_split.first, C_split.first,
                                       contiguous_cache_blocks);
                build_partition_array (P_mid+1, P_last, top_blocks,
                                       Q_split.second, C_split.second,
                                       contiguous_cache_blocks);
              }
          }
      }

    };

  } // namespace TBB
} // namespace TSQR

#endif // __TSQR_TBB_TbbParallelTsqr_hpp