Anasazi Version of the Day
TbbTsqr_FactorTask.hpp
00001 // @HEADER
00002 // ***********************************************************************
00003 //
00004 //                 Anasazi: Block Eigensolvers Package
00005 //                 Copyright (2010) Sandia Corporation
00006 //
00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
00008 // license for use of this work by or on behalf of the U.S. Government.
00009 //
00010 // This library is free software; you can redistribute it and/or modify
00011 // it under the terms of the GNU Lesser General Public License as
00012 // published by the Free Software Foundation; either version 2.1 of the
00013 // License, or (at your option) any later version.
00014 //
00015 // This library is distributed in the hope that it will be useful, but
00016 // WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 // Lesser General Public License for more details.
00019 //
00020 // You should have received a copy of the GNU Lesser General Public
00021 // License along with this library; if not, write to the Free Software
00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00023 // USA
00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
00025 //
00026 // ***********************************************************************
00027 // @HEADER
00028 
00029 #ifndef __TSQR_TBB_FactorTask_hpp
00030 #define __TSQR_TBB_FactorTask_hpp
00031 
00032 #include <tbb/task.h>
00033 #include <TbbTsqr_Partitioner.hpp>
00034 #include <Tsqr_SequentialTsqr.hpp>
00035 #include <algorithm>
00036 
00039 
00040 namespace TSQR {
00041   namespace TBB {
00042     
00043     template< class LocalOrdinal, class Scalar, class TimerType >
00044     class FactorTask : public tbb::task {
00045     public:
00046       typedef MatView< LocalOrdinal, Scalar > mat_view;
00047       typedef ConstMatView< LocalOrdinal, Scalar > const_mat_view;
00048       typedef std::pair< mat_view, mat_view > split_t;
00049       typedef std::pair< const_mat_view, const_mat_view > const_split_t;
00050 
00052       typedef typename SequentialTsqr< LocalOrdinal, Scalar >::FactorOutput SeqOutput;
00055       typedef std::vector< std::vector< Scalar > > ParOutput;
00059       typedef typename std::pair< std::vector< SeqOutput >, ParOutput > FactorOutput;
00060 
00061       FactorTask (const size_t P_first__, 
00062       const size_t P_last__,
00063       mat_view A,
00064       mat_view* const A_top_ptr,
00065       std::vector< SeqOutput >& seq_outputs,
00066       ParOutput& par_output,
00067       const SequentialTsqr< LocalOrdinal, Scalar > seq,
00068       double& my_seq_timing,
00069       double& min_seq_timing,
00070       double& max_seq_timing,
00071       const bool contiguous_cache_blocks) :
00072   P_first_ (P_first__),
00073   P_last_ (P_last__),
00074   A_ (A),
00075   A_top_ptr_ (A_top_ptr),
00076   seq_outputs_ (seq_outputs),
00077   par_output_ (par_output),
00078   seq_ (seq),
00079   contiguous_cache_blocks_ (contiguous_cache_blocks),
00080   my_seq_timing_ (my_seq_timing),
00081   min_seq_timing_ (min_seq_timing),
00082   max_seq_timing_ (max_seq_timing)
00083       {}
00084 
00085       tbb::task* execute () {
00086   if (P_first_ > P_last_ || A_.empty())
00087     return NULL;
00088   else if (P_first_ == P_last_)
00089     {
00090       execute_base_case ();
00091       return NULL;
00092     }
00093   else
00094     {
00095       // Recurse on two intervals: [P_first, P_mid] and [P_mid+1, P_last]
00096       const size_t P_mid = (P_first_ + P_last_) / 2;
00097       split_t A_split = 
00098         partitioner_.split (A_, P_first_, P_mid, P_last_,
00099           contiguous_cache_blocks_);
00100 
00101       double top_timing;
00102       double top_min_timing = 0.0;
00103       double top_max_timing = 0.0;
00104       double bot_timing;
00105       double bot_min_timing = 0.0;
00106       double bot_max_timing = 0.0;
00107 
00108       FactorTask& topTask = *new( allocate_child() )
00109         FactorTask (P_first_, P_mid, A_split.first, A_top_ptr_, 
00110         seq_outputs_, par_output_, seq_,
00111         top_timing, top_min_timing, top_max_timing,
00112         contiguous_cache_blocks_);
00113       // After the task finishes, A_bot will be set to the topmost
00114       // partition of A_split.second.  This will let us combine
00115       // the two subproblems (using factor_pair()) after their
00116       // tasks complete.
00117       mat_view A_bot;
00118       FactorTask& botTask = *new( allocate_child() )
00119         FactorTask (P_mid+1, P_last_, A_split.second, &A_bot, 
00120         seq_outputs_, par_output_, seq_,
00121         bot_timing, bot_min_timing, bot_max_timing,
00122         contiguous_cache_blocks_);
00123       set_ref_count (3); // 3 children (2 + 1 for the wait)
00124       spawn (topTask);
00125       spawn_and_wait_for_all (botTask);
00126       
00127       // Combine the two results
00128       factor_pair (P_first_, P_mid+1, *A_top_ptr_, A_bot);
00129 
00130       top_min_timing = (top_min_timing == 0.0) ? top_timing : top_min_timing;
00131       top_max_timing = (top_max_timing == 0.0) ? top_timing : top_max_timing;
00132 
00133       bot_min_timing = (bot_min_timing == 0.0) ? bot_timing : bot_min_timing;
00134       bot_max_timing = (bot_max_timing == 0.0) ? bot_timing : bot_max_timing;
00135 
00136       min_seq_timing_ = std::min (top_min_timing, bot_min_timing);
00137       max_seq_timing_ = std::min (top_max_timing, bot_max_timing);
00138 
00139       return NULL;
00140     }
00141       }
00142 
00143     private:
00144       const size_t P_first_, P_last_;
00145       mat_view A_;
00146       mat_view* const A_top_ptr_;
00147       std::vector< SeqOutput >& seq_outputs_;
00148       ParOutput& par_output_;
00149       SequentialTsqr< LocalOrdinal, Scalar > seq_;
00150       TSQR::Combine< LocalOrdinal, Scalar > combine_;
00151       Partitioner< LocalOrdinal, Scalar > partitioner_;
00152       const bool contiguous_cache_blocks_;
00153       double& my_seq_timing_;
00154       double& min_seq_timing_;
00155       double& max_seq_timing_;
00156 
00157       void 
00158       factor_pair (const size_t P_top,
00159        const size_t P_bot,
00160        mat_view& A_top, // different than A_top_
00161        mat_view& A_bot)
00162       {
00163   if (P_top == P_bot) 
00164     throw std::logic_error("factor_pair: should never get here!");
00165 
00166   // We only read and write the upper ncols x ncols triangle of
00167   // each block.
00168   const LocalOrdinal ncols = A_top.ncols();
00169   if (A_bot.ncols() != ncols)
00170     throw std::logic_error("A_bot.ncols() != A_top.ncols()");
00171   
00172   std::vector< Scalar >& tau = par_output_[P_bot];
00173   std::vector< Scalar > work (ncols);
00174   combine_.factor_pair (ncols, A_top.get(), A_top.lda(),
00175             A_bot.get(), A_bot.lda(), &tau[0], &work[0]);
00176       }
00177 
00178       void
00179       execute_base_case () 
00180       {
00181   TimerType timer("");
00182   timer.start();
00183   seq_outputs_[P_first_] = 
00184     seq_.factor (A_.nrows(), A_.ncols(), A_.get(), 
00185            A_.lda(), contiguous_cache_blocks_);
00186   // Assign the topmost cache block of the current partition to
00187   // *A_top_ptr_.  Every base case invocation does this, so that
00188   // we can combine subproblems.  The root task also does this,
00189   // but for a different reason: so that we can extract the R
00190   // factor, once we're done with the factorization.
00191   *A_top_ptr_ = seq_.top_block (A_, contiguous_cache_blocks_);
00192   my_seq_timing_ = timer.stop();
00193       }
00194     };
00195   } // namespace TBB
00196 } // namespace TSQR
00197 
00198 #endif // __TSQR_TBB_FactorTask_hpp
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends