Thyra_SpmdMultiVectorBase.hpp

00001 // @HEADER
00002 // ***********************************************************************
00003 // 
00004 //    Thyra: Interfaces and Support for Abstract Numerical Algorithms
00005 //                 Copyright (2004) Sandia Corporation
00006 // 
00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
00008 // license for use of this work by or on behalf of the U.S. Government.
00009 // 
00010 // This library is free software; you can redistribute it and/or modify
00011 // it under the terms of the GNU Lesser General Public License as
00012 // published by the Free Software Foundation; either version 2.1 of the
00013 // License, or (at your option) any later version.
00014 //  
00015 // This library is distributed in the hope that it will be useful, but
00016 // WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 // Lesser General Public License for more details.
00019 //  
00020 // You should have received a copy of the GNU Lesser General Public
00021 // License along with this library; if not, write to the Free Software
00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00023 // USA
00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
00025 // 
00026 // ***********************************************************************
00027 // @HEADER
00028 
00029 #ifndef THYRA_SPMD_MULTI_VECTOR_BASE_HPP
00030 #define THYRA_SPMD_MULTI_VECTOR_BASE_HPP
00031 
00032 
00033 #include "Thyra_SpmdMultiVectorBaseDecl.hpp"
00034 #include "Thyra_MultiVectorDefaultBase.hpp"
00035 #include "Thyra_SingleScalarEuclideanLinearOpBase.hpp"
00036 #include "Thyra_SpmdVectorSpaceDefaultBase.hpp"
00037 #include "Thyra_DetachedMultiVectorView.hpp"
00038 #include "RTOpPack_SPMD_apply_op.hpp"
00039 #include "RTOp_parallel_helpers.h"
00040 #include "Teuchos_Workspace.hpp"
00041 #include "Teuchos_dyn_cast.hpp"
00042 #include "Teuchos_Time.hpp"
00043 #include "Teuchos_CommHelpers.hpp"
00044 
00045 
00046 // Define to see some timing output!
00047 //#define THYRA_SPMD_MULTI_VECTOR_BASE_PRINT_TIMES
00048 
00049 
00050 namespace Thyra {
00051 
00052 
00053 template<class Scalar>
00054 SpmdMultiVectorBase<Scalar>::SpmdMultiVectorBase()
00055   :in_applyOp_(false)
00056   ,globalDim_(0)
00057   ,localOffset_(-1)
00058   ,localSubDim_(0)
00059   ,numCols_(0)
00060   ,nonconstLocalValuesViewPtr_(0)
00061   ,localValuesViewPtr_(0)
00062 {}
00063 
00064 
00065 // Overridden from EuclideanLinearOpBase
00066 
00067 
00068 template<class Scalar>
00069 Teuchos::RCP< const ScalarProdVectorSpaceBase<Scalar> >
00070 SpmdMultiVectorBase<Scalar>::rangeScalarProdVecSpc() const
00071 {
00072   return Teuchos::rcp_dynamic_cast<const ScalarProdVectorSpaceBase<Scalar> >(
00073     spmdSpace(),true
00074     );
00075 }
00076 
00077 
00078 // Overridden from LinearOpBase
00079 
00080 
00081 template<class Scalar>
00082 void SpmdMultiVectorBase<Scalar>::apply(
00083   const ETransp                     M_trans
00084   ,const MultiVectorBase<Scalar>    &X
00085   ,MultiVectorBase<Scalar>          *Y
00086   ,const Scalar                     alpha
00087   ,const Scalar                     beta
00088   ) const
00089 {
00090   this->single_scalar_euclidean_apply_impl(M_trans,X,Y,alpha,beta);
00091 }
00092 
00093 
00094 // Overridden from MultiVectorBase
00095 
00096 
00097 template<class Scalar>
00098 void SpmdMultiVectorBase<Scalar>::mvMultiReductApplyOpImpl(
00099   const RTOpPack::RTOpT<Scalar>         &pri_op
00100   ,const int                            num_multi_vecs
00101   ,const MultiVectorBase<Scalar>*const  multi_vecs[]
00102   ,const int                            num_targ_multi_vecs
00103   ,MultiVectorBase<Scalar>*const        targ_multi_vecs[]
00104   ,RTOpPack::ReductTarget*const         reduct_objs[]
00105   ,const Index                          pri_first_ele_offset_in
00106   ,const Index                          pri_sub_dim_in
00107   ,const Index                          pri_global_offset_in
00108   ,const Index                          sec_first_ele_offset_in
00109   ,const Index                          sec_sub_dim_in
00110   ) const
00111 {
00112   using Teuchos::dyn_cast;
00113   using Teuchos::Workspace;
00114   Teuchos::WorkspaceStore* wss = Teuchos::get_default_workspace_store().get();
00115   const Index numCols = this->domain()->dim();
00116   const SpmdVectorSpaceBase<Scalar> &spmdSpc = *spmdSpace();
00117 #ifdef TEUCHOS_DEBUG
00118   TEST_FOR_EXCEPTION(
00119     in_applyOp_, std::invalid_argument
00120     ,"SpmdMultiVectorBase<>::mvMultiReductApplyOpImpl(...): Error, this method is being entered recursively which is a "
00121     "clear sign that one of the methods acquireDetachedView(...), releaseDetachedView(...) or commitDetachedView(...) "
00122     "was not implemented properly!"
00123     );
00124   apply_op_validate_input(
00125     "SpmdMultiVectorBase<>::mvMultiReductApplyOpImpl(...)", *this->domain(), *this->range()
00126     ,pri_op,num_multi_vecs,multi_vecs,num_targ_multi_vecs,targ_multi_vecs
00127     ,reduct_objs,pri_first_ele_offset_in,pri_sub_dim_in,pri_global_offset_in
00128     ,sec_first_ele_offset_in,sec_sub_dim_in
00129     );
00130 #endif
00131   // Flag that we are in applyOp()
00132   in_applyOp_ = true;
00133   // First see if this is a locally replicated vector in which case
00134   // we treat this as a local operation only.
00135   const bool locallyReplicated = (localSubDim_ == globalDim_);
00136   // Get the overlap in the current process with the input logical sub-vector
00137   // from (first_ele_offset_in,sub_dim_in,global_offset_in)
00138   Teuchos_Index  overlap_first_local_ele_off  = 0;
00139   Teuchos_Index  overlap_local_sub_dim        = 0;
00140   Teuchos_Index  overlap_global_offset        = 0;
00141   RTOp_parallel_calc_overlap(
00142     globalDim_, localSubDim_, localOffset_, pri_first_ele_offset_in, pri_sub_dim_in, pri_global_offset_in
00143     ,&overlap_first_local_ele_off, &overlap_local_sub_dim, &overlap_global_offset
00144     );
00145   const Range1D
00146     local_rng = (
00147       overlap_first_local_ele_off>=0
00148       ? Range1D( localOffset_+overlap_first_local_ele_off, localOffset_+overlap_first_local_ele_off+overlap_local_sub_dim-1 )
00149       : Range1D::Invalid
00150       ),
00151     col_rng(
00152       sec_first_ele_offset_in
00153       ,sec_sub_dim_in >= 0 ? sec_first_ele_offset_in+sec_sub_dim_in-1 : numCols-1
00154       );
00155   // Create sub-vector views of all of the *participating* local data
00156   Workspace<RTOpPack::ConstSubMultiVectorView<Scalar> > sub_multi_vecs(wss,num_multi_vecs);
00157   Workspace<RTOpPack::SubMultiVectorView<Scalar> > targ_sub_multi_vecs(wss,num_targ_multi_vecs);
00158   if( overlap_first_local_ele_off >= 0 ) {
00159     for(int k = 0; k < num_multi_vecs; ++k ) {
00160       multi_vecs[k]->acquireDetachedView( local_rng, col_rng, &sub_multi_vecs[k] );
00161       sub_multi_vecs[k].setGlobalOffset( overlap_global_offset );
00162     }
00163     for(int k = 0; k < num_targ_multi_vecs; ++k ) {
00164       targ_multi_vecs[k]->acquireDetachedView( local_rng, col_rng, &targ_sub_multi_vecs[k] );
00165       targ_sub_multi_vecs[k].setGlobalOffset( overlap_global_offset );
00166     }
00167   }
00168   // Apply the RTOp operator object (all processors must participate)
00169   RTOpPack::SPMD_apply_op(
00170     locallyReplicated ? NULL : spmdSpc.getComm().get()                                 // comm
00171     ,pri_op                                                                            // op
00172     ,col_rng.size()                                                                    // num_cols
00173     ,num_multi_vecs                                                                    // num_multi_vecs
00174     ,num_multi_vecs && overlap_first_local_ele_off>=0 ? &sub_multi_vecs[0] : NULL      // sub_multi_vecs
00175     ,num_targ_multi_vecs                                                               // num_targ_multi_vecs
00176     ,num_targ_multi_vecs && overlap_first_local_ele_off>=0 ? &targ_sub_multi_vecs[0] : NULL// targ_sub_multi_vecs
00177     ,reduct_objs                                                                       // reduct_objs
00178     );
00179   // Free and commit the local data
00180   if( overlap_first_local_ele_off >= 0 ) {
00181     for(int k = 0; k < num_multi_vecs; ++k ) {
00182       sub_multi_vecs[k].setGlobalOffset(local_rng.lbound());
00183       multi_vecs[k]->releaseDetachedView( &sub_multi_vecs[k] );
00184     }
00185     for(int k = 0; k < num_targ_multi_vecs; ++k ) {
00186       targ_sub_multi_vecs[k].setGlobalOffset(local_rng.lbound());
00187       targ_multi_vecs[k]->commitDetachedView( &targ_sub_multi_vecs[k] );
00188     }
00189   }
00190   // Flag that we are leaving applyOp()
00191   in_applyOp_ = false;
00192 }
00193 
00194 
00195 template<class Scalar>
00196 void SpmdMultiVectorBase<Scalar>::acquireDetachedMultiVectorViewImpl(
00197   const Range1D &rowRng_in,
00198   const Range1D &colRng_in,
00199   RTOpPack::ConstSubMultiVectorView<Scalar>  *sub_mv
00200   ) const
00201 {
00202   const Range1D rowRng = validateRowRange(rowRng_in);
00203   const Range1D colRng = validateColRange(colRng_in);
00204   if( rowRng.lbound() < localOffset_ || localOffset_+localSubDim_-1 < rowRng.ubound() ) {
00205     // rng consists of off-processor elements so use the default implementation!
00206     MultiVectorDefaultBase<Scalar>::acquireDetachedMultiVectorViewImpl(
00207       rowRng_in,colRng_in,sub_mv
00208       );
00209     return;
00210   }
00211   /*
00212   if (localValuesViewPtr_) {
00213     freeLocalData( localValuesViewPtr_ );
00214     localValuesViewPtr_ = 0;
00215   }
00216   */
00217   // 2007/06/08: rabartl: ABove, this logic for this is all wrong when partial
00218   // views are requested.  Therefore, I must assume here that these are always
00219   // direct views.  The problem is that client code can ask for one column
00220   // view at a time which is perfectly okay.  However, the current way this is
00221   // setup does not handle this well.  This all needs to be reworked to clean
00222   // this up.
00223   const Scalar *localValues = NULL;
00224   int leadingDim = 0;
00225   this->getLocalData(&localValues,&leadingDim);
00226   localValuesViewPtr_ = localValues;
00227   sub_mv->initialize(
00228     rowRng.lbound()                               // globalOffset
00229     ,rowRng.size()                                // subDim
00230     ,colRng.lbound()                              // colOffset
00231     ,colRng.size()                                // numSubCols
00232     ,localValues
00233     +(rowRng.lbound()-localOffset_)
00234     +colRng.lbound()*leadingDim                   // values
00235     ,leadingDim                                   // leadingDim
00236     );
00237 }
00238 
00239 
00240 template<class Scalar>
00241 void SpmdMultiVectorBase<Scalar>::releaseDetachedMultiVectorViewImpl(
00242   RTOpPack::ConstSubMultiVectorView<Scalar>* sub_mv
00243   ) const
00244 {
00245   if(
00246     sub_mv->globalOffset() < localOffset_ 
00247     ||
00248     localOffset_+localSubDim_ < sub_mv->globalOffset()+sub_mv->subDim()
00249     )
00250   {
00251     // Let the default implementation handle it!
00252     MultiVectorDefaultBase<Scalar>::releaseDetachedMultiVectorViewImpl(sub_mv);
00253     return;
00254   }
00255   /*
00256 #ifdef TEUCHOS_DEBUG
00257   TEST_FOR_EXCEPT( localValuesViewPtr_ == 0 );
00258 #endif
00259   freeLocalData( localValuesViewPtr_ );
00260   localValuesViewPtr_ = 0;
00261   */
00262   // 2007/06/08: rabartl: See comment in acquireDetachedView(...) above!
00263   sub_mv->set_uninitialized();
00264 }
00265 
00266 
00267 template<class Scalar>
00268 void SpmdMultiVectorBase<Scalar>::acquireNonconstDetachedMultiVectorViewImpl(
00269   const Range1D &rowRng_in,
00270   const Range1D &colRng_in,
00271   RTOpPack::SubMultiVectorView<Scalar> *sub_mv
00272   )
00273 {
00274   const Range1D rowRng = validateRowRange(rowRng_in);
00275   const Range1D colRng = validateColRange(colRng_in);
00276   if(
00277     rowRng.lbound() < localOffset_
00278     ||
00279     localOffset_+localSubDim_-1 < rowRng.ubound()
00280     )
00281   {
00282     // rng consists of off-processor elements so use the default implementation!
00283     MultiVectorDefaultBase<Scalar>::acquireNonconstDetachedMultiVectorViewImpl(
00284       rowRng_in, colRng_in, sub_mv
00285       );
00286     return;
00287   }
00288   // rng consists of all local data so get it!
00289   /*
00290   if (nonconstLocalValuesViewPtr_) {
00291     commitLocalData( nonconstLocalValuesViewPtr_ );
00292     nonconstLocalValuesViewPtr_ = 0;
00293   }
00294   */
00295   // 2007/06/08: rabartl: See comment in acquireDetachedView(...) above!
00296   Scalar *localValues = NULL;
00297   int leadingDim = 0;
00298   this->getLocalData(&localValues,&leadingDim);
00299   nonconstLocalValuesViewPtr_ = localValues;
00300   sub_mv->initialize(
00301     rowRng.lbound()                               // globalOffset
00302     ,rowRng.size()                                // subDim
00303     ,colRng.lbound()                              // colOffset
00304     ,colRng.size()                                // numSubCols
00305     ,localValues
00306     +(rowRng.lbound()-localOffset_)
00307     +colRng.lbound()*leadingDim                   // values
00308     ,leadingDim                                   // leadingDim
00309     );
00310 }
00311 
00312 
00313 template<class Scalar>
00314 void SpmdMultiVectorBase<Scalar>::commitNonconstDetachedMultiVectorViewImpl(
00315   RTOpPack::SubMultiVectorView<Scalar>* sub_mv
00316   )
00317 {
00318   if(
00319     sub_mv->globalOffset() < localOffset_
00320     ||
00321     localOffset_+localSubDim_ < sub_mv->globalOffset()+sub_mv->subDim()
00322     )
00323   {
00324     // Let the default implementation handle it!
00325     MultiVectorDefaultBase<Scalar>::commitNonconstDetachedMultiVectorViewImpl(sub_mv);
00326     return;
00327   }
00328   /*
00329 #ifdef TEUCHOS_DEBUG
00330   TEST_FOR_EXCEPT( nonconstLocalValuesViewPtr_ == 0 );
00331 #endif
00332   commitLocalData( nonconstLocalValuesViewPtr_ );
00333   nonconstLocalValuesViewPtr_ = 0;
00334   */
00335   // 2007/06/08: rabartl: See comment in acquireDetachedView(...) above!
00336   sub_mv->set_uninitialized();
00337 }
00338 
00339 
00340 // protected
00341 
00342 
00343 template<class Scalar>
00344 void SpmdMultiVectorBase<Scalar>::euclideanApply(
00345   const ETransp                     M_trans
00346   ,const MultiVectorBase<Scalar>    &X
00347   ,MultiVectorBase<Scalar>          *Y
00348   ,const Scalar                     alpha
00349   ,const Scalar                     beta
00350   ) const
00351 {
00352   typedef Teuchos::ScalarTraits<Scalar> ST;
00353   using Teuchos::Workspace;
00354   Teuchos::WorkspaceStore* wss = Teuchos::get_default_workspace_store().get();
00355 
00356 #ifdef THYRA_SPMD_MULTI_VECTOR_BASE_PRINT_TIMES
00357   Teuchos::Time timerTotal("dummy",true);
00358   Teuchos::Time timer("dummy");
00359 #endif
00360 
00361   //
00362   // This function performs one of two operations.
00363   //
00364   // The first operation (M_trans == NOTRANS) is:
00365   //
00366   //     Y = beta * Y + alpha * M * X
00367   //
00368   // where Y and M have compatible (distributed?) range vector
00369   // spaces and X is a locally replicated serial multi-vector.  This
00370   // operation does not require any global communication.
00371   //
00372   // The second operation (M_trans == TRANS) is:
00373   //
00374   //     Y = beta * Y + alpha * M' * X
00375   //
00376   // where M and X have compatible (distributed?) range vector spaces
00377   // and Y is a locally replicated serial multi-vector.  This operation
00378   // requires a local reduction.
00379   //
00380 
00381   //
00382   // Get spaces and validate compatibility
00383   //
00384 
00385   // Get the SpmdVectorSpace
00386   const SpmdVectorSpaceBase<Scalar> &spmdSpc = *this->spmdSpace();
00387 
00388   // Get the Spmd communicator
00389   const Teuchos::RCP<const Teuchos::Comm<Index> >
00390     comm = spmdSpc.getComm();
00391 #ifdef TEUCHOS_DEBUG
00392   const VectorSpaceBase<Scalar>
00393     &Y_range = *Y->range(),
00394     &X_range = *X.range();
00395 //  std::cout << "SpmdMultiVectorBase<Scalar>::apply(...): comm = " << comm << std::endl;
00396   TEST_FOR_EXCEPTION(
00397     ( globalDim_ > localSubDim_ ) && comm.get()==NULL, std::logic_error
00398     ,"SpmdMultiVectorBase<Scalar>::apply(...MultiVectorBase<Scalar>...): Error!"
00399     );
00400   // ToDo: Write a good general validation function that I can call that will replace
00401   // all of these TEST_FOR_EXCEPTION(...) uses
00402 
00403   TEST_FOR_EXCEPTION(
00404     real_trans(M_trans)==NOTRANS && !spmdSpc.isCompatible(Y_range), Exceptions::IncompatibleVectorSpaces
00405     ,"SpmdMultiVectorBase<Scalar>::apply(...MultiVectorBase<Scalar>...): Error!"
00406     );
00407   TEST_FOR_EXCEPTION(
00408     real_trans(M_trans)==TRANS && !spmdSpc.isCompatible(X_range), Exceptions::IncompatibleVectorSpaces
00409     ,"SpmdMultiVectorBase<Scalar>::apply(...MultiVectorBase<Scalar>...): Error!"
00410     );
00411 #endif
00412 
00413   //
00414   // Get explicit (local) views of Y, M and X
00415   //
00416 
00417 #ifdef THYRA_SPMD_MULTI_VECTOR_BASE_PRINT_TIMES
00418   timer.start();
00419 #endif
00420     
00421   DetachedMultiVectorView<Scalar>
00422     Y_local(
00423       *Y
00424       ,real_trans(M_trans)==NOTRANS ? Range1D(localOffset_,localOffset_+localSubDim_-1) : Range1D()
00425       ,Range1D()
00426       );
00427   ConstDetachedMultiVectorView<Scalar>
00428     M_local(
00429       *this
00430       ,Range1D(localOffset_,localOffset_+localSubDim_-1)
00431       ,Range1D()
00432       );
00433   ConstDetachedMultiVectorView<Scalar>
00434     X_local(
00435       X
00436       ,real_trans(M_trans)==NOTRANS ? Range1D() : Range1D(localOffset_,localOffset_+localSubDim_-1)
00437       ,Range1D()
00438       );
00439 #ifdef THYRA_SPMD_MULTI_VECTOR_BASE_PRINT_TIMES
00440   timer.stop();
00441   std::cout << "\nSpmdMultiVectorBase<Scalar>::apply(...): Time for getting view = " << timer.totalElapsedTime() << " seconds\n";
00442 #endif
00443 #ifdef TEUCHOS_DEBUG    
00444   TEST_FOR_EXCEPTION(
00445     real_trans(M_trans)==NOTRANS && ( M_local.numSubCols() != X_local.subDim() || X_local.numSubCols() != Y_local.numSubCols() )
00446     , Exceptions::IncompatibleVectorSpaces
00447     ,"SpmdMultiVectorBase<Scalar>::apply(...MultiVectorBase<Scalar>...): Error!"
00448     );
00449   TEST_FOR_EXCEPTION(
00450     real_trans(M_trans)==TRANS && ( M_local.subDim() != X_local.subDim() || X_local.numSubCols() != Y_local.numSubCols() )
00451     , Exceptions::IncompatibleVectorSpaces
00452     ,"SpmdMultiVectorBase<Scalar>::apply(...MultiVectorBase<Scalar>...): Error!"
00453     );
00454 #endif
00455 
00456   //
00457   // If nonlocal (i.e. M_trans==TRANS) then create temporary storage
00458   // for:
00459   //
00460   //     Y_local_tmp = alpha * M(local) * X(local)                 : on nonroot processes
00461   //
00462   // or
00463   //
00464   //     Y_local_tmp = beta*Y_local + alpha * M(local) * X(local)  : on root process (localOffset_==0)
00465   // 
00466   // and set
00467   //
00468   //     localBeta = ( localOffset_ == 0 ? beta : 0.0 )
00469   //
00470   // Above, we choose localBeta such that we will only perform
00471   // Y_local = beta * Y_local + ...  on one process (the root
00472   // process where localOffset_==0x).  Then, when we add up Y_local
00473   // on all of the processors and we will get the correct result.
00474   //
00475   // If strictly local (i.e. M_trans == NOTRANS) then set:
00476   //
00477   //      Y_local_tmp = Y_local
00478   //      localBeta = beta
00479   //
00480 
00481 #ifdef THYRA_SPMD_MULTI_VECTOR_BASE_PRINT_TIMES
00482   timer.start();
00483 #endif
00484     
00485   Workspace<Scalar> Y_local_tmp_store(wss,Y_local.subDim()*Y_local.numSubCols(),false);
00486   RTOpPack::SubMultiVectorView<Scalar> Y_local_tmp;
00487   Scalar localBeta;
00488   if( real_trans(M_trans) == TRANS && globalDim_ > localSubDim_ ) {
00489     // Nonlocal
00490     Y_local_tmp.initialize(
00491       0, Y_local.subDim()
00492       ,0, Y_local.numSubCols()
00493       ,&Y_local_tmp_store[0], Y_local.subDim() // leadingDim == subDim (columns are adjacent)
00494       );
00495     if( localOffset_ == 0 ) {
00496       // Root process: Must copy Y_local into Y_local_tmp
00497       for( int j = 0; j < Y_local.numSubCols(); ++j ) {
00498         Scalar *Y_local_j = Y_local.values() + Y_local.leadingDim()*j;
00499         std::copy( Y_local_j, Y_local_j + Y_local.subDim(), Y_local_tmp.values() + Y_local_tmp.leadingDim()*j );
00500       }
00501       localBeta = beta;
00502     }
00503     else {
00504       // Not the root process
00505       localBeta = 0.0;
00506     }
00507   }
00508   else {
00509     // Local
00510     Y_local_tmp = Y_local.smv(); // Shallow copy only!
00511     localBeta = beta;
00512   }
00513 
00514 #ifdef THYRA_SPMD_MULTI_VECTOR_BASE_PRINT_TIMES
00515   timer.stop();
00516   std::cout << "\nSpmdMultiVectorBase<Scalar>::apply(...): Time for setting up Y_local_tmp and localBeta = " << timer.totalElapsedTime() << " seconds\n";
00517 #endif
00518     
00519   //
00520   // Perform the local multiplication:
00521   //
00522   //     Y(local) = localBeta * Y(local) + alpha * op(M(local)) * X(local)
00523   //
00524   // or in BLAS lingo:
00525   //
00526   //     C        = beta      * C        + alpha * op(A)        * op(B)
00527   //
00528 
00529 #ifdef THYRA_SPMD_MULTI_VECTOR_BASE_PRINT_TIMES
00530   timer.start();
00531 #endif
00532   Teuchos::ETransp t_transp;
00533   if(ST::isComplex) {
00534     switch(M_trans) {
00535       case NOTRANS:   t_transp = Teuchos::NO_TRANS;     break;
00536       case TRANS:     t_transp = Teuchos::TRANS;        break;
00537       case CONJTRANS: t_transp = Teuchos::CONJ_TRANS;   break;
00538       default: TEST_FOR_EXCEPT(true);
00539     }
00540   }
00541   else {
00542     switch(real_trans(M_trans)) {
00543       case NOTRANS:   t_transp = Teuchos::NO_TRANS;     break;
00544       case TRANS:     t_transp = Teuchos::TRANS;        break;
00545       default: TEST_FOR_EXCEPT(true);
00546     }
00547   }
00548   blas_.GEMM(
00549     t_transp                                                                  // TRANSA
00550     ,Teuchos::NO_TRANS                                                        // TRANSB
00551     ,Y_local.subDim()                                                         // M
00552     ,Y_local.numSubCols()                                                     // N
00553     ,real_trans(M_trans)==NOTRANS ? M_local.numSubCols() : M_local.subDim()   // K
00554     ,alpha                                                                    // ALPHA
00555     ,const_cast<Scalar*>(M_local.values())                                    // A
00556     ,M_local.leadingDim()                                                     // LDA
00557     ,const_cast<Scalar*>(X_local.values())                                    // B
00558     ,X_local.leadingDim()                                                     // LDB
00559     ,localBeta                                                                // BETA
00560     ,Y_local_tmp.values()                                                     // C
00561     ,Y_local_tmp.leadingDim()                                                 // LDC
00562     );
00563 #ifdef THYRA_SPMD_MULTI_VECTOR_BASE_PRINT_TIMES
00564   timer.stop();
00565   std::cout
00566     << "\nSpmdMultiVectorBase<Scalar>::apply(...): Time for GEMM = "
00567     << timer.totalElapsedTime() << " seconds\n";
00568 #endif
00569 
00570   if( comm.get() ) {
00571     
00572     //
00573     // Perform the global reduction of Y_local_tmp back into Y_local
00574     //
00575     
00576     if( real_trans(M_trans)==TRANS && globalDim_ > localSubDim_ ) {
00577       // Contiguous buffer for final reduction
00578       Workspace<Scalar> Y_local_final_buff(wss,Y_local.subDim()*Y_local.numSubCols(),false);
00579       // Perform the reduction
00580       Teuchos::reduceAll<Index,Scalar>(
00581         *comm,Teuchos::REDUCE_SUM,Y_local_final_buff.size(),Y_local_tmp.values()
00582         ,&Y_local_final_buff[0]
00583         );
00584       // Load Y_local_final_buff back into Y_local
00585       const Scalar *Y_local_final_buff_ptr = &Y_local_final_buff[0];
00586       for( int j = 0; j < Y_local.numSubCols(); ++j ) {
00587         Scalar *Y_local_ptr = Y_local.values() + Y_local.leadingDim()*j;
00588         for( int i = 0; i < Y_local.subDim(); ++i ) {
00589           (*Y_local_ptr++) = (*Y_local_final_buff_ptr++);
00590         }
00591       }
00592     }
00593   }
00594   else {
00595 
00596     // When you get here the view Y_local will be committed back to Y
00597     // in the destructor to Y_local
00598 
00599   }
00600 
00601 #ifdef THYRA_SPMD_MULTI_VECTOR_BASE_PRINT_TIMES
00602   timer.stop();
00603   std::cout 
00604     << "\nSpmdMultiVectorBase<Scalar>::apply(...): Total time = "
00605     << timerTotal.totalElapsedTime() << " seconds\n";
00606 #endif
00607 
00608 }
00609 
00610 
00611 // Overridden from SingleScalarEuclideanLinearOpBase
00612 
00613 
00614 template<class Scalar>
00615 bool SpmdMultiVectorBase<Scalar>::opSupported(ETransp M_trans) const
00616 {
00617   typedef Teuchos::ScalarTraits<Scalar> ST;
00618   return ( ST::isComplex ? ( M_trans!=CONJ ) : true );
00619 }
00620 
00621 template<class Scalar>
00622 void SpmdMultiVectorBase<Scalar>::updateSpmdSpace()
00623 {
00624   if(globalDim_ == 0) {
00625     const SpmdVectorSpaceBase<Scalar> *spmdSpace = this->spmdSpace().get();
00626     if(spmdSpace) {
00627       globalDim_    = spmdSpace->dim();
00628       localOffset_  = spmdSpace->localOffset();
00629       localSubDim_  = spmdSpace->localSubDim();
00630       numCols_      = this->domain()->dim();
00631     }
00632     else {
00633       globalDim_    = 0;
00634       localOffset_  = -1;
00635       localSubDim_  = 0;
00636       numCols_      = 0;
00637     }
00638   }
00639 }
00640 
00641 
00642 template<class Scalar>
00643 Range1D SpmdMultiVectorBase<Scalar>::validateRowRange( const Range1D &rowRng_in ) const
00644 {
00645   const Range1D rowRng = Teuchos::full_range(rowRng_in,0,globalDim_-1);
00646 #ifdef TEUCHOS_DEBUG
00647   TEST_FOR_EXCEPTION(
00648     !( 0 <= rowRng.lbound() && rowRng.ubound() < globalDim_ ), std::invalid_argument
00649     ,"SpmdMultiVectorBase<Scalar>::validateRowRange(rowRng): Error, the range rowRng = ["
00650     <<rowRng.lbound()<<","<<rowRng.ubound()<<"] is not "
00651     "in the range [0,"<<(globalDim_-1)<<"]!"
00652     );
00653 #endif
00654   return rowRng;
00655 }
00656 
00657 
00658 template<class Scalar>
00659 Range1D SpmdMultiVectorBase<Scalar>::validateColRange( const Range1D &colRng_in ) const
00660 {
00661   const Range1D colRng = Teuchos::full_range(colRng_in,0,numCols_-1);
00662 #ifdef TEUCHOS_DEBUG
00663   TEST_FOR_EXCEPTION(
00664     !(0 <= colRng.lbound() && colRng.ubound() < numCols_), std::invalid_argument
00665     ,"SpmdMultiVectorBase<Scalar>::validateColRange(colRng): Error, the range colRng = ["
00666     <<colRng.lbound()<<","<<colRng.ubound()<<"] is not "
00667     "in the range [0,"<<(numCols_-1)<<"]!"
00668     );
00669 #endif
00670   return colRng;
00671 }
00672 
00673 
00674 } // end namespace Thyra
00675 
00676 
00677 #endif // THYRA_SPMD_MULTI_VECTOR_BASE_HPP

Generated on Tue Oct 20 12:46:59 2009 for Thyra Operator/Vector Support by doxygen 1.4.7