Kokkos Node API and Local Linear Algebra Kernels Version of the Day
Kokkos_PackedCRSOperatorAdaptor.hpp
00001 //@HEADER
00002 // ************************************************************************
00003 // 
00004 //          Kokkos: Node API and Parallel Node Kernels
00005 //              Copyright (2009) Sandia Corporation
00006 // 
00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
00008 // license for use of this work by or on behalf of the U.S. Government.
00009 // 
00010 // This library is free software; you can redistribute it and/or modify
00011 // it under the terms of the GNU Lesser General Public License as
00012 // published by the Free Software Foundation; either version 2.1 of the
00013 // License, or (at your option) any later version.
00014 //  
00015 // This library is distributed in the hope that it will be useful, but
00016 // WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 // Lesser General Public License for more details.
00019 //  
00020 // You should have received a copy of the GNU Lesser General Public
00021 // License along with this library; if not, write to the Free Software
00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00023 // USA
00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
00025 // 
00026 // ************************************************************************
00027 //@HEADER
00028 
00029 #ifndef KOKKOS_PACKEDCRSOPERATORADAPTOR_HPP
00030 #define KOKKOS_PACKEDCRSOPERATORADAPTOR_HPP
00031 
00032 namespace Kokkos {
00033 
00034   template <class Scalar, class Ordinal, class Node, class S, class M>
00035   class PackedCRSOperatorAdaptor {
00036   public:
00038 
00039 
00041     typedef Scalar  ScalarType;
00043     typedef Ordinal OrdinalType;
00045     typedef Node    NodeType;
00046 
00051     template <class OtherScalarn>
00052     struct rebind {
00053       typedef PackedCRSOperatorAdaptor<OtherScalarn,Ordinal,Node,S,M> other;
00054     };
00055 
00057 
00058 
00059 
00061     PackedCRSOperatorAdaptor(const RCP<Node> &node, S s, M m);
00062 
00064     ~PackedCRSOperatorAdaptor();
00065 
00067 
00068 
00069     
00071     RCP<Node> getNode() const;
00072 
00074 
00075 
00076 
00078     void initializeStructure(const CrsGraph<Ordinal,Node,PackedCRSOperatorAdaptor<void,Ordinal,Node,S,M> > &graph);
00079 
00081     void initializeValues(const CrsGraph<Scalar,Ordinal,Node,PackedCRSOperatorAdaptor<void,Ordinal,Node,S,M> > &matrix);
00082 
00084     void clear();
00085 
00087 
00088 
00089 
00091     template <class DomainScalar, class RangeScalar>
00092     void multiply(Teuchos::ETransp trans, RangeScalar alpha, const MultiVector<DomainScalar,Node> &X, MultiVector<RangeScalar,Node> &Y) const;
00093 
00095     template <class DomainScalar, class RangeScalar>
00096     void multiply(Teuchos::ETransp trans, 
00097                   RangeScalar alpha, const MultiVector<DomainScalar,Node> &X, RangeScalar beta, MultiVector<RangeScalar,Node> &Y) const;
00098 
00100     template <class DomainScalar, class RangeScalar>
00101     void solve(Teuchos::ETransp trans, Teuchos::EUplo uplo, Teuchos::EDiag diag, 
00102                const MultiVector<DomainScalar,Node> &Y, MultiVector<RangeScalar,Node> &X) const;
00103 
00105 
00106   protected:
00108     PackedCRSOperatorAdaptor(const PackedCRSOperatorAdaptor& source);
00109 
00110     RCP<Node> node_;
00111 
00112     // we do this one of two ways: 
00113     // 1D/packed: arrays of offsets, array of ordinals, array of values.
00114     // may or may not use these, will keep them
00115     ArrayRCP<const Ordinal> inds1D_;
00116     ArrayRCP<const size_t>  begs1D_, ends1D_;
00117     // 2D: array of arrays
00118     // don't use these, but must hold on to them in order to support sequential calls to initializeValues() 
00119     ArrayRCP<const ArrayRCP<Ordinal> > inds2D_;
00120     ArrayRCP<const size_t>         numEntries_;
00121 
00122     size_t numRows_;
00123     bool indsInit_, valsInit_, isEmpty_, isOpt_;
00124 
00125     M mult_;
00126     S solv_;
00127 
00128     static void packFrom2D(const ArrayRCP<const size_t>  &begs, 
00129                            const ArrayRCP<const size_t>  &ends,
00130                            const ArrayRCP<const Ordinal> &inds,
00131                            const ArrayRCP<const Scalar > &vals,
00132                                  ArrayRCP<size_t>  &rowOffsets,
00133                                  ArrayRCP<Ordinal> &packedInds,
00134                                  ArrayRCP<Scalar>  &packedVals);
00135 
00136     static void packFrom2D(const ArrayRCP<const size_t>  &begs, 
00137                            const ArrayRCP<const size_t>  &ends,
00138                            const ArrayRCP<ArrayRCP<Ordinal> > &inds,
00139                            const ArrayRCP<ArrayRCP<Scalar > > &vals,
00140                                  ArrayRCP<size_t>  &rowOffsets,
00141                                  ArrayRCP<Ordinal> &packedInds,
00142                                  ArrayRCP<Scalar>  &packedVals);
00143   };
00144 
00145 
00146   template<class Scalar, class Ordinal, class Node>
00147   PackedCRSOperatorAdaptor<Scalar,Ordinal,Node>::PackedCRSOperatorAdaptor(const RCP<Node> &node, M m, S s)
00148   : node_(node) 
00149   , mult_(m)
00150   , solv_(s)
00151   {
00152     clear();
00153   }
00154 
00155   template<class Scalar, class Ordinal, class Node>
00156   PackedCRSOperatorAdaptor<Scalar,Ordinal,Node>::~PackedCRSOperatorAdaptor() {
00157   }
00158 
00159   template <class Scalar, class Ordinal, class Node>
00160   RCP<Node> PackedCRSOperatorAdaptor<Scalar,Ordinal,Node>::getNode() const {
00161     return node_; 
00162   }
00163 
00164   template <class Scalar, class Ordinal, class Node>
00165   void PackedCRSOperatorAdaptor<Scalar,Ordinal,Node>::clear() {
00166     begs1D_     = null;
00167     ends1D_     = null;
00168     inds1D_     = null;
00169     inds2D_     = null;
00170     numEntries_ = null;
00171     numRows_  = 0;
00172     indsInit_ = false;
00173     valsInit_ = false;
00174     isEmpty_  = false;
00175     isOpt_    = false;
00176   }
00177 
00178   template <class Scalar, class Ordinal, class Node>
00179   void PackedCRSOperatorAdaptor<Scalar,Ordinal,Node>::initializeStructure(const CrsGraphHostCompute<Ordinal,Node,PackedCRSOperatorAdaptor<void,Ordinal,Node> > &graph) 
00180   {
00181     using Teuchos::arcp;
00182     TEST_FOR_EXCEPTION(indsInit_ == true || valsInit_ == true, std::runtime_error, 
00183         Teuchos::typeName(*this) << "::initializeStructure(): structure already initialized.");
00184     numRows_ = graph.getNumRows();
00185     if (graph.isEmpty() || numRows_ == 0) {
00186       isEmpty_ = true;
00187       isOpt_   = false;
00188     }
00189     else if (graph.is1DStructure()) {
00190       isEmpty_ = false;
00191       isOpt_   = graph.isOptimized();
00192       ArrayRCP<Ordinal> inds;
00193       ArrayRCP<size_t> begs, ends;
00194       const_cast<CrsGraphHostCompute<Ordinal,Node,PackedCRSOperatorAdaptor<void,Ordinal,Node> > &>(graph).get1DStructure( inds, begs, ends );
00195       inds1D_ = inds;
00196       begs1D_ = begs;
00197       ends1D_ = ends;
00198     }
00199     else {
00200       isEmpty_ = false;
00201       isOpt_   = false;
00202       ArrayRCP<ArrayRCP<Ordinal> > inds;
00203       ArrayRCP<size_t>  sizes;
00204       const_cast<CrsGraphHostCompute<Ordinal,Node,PackedCRSOperatorAdaptor<void,Ordinal,Node> > &>(graph).get2DStructure(inds,sizes);
00205       inds2D_     = inds;
00206       numEntries_ = sizes;
00207     }
00208     indsInit_ = true;
00209   }
00210 
00211 
00212   template <class Scalar, class Ordinal, class Node>
00213   void PackedCRSOperatorAdaptor<Scalar,Ordinal,Node>::initializeValues(const CrsMatrixHostCompute<Scalar,Ordinal,Node,PackedCRSOperatorAdaptor<void,Ordinal,Node> > &matrix) {
00214     using Teuchos::arcp;
00215     TEST_FOR_EXCEPTION(indsInit_ == false, std::runtime_error, 
00216         Teuchos::typeName(*this) << "::initializeValues(): must initialize values after graph.");
00217     TEST_FOR_EXCEPTION(numRows_ != matrix.getNumRows() || isEmpty_ != matrix.isEmpty() || 
00218                        (inds2D_ != null && matrix.is1DStructure()) || (inds1D_ != null && matrix.is2DStructure()),
00219                        std::runtime_error, Teuchos::typeName(*this) << "::initializeValues(): matrix not compatible with previously supplied graph.");
00220     if (!isEmpty_) {
00221       if (matrix.is1DStructure()) {
00222         ArrayRCP<Scalar> vals;
00223         const_cast<CrsMatrixHostCompute<Scalar,Ordinal,Node,PackedCRSOperatorAdaptor<void,Ordinal,Node> > &>(matrix).get1DValues( vals );
00224         vals1D_ = vals;
00225       }
00226       else {
00227         {
00228           ArrayRCP<ArrayRCP<Scalar> > vals;
00229           const_cast<CrsMatrixHostCompute<Scalar,Ordinal,Node,PackedCRSOperatorAdaptor<void,Ordinal,Node> > &>(matrix).get2DValues(vals);
00230           vals2D_ = vals;
00231         }
00232       }
00233     }
00234     valsInit_ = true;
00235   }
00236 
00237 
00238   template <class Scalar, class Ordinal, class Node>
00239   template <class DomainScalar, class RangeScalar>
00240   void PackedCRSOperatorAdaptor<Scalar,Ordinal,Node>::solve(
00241                       Teuchos::ETransp trans, Teuchos::EUplo uplo, Teuchos::EDiag diag, 
00242                       const MultiVector<DomainScalar,Node> &Y,
00243                             MultiVector<RangeScalar,Node> &X) const 
00244   {
00245     TEST_FOR_EXCEPTION(indsInit_ == false || valsInit_ == false, std::runtime_error,
00246         Teuchos::typeName(*this) << "::solve(): this solve was not fully initialized.");
00247     TEST_FOR_EXCEPTION(X.getNumCols() != Y.getNumCols(), std::runtime_error,
00248         Teuchos::typeName(*this) << "::solve(): Left hand side and right hand side multivectors have differing numbers of vectors.");
00249     TEST_FOR_EXCEPTION(X.getNumRows() < numRows_, std::runtime_error,
00250         Teuchos::typeName(*this) << "::solve(): Left-hand-side multivector does not have enough rows. Likely cause is that the column map was not provided to the Tpetra::CrsMatrix in the case of an implicit unit diagonal.");
00251     ReadyBufferHelper<Node> rbh(node_);
00252     if (numRows_ == 0) {
00253       // null op
00254     }
00255     else if (isEmpty_) {
00256       TEST_FOR_EXCEPTION(diag != Teuchos::UNIT_DIAG, std::runtime_error,
00257           Teuchos::typeName(*this) << "::solve(): solve of empty matrix only valid for an implicit unit diagonal.");
00258       // solve I * X = Y for X = Y
00259       DefaultArithmetic<MultiVector<RangeScalar,Node> >::Assign(X,Y);
00260     }
00261     else {
00262         TEST_FOR_EXCEPT(true);
00263         rbh.begin();
00264         // RangeScalar  *x = rbh.template addNonConstBuffer<DomainScalar>(X.getValuesNonConst());
00265         // DomainScalar *y = rbh.template addConstBuffer<RangeScalar>(Y.getValues());
00266         rbh.end();
00267         // wdp.numRows = numRows_;
00268         // wdp.xstride = X.getStride();
00269         // wdp.ystride = Y.getStride();
00270         // trans == Teuchos::NO_TRANS 
00271         // diag == Teuchos::UNIT_DIAG 
00272         // uplo == Teuchos::UPPER_TRI 
00273       }
00274     }
00275     return;
00276   }
00277 
00278 
00279   template <class Scalar, class Ordinal, class Node>
00280   template <class DomainScalar, class RangeScalar>
00281   void PackedCRSOperatorAdaptor<Scalar,Ordinal,Node>::multiply(
00282                                 Teuchos::ETransp trans, 
00283                                 RangeScalar alpha,
00284                                 const MultiVector<DomainScalar,Node> &X, 
00285                                       MultiVector<RangeScalar ,Node> &Y) const 
00286   {
00287     typedef DefaultSparseMultiplyOp1<Scalar,Ordinal,DomainScalar,RangeScalar, 1>  Op1D;
00288     typedef DefaultSparseMultiplyOp2<Scalar,Ordinal,DomainScalar,RangeScalar, 1>  Op2D;
00289     typedef DefaultSparseTransposeMultiplyOp1<Scalar,Ordinal,DomainScalar,RangeScalar, 1> TOp1D;
00290     typedef DefaultSparseTransposeMultiplyOp2<Scalar,Ordinal,DomainScalar,RangeScalar, 1> TOp2D;
00291     TEST_FOR_EXCEPTION(indsInit_ == false || valsInit_ == false, std::runtime_error,
00292         Teuchos::typeName(*this) << "::multiply(): operation not fully initialized.");
00293     TEST_FOR_EXCEPT(X.getNumCols() != Y.getNumCols());
00294     ReadyBufferHelper<Node> rbh(node_);
00295     if (isEmpty_ == true) {
00296       // Y <= 0 * X
00297       //   <= 0
00298       DefaultArithmetic<MultiVector<RangeScalar,Node> >::Init(Y,Teuchos::ScalarTraits<RangeScalar>::zero());
00299     }
00300     else if (begs1D_ != null) {
00301       if (trans == Teuchos::NO_TRANS) {
00302         Op1D wdp;
00303         rbh.begin();
00304         wdp.alpha   = alpha;
00305         wdp.beta    = Teuchos::ScalarTraits<RangeScalar>::zero(); // not used
00306         wdp.numRows = numRows_;
00307         wdp.begs    = rbh.template addConstBuffer<size_t>(begs1D_);
00308         wdp.ends    = rbh.template addConstBuffer<size_t>(ends1D_);
00309         wdp.inds    = rbh.template addConstBuffer<Ordinal>(inds1D_);
00310         wdp.vals    = rbh.template addConstBuffer<Scalar>(vals1D_);
00311         wdp.x       = rbh.template addConstBuffer<DomainScalar>(X.getValues());
00312         wdp.y       = rbh.template addNonConstBuffer<RangeScalar>(Y.getValuesNonConst());
00313         wdp.xstride = X.getStride();
00314         wdp.ystride = Y.getStride();
00315         rbh.end();
00316         const size_t numRHS = X.getNumCols();
00317         node_->template parallel_for<Op1D>(0,numRows_*numRHS,wdp);
00318       }
00319       else {
00320         TOp1D wdp;
00321         rbh.begin();
00322         wdp.alpha   = alpha;
00323         wdp.beta    = Teuchos::ScalarTraits<RangeScalar>::zero(); // not used
00324         wdp.numRows = numRows_;
00325         wdp.numCols = Y.getNumRows();
00326         wdp.begs    = rbh.template addConstBuffer<size_t>(begs1D_);
00327         wdp.ends    = rbh.template addConstBuffer<size_t>(ends1D_);
00328         wdp.inds    = rbh.template addConstBuffer<Ordinal>(inds1D_);
00329         wdp.vals    = rbh.template addConstBuffer<Scalar>(vals1D_);
00330         wdp.x       = rbh.template addConstBuffer<DomainScalar>(X.getValues());
00331         wdp.y       = rbh.template addNonConstBuffer<RangeScalar>(Y.getValuesNonConst());
00332         wdp.xstride = X.getStride();
00333         wdp.ystride = Y.getStride();
00334         rbh.end();
00335         const size_t numRHS = X.getNumCols();
00336         node_->template parallel_for<TOp1D>(0,numRHS,wdp);
00337       }
00338     }
00339     else {
00340       if (trans == Teuchos::NO_TRANS) {
00341         Op2D wdp;
00342         rbh.begin();
00343         wdp.alpha   = alpha;
00344         wdp.beta    = Teuchos::ScalarTraits<RangeScalar>::zero(); // not used
00345         wdp.numRows = numRows_;
00346         wdp.numEntries = rbh.template addConstBuffer<size_t>(numEntries_);
00347         wdp.inds_beg   = rbh.template addConstBuffer<const Ordinal *>(indPtrs_);
00348         wdp.vals_beg   = rbh.template addConstBuffer<const  Scalar *>(valPtrs_);
00349         wdp.x          = rbh.template addConstBuffer<DomainScalar>(X.getValues());
00350         wdp.y          = rbh.template addNonConstBuffer<RangeScalar>(Y.getValuesNonConst());
00351         rbh.end();
00352         wdp.xstride = X.getStride();
00353         wdp.ystride = Y.getStride();
00354         const size_t numRHS = X.getNumCols();
00355         node_->template parallel_for<Op2D>(0,numRows_*numRHS,wdp);
00356       }
00357       else {
00358         TOp2D wdp;
00359         rbh.begin();
00360         wdp.alpha   = alpha;
00361         wdp.beta    = Teuchos::ScalarTraits<RangeScalar>::zero(); // not used
00362         wdp.numRows = numRows_;
00363         wdp.numCols = Y.getNumRows();
00364         wdp.numEntries = rbh.template addConstBuffer<size_t>(numEntries_);
00365         wdp.inds_beg   = rbh.template addConstBuffer<const Ordinal *>(indPtrs_);
00366         wdp.vals_beg   = rbh.template addConstBuffer<const  Scalar *>(valPtrs_);
00367         wdp.x          = rbh.template addConstBuffer<DomainScalar>(X.getValues());
00368         wdp.y          = rbh.template addNonConstBuffer<RangeScalar>(Y.getValuesNonConst());
00369         wdp.xstride = X.getStride();
00370         wdp.ystride = Y.getStride();
00371         rbh.end();
00372         const size_t numRHS = X.getNumCols();
00373         node_->template parallel_for<TOp2D>(0,numRHS,wdp);
00374       }
00375     }
00376     return;
00377   }
00378 
00379 
00380   template <class Scalar, class Ordinal, class Node>
00381   template <class DomainScalar, class RangeScalar>
00382   void PackedCRSOperatorAdaptor<Scalar,Ordinal,Node>::multiply(
00383                                 Teuchos::ETransp trans, 
00384                                 RangeScalar alpha, const MultiVector<DomainScalar,Node> &X, 
00385                                 RangeScalar beta, MultiVector<RangeScalar,Node> &Y) const 
00386   {
00387     // the 0 parameter means that beta is considered, and the output multivector enjoys accumulate semantics
00388     typedef DefaultSparseMultiplyOp1<Scalar,Ordinal,DomainScalar,RangeScalar, 0>  Op1D;
00389     typedef DefaultSparseMultiplyOp2<Scalar,Ordinal,DomainScalar,RangeScalar, 0>  Op2D;
00390     typedef DefaultSparseTransposeMultiplyOp1<Scalar,Ordinal,DomainScalar,RangeScalar, 0> TOp1D;
00391     typedef DefaultSparseTransposeMultiplyOp2<Scalar,Ordinal,DomainScalar,RangeScalar, 0> TOp2D;
00392     TEST_FOR_EXCEPTION(indsInit_ == false || valsInit_ == false, std::runtime_error,
00393         Teuchos::typeName(*this) << "::multiply(): operation not fully initialized.");
00394     TEST_FOR_EXCEPT(X.getNumCols() != Y.getNumCols());
00395     ReadyBufferHelper<Node> rbh(node_);
00396     if (isEmpty_ == true) {
00397       // Y <= alpha * 0 * X + beta * Y 
00398       //   <= beta * Y
00399       // NOTE: this neglects NaNs in X, which don't satisfy 0*NaN == 0
00400       //       however, X and Y may be of different size, and we need entries to determine how to mix those potential NaNs in X into Y
00401       //       therefore, the best we can do is scale Y to zero. Setting Y to zero would destroy NaNs in Y, which violates the semantics of the call.
00402       DefaultArithmetic<MultiVector<RangeScalar,Node> >::Scale(Y,beta);
00403     }
00404     else if (begs1D_ != null) {
00405       if (trans == Teuchos::NO_TRANS) {
00406         Op1D wdp;
00407         rbh.begin();
00408         wdp.alpha   = alpha;
00409         wdp.beta    = beta;
00410         wdp.numRows = numRows_;
00411         wdp.begs    = rbh.template addConstBuffer<size_t>(begs1D_);
00412         wdp.ends    = rbh.template addConstBuffer<size_t>(ends1D_);
00413         wdp.inds    = rbh.template addConstBuffer<Ordinal>(inds1D_);
00414         wdp.vals    = rbh.template addConstBuffer< Scalar>(vals1D_);
00415         wdp.x       = rbh.template addConstBuffer<DomainScalar>(X.getValues());
00416         wdp.y       = rbh.template addNonConstBuffer<RangeScalar>(Y.getValuesNonConst());
00417         wdp.xstride = X.getStride();
00418         wdp.ystride = Y.getStride();
00419         rbh.end();
00420         const size_t numRHS = X.getNumCols();
00421         node_->template parallel_for<Op1D>(0,numRows_*numRHS,wdp);
00422       }
00423       else {
00424         TOp1D wdp;
00425         rbh.begin();
00426         wdp.alpha   = alpha;
00427         wdp.beta    = beta;
00428         wdp.numRows = numRows_;
00429         wdp.numCols = Y.getNumRows();
00430         wdp.begs    = rbh.template addConstBuffer<size_t>(begs1D_);
00431         wdp.ends    = rbh.template addConstBuffer<size_t>(ends1D_);
00432         wdp.inds    = rbh.template addConstBuffer<Ordinal>(inds1D_);
00433         wdp.vals    = rbh.template addConstBuffer< Scalar>(vals1D_);
00434         wdp.x       = rbh.template addConstBuffer<DomainScalar>(X.getValues());
00435         wdp.y       = rbh.template addNonConstBuffer<RangeScalar>(Y.getValuesNonConst());
00436         wdp.xstride = X.getStride();
00437         wdp.ystride = Y.getStride();
00438         rbh.end();
00439         const size_t numRHS = X.getNumCols();
00440         node_->template parallel_for<TOp1D>(0,numRHS,wdp);
00441       }
00442     }
00443     else {
00444       if (trans == Teuchos::NO_TRANS) {
00445         Op2D wdp;
00446         rbh.begin();
00447         wdp.numRows = numRows_;
00448         wdp.numEntries = rbh.template addConstBuffer<size_t>(numEntries_);
00449         wdp.inds_beg   = rbh.template addConstBuffer<const Ordinal *>(indPtrs_);
00450         wdp.vals_beg   = rbh.template addConstBuffer<const  Scalar *>(valPtrs_);
00451         wdp.x          = rbh.template addConstBuffer<DomainScalar>(X.getValues());
00452         wdp.y          = rbh.template addNonConstBuffer<RangeScalar>(Y.getValuesNonConst());
00453         rbh.end();
00454         wdp.alpha   = alpha;
00455         wdp.beta    = beta;
00456         wdp.xstride = X.getStride();
00457         wdp.ystride = Y.getStride();
00458         const size_t numRHS = X.getNumCols();
00459         node_->template parallel_for<Op2D>(0,numRows_*numRHS,wdp);
00460       }
00461       else {
00462         TOp2D wdp;
00463         rbh.begin();
00464         wdp.alpha   = alpha;
00465         wdp.beta    = beta;
00466         wdp.numRows = numRows_;
00467         wdp.numCols = Y.getNumRows();
00468         wdp.numEntries = rbh.template addConstBuffer<size_t>(numEntries_);
00469         wdp.inds_beg   = rbh.template addConstBuffer<const Ordinal *>(indPtrs_);
00470         wdp.vals_beg   = rbh.template addConstBuffer<const  Scalar *>(valPtrs_);
00471         wdp.x          = rbh.template addConstBuffer<DomainScalar>(X.getValues());
00472         wdp.y          = rbh.template addNonConstBuffer<RangeScalar>(Y.getValuesNonConst());
00473         wdp.xstride = X.getStride();
00474         wdp.ystride = Y.getStride();
00475         rbh.end();
00476         const size_t numRHS = X.getNumCols();
00477         node_->template parallel_for<TOp2D>(0,numRHS,wdp);
00478       }
00479     }
00480     return;
00481   }
00482 
00483 
00487   template <class Scalar, class Ordinal, class Node>
00488   class DefaultDeviceSparseOps {
00489   public:
00490     typedef Scalar  ScalarType;
00491     typedef Ordinal OrdinalType;
00492     typedef Node    NodeType;
00493 
00494     template <class S2>
00495     struct rebind {
00496       typedef DefaultDeviceSparseOps<S2,Ordinal,Node> other;
00497     };
00498 
00500 
00502 
00504     DefaultDeviceSparseOps(const RCP<Node> &node);
00505 
00507     ~DefaultDeviceSparseOps();
00508 
00510 
00512 
00513     
00515     RCP<Node> getNode() const;
00516 
00518 
00520 
00522 
00524     void initializeStructure(const CrsGraphDeviceCompute<Ordinal,Node,DefaultDeviceSparseOps<void,Ordinal,Node> > &graph);
00525 
00527     void initializeValues(const CrsMatrixDeviceCompute<Scalar,Ordinal,Node,DefaultDeviceSparseOps<void,Ordinal,Node> > &matrix);
00528 
00530     void clear();
00531 
00533 
00535 
00537 
00539     template <class DomainScalar, class RangeScalar>
00540     void multiply(Teuchos::ETransp trans, RangeScalar alpha, const MultiVector<DomainScalar,Node> &X, MultiVector<RangeScalar,Node> &Y) const;
00541 
00543     template <class DomainScalar, class RangeScalar>
00544     void multiply(Teuchos::ETransp trans, 
00545                   RangeScalar alpha, const MultiVector<DomainScalar,Node> &X, RangeScalar beta, MultiVector<RangeScalar,Node> &Y) const;
00546 
00548     template <class DomainScalar, class RangeScalar>
00549     void solve(Teuchos::ETransp trans, Teuchos::EUplo uplo, Teuchos::EDiag diag, 
00550                const MultiVector<DomainScalar,Node> &Y, MultiVector<RangeScalar,Node> &X) const;
00551 
00553 
00554   protected:
00556     DefaultDeviceSparseOps(const DefaultDeviceSparseOps& source);
00557 
00558     RCP<Node> node_;
00559 
00560     // we do this one of two ways: 
00561     // 1D: array of offsets, pointer for ordinals, pointer for values.
00562     ArrayRCP<const size_t>  pbuf_offsets1D_;
00563     ArrayRCP<const Ordinal> pbuf_inds1D_;
00564     ArrayRCP<const Scalar>  pbuf_vals1D_;
00565 
00566     size_t numRows_;
00567     bool indsInit_, valsInit_, isEmpty_;
00568   };
00569 
00570   template<class Scalar, class Ordinal, class Node>
00571   DefaultDeviceSparseOps<Scalar,Ordinal,Node>::DefaultDeviceSparseOps(const RCP<Node> &node)
00572   : node_(node) 
00573   {
00574     clear();
00575   }
00576 
00577   template<class Scalar, class Ordinal, class Node>
00578   DefaultDeviceSparseOps<Scalar,Ordinal,Node>::~DefaultDeviceSparseOps() {
00579   }
00580 
00581   template <class Scalar, class Ordinal, class Node>
00582   RCP<Node> DefaultDeviceSparseOps<Scalar,Ordinal,Node>::getNode() const { 
00583     return node_; 
00584   }
00585 
00586   template <class Scalar, class Ordinal, class Node>
00587   void DefaultDeviceSparseOps<Scalar,Ordinal,Node>::clear() {
00588     pbuf_offsets1D_  = null;
00589     pbuf_inds1D_     = null;
00590     pbuf_vals1D_     = null;
00591     numRows_  = 0;
00592     indsInit_ = false;
00593     valsInit_ = false;
00594     isEmpty_  = false;
00595   }
00596 
00597   template <class Scalar, class Ordinal, class Node>
00598   void DefaultDeviceSparseOps<Scalar,Ordinal,Node>::initializeStructure(const CrsGraphDeviceCompute<Ordinal,Node,DefaultDeviceSparseOps<void,Ordinal,Node> > &graph) {
00599     TEST_FOR_EXCEPTION(indsInit_ == true || valsInit_ == true, std::runtime_error, 
00600         Teuchos::typeName(*this) << "::initializeStructure(): structure already initialized.");
00601     numRows_ = graph.getNumRows();
00602     if (graph.isEmpty() || numRows_ == 0) {
00603       isEmpty_ = true;
00604     }
00605     else {
00606       ArrayRCP<Ordinal> inds;
00607       ArrayRCP<size_t > offs;
00608       const_cast<CrsGraphDeviceCompute<Ordinal,Node,DefaultDeviceSparseOps<void,Ordinal,Node> > &>(graph).getDeviceBuffers(inds, offs);
00609       pbuf_inds1D_    = inds;
00610       pbuf_offsets1D_ = offs;
00611     }
00612     indsInit_ = true;
00613   }
00614 
00615   template <class Scalar, class Ordinal, class Node>
00616   void DefaultDeviceSparseOps<Scalar,Ordinal,Node>::initializeValues(const CrsMatrixDeviceCompute<Scalar,Ordinal,Node,DefaultDeviceSparseOps<void,Ordinal,Node> > &matrix) {
00617     TEST_FOR_EXCEPTION(indsInit_ == false, std::runtime_error, 
00618         Teuchos::typeName(*this) << "::initializeValues(): must initialize values after graph.");
00619     TEST_FOR_EXCEPTION(numRows_ != matrix.getNumRows() || isEmpty_ != matrix.isEmpty(), std::runtime_error,
00620         Teuchos::typeName(*this) << "::initializeValues(): matrix not compatible with previously supplied graph.");
00621     if (!isEmpty_) {
00622       ArrayRCP<Scalar> vals;
00623       const_cast<CrsMatrixDeviceCompute<Scalar,Ordinal,Node,DefaultDeviceSparseOps<void,Ordinal,Node> > &>(matrix).getDeviceBuffer(vals);
00624       pbuf_vals1D_ = vals;
00625     }
00626     valsInit_ = true;
00627   }
00628 
00629 
00630   template <class Scalar, class Ordinal, class Node>
00631   template <class DomainScalar, class RangeScalar>
00632   void DefaultDeviceSparseOps<Scalar,Ordinal,Node>::solve(
00633                       Teuchos::ETransp trans, Teuchos::EUplo uplo, Teuchos::EDiag diag, 
00634                       const MultiVector<DomainScalar,Node> &Y,
00635                             MultiVector<RangeScalar,Node> &X) const {
00636     typedef DefaultSparseSolveOp1<Scalar,Ordinal,DomainScalar,RangeScalar>  Op1D;
00637     typedef DefaultSparseTransposeSolveOp1<Scalar,Ordinal,DomainScalar,RangeScalar>  TOp1D;
00638     TEST_FOR_EXCEPTION(indsInit_ == false || valsInit_ == false, std::runtime_error,
00639         Teuchos::typeName(*this) << "::solve(): this solve was not fully initialized.");
00640     TEST_FOR_EXCEPTION(X.getNumCols() != Y.getNumCols(), std::runtime_error,
00641         Teuchos::typeName(*this) << "::solve(): Left hand side and right hand side multivectors have differing numbers of vectors.");
00642     TEST_FOR_EXCEPTION(X.getNumRows() < numRows_, std::runtime_error,
00643         Teuchos::typeName(*this) << "::solve(): Left-hand-side multivector does not have enough rows. Likely cause is that the column map was not provided to the Tpetra::CrsMatrix in the case of an implicit unit diagonal.");
00644 
00645     ReadyBufferHelper<Node> rbh(node_);
00646     if (numRows_ == 0) {
00647       // null op
00648     }
00649     else if (isEmpty_) {
00650       TEST_FOR_EXCEPTION(diag != Teuchos::UNIT_DIAG, std::runtime_error,
00651           Teuchos::typeName(*this) << "::solve(): solve of empty matrix only valid for an implicit unit diagonal.");
00652       // solve I * X = Y for X = Y
00653       DefaultArithmetic<MultiVector<RangeScalar,Node> >::Assign(X,Y);
00654     }
00655     else {
00656       if (trans == Teuchos::NO_TRANS) {
00657         Op1D wdp;
00658         rbh.begin();
00659         wdp.begs    = rbh.template addConstBuffer<size_t>(pbuf_offsets1D_);
00660         wdp.ends    = wdp.begs+1;
00661         wdp.inds    = rbh.template addConstBuffer<Ordinal>(pbuf_inds1D_);
00662         wdp.vals    = rbh.template addConstBuffer< Scalar>(pbuf_vals1D_);
00663         wdp.x       = rbh.template addNonConstBuffer<DomainScalar>(X.getValuesNonConst());
00664         wdp.y       = rbh.template addConstBuffer<RangeScalar>(Y.getValues());
00665         rbh.end();
00666         wdp.numRows = numRows_;
00667         wdp.unitDiag = (diag == Teuchos::UNIT_DIAG ? true : false);
00668         wdp.upper    = (uplo == Teuchos::UPPER_TRI ? true : false);
00669         wdp.xstride = X.getStride();
00670         wdp.ystride = Y.getStride();
00671         const size_t numRHS = X.getNumCols();
00672         node_->template parallel_for<Op1D>(0,numRHS,wdp);
00673       }
00674       else {
00675         TOp1D wdp;
00676         rbh.begin();
00677         wdp.begs    = rbh.template addConstBuffer<size_t>(pbuf_offsets1D_);
00678         wdp.ends    = wdp.begs+1;
00679         wdp.inds    = rbh.template addConstBuffer<Ordinal>(pbuf_inds1D_);
00680         wdp.vals    = rbh.template addConstBuffer< Scalar>(pbuf_vals1D_);
00681         wdp.x       = rbh.template addNonConstBuffer<DomainScalar>(X.getValuesNonConst());
00682         wdp.y       = rbh.template addConstBuffer<RangeScalar>(Y.getValues());
00683         rbh.end();
00684         wdp.numRows = numRows_;
00685         wdp.unitDiag = (diag == Teuchos::UNIT_DIAG ? true : false);
00686         wdp.upper    = (uplo == Teuchos::UPPER_TRI ? true : false);
00687         wdp.xstride = X.getStride();
00688         wdp.ystride = Y.getStride();
00689         const size_t numRHS = X.getNumCols();
00690         node_->template parallel_for<TOp1D>(0,numRHS,wdp);
00691       }
00692     }
00693     return;
00694   }
00695 
00696 
00697   template <class Scalar, class Ordinal, class Node>
00698   template <class DomainScalar, class RangeScalar>
00699   void DefaultDeviceSparseOps<Scalar,Ordinal,Node>::multiply(
00700                                 Teuchos::ETransp trans, 
00701                                 RangeScalar alpha,
00702                                 const MultiVector<DomainScalar,Node> &X, 
00703                                 MultiVector<RangeScalar,Node> &Y) const {
00704     typedef DefaultSparseMultiplyOp1<Scalar,Ordinal,DomainScalar,RangeScalar, 1>  Op1D;
00705     typedef DefaultSparseTransposeMultiplyOp1<Scalar,Ordinal,DomainScalar,RangeScalar, 1> TOp1D;
00706     TEST_FOR_EXCEPTION(indsInit_ == false || valsInit_ == false, std::runtime_error,
00707         Teuchos::typeName(*this) << "::multiply(): operation not fully initialized.");
00708     TEST_FOR_EXCEPT(X.getNumCols() != Y.getNumCols());
00709     ReadyBufferHelper<Node> rbh(node_);
00710     if (isEmpty_ == true) {
00711       // Y <= 0 * X
00712       //   <= 0
00713       DefaultArithmetic<MultiVector<RangeScalar,Node> >::Init(Y,Teuchos::ScalarTraits<RangeScalar>::zero());
00714     }
00715     else {
00716       if (trans == Teuchos::NO_TRANS) {
00717         Op1D wdp;
00718         rbh.begin();
00719         wdp.alpha   = alpha;
00720         wdp.beta    = Teuchos::ScalarTraits<RangeScalar>::zero(); // not used
00721         wdp.numRows = numRows_;
00722         wdp.begs    = rbh.template addConstBuffer<size_t>(pbuf_offsets1D_);
00723         wdp.ends    = wdp.begs+1;
00724         wdp.inds    = rbh.template addConstBuffer<Ordinal>(pbuf_inds1D_);
00725         wdp.vals    = rbh.template addConstBuffer<Scalar>(pbuf_vals1D_);
00726         wdp.x       = rbh.template addConstBuffer<DomainScalar>(X.getValues());
00727         wdp.y       = rbh.template addNonConstBuffer<RangeScalar>(Y.getValuesNonConst());
00728         wdp.xstride = X.getStride();
00729         wdp.ystride = Y.getStride();
00730         rbh.end();
00731         const size_t numRHS = X.getNumCols();
00732         node_->template parallel_for<Op1D>(0,numRows_*numRHS,wdp);
00733       }
00734       else {
00735         TOp1D wdp;
00736         rbh.begin();
00737         wdp.alpha   = alpha;
00738         wdp.beta    = Teuchos::ScalarTraits<RangeScalar>::zero(); // not used
00739         wdp.numRows = numRows_;
00740         wdp.numCols = Y.getNumRows();
00741         wdp.begs    = rbh.template addConstBuffer<size_t>(pbuf_offsets1D_);
00742         wdp.ends    = wdp.begs+1;
00743         wdp.inds    = rbh.template addConstBuffer<Ordinal>(pbuf_inds1D_);
00744         wdp.vals    = rbh.template addConstBuffer<Scalar>(pbuf_vals1D_);
00745         wdp.x       = rbh.template addConstBuffer<DomainScalar>(X.getValues());
00746         wdp.y       = rbh.template addNonConstBuffer<RangeScalar>(Y.getValuesNonConst());
00747         wdp.xstride = X.getStride();
00748         wdp.ystride = Y.getStride();
00749         rbh.end();
00750         const size_t numRHS = X.getNumCols();
00751         node_->template parallel_for<TOp1D>(0,numRHS,wdp);
00752       }
00753     }
00754     return;
00755   }
00756 
00757 
00758   template <class Scalar, class Ordinal, class Node>
00759   template <class DomainScalar, class RangeScalar>
00760   void DefaultDeviceSparseOps<Scalar,Ordinal,Node>::multiply(
00761                                 Teuchos::ETransp trans, 
00762                                 RangeScalar alpha, const MultiVector<DomainScalar,Node> &X, 
00763                                 RangeScalar beta, MultiVector<RangeScalar,Node> &Y) const {
00764     // the 0 parameter means that beta is considered, and the output multivector enjoys accumulate semantics
00765     typedef DefaultSparseMultiplyOp1<Scalar,Ordinal,DomainScalar,RangeScalar, 0>  Op1D;
00766     typedef DefaultSparseTransposeMultiplyOp1<Scalar,Ordinal,DomainScalar,RangeScalar, 0> TOp1D;
00767     TEST_FOR_EXCEPTION(indsInit_ == false || valsInit_ == false, std::runtime_error,
00768         Teuchos::typeName(*this) << "::multiply(): operation not fully initialized.");
00769     TEST_FOR_EXCEPT(X.getNumCols() != Y.getNumCols());
00770     ReadyBufferHelper<Node> rbh(node_);
00771     if (isEmpty_ == true) {
00772       // Y <= alpha * 0 * X + beta * Y 
00773       //   <= beta * Y
00774       // NOTE: this neglects NaNs in X, which don't satisfy 0*NaN == 0
00775       //       however, X and Y may be of different size, and we need entries to determine how to mix those potential NaNs in X into Y
00776       //       therefore, the best we can do is scale Y to zero. Setting Y to zero would destroy NaNs in Y, which violates the semantics of the call.
00777       DefaultArithmetic<MultiVector<RangeScalar,Node> >::Scale(Y,beta);
00778     }
00779     else {
00780       if (trans == Teuchos::NO_TRANS) {
00781         Op1D wdp;
00782         rbh.begin();
00783         wdp.alpha   = alpha;
00784         wdp.beta    = beta;
00785         wdp.numRows = numRows_;
00786         wdp.begs    = rbh.template addConstBuffer<size_t>(pbuf_offsets1D_);
00787         wdp.ends    = wdp.begs+1;
00788         wdp.inds    = rbh.template addConstBuffer<Ordinal>(pbuf_inds1D_);
00789         wdp.vals    = rbh.template addConstBuffer< Scalar>(pbuf_vals1D_);
00790         wdp.x       = rbh.template addConstBuffer<DomainScalar>(X.getValues());
00791         wdp.y       = rbh.template addNonConstBuffer<RangeScalar>(Y.getValuesNonConst());
00792         wdp.xstride = X.getStride();
00793         wdp.ystride = Y.getStride();
00794         rbh.end();
00795         const size_t numRHS = X.getNumCols();
00796         node_->template parallel_for<Op1D>(0,numRows_*numRHS,wdp);
00797       }
00798       else {
00799         TOp1D wdp;
00800         rbh.begin();
00801         wdp.alpha   = alpha;
00802         wdp.beta    = beta;
00803         wdp.numRows = numRows_;
00804         wdp.numCols = Y.getNumRows();
00805         wdp.begs    = rbh.template addConstBuffer<size_t>(pbuf_offsets1D_);
00806         wdp.ends    = wdp.begs+1;
00807         wdp.inds    = rbh.template addConstBuffer<Ordinal>(pbuf_inds1D_);
00808         wdp.vals    = rbh.template addConstBuffer< Scalar>(pbuf_vals1D_);
00809         wdp.x       = rbh.template addConstBuffer<DomainScalar>(X.getValues());
00810         wdp.y       = rbh.template addNonConstBuffer<RangeScalar>(Y.getValuesNonConst());
00811         wdp.xstride = X.getStride();
00812         wdp.ystride = Y.getStride();
00813         rbh.end();
00814         const size_t numRHS = X.getNumCols();
00815         node_->template parallel_for<TOp1D>(0,numRHS,wdp);
00816       }
00817     }
00818     return;
00819   }
00820 
00821   template <class Scalar, class Ordinal, class Node, class S, class M>
00822   static void 
00823   PackedCRSOperatorAdaptor<Scalar,Ordinal,Node,S,N>::packFrom2D(
00824                       const ArrayRCP<const size_t>  &begs, 
00825                       const ArrayRCP<const size_t>  &ends,
00826                       const ArrayRCP<const Ordinal> &inds,
00827                       const ArrayRCP<const Scalar > &vals,
00828                             <size_t>  &rowOffsets,
00829                             ArrayRCP<Ordinal> &packedInds,
00830                             ArrayRCP<Scalar>  &packedVals) 
00831   { 
00832     TEST_FOR_EXECPT(true);
00833   }
00834 
00835   template <class Scalar, class Ordinal, class Node, class S, class M>
00836   static void 
00837   PackedCRSOperatorAdaptor<Scalar,Ordinal,Node,S,N>::packFrom2D(const ArrayRCP<const size_t>  &begs, 
00838                       const ArrayRCP<const size_t>  &ends,
00839                       const ArrayRCP<ArrayRCP<Ordinal> > &inds,
00840                       const ArrayRCP<ArrayRCP<Scalar > > &vals,
00841                             ArrayRCP<size_t>  &rowOffsets,
00842                             ArrayRCP<Ordinal> &packedInds,
00843                             ArrayRCP<Scalar>  &packedVals)
00844   {
00845     TEST_FOR_EXCEPT(true);
00846   }
00847 
00848 }
00849 
00850 
00851 
00852 #endif // KOKKOS_PACKEDCRSOPERATORADAPTOR_HPP
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends