// Kokkos Node API and Local Linear Algebra Kernels: Version of the Day
// Kokkos_DefaultSparseOps.hpp
00001 //@HEADER
00002 // ************************************************************************
00003 // 
00004 //          Kokkos: Node API and Parallel Node Kernels
00005 //              Copyright (2008) Sandia Corporation
00006 // 
00007 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
00008 // the U.S. Government retains certain rights in this software.
00009 // 
00010 // Redistribution and use in source and binary forms, with or without
00011 // modification, are permitted provided that the following conditions are
00012 // met:
00013 //
00014 // 1. Redistributions of source code must retain the above copyright
00015 // notice, this list of conditions and the following disclaimer.
00016 //
00017 // 2. Redistributions in binary form must reproduce the above copyright
00018 // notice, this list of conditions and the following disclaimer in the
00019 // documentation and/or other materials provided with the distribution.
00020 //
00021 // 3. Neither the name of the Corporation nor the names of the
00022 // contributors may be used to endorse or promote products derived from
00023 // this software without specific prior written permission.
00024 //
00025 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
00026 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00027 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00028 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
00029 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00030 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00031 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00032 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00033 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00034 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00035 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00036 //
00037 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
00038 // 
00039 // ************************************************************************
00040 //@HEADER
00041 
00042 #ifndef KOKKOS_DEFAULTSPARSEOPS_HPP
00043 #define KOKKOS_DEFAULTSPARSEOPS_HPP
00044 
00045 #include <Teuchos_ArrayRCP.hpp>
00046 #include <Teuchos_DataAccess.hpp>
00047 #include <Teuchos_Assert.hpp>
00048 #include <Teuchos_TypeNameTraits.hpp>
00049 #include <Teuchos_CompileTimeAssert.hpp>
00050 #include <stdexcept>
00051 
00052 #include "Kokkos_ConfigDefs.hpp"
00053 #include "Kokkos_CrsMatrix.hpp" 
00054 #include "Kokkos_CrsGraph.hpp" 
00055 #include "Kokkos_MultiVector.hpp"
00056 #include "Kokkos_NodeHelpers.hpp"
00057 #include "Kokkos_DefaultArithmetic.hpp"
00058 #include "Kokkos_DefaultSparseScaleKernelOps.hpp"
00059 #include "Kokkos_DefaultSparseSolveKernelOps.hpp"
00060 #include "Kokkos_DefaultSparseMultiplyKernelOps.hpp"
00061 
00062 namespace Kokkos {
00063 
00067   template <class Scalar, class Ordinal, class Node>
00068   class DefaultHostSparseOps {
00069   public:
00071 
00072 
00074     typedef Scalar  ScalarType;
00076     typedef Ordinal OrdinalType;
00078     typedef Node    NodeType;
00079 
00084     template <class S2>
00085     struct rebind {
00086       typedef DefaultHostSparseOps<S2,Ordinal,Node> other;
00087     };
00088 
00090 
00091 
00092 
00094     DefaultHostSparseOps(const RCP<Node> &node);
00095 
00097     ~DefaultHostSparseOps();
00098 
00100 
00101 
00102     
00104     RCP<Node> getNode() const;
00105 
00107 
00108 
00109 
00111     void initializeStructure(const CrsGraphHostCompute<Ordinal,Node,DefaultHostSparseOps<void,Ordinal,Node> > &graph);
00112 
00114     void initializeValues(const CrsMatrixHostCompute<Scalar,Ordinal,Node,DefaultHostSparseOps<void,Ordinal,Node> > &matrix);
00115 
00117     void clear();
00118 
00120 
00121 
00122 
00124     template <class DomainScalar, class RangeScalar>
00125     void multiply(Teuchos::ETransp trans, RangeScalar alpha, const MultiVector<DomainScalar,Node> &X, MultiVector<RangeScalar,Node> &Y) const;
00126 
00128     template <class DomainScalar, class RangeScalar>
00129     void multiply(Teuchos::ETransp trans, 
00130                   RangeScalar alpha, const MultiVector<DomainScalar,Node> &X, RangeScalar beta, MultiVector<RangeScalar,Node> &Y) const;
00131 
00133     template <class DomainScalar, class RangeScalar>
00134     void solve(Teuchos::ETransp trans, Teuchos::EUplo uplo, Teuchos::EDiag diag, 
00135                const MultiVector<DomainScalar,Node> &Y, MultiVector<RangeScalar,Node> &X) const;
00136 
00138     template <class VectorScalar>
00139     void leftScale(const MultiVector<VectorScalar,Node> &X);
00140 
00142 
00143   protected:
00145     DefaultHostSparseOps(const DefaultHostSparseOps& source);
00146 
00147     RCP<Node> node_;
00148 
00149     // we do this one of two ways: 
00150     // 1D/packed: arrays of offsets, array of ordinals, array of values.
00151     ArrayRCP<const Ordinal> inds1D_;
00152     ArrayRCP<const size_t>  begs1D_, ends1D_;
00153     ArrayRCP<Scalar>  vals1D_;
00154     // 2D: array of pointers
00155     ArrayRCP<const ArrayRCP<Ordinal> > inds2D_;
00156     ArrayRCP<const ArrayRCP<Scalar> >  vals2D_;
00157     ArrayRCP<const size_t>          numEntries_;
00158     ArrayRCP<const Ordinal *> indPtrs_;
00159     ArrayRCP<Scalar  *> valPtrs_;
00160 
00161     size_t numRows_;
00162     bool indsInit_, valsInit_, isEmpty_;
00163   };
00164 
00165   template<class Scalar, class Ordinal, class Node>
00166   DefaultHostSparseOps<Scalar,Ordinal,Node>::DefaultHostSparseOps(const RCP<Node> &node)
00167   : node_(node) 
00168   {
00169     Teuchos::CompileTimeAssert<Node::isHostNode == false> cta; (void)cta;
00170     clear();
00171   }
00172 
00173   template<class Scalar, class Ordinal, class Node>
00174   DefaultHostSparseOps<Scalar,Ordinal,Node>::~DefaultHostSparseOps() {
00175   }
00176 
00177   template <class Scalar, class Ordinal, class Node>
00178   RCP<Node> DefaultHostSparseOps<Scalar,Ordinal,Node>::getNode() const {
00179     return node_; 
00180   }
00181 
00182   template <class Scalar, class Ordinal, class Node>
00183   void DefaultHostSparseOps<Scalar,Ordinal,Node>::clear() {
00184     begs1D_     = null;
00185     ends1D_     = null;
00186     inds1D_     = null;
00187     vals1D_     = null;
00188     inds2D_     = null;
00189     vals2D_     = null;
00190     numEntries_ = null;
00191     indPtrs_    = null;
00192     valPtrs_    = null;
00193     numRows_  = 0;
00194     indsInit_ = false;
00195     valsInit_ = false;
00196     isEmpty_  = false;
00197   }
00198 
00199   template <class Scalar, class Ordinal, class Node>
00200   void DefaultHostSparseOps<Scalar,Ordinal,Node>::initializeStructure(const CrsGraphHostCompute<Ordinal,Node,DefaultHostSparseOps<void,Ordinal,Node> > &graph) {
00201     using Teuchos::arcp;
00202     TEUCHOS_TEST_FOR_EXCEPTION(indsInit_ == true || valsInit_ == true, std::runtime_error, 
00203         Teuchos::typeName(*this) << "::initializeStructure(): structure already initialized.");
00204     numRows_ = graph.getNumRows();
00205     if (graph.isEmpty() || numRows_ == 0) {
00206       isEmpty_ = true;
00207     }
00208     else if (graph.is1DStructure()) {
00209       isEmpty_ = false;
00210       ArrayRCP<Ordinal> inds;
00211       ArrayRCP<size_t> begs, ends;
00212       const_cast<CrsGraphHostCompute<Ordinal,Node,DefaultHostSparseOps<void,Ordinal,Node> > &>(graph).get1DStructure( inds, begs, ends );
00213       inds1D_ = inds;
00214       begs1D_ = begs;
00215       ends1D_ = ends;
00216     }
00217     else {
00218       isEmpty_  = false;
00219       {
00220         ArrayRCP<ArrayRCP<Ordinal> > inds;
00221         ArrayRCP<size_t>  sizes;
00222         const_cast<CrsGraphHostCompute<Ordinal,Node,DefaultHostSparseOps<void,Ordinal,Node> > &>(graph).get2DStructure(inds,sizes);
00223         inds2D_     = inds;
00224         numEntries_ = sizes;
00225       }
00226       indPtrs_    = arcp<const Ordinal *>(numRows_);
00227       for (size_t r=0; r < numRows_; ++r) {
00228         indPtrs_[r] = inds2D_[r].getRawPtr();
00229       }
00230     }
00231     indsInit_ = true;
00232   }
00233 
00234 
00235   template <class Scalar, class Ordinal, class Node>
00236   void DefaultHostSparseOps<Scalar,Ordinal,Node>::initializeValues(const CrsMatrixHostCompute<Scalar,Ordinal,Node,DefaultHostSparseOps<void,Ordinal,Node> > &matrix) {
00237     using Teuchos::arcp;
00238     TEUCHOS_TEST_FOR_EXCEPTION(indsInit_ == false, std::runtime_error, 
00239         Teuchos::typeName(*this) << "::initializeValues(): must initialize values after graph.");
00240     TEUCHOS_TEST_FOR_EXCEPTION(numRows_ != matrix.getNumRows() || isEmpty_ != matrix.isEmpty() || 
00241                        (inds2D_ != null && matrix.is1DStructure()) || (inds1D_ != null && matrix.is2DStructure()),
00242                        std::runtime_error, Teuchos::typeName(*this) << "::initializeValues(): matrix not compatible with previously supplied graph.");
00243     if (!isEmpty_) {
00244       if (matrix.is1DStructure()) {
00245         ArrayRCP<Scalar> vals;
00246         const_cast<CrsMatrixHostCompute<Scalar,Ordinal,Node,DefaultHostSparseOps<void,Ordinal,Node> > &>(matrix).get1DValues( vals );
00247         vals1D_ = vals;
00248       }
00249       else {
00250         {
00251           ArrayRCP<ArrayRCP<Scalar> > vals;
00252           const_cast<CrsMatrixHostCompute<Scalar,Ordinal,Node,DefaultHostSparseOps<void,Ordinal,Node> > &>(matrix).get2DValues(vals);
00253           vals2D_ = vals;
00254         }
00255         valPtrs_ = arcp<Scalar *>(numRows_);
00256         for (size_t r=0; r < numRows_; ++r) {
00257           valPtrs_[r] = vals2D_[r].getRawPtr();
00258         }
00259       }
00260     }
00261     valsInit_ = true;
00262   }
00263 
00264 
00265   template <class Scalar, class Ordinal, class Node>
00266   template <class DomainScalar, class RangeScalar>
00267   void DefaultHostSparseOps<Scalar,Ordinal,Node>::solve(
00268                       Teuchos::ETransp trans, Teuchos::EUplo uplo, Teuchos::EDiag diag, 
00269                       const MultiVector<DomainScalar,Node> &Y,
00270                             MultiVector<RangeScalar,Node> &X) const 
00271   {
00272     typedef DefaultSparseSolveOp1<Scalar,Ordinal,DomainScalar,RangeScalar>  Op1D;
00273     typedef DefaultSparseSolveOp2<Scalar,Ordinal,DomainScalar,RangeScalar>  Op2D;
00274     typedef DefaultSparseTransposeSolveOp1<Scalar,Ordinal,DomainScalar,RangeScalar>  TOp1D;
00275     typedef DefaultSparseTransposeSolveOp2<Scalar,Ordinal,DomainScalar,RangeScalar>  TOp2D;
00276     TEUCHOS_TEST_FOR_EXCEPTION(indsInit_ == false || valsInit_ == false, std::runtime_error,
00277         Teuchos::typeName(*this) << "::solve(): this solve was not fully initialized.");
00278     TEUCHOS_TEST_FOR_EXCEPTION(X.getNumCols() != Y.getNumCols(), std::runtime_error,
00279         Teuchos::typeName(*this) << "::solve(): Left hand side and right hand side multivectors have differing numbers of vectors.");
00280     TEUCHOS_TEST_FOR_EXCEPTION(X.getNumRows() < numRows_, std::runtime_error,
00281         Teuchos::typeName(*this) << "::solve(): Left-hand-side multivector does not have enough rows. Likely cause is that the column map was not provided to the Tpetra::CrsMatrix in the case of an implicit unit diagonal.");
00282 
00283     ReadyBufferHelper<Node> rbh(node_);
00284     if (numRows_ == 0) {
00285       // null op
00286     }
00287     else if (isEmpty_) {
00288       TEUCHOS_TEST_FOR_EXCEPTION(diag != Teuchos::UNIT_DIAG, std::runtime_error,
00289           Teuchos::typeName(*this) << "::solve(): solve of empty matrix only valid for an implicit unit diagonal.");
00290       // solve I * X = Y for X = Y
00291       DefaultArithmetic<MultiVector<RangeScalar,Node> >::Assign(X,Y);
00292     }
00293     else if (begs1D_ != null) {
00294       if (trans == Teuchos::NO_TRANS) {
00295         Op1D wdp;
00296         rbh.begin();
00297         wdp.begs    = rbh.template addConstBuffer<size_t>(begs1D_);
00298         wdp.ends    = rbh.template addConstBuffer<size_t>(ends1D_);
00299         wdp.inds    = rbh.template addConstBuffer<Ordinal>(inds1D_);
00300         wdp.vals    = rbh.template addConstBuffer< Scalar>(vals1D_);
00301         wdp.x       = rbh.template addNonConstBuffer<DomainScalar>(X.getValuesNonConst());
00302         wdp.y       = rbh.template addConstBuffer<RangeScalar>(Y.getValues());
00303         rbh.end();
00304         wdp.numRows = numRows_;
00305         wdp.unitDiag = (diag == Teuchos::UNIT_DIAG ? true : false);
00306         wdp.upper    = (uplo == Teuchos::UPPER_TRI ? true : false);
00307         wdp.xstride = X.getStride();
00308         wdp.ystride = Y.getStride();
00309         const size_t numRHS = X.getNumCols();
00310         node_->template parallel_for<Op1D>(0,numRHS,wdp);
00311       }
00312       else {
00313         TOp1D wdp;
00314         rbh.begin();
00315         wdp.begs    = rbh.template addConstBuffer<size_t>(begs1D_);
00316         wdp.ends    = rbh.template addConstBuffer<size_t>(ends1D_);
00317         wdp.inds    = rbh.template addConstBuffer<Ordinal>(inds1D_);
00318         wdp.vals    = rbh.template addConstBuffer< Scalar>(vals1D_);
00319         wdp.x       = rbh.template addNonConstBuffer<DomainScalar>(X.getValuesNonConst());
00320         wdp.y       = rbh.template addConstBuffer<RangeScalar>(Y.getValues());
00321         rbh.end();
00322         wdp.numRows = numRows_;
00323         wdp.unitDiag = (diag == Teuchos::UNIT_DIAG ? true : false);
00324         wdp.upper    = (uplo == Teuchos::UPPER_TRI ? true : false);
00325         wdp.xstride = X.getStride();
00326         wdp.ystride = Y.getStride();
00327         const size_t numRHS = X.getNumCols();
00328         node_->template parallel_for<TOp1D>(0,numRHS,wdp);
00329       }
00330     }
00331     else {
00332       if (trans == Teuchos::NO_TRANS) {
00333         Op2D wdp;
00334         rbh.begin();
00335         wdp.numEntries = rbh.template addConstBuffer<size_t>(numEntries_);
00336         wdp.inds_beg   = rbh.template addConstBuffer<const Ordinal *>(indPtrs_);
00337         wdp.vals_beg   = rbh.template addConstBuffer< Scalar *>(valPtrs_);
00338         wdp.x       = rbh.template addNonConstBuffer<DomainScalar>(X.getValuesNonConst());
00339         wdp.y       = rbh.template addConstBuffer<RangeScalar>(Y.getValues());
00340         rbh.end();
00341         wdp.numRows = numRows_;
00342         wdp.unitDiag = (diag == Teuchos::UNIT_DIAG ? true : false);
00343         wdp.upper    = (uplo == Teuchos::UPPER_TRI ? true : false);
00344         wdp.xstride = X.getStride();
00345         wdp.ystride = Y.getStride();
00346         const size_t numRHS = X.getNumCols();
00347         node_->template parallel_for<Op2D>(0,numRHS,wdp);
00348       }
00349       else {
00350         TOp2D wdp;
00351         rbh.begin();
00352         wdp.numEntries = rbh.template addConstBuffer<size_t>(numEntries_);
00353         wdp.inds_beg   = rbh.template addConstBuffer<const Ordinal *>(indPtrs_);
00354         wdp.vals_beg   = rbh.template addConstBuffer< Scalar *>(valPtrs_);
00355         wdp.x       = rbh.template addNonConstBuffer<DomainScalar>(X.getValuesNonConst());
00356         wdp.y       = rbh.template addConstBuffer<RangeScalar>(Y.getValues());
00357         rbh.end();
00358         wdp.numRows = numRows_;
00359         wdp.unitDiag = (diag == Teuchos::UNIT_DIAG ? true : false);
00360         wdp.upper    = (uplo == Teuchos::UPPER_TRI ? true : false);
00361         wdp.xstride = X.getStride();
00362         wdp.ystride = Y.getStride();
00363         const size_t numRHS = X.getNumCols();
00364         node_->template parallel_for<TOp2D>(0,numRHS,wdp);
00365       }
00366     }
00367     return;
00368   }
00369 
00370 
00371   template <class Scalar, class Ordinal, class Node>
00372   template <class DomainScalar, class RangeScalar>
00373   void DefaultHostSparseOps<Scalar,Ordinal,Node>::multiply(
00374                                 Teuchos::ETransp trans, 
00375                                 RangeScalar alpha,
00376                                 const MultiVector<DomainScalar,Node> &X, 
00377                                 MultiVector<RangeScalar,Node> &Y) const 
00378   {
00379     // the 1 parameter to the template means that beta is ignored and the output multivector enjoys overwrite semantics
00380     typedef DefaultSparseMultiplyOp1<Scalar,Ordinal,DomainScalar,RangeScalar, 1>  Op1D;
00381     typedef DefaultSparseMultiplyOp2<Scalar,Ordinal,DomainScalar,RangeScalar, 1>  Op2D;
00382     typedef DefaultSparseTransposeMultiplyOp1<Scalar,Ordinal,DomainScalar,RangeScalar, 1> TOp1D;
00383     typedef DefaultSparseTransposeMultiplyOp2<Scalar,Ordinal,DomainScalar,RangeScalar, 1> TOp2D;
00384     TEUCHOS_TEST_FOR_EXCEPTION(indsInit_ == false || valsInit_ == false, std::runtime_error,
00385         Teuchos::typeName(*this) << "::multiply(): operation not fully initialized.");
00386     TEUCHOS_TEST_FOR_EXCEPT(X.getNumCols() != Y.getNumCols());
00387     ReadyBufferHelper<Node> rbh(node_);
00388     if (isEmpty_ == true) {
00389       // Y <= 0 * X
00390       //   <= 0
00391       DefaultArithmetic<MultiVector<RangeScalar,Node> >::Init(Y,Teuchos::ScalarTraits<RangeScalar>::zero());
00392     }
00393     else if (begs1D_ != null) {
00394       if (trans == Teuchos::NO_TRANS) {
00395         Op1D wdp;
00396         rbh.begin();
00397         wdp.alpha   = alpha;
00398         wdp.beta    = Teuchos::ScalarTraits<RangeScalar>::zero(); // not used
00399         wdp.numRows = numRows_;
00400         wdp.begs    = rbh.template addConstBuffer<size_t>(begs1D_);
00401         wdp.ends    = rbh.template addConstBuffer<size_t>(ends1D_);
00402         wdp.inds    = rbh.template addConstBuffer<Ordinal>(inds1D_);
00403         wdp.vals    = rbh.template addConstBuffer<Scalar>(vals1D_);
00404         wdp.x       = rbh.template addConstBuffer<DomainScalar>(X.getValues());
00405         wdp.y       = rbh.template addNonConstBuffer<RangeScalar>(Y.getValuesNonConst());
00406         wdp.xstride = X.getStride();
00407         wdp.ystride = Y.getStride();
00408         rbh.end();
00409         const size_t numRHS = X.getNumCols();
00410         node_->template parallel_for<Op1D>(0,numRows_*numRHS,wdp);
00411       }
00412       else {
00413         TOp1D wdp;
00414         rbh.begin();
00415         wdp.alpha   = alpha;
00416         wdp.beta    = Teuchos::ScalarTraits<RangeScalar>::zero(); // not used
00417         wdp.numRows = numRows_;
00418         wdp.numCols = Y.getNumRows();
00419         wdp.begs    = rbh.template addConstBuffer<size_t>(begs1D_);
00420         wdp.ends    = rbh.template addConstBuffer<size_t>(ends1D_);
00421         wdp.inds    = rbh.template addConstBuffer<Ordinal>(inds1D_);
00422         wdp.vals    = rbh.template addConstBuffer<Scalar>(vals1D_);
00423         wdp.x       = rbh.template addConstBuffer<DomainScalar>(X.getValues());
00424         wdp.y       = rbh.template addNonConstBuffer<RangeScalar>(Y.getValuesNonConst());
00425         wdp.xstride = X.getStride();
00426         wdp.ystride = Y.getStride();
00427         rbh.end();
00428         const size_t numRHS = X.getNumCols();
00429         node_->template parallel_for<TOp1D>(0,numRHS,wdp);
00430       }
00431     }
00432     else {
00433       if (trans == Teuchos::NO_TRANS) {
00434         Op2D wdp;
00435         rbh.begin();
00436         wdp.alpha   = alpha;
00437         wdp.beta    = Teuchos::ScalarTraits<RangeScalar>::zero(); // not used
00438         wdp.numRows = numRows_;
00439         wdp.numEntries = rbh.template addConstBuffer<size_t>(numEntries_);
00440         wdp.inds_beg   = rbh.template addConstBuffer<const Ordinal *>(indPtrs_);
00441         wdp.vals_beg   = rbh.template addConstBuffer<Scalar *>(valPtrs_);
00442         wdp.x          = rbh.template addConstBuffer<DomainScalar>(X.getValues());
00443         wdp.y          = rbh.template addNonConstBuffer<RangeScalar>(Y.getValuesNonConst());
00444         rbh.end();
00445         wdp.xstride = X.getStride();
00446         wdp.ystride = Y.getStride();
00447         const size_t numRHS = X.getNumCols();
00448         node_->template parallel_for<Op2D>(0,numRows_*numRHS,wdp);
00449       }
00450       else {
00451         TOp2D wdp;
00452         rbh.begin();
00453         wdp.alpha   = alpha;
00454         wdp.beta    = Teuchos::ScalarTraits<RangeScalar>::zero(); // not used
00455         wdp.numRows = numRows_;
00456         wdp.numCols = Y.getNumRows();
00457         wdp.numEntries = rbh.template addConstBuffer<size_t>(numEntries_);
00458         wdp.inds_beg   = rbh.template addConstBuffer<const Ordinal *>(indPtrs_);
00459         wdp.vals_beg   = rbh.template addConstBuffer<Scalar *>(valPtrs_);
00460         wdp.x          = rbh.template addConstBuffer<DomainScalar>(X.getValues());
00461         wdp.y          = rbh.template addNonConstBuffer<RangeScalar>(Y.getValuesNonConst());
00462         wdp.xstride = X.getStride();
00463         wdp.ystride = Y.getStride();
00464         rbh.end();
00465         const size_t numRHS = X.getNumCols();
00466         node_->template parallel_for<TOp2D>(0,numRHS,wdp);
00467       }
00468     }
00469     return;
00470   }
00471 
00472 
00473   template <class Scalar, class Ordinal, class Node>
00474   template <class DomainScalar, class RangeScalar>
00475   void DefaultHostSparseOps<Scalar,Ordinal,Node>::multiply(
00476                                 Teuchos::ETransp trans, 
00477                                 RangeScalar alpha, const MultiVector<DomainScalar,Node> &X, 
00478                                 RangeScalar beta, MultiVector<RangeScalar,Node> &Y) const 
00479   {
00480     // the 0 parameter means that beta is considered, and the output multivector enjoys accumulate semantics
00481     typedef DefaultSparseMultiplyOp1<Scalar,Ordinal,DomainScalar,RangeScalar, 0>  Op1D;
00482     typedef DefaultSparseMultiplyOp2<Scalar,Ordinal,DomainScalar,RangeScalar, 0>  Op2D;
00483     typedef DefaultSparseTransposeMultiplyOp1<Scalar,Ordinal,DomainScalar,RangeScalar, 0> TOp1D;
00484     typedef DefaultSparseTransposeMultiplyOp2<Scalar,Ordinal,DomainScalar,RangeScalar, 0> TOp2D;
00485     TEUCHOS_TEST_FOR_EXCEPTION(indsInit_ == false || valsInit_ == false, std::runtime_error,
00486         Teuchos::typeName(*this) << "::multiply(): operation not fully initialized.");
00487     TEUCHOS_TEST_FOR_EXCEPT(X.getNumCols() != Y.getNumCols());
00488     ReadyBufferHelper<Node> rbh(node_);
00489     if (isEmpty_ == true) {
00490       // Y <= alpha * 0 * X + beta * Y 
00491       //   <= beta * Y
00492       // NOTE: this neglects NaNs in X, which don't satisfy 0*NaN == 0
00493       //       however, X and Y may be of different size, and we need entries to determine how to mix those potential NaNs in X into Y
00494       //       therefore, the best we can do is scale Y to zero. Setting Y to zero would destroy NaNs in Y, which violates the semantics of the call.
00495       DefaultArithmetic<MultiVector<RangeScalar,Node> >::Scale(Y,beta);
00496     }
00497     else if (begs1D_ != null) {
00498       if (trans == Teuchos::NO_TRANS) {
00499         Op1D wdp;
00500         rbh.begin();
00501         wdp.alpha   = alpha;
00502         wdp.beta    = beta;
00503         wdp.numRows = numRows_;
00504         wdp.begs    = rbh.template addConstBuffer<size_t>(begs1D_);
00505         wdp.ends    = rbh.template addConstBuffer<size_t>(ends1D_);
00506         wdp.inds    = rbh.template addConstBuffer<Ordinal>(inds1D_);
00507         wdp.vals    = rbh.template addConstBuffer< Scalar>(vals1D_);
00508         wdp.x       = rbh.template addConstBuffer<DomainScalar>(X.getValues());
00509         wdp.y       = rbh.template addNonConstBuffer<RangeScalar>(Y.getValuesNonConst());
00510         wdp.xstride = X.getStride();
00511         wdp.ystride = Y.getStride();
00512         rbh.end();
00513         const size_t numRHS = X.getNumCols();
00514         node_->template parallel_for<Op1D>(0,numRows_*numRHS,wdp);
00515       }
00516       else {
00517         TOp1D wdp;
00518         rbh.begin();
00519         wdp.alpha   = alpha;
00520         wdp.beta    = beta;
00521         wdp.numRows = numRows_;
00522         wdp.numCols = Y.getNumRows();
00523         wdp.begs    = rbh.template addConstBuffer<size_t>(begs1D_);
00524         wdp.ends    = rbh.template addConstBuffer<size_t>(ends1D_);
00525         wdp.inds    = rbh.template addConstBuffer<Ordinal>(inds1D_);
00526         wdp.vals    = rbh.template addConstBuffer< Scalar>(vals1D_);
00527         wdp.x       = rbh.template addConstBuffer<DomainScalar>(X.getValues());
00528         wdp.y       = rbh.template addNonConstBuffer<RangeScalar>(Y.getValuesNonConst());
00529         wdp.xstride = X.getStride();
00530         wdp.ystride = Y.getStride();
00531         rbh.end();
00532         const size_t numRHS = X.getNumCols();
00533         node_->template parallel_for<TOp1D>(0,numRHS,wdp);
00534       }
00535     }
00536     else {
00537       if (trans == Teuchos::NO_TRANS) {
00538         Op2D wdp;
00539         rbh.begin();
00540         wdp.numRows = numRows_;
00541         wdp.numEntries = rbh.template addConstBuffer<size_t>(numEntries_);
00542         wdp.inds_beg   = rbh.template addConstBuffer<const Ordinal *>(indPtrs_);
00543         wdp.vals_beg   = rbh.template addConstBuffer<Scalar *>(valPtrs_);
00544         wdp.x          = rbh.template addConstBuffer<DomainScalar>(X.getValues());
00545         wdp.y          = rbh.template addNonConstBuffer<RangeScalar>(Y.getValuesNonConst());
00546         rbh.end();
00547         wdp.alpha   = alpha;
00548         wdp.beta    = beta;
00549         wdp.xstride = X.getStride();
00550         wdp.ystride = Y.getStride();
00551         const size_t numRHS = X.getNumCols();
00552         node_->template parallel_for<Op2D>(0,numRows_*numRHS,wdp);
00553       }
00554       else {
00555         TOp2D wdp;
00556         rbh.begin();
00557         wdp.alpha   = alpha;
00558         wdp.beta    = beta;
00559         wdp.numRows = numRows_;
00560         wdp.numCols = Y.getNumRows();
00561         wdp.numEntries = rbh.template addConstBuffer<size_t>(numEntries_);
00562         wdp.inds_beg   = rbh.template addConstBuffer<const Ordinal *>(indPtrs_);
00563         wdp.vals_beg   = rbh.template addConstBuffer<Scalar *>(valPtrs_);
00564         wdp.x          = rbh.template addConstBuffer<DomainScalar>(X.getValues());
00565         wdp.y          = rbh.template addNonConstBuffer<RangeScalar>(Y.getValuesNonConst());
00566         wdp.xstride = X.getStride();
00567         wdp.ystride = Y.getStride();
00568         rbh.end();
00569         const size_t numRHS = X.getNumCols();
00570         node_->template parallel_for<TOp2D>(0,numRHS,wdp);
00571       }
00572     }
00573     return;
00574   }
00575 
00576 
00577 
00578 
00579   template <class Scalar, class Ordinal, class Node>
00580   template <class VectorScalar>
00581   void DefaultHostSparseOps<Scalar,Ordinal,Node>::leftScale(const MultiVector<VectorScalar,Node> &X) 
00582   {
00583     typedef DefaultSparseScaleOp1<Scalar,Ordinal,VectorScalar>  Op1D;
00584     typedef DefaultSparseScaleOp2<Scalar,Ordinal,VectorScalar>  Op2D;
00585     TEUCHOS_TEST_FOR_EXCEPTION(indsInit_ == false || valsInit_ == false, std::runtime_error,
00586         Teuchos::typeName(*this) << "::leftScale(): operation not fully initialized.");
00587     TEUCHOS_TEST_FOR_EXCEPT(X.getNumCols() != 1);
00588     ReadyBufferHelper<Node> rbh(node_);
00589     if (begs1D_ != null) {
00590       Op1D wdp;
00591       rbh.begin();
00592       wdp.numRows = numRows_;
00593       wdp.begs    = rbh.template addConstBuffer<size_t>(begs1D_);
00594       wdp.ends    = rbh.template addConstBuffer<size_t>(ends1D_);
00595       wdp.inds    = rbh.template addConstBuffer<Ordinal>(inds1D_);
00596       wdp.vals    = rbh.template addNonConstBuffer<Scalar>(vals1D_);
00597       wdp.x       = rbh.template addConstBuffer<VectorScalar>(X.getValues());
00598       rbh.end();
00599       node_->template parallel_for<Op1D>(0,numRows_,wdp); 
00600     }
00601     else {
00602       Op2D wdp;
00603       rbh.begin();
00604       wdp.numRows = numRows_;
00605       wdp.numEntries = rbh.template addConstBuffer<size_t>(numEntries_);
00606       wdp.inds_beg   = rbh.template addConstBuffer<const Ordinal *>(indPtrs_);
00607       wdp.vals_beg   = rbh.template addNonConstBuffer<Scalar *>(valPtrs_);
00608       wdp.x          = rbh.template addConstBuffer<VectorScalar>(X.getValues());
00609       rbh.end();
00610       node_->template parallel_for<Op2D>(0,numRows_,wdp);
00611     }
00612     return;
00613   }
00614 
00615 
00619   template <class Scalar, class Ordinal, class Node>
00620   class DefaultDeviceSparseOps {
00621   public:
00622     typedef Scalar  ScalarType;
00623     typedef Ordinal OrdinalType;
00624     typedef Node    NodeType;
00625 
00626     template <class S2>
00627     struct rebind {
00628       typedef DefaultDeviceSparseOps<S2,Ordinal,Node> other;
00629     };
00630 
00632 
00634 
00636     DefaultDeviceSparseOps(const RCP<Node> &node);
00637 
00639     ~DefaultDeviceSparseOps();
00640 
00642 
00644 
00645     
00647     RCP<Node> getNode() const;
00648 
00650 
00652 
00654 
00656     void initializeStructure(const CrsGraphDeviceCompute<Ordinal,Node,DefaultDeviceSparseOps<void,Ordinal,Node> > &graph);
00657 
00659     void initializeValues(const CrsMatrixDeviceCompute<Scalar,Ordinal,Node,DefaultDeviceSparseOps<void,Ordinal,Node> > &matrix);
00660 
00662     void clear();
00663 
00665 
00667 
00669 
00671     template <class DomainScalar, class RangeScalar>
00672     void multiply(Teuchos::ETransp trans, RangeScalar alpha, const MultiVector<DomainScalar,Node> &X, MultiVector<RangeScalar,Node> &Y) const;
00673 
00675     template <class DomainScalar, class RangeScalar>
00676     void multiply(Teuchos::ETransp trans, 
00677                   RangeScalar alpha, const MultiVector<DomainScalar,Node> &X, RangeScalar beta, MultiVector<RangeScalar,Node> &Y) const;
00678 
00680     template <class DomainScalar, class RangeScalar>
00681     void solve(Teuchos::ETransp trans, Teuchos::EUplo uplo, Teuchos::EDiag diag, 
00682                const MultiVector<DomainScalar,Node> &Y, MultiVector<RangeScalar,Node> &X) const;
00683 
00684 
00686     template <class VectorScalar>
00687     void leftScale(const MultiVector<VectorScalar,Node> &X);
00688 
00690 
00691   protected:
00693     DefaultDeviceSparseOps(const DefaultDeviceSparseOps& source);
00694 
00695     RCP<Node> node_;
00696 
00697     // we do this one of two ways: 
00698     // 1D: array of offsets, pointer for ordinals, pointer for values.
00699     ArrayRCP<const size_t>  pbuf_offsets1D_;
00700     ArrayRCP<const Ordinal> pbuf_inds1D_;
00701     ArrayRCP<Scalar>  pbuf_vals1D_;
00702 
00703     size_t numRows_;
00704     bool indsInit_, valsInit_, isEmpty_;
00705   };
00706 
00707   template<class Scalar, class Ordinal, class Node>
00708   DefaultDeviceSparseOps<Scalar,Ordinal,Node>::DefaultDeviceSparseOps(const RCP<Node> &node)
00709   : node_(node) 
00710   {
00711     clear();
00712   }
00713 
00714   template<class Scalar, class Ordinal, class Node>
00715   DefaultDeviceSparseOps<Scalar,Ordinal,Node>::~DefaultDeviceSparseOps() {
00716   }
00717 
00718   template <class Scalar, class Ordinal, class Node>
00719   RCP<Node> DefaultDeviceSparseOps<Scalar,Ordinal,Node>::getNode() const { 
00720     return node_; 
00721   }
00722 
00723   template <class Scalar, class Ordinal, class Node>
00724   void DefaultDeviceSparseOps<Scalar,Ordinal,Node>::clear() {
00725     pbuf_offsets1D_  = null;
00726     pbuf_inds1D_     = null;
00727     pbuf_vals1D_     = null;
00728     numRows_  = 0;
00729     indsInit_ = false;
00730     valsInit_ = false;
00731     isEmpty_  = false;
00732   }
00733 
00734   template <class Scalar, class Ordinal, class Node>
00735   void DefaultDeviceSparseOps<Scalar,Ordinal,Node>::initializeStructure(const CrsGraphDeviceCompute<Ordinal,Node,DefaultDeviceSparseOps<void,Ordinal,Node> > &graph) {
00736     TEUCHOS_TEST_FOR_EXCEPTION(indsInit_ == true || valsInit_ == true, std::runtime_error, 
00737         Teuchos::typeName(*this) << "::initializeStructure(): structure already initialized.");
00738     numRows_ = graph.getNumRows();
00739     if (graph.isEmpty() || numRows_ == 0) {
00740       isEmpty_ = true;
00741     }
00742     else {
00743       ArrayRCP<Ordinal> inds;
00744       ArrayRCP<size_t > offs;
00745       const_cast<CrsGraphDeviceCompute<Ordinal,Node,DefaultDeviceSparseOps<void,Ordinal,Node> > &>(graph).getDeviceBuffers(inds, offs);
00746       pbuf_inds1D_    = inds;
00747       pbuf_offsets1D_ = offs;
00748     }
00749     indsInit_ = true;
00750   }
00751 
00752   template <class Scalar, class Ordinal, class Node>
00753   void DefaultDeviceSparseOps<Scalar,Ordinal,Node>::initializeValues(const CrsMatrixDeviceCompute<Scalar,Ordinal,Node,DefaultDeviceSparseOps<void,Ordinal,Node> > &matrix) {
00754     TEUCHOS_TEST_FOR_EXCEPTION(indsInit_ == false, std::runtime_error, 
00755         Teuchos::typeName(*this) << "::initializeValues(): must initialize values after graph.");
00756     TEUCHOS_TEST_FOR_EXCEPTION(numRows_ != matrix.getNumRows() || isEmpty_ != matrix.isEmpty(), std::runtime_error,
00757         Teuchos::typeName(*this) << "::initializeValues(): matrix not compatible with previously supplied graph.");
00758     if (!isEmpty_) {
00759       ArrayRCP<Scalar> vals;
00760       const_cast<CrsMatrixDeviceCompute<Scalar,Ordinal,Node,DefaultDeviceSparseOps<void,Ordinal,Node> > &>(matrix).getDeviceBuffer(vals);
00761       pbuf_vals1D_ = vals;
00762     }
00763     valsInit_ = true;
00764   }
00765 
00766 
  // Solve op(A)*X = Y for X, where A is the locally stored triangular matrix,
  // Y is the right-hand side and X the solution.  Both multivectors may hold
  // several columns; the kernel is launched with one work item per column.
  template <class Scalar, class Ordinal, class Node>
  template <class DomainScalar, class RangeScalar>
  void DefaultDeviceSparseOps<Scalar,Ordinal,Node>::solve(
                      Teuchos::ETransp trans, Teuchos::EUplo uplo, Teuchos::EDiag diag, 
                      const MultiVector<DomainScalar,Node> &Y,
                            MultiVector<RangeScalar,Node> &X) const {
    // Op1D handles op(A) = A; TOp1D handles the transposed solve.
    typedef DefaultSparseSolveOp1<Scalar,Ordinal,DomainScalar,RangeScalar>  Op1D;
    typedef DefaultSparseTransposeSolveOp1<Scalar,Ordinal,DomainScalar,RangeScalar>  TOp1D;
    TEUCHOS_TEST_FOR_EXCEPTION(indsInit_ == false || valsInit_ == false, std::runtime_error,
        Teuchos::typeName(*this) << "::solve(): this solve was not fully initialized.");
    TEUCHOS_TEST_FOR_EXCEPTION(X.getNumCols() != Y.getNumCols(), std::runtime_error,
        Teuchos::typeName(*this) << "::solve(): Left hand side and right hand side multivectors have differing numbers of vectors.");
    TEUCHOS_TEST_FOR_EXCEPTION(X.getNumRows() < numRows_, std::runtime_error,
        Teuchos::typeName(*this) << "::solve(): Left-hand-side multivector does not have enough rows. Likely cause is that the column map was not provided to the Tpetra::CrsMatrix in the case of an implicit unit diagonal.");

    ReadyBufferHelper<Node> rbh(node_);
    if (numRows_ == 0) {
      // null op
    }
    else if (isEmpty_) {
      // An empty matrix is only solvable if it represents the identity
      // (implicit unit diagonal); anything else would be singular.
      TEUCHOS_TEST_FOR_EXCEPTION(diag != Teuchos::UNIT_DIAG, std::runtime_error,
          Teuchos::typeName(*this) << "::solve(): solve of empty matrix only valid for an implicit unit diagonal.");
      // solve I * X = Y for X = Y
      DefaultArithmetic<MultiVector<RangeScalar,Node> >::Assign(X,Y);
    }
    else {
      if (trans == Teuchos::NO_TRANS) {
        Op1D wdp;
        // Buffers are registered between rbh.begin() and rbh.end(), following
        // the ReadyBufferHelper usage pattern seen throughout this class.
        rbh.begin();
        wdp.begs    = rbh.template addConstBuffer<size_t>(pbuf_offsets1D_);
        wdp.ends    = wdp.begs+1;  // ends is begs shifted by one: row i spans [begs[i], begs[i+1])
        wdp.inds    = rbh.template addConstBuffer<Ordinal>(pbuf_inds1D_);
        wdp.vals    = rbh.template addConstBuffer< Scalar>(pbuf_vals1D_);
        // NOTE(review): X is a MultiVector<RangeScalar>, yet it is registered
        // here as DomainScalar (and Y, a MultiVector<DomainScalar>, as
        // RangeScalar) -- the opposite of the convention used in multiply().
        // This can only instantiate when DomainScalar == RangeScalar; confirm
        // against DefaultSparseSolveOp1's member types before relying on
        // mixed-scalar solves.
        wdp.x       = rbh.template addNonConstBuffer<DomainScalar>(X.getValuesNonConst());
        wdp.y       = rbh.template addConstBuffer<RangeScalar>(Y.getValues());
        rbh.end();
        wdp.numRows = numRows_;
        wdp.unitDiag = (diag == Teuchos::UNIT_DIAG ? true : false);
        wdp.upper    = (uplo == Teuchos::UPPER_TRI ? true : false);
        wdp.xstride = X.getStride();
        wdp.ystride = Y.getStride();
        const size_t numRHS = X.getNumCols();
        // Parallelism is over right-hand-side columns only.
        node_->template parallel_for<Op1D>(0,numRHS,wdp);
      }
      else {
        TOp1D wdp;
        rbh.begin();
        wdp.begs    = rbh.template addConstBuffer<size_t>(pbuf_offsets1D_);
        wdp.ends    = wdp.begs+1;
        wdp.inds    = rbh.template addConstBuffer<Ordinal>(pbuf_inds1D_);
        wdp.vals    = rbh.template addConstBuffer< Scalar>(pbuf_vals1D_);
        // NOTE(review): same scalar-type registration concern as in the
        // NO_TRANS branch above.
        wdp.x       = rbh.template addNonConstBuffer<DomainScalar>(X.getValuesNonConst());
        wdp.y       = rbh.template addConstBuffer<RangeScalar>(Y.getValues());
        rbh.end();
        wdp.numRows = numRows_;
        wdp.unitDiag = (diag == Teuchos::UNIT_DIAG ? true : false);
        wdp.upper    = (uplo == Teuchos::UPPER_TRI ? true : false);
        wdp.xstride = X.getStride();
        wdp.ystride = Y.getStride();
        const size_t numRHS = X.getNumCols();
        node_->template parallel_for<TOp1D>(0,numRHS,wdp);
      }
    }
    return;
  }
00832 
00833 
00834   template <class Scalar, class Ordinal, class Node>
00835   template <class DomainScalar, class RangeScalar>
00836   void DefaultDeviceSparseOps<Scalar,Ordinal,Node>::multiply(
00837                                 Teuchos::ETransp trans, 
00838                                 RangeScalar alpha,
00839                                 const MultiVector<DomainScalar,Node> &X, 
00840                                 MultiVector<RangeScalar,Node> &Y) const {
00841     // the 1 parameter to the template means that beta is ignored and the output multivector enjoys overwrite semantics
00842     typedef DefaultSparseMultiplyOp1<Scalar,Ordinal,DomainScalar,RangeScalar, 1>  Op1D;
00843     typedef DefaultSparseTransposeMultiplyOp1<Scalar,Ordinal,DomainScalar,RangeScalar, 1> TOp1D;
00844     TEUCHOS_TEST_FOR_EXCEPTION(indsInit_ == false || valsInit_ == false, std::runtime_error,
00845         Teuchos::typeName(*this) << "::multiply(): operation not fully initialized.");
00846     TEUCHOS_TEST_FOR_EXCEPT(X.getNumCols() != Y.getNumCols());
00847     ReadyBufferHelper<Node> rbh(node_);
00848     if (isEmpty_ == true) {
00849       // Y <= 0 * X
00850       //   <= 0
00851       DefaultArithmetic<MultiVector<RangeScalar,Node> >::Init(Y,Teuchos::ScalarTraits<RangeScalar>::zero());
00852     }
00853     else {
00854       if (trans == Teuchos::NO_TRANS) {
00855         Op1D wdp;
00856         rbh.begin();
00857         wdp.alpha   = alpha;
00858         wdp.beta    = Teuchos::ScalarTraits<RangeScalar>::zero(); // not used
00859         wdp.numRows = numRows_;
00860         wdp.begs    = rbh.template addConstBuffer<size_t>(pbuf_offsets1D_);
00861         wdp.ends    = wdp.begs+1;
00862         wdp.inds    = rbh.template addConstBuffer<Ordinal>(pbuf_inds1D_);
00863         wdp.vals    = rbh.template addConstBuffer<Scalar>(pbuf_vals1D_);
00864         wdp.x       = rbh.template addConstBuffer<DomainScalar>(X.getValues());
00865         wdp.y       = rbh.template addNonConstBuffer<RangeScalar>(Y.getValuesNonConst());
00866         wdp.xstride = X.getStride();
00867         wdp.ystride = Y.getStride();
00868         rbh.end();
00869         const size_t numRHS = X.getNumCols();
00870         node_->template parallel_for<Op1D>(0,numRows_*numRHS,wdp);
00871       }
00872       else {
00873         TOp1D wdp;
00874         rbh.begin();
00875         wdp.alpha   = alpha;
00876         wdp.beta    = Teuchos::ScalarTraits<RangeScalar>::zero(); // not used
00877         wdp.numRows = numRows_;
00878         wdp.numCols = Y.getNumRows();
00879         wdp.begs    = rbh.template addConstBuffer<size_t>(pbuf_offsets1D_);
00880         wdp.ends    = wdp.begs+1;
00881         wdp.inds    = rbh.template addConstBuffer<Ordinal>(pbuf_inds1D_);
00882         wdp.vals    = rbh.template addConstBuffer<Scalar>(pbuf_vals1D_);
00883         wdp.x       = rbh.template addConstBuffer<DomainScalar>(X.getValues());
00884         wdp.y       = rbh.template addNonConstBuffer<RangeScalar>(Y.getValuesNonConst());
00885         wdp.xstride = X.getStride();
00886         wdp.ystride = Y.getStride();
00887         rbh.end();
00888         const size_t numRHS = X.getNumCols();
00889         node_->template parallel_for<TOp1D>(0,numRHS,wdp);
00890       }
00891     }
00892     return;
00893   }
00894 
00895 
00896   template <class Scalar, class Ordinal, class Node>
00897   template <class DomainScalar, class RangeScalar>
00898   void DefaultDeviceSparseOps<Scalar,Ordinal,Node>::multiply(
00899                                 Teuchos::ETransp trans, 
00900                                 RangeScalar alpha, const MultiVector<DomainScalar,Node> &X, 
00901                                 RangeScalar beta, MultiVector<RangeScalar,Node> &Y) const {
00902     // the 0 parameter means that beta is considered, and the output multivector enjoys accumulate semantics
00903     typedef DefaultSparseMultiplyOp1<Scalar,Ordinal,DomainScalar,RangeScalar, 0>  Op1D;
00904     typedef DefaultSparseTransposeMultiplyOp1<Scalar,Ordinal,DomainScalar,RangeScalar, 0> TOp1D;
00905     TEUCHOS_TEST_FOR_EXCEPTION(indsInit_ == false || valsInit_ == false, std::runtime_error,
00906         Teuchos::typeName(*this) << "::multiply(): operation not fully initialized.");
00907     TEUCHOS_TEST_FOR_EXCEPT(X.getNumCols() != Y.getNumCols());
00908     ReadyBufferHelper<Node> rbh(node_);
00909     if (isEmpty_ == true) {
00910       // Y <= alpha * 0 * X + beta * Y 
00911       //   <= beta * Y
00912       // NOTE: this neglects NaNs in X, which don't satisfy 0*NaN == 0
00913       //       however, X and Y may be of different size, and we need entries to determine how to mix those potential NaNs in X into Y
00914       //       therefore, the best we can do is scale Y to zero. Setting Y to zero would destroy NaNs in Y, which violates the semantics of the call.
00915       DefaultArithmetic<MultiVector<RangeScalar,Node> >::Scale(Y,beta);
00916     }
00917     else {
00918       if (trans == Teuchos::NO_TRANS) {
00919         Op1D wdp;
00920         rbh.begin();
00921         wdp.alpha   = alpha;
00922         wdp.beta    = beta;
00923         wdp.numRows = numRows_;
00924         wdp.begs    = rbh.template addConstBuffer<size_t>(pbuf_offsets1D_);
00925         wdp.ends    = wdp.begs+1;
00926         wdp.inds    = rbh.template addConstBuffer<Ordinal>(pbuf_inds1D_);
00927         wdp.vals    = rbh.template addConstBuffer< Scalar>(pbuf_vals1D_);
00928         wdp.x       = rbh.template addConstBuffer<DomainScalar>(X.getValues());
00929         wdp.y       = rbh.template addNonConstBuffer<RangeScalar>(Y.getValuesNonConst());
00930         wdp.xstride = X.getStride();
00931         wdp.ystride = Y.getStride();
00932         rbh.end();
00933         const size_t numRHS = X.getNumCols();
00934         node_->template parallel_for<Op1D>(0,numRows_*numRHS,wdp);
00935       }
00936       else {
00937         TOp1D wdp;
00938         rbh.begin();
00939         wdp.alpha   = alpha;
00940         wdp.beta    = beta;
00941         wdp.numRows = numRows_;
00942         wdp.numCols = Y.getNumRows();
00943         wdp.begs    = rbh.template addConstBuffer<size_t>(pbuf_offsets1D_);
00944         wdp.ends    = wdp.begs+1;
00945         wdp.inds    = rbh.template addConstBuffer<Ordinal>(pbuf_inds1D_);
00946         wdp.vals    = rbh.template addConstBuffer< Scalar>(pbuf_vals1D_);
00947         wdp.x       = rbh.template addConstBuffer<DomainScalar>(X.getValues());
00948         wdp.y       = rbh.template addNonConstBuffer<RangeScalar>(Y.getValuesNonConst());
00949         wdp.xstride = X.getStride();
00950         wdp.ystride = Y.getStride();
00951         rbh.end();
00952         const size_t numRHS = X.getNumCols();
00953         node_->template parallel_for<TOp1D>(0,numRHS,wdp);
00954       }
00955     }
00956     return;
00957   }
00958 
00959 
00960 
00961 
00962 
00963   template <class Scalar, class Ordinal, class Node>
00964   template <class VectorScalar>
00965   void DefaultDeviceSparseOps<Scalar,Ordinal,Node>::leftScale(const MultiVector<VectorScalar,Node> &X)
00966   {
00967     typedef DefaultSparseScaleOp1<Scalar,Ordinal,VectorScalar>  Op1D;
00968     TEUCHOS_TEST_FOR_EXCEPTION(indsInit_ == false || valsInit_ == false, std::runtime_error,
00969         Teuchos::typeName(*this) << "::scale(): operation not fully initialized.");
00970     TEUCHOS_TEST_FOR_EXCEPT(X.getNumCols() != 1);
00971     ReadyBufferHelper<Node> rbh(node_);
00972 
00973 
00974     Op1D wdp;
00975     rbh.begin();
00976     wdp.numRows = numRows_;
00977     wdp.begs    = rbh.template addConstBuffer<size_t>(pbuf_offsets1D_);
00978     wdp.ends    = wdp.begs+1;
00979     wdp.inds    = rbh.template addConstBuffer<Ordinal>(pbuf_inds1D_);
00980     wdp.vals    = rbh.template addNonConstBuffer<Scalar>(pbuf_vals1D_);
00981     wdp.x       = rbh.template addConstBuffer<VectorScalar>(X.getValues());
00982     rbh.end();
00983     const size_t numRHS = X.getNumCols();
00984     node_->template parallel_for<Op1D>(0,numRows_*numRHS,wdp);
00985     
00986     return;
00987   }
00988 
00989 
00990 
00991 
00992 
00997 } // namespace Kokkos
00998 
00999 #endif /* KOKKOS_DEFAULTSPARSEOPS_HPP */
01000 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends