Kokkos_DefaultSparseMultiply.hpp

00001 //@HEADER
00002 // ************************************************************************
00003 // 
00004 //          Kokkos: Node API and Parallel Node Kernels
00005 //              Copyright (2009) Sandia Corporation
00006 // 
00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
00008 // license for use of this work by or on behalf of the U.S. Government.
00009 // 
00010 // This library is free software; you can redistribute it and/or modify
00011 // it under the terms of the GNU Lesser General Public License as
00012 // published by the Free Software Foundation; either version 2.1 of the
00013 // License, or (at your option) any later version.
00014 //  
00015 // This library is distributed in the hope that it will be useful, but
00016 // WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 // Lesser General Public License for more details.
00019 //  
00020 // You should have received a copy of the GNU Lesser General Public
00021 // License along with this library; if not, write to the Free Software
00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00023 // USA
00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
00025 // 
00026 // ************************************************************************
00027 //@HEADER
00028 
00029 #ifndef KOKKOS_DEFAULTSPARSEMULTIPLY_HPP
00030 #define KOKKOS_DEFAULTSPARSEMULTIPLY_HPP
00031 
00032 #include <Teuchos_ArrayRCP.hpp>
00033 #include <Teuchos_DataAccess.hpp>
00034 #include <Teuchos_TestForException.hpp>
00035 #include <Teuchos_TypeNameTraits.hpp>
00036 #include <Teuchos_BLAS_types.hpp>
00037 #include <stdexcept>
00038 
00039 #include "Kokkos_ConfigDefs.hpp"
00040 #include "Kokkos_CrsMatrix.hpp" 
00041 #include "Kokkos_CrsGraph.hpp" 
00042 #include "Kokkos_MultiVector.hpp"
00043 #include "Kokkos_NodeHelpers.hpp"
00044 #include "Kokkos_DefaultArithmetic.hpp"
00045 #include "Kokkos_DefaultSparseMultiplyKernelOps.hpp"
00046 
00047 namespace Kokkos {
00048 
00049   // default implementation
00050   template <class Scalar, class Ordinal, class Node = DefaultNode::DefaultNodeType>
00054   class DefaultSparseMultiply {
00055   public:
00056     typedef Scalar  ScalarType;
00057     typedef Ordinal OrdinalType;
00058     typedef Node    NodeType;
00059 
00061 
00063 
00065     DefaultSparseMultiply(const Teuchos::RCP<Node> &node = DefaultNode::getDefaultNode());
00066 
00068     ~DefaultSparseMultiply();
00069 
00071 
00073 
00074     
00076     Teuchos::RCP<Node> getNode() const;
00077 
00079 
00081 
00083 
00085     template <class GRAPH>
00086     Teuchos::DataAccess initializeStructure(const GRAPH &graph, Teuchos::DataAccess cv);
00087 
00089     template <class MATRIX>
00090     Teuchos::DataAccess initializeValues(const MATRIX &matrix, Teuchos::DataAccess cv);
00091 
00093     Teuchos::DataAccess initializeStructure(const CrsGraph<Ordinal,Node> &graph, Teuchos::DataAccess cv);
00094 
00096     Teuchos::DataAccess initializeValues(const CrsMatrix<Scalar,Node> &matrix, Teuchos::DataAccess cv);
00097 
00099     void clear();
00100 
00102 
00104 
00106 
00108     template <class DomainScalar, class RangeScalar>
00109     void multiply(Teuchos::ETransp trans, RangeScalar alpha, const MultiVector<DomainScalar,Node> &X, MultiVector<RangeScalar,Node> &Y) const;
00110 
00112     template <class DomainScalar, class RangeScalar>
00113     void multiply(Teuchos::ETransp trans, 
00114                   RangeScalar alpha, const MultiVector<DomainScalar,Node> &X, RangeScalar beta, MultiVector<RangeScalar,Node> &Y) const;
00115 
00117 
00118   protected:
00120     DefaultSparseMultiply(const DefaultSparseMultiply& source);
00121 
00122     Teuchos::RCP<Node> node_;
00123 
00124     // we do this one of two ways: 
00125     // 1D/packed: array of offsets, pointer for ordinals, pointer for values. obviously the smallest footprint.
00126     Teuchos::ArrayRCP<const Ordinal> pbuf_inds1D_;
00127     Teuchos::ArrayRCP<const size_t>  pbuf_offsets1D_;
00128     Teuchos::ArrayRCP<const Scalar>  pbuf_vals1D_;
00129     // 2D: array of pointers
00130     Teuchos::ArrayRCP<const Ordinal *> pbuf_inds2D_;
00131     Teuchos::ArrayRCP<const Scalar  *> pbuf_vals2D_;
00132     Teuchos::ArrayRCP<size_t>          pbuf_numEntries_;
00133 
00134     size_t numRows_;
00135     bool indsInit_, valsInit_, isPacked_, isEmpty_;
00136   };
00137 
00138   template<class Scalar, class Ordinal, class Node>
00139   DefaultSparseMultiply<Scalar,Ordinal,Node>::DefaultSparseMultiply(const Teuchos::RCP<Node> &node)
00140   : node_(node)
00141   , indsInit_(false)
00142   , valsInit_(false)
00143   , isPacked_(false)
00144   , isEmpty_(false) {
00145   }
00146 
00147   template<class Scalar, class Ordinal, class Node>
00148   DefaultSparseMultiply<Scalar,Ordinal,Node>::~DefaultSparseMultiply() {
00149   }
00150 
00151   template<class Scalar, class Ordinal, class Node>
00152   template <class GRAPH>
00153   Teuchos::DataAccess DefaultSparseMultiply<Scalar,Ordinal,Node>::initializeStructure(const GRAPH &graph, Teuchos::DataAccess cv) {
00154     // not implemented for general sparse graphs
00155     TEST_FOR_EXCEPTION(true, std::exception, 
00156         Teuchos::typeName(*this) << "::initializeStructure(): method is not implemented for graph of type " << Teuchos::typeName(graph));
00157   }
00158 
00159   template<class Scalar, class Ordinal, class Node>
00160   template <class MATRIX>
00161   Teuchos::DataAccess DefaultSparseMultiply<Scalar,Ordinal,Node>::initializeValues(const MATRIX &matrix, Teuchos::DataAccess cv) {
00162     // not implemented for general sparse matrices
00163     TEST_FOR_EXCEPTION(true, std::exception, 
00164         Teuchos::typeName(*this) << "::initializeValues(): method is not implemented for matrix of type " << Teuchos::typeName(matrix));
00165   }
00166 
00167 
00168   template <class Scalar, class Ordinal, class Node>
00169   Teuchos::DataAccess DefaultSparseMultiply<Scalar,Ordinal,Node>::initializeStructure(const CrsGraph<Ordinal,Node> &graph, Teuchos::DataAccess cv) {
00170     using Teuchos::ArrayRCP;
00171     TEST_FOR_EXCEPTION(cv != Teuchos::View, std::runtime_error,
00172         Teuchos::typeName(*this) << "::initializeStructure(): requires View access.");
00173     TEST_FOR_EXCEPTION(indsInit_ == true, std::runtime_error, 
00174         Teuchos::typeName(*this) << "::initializeStructure(): structure already initialized.");
00175     numRows_ = graph.getNumRows();
00176     if (graph.isEmpty() || numRows_ == 0) {
00177       isEmpty_ = true;
00178     }
00179     else if (graph.isPacked()) {
00180       isEmpty_ = false;
00181       isPacked_ = true;
00182       pbuf_inds1D_    = graph.getPackedIndices();
00183       pbuf_offsets1D_ = graph.getPackedOffsets();
00184     }
00185     else {
00186       isEmpty_ = false;
00187       isPacked_ = false;
00188       pbuf_inds2D_     = node_->template allocBuffer<const Ordinal *>(numRows_);
00189       pbuf_numEntries_ = node_->template allocBuffer<size_t>(numRows_);
00190       ArrayRCP<const Ordinal *> inds2Dview = node_->template viewBufferNonConst<const Ordinal *>(WriteOnly, numRows_, pbuf_inds2D_);
00191       ArrayRCP<         size_t> numEntview = node_->template viewBufferNonConst<         size_t>(WriteOnly, numRows_, pbuf_numEntries_);
00192       for (size_t r=0; r < numRows_; ++r) {
00193         ArrayRCP<const Ordinal> rowinds = graph.get2DIndices(r);
00194         if (rowinds != Teuchos::null) {
00195           inds2Dview[r] = rowinds.getRawPtr();
00196           numEntview[r] = rowinds.size();
00197         }
00198         else {
00199           inds2Dview[r] = NULL;
00200           numEntview[r] = 0;
00201         }
00202       }
00203     }
00204     indsInit_ = true;
00205     return Teuchos::View;
00206   }
00207 
00208 
00209   template <class Scalar, class Ordinal, class Node>
00210   Teuchos::DataAccess DefaultSparseMultiply<Scalar,Ordinal,Node>::initializeValues(const CrsMatrix<Scalar,Node> &matrix, Teuchos::DataAccess cv) {
00211     using Teuchos::ArrayRCP;
00212     TEST_FOR_EXCEPTION(cv != Teuchos::View, std::runtime_error,
00213         Teuchos::typeName(*this) << "::initializeValues(): requires View access.");
00214     TEST_FOR_EXCEPTION(valsInit_ == true, std::runtime_error, 
00215         Teuchos::typeName(*this) << "::initializeValues(): values already initialized.");
00216     TEST_FOR_EXCEPTION(numRows_ != matrix.getNumRows() || (!isEmpty_ && isPacked_ != matrix.isPacked()), std::runtime_error,
00217         Teuchos::typeName(*this) << "::initializeValues(): matrix not compatible with previously supplied graph.");
00218     if (isEmpty_ || matrix.isEmpty() || numRows_ == 0) {
00219       isEmpty_ = true;
00220     }
00221     else if (matrix.isPacked()) {
00222       isEmpty_ = false;
00223       pbuf_vals1D_ = matrix.getPackedValues();
00224     }
00225     else {
00226       isEmpty_ = false;
00227       pbuf_vals2D_ = node_->template allocBuffer<const Scalar *>(numRows_);
00228       ArrayRCP<const Scalar *> vals2Dview = node_->template viewBufferNonConst<const Scalar *>(WriteOnly, numRows_, pbuf_vals2D_);
00229       for (size_t r=0; r < numRows_; ++r) {
00230         ArrayRCP<const Scalar> rowvals = matrix.get2DValues(r);
00231         if (rowvals != Teuchos::null) {
00232           vals2Dview[r] = rowvals.getRawPtr();
00233         }
00234         else {
00235           vals2Dview[r] = NULL;
00236         }
00237       }
00238     }
00239     valsInit_ = true;
00240     return Teuchos::View;
00241   }
00242 
00243 
00244   template <class Scalar, class Ordinal, class Node>
00245   Teuchos::RCP<Node> DefaultSparseMultiply<Scalar,Ordinal,Node>::getNode() const { 
00246     return node_; 
00247   }
00248 
00249 
00250   template <class Scalar, class Ordinal, class Node>
00251   void DefaultSparseMultiply<Scalar,Ordinal,Node>::clear() {
00252     pbuf_inds1D_     = Teuchos::null;
00253     pbuf_offsets1D_  = Teuchos::null;
00254     pbuf_vals1D_     = Teuchos::null;
00255     pbuf_inds2D_     = Teuchos::null;
00256     pbuf_vals2D_     = Teuchos::null;
00257     pbuf_numEntries_ = Teuchos::null;
00258     indsInit_ = false;
00259     valsInit_ = false;
00260     isPacked_ = false;
00261     isEmpty_  = false;
00262   }
00263 
00264 
00265   template <class Scalar, class Ordinal, class Node>
00266   template <class DomainScalar, class RangeScalar>
00267   void DefaultSparseMultiply<Scalar,Ordinal,Node>::multiply(
00268                                 Teuchos::ETransp trans, 
00269                                 RangeScalar alpha,
00270                                 const MultiVector<DomainScalar,Node> &X, 
00271                                 MultiVector<RangeScalar,Node> &Y) const {
00272     // the 1 parameter to the template means that beta is ignored and the output multivector enjoys overwrite semantics
00273     typedef DefaultSparseMultiplyOp1<Scalar,Ordinal,DomainScalar,RangeScalar, 1>  Op1D;
00274     typedef DefaultSparseMultiplyOp2<Scalar,Ordinal,DomainScalar,RangeScalar, 1>  Op2D;
00275     typedef DefaultSparseTransposeMultiplyOp1<Scalar,Ordinal,DomainScalar,RangeScalar, 1> TOp1D;
00276     typedef DefaultSparseTransposeMultiplyOp2<Scalar,Ordinal,DomainScalar,RangeScalar, 1> TOp2D;
00277     TEST_FOR_EXCEPTION(indsInit_ == false || valsInit_ == false, std::runtime_error,
00278         Teuchos::typeName(*this) << "::multiply(): operation not fully initialized.");
00279     TEST_FOR_EXCEPT(X.getNumCols() != Y.getNumCols());
00280     ReadyBufferHelper<Node> rbh(node_);
00281     if (isEmpty_ == true) {
00282       // Y <= 0 * X
00283       //   <= 0
00284       DefaultArithmetic<MultiVector<RangeScalar,Node> >::Init(Y,Teuchos::ScalarTraits<RangeScalar>::zero());
00285     }
00286     else if (isPacked_ == true) {
00287       if (trans == Teuchos::NO_TRANS) {
00288         Op1D wdp;
00289         rbh.begin();
00290         wdp.alpha   = alpha;
00291         wdp.beta    = Teuchos::ScalarTraits<RangeScalar>::zero(); // not used
00292         wdp.numRows = numRows_;
00293         wdp.offsets = rbh.template addConstBuffer<size_t>(pbuf_offsets1D_);
00294         wdp.inds    = rbh.template addConstBuffer<Ordinal>(pbuf_inds1D_);
00295         wdp.vals    = rbh.template addConstBuffer<Scalar>(pbuf_vals1D_);
00296         wdp.x       = rbh.template addConstBuffer<DomainScalar>(X.getValues());
00297         wdp.y       = rbh.template addNonConstBuffer<RangeScalar>(Y.getValuesNonConst());
00298         wdp.xstride = X.getStride();
00299         wdp.ystride = Y.getStride();
00300         rbh.end();
00301         const size_t numRHS = X.getNumCols();
00302         node_->template parallel_for<Op1D>(0,numRows_*numRHS,wdp);
00303       }
00304       else {
00305         TOp1D wdp;
00306         rbh.begin();
00307         wdp.alpha   = alpha;
00308         wdp.beta    = Teuchos::ScalarTraits<RangeScalar>::zero(); // not used
00309         wdp.numRows = numRows_;
00310         wdp.numCols = Y.getNumRows();
00311         wdp.offsets = rbh.template addConstBuffer<size_t>(pbuf_offsets1D_);
00312         wdp.inds    = rbh.template addConstBuffer<Ordinal>(pbuf_inds1D_);
00313         wdp.vals    = rbh.template addConstBuffer<Scalar>(pbuf_vals1D_);
00314         wdp.x       = rbh.template addConstBuffer<DomainScalar>(X.getValues());
00315         wdp.y       = rbh.template addNonConstBuffer<RangeScalar>(Y.getValuesNonConst());
00316         wdp.xstride = X.getStride();
00317         wdp.ystride = Y.getStride();
00318         rbh.end();
00319         const size_t numRHS = X.getNumCols();
00320         node_->template parallel_for<TOp1D>(0,numRHS,wdp);
00321       }
00322     }
00323     else {
00324       if (trans == Teuchos::NO_TRANS) {
00325         Op2D wdp;
00326         rbh.begin();
00327         wdp.alpha   = alpha;
00328         wdp.beta    = Teuchos::ScalarTraits<RangeScalar>::zero(); // not used
00329         wdp.numRows = numRows_;
00330         wdp.numEntries = rbh.template addConstBuffer<size_t>(pbuf_numEntries_);
00331         wdp.inds_beg   = rbh.template addConstBuffer<const Ordinal *>(pbuf_inds2D_);
00332         wdp.vals_beg   = rbh.template addConstBuffer<const Scalar *>(pbuf_vals2D_);
00333         wdp.x          = rbh.template addConstBuffer<DomainScalar>(X.getValues());
00334         wdp.y          = rbh.template addNonConstBuffer<RangeScalar>(Y.getValuesNonConst());
00335         rbh.end();
00336         wdp.xstride = X.getStride();
00337         wdp.ystride = Y.getStride();
00338         const size_t numRHS = X.getNumCols();
00339         node_->template parallel_for<Op2D>(0,numRows_*numRHS,wdp);
00340       }
00341       else {
00342         TOp2D wdp;
00343         rbh.begin();
00344         wdp.alpha   = alpha;
00345         wdp.beta    = Teuchos::ScalarTraits<RangeScalar>::zero(); // not used
00346         wdp.numRows = numRows_;
00347         wdp.numCols = Y.getNumRows();
00348         wdp.numEntries = rbh.template addConstBuffer<size_t>(pbuf_numEntries_);
00349         wdp.inds_beg   = rbh.template addConstBuffer<const Ordinal *>(pbuf_inds2D_);
00350         wdp.vals_beg   = rbh.template addConstBuffer<const Scalar *>(pbuf_vals2D_);
00351         wdp.x          = rbh.template addConstBuffer<DomainScalar>(X.getValues());
00352         wdp.y          = rbh.template addNonConstBuffer<RangeScalar>(Y.getValuesNonConst());
00353         wdp.xstride = X.getStride();
00354         wdp.ystride = Y.getStride();
00355         rbh.end();
00356         const size_t numRHS = X.getNumCols();
00357         node_->template parallel_for<TOp2D>(0,numRHS,wdp);
00358       }
00359     }
00360     return;
00361   }
00362 
00363 
00364   template <class Scalar, class Ordinal, class Node>
00365   template <class DomainScalar, class RangeScalar>
00366   void DefaultSparseMultiply<Scalar,Ordinal,Node>::multiply(
00367                                 Teuchos::ETransp trans, 
00368                                 RangeScalar alpha, const MultiVector<DomainScalar,Node> &X, 
00369                                 RangeScalar beta, MultiVector<RangeScalar,Node> &Y) const {
00370     // the 0 parameter means that beta is considered, and the output multivector enjoys accumulate semantics
00371     typedef DefaultSparseMultiplyOp1<Scalar,Ordinal,DomainScalar,RangeScalar, 0>  Op1D;
00372     typedef DefaultSparseMultiplyOp2<Scalar,Ordinal,DomainScalar,RangeScalar, 0>  Op2D;
00373     typedef DefaultSparseTransposeMultiplyOp1<Scalar,Ordinal,DomainScalar,RangeScalar, 0> TOp1D;
00374     typedef DefaultSparseTransposeMultiplyOp2<Scalar,Ordinal,DomainScalar,RangeScalar, 0> TOp2D;
00375     TEST_FOR_EXCEPTION(indsInit_ == false || valsInit_ == false, std::runtime_error,
00376         Teuchos::typeName(*this) << "::multiply(): operation not fully initialized.");
00377     TEST_FOR_EXCEPT(X.getNumCols() != Y.getNumCols());
00378     ReadyBufferHelper<Node> rbh(node_);
00379     if (isEmpty_ == true) {
00380       // Y <= alpha * 0 * X + beta * Y 
00381       //   <= beta * Y
00382       // NOTE: this neglects NaNs in X, which don't satisfy 0*NaN == 0
00383       //       however, X and Y may be of different size, and we need entries to determine how to mix those potential NaNs in X into Y
00384       //       therefore, the best we can do is scale Y to zero. Setting Y to zero would destroy NaNs in Y, which violates the semantics of the call.
00385       DefaultArithmetic<MultiVector<RangeScalar,Node> >::Scale(Y,beta);
00386     }
00387     else if (isPacked_ == true) {
00388       if (trans == Teuchos::NO_TRANS) {
00389         Op1D wdp;
00390         rbh.begin();
00391         wdp.alpha   = alpha;
00392         wdp.beta    = beta;
00393         wdp.numRows = numRows_;
00394         wdp.offsets = rbh.template addConstBuffer<size_t>(pbuf_offsets1D_);
00395         wdp.inds    = rbh.template addConstBuffer<Ordinal>(pbuf_inds1D_);
00396         wdp.vals    = rbh.template addConstBuffer<Scalar>(pbuf_vals1D_);
00397         wdp.x       = rbh.template addConstBuffer<DomainScalar>(X.getValues());
00398         wdp.y       = rbh.template addNonConstBuffer<RangeScalar>(Y.getValuesNonConst());
00399         wdp.xstride = X.getStride();
00400         wdp.ystride = Y.getStride();
00401         rbh.end();
00402         const size_t numRHS = X.getNumCols();
00403         node_->template parallel_for<Op1D>(0,numRows_*numRHS,wdp);
00404       }
00405       else {
00406         TOp1D wdp;
00407         rbh.begin();
00408         wdp.alpha   = alpha;
00409         wdp.beta    = beta;
00410         wdp.numRows = numRows_;
00411         wdp.numCols = Y.getNumRows();
00412         wdp.offsets = rbh.template addConstBuffer<size_t>(pbuf_offsets1D_);
00413         wdp.inds    = rbh.template addConstBuffer<Ordinal>(pbuf_inds1D_);
00414         wdp.vals    = rbh.template addConstBuffer<Scalar>(pbuf_vals1D_);
00415         wdp.x       = rbh.template addConstBuffer<DomainScalar>(X.getValues());
00416         wdp.y       = rbh.template addNonConstBuffer<RangeScalar>(Y.getValuesNonConst());
00417         wdp.xstride = X.getStride();
00418         wdp.ystride = Y.getStride();
00419         rbh.end();
00420         const size_t numRHS = X.getNumCols();
00421         node_->template parallel_for<TOp1D>(0,numRHS,wdp);
00422       }
00423     }
00424     else {
00425       if (trans == Teuchos::NO_TRANS) {
00426         Op2D wdp;
00427         rbh.begin();
00428         wdp.numRows = numRows_;
00429         wdp.numEntries = rbh.template addConstBuffer<size_t>(pbuf_numEntries_);
00430         wdp.inds_beg   = rbh.template addConstBuffer<const Ordinal *>(pbuf_inds2D_);
00431         wdp.vals_beg   = rbh.template addConstBuffer<const Scalar *>(pbuf_vals2D_);
00432         wdp.x          = rbh.template addConstBuffer<DomainScalar>(X.getValues());
00433         wdp.y          = rbh.template addNonConstBuffer<RangeScalar>(Y.getValuesNonConst());
00434         rbh.end();
00435         wdp.alpha   = alpha;
00436         wdp.beta    = beta;
00437         wdp.xstride = X.getStride();
00438         wdp.ystride = Y.getStride();
00439         const size_t numRHS = X.getNumCols();
00440         node_->template parallel_for<Op2D>(0,numRows_*numRHS,wdp);
00441       }
00442       else {
00443         TOp2D wdp;
00444         rbh.begin();
00445         wdp.alpha   = alpha;
00446         wdp.beta    = beta;
00447         wdp.numRows = numRows_;
00448         wdp.numCols = Y.getNumRows();
00449         wdp.numEntries = rbh.template addConstBuffer<size_t>(pbuf_numEntries_);
00450         wdp.inds_beg   = rbh.template addConstBuffer<const Ordinal *>(pbuf_inds2D_);
00451         wdp.vals_beg   = rbh.template addConstBuffer<const Scalar *>(pbuf_vals2D_);
00452         wdp.x          = rbh.template addConstBuffer<DomainScalar>(X.getValues());
00453         wdp.y          = rbh.template addNonConstBuffer<RangeScalar>(Y.getValuesNonConst());
00454         wdp.xstride = X.getStride();
00455         wdp.ystride = Y.getStride();
00456         rbh.end();
00457         const size_t numRHS = X.getNumCols();
00458         node_->template parallel_for<TOp2D>(0,numRHS,wdp);
00459       }
00460     }
00461     return;
00462   }
00463 
00464 } // namespace Kokkos
00465 
00466 #endif /* KOKKOS_DEFAULTSPARSEMULTIPLY_HPP */