Kokkos Node API and Local Linear Algebra Kernels Version of the Day
Kokkos_CUSPARSEOps.hpp
00001 //@HEADER
00002 // ************************************************************************
00003 //
00004 //          Kokkos: Node API and Parallel Node Kernels
00005 //              Copyright (2008) Sandia Corporation
00006 //
00007 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
00008 // the U.S. Government retains certain rights in this software.
00009 //
00010 // Redistribution and use in source and binary forms, with or without
00011 // modification, are permitted provided that the following conditions are
00012 // met:
00013 //
00014 // 1. Redistributions of source code must retain the above copyright
00015 // notice, this list of conditions and the following disclaimer.
00016 //
00017 // 2. Redistributions in binary form must reproduce the above copyright
00018 // notice, this list of conditions and the following disclaimer in the
00019 // documentation and/or other materials provided with the distribution.
00020 //
00021 // 3. Neither the name of the Corporation nor the names of the
00022 // contributors may be used to endorse or promote products derived from
00023 // this software without specific prior written permission.
00024 //
00025 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
00026 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00027 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00028 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
00029 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00030 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00031 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00032 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00033 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00034 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00035 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00036 //
00037 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
00038 //
00039 // ************************************************************************
00040 //@HEADER
00041 
00042 #ifndef KOKKOS_CUSPARSEOPS_HPP
00043 #define KOKKOS_CUSPARSEOPS_HPP
00044 
00045 #include <Teuchos_DataAccess.hpp>
00046 #include <Teuchos_CompileTimeAssert.hpp>
00047 #include <Teuchos_TypeTraits.hpp>
00048 #include <Teuchos_BLAS_types.hpp>
00049 
00050 #include <Kokkos_ConfigDefs.hpp>
00051 #include <Kokkos_CUDANodeUtils.hpp>
00052 
00053 #include "Kokkos_CrsMatrixBase.hpp"
00054 #include "Kokkos_CrsGraphBase.hpp"
00055 #include "Kokkos_MultiVector.hpp"
00056 #include "Kokkos_NodeHelpers.hpp"
00057 
00058 #include <cusparse_v2.h>
00059 
00060 namespace Teuchos {
00061   template<>
00062   class TypeNameTraits<cusparseHandle_t> {
00063   public:
00064     static std::string name() { return std::string("cusparseHandle_t"); }
00065     static std::string concreteName( const cusparseHandle_t& ) { return std::string("cusparseHandle_t"); }
00066   };
00067   template<>
00068   class TypeNameTraits<cusparseMatDescr_t> {
00069   public:
00070     static std::string name() { return std::string("cusparseMatDescr_t"); }
00071     static std::string concreteName( const cusparseMatDescr_t& ) { return std::string("cusparseMatDescr_t"); }
00072   };
00073   template<>
00074   class TypeNameTraits<cusparseSolveAnalysisInfo_t> {
00075   public:
00076     static std::string name() { return std::string("cusparseSolveAnalysisInfo_t"); }
00077     static std::string concreteName( const cusparseSolveAnalysisInfo_t& ) { return std::string("cusparseSolveAnalysisInfo_t"); }
00078   };
00079 }
00080 
00081 namespace Kokkos {
00082 
00083   namespace CUSPARSEdetails {
00084 
00085     class Session {
00086       public:
00088         static void init();
00089 
00091         static RCP<const cusparseHandle_t> getHandle();
00092 
00093       private:
00094         static RCP<cusparseHandle_t> session_handle_;
00095     };
00096 
00097     RCP<cusparseMatDescr_t>           createMatDescr();
00098     RCP<cusparseSolveAnalysisInfo_t>  createSolveAnalysisInfo();
00099 
00100     class CUSPARSESessionDestroyer {
00101     public:
00102       CUSPARSESessionDestroyer();
00103       void free(cusparseHandle_t *ptr);
00104     };
00105 
00106     class CUSPARSEMatDescDestroyer {
00107     public:
00108       CUSPARSEMatDescDestroyer();
00109       void free(cusparseMatDescr_t *ptr);
00110     };
00111 
00112     class CUSPARSESolveAnalysisDestroyer {
00113     public:
00114       CUSPARSESolveAnalysisDestroyer();
00115       void free(cusparseSolveAnalysisInfo_t *ptr);
00116     };
00117 
00118 
00119     template <typename T>
00120     struct CUSPARSEUnsupportedScalar
00121     {
00122       public:
00123       static inline T notSupported() { return T::this_type_is_not_supported_by_CUSPARSE(); }
00124     };
00125 
00126 
00127     template <class T>
00128     class CUSPARSETemplateAdaptors {
00129       public:
00130       //
00131       static inline cusparseStatus_t
00132       CSRMM(cusparseHandle_t handle, cusparseOperation_t transA,
00133             int m, int n, int k, int nnz, const T *alpha,
00134             const cusparseMatDescr_t descrA, const T *csrValA,
00135             const int *csrRowPtrA, const int *csrColIndA,
00136             const T *B, int ldb,
00137             const T *beta, T *C, int ldc)
00138       { return CUSPARSEUnsupportedScalar<T>::notSupported(); }
00139       //
00140       static inline cusparseStatus_t
00141       CSRSM_analysis(cusparseHandle_t handle, cusparseOperation_t transA,
00142                      int m, int nnz, const cusparseMatDescr_t descrA,
00143                      const T *csrValA, const int *csrRowPtrA,
00144                      const int *csrColIndA, cusparseSolveAnalysisInfo_t info)
00145       { return CUSPARSEUnsupportedScalar<T>::notSupported(); }
00146       //
00147       static inline cusparseStatus_t
00148       CSRSM_solve(cusparseHandle_t handle, cusparseOperation_t transA,
00149                   int m, int n, const T *alpha,
00150                   const cusparseMatDescr_t descrA,
00151                   const T *csrValA, const int *csrRowPtrA,
00152                   const int *csrColIndA, cusparseSolveAnalysisInfo_t info,
00153                   const T *X, int ldx,
00154                   T *Y, int ldy)
00155       { return CUSPARSEUnsupportedScalar<T>::notSupported(); }
00156     };
00157 
00158     // float support
00159 #ifdef HAVE_KOKKOS_CUDA_FLOAT
00160     template <>
00161     class CUSPARSETemplateAdaptors<float> {
00162       public:
00163       //
00164       static inline cusparseStatus_t
00165       CSRMM(cusparseHandle_t handle, cusparseOperation_t transA,
00166             int m, int n, int k, int nnz, const float *alpha,
00167             const cusparseMatDescr_t descrA, const float *csrValA,
00168             const int *csrRowPtrA, const int *csrColIndA,
00169             const float *B, int ldb,
00170             const float *beta, float *C, int ldc)
00171       { return cusparseScsrmm(handle,transA,m,n,k,nnz,
00172                               alpha,descrA,csrValA,csrRowPtrA,csrColIndA,
00173                               B,ldb,beta,C,ldc); }
00174       //
00175       static inline cusparseStatus_t
00176       CSRSM_analysis(cusparseHandle_t handle, cusparseOperation_t transA,
00177                      int m, int nnz, const cusparseMatDescr_t descrA,
00178                      const float *csrValA, const int *csrRowPtrA,
00179                      const int *csrColIndA, cusparseSolveAnalysisInfo_t info)
00180       { return cusparseScsrsm_analysis(handle,transA,m,nnz,descrA,
00181                                        csrValA,csrRowPtrA,csrColIndA,info); }
00182       //
00183       static inline cusparseStatus_t
00184       CSRSM_solve(cusparseHandle_t handle, cusparseOperation_t transA,
00185                   int m, int n, const float *alpha,
00186                   const cusparseMatDescr_t descrA,
00187                   const float *csrValA, const int *csrRowPtrA,
00188                   const int *csrColIndA, cusparseSolveAnalysisInfo_t info,
00189                   const float *X, int ldx,
00190                   float *Y, int ldy)
00191       { return cusparseScsrsm_solve(handle,transA,m,n,
00192                                     alpha,descrA,csrValA,csrRowPtrA,csrColIndA,
00193                                     info,X,ldx,Y,ldy); }
00194     };
00195 #endif
00196 
00197 #ifdef HAVE_KOKKOS_CUDA_DOUBLE
00198     // double support
00199     template <>
00200     class CUSPARSETemplateAdaptors<double> {
00201       public:
00202       //
00203       static inline cusparseStatus_t
00204       CSRMM(cusparseHandle_t handle, cusparseOperation_t transA,
00205             int m, int n, int k, int nnz, const double *alpha,
00206             const cusparseMatDescr_t descrA, const double *csrValA,
00207             const int *csrRowPtrA, const int *csrColIndA,
00208             const double *B, int ldb,
00209             const double *beta, double *C, int ldc)
00210       { return cusparseDcsrmm(handle,transA,m,n,k,nnz,
00211                               alpha,descrA,csrValA,csrRowPtrA,csrColIndA,
00212                               B,ldb,beta,C,ldc); }
00213       //
00214       static inline cusparseStatus_t
00215       CSRSM_analysis(cusparseHandle_t handle, cusparseOperation_t transA,
00216                      int m, int nnz, const cusparseMatDescr_t descrA,
00217                      const double *csrValA, const int *csrRowPtrA,
00218                      const int *csrColIndA, cusparseSolveAnalysisInfo_t info)
00219       { return cusparseDcsrsm_analysis(handle,transA,m,nnz,descrA,
00220                                        csrValA,csrRowPtrA,csrColIndA,info); }
00221       //
00222       static inline cusparseStatus_t
00223       CSRSM_solve(cusparseHandle_t handle, cusparseOperation_t transA,
00224                   int m, int n, const double *alpha,
00225                   const cusparseMatDescr_t descrA,
00226                   const double *csrValA, const int *csrRowPtrA,
00227                   const int *csrColIndA, cusparseSolveAnalysisInfo_t info,
00228                   const double *X, int ldx,
00229                   double *Y, int ldy)
00230       { return cusparseDcsrsm_solve(handle,transA,m,n,
00231                                     alpha,descrA,csrValA,csrRowPtrA,csrColIndA,
00232                                     info,X,ldx,Y,ldy); }
00233     };
00234 #endif
00235 
00236 #ifdef HAVE_KOKKOS_CUDA_COMPLEX_FLOAT
00237     // complex<float> support
00238     template <>
00239     class CUSPARSETemplateAdaptors<std::complex<float> > {
00240       public:
00241       //
00242       static inline cusparseStatus_t
00243       CSRMM(cusparseHandle_t handle, cusparseOperation_t transA,
00244             int m, int n, int k, int nnz, const cuComplex *alpha,
00245             const cusparseMatDescr_t descrA, const cuComplex *csrValA,
00246             const int *csrRowPtrA, const int *csrColIndA,
00247             const cuComplex *B, int ldb,
00248             const cuComplex *beta, cuComplex *C, int ldc)
00249       { return cusparseCcsrmm(handle,transA,m,n,k,nnz,
00250                               alpha,descrA,csrValA,csrRowPtrA,csrColIndA,
00251                               B,ldb,beta,C,ldc); }
00252       //
00253       static inline cusparseStatus_t
00254       CSRSM_analysis(cusparseHandle_t handle, cusparseOperation_t transA,
00255                      int m, int nnz, const cusparseMatDescr_t descrA,
00256                      const cuComplex *csrValA, const int *csrRowPtrA,
00257                      const int *csrColIndA, cusparseSolveAnalysisInfo_t info)
00258       { return cusparseCcsrsm_analysis(handle,transA,m,nnz,descrA,
00259                                        csrValA,csrRowPtrA,csrColIndA,info); }
00260       //
00261       static inline cusparseStatus_t
00262       CSRSM_solve(cusparseHandle_t handle, cusparseOperation_t transA,
00263                   int m, int n, const cuComplex *alpha,
00264                   const cusparseMatDescr_t descrA,
00265                   const cuComplex *csrValA, const int *csrRowPtrA,
00266                   const int *csrColIndA, cusparseSolveAnalysisInfo_t info,
00267                   const cuComplex *X, int ldx,
00268                   cuComplex *Y, int ldy)
00269       { return cusparseCcsrsm_solve(handle,transA,m,n,
00270                                     alpha,descrA,csrValA,csrRowPtrA,csrColIndA,
00271                                     info,X,ldx,Y,ldy); }
00272     };
00273 #endif
00274 
00275 #ifdef HAVE_KOKKOS_CUDA_COMPLEX_DOUBLE
00276     // complex<double> support
00277     template <>
00278     class CUSPARSETemplateAdaptors<std::complex<double> > {
00279       public:
00280       //
00281       static inline cusparseStatus_t
00282       CSRMM(cusparseHandle_t handle, cusparseOperation_t transA,
00283             int m, int n, int k, int nnz, const cuDoubleComplex *alpha,
00284             const cusparseMatDescr_t descrA, const cuDoubleComplex *csrValA,
00285             const int *csrRowPtrA, const int *csrColIndA,
00286             const cuDoubleComplex *B, int ldb,
00287             const cuDoubleComplex *beta, cuDoubleComplex *C, int ldc)
00288       { return cusparseZcsrmm(handle,transA,m,n,k,nnz,
00289                               alpha,descrA,csrValA,csrRowPtrA,csrColIndA,
00290                               B,ldb,beta,C,ldc); }
00291       //
00292       static inline cusparseStatus_t
00293       CSRSM_analysis(cusparseHandle_t handle, cusparseOperation_t transA,
00294                      int m, int nnz, const cusparseMatDescr_t descrA,
00295                      const cuDoubleComplex *csrValA, const int *csrRowPtrA,
00296                      const int *csrColIndA, cusparseSolveAnalysisInfo_t info)
00297       { return cusparseZcsrsm_analysis(handle,transA,m,nnz,descrA,
00298                                        csrValA,csrRowPtrA,csrColIndA,info); }
00299       //
00300       static inline cusparseStatus_t
00301       CSRSM_solve(cusparseHandle_t handle, cusparseOperation_t transA,
00302                   int m, int n, const cuDoubleComplex *alpha,
00303                   const cusparseMatDescr_t descrA,
00304                   const cuDoubleComplex *csrValA, const int *csrRowPtrA,
00305                   const int *csrColIndA, cusparseSolveAnalysisInfo_t info,
00306                   const cuDoubleComplex *X, int ldx,
00307                   cuDoubleComplex *Y, int ldy)
00308       { return cusparseZcsrsm_solve(handle,transA,m,n,
00309                                     alpha,descrA,csrValA,csrRowPtrA,csrColIndA,
00310                                     info,X,ldx,Y,ldy); }
00311     };
00312 #endif
00313 
00314   } // end of namespace CUSPARSEdetails
00315 
00317 
00319   template <class Node>
00320   class CUSPARSECrsGraph : public CrsGraphBase<int,Node>
00321   {
00322     public:
00323       CUSPARSECrsGraph(int numRows, int numCols, const RCP<Node> &node, const RCP<ParameterList> &params);
00324       bool isEmpty() const;
00325       void setStructure(const ArrayRCP<const size_t>  &ptrs,
00326                         const ArrayRCP<const int> &inds);
00327       void setDeviceData(const ArrayRCP<const int> &devptrs, const ArrayRCP<const int> &devinds);
00328       inline ArrayRCP<const size_t> getPointers() const;
00329       inline ArrayRCP<const int> getIndices() const;
00330       inline ArrayRCP<const int> getDevPointers() const;
00331       inline ArrayRCP<const int> getDevIndices() const;
00332       inline bool isInitialized() const;
00333       void setMatDesc(Teuchos::EUplo uplo, Teuchos::EDiag diag);
00334       void getMatDesc(Teuchos::EUplo &uplo, Teuchos::EDiag &diag) const;
00335       RCP<cusparseMatDescr_t> getMatDesc() const;
00336     private:
00337       bool isInitialized_;
00338       bool isEmpty_;
00339       // cusparse matrix description handle
00340       RCP<cusparseMatDescr_t> matdescr_;
00341       Teuchos::EUplo uplo_;
00342       Teuchos::EDiag diag_;
00343       // graph data
00344       ArrayRCP<const size_t> host_rowptrs_;
00345       ArrayRCP<const int>    dev_rowptrs_;
00346       ArrayRCP<const int>    host_colinds_, dev_colinds_;
00347       // TODO: add CSC data, for efficient transpose multiply
00348   };
00349 
00351 
00353   template <class Scalar,
00354             class Node>
00355   class CUSPARSECrsMatrix : public CrsMatrixBase<Scalar,int,Node>
00356   {
00357     public:
00358       CUSPARSECrsMatrix(const RCP<const CUSPARSECrsGraph<Node> > &graph, const RCP<ParameterList> &params);
00359       void setValues(const ArrayRCP<const Scalar> &vals);
00360       void setDeviceData(const ArrayRCP<const Scalar> &devvals);
00361       inline ArrayRCP<const Scalar> getValues() const;
00362       inline ArrayRCP<const Scalar> getDevValues() const;
00363       inline bool isInitialized() const;
00364       void setAnalyses(const RCP<cusparseSolveAnalysisInfo_t> &analysisNoTrans,
00365                        const RCP<cusparseSolveAnalysisInfo_t> &analysisTrans,
00366                        const RCP<cusparseSolveAnalysisInfo_t> &analysisConjTrans);
00367       void getAnalyses(RCP<cusparseSolveAnalysisInfo_t> &analysisNoTrans,
00368                        RCP<cusparseSolveAnalysisInfo_t> &analysisTrans,
00369                        RCP<cusparseSolveAnalysisInfo_t> &analysisConjTrans) const;
00370     private:
00371       bool isInitialized_;
00372       // cusparse analysis handles
00373       RCP<cusparseSolveAnalysisInfo_t> analysisNoTrans_, analysisConjTrans_, analysisTrans_;
00374       // matrix data
00375       ArrayRCP<const Scalar> vals_, dev_vals_;
00376       // TODO: add CSC data, for efficient transpose multiply
00377   };
00378 
00379   template <class Node>
00380   CUSPARSECrsGraph<Node>::CUSPARSECrsGraph(int numRows, int numCols, const RCP<Node> &node, const RCP<ParameterList> &params)
00381   : CrsGraphBase<int,Node>(numRows,numCols,node,params)
00382   , isInitialized_(false)
00383   , isEmpty_(false)
00384   {
00385     CUSPARSEdetails::Session::init();
00386     // Make sure that users only specialize for Kokkos Node types that are CUDA Nodes
00387     Teuchos::CompileTimeAssert<Node::isCUDANode == false> cta; (void)cta;
00388   }
00389 
00390   template <class Node>
00391   bool CUSPARSECrsGraph<Node>::isEmpty() const
00392   { return isEmpty_; }
00393 
00394   template <class Node>
00395   void CUSPARSECrsGraph<Node>::setStructure(const ArrayRCP<const size_t> &ptrs,
00396                                             const ArrayRCP<const int> &inds)
00397   {
00398     std::string tfecfFuncName("setStructure(ptrs,inds)");
00399     const int numrows = this->getNumRows();
00400     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
00401         (size_t)ptrs.size() != (size_t)numrows+1
00402         || ptrs[0] != 0
00403         || (size_t)inds.size() != (size_t)ptrs[numrows],
00404         std::runtime_error, " graph data not coherent."
00405     )
00406     const int numEntries = ptrs[numrows];
00407     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
00408         isInitialized_ == true,
00409         std::runtime_error, " matrix has already been initialized"
00410     )
00411     if (numrows == 0 || numEntries == 0) isEmpty_ = true;
00412     host_rowptrs_ = ptrs;
00413     host_colinds_ = inds;
00414     isInitialized_ = true;
00415   }
00416 
00417   template <class Node>
00418   ArrayRCP<const size_t> CUSPARSECrsGraph<Node>::getPointers() const
00419   { return host_rowptrs_; }
00420 
00421   template <class Node>
00422   ArrayRCP<const int> CUSPARSECrsGraph<Node>::getIndices() const
00423   { return host_colinds_; }
00424 
00425   template <class Node>
00426   ArrayRCP<const int> CUSPARSECrsGraph<Node>::getDevPointers() const
00427   { return dev_rowptrs_; }
00428 
00429   template <class Node>
00430   ArrayRCP<const int> CUSPARSECrsGraph<Node>::getDevIndices() const
00431   { return dev_colinds_; }
00432 
00433   template <class Node>
00434   void CUSPARSECrsGraph<Node>::setDeviceData(const ArrayRCP<const int> &devptrs,
00435                                              const ArrayRCP<const int>    &devinds)
00436   { dev_rowptrs_ = devptrs; dev_colinds_ = devinds; }
00437 
00438   template <class Node>
00439   bool CUSPARSECrsGraph<Node>::isInitialized() const
00440   { return isInitialized_; }
00441 
00442   template <class Node>
00443   void CUSPARSECrsGraph<Node>::setMatDesc(Teuchos::EUplo uplo, Teuchos::EDiag diag)
00444   {
00445     std::string tfecfFuncName("setMatDesc()");
00446     //
00447     uplo_ = uplo;
00448     diag_ = diag;
00449     //
00450     matdescr_ = CUSPARSEdetails::createMatDescr();
00451     cusparseStatus_t status = cusparseSetMatIndexBase(*matdescr_, CUSPARSE_INDEX_BASE_ZERO);
00452     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(status != CUSPARSE_STATUS_SUCCESS, std::runtime_error, "error setting matrix descriptor (index base).")
00453     // upper or lower
00454     if (uplo == Teuchos::UPPER_TRI) {
00455       status = cusparseSetMatFillMode(*matdescr_, CUSPARSE_FILL_MODE_UPPER);
00456       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(status != CUSPARSE_STATUS_SUCCESS, std::runtime_error, "error setting matrix descriptor (upper).")
00457       status = cusparseSetMatType(*matdescr_, CUSPARSE_MATRIX_TYPE_TRIANGULAR);
00458       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(status != CUSPARSE_STATUS_SUCCESS, std::runtime_error, "error setting matrix descriptor (triangular).")
00459     }
00460     else if (uplo == Teuchos::LOWER_TRI) {
00461       status = cusparseSetMatFillMode(*matdescr_, CUSPARSE_FILL_MODE_LOWER);
00462       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(status != CUSPARSE_STATUS_SUCCESS, std::runtime_error, "error setting matrix descriptor (lower).")
00463       status = cusparseSetMatType(*matdescr_, CUSPARSE_MATRIX_TYPE_TRIANGULAR);
00464       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(status != CUSPARSE_STATUS_SUCCESS, std::runtime_error, "error setting matrix descriptor (triangular).")
00465     }
00466     else {
00467       status = cusparseSetMatType(*matdescr_, CUSPARSE_MATRIX_TYPE_GENERAL);
00468       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(status != CUSPARSE_STATUS_SUCCESS, std::runtime_error, "error setting matrix descriptor (general).")
00469     }
00470     if (diag == Teuchos::UNIT_DIAG) {
00471       status = cusparseSetMatDiagType(*matdescr_, CUSPARSE_DIAG_TYPE_UNIT);
00472       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(status != CUSPARSE_STATUS_SUCCESS, std::runtime_error, "error setting matrix descriptor (unit).")
00473     }
00474     else {
00475       status = cusparseSetMatDiagType(*matdescr_, CUSPARSE_DIAG_TYPE_NON_UNIT);
00476       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(status != CUSPARSE_STATUS_SUCCESS, std::runtime_error, "error setting matrix descriptor (non-unit).")
00477     }
00478   }
00479 
00480   template <class Node>
00481   void CUSPARSECrsGraph<Node>::getMatDesc(Teuchos::EUplo &uplo, Teuchos::EDiag &diag) const
00482   {
00483     uplo = uplo_;
00484     diag = diag_;
00485   }
00486 
00487   template <class Node>
00488   RCP<cusparseMatDescr_t> CUSPARSECrsGraph<Node>::getMatDesc() const
00489   {
00490     return matdescr_;
00491   }
00492 
00493   template <class Scalar, class Node>
00494   CUSPARSECrsMatrix<Scalar,Node>::CUSPARSECrsMatrix(const RCP<const CUSPARSECrsGraph<Node> > &graph, const RCP<ParameterList> &params)
00495   : CrsMatrixBase<Scalar,int,Node>(graph,params)
00496   , isInitialized_(false)
00497   {
00498     // Make sure that users only specialize for Kokkos Node types that are CUDA Nodes
00499     Teuchos::CompileTimeAssert<Node::isCUDANode == false> cta; (void)cta;
00500   }
00501 
00502   template <class Scalar, class Node>
00503   void CUSPARSECrsMatrix<Scalar,Node>::setValues(const ArrayRCP<const Scalar> &vals)
00504   {
00505     std::string tfecfFuncName("setValues(vals)");
00506     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
00507         isInitialized_ == true,
00508         std::runtime_error, " matrix is already initialized."
00509     )
00510     vals_ = vals;
00511     isInitialized_ = true;
00512   }
00513 
00514   template <class Scalar, class Node>
00515   ArrayRCP<const Scalar> CUSPARSECrsMatrix<Scalar,Node>::getValues() const
00516   { return vals_; }
00517 
00518   template <class Scalar, class Node>
00519   bool CUSPARSECrsMatrix<Scalar,Node>::isInitialized() const
00520   { return isInitialized_; }
00521 
00522   template <class Scalar, class Node>
00523   ArrayRCP<const Scalar> CUSPARSECrsMatrix<Scalar,Node>::getDevValues() const
00524   { return dev_vals_; }
00525 
00526   template <class Scalar, class Node>
00527   void CUSPARSECrsMatrix<Scalar,Node>::setDeviceData(const ArrayRCP<const Scalar> &devvals)
00528   { dev_vals_ = devvals; }
00529 
00530   template <class Scalar, class Node>
00531   void CUSPARSECrsMatrix<Scalar,Node>::setAnalyses(const RCP<cusparseSolveAnalysisInfo_t> &analysisNoTrans,
00532                                                    const RCP<cusparseSolveAnalysisInfo_t> &analysisTrans,
00533                                                    const RCP<cusparseSolveAnalysisInfo_t> &analysisConjTrans)
00534   {
00535     analysisNoTrans_   = analysisNoTrans;
00536     analysisTrans_     = analysisTrans;
00537     analysisConjTrans_ = analysisConjTrans;
00538   }
00539 
00540   template <class Scalar, class Node>
00541   void CUSPARSECrsMatrix<Scalar,Node>::getAnalyses(RCP<cusparseSolveAnalysisInfo_t> &analysisNoTrans,
00542                                                    RCP<cusparseSolveAnalysisInfo_t> &analysisTrans,
00543                                                    RCP<cusparseSolveAnalysisInfo_t> &analysisConjTrans) const
00544   {
00545     analysisNoTrans   = analysisNoTrans_;
00546     analysisTrans     = analysisTrans_;
00547     analysisConjTrans = analysisConjTrans_;
00548   }
00549 
00557   template <class Scalar, class Node>
00558   class CUSPARSEOps {
00559   public:
00561 
00562 
00564     typedef Scalar  scalar_type;
00566     typedef int     ordinal_type;
00568     typedef Node    node_type;
00570     typedef CUSPARSEOps<Scalar,Node> sparse_ops_type;
00571 
00573     template <class O, class N>
00574     struct graph {
00575       typedef typename O::this_ordinal_not_supported_by_cusparse graph_type;
00576     };
00577 
00579     template <class N>
00580     struct graph<int,N> {
00581       typedef CUSPARSECrsGraph<N> graph_type;
00582     };
00583 
00585     template <class S, class O, class N>
00586     struct matrix {
00587       typedef typename O::this_ordinal_not_supported_by_cusparse matrix_type;
00588     };
00589 
00591     template <class S, class N>
00592     struct matrix<S,int,N> {
00593       typedef CUSPARSECrsMatrix<S,N> matrix_type;
00594     };
00595 
00608     template <class S2>
00609     struct bind_scalar {
00610       typedef CUSPARSEOps<S2,Node> other_type;
00611     };
00612 
00614 
00615 
00616 
00618     CUSPARSEOps(const RCP<Node> &node);
00619 
00621     ~CUSPARSEOps();
00622 
00624 
00625 
00626 
00628     RCP<Node> getNode() const;
00629 
00631 
00633 
00634 
00636     static ArrayRCP<size_t> allocRowPtrs(const RCP<Node> &node, const ArrayView<const size_t> &rowPtrs);
00637 
00639     template <class T>
00640     static ArrayRCP<T> allocStorage(const RCP<Node> &node, const ArrayView<const size_t> &ptrs);
00641 
00643     static void finalizeGraph(Teuchos::EUplo uplo, Teuchos::EDiag diag, CUSPARSECrsGraph<Node> &graph, const RCP<ParameterList> &params);
00644 
00646     static void finalizeMatrix(const CUSPARSECrsGraph<Node> &graph, CUSPARSECrsMatrix<Scalar,Node> &matrix, const RCP<ParameterList> &params);
00647 
00649     static void finalizeGraphAndMatrix(Teuchos::EUplo uplo, Teuchos::EDiag diag, CUSPARSECrsGraph<Node> &graph, CUSPARSECrsMatrix<Scalar,Node> &matrix, const RCP<ParameterList> &params);
00650 
00652     void setGraphAndMatrix(const RCP<const CUSPARSECrsGraph<Node> > &graph,
00653                            const RCP<const CUSPARSECrsMatrix<Scalar,Node> > &matrix);
00654 
00656 
00657 
00658 
00683     template <class DomainScalar, class RangeScalar>
00684     void
00685     multiply (Teuchos::ETransp trans,
00686               RangeScalar alpha,
00687               const MultiVector<DomainScalar,Node> &X,
00688               MultiVector<RangeScalar,Node> &Y) const;
00689 
00714     template <class DomainScalar, class RangeScalar>
00715     void
00716     multiply (Teuchos::ETransp trans,
00717               RangeScalar alpha,
00718               const MultiVector<DomainScalar,Node> &X,
00719               RangeScalar beta,
00720               MultiVector<RangeScalar,Node> &Y) const;
00721 
00748     template <class DomainScalar, class RangeScalar>
00749     void
00750     solve (Teuchos::ETransp trans,
00751            const MultiVector<DomainScalar,Node> &Y,
00752            MultiVector<RangeScalar,Node> &X) const;
00753 
00755 
00756   private:
00758     CUSPARSEOps(const CUSPARSEOps& source);
00759 
00761     RCP<Node> node_;
00762 
00763     int numRows_, numCols_, numNZ_;
00764     bool isInitialized_;
00765 
00766     ArrayRCP<const int> rowPtrs_, colInds_;
00767     ArrayRCP<const Scalar> rowVals_;
00768     RCP<cusparseMatDescr_t>          matdescr_;
00769     RCP<cusparseSolveAnalysisInfo_t> aiNoTrans_, aiTrans_, aiConjTrans_;
00770   };
00771 
00772 
00773   // ======= matrix finalization ===========
00774   template <class Scalar, class Node>
00775   void CUSPARSEOps<Scalar,Node>::finalizeGraph(Teuchos::EUplo uplo, Teuchos::EDiag diag,
00776                                                CUSPARSECrsGraph<Node> &graph, const RCP<ParameterList> &params)
00777   {
00778     const size_t CUDA_MAX_INT = 2147483647;
00779     const std::string prefix("finalizeGraph()");
00780     RCP<Node> node = graph.getNode();
00781     TEUCHOS_TEST_FOR_EXCEPTION(
00782         graph.isInitialized() == false,
00783         std::runtime_error, prefix << ": graph has not yet been initialized."
00784     )
00785     // diag: have to allocate and indicate
00786     ArrayRCP<int>    devinds, devptrs;
00787     ArrayRCP<const int>    hostinds = graph.getIndices();
00788     ArrayRCP<const size_t> hostptrs = graph.getPointers();
00789     const int numRows = graph.getNumRows();
00790     // set description
00791     graph.setMatDesc(uplo,diag);
00792     // allocate and initialize data
00793     if (diag == Teuchos::UNIT_DIAG) {
00794       // CUSPARSE, unfortunately, always assumes that the diagonal
00795       // entries are present in the storage.  Therefore, this flag
00796       // only specifies whether they are considered or not; they are
00797       // assumed to be present, and neglecting them will result in
00798       // incorrect behavior (causing the adjacent entry to be
00799       // neglected instead).  Therefore, because our API doesn't give
00800       // us explicit diagonal entries if diag == Teuchos::UNIT_DIAG,
00801       // we must allocate space for them.
00802       const size_t numnz = hostinds.size() + numRows;
00803       TEUCHOS_TEST_FOR_EXCEPTION( 
00804           numnz > CUDA_MAX_INT, std::runtime_error, 
00805           "Kokkos::CUSPARSEOps: CUSPARSE does not support more than " << CUDA_MAX_INT << " non-zeros." 
00806       );
00807       devptrs = node->template allocBuffer<int>( numRows+1 );
00808       if (numnz) devinds = node->template allocBuffer<int>( numnz );
00809       ArrayRCP<int> h_devptrs = node->viewBufferNonConst(WriteOnly, numRows+1, devptrs);
00810       ArrayRCP<int> h_devinds = node->viewBufferNonConst(WriteOnly, numnz,     devinds);
00811       for (int r=0; r < numRows; ++r) {
00812         h_devptrs[r] = (int)(hostptrs[r]+r);
00813         if (uplo == Teuchos::LOWER_TRI) {
00814           // leave one space at the end
00815           std::copy (hostinds.begin()+hostptrs[r],
00816                      hostinds.begin()+hostptrs[r+1],
00817                      h_devinds.begin()+h_devptrs[r]);
00818         }
00819         else {
00820           // leave one space at the beginning
00821           std::copy (hostinds.begin()+hostptrs[r],
00822                      hostinds.begin()+hostptrs[r+1],
00823                      h_devinds.begin()+h_devptrs[r]+1);
00824         }
00825       }
00826       h_devptrs[numRows] = (int)(hostptrs[numRows]+numRows);
00827       // copy back
00828       h_devptrs = null;
00829       h_devinds = null;
00830     }
00831     else {
00832       // our format == their format; just allocate and copy
00833       const size_t numnz = hostinds.size();
00834       TEUCHOS_TEST_FOR_EXCEPTION( 
00835           numnz > CUDA_MAX_INT, std::runtime_error, 
00836           "Kokkos::CUSPARSEOps: CUSPARSE does not support more than " << CUDA_MAX_INT << " non-zeros." 
00837       );
00838       devptrs = node->template allocBuffer<int>( numRows+1 );
00839       ArrayRCP<int> h_devptrs = node->viewBufferNonConst(WriteOnly, numRows+1, devptrs);
00840       std::copy( hostptrs.begin(), hostptrs.end(), h_devptrs.begin() );
00841       h_devptrs = null;
00842       if (numnz) {
00843         devinds = node->template allocBuffer<int>( numnz );
00844         node->copyToBuffer( numnz, hostinds(), devinds );
00845       }
00846     }
00847     // set the data
00848     graph.setDeviceData(devptrs,devinds);
00849   }
00850 
00851   // ======= matrix finalization ===========
00852   template <class Scalar, class Node>
00853   void CUSPARSEOps<Scalar,Node>::
00854   finalizeMatrix (const CUSPARSECrsGraph<Node> &graph,
00855                   CUSPARSECrsMatrix<Scalar,Node> &matrix,
00856                   const RCP<ParameterList> &params)
00857   {
00858     std::string FuncName("Kokkos::CUSPARSEOps::finalizeMatrix()");
00859     RCP<Node> node = graph.getNode();
00860     TEUCHOS_TEST_FOR_EXCEPTION(
00861         matrix.isInitialized() == false,
00862         std::runtime_error, FuncName << ": matrix has not yet been initialized."
00863     )
00864     // diag: have to allocate and indicate
00865     Teuchos::EUplo uplo;
00866     Teuchos::EDiag diag;
00867     graph.getMatDesc(uplo,diag);
00868     ArrayRCP<Scalar>       devvals;
00869     ArrayRCP<const size_t> hostptrs = graph.getPointers();
00870     ArrayRCP<const Scalar> hostvals = matrix.getValues();
00871     const int numRows = graph.getNumRows();
00872     if (diag == Teuchos::UNIT_DIAG) {
00873       // cuSPARSE, unfortunately, always assumes that the diagonal
00874       // entries are present in the storage.  Therefore, this flag
00875       // only specifies whether they are considered or not; they are
00876       // assumed to be present, and neglecting them will result in
00877       // incorrect behavior (causing the adjacent entry to be
00878       // neglected instead).  Therefore, because our API doesn't give
00879       // us diagonal entries if diag == Teuchos::UNIT_DIAG, we must
00880       // allocate space for them.
00881       const int numnz = hostptrs[numRows] + numRows;
00882       devvals = node->template allocBuffer<Scalar>( numnz );
00883       ArrayRCP<Scalar> h_devvals = node->viewBufferNonConst(WriteOnly, numnz, devvals);
00884       for (int r=0; r < numRows; ++r) {
00885         if (uplo == Teuchos::LOWER_TRI) {
00886           // leave one space at the end
00887           std::copy( hostvals.begin()+hostptrs[r], hostvals.begin()+hostptrs[r+1], h_devvals.begin()+hostptrs[r]+r );
00888         }
00889         else {
00890           // leave one space at the beginning
00891           std::copy( hostvals.begin()+hostptrs[r], hostvals.begin()+hostptrs[r+1], h_devvals.begin()+hostptrs[r]+r+1 );
00892         }
00893       }
00894       // copy back
00895       h_devvals = null;
00896     }
00897     else {
00898       const int numnz = hostptrs[numRows];
00899       if (numnz) {
00900         devvals = node->template allocBuffer<Scalar>( numnz );
00901         node->copyToBuffer( numnz,     hostvals(), devvals );
00902       }
00903     }
00904     matrix.setDeviceData(devvals);
00905 
00906     const int numnz = (int)devvals.size();
00907     RCP<cusparseMatDescr_t> descr = graph.getMatDesc();
00908     ArrayRCP<const int> devptrs = graph.getDevPointers(),
00909                         devinds = graph.getDevIndices();
00910 
00911     RCP<const cusparseHandle_t> hndl = CUSPARSEdetails::Session::getHandle();
00912     // look at the parameter list and do any analyses requested for solves
00913     RCP<cusparseSolveAnalysisInfo_t> ai_non, ai_trans, ai_conj;
00914     if (params != null && params->get("Prepare Solve",false)) {
00915       ai_non = CUSPARSEdetails::createSolveAnalysisInfo();
00916       cusparseStatus_t stat = CUSPARSEdetails::CUSPARSETemplateAdaptors<Scalar>::CSRSM_analysis(
00917           *hndl, CUSPARSE_OPERATION_NON_TRANSPOSE,numRows,
00918           numnz, *descr, devvals.getRawPtr(), devptrs.getRawPtr(), devinds.getRawPtr(),
00919           *ai_non);
00920       TEUCHOS_TEST_FOR_EXCEPTION(stat != CUSPARSE_STATUS_SUCCESS, std::runtime_error,
00921           FuncName << ": CSRSM_analysis(non-trans) returned error " << stat);
00922     }
00923     if (params != null && params->get("Prepare Transpose Solve",false)) {
00924       ai_trans = CUSPARSEdetails::createSolveAnalysisInfo();
00925       cusparseStatus_t stat = CUSPARSEdetails::CUSPARSETemplateAdaptors<Scalar>::CSRSM_analysis(
00926           *hndl, CUSPARSE_OPERATION_TRANSPOSE,numRows,
00927           numnz, *descr, devvals.getRawPtr(), devptrs.getRawPtr(), devinds.getRawPtr(),
00928           *ai_trans);
00929       TEUCHOS_TEST_FOR_EXCEPTION(stat != CUSPARSE_STATUS_SUCCESS, std::runtime_error,
00930           FuncName << ": CSRSM_analysis(trans) returned error " << stat);
00931     }
00932     if (params != null && params->get("Prepare Conjugate Transpose Solve",false)) {
00933       ai_conj = CUSPARSEdetails::createSolveAnalysisInfo();
00934       cusparseStatus_t stat = CUSPARSEdetails::CUSPARSETemplateAdaptors<Scalar>::CSRSM_analysis(
00935           *hndl, CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE,numRows,
00936           numnz, *descr, devvals.getRawPtr(), devptrs.getRawPtr(), devinds.getRawPtr(),
00937           *ai_conj);
00938       TEUCHOS_TEST_FOR_EXCEPTION(stat != CUSPARSE_STATUS_SUCCESS, std::runtime_error,
00939           FuncName << ": CSRSM_analysis(conj-trans) returned error " << stat);
00940     }
00941     matrix.setAnalyses( ai_non, ai_trans, ai_conj );
00942     //
00943     // if (params != null && params->get("Prepare Transpose Multiply",false)) {
00944     //   // finish: compute CSC for transpose
00945     //   TEUCHOS_TEST_FOR_EXCEPT(true);
00946     // }
00947   }
00948 
00949   // ======= graph and matrix finalization ===========
00950   template <class Scalar, class Node>
00951   void CUSPARSEOps<Scalar,Node>::finalizeGraphAndMatrix(Teuchos::EUplo uplo, Teuchos::EDiag diag,
00952                                                         CUSPARSECrsGraph<Node> &graph, CUSPARSECrsMatrix<Scalar,Node> &matrix,
00953                                                         const RCP<ParameterList> &params)
00954   {
00955     std::string FuncName("Kokkos::CUSPARSEOps::finalizeGraphAndMatrix(graph,matrix,params)");
00956     TEUCHOS_TEST_FOR_EXCEPTION(
00957         graph.isInitialized() == false,
00958         std::runtime_error, FuncName << ": graph has not yet been initialized."
00959     )
00960     TEUCHOS_TEST_FOR_EXCEPTION(
00961         matrix.isInitialized() == false,
00962         std::runtime_error, FuncName << ": matrix has not yet been initialized."
00963     )
00964     // no benefit to doing them together; do them separately
00965     finalizeGraph(uplo,diag,graph,params);
00966     finalizeMatrix(graph,matrix,params);
00967   }
00968 
00969 
00970   // ======= pointer allocation ===========
00971   template <class Scalar, class Node>
00972   ArrayRCP<size_t>
00973   CUSPARSEOps<Scalar,Node>::allocRowPtrs(const RCP<Node> &/*node*/, const ArrayView<const size_t> &numEntriesPerRow)
00974   {
00975     // alloc page-locked ("pinned") memory on the host, specially allocated and specially deallocated
00976     CUDANodeHostPinnedDeallocator<size_t> dealloc;
00977     ArrayRCP<size_t> ptrs = dealloc.alloc(numEntriesPerRow.size() + 1);
00978     ptrs[0] = 0;
00979     std::partial_sum( numEntriesPerRow.getRawPtr(), numEntriesPerRow.getRawPtr()+numEntriesPerRow.size(), ptrs.begin()+1 );
00980     return ptrs;
00981   }
00982 
00983   // ======= other allocation ===========
00984   template <class Scalar, class Node>
00985   template <class T>
00986   ArrayRCP<T>
00987   CUSPARSEOps<Scalar,Node>::allocStorage(const RCP<Node> &/*node*/, const ArrayView<const size_t> &rowPtrs)
00988   {
00989     // alloc page-locked ("pinned") memory on the host, specially allocated and specially deallocated
00990     const int totalNumEntries = *(rowPtrs.end()-1);
00991     CUDANodeHostPinnedDeallocator<T> dealloc;
00992     ArrayRCP<T> buf = dealloc.alloc(totalNumEntries);
00993     std::fill(buf.begin(), buf.end(), Teuchos::ScalarTraits<T>::zero() );
00994     return buf;
00995   }
00996 
00997   template<class Scalar, class Node>
00998   CUSPARSEOps<Scalar,Node>::CUSPARSEOps(const RCP<Node> &node)
00999   : node_(node)
01000   , numRows_(0)
01001   , numCols_(0)
01002   , numNZ_(0)
01003   , isInitialized_(false)
01004   {
01005     CUSPARSEdetails::Session::init();
01006     // Make sure that users only specialize for Kokkos Node types that are CUDA Nodes
01007     Teuchos::CompileTimeAssert<Node::isCUDANode == false> cta; (void)cta;
01008   }
01009 
01010   template<class Scalar, class Node>
01011   CUSPARSEOps<Scalar,Node>::~CUSPARSEOps()
01012   { }
01013 
01014   template <class Scalar, class Node>
01015   RCP<Node> CUSPARSEOps<Scalar,Node>::getNode() const {
01016     return node_;
01017   }
01018 
01019   template <class Scalar, class Node>
01020   void CUSPARSEOps<Scalar,Node>::setGraphAndMatrix(const RCP<const CUSPARSECrsGraph<Node> > &graph, const RCP<const CUSPARSECrsMatrix<Scalar,Node> > &matrix)
01021   {
01022     std::string tfecfFuncName("setGraphAndMatrix(graph,matrix)");
01023     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01024         isInitialized_ == true,
01025         std::runtime_error, " operators already initialized.");
01026     // get cusparse data from the matrix
01027     numRows_ = graph->getNumRows();
01028     numCols_ = graph->getNumCols();
01029     matdescr_ = graph->getMatDesc();
01030     rowPtrs_ = graph->getDevPointers();
01031     colInds_ = graph->getDevIndices();
01032     rowVals_ = matrix->getDevValues();
01033     numNZ_ = colInds_.size();
01034     matrix->getAnalyses( aiNoTrans_, aiTrans_, aiConjTrans_ );
01035     isInitialized_ = true;
01036   }
01037 
01038   template <class Scalar, class Node>
01039   template <class DomainScalar, class RangeScalar>
01040   void CUSPARSEOps<Scalar,Node>::multiply(Teuchos::ETransp trans,
01041                                           RangeScalar alpha,
01042                                           const MultiVector<DomainScalar,Node> &X,
01043                                                 MultiVector< RangeScalar,Node> &Y) const
01044   {
01045     // CUSPARSE doesn't support mixed precision
01046     Teuchos::CompileTimeAssert<Teuchos::TypeTraits::is_same<DomainScalar,Scalar>::value == false ||
01047                                Teuchos::TypeTraits::is_same< RangeScalar,Scalar>::value == false > cta; (void)cta;
01048     //
01049     std::string tfecfFuncName("multiply(trans,alpha,X,Y)");
01050     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01051         isInitialized_ == false,
01052         std::runtime_error, "sparse operators have not been initialized with graph and matrix data; call setGraphAndMatrix() first.")
01053     // get pointers,stride from X and Y
01054     int stride_x = (int)X.getStride(),
01055         stride_y = (int)Y.getStride();
01056     const Scalar * data_x = X.getValues().getRawPtr();
01057     Scalar * data_y = Y.getValuesNonConst().getRawPtr();
01058     const int numMatRows = numRows_;
01059     const int numMatCols = numCols_;
01060     const int opRows     = (trans == Teuchos::NO_TRANS ? numMatRows : numMatCols);
01061     const int opCols     = (trans == Teuchos::NO_TRANS ? numMatCols : numMatRows);
01062     const int numRHS     = X.getNumCols();
01063     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01064         X.getNumCols() != Y.getNumCols(),
01065         std::runtime_error, "X and Y do not have the same number of column vectors.")
01066     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01067         (size_t)X.getNumRows() != (size_t)opCols,
01068         std::runtime_error, "Size of X is not congruous with dimensions of operator.")
01069     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01070         (size_t)Y.getNumRows() != (size_t)opRows,
01071         std::runtime_error, "Size of Y is not congruous with dimensions of operator.")
01072     // call mat-vec
01073     cusparseOperation_t op;
01074     if      (trans == Teuchos::NO_TRANS)     op = CUSPARSE_OPERATION_NON_TRANSPOSE;
01075     else if (trans == Teuchos::TRANS)        op = CUSPARSE_OPERATION_TRANSPOSE;
01076     else /* (trans == Teuchos::CONJ_TRANS)*/ op = CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE;
01077     RCP<const cusparseHandle_t> sess = CUSPARSEdetails::Session::getHandle();
01078     const Scalar s_alpha = (Scalar)alpha,
01079                  s_beta  = Teuchos::ScalarTraits<Scalar>::zero();
01080 #ifdef HAVE_KOKKOS_DEBUG
01081     cudaError_t err = cudaGetLastError();
01082     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( cudaSuccess != err, std::runtime_error,
01083         ": cudaGetLastError() returned error after function call:\n"
01084         << cudaGetErrorString(err) );
01085     err = cudaThreadSynchronize();
01086     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( cudaSuccess != err, std::runtime_error,
01087         ": cudaThreadSynchronize() returned error after function call:\n"
01088         << cudaGetErrorString(err) );
01089 #endif
01090     cusparseStatus_t stat;
01091     stat = CUSPARSEdetails::CUSPARSETemplateAdaptors<Scalar>::CSRMM(
01092         *sess, op, numMatRows, numRHS, numMatCols, numNZ_, &s_alpha,
01093         *matdescr_, rowVals_.getRawPtr(), rowPtrs_.getRawPtr(), colInds_.getRawPtr(),
01094         data_x, stride_x, &s_beta, data_y, stride_y);
01095 #ifdef HAVE_KOKKOS_DEBUG
01096     err = cudaGetLastError();
01097     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( cudaSuccess != err, std::runtime_error,
01098         ": cudaGetLastError() returned error after function call:\n"
01099         << cudaGetErrorString(err) );
01100     err = cudaThreadSynchronize();
01101     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( cudaSuccess != err, std::runtime_error,
01102         ": cudaThreadSynchronize() returned error after function call:\n"
01103         << cudaGetErrorString(err) );
01104 #endif
01105     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01106         stat != CUSPARSE_STATUS_SUCCESS,
01107         std::runtime_error, ": CSRMM returned error " << stat);
01108     return;
01109   }
01110 
01111 
01112   template <class Scalar, class Node>
01113   template <class DomainScalar, class RangeScalar>
01114   void CUSPARSEOps<Scalar,Node>::multiply(Teuchos::ETransp trans,
01115                                           RangeScalar alpha, const MultiVector<DomainScalar,Node> &X,
01116                                           RangeScalar beta, MultiVector<RangeScalar,Node> &Y) const
01117   {
01118     // CUSPARSE doesn't support mixed precision
01119     Teuchos::CompileTimeAssert<Teuchos::TypeTraits::is_same<DomainScalar,Scalar>::value == false ||
01120                                Teuchos::TypeTraits::is_same< RangeScalar,Scalar>::value == false > cta; (void)cta;
01121     //
01122     std::string tfecfFuncName("multiply(trans,alpha,X,beta,Y)");
01123     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01124         isInitialized_ == false,
01125         std::runtime_error, "sparse operators have not been initialized with graph and matrix data; call setGraphAndMatrix() first.")
01126     // get pointers,stride from X and Y
01127     int stride_x = (int)X.getStride(),
01128         stride_y = (int)Y.getStride();
01129     const Scalar * data_x = X.getValues().getRawPtr();
01130     Scalar * data_y = Y.getValuesNonConst().getRawPtr();
01131     const int numMatRows = numRows_;
01132     const int numMatCols = numCols_;
01133     const int opRows     = (trans == Teuchos::NO_TRANS ? numMatRows : numMatCols);
01134     const int opCols     = (trans == Teuchos::NO_TRANS ? numMatCols : numMatRows);
01135     const int numRHS     = X.getNumCols();
01136     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01137         X.getNumCols() != Y.getNumCols(),
01138         std::runtime_error, "X and Y do not have the same number of column vectors.")
01139     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01140         (size_t)X.getNumRows() != (size_t)opCols,
01141         std::runtime_error, "Size of X is not congruous with dimensions of operator.")
01142     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01143         (size_t)Y.getNumRows() != (size_t)opRows,
01144         std::runtime_error, "Size of Y is not congruous with dimensions of operator.")
01145     // call mat-vec
01146     cusparseOperation_t op;
01147     RCP<const cusparseHandle_t> hndl = CUSPARSEdetails::Session::getHandle();
01148     if      (trans == Teuchos::NO_TRANS)     op = CUSPARSE_OPERATION_NON_TRANSPOSE;
01149     else if (trans == Teuchos::TRANS)        op = CUSPARSE_OPERATION_TRANSPOSE;
01150     else /* (trans == Teuchos::CONJ_TRANS)*/ op = CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE;
01151     const Scalar s_alpha = (Scalar)alpha,
01152                  s_beta  = (Scalar)beta;
01153     cusparseStatus_t stat = CUSPARSEdetails::CUSPARSETemplateAdaptors<Scalar>::CSRMM(
01154         *hndl, op, numMatRows, numRHS, numMatCols, numNZ_, &s_alpha,
01155         *matdescr_, rowVals_.getRawPtr(), rowPtrs_.getRawPtr(), colInds_.getRawPtr(),
01156         data_x, stride_x, &s_beta, data_y, stride_y);
01157 #ifdef HAVE_KOKKOS_DEBUG
01158     cudaError_t err = cudaGetLastError();
01159     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( cudaSuccess != err, std::runtime_error,
01160         ": cudaGetLastError() returned error after function call:\n"
01161         << cudaGetErrorString(err) );
01162     err = cudaThreadSynchronize();
01163     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( cudaSuccess != err, std::runtime_error,
01164         ": cudaThreadSynchronize() returned error after function call:\n"
01165         << cudaGetErrorString(err) );
01166 #endif
01167     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01168         stat != CUSPARSE_STATUS_SUCCESS,
01169         std::runtime_error, ": CSRMM returned error " << stat);
01170     return;
01171   }
01172 
01173   template <class Scalar, class Node>
01174   template <class DomainScalar, class RangeScalar>
01175   void CUSPARSEOps<Scalar,Node>::solve(Teuchos::ETransp trans,
01176                                        const MultiVector<DomainScalar,Node> &Y,
01177                                              MultiVector< RangeScalar,Node> &X) const
01178   {
01179     // CUSPARSE doesn't support mixed precision; partial specialize, then nix the generic versions
01180     Teuchos::CompileTimeAssert<Teuchos::TypeTraits::is_same<DomainScalar,Scalar>::value == false ||
01181                                Teuchos::TypeTraits::is_same< RangeScalar,Scalar>::value == false > cta; (void)cta;
01182     //
01183     std::string tfecfFuncName("solve()");
01184     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01185         isInitialized_ == false,
01186         std::runtime_error, "sparse operators have not been initialized with graph and matrix data; call setGraphAndMatrix() first.")
01187     // get pointers,stride from X and Y
01188     int stride_x = (int)X.getStride(),
01189         stride_y = (int)Y.getStride();
01190     const Scalar * data_y = Y.getValues().getRawPtr();
01191     Scalar * data_x = X.getValuesNonConst().getRawPtr();
01192     const int numMatRows = X.getNumRows();
01193     const int numRHS     = X.getNumCols();
01194     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01195         X.getNumRows() != Y.getNumRows(),
01196         std::runtime_error, "X and Y do not have the same number of row vectors.")
01197     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01198         X.getNumCols() != Y.getNumCols(),
01199         std::runtime_error, "X and Y do not have the same number of column vectors.")
01200     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01201         Y.getNumRows() != numRows_,
01202         std::runtime_error, "Y does not have the same number of rows as does the matrix.")
01203     //
01204     RCP<cusparseSolveAnalysisInfo_t> solveInfo;
01205     cusparseOperation_t                     op;
01206     if (trans == Teuchos::NO_TRANS) {
01207       solveInfo = aiNoTrans_;
01208       op = CUSPARSE_OPERATION_NON_TRANSPOSE;
01209     }
01210     else if (trans == Teuchos::TRANS) {
01211       solveInfo = aiTrans_;
01212       op = CUSPARSE_OPERATION_TRANSPOSE;
01213     }
01214     else if (trans == Teuchos::CONJ_TRANS) {
01215       solveInfo = aiConjTrans_;
01216       op = CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE;
01217     }
01218     else {
01219       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01220           true, std::runtime_error, "invalid transformation (none ofe of NO_TRANS, TRANS or CONJ_TRANS): " << trans);
01221     }
01222     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01223       solveInfo == null, std::runtime_error,
01224       "solve info not computed at matrix finalization time for requested transformation: " << trans);
01225     RCP<const cusparseHandle_t> hndl = CUSPARSEdetails::Session::getHandle();
01226     const Scalar s_alpha = Teuchos::ScalarTraits<Scalar>::one();
01227     cusparseStatus_t stat = CUSPARSEdetails::CUSPARSETemplateAdaptors<Scalar>::CSRSM_solve(
01228         *hndl, op, numMatRows, numRHS, &s_alpha,
01229         *matdescr_, rowVals_.getRawPtr(), rowPtrs_.getRawPtr(), colInds_.getRawPtr(), *solveInfo,
01230         data_y, stride_y, data_x, stride_x);
01231     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01232         stat != CUSPARSE_STATUS_SUCCESS,
01233         std::runtime_error, ": CSRSM_solve returned error " << stat);
01234     return;
01235   }
01236 
01237 
01238 
01239 
01240 } // namespace Kokkos
01241 
01242 #endif /* KOKKOS_CUSPARSEOPS_HPP */
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends