Kokkos Node API and Local Linear Algebra Kernels Version of the Day
Kokkos_DefaultArithmetic.hpp
00001 //@HEADER
00002 // ************************************************************************
00003 // 
00004 //          Kokkos: Node API and Parallel Node Kernels
00005 //              Copyright (2009) Sandia Corporation
00006 // 
00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
00008 // license for use of this work by or on behalf of the U.S. Government.
00009 // 
00010 // This library is free software; you can redistribute it and/or modify
00011 // it under the terms of the GNU Lesser General Public License as
00012 // published by the Free Software Foundation; either version 2.1 of the
00013 // License, or (at your option) any later version.
00014 //  
00015 // This library is distributed in the hope that it will be useful, but
00016 // WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 // Lesser General Public License for more details.
00019 //  
00020 // You should have received a copy of the GNU Lesser General Public
00021 // License along with this library; if not, write to the Free Software
00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00023 // USA
00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
00025 // 
00026 // ************************************************************************
00027 //@HEADER
00028 
00029 #ifndef KOKKOS_DEFAULTARITHMETIC_H
00030 #define KOKKOS_DEFAULTARITHMETIC_H
00031 
00032 #include <Teuchos_BLAS_types.hpp>
00033 #include <Teuchos_TestForException.hpp>
00034 #include <Teuchos_TypeNameTraits.hpp>
00035 #include <Teuchos_ArrayView.hpp>
00036 #include <Teuchos_Tuple.hpp>
00037 #include <stdexcept>
00038 
00039 #include "Kokkos_MultiVector.hpp"
00040 #include "Kokkos_MultiVectorKernelOps.hpp"
00041 #include "Kokkos_NodeHelpers.hpp"
00042 #ifdef HAVE_KOKKOS_TBB
00043 #include "Kokkos_TBBNode.hpp"
00044 #endif
00045 #ifdef HAVE_KOKKOS_THREADPOOL
00046 #include "Kokkos_TPINode.hpp"
00047 #endif
00048 #ifdef HAVE_KOKKOS_THRUST
00049 #include "Kokkos_ThrustGPUNode.hpp"
00050 #include "cublas.h"
00051 #endif
00052 #include "Kokkos_SerialNode.hpp"
00053 #include <Teuchos_BLAS.hpp>
00054 
00055 namespace Kokkos {
00056 
00057   // Class for providing GEMM for a particular Node
00058   template <typename Scalar, typename Node> 
00059   struct NodeGEMM {
00060     public:
00061       static void GEMM(Teuchos::ETransp transA, Teuchos::ETransp transB, Scalar alpha, const MultiVector<Scalar,Node> &A, const MultiVector<Scalar,Node> &B, Scalar beta, MultiVector<Scalar,Node> &C) {
00062         TEST_FOR_EXCEPT(true);
00063       }
00064   };
00065 
00066   template <typename Scalar> 
00067   struct NodeGEMM<Scalar,SerialNode> {
00068     public:
00069       static void GEMM(Teuchos::ETransp transA, Teuchos::ETransp transB, Scalar alpha, const MultiVector<Scalar,SerialNode> &A, const MultiVector<Scalar,SerialNode> &B, Scalar beta, MultiVector<Scalar,SerialNode> &C) {
00070         Teuchos::BLAS<int,Scalar> blas;
00071         const int m = Teuchos::as<int>(C.getNumRows()),
00072                   n = Teuchos::as<int>(C.getNumCols()),
00073                   k = (transA == Teuchos::NO_TRANS ? A.getNumCols() : A.getNumRows()),
00074                   lda = Teuchos::as<int>(A.getStride()),
00075                   ldb = Teuchos::as<int>(B.getStride()),
00076                   ldc = Teuchos::as<int>(C.getStride());
00077         blas.GEMM(transA, transB, m, n, k, alpha, A.getValues().getRawPtr(), lda, B.getValues().getRawPtr(), ldb, beta, C.getValuesNonConst().getRawPtr(), ldc);
00078       }
00079   };
00080 
00081 #ifdef HAVE_KOKKOS_TBB
00082   template <typename Scalar> 
00083   struct NodeGEMM<Scalar,TBBNode> {
00084     public:
00085       static void GEMM(Teuchos::ETransp transA, Teuchos::ETransp transB, Scalar alpha, const MultiVector<Scalar,TBBNode> &A, const MultiVector<Scalar,TBBNode> &B, Scalar beta, MultiVector<Scalar,TBBNode> &C) {
00086         Teuchos::BLAS<int,Scalar> blas;
00087         const int m = Teuchos::as<int>(C.getNumRows()),
00088                   n = Teuchos::as<int>(C.getNumCols()),
00089                   k = (transA == Teuchos::NO_TRANS ? A.getNumCols() : A.getNumRows()),
00090                   lda = Teuchos::as<int>(A.getStride()),
00091                   ldb = Teuchos::as<int>(B.getStride()),
00092                   ldc = Teuchos::as<int>(C.getStride());
00093         blas.GEMM(transA, transB, m, n, k, alpha, A.getValues().getRawPtr(), lda, B.getValues().getRawPtr(), ldb, beta, C.getValuesNonConst().getRawPtr(), ldc);
00094       }
00095   };
00096 #endif
00097 
00098 #ifdef HAVE_KOKKOS_THREADPOOL
00099   template <typename Scalar> 
00100   struct NodeGEMM<Scalar,TPINode> {
00101     public:
00102       static void GEMM(Teuchos::ETransp transA, Teuchos::ETransp transB, Scalar alpha, const MultiVector<Scalar,TPINode> &A, const MultiVector<Scalar,TPINode> &B, Scalar beta, MultiVector<Scalar,TPINode> &C) {
00103 #ifndef KOKKOS_DONT_BLOCK_TPI_GEMM
00104         TPI_Block();
00105 #endif
00106         Teuchos::BLAS<int,Scalar> blas;
00107         const int m = Teuchos::as<int>(C.getNumRows()),
00108                   n = Teuchos::as<int>(C.getNumCols()),
00109                   k = (transA == Teuchos::NO_TRANS ? A.getNumCols() : A.getNumRows()),
00110                   lda = Teuchos::as<int>(A.getStride()),
00111                   ldb = Teuchos::as<int>(B.getStride()),
00112                   ldc = Teuchos::as<int>(C.getStride());
00113         blas.GEMM(transA, transB, m, n, k, alpha, A.getValues().getRawPtr(), lda, B.getValues().getRawPtr(), ldb, beta, C.getValuesNonConst().getRawPtr(), ldc);
00114 #ifndef KOKKOS_DONT_BLOCK_TPI_GEMM
00115         TPI_Unblock();
00116 #endif
00117       }
00118   };
00119 #endif
00120 
00121 #ifdef HAVE_KOKKOS_THRUST 
00122   template <typename Scalar>
00123   struct NodeGEMM<Scalar,ThrustGPUNode> {
00124     public:
00125       static void GEMM(Teuchos::ETransp transA, Teuchos::ETransp transB, Scalar alpha, const MultiVector<Scalar,ThrustGPUNode> &A, const MultiVector<Scalar,ThrustGPUNode> &B, Scalar beta, MultiVector<Scalar,ThrustGPUNode> &C) {
00126         TEST_FOR_EXCEPTION(true, std::logic_error, "NodeGEMM: ThrustGPUNode has no support for GEMM operations over Scalar=" << Teuchos::typeName(alpha) << ".");
00127       }
00128   };
00129 
00130 
00131 #ifdef HAVE_KOKKOS_CUDA_FLOAT
00132   template <>
00133   struct NodeGEMM<float,ThrustGPUNode> {
00134     public:
00135       static void GEMM(Teuchos::ETransp transA, Teuchos::ETransp transB, float alpha, const MultiVector<float,ThrustGPUNode> &A, const MultiVector<float,ThrustGPUNode> &B, float beta, MultiVector<float,ThrustGPUNode> &C) {
00136         const int m = Teuchos::as<int>(C.getNumRows()),
00137                   n = Teuchos::as<int>(C.getNumCols()),
00138                   k = (transA == Teuchos::NO_TRANS ? A.getNumCols() : A.getNumRows()),
00139                   lda = Teuchos::as<int>(A.getStride()),
00140                   ldb = Teuchos::as<int>(B.getStride()),
00141                   ldc = Teuchos::as<int>(C.getStride());
00142         const char char_transA = (transA == Teuchos::NO_TRANS ? 'N' : 'T'),
00143                    char_transB = (transB == Teuchos::NO_TRANS ? 'N' : 'T');
00144         cublasSgemm(char_transA, char_transB, m, n, k, alpha, A.getValues().getRawPtr(), lda, B.getValues().getRawPtr(), ldb, beta, C.getValuesNonConst().getRawPtr(), ldc);
00145 #ifdef HAVE_KOKKOS_DEBUG
00146         cublasStatus info = cublasGetError();
00147         TEST_FOR_EXCEPTION( info != CUBLAS_STATUS_SUCCESS, std::runtime_error, "cublasSgemm failed with status " << info << "." );
00148 #endif
00149       }
00150   };
00151 #endif
00152 
00153 #ifdef HAVE_KOKKOS_CUDA_DOUBLE
00154   template <>
00155   struct NodeGEMM<double,ThrustGPUNode> {
00156     public:
00157       static void GEMM(Teuchos::ETransp transA, Teuchos::ETransp transB, double alpha, const MultiVector<double,ThrustGPUNode> &A, const MultiVector<double,ThrustGPUNode> &B, double beta, MultiVector<double,ThrustGPUNode> &C) {
00158         const int m = Teuchos::as<int>(C.getNumRows()),
00159                   n = Teuchos::as<int>(C.getNumCols()),
00160                   k = (transA == Teuchos::NO_TRANS ? A.getNumCols() : A.getNumRows()),
00161                   lda = Teuchos::as<int>(A.getStride()),
00162                   ldb = Teuchos::as<int>(B.getStride()),
00163                   ldc = Teuchos::as<int>(C.getStride());
00164         const char char_transA = (transA == Teuchos::NO_TRANS ? 'N' : 'T'),
00165                    char_transB = (transB == Teuchos::NO_TRANS ? 'N' : 'T');
00166         cublasDgemm(char_transA, char_transB, m, n, k, alpha, A.getValues().getRawPtr(), lda, B.getValues().getRawPtr(), ldb, beta, C.getValuesNonConst().getRawPtr(), ldc);
00167 #ifdef HAVE_KOKKOS_DEBUG
00168         cublasStatus info = cublasGetError();
00169         TEST_FOR_EXCEPTION( info != CUBLAS_STATUS_SUCCESS, std::runtime_error, "cublasDgemm failed with status " << info << "." );
00170 #endif
00171       }
00172   };
00173 #endif
00174 
00175 #endif
00176 
00179   template <class MV>
00180   class DefaultArithmetic {
00181     // nothing here
00182   };
00183 
00186   template <class Scalar, class Node>
00187   class DefaultArithmetic<MultiVector<Scalar,Node> > {
00188 
00189     public:
00191       inline static void Init(MultiVector<Scalar,Node> &A, Scalar alpha) {
00192         const size_t nR = A.getNumRows();
00193         const size_t nC = A.getNumCols();
00194         if (nR*nC == 0) return;
00195         const size_t stride = A.getStride();
00196         RCP<Node> node = A.getNode();
00197         ArrayRCP<Scalar> data = A.getValuesNonConst();
00198         // prepare buffers
00199         ReadyBufferHelper<Node> rbh(node);
00200         rbh.begin();
00201         rbh.template addNonConstBuffer<Scalar>(data);
00202         rbh.end();
00203         // prepare op
00204         InitOp<Scalar> wdp;
00205         wdp.alpha = alpha;
00206         if (stride == nR) {
00207           // one kernel invocation for whole multivector
00208           wdp.x = data(0,nR*nC).getRawPtr();
00209           node->template parallel_for<InitOp<Scalar> >(0,nR*nC,wdp);
00210         }
00211         else {
00212           // one kernel invocation for each column
00213           for (size_t j=0; j<nC; ++j) {
00214             wdp.x = data(0,nR).getRawPtr();
00215             node->template parallel_for<InitOp<Scalar> >(0,nR,wdp);
00216             data += stride;
00217           }
00218         }
00219       }
00220 
00222       inline static void Recip(MultiVector<Scalar,Node> &A, const MultiVector<Scalar,Node> &B) {
00223         const size_t nR = A.getNumRows();
00224         const size_t nC = A.getNumCols();
00225         const size_t Astride = A.getStride();
00226         const size_t Bstride = B.getStride();
00227         TEST_FOR_EXCEPTION(nC != B.getNumCols() || nR != B.getNumRows(), std::runtime_error,
00228                            "DefaultArithmetic<" << Teuchos::typeName(A) << ">::Recip(A,B): A and B must have the same dimensions.");
00229         RCP<Node> node = B.getNode();
00230         ArrayRCP<const Scalar> Bdata = B.getValues();
00231         ArrayRCP<Scalar>       Adata = A.getValuesNonConst();
00232         // prepare buffers
00233         ReadyBufferHelper<Node> rbh(node);
00234         rbh.begin();
00235         rbh.template addConstBuffer<Scalar>(Bdata);
00236         rbh.template addNonConstBuffer<Scalar>(Adata);
00237         rbh.end();
00238         RecipOp<Scalar> wdp;
00239         if (A.getStride() == nR && B.getStride() == nR) {
00240           // one kernel invocation for whole multivector
00241           wdp.scale = Bdata(0,nR*nC).getRawPtr();
00242           wdp.x = Adata(0,nR*nC).getRawPtr();
00243           node->template parallel_for<RecipOp<Scalar> >(0,nR*nC,wdp);
00244         }
00245         else {
00246           // one kernel invocation for each column
00247           for (size_t j=0; j<nC; ++j) {
00248             wdp.x = Adata(0,nR).getRawPtr();
00249             wdp.scale = Bdata(0,nR).getRawPtr();
00250             node->template parallel_for<RecipOp<Scalar> >(0,nR,wdp);
00251             Adata += Astride;
00252             Bdata += Bstride;
00253           }
00254         }
00255       }
00256 
00258 
00260       inline static void ElemMult(MultiVector<Scalar,Node> &C, Scalar scalarC,
00261                                Scalar scalarAB,
00262                                const MultiVector<Scalar,Node> &A,
00263                                const MultiVector<Scalar,Node> &B) {
00264         const size_t nR_A = A.getNumRows();
00265         const size_t nC_A = A.getNumCols();
00266         TEST_FOR_EXCEPTION(nC_A != 1, std::runtime_error,
00267                            "DefaultArithmetic<"<<Teuchos::typeName(A) << ">::ElemMult(C,sC,sAB,A,B): A must have just 1 column.");
00268         const size_t Cstride = C.getStride();
00269         const size_t Bstride = B.getStride();
00270         const size_t nC_C = C.getNumCols();
00271         const size_t nR_C = C.getNumRows();
00272         TEST_FOR_EXCEPTION(nC_C != B.getNumCols() || nR_A != B.getNumRows() || nR_C != B.getNumRows(), std::runtime_error,
00273                            "DefaultArithmetic<" << Teuchos::typeName(A) << ">::ElemMult(C,sC,sAB,A,B): A, B and C must have the same num-rows, B and C must have the same num-cols.");
00274         RCP<Node> node = B.getNode();
00275         ArrayRCP<Scalar> Cdata = C.getValuesNonConst();
00276         ArrayRCP<const Scalar> Bdata = B.getValues();
00277         ArrayRCP<const Scalar>       Adata = A.getValues();
00278         // prepare buffers
00279         ReadyBufferHelper<Node> rbh(node);
00280         rbh.begin();
00281         rbh.template addNonConstBuffer<Scalar>(Cdata);
00282         rbh.template addConstBuffer<Scalar>(Bdata);
00283         rbh.template addConstBuffer<Scalar>(Adata);
00284         rbh.end();
00285         MVElemMultOp<Scalar> wdp;
00286         wdp.scalarX = scalarC;
00287         wdp.scalarYZ = scalarAB;
00288         // one kernel invocation for each column
00289         for (size_t j=0; j<nC_C; ++j) {
00290           wdp.x = Cdata(0,nR_C).getRawPtr();
00291           wdp.y = Adata(0,nR_C).getRawPtr();
00292           wdp.z = Bdata(0,nR_C).getRawPtr();
00293           node->template parallel_for<MVElemMultOp<Scalar> >(0,nR_C,wdp);
00294           Cdata += Cstride;
00295           Bdata += Bstride;
00296         }
00297       }
00298 
00300       inline static void Assign(MultiVector<Scalar,Node> &A, const MultiVector<Scalar,Node> &B) {
00301         const size_t nR = A.getNumRows();
00302         const size_t nC = A.getNumCols();
00303         const size_t Astride = A.getStride();
00304         const size_t Bstride = B.getStride();
00305         TEST_FOR_EXCEPTION(nC != B.getNumCols() || nR != B.getNumRows(), std::runtime_error,
00306                            "DefaultArithmetic<" << Teuchos::typeName(A) << ">::Assign(A,B): A and B must have the same dimensions.");
00307         if (nC*nR == 0) return;
00308         RCP<Node> node = A.getNode();
00309         ArrayRCP<const Scalar> Bdata = B.getValues();
00310         ArrayRCP<Scalar>       Adata = A.getValuesNonConst();
00311         // prepare buffers
00312         ReadyBufferHelper<Node> rbh(node);
00313         rbh.begin();
00314         rbh.template addConstBuffer<Scalar>(Bdata);
00315         rbh.template addNonConstBuffer<Scalar>(Adata);
00316         rbh.end();
00317         // prepare op
00318         AssignOp<Scalar> wdp;
00319         if (Astride == nR && Bstride == nR) {
00320           // one kernel invocation for whole multivector assignment
00321           wdp.x = Adata(0,nR*nC).getRawPtr();
00322           wdp.y = Bdata(0,nR*nC).getRawPtr();
00323           node->template parallel_for<AssignOp<Scalar> >(0,nR*nC,wdp);
00324         }
00325         else {
00326           // one kernel invocation for each column
00327           for (size_t j=0; j<nC; ++j) {
00328             wdp.x = Adata(0,nR).getRawPtr();
00329             wdp.y = Bdata(0,nR).getRawPtr();
00330             node->template parallel_for<AssignOp<Scalar> >(0,nR,wdp);
00331             Adata += Astride;
00332             Bdata += Bstride;
00333           }
00334         }
00335       }
00336 
00337       inline static void Dot(const MultiVector<Scalar,Node> &A, const MultiVector<Scalar,Node> &B, const ArrayView<Scalar> &dots) {
00338         const size_t nR = A.getNumRows();
00339         const size_t nC = A.getNumCols();
00340         const size_t Astride = A.getStride();
00341         const size_t Bstride = B.getStride();
00342         TEST_FOR_EXCEPTION(nC != B.getNumCols() || nR != B.getNumRows(), std::runtime_error,
00343                            "DefaultArithmetic<" << Teuchos::typeName(A) << ">::Dot(A,B,dots): A and B must have the same dimensions.");
00344         TEST_FOR_EXCEPTION(nC > Teuchos::as<size_t>(dots.size()), std::runtime_error, 
00345             "DefaultArithmetic<" << Teuchos::typeName(A) << ">::Dot(A,B,dots): dots must have length as large as number of columns of A and B.");
00346         if (nR*nC == 0) {
00347           std::fill( dots.begin(), dots.begin() + nC, Teuchos::ScalarTraits<Scalar>::zero() );
00348           return;
00349         }
00350         RCP<Node> node = A.getNode();
00351         ArrayRCP<const Scalar> Bdata = B.getValues(),
00352                                         Adata = A.getValues();
00353         // prepare buffers
00354         ReadyBufferHelper<Node> rbh(node);
00355         rbh.begin();
00356         rbh.template addConstBuffer<Scalar>(Bdata);
00357         rbh.template addConstBuffer<Scalar>(Adata);
00358         rbh.end();
00359         DotOp2<Scalar> op;
00360         for (size_t j=0; j<nC; ++j) {
00361           op.x = Adata(0,nR).getRawPtr();
00362           op.y = Bdata(0,nR).getRawPtr();
00363           dots[j] = node->parallel_reduce(0,nR,op);
00364           Adata += Astride;
00365           Bdata += Bstride;
00366         }
00367       }
00368 
00369       inline static Scalar Dot(const MultiVector<Scalar,Node> &A, const MultiVector<Scalar,Node> &B) {
00370         const size_t nR = A.getNumRows();
00371         const size_t nC = A.getNumCols();
00372         TEST_FOR_EXCEPTION(nR != B.getNumRows(), std::runtime_error,
00373                            "DefaultArithmetic<" << Teuchos::typeName(A) << ">::Dot(A,B,dots): A and B must have the same number of rows.");
00374         if (nR*nC == 0) {
00375           return Teuchos::ScalarTraits<Scalar>::zero();
00376         }
00377         RCP<Node> node = A.getNode();
00378         ArrayRCP<const Scalar> Bdata = B.getValues(0),
00379                                         Adata = A.getValues(0);
00380         // prepare buffers
00381         ReadyBufferHelper<Node> rbh(node);
00382         rbh.begin();
00383         rbh.template addConstBuffer<Scalar>(Bdata);
00384         rbh.template addConstBuffer<Scalar>(Adata);
00385         rbh.end();
00386         DotOp2<Scalar> op;
00387         op.x = Adata(0,nR).getRawPtr();
00388         op.y = Bdata(0,nR).getRawPtr();
00389         return node->parallel_reduce(0,nR,op);
00390       }
00391 
00392       inline static void GEMM(MultiVector<Scalar,Node> &C, Teuchos::ETransp transA, Teuchos::ETransp transB, Scalar alpha, const MultiVector<Scalar,Node> &A, const MultiVector<Scalar,Node> &B, Scalar beta) {
00393         NodeGEMM<Scalar,Node>::GEMM(transA, transB, alpha, A, B, beta, C);
00394       }
00395 
00396       inline static void GESUM(MultiVector<Scalar,Node> &B, Scalar alpha, const MultiVector<Scalar,Node> &A, Scalar beta) {
00397         const size_t nR = A.getNumRows();
00398         const size_t nC = A.getNumCols();
00399         const size_t Astride = A.getStride();
00400         const size_t Bstride = B.getStride();
00401         TEST_FOR_EXCEPTION(nC != B.getNumCols() || nR != B.getNumRows(), std::runtime_error,
00402                            "DefaultArithmetic<" << Teuchos::typeName(A) << ">::GESUM(B,alpha,A,beta): A and B must have the same dimensions.");
00403         RCP<Node> node = B.getNode();
00404         ArrayRCP<const Scalar> Adata = A.getValues();
00405         ArrayRCP<Scalar>       Bdata = B.getValuesNonConst();
00406         // prepare buffers
00407         ReadyBufferHelper<Node> rbh(node);
00408         rbh.begin();
00409         rbh.template addConstBuffer<Scalar>(Adata);
00410         rbh.template addNonConstBuffer<Scalar>(Bdata);
00411         rbh.end();
00412         GESUMOp<Scalar> wdp;
00413         wdp.alpha = alpha;
00414         wdp.beta  = beta;
00415         if (Astride == nR && Bstride == nR) {
00416           // one kernel invocation for whole multivector
00417           wdp.y = Bdata(0,nR*nC).getRawPtr();
00418           wdp.x = Adata(0,nR*nC).getRawPtr();
00419           node->template parallel_for<GESUMOp<Scalar> >(0,nR*nC,wdp);
00420         }
00421         else {
00422           // one kernel invocation for each column
00423           for (size_t j=0; j<nC; ++j) {
00424             wdp.y = Bdata(0,nR).getRawPtr();
00425             wdp.x = Adata(0,nR).getRawPtr();
00426             node->template parallel_for<GESUMOp<Scalar> >(0,nR,wdp);
00427             Adata += Astride;
00428             Bdata += Bstride;
00429           }
00430         }
00431       }
00432 
00433       inline static void GESUM(MultiVector<Scalar,Node> &C, Scalar alpha, const MultiVector<Scalar,Node> &A, Scalar beta, const MultiVector<Scalar,Node> &B, Scalar gamma) {
00434         const size_t nR = A.getNumRows();
00435         const size_t nC = A.getNumCols();
00436         const size_t Astride = A.getStride();
00437         const size_t Bstride = B.getStride();
00438         const size_t Cstride = C.getStride();
00439         TEST_FOR_EXCEPTION(nC != B.getNumCols() || nR != B.getNumRows(), std::runtime_error,
00440                            "DefaultArithmetic<" << Teuchos::typeName(A) << ">::GESUM(C,alpha,A,beta,B,gamma): A and B must have the same dimensions.");
00441         RCP<Node> node = B.getNode();
00442         ArrayRCP<const Scalar> Adata = A.getValues(),
00443                                         Bdata = B.getValues();
00444         ArrayRCP<Scalar>       Cdata = C.getValuesNonConst();
00445         // prepare buffers
00446         ReadyBufferHelper<Node> rbh(node);
00447         rbh.begin();
00448         rbh.template addConstBuffer<Scalar>(Adata);
00449         rbh.template addConstBuffer<Scalar>(Bdata);
00450         rbh.template addNonConstBuffer<Scalar>(Cdata);
00451         rbh.end();
00452         GESUMOp3<Scalar> wdp;
00453         wdp.alpha = alpha;
00454         wdp.beta  = beta;
00455         wdp.gamma = gamma;
00456         if (Astride == nR && Bstride == nR && Cstride == nR) {
00457           // one kernel invocation for whole multivector
00458           wdp.z = Cdata(0,nR*nC).getRawPtr();
00459           wdp.y = Bdata(0,nR*nC).getRawPtr();
00460           wdp.x = Adata(0,nR*nC).getRawPtr();
00461           node->template parallel_for<GESUMOp3<Scalar> >(0,nR*nC,wdp);
00462         }
00463         else {
00464           // one kernel invocation for each column
00465           for (size_t j=0; j<nC; ++j) {
00466             wdp.z = Cdata(0,nR).getRawPtr();
00467             wdp.y = Bdata(0,nR).getRawPtr();
00468             wdp.x = Adata(0,nR).getRawPtr();
00469             node->template parallel_for<GESUMOp3<Scalar> >(0,nR,wdp);
00470             Adata += Astride;
00471             Bdata += Bstride;
00472             Cdata += Cstride;
00473           }
00474         }
00475       }
00476 
00477       inline static void Norm1(const MultiVector<Scalar,Node> &A, const ArrayView<typename Teuchos::ScalarTraits<Scalar>::magnitudeType> &norms) {
00478         const size_t nR = A.getNumRows();
00479         const size_t nC = A.getNumCols();
00480         const size_t Astride = A.getStride();
00481         TEST_FOR_EXCEPTION(nC > Teuchos::as<size_t>(norms.size()), std::runtime_error, 
00482             "DefaultArithmetic<" << Teuchos::typeName(A) << ">::Norm1(A,norms): norms must have length as large as number of columns of A.");
00483         if (nR*nC == 0) {
00484           std::fill( norms.begin(), norms.begin() + nC, Teuchos::ScalarTraits<typename Teuchos::ScalarTraits<Scalar>::magnitudeType>::zero() );
00485           return;
00486         }
00487         RCP<Node> node = A.getNode();
00488         ArrayRCP<const Scalar> Adata = A.getValues();
00489         // prepare buffers
00490         ReadyBufferHelper<Node> rbh(node);
00491         rbh.begin();
00492         rbh.template addConstBuffer<Scalar>(Adata);
00493         rbh.end();
00494         SumAbsOp<Scalar> op;
00495         for (size_t j=0; j<nC; ++j) {
00496           op.x = Adata(0,nR).getRawPtr();
00497           norms[j] = node->parallel_reduce(0,nR,op);
00498           Adata += Astride;
00499         }
00500       }
00501 
00502       inline static typename Teuchos::ScalarTraits<Scalar>::magnitudeType
00503       Norm1(const MultiVector<Scalar,Node> &A) {
00504         const size_t nR = A.getNumRows();
00505         const size_t nC = A.getNumCols();
00506         if (nR*nC == 0) {
00507           return Teuchos::ScalarTraits<typename Teuchos::ScalarTraits<Scalar>::magnitudeType>::zero();
00508         }
00509         RCP<Node> node = A.getNode();
00510         ArrayRCP<const Scalar> Adata = A.getValues(0);
00511         // prepare buffers
00512         ReadyBufferHelper<Node> rbh(node);
00513         rbh.begin();
00514         rbh.template addConstBuffer<Scalar>(Adata);
00515         rbh.end();
00516         SumAbsOp<Scalar> op;
00517         op.x = Adata(0,nR).getRawPtr();
00518         return node->parallel_reduce(0,nR,op);
00519       }
00520 
00521       inline static void Sum(const MultiVector<Scalar,Node> &A, const ArrayView<Scalar> &sums) {
00522         const size_t nR = A.getNumRows();
00523         const size_t nC = A.getNumCols();
00524         const size_t Astride = A.getStride();
00525         TEST_FOR_EXCEPTION(nC > (size_t)sums.size(), std::runtime_error, 
00526             "DefaultArithmetic<" << Teuchos::typeName(A) << ">::Sum(A,sums): sums must have length as large as number of columns of A.");
00527         if (nR*nC == 0) {
00528           std::fill( sums.begin(), sums.begin() + nC, Teuchos::ScalarTraits<Scalar>::zero() );
00529           return;
00530         }
00531         RCP<Node> node = A.getNode();
00532         ArrayRCP<const Scalar> Adata = A.getValues();
00533         // prepare buffers
00534         ReadyBufferHelper<Node> rbh(node);
00535         rbh.begin();
00536         rbh.template addConstBuffer<Scalar>(Adata);
00537         rbh.end();
00538         SumOp<Scalar> op;
00539         for (size_t j=0; j<nC; ++j) {
00540           op.x = Adata(0,nR).getRawPtr();
00541           sums[j] = node->parallel_reduce(0,nR,op);
00542           Adata += Astride;
00543         }
00544       }
00545 
00546       inline static Scalar Sum(const MultiVector<Scalar,Node> &A) {
00547         const size_t nR = A.getNumRows();
00548         const size_t nC = A.getNumCols();
00549         if (nR*nC == 0) {
00550           return Teuchos::ScalarTraits<Scalar>::zero();
00551         }
00552         RCP<Node> node = A.getNode();
00553         ArrayRCP<const Scalar> Adata = A.getValues(0);
00554         // prepare buffers
00555         ReadyBufferHelper<Node> rbh(node);
00556         rbh.begin();
00557         rbh.template addConstBuffer<Scalar>(Adata);
00558         rbh.end();
00559         SumOp<Scalar> op;
00560         op.x = Adata(0,nR).getRawPtr();
00561         return node->parallel_reduce(0,nR,op);
00562       }
00563 
00564       inline static typename Teuchos::ScalarTraits<Scalar>::magnitudeType NormInf(const MultiVector<Scalar,Node> &A) {
00565         const size_t nR = A.getNumRows();
00566         const size_t nC = A.getNumCols();
00567         if (nR*nC == 0) {
00568           return Teuchos::ScalarTraits<typename Teuchos::ScalarTraits<Scalar>::magnitudeType>::zero();
00569         }
00570         RCP<Node> node = A.getNode();
00571         ArrayRCP<const Scalar> Adata = A.getValues(0);
00572         // prepare buffers
00573         ReadyBufferHelper<Node> rbh(node);
00574         rbh.begin();
00575         rbh.template addConstBuffer<Scalar>(Adata);
00576         rbh.end();
00577         MaxAbsOp<Scalar> op;
00578         op.x = Adata(0,nR).getRawPtr();
00579         return node->parallel_reduce(0,nR,op);
00580       }
00581 
00582       inline static void NormInf(const MultiVector<Scalar,Node> &A, const ArrayView<typename Teuchos::ScalarTraits<Scalar>::magnitudeType> &norms) {
00583         const size_t nR = A.getNumRows();
00584         const size_t nC = A.getNumCols();
00585         const size_t Astride = A.getStride();
00586         TEST_FOR_EXCEPTION(nC > Teuchos::as<size_t>(norms.size()), std::runtime_error, 
00587             "DefaultArithmetic<" << Teuchos::typeName(A) << ">::NormInf(A,norms): norms must have length as large as number of columns of A.");
00588         if (nR*nC == 0) {
00589           std::fill( norms.begin(), norms.begin() + nC, Teuchos::ScalarTraits<typename Teuchos::ScalarTraits<Scalar>::magnitudeType>::zero() );
00590           return;
00591         }
00592         RCP<Node> node = A.getNode();
00593         ArrayRCP<const Scalar> Adata = A.getValues();
00594         // prepare buffers
00595         ReadyBufferHelper<Node> rbh(node);
00596         rbh.begin();
00597         rbh.template addConstBuffer<Scalar>(Adata);
00598         rbh.end();
00599         MaxAbsOp<Scalar> op;
00600         for (size_t j=0; j<nC; ++j) {
00601           op.x = Adata(0,nR).getRawPtr();
00602           norms[j] = node->parallel_reduce(0,nR,op);
00603           Adata += Astride;
00604         }
00605       }
00606 
00607       inline static void Norm2Squared(const MultiVector<Scalar,Node> &A, const ArrayView<typename Teuchos::ScalarTraits<Scalar>::magnitudeType> &norms) {
00608         const size_t nR = A.getNumRows();
00609         const size_t nC = A.getNumCols();
00610         const size_t Astride = A.getStride();
00611         TEST_FOR_EXCEPTION(nC > Teuchos::as<size_t>(norms.size()), std::runtime_error, 
00612             "DefaultArithmetic<" << Teuchos::typeName(A) << ">::Norm2Squared(A,norms): norms must have length as large as number of columns of A.");
00613         if (nR*nC == 0) {
00614           std::fill( norms.begin(), norms.begin() + nC, Teuchos::ScalarTraits<typename Teuchos::ScalarTraits<Scalar>::magnitudeType>::zero() );
00615           return;
00616         }
00617         RCP<Node> node = A.getNode();
00618         ArrayRCP<const Scalar> Adata = A.getValues();
00619         // prepare buffers
00620         ReadyBufferHelper<Node> rbh(node);
00621         rbh.begin();
00622         rbh.template addConstBuffer<Scalar>(Adata);
00623         rbh.end();
00624         DotOp1<Scalar> op;
00625         for (size_t j=0; j<nC; ++j) {
00626           op.x = Adata(0,nR).getRawPtr();
00627           norms[j] = node->parallel_reduce(0,nR,op);
00628           Adata += Astride;
00629         }
00630       }
00631 
00632       inline static typename Teuchos::ScalarTraits<Scalar>::magnitudeType 
00633       Norm2Squared(const MultiVector<Scalar,Node> &A) {
00634         const size_t nR = A.getNumRows();
00635         if (nR == 0) {
00636           return Teuchos::ScalarTraits<typename Teuchos::ScalarTraits<Scalar>::magnitudeType>::zero();
00637         }
00638         RCP<Node> node = A.getNode();
00639         ArrayRCP<const Scalar> Adata = A.getValues(0);
00640         // prepare buffers
00641         ReadyBufferHelper<Node> rbh(node);
00642         rbh.begin();
00643         rbh.template addConstBuffer<Scalar>(Adata);
00644         rbh.end();
00645         DotOp1<Scalar> op;
00646         op.x = Adata(0,nR).getRawPtr();
00647         return node->parallel_reduce(0,nR,op);
00648       }
00649 
00650       inline static typename Teuchos::ScalarTraits<Scalar>::magnitudeType
00651       WeightedNorm(const MultiVector<Scalar,Node> &A, const MultiVector<Scalar,Node> &weightVector) {
00652         const size_t nR = A.getNumRows();
00653         const size_t nC = A.getNumCols();
00654         if (nR*nC == 0) {
00655           return Teuchos::ScalarTraits<typename Teuchos::ScalarTraits<Scalar>::magnitudeType>::zero();
00656         }
00657         RCP<Node> node = A.getNode();
00658         ArrayRCP<const Scalar> Adata = A.getValues(0),
00659                                         Wdata = weightVector.getValues(0);
00660         // prepare buffers
00661         ReadyBufferHelper<Node> rbh(node);
00662         rbh.begin();
00663         rbh.template addConstBuffer<Scalar>(Adata);
00664         rbh.template addConstBuffer<Scalar>(Wdata);
00665         rbh.end();
00666         WeightNormOp<Scalar> op;
00667         op.x = Adata(0,nR).getRawPtr();
00668         op.w = Wdata(0,nR).getRawPtr();
00669         return node->parallel_reduce(0,nR,op);
00670       }
00671 
00672       inline static void WeightedNorm(const MultiVector<Scalar,Node> &A, const MultiVector<Scalar,Node> &weightVector, const ArrayView<typename Teuchos::ScalarTraits<Scalar>::magnitudeType> &norms) {
00673         const size_t nR = A.getNumRows();
00674         const size_t nC = A.getNumCols();
00675         const size_t Astride = A.getStride(),
00676                      Wstride = weightVector.getStride();
00677         TEST_FOR_EXCEPTION(nC > Teuchos::as<size_t>(norms.size()), std::runtime_error, 
00678             "DefaultArithmetic<" << Teuchos::typeName(A) << ">::Norm1(A,norms): norms must have length as large as number of columns of A.");
00679         if (nR*nC == 0) {
00680           std::fill( norms.begin(), norms.begin() + nC, Teuchos::ScalarTraits<typename Teuchos::ScalarTraits<Scalar>::magnitudeType>::zero() );
00681           return;
00682         }
00683         RCP<Node> node = A.getNode();
00684         ArrayRCP<const Scalar> Adata = A.getValues(),
00685                                         Wdata = weightVector.getValues();
00686         const bool OneW = (weightVector.getNumCols() == 1);
00687         // prepare buffers
00688         ReadyBufferHelper<Node> rbh(node);
00689         rbh.begin();
00690         rbh.template addConstBuffer<Scalar>(Adata);
00691         rbh.template addConstBuffer<Scalar>(Wdata);
00692         rbh.end();
00693         WeightNormOp<Scalar> op;
00694         if (OneW) {
00695           op.w = Wdata(0,nR).getRawPtr();
00696           for (size_t j=0; j<nC; ++j) {
00697             op.x = Adata(0,nR).getRawPtr();
00698             norms[j] = node->parallel_reduce(0,nR,op);
00699             Adata += Astride;
00700           }
00701         }
00702         else {
00703           for (size_t j=0; j<nC; ++j) {
00704             op.x = Adata(0,nR).getRawPtr();
00705             op.w = Wdata(0,nR).getRawPtr();
00706             norms[j] = node->parallel_reduce(0,nR,op);
00707             Adata += Astride;
00708             Wdata += Wstride;
00709           }
00710         }
00711       }
00712 
00713       inline static void Random(MultiVector<Scalar,Node> &A) {
00714         // TODO: consider adding rand() functionality to node
00715         // in the meantime, just generate random numbers via Teuchos and then copy to node
00716         typedef Teuchos::ScalarTraits<Scalar> SCT;
00717         const size_t stride = A.getStride();
00718         const size_t nR = A.getNumRows();
00719         const size_t nC = A.getNumCols();
00720         if (nR*nC == 0) return;
00721         RCP<Node> node = A.getNode();
00722         ArrayRCP<Scalar> Adata = A.getValuesNonConst();
00723         // we'll overwrite all data covered by the multivector, but not off-stride data
00724         // therefore, we are write-only only in the case that stride=nR
00725         ReadWriteOption rw = (stride == nR ? WriteOnly : ReadWrite);
00726         ArrayRCP<Scalar> mvdata = node->template viewBufferNonConst<Scalar>(rw,stride*(nC-1)+nR,Adata);
00727         for (size_t j=0; j<nC; ++j) {
00728           for (size_t i=0; i<nR; ++i) {
00729             mvdata[j*stride + i] = SCT::random();
00730           }
00731         }
00732         mvdata = null;
00733       }
00734 
00735       inline static void Abs(MultiVector<Scalar,Node> &A, const MultiVector<Scalar,Node> &B) {
00736         const size_t nR = A.getNumRows();
00737         const size_t nC = A.getNumCols();
00738         const size_t Astride = A.getStride();
00739         const size_t Bstride = B.getStride();
00740         TEST_FOR_EXCEPTION(nC != B.getNumCols() || nR != B.getNumRows(), std::runtime_error,
00741                            "DefaultArithmetic<" << Teuchos::typeName(A) << ">::Abs(A,B): A and B must have the same dimensions.");
00742         if (nC*nR == 0) return;
00743         RCP<Node> node = A.getNode();
00744         ArrayRCP<const Scalar> Bdata = B.getValues();
00745         ArrayRCP<Scalar>       Adata = A.getValuesNonConst();
00746         // prepare buffers
00747         ReadyBufferHelper<Node> rbh(node);
00748         rbh.begin();
00749         rbh.template addConstBuffer<Scalar>(Bdata);
00750         rbh.template addNonConstBuffer<Scalar>(Adata);
00751         rbh.end();
00752         // prepare op
00753         AbsOp<Scalar> wdp;
00754         if (Astride == nR && Bstride == nR) {
00755           // one kernel invocation for whole multivector assignment
00756           wdp.x = Adata(0,nR*nC).getRawPtr();
00757           wdp.y = Bdata(0,nR*nC).getRawPtr();
00758           node->template parallel_for<AbsOp<Scalar> >(0,nR*nC,wdp);
00759         }
00760         else {
00761           // one kernel invocation for each column
00762           for (size_t j=0; j<nC; ++j) {
00763             wdp.x = Adata(0,nR).getRawPtr();
00764             wdp.y = Bdata(0,nR).getRawPtr();
00765             node->template parallel_for<AbsOp<Scalar> >(0,nR,wdp);
00766             Adata += Astride;
00767             Bdata += Bstride;
00768           }
00769         }
00770       }
00771 
00772       inline static void Scale(MultiVector<Scalar,Node> &B, Scalar alpha, const MultiVector<Scalar,Node> &A) {
00773         const size_t nR = A.getNumRows();
00774         const size_t nC = A.getNumCols();
00775         const size_t Astride = A.getStride();
00776         const size_t Bstride = B.getStride();
00777         TEST_FOR_EXCEPTION(nC != B.getNumCols() || nR != B.getNumRows(), std::runtime_error,
00778                            "DefaultArithmetic<" << Teuchos::typeName(A) << ">::Scale(B,alpha,A): A and B must have the same dimensions.");
00779         RCP<Node> node = B.getNode();
00780         ArrayRCP<const Scalar> Adata = A.getValues();
00781         ArrayRCP<Scalar>       Bdata = B.getValuesNonConst();
00782         // prepare buffers
00783         ReadyBufferHelper<Node> rbh(node);
00784         rbh.begin();
00785         rbh.template addConstBuffer<Scalar>(Adata);
00786         rbh.template addNonConstBuffer<Scalar>(Bdata);
00787         rbh.end();
00788         MVScaleOp<Scalar> wdp;
00789         wdp.alpha = alpha;
00790         if (Astride == nR && Bstride == nR) {
00791           // one kernel invocation for whole multivector
00792           wdp.x = Bdata(0,nR*nC).getRawPtr();
00793           wdp.y = Adata(0,nR*nC).getRawPtr();
00794           node->template parallel_for<MVScaleOp<Scalar> >(0,nR*nC,wdp);
00795         }
00796         else {
00797           // one kernel invocation for each column
00798           for (size_t j=0; j<nC; ++j) {
00799             wdp.x = Bdata(0,nR).getRawPtr();
00800             wdp.y = Adata(0,nR).getRawPtr();
00801             node->template parallel_for<MVScaleOp<Scalar> >(0,nR,wdp);
00802             Adata += Astride;
00803             Bdata += Bstride;
00804           }
00805         }
00806       }
00807 
00808       inline static void Scale(MultiVector<Scalar,Node> &A, Scalar alpha) {
00809         const size_t nR = A.getNumRows();
00810         const size_t nC = A.getNumCols();
00811         const size_t stride = A.getStride();
00812         RCP<Node> node = A.getNode();
00813         ArrayRCP<Scalar> data = A.getValuesNonConst();
00814         // prepare buffers
00815         ReadyBufferHelper<Node> rbh(node);
00816         rbh.begin();
00817         rbh.template addNonConstBuffer<Scalar>(data);
00818         rbh.end();
00819         // prepare op
00820         SingleScaleOp<Scalar> wdp;
00821         wdp.alpha = alpha;
00822         if (stride == nR) {
00823           // one kernel invocation for whole multivector
00824           wdp.x = data(0,nR*nC).getRawPtr();
00825           node->template parallel_for<SingleScaleOp<Scalar> >(0,nR*nC,wdp);
00826         }
00827         else {
00828           // one kernel invocation for each column
00829           for (size_t j=0; j<nC; ++j) {
00830             wdp.x = data(0,nR).getRawPtr();
00831             node->template parallel_for<SingleScaleOp<Scalar> >(0,nR,wdp);
00832             data += stride;
00833           }
00834         }
00835       }
00836 
00837       inline static void initializeValues(MultiVector<Scalar,Node> &A,
00838                                    size_t numRows, size_t numCols, 
00839                                    const ArrayRCP<Scalar> &values,
00840                                    size_t stride) {
00841         A.initializeValues(numRows,numCols,values,stride);
00842       }
00843 
00844       inline static ArrayRCP<const Scalar> getValues(const MultiVector<Scalar,Node> &A) {
00845         return A.getValues();
00846       }
00847 
00848       inline static ArrayRCP<const Scalar> getValues(const MultiVector<Scalar,Node> &A, size_t j) {
00849         return A.getValues(j);
00850       }
00851 
00852       inline static ArrayRCP<Scalar> getValuesNonConst(MultiVector<Scalar,Node> &A) {
00853         return A.getValuesNonConst();
00854       }
00855 
00856       inline static ArrayRCP<Scalar> getValuesNonConst(MultiVector<Scalar,Node> &A, size_t j) {
00857         return A.getValuesNonConst(j);
00858       }
00859 
00860       inline static size_t getNumRows(const MultiVector<Scalar,Node> &A) {
00861         return A.getNumRows();
00862       }
00863 
00864       inline static size_t getNumCols(const MultiVector<Scalar,Node> &A) {
00865         return A.getNumCols();
00866       }
00867 
00868       inline static size_t getStride(const MultiVector<Scalar,Node> &A) {
00869         return A.getStride();
00870       }
00871 
00872       inline static RCP<Node> getNode(const MultiVector<Scalar,Node> &A) {
00873         return A.getNode();
00874       }
00875 
00876   };
00877 
00878 }
00879 
00880 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends