Kokkos Node API and Local Linear Algebra Kernels Version of the Day
Kokkos_DefaultArithmetic.hpp
00001 //@HEADER
00002 // ************************************************************************
00003 // 
00004 //          Kokkos: Node API and Parallel Node Kernels
00005 //              Copyright (2009) Sandia Corporation
00006 // 
00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
00008 // license for use of this work by or on behalf of the U.S. Government.
00009 // 
00010 // This library is free software; you can redistribute it and/or modify
00011 // it under the terms of the GNU Lesser General Public License as
00012 // published by the Free Software Foundation; either version 2.1 of the
00013 // License, or (at your option) any later version.
00014 //  
00015 // This library is distributed in the hope that it will be useful, but
00016 // WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 // Lesser General Public License for more details.
00019 //  
00020 // You should have received a copy of the GNU Lesser General Public
00021 // License along with this library; if not, write to the Free Software
00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00023 // USA
00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
00025 // 
00026 // ************************************************************************
00027 //@HEADER
00028 
00029 #ifndef KOKKOS_DEFAULTARITHMETIC_H
00030 #define KOKKOS_DEFAULTARITHMETIC_H
00031 
00032 #include <Teuchos_BLAS_types.hpp>
00033 #include <Teuchos_TestForException.hpp>
00034 #include <Teuchos_TypeNameTraits.hpp>
00035 #include <Teuchos_ArrayView.hpp>
00036 #include <Teuchos_Tuple.hpp>
00037 #include <stdexcept>
00038 
00039 #include "Kokkos_MultiVector.hpp"
00040 #include "Kokkos_MultiVectorKernelOps.hpp"
00041 #include "Kokkos_NodeHelpers.hpp"
00042 #ifdef HAVE_KOKKOS_TBB
00043 #include "Kokkos_TBBNode.hpp"
00044 #endif
00045 #ifdef HAVE_KOKKOS_THREADPOOL
00046 #include "Kokkos_TPINode.hpp"
00047 #endif
00048 #ifdef HAVE_KOKKOS_THRUST
00049 #include "Kokkos_ThrustGPUNode.hpp"
00050 #include "cublas.h"
00051 #endif
00052 #include "Kokkos_SerialNode.hpp"
00053 #include <Teuchos_BLAS.hpp>
00054 
00055 namespace Kokkos {
00056 
00057   // Class for providing GEMM for a particular Node
00058   template <typename Scalar, typename Node> 
00059   struct NodeGEMM {
00060     public:
00061       static void GEMM(Teuchos::ETransp transA, Teuchos::ETransp transB, Scalar alpha, const MultiVector<Scalar,Node> &A, const MultiVector<Scalar,Node> &B, Scalar beta, MultiVector<Scalar,Node> &C) {
00062         TEST_FOR_EXCEPT(true);
00063       }
00064   };
00065 
00066   template <typename Scalar> 
00067   struct NodeGEMM<Scalar,SerialNode> {
00068     public:
00069       static void GEMM(Teuchos::ETransp transA, Teuchos::ETransp transB, Scalar alpha, const MultiVector<Scalar,SerialNode> &A, const MultiVector<Scalar,SerialNode> &B, Scalar beta, MultiVector<Scalar,SerialNode> &C) {
00070         Teuchos::BLAS<int,Scalar> blas;
00071         const int m = Teuchos::as<int>(C.getNumRows()),
00072                   n = Teuchos::as<int>(C.getNumCols()),
00073                   k = (transA == Teuchos::NO_TRANS ? A.getNumCols() : A.getNumRows()),
00074                   lda = Teuchos::as<int>(A.getStride()),
00075                   ldb = Teuchos::as<int>(B.getStride()),
00076                   ldc = Teuchos::as<int>(C.getStride());
00077         blas.GEMM(transA, transB, m, n, k, alpha, A.getValues().getRawPtr(), lda, B.getValues().getRawPtr(), ldb, beta, C.getValuesNonConst().getRawPtr(), ldc);
00078       }
00079   };
00080 
00081 #ifdef HAVE_KOKKOS_TBB
00082   template <typename Scalar> 
00083   struct NodeGEMM<Scalar,TBBNode> {
00084     public:
00085       static void GEMM(Teuchos::ETransp transA, Teuchos::ETransp transB, Scalar alpha, const MultiVector<Scalar,TBBNode> &A, const MultiVector<Scalar,TBBNode> &B, Scalar beta, MultiVector<Scalar,TBBNode> &C) {
00086         Teuchos::BLAS<int,Scalar> blas;
00087         const int m = Teuchos::as<int>(C.getNumRows()),
00088                   n = Teuchos::as<int>(C.getNumCols()),
00089                   k = (transA == Teuchos::NO_TRANS ? A.getNumCols() : A.getNumRows()),
00090                   lda = Teuchos::as<int>(A.getStride()),
00091                   ldb = Teuchos::as<int>(B.getStride()),
00092                   ldc = Teuchos::as<int>(C.getStride());
00093         blas.GEMM(transA, transB, m, n, k, alpha, A.getValues().getRawPtr(), lda, B.getValues().getRawPtr(), ldb, beta, C.getValuesNonConst().getRawPtr(), ldc);
00094       }
00095   };
00096 #endif
00097 
00098 #ifdef HAVE_KOKKOS_THREADPOOL
00099   template <typename Scalar> 
00100   struct NodeGEMM<Scalar,TPINode> {
00101     public:
00102       static void GEMM(Teuchos::ETransp transA, Teuchos::ETransp transB, Scalar alpha, const MultiVector<Scalar,TPINode> &A, const MultiVector<Scalar,TPINode> &B, Scalar beta, MultiVector<Scalar,TPINode> &C) {
00103 #ifndef KOKKOS_DONT_BLOCK_TPI_GEMM
00104         TPI_Block();
00105 #endif
00106         Teuchos::BLAS<int,Scalar> blas;
00107         const int m = Teuchos::as<int>(C.getNumRows()),
00108                   n = Teuchos::as<int>(C.getNumCols()),
00109                   k = (transA == Teuchos::NO_TRANS ? A.getNumCols() : A.getNumRows()),
00110                   lda = Teuchos::as<int>(A.getStride()),
00111                   ldb = Teuchos::as<int>(B.getStride()),
00112                   ldc = Teuchos::as<int>(C.getStride());
00113         blas.GEMM(transA, transB, m, n, k, alpha, A.getValues().getRawPtr(), lda, B.getValues().getRawPtr(), ldb, beta, C.getValuesNonConst().getRawPtr(), ldc);
00114 #ifndef KOKKOS_DONT_BLOCK_TPI_GEMM
00115         TPI_Unblock();
00116 #endif
00117       }
00118   };
00119 #endif
00120 
00121 #ifdef HAVE_KOKKOS_THRUST 
00122   template <typename Scalar>
00123   struct NodeGEMM<Scalar,ThrustGPUNode> {
00124     public:
00125       static void GEMM(Teuchos::ETransp transA, Teuchos::ETransp transB, Scalar alpha, const MultiVector<Scalar,ThrustGPUNode> &A, const MultiVector<Scalar,ThrustGPUNode> &B, Scalar beta, MultiVector<Scalar,ThrustGPUNode> &C) {
00126         TEST_FOR_EXCEPTION(true, std::logic_error, "NodeGEMM: ThrustGPUNode has no support for GEMM operations over Scalar=" << Teuchos::typeName(alpha) << ".");
00127       }
00128   };
00129 
00130 
00131 #ifdef HAVE_KOKKOS_CUDA_FLOAT
00132   template <>
00133   struct NodeGEMM<float,ThrustGPUNode> {
00134     public:
00135       static void GEMM(Teuchos::ETransp transA, Teuchos::ETransp transB, float alpha, const MultiVector<float,ThrustGPUNode> &A, const MultiVector<float,ThrustGPUNode> &B, float beta, MultiVector<float,ThrustGPUNode> &C) {
00136         const int m = Teuchos::as<int>(C.getNumRows()),
00137                   n = Teuchos::as<int>(C.getNumCols()),
00138                   k = (transA == Teuchos::NO_TRANS ? A.getNumCols() : A.getNumRows()),
00139                   lda = Teuchos::as<int>(A.getStride()),
00140                   ldb = Teuchos::as<int>(B.getStride()),
00141                   ldc = Teuchos::as<int>(C.getStride());
00142         const char char_transA = (transA == Teuchos::NO_TRANS ? 'N' : 'T'),
00143                    char_transB = (transB == Teuchos::NO_TRANS ? 'N' : 'T');
00144         cublasSgemm(char_transA, char_transB, m, n, k, alpha, A.getValues().getRawPtr(), lda, B.getValues().getRawPtr(), ldb, beta, C.getValuesNonConst().getRawPtr(), ldc);
00145 #ifdef HAVE_KOKKOS_DEBUG
00146         cublasStatus info = cublasGetError();
00147         TEST_FOR_EXCEPTION( info != CUBLAS_STATUS_SUCCESS, std::runtime_error, "cublasSgemm failed with status " << info << "." );
00148 #endif
00149       }
00150   };
00151 #endif
00152 
00153 #ifdef HAVE_KOKKOS_CUDA_DOUBLE
00154   template <>
00155   struct NodeGEMM<double,ThrustGPUNode> {
00156     public:
00157       static void GEMM(Teuchos::ETransp transA, Teuchos::ETransp transB, double alpha, const MultiVector<double,ThrustGPUNode> &A, const MultiVector<double,ThrustGPUNode> &B, double beta, MultiVector<double,ThrustGPUNode> &C) {
00158         const int m = Teuchos::as<int>(C.getNumRows()),
00159                   n = Teuchos::as<int>(C.getNumCols()),
00160                   k = (transA == Teuchos::NO_TRANS ? A.getNumCols() : A.getNumRows()),
00161                   lda = Teuchos::as<int>(A.getStride()),
00162                   ldb = Teuchos::as<int>(B.getStride()),
00163                   ldc = Teuchos::as<int>(C.getStride());
00164         const char char_transA = (transA == Teuchos::NO_TRANS ? 'N' : 'T'),
00165                    char_transB = (transB == Teuchos::NO_TRANS ? 'N' : 'T');
00166         cublasDgemm(char_transA, char_transB, m, n, k, alpha, A.getValues().getRawPtr(), lda, B.getValues().getRawPtr(), ldb, beta, C.getValuesNonConst().getRawPtr(), ldc);
00167 #ifdef HAVE_KOKKOS_DEBUG
00168         cublasStatus info = cublasGetError();
00169         TEST_FOR_EXCEPTION( info != CUBLAS_STATUS_SUCCESS, std::runtime_error, "cublasDgemm failed with status " << info << "." );
00170 #endif
00171       }
00172   };
00173 #endif
00174 
00175 #endif
00176 
00179   template <class MV>
00180   class DefaultArithmetic {
00181     // nothing here
00182   };
00183 
00186   template <class Scalar, class Node>
00187   class DefaultArithmetic<MultiVector<Scalar,Node> > {
00188 
00189     public:
00191       inline static void Init(MultiVector<Scalar,Node> &A, Scalar alpha) {
00192         const size_t nR = A.getNumRows();
00193         const size_t nC = A.getNumCols();
00194         const size_t stride = A.getStride();
00195         RCP<Node> node = A.getNode();
00196         ArrayRCP<Scalar> data = A.getValuesNonConst();
00197         // prepare buffers
00198         ReadyBufferHelper<Node> rbh(node);
00199         rbh.begin();
00200         rbh.template addNonConstBuffer<Scalar>(data);
00201         rbh.end();
00202         // prepare op
00203         InitOp<Scalar> wdp;
00204         wdp.alpha = alpha;
00205         if (stride == nR) {
00206           // one kernel invocation for whole multivector
00207           wdp.x = data(0,nR*nC).getRawPtr();
00208           node->template parallel_for<InitOp<Scalar> >(0,nR*nC,wdp);
00209         }
00210         else {
00211           // one kernel invocation for each column
00212           for (size_t j=0; j<nC; ++j) {
00213             wdp.x = data(0,nR).getRawPtr();
00214             node->template parallel_for<InitOp<Scalar> >(0,nR,wdp);
00215             data += stride;
00216           }
00217         }
00218       }
00219 
00221       inline static void Recip(MultiVector<Scalar,Node> &A, const MultiVector<Scalar,Node> &B) {
00222         const size_t nR = A.getNumRows();
00223         const size_t nC = A.getNumCols();
00224         const size_t Astride = A.getStride();
00225         const size_t Bstride = B.getStride();
00226         TEST_FOR_EXCEPTION(nC != B.getNumCols() || nR != B.getNumRows(), std::runtime_error,
00227                            "DefaultArithmetic<" << Teuchos::typeName(A) << ">::Recip(A,B): A and B must have the same dimensions.");
00228         RCP<Node> node = B.getNode();
00229         ArrayRCP<const Scalar> Bdata = B.getValues();
00230         ArrayRCP<Scalar>       Adata = A.getValuesNonConst();
00231         // prepare buffers
00232         ReadyBufferHelper<Node> rbh(node);
00233         rbh.begin();
00234         rbh.template addConstBuffer<Scalar>(Bdata);
00235         rbh.template addNonConstBuffer<Scalar>(Adata);
00236         rbh.end();
00237         RecipOp<Scalar> wdp;
00238         if (A.getStride() == nR && B.getStride() == nR) {
00239           // one kernel invocation for whole multivector
00240           wdp.scale = Bdata(0,nR*nC).getRawPtr();
00241           wdp.x = Adata(0,nR*nC).getRawPtr();
00242           node->template parallel_for<RecipOp<Scalar> >(0,nR*nC,wdp);
00243         }
00244         else {
00245           // one kernel invocation for each column
00246           for (size_t j=0; j<nC; ++j) {
00247             wdp.x = Adata(0,nR).getRawPtr();
00248             wdp.scale = Bdata(0,nR).getRawPtr();
00249             node->template parallel_for<RecipOp<Scalar> >(0,nR,wdp);
00250             Adata += Astride;
00251             Bdata += Bstride;
00252           }
00253         }
00254       }
00255 
00257 
00259       inline static void ElemMult(MultiVector<Scalar,Node> &C, Scalar scalarC,
00260                                Scalar scalarAB,
00261                                const MultiVector<Scalar,Node> &A,
00262                                const MultiVector<Scalar,Node> &B) {
00263         const size_t nR_A = A.getNumRows();
00264         const size_t nC_A = A.getNumCols();
00265         TEST_FOR_EXCEPTION(nC_A != 1, std::runtime_error,
00266                            "DefaultArithmetic<"<<Teuchos::typeName(A) << ">::ElemMult(C,sC,sAB,A,B): A must have just 1 column.");
00267         const size_t Cstride = C.getStride();
00268         const size_t Bstride = B.getStride();
00269         const size_t nC_C = C.getNumCols();
00270         const size_t nR_C = C.getNumRows();
00271         TEST_FOR_EXCEPTION(nC_C != B.getNumCols() || nR_A != B.getNumRows() || nR_C != B.getNumRows(), std::runtime_error,
00272                            "DefaultArithmetic<" << Teuchos::typeName(A) << ">::ElemMult(C,sC,sAB,A,B): A, B and C must have the same num-rows, B and C must have the same num-cols.");
00273         RCP<Node> node = B.getNode();
00274         ArrayRCP<Scalar> Cdata = C.getValuesNonConst();
00275         ArrayRCP<const Scalar> Bdata = B.getValues();
00276         ArrayRCP<const Scalar>       Adata = A.getValues();
00277         // prepare buffers
00278         ReadyBufferHelper<Node> rbh(node);
00279         rbh.begin();
00280         rbh.template addNonConstBuffer<Scalar>(Cdata);
00281         rbh.template addConstBuffer<Scalar>(Bdata);
00282         rbh.template addConstBuffer<Scalar>(Adata);
00283         rbh.end();
00284         MVElemMultOp<Scalar> wdp;
00285         wdp.scalarX = scalarC;
00286         wdp.scalarYZ = scalarAB;
00287         // one kernel invocation for each column
00288         for (size_t j=0; j<nC_C; ++j) {
00289           wdp.x = Cdata(0,nR_C).getRawPtr();
00290           wdp.y = Adata(0,nR_C).getRawPtr();
00291           wdp.z = Bdata(0,nR_C).getRawPtr();
00292           node->template parallel_for<MVElemMultOp<Scalar> >(0,nR_C,wdp);
00293           Cdata += Cstride;
00294           Bdata += Bstride;
00295         }
00296       }
00297 
00299       inline static void Assign(MultiVector<Scalar,Node> &A, const MultiVector<Scalar,Node> &B) {
00300         const size_t nR = A.getNumRows();
00301         const size_t nC = A.getNumCols();
00302         const size_t Astride = A.getStride();
00303         const size_t Bstride = B.getStride();
00304         TEST_FOR_EXCEPTION(nC != B.getNumCols() || nR != B.getNumRows(), std::runtime_error,
00305                            "DefaultArithmetic<" << Teuchos::typeName(A) << ">::Assign(A,B): A and B must have the same dimensions.");
00306         if (nC*nR == 0) return;
00307         RCP<Node> node = A.getNode();
00308         ArrayRCP<const Scalar> Bdata = B.getValues();
00309         ArrayRCP<Scalar>       Adata = A.getValuesNonConst();
00310         // prepare buffers
00311         ReadyBufferHelper<Node> rbh(node);
00312         rbh.begin();
00313         rbh.template addConstBuffer<Scalar>(Bdata);
00314         rbh.template addNonConstBuffer<Scalar>(Adata);
00315         rbh.end();
00316         // prepare op
00317         AssignOp<Scalar> wdp;
00318         if (Astride == nR && Bstride == nR) {
00319           // one kernel invocation for whole multivector assignment
00320           wdp.x = Adata(0,nR*nC).getRawPtr();
00321           wdp.y = Bdata(0,nR*nC).getRawPtr();
00322           node->template parallel_for<AssignOp<Scalar> >(0,nR*nC,wdp);
00323         }
00324         else {
00325           // one kernel invocation for each column
00326           for (size_t j=0; j<nC; ++j) {
00327             wdp.x = Adata(0,nR).getRawPtr();
00328             wdp.y = Bdata(0,nR).getRawPtr();
00329             node->template parallel_for<AssignOp<Scalar> >(0,nR,wdp);
00330             Adata += Astride;
00331             Bdata += Bstride;
00332           }
00333         }
00334       }
00335 
00336       inline static void Dot(const MultiVector<Scalar,Node> &A, const MultiVector<Scalar,Node> &B, const ArrayView<Scalar> &dots) {
00337         const size_t nR = A.getNumRows();
00338         const size_t nC = A.getNumCols();
00339         const size_t Astride = A.getStride();
00340         const size_t Bstride = B.getStride();
00341         TEST_FOR_EXCEPTION(nC != B.getNumCols() || nR != B.getNumRows(), std::runtime_error,
00342                            "DefaultArithmetic<" << Teuchos::typeName(A) << ">::Dot(A,B,dots): A and B must have the same dimensions.");
00343         TEST_FOR_EXCEPTION(nC > Teuchos::as<size_t>(dots.size()), std::runtime_error, 
00344             "DefaultArithmetic<" << Teuchos::typeName(A) << ">::Dot(A,B,dots): dots must have length as large as number of columns of A and B.");
00345         if (nR*nC == 0) {
00346           std::fill( dots.begin(), dots.begin() + nC, Teuchos::ScalarTraits<Scalar>::zero() );
00347           return;
00348         }
00349         RCP<Node> node = A.getNode();
00350         ArrayRCP<const Scalar> Bdata = B.getValues(),
00351                                         Adata = A.getValues();
00352         // prepare buffers
00353         ReadyBufferHelper<Node> rbh(node);
00354         rbh.begin();
00355         rbh.template addConstBuffer<Scalar>(Bdata);
00356         rbh.template addConstBuffer<Scalar>(Adata);
00357         rbh.end();
00358         DotOp2<Scalar> op;
00359         for (size_t j=0; j<nC; ++j) {
00360           op.x = Adata(0,nR).getRawPtr();
00361           op.y = Bdata(0,nR).getRawPtr();
00362           dots[j] = node->parallel_reduce(0,nR,op);
00363           Adata += Astride;
00364           Bdata += Bstride;
00365         }
00366       }
00367 
00368       inline static Scalar Dot(const MultiVector<Scalar,Node> &A, const MultiVector<Scalar,Node> &B) {
00369         const size_t nR = A.getNumRows();
00370         const size_t nC = A.getNumCols();
00371         TEST_FOR_EXCEPTION(nR != B.getNumRows(), std::runtime_error,
00372                            "DefaultArithmetic<" << Teuchos::typeName(A) << ">::Dot(A,B,dots): A and B must have the same number of rows.");
00373         if (nR*nC == 0) {
00374           return Teuchos::ScalarTraits<Scalar>::zero();
00375         }
00376         RCP<Node> node = A.getNode();
00377         ArrayRCP<const Scalar> Bdata = B.getValues(0),
00378                                         Adata = A.getValues(0);
00379         // prepare buffers
00380         ReadyBufferHelper<Node> rbh(node);
00381         rbh.begin();
00382         rbh.template addConstBuffer<Scalar>(Bdata);
00383         rbh.template addConstBuffer<Scalar>(Adata);
00384         rbh.end();
00385         DotOp2<Scalar> op;
00386         op.x = Adata(0,nR).getRawPtr();
00387         op.y = Bdata(0,nR).getRawPtr();
00388         return node->parallel_reduce(0,nR,op);
00389       }
00390 
00391       inline static void GEMM(MultiVector<Scalar,Node> &C, Teuchos::ETransp transA, Teuchos::ETransp transB, Scalar alpha, const MultiVector<Scalar,Node> &A, const MultiVector<Scalar,Node> &B, Scalar beta) {
00392         NodeGEMM<Scalar,Node>::GEMM(transA, transB, alpha, A, B, beta, C);
00393       }
00394 
00395       inline static void GESUM(MultiVector<Scalar,Node> &B, Scalar alpha, const MultiVector<Scalar,Node> &A, Scalar beta) {
00396         const size_t nR = A.getNumRows();
00397         const size_t nC = A.getNumCols();
00398         const size_t Astride = A.getStride();
00399         const size_t Bstride = B.getStride();
00400         TEST_FOR_EXCEPTION(nC != B.getNumCols() || nR != B.getNumRows(), std::runtime_error,
00401                            "DefaultArithmetic<" << Teuchos::typeName(A) << ">::GESUM(B,alpha,A,beta): A and B must have the same dimensions.");
00402         RCP<Node> node = B.getNode();
00403         ArrayRCP<const Scalar> Adata = A.getValues();
00404         ArrayRCP<Scalar>       Bdata = B.getValuesNonConst();
00405         // prepare buffers
00406         ReadyBufferHelper<Node> rbh(node);
00407         rbh.begin();
00408         rbh.template addConstBuffer<Scalar>(Adata);
00409         rbh.template addNonConstBuffer<Scalar>(Bdata);
00410         rbh.end();
00411         GESUMOp<Scalar> wdp;
00412         wdp.alpha = alpha;
00413         wdp.beta  = beta;
00414         if (Astride == nR && Bstride == nR) {
00415           // one kernel invocation for whole multivector
00416           wdp.y = Bdata(0,nR*nC).getRawPtr();
00417           wdp.x = Adata(0,nR*nC).getRawPtr();
00418           node->template parallel_for<GESUMOp<Scalar> >(0,nR*nC,wdp);
00419         }
00420         else {
00421           // one kernel invocation for each column
00422           for (size_t j=0; j<nC; ++j) {
00423             wdp.y = Bdata(0,nR).getRawPtr();
00424             wdp.x = Adata(0,nR).getRawPtr();
00425             node->template parallel_for<GESUMOp<Scalar> >(0,nR,wdp);
00426             Adata += Astride;
00427             Bdata += Bstride;
00428           }
00429         }
00430       }
00431 
00432       inline static void GESUM(MultiVector<Scalar,Node> &C, Scalar alpha, const MultiVector<Scalar,Node> &A, Scalar beta, const MultiVector<Scalar,Node> &B, Scalar gamma) {
00433         const size_t nR = A.getNumRows();
00434         const size_t nC = A.getNumCols();
00435         const size_t Astride = A.getStride();
00436         const size_t Bstride = B.getStride();
00437         const size_t Cstride = C.getStride();
00438         TEST_FOR_EXCEPTION(nC != B.getNumCols() || nR != B.getNumRows(), std::runtime_error,
00439                            "DefaultArithmetic<" << Teuchos::typeName(A) << ">::GESUM(C,alpha,A,beta,B,gamma): A and B must have the same dimensions.");
00440         RCP<Node> node = B.getNode();
00441         ArrayRCP<const Scalar> Adata = A.getValues(),
00442                                         Bdata = B.getValues();
00443         ArrayRCP<Scalar>       Cdata = C.getValuesNonConst();
00444         // prepare buffers
00445         ReadyBufferHelper<Node> rbh(node);
00446         rbh.begin();
00447         rbh.template addConstBuffer<Scalar>(Adata);
00448         rbh.template addConstBuffer<Scalar>(Bdata);
00449         rbh.template addNonConstBuffer<Scalar>(Cdata);
00450         rbh.end();
00451         GESUMOp3<Scalar> wdp;
00452         wdp.alpha = alpha;
00453         wdp.beta  = beta;
00454         wdp.gamma = gamma;
00455         if (Astride == nR && Bstride == nR && Cstride == nR) {
00456           // one kernel invocation for whole multivector
00457           wdp.z = Cdata(0,nR*nC).getRawPtr();
00458           wdp.y = Bdata(0,nR*nC).getRawPtr();
00459           wdp.x = Adata(0,nR*nC).getRawPtr();
00460           node->template parallel_for<GESUMOp3<Scalar> >(0,nR*nC,wdp);
00461         }
00462         else {
00463           // one kernel invocation for each column
00464           for (size_t j=0; j<nC; ++j) {
00465             wdp.z = Cdata(0,nR).getRawPtr();
00466             wdp.y = Bdata(0,nR).getRawPtr();
00467             wdp.x = Adata(0,nR).getRawPtr();
00468             node->template parallel_for<GESUMOp3<Scalar> >(0,nR,wdp);
00469             Adata += Astride;
00470             Bdata += Bstride;
00471             Cdata += Cstride;
00472           }
00473         }
00474       }
00475 
00476       inline static void Norm1(const MultiVector<Scalar,Node> &A, const ArrayView<typename Teuchos::ScalarTraits<Scalar>::magnitudeType> &norms) {
00477         const size_t nR = A.getNumRows();
00478         const size_t nC = A.getNumCols();
00479         const size_t Astride = A.getStride();
00480         TEST_FOR_EXCEPTION(nC > Teuchos::as<size_t>(norms.size()), std::runtime_error, 
00481             "DefaultArithmetic<" << Teuchos::typeName(A) << ">::Norm1(A,norms): norms must have length as large as number of columns of A.");
00482         if (nR*nC == 0) {
00483           std::fill( norms.begin(), norms.begin() + nC, Teuchos::ScalarTraits<typename Teuchos::ScalarTraits<Scalar>::magnitudeType>::zero() );
00484           return;
00485         }
00486         RCP<Node> node = A.getNode();
00487         ArrayRCP<const Scalar> Adata = A.getValues();
00488         // prepare buffers
00489         ReadyBufferHelper<Node> rbh(node);
00490         rbh.begin();
00491         rbh.template addConstBuffer<Scalar>(Adata);
00492         rbh.end();
00493         SumAbsOp<Scalar> op;
00494         for (size_t j=0; j<nC; ++j) {
00495           op.x = Adata(0,nR).getRawPtr();
00496           norms[j] = node->parallel_reduce(0,nR,op);
00497           Adata += Astride;
00498         }
00499       }
00500 
00501       inline static typename Teuchos::ScalarTraits<Scalar>::magnitudeType
00502       Norm1(const MultiVector<Scalar,Node> &A) {
00503         const size_t nR = A.getNumRows();
00504         const size_t nC = A.getNumCols();
00505         if (nR*nC == 0) {
00506           return Teuchos::ScalarTraits<typename Teuchos::ScalarTraits<Scalar>::magnitudeType>::zero();
00507         }
00508         RCP<Node> node = A.getNode();
00509         ArrayRCP<const Scalar> Adata = A.getValues(0);
00510         // prepare buffers
00511         ReadyBufferHelper<Node> rbh(node);
00512         rbh.begin();
00513         rbh.template addConstBuffer<Scalar>(Adata);
00514         rbh.end();
00515         SumAbsOp<Scalar> op;
00516         op.x = Adata(0,nR).getRawPtr();
00517         return node->parallel_reduce(0,nR,op);
00518       }
00519 
00520       inline static void Sum(const MultiVector<Scalar,Node> &A, const ArrayView<Scalar> &sums) {
00521         const size_t nR = A.getNumRows();
00522         const size_t nC = A.getNumCols();
00523         const size_t Astride = A.getStride();
00524         TEST_FOR_EXCEPTION(nC > (size_t)sums.size(), std::runtime_error, 
00525             "DefaultArithmetic<" << Teuchos::typeName(A) << ">::Sum(A,sums): sums must have length as large as number of columns of A.");
00526         if (nR*nC == 0) {
00527           std::fill( sums.begin(), sums.begin() + nC, Teuchos::ScalarTraits<Scalar>::zero() );
00528           return;
00529         }
00530         RCP<Node> node = A.getNode();
00531         ArrayRCP<const Scalar> Adata = A.getValues();
00532         // prepare buffers
00533         ReadyBufferHelper<Node> rbh(node);
00534         rbh.begin();
00535         rbh.template addConstBuffer<Scalar>(Adata);
00536         rbh.end();
00537         SumOp<Scalar> op;
00538         for (size_t j=0; j<nC; ++j) {
00539           op.x = Adata(0,nR).getRawPtr();
00540           sums[j] = node->parallel_reduce(0,nR,op);
00541           Adata += Astride;
00542         }
00543       }
00544 
00545       inline static Scalar Sum(const MultiVector<Scalar,Node> &A) {
00546         const size_t nR = A.getNumRows();
00547         const size_t nC = A.getNumCols();
00548         if (nR*nC == 0) {
00549           return Teuchos::ScalarTraits<Scalar>::zero();
00550         }
00551         RCP<Node> node = A.getNode();
00552         ArrayRCP<const Scalar> Adata = A.getValues(0);
00553         // prepare buffers
00554         ReadyBufferHelper<Node> rbh(node);
00555         rbh.begin();
00556         rbh.template addConstBuffer<Scalar>(Adata);
00557         rbh.end();
00558         SumOp<Scalar> op;
00559         op.x = Adata(0,nR).getRawPtr();
00560         return node->parallel_reduce(0,nR,op);
00561       }
00562 
00563       inline static typename Teuchos::ScalarTraits<Scalar>::magnitudeType NormInf(const MultiVector<Scalar,Node> &A) {
00564         const size_t nR = A.getNumRows();
00565         const size_t nC = A.getNumCols();
00566         if (nR*nC == 0) {
00567           return Teuchos::ScalarTraits<typename Teuchos::ScalarTraits<Scalar>::magnitudeType>::zero();
00568         }
00569         RCP<Node> node = A.getNode();
00570         ArrayRCP<const Scalar> Adata = A.getValues(0);
00571         // prepare buffers
00572         ReadyBufferHelper<Node> rbh(node);
00573         rbh.begin();
00574         rbh.template addConstBuffer<Scalar>(Adata);
00575         rbh.end();
00576         MaxAbsOp<Scalar> op;
00577         op.x = Adata(0,nR).getRawPtr();
00578         return node->parallel_reduce(0,nR,op);
00579       }
00580 
00581       inline static void NormInf(const MultiVector<Scalar,Node> &A, const ArrayView<typename Teuchos::ScalarTraits<Scalar>::magnitudeType> &norms) {
00582         const size_t nR = A.getNumRows();
00583         const size_t nC = A.getNumCols();
00584         const size_t Astride = A.getStride();
00585         TEST_FOR_EXCEPTION(nC > Teuchos::as<size_t>(norms.size()), std::runtime_error, 
00586             "DefaultArithmetic<" << Teuchos::typeName(A) << ">::NormInf(A,norms): norms must have length as large as number of columns of A.");
00587         if (nR*nC == 0) {
00588           std::fill( norms.begin(), norms.begin() + nC, Teuchos::ScalarTraits<typename Teuchos::ScalarTraits<Scalar>::magnitudeType>::zero() );
00589           return;
00590         }
00591         RCP<Node> node = A.getNode();
00592         ArrayRCP<const Scalar> Adata = A.getValues();
00593         // prepare buffers
00594         ReadyBufferHelper<Node> rbh(node);
00595         rbh.begin();
00596         rbh.template addConstBuffer<Scalar>(Adata);
00597         rbh.end();
00598         MaxAbsOp<Scalar> op;
00599         for (size_t j=0; j<nC; ++j) {
00600           op.x = Adata(0,nR).getRawPtr();
00601           norms[j] = node->parallel_reduce(0,nR,op);
00602           Adata += Astride;
00603         }
00604       }
00605 
00606       inline static void Norm2Squared(const MultiVector<Scalar,Node> &A, const ArrayView<typename Teuchos::ScalarTraits<Scalar>::magnitudeType> &norms) {
00607         const size_t nR = A.getNumRows();
00608         const size_t nC = A.getNumCols();
00609         const size_t Astride = A.getStride();
00610         TEST_FOR_EXCEPTION(nC > Teuchos::as<size_t>(norms.size()), std::runtime_error, 
00611             "DefaultArithmetic<" << Teuchos::typeName(A) << ">::Norm2Squared(A,norms): norms must have length as large as number of columns of A.");
00612         if (nR*nC == 0) {
00613           std::fill( norms.begin(), norms.begin() + nC, Teuchos::ScalarTraits<typename Teuchos::ScalarTraits<Scalar>::magnitudeType>::zero() );
00614           return;
00615         }
00616         RCP<Node> node = A.getNode();
00617         ArrayRCP<const Scalar> Adata = A.getValues();
00618         // prepare buffers
00619         ReadyBufferHelper<Node> rbh(node);
00620         rbh.begin();
00621         rbh.template addConstBuffer<Scalar>(Adata);
00622         rbh.end();
00623         DotOp1<Scalar> op;
00624         for (size_t j=0; j<nC; ++j) {
00625           op.x = Adata(0,nR).getRawPtr();
00626           norms[j] = node->parallel_reduce(0,nR,op);
00627           Adata += Astride;
00628         }
00629       }
00630 
00631       inline static typename Teuchos::ScalarTraits<Scalar>::magnitudeType 
00632       Norm2Squared(const MultiVector<Scalar,Node> &A) {
00633         const size_t nR = A.getNumRows();
00634         if (nR == 0) {
00635           return Teuchos::ScalarTraits<typename Teuchos::ScalarTraits<Scalar>::magnitudeType>::zero();
00636         }
00637         RCP<Node> node = A.getNode();
00638         ArrayRCP<const Scalar> Adata = A.getValues(0);
00639         // prepare buffers
00640         ReadyBufferHelper<Node> rbh(node);
00641         rbh.begin();
00642         rbh.template addConstBuffer<Scalar>(Adata);
00643         rbh.end();
00644         DotOp1<Scalar> op;
00645         op.x = Adata(0,nR).getRawPtr();
00646         return node->parallel_reduce(0,nR,op);
00647       }
00648 
00649       inline static typename Teuchos::ScalarTraits<Scalar>::magnitudeType
00650       WeightedNorm(const MultiVector<Scalar,Node> &A, const MultiVector<Scalar,Node> &weightVector) {
00651         const size_t nR = A.getNumRows();
00652         const size_t nC = A.getNumCols();
00653         if (nR*nC == 0) {
00654           return Teuchos::ScalarTraits<typename Teuchos::ScalarTraits<Scalar>::magnitudeType>::zero();
00655         }
00656         RCP<Node> node = A.getNode();
00657         ArrayRCP<const Scalar> Adata = A.getValues(0),
00658                                         Wdata = weightVector.getValues(0);
00659         // prepare buffers
00660         ReadyBufferHelper<Node> rbh(node);
00661         rbh.begin();
00662         rbh.template addConstBuffer<Scalar>(Adata);
00663         rbh.template addConstBuffer<Scalar>(Wdata);
00664         rbh.end();
00665         WeightNormOp<Scalar> op;
00666         op.x = Adata(0,nR).getRawPtr();
00667         op.w = Wdata(0,nR).getRawPtr();
00668         return node->parallel_reduce(0,nR,op);
00669       }
00670 
00671       inline static void WeightedNorm(const MultiVector<Scalar,Node> &A, const MultiVector<Scalar,Node> &weightVector, const ArrayView<typename Teuchos::ScalarTraits<Scalar>::magnitudeType> &norms) {
00672         const size_t nR = A.getNumRows();
00673         const size_t nC = A.getNumCols();
00674         const size_t Astride = A.getStride(),
00675                      Wstride = weightVector.getStride();
00676         TEST_FOR_EXCEPTION(nC > Teuchos::as<size_t>(norms.size()), std::runtime_error, 
00677             "DefaultArithmetic<" << Teuchos::typeName(A) << ">::Norm1(A,norms): norms must have length as large as number of columns of A.");
00678         if (nR*nC == 0) {
00679           std::fill( norms.begin(), norms.begin() + nC, Teuchos::ScalarTraits<typename Teuchos::ScalarTraits<Scalar>::magnitudeType>::zero() );
00680           return;
00681         }
00682         RCP<Node> node = A.getNode();
00683         ArrayRCP<const Scalar> Adata = A.getValues(),
00684                                         Wdata = weightVector.getValues();
00685         const bool OneW = (weightVector.getNumCols() == 1);
00686         // prepare buffers
00687         ReadyBufferHelper<Node> rbh(node);
00688         rbh.begin();
00689         rbh.template addConstBuffer<Scalar>(Adata);
00690         rbh.template addConstBuffer<Scalar>(Wdata);
00691         rbh.end();
00692         WeightNormOp<Scalar> op;
00693         if (OneW) {
00694           op.w = Wdata(0,nR).getRawPtr();
00695           for (size_t j=0; j<nC; ++j) {
00696             op.x = Adata(0,nR).getRawPtr();
00697             norms[j] = node->parallel_reduce(0,nR,op);
00698             Adata += Astride;
00699           }
00700         }
00701         else {
00702           for (size_t j=0; j<nC; ++j) {
00703             op.x = Adata(0,nR).getRawPtr();
00704             op.w = Wdata(0,nR).getRawPtr();
00705             norms[j] = node->parallel_reduce(0,nR,op);
00706             Adata += Astride;
00707             Wdata += Wstride;
00708           }
00709         }
00710       }
00711 
00712       inline static void Random(MultiVector<Scalar,Node> &A) {
00713         // TODO: consider adding rand() functionality to node
00714         // in the meantime, just generate random numbers via Teuchos and then copy to node
00715         typedef Teuchos::ScalarTraits<Scalar> SCT;
00716         const size_t stride = A.getStride();
00717         const size_t nR = A.getNumRows();
00718         const size_t nC = A.getNumCols();
00719         if (nR*nC == 0) return;
00720         RCP<Node> node = A.getNode();
00721         ArrayRCP<Scalar> Adata = A.getValuesNonConst();
00722         // we'll overwrite all data covered by the multivector, but not off-stride data
00723         // therefore, we are write-only only in the case that stride=nR
00724         ReadWriteOption rw = (stride == nR ? WriteOnly : ReadWrite);
00725         ArrayRCP<Scalar> mvdata = node->template viewBufferNonConst<Scalar>(rw,stride*(nC-1)+nR,Adata);
00726         for (size_t j=0; j<nC; ++j) {
00727           for (size_t i=0; i<nR; ++i) {
00728             mvdata[j*stride + i] = SCT::random();
00729           }
00730         }
00731         mvdata = null;
00732       }
00733 
00734       inline static void Abs(MultiVector<Scalar,Node> &A, const MultiVector<Scalar,Node> &B) {
00735         const size_t nR = A.getNumRows();
00736         const size_t nC = A.getNumCols();
00737         const size_t Astride = A.getStride();
00738         const size_t Bstride = B.getStride();
00739         TEST_FOR_EXCEPTION(nC != B.getNumCols() || nR != B.getNumRows(), std::runtime_error,
00740                            "DefaultArithmetic<" << Teuchos::typeName(A) << ">::Abs(A,B): A and B must have the same dimensions.");
00741         if (nC*nR == 0) return;
00742         RCP<Node> node = A.getNode();
00743         ArrayRCP<const Scalar> Bdata = B.getValues();
00744         ArrayRCP<Scalar>       Adata = A.getValuesNonConst();
00745         // prepare buffers
00746         ReadyBufferHelper<Node> rbh(node);
00747         rbh.begin();
00748         rbh.template addConstBuffer<Scalar>(Bdata);
00749         rbh.template addNonConstBuffer<Scalar>(Adata);
00750         rbh.end();
00751         // prepare op
00752         AbsOp<Scalar> wdp;
00753         if (Astride == nR && Bstride == nR) {
00754           // one kernel invocation for whole multivector assignment
00755           wdp.x = Adata(0,nR*nC).getRawPtr();
00756           wdp.y = Bdata(0,nR*nC).getRawPtr();
00757           node->template parallel_for<AbsOp<Scalar> >(0,nR*nC,wdp);
00758         }
00759         else {
00760           // one kernel invocation for each column
00761           for (size_t j=0; j<nC; ++j) {
00762             wdp.x = Adata(0,nR).getRawPtr();
00763             wdp.y = Bdata(0,nR).getRawPtr();
00764             node->template parallel_for<AbsOp<Scalar> >(0,nR,wdp);
00765             Adata += Astride;
00766             Bdata += Bstride;
00767           }
00768         }
00769       }
00770 
00771       inline static void Scale(MultiVector<Scalar,Node> &B, Scalar alpha, const MultiVector<Scalar,Node> &A) {
00772         const size_t nR = A.getNumRows();
00773         const size_t nC = A.getNumCols();
00774         const size_t Astride = A.getStride();
00775         const size_t Bstride = B.getStride();
00776         TEST_FOR_EXCEPTION(nC != B.getNumCols() || nR != B.getNumRows(), std::runtime_error,
00777                            "DefaultArithmetic<" << Teuchos::typeName(A) << ">::Scale(B,alpha,A): A and B must have the same dimensions.");
00778         RCP<Node> node = B.getNode();
00779         ArrayRCP<const Scalar> Adata = A.getValues();
00780         ArrayRCP<Scalar>       Bdata = B.getValuesNonConst();
00781         // prepare buffers
00782         ReadyBufferHelper<Node> rbh(node);
00783         rbh.begin();
00784         rbh.template addConstBuffer<Scalar>(Adata);
00785         rbh.template addNonConstBuffer<Scalar>(Bdata);
00786         rbh.end();
00787         MVScaleOp<Scalar> wdp;
00788         wdp.alpha = alpha;
00789         if (Astride == nR && Bstride == nR) {
00790           // one kernel invocation for whole multivector
00791           wdp.x = Bdata(0,nR*nC).getRawPtr();
00792           wdp.y = Adata(0,nR*nC).getRawPtr();
00793           node->template parallel_for<MVScaleOp<Scalar> >(0,nR*nC,wdp);
00794         }
00795         else {
00796           // one kernel invocation for each column
00797           for (size_t j=0; j<nC; ++j) {
00798             wdp.x = Bdata(0,nR).getRawPtr();
00799             wdp.y = Adata(0,nR).getRawPtr();
00800             node->template parallel_for<MVScaleOp<Scalar> >(0,nR,wdp);
00801             Adata += Astride;
00802             Bdata += Bstride;
00803           }
00804         }
00805       }
00806 
00807       inline static void Scale(MultiVector<Scalar,Node> &A, Scalar alpha) {
00808         const size_t nR = A.getNumRows();
00809         const size_t nC = A.getNumCols();
00810         const size_t stride = A.getStride();
00811         RCP<Node> node = A.getNode();
00812         ArrayRCP<Scalar> data = A.getValuesNonConst();
00813         // prepare buffers
00814         ReadyBufferHelper<Node> rbh(node);
00815         rbh.begin();
00816         rbh.template addNonConstBuffer<Scalar>(data);
00817         rbh.end();
00818         // prepare op
00819         SingleScaleOp<Scalar> wdp;
00820         wdp.alpha = alpha;
00821         if (stride == nR) {
00822           // one kernel invocation for whole multivector
00823           wdp.x = data(0,nR*nC).getRawPtr();
00824           node->template parallel_for<SingleScaleOp<Scalar> >(0,nR*nC,wdp);
00825         }
00826         else {
00827           // one kernel invocation for each column
00828           for (size_t j=0; j<nC; ++j) {
00829             wdp.x = data(0,nR).getRawPtr();
00830             node->template parallel_for<SingleScaleOp<Scalar> >(0,nR,wdp);
00831             data += stride;
00832           }
00833         }
00834       }
00835 
00836       inline static void initializeValues(MultiVector<Scalar,Node> &A,
00837                                    size_t numRows, size_t numCols, 
00838                                    const ArrayRCP<Scalar> &values,
00839                                    size_t stride) {
00840         A.initializeValues(numRows,numCols,values,stride);
00841       }
00842 
00843       inline static ArrayRCP<const Scalar> getValues(const MultiVector<Scalar,Node> &A) {
00844         return A.getValues();
00845       }
00846 
00847       inline static ArrayRCP<const Scalar> getValues(const MultiVector<Scalar,Node> &A, size_t j) {
00848         return A.getValues(j);
00849       }
00850 
00851       inline static ArrayRCP<Scalar> getValuesNonConst(MultiVector<Scalar,Node> &A) {
00852         return A.getValuesNonConst();
00853       }
00854 
00855       inline static ArrayRCP<Scalar> getValuesNonConst(MultiVector<Scalar,Node> &A, size_t j) {
00856         return A.getValuesNonConst(j);
00857       }
00858 
00859       inline static size_t getNumRows(const MultiVector<Scalar,Node> &A) {
00860         return A.getNumRows();
00861       }
00862 
00863       inline static size_t getNumCols(const MultiVector<Scalar,Node> &A) {
00864         return A.getNumCols();
00865       }
00866 
00867       inline static size_t getStride(const MultiVector<Scalar,Node> &A) {
00868         return A.getStride();
00869       }
00870 
00871       inline static RCP<Node> getNode(const MultiVector<Scalar,Node> &A) {
00872         return A.getNode();
00873       }
00874 
00875   };
00876 
00877 }
00878 
00879 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends