Kokkos Node API and Local Linear Algebra Kernels Version of the Day
Kokkos_DefaultBlockSparseKernelOps.hpp
00001 //@HEADER
00002 // ************************************************************************
00003 // 
00004 //          Kokkos: Node API and Parallel Node Kernels
00005 //              Copyright (2009) Sandia Corporation
00006 // 
00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
00008 // license for use of this work by or on behalf of the U.S. Government.
00009 // 
00010 // This library is free software; you can redistribute it and/or modify
00011 // it under the terms of the GNU Lesser General Public License as
00012 // published by the Free Software Foundation; either version 2.1 of the
00013 // License, or (at your option) any later version.
00014 //  
00015 // This library is distributed in the hope that it will be useful, but
00016 // WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 // Lesser General Public License for more details.
00019 //  
00020 // You should have received a copy of the GNU Lesser General Public
00021 // License along with this library; if not, write to the Free Software
00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00023 // USA
00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
00025 // 
00026 // ************************************************************************
00027 //@HEADER
00028 
00029 #ifndef KOKKOS_DEFAULTBLOCKSPARSEKERNELOPS_HPP
00030 #define KOKKOS_DEFAULTBLOCKSPARSEKERNELOPS_HPP
00031 
00032 #ifndef KERNEL_PREFIX
00033 #define KERNEL_PREFIX
00034 #endif
00035 
00036 #ifdef __CUDACC__
00037 #include <Teuchos_ScalarTraitsCUDA.hpp>
00038 #else
00039 #include <Teuchos_ScalarTraits.hpp>
00040 #endif
00041 
00042 
00043 // FINISH TODO : This code is missing the calls to ScalarTraits::conjugate() 
00044 //               necessary for the transposed kernels to work for complex-valued scalars.
00045 //               Note, this will require also the addition of conjugate functionality to the 1x1 kernel
00046 
00047 namespace Kokkos {
00048 
00050 template<class Scalar,class Ordinal,class DomainScalar,class RangeScalar>
00051 inline KERNEL_PREFIX void 
00052 densematvec(Ordinal Nrows, Ordinal Ncols,
00053             RangeScalar alpha, const Scalar* A,
00054             const DomainScalar* x, RangeScalar* y)
00055 {
00056   unsigned offset = 0;
00057   for(Ordinal c=0; c<Ncols; ++c) {
00058     for(Ordinal r=0; r<Nrows; ++r) {
00059       y[r] += alpha*A[offset++]*x[c];
00060     }
00061   }
00062 }
00063 
00066 template<class Scalar,class Ordinal,class DomainScalar,class RangeScalar>
00067 inline KERNEL_PREFIX void 
00068 densematvec_skipdiag(Ordinal N,
00069                      RangeScalar alpha, const Scalar* A,
00070                      const DomainScalar* x, RangeScalar* y)
00071 {
00072   unsigned offset = 0;
00073   for(Ordinal c=0; c<N; ++c) {
00074     for(Ordinal r=0; r<N; ++r) {
00075       if (r != c) {
00076         y[r] += alpha*A[offset]*x[c];
00077       }
00078       ++offset;
00079     }
00080   }
00081 }
00082 
00086 template<class Scalar,class Ordinal,class RangeScalar>
00087 inline KERNEL_PREFIX void 
00088 solve_block_upper(Ordinal N, const Scalar* A, RangeScalar* y)
00089 {
00090   int n = N;
00091   for(int r=n-1; r>=0; --r) {
00092     for(int c=n-1; c>r; --c) {
00093       y[r] -= A[c*n+r]*y[c];
00094     }
00095     y[r] /= A[r*n+r];
00096   }
00097 }
00098 
00102 template<class Scalar,class Ordinal,class RangeScalar>
00103 inline KERNEL_PREFIX void 
00104 solve_block_upper_transpose(Ordinal N, const Scalar* A, RangeScalar* y)
00105 {
00106   int n = N;
00107   for(int c=0; c<N; ++c) {
00108     for(int r=0; r<c; ++r) {
00109       y[c] -= A[c*n+r]*y[r];
00110     }
00111     y[c] /= A[c*n+c];
00112   }
00113 }
00114 
00118 template<class Scalar,class Ordinal,class RangeScalar>
00119 inline KERNEL_PREFIX void 
00120 solve_block_lower(Ordinal N, const Scalar* A, RangeScalar* y)
00121 {
00122   int n = N;
00123   for(int r=0; r<n; ++r) {
00124     for(int c=0; c<r; ++c) {
00125       y[r] -= A[c*n+r]*y[c];
00126     }
00127     y[r] /= A[r*n+r];
00128   }
00129 }
00130 
00134 template<class Scalar,class Ordinal,class RangeScalar>
00135 inline KERNEL_PREFIX void 
00136 solve_block_lower_transpose(Ordinal N, const Scalar* A, RangeScalar* y)
00137 {
00138   int n = N;
00139   for(int c=n-1; c>=0; --c) {
00140     for(int r=n-1; r>c; --r) {
00141       y[c] -= A[c*n+r]*y[r];
00142     }
00143     y[c] /= A[c*n+c];
00144   }
00145 }
00146 
00148 template<class Scalar,class Ordinal,class DomainScalar,class RangeScalar>
00149 inline KERNEL_PREFIX void
00150 densematvec_trans(Ordinal Nrows, Ordinal Ncols,
00151                   RangeScalar alpha, const Scalar* A,
00152                   const DomainScalar* x, RangeScalar* y)
00153 {
00154   unsigned offset = 0;
00155   for(Ordinal c=0; c<Ncols; ++c) {
00156     for(Ordinal r=0; r<Nrows; ++r) {
00157       y[c] += alpha*A[offset++]*x[r];
00158     }
00159   }
00160 }
00161 
00163 template<class Scalar,class DomainScalar,class RangeScalar>
00164 inline KERNEL_PREFIX void 
00165 dense_matvec_1x1( RangeScalar alpha, const Scalar* A,
00166                   const DomainScalar* x, RangeScalar* y)
00167 {
00168   y[0] += alpha*A[0]*x[0];
00169 }
00170 
00172 template<class Scalar,class DomainScalar,class RangeScalar>
00173 inline KERNEL_PREFIX void 
00174 dense_matvec_2x2(RangeScalar alpha, const Scalar* A,
00175                  const DomainScalar* x, RangeScalar* y)
00176 {
00177   y[0] += alpha*(A[0]*x[0] + A[2]*x[1]);
00178   y[1] += alpha*(A[1]*x[0] + A[3]*x[1]);
00179 }
00180 
00182 template<class Scalar,class DomainScalar,class RangeScalar>
00183 inline KERNEL_PREFIX void 
00184 dense_matvec_3x3(RangeScalar alpha, const Scalar* A,
00185                  const DomainScalar* x, RangeScalar* y)
00186 {
00187   y[0] += alpha*(A[0]*x[0] + A[3]*x[1] + A[6]*x[2]);
00188   y[1] += alpha*(A[1]*x[0] + A[4]*x[1] + A[7]*x[2]);
00189   y[2] += alpha*(A[2]*x[0] + A[5]*x[1] + A[8]*x[2]);
00190 }
00191 
00193 template<class Scalar,class DomainScalar,class RangeScalar>
00194 inline KERNEL_PREFIX void 
00195 dense_matvec_4x4(RangeScalar alpha, const Scalar* A,
00196                  const DomainScalar* x, RangeScalar* y)
00197 {
00198   y[0] += alpha*(A[0]*x[0] + A[4]*x[1] + A[8]*x[2] + A[12]*x[3]);
00199   y[1] += alpha*(A[1]*x[0] + A[5]*x[1] + A[9]*x[2] + A[13]*x[3]);
00200   y[2] += alpha*(A[2]*x[0] + A[6]*x[1] + A[10]*x[2] + A[14]*x[3]);
00201   y[3] += alpha*(A[3]*x[0] + A[7]*x[1] + A[11]*x[2] + A[15]*x[3]);
00202 }
00203 
00205 template<class Scalar,class DomainScalar,class RangeScalar>
00206 inline KERNEL_PREFIX void 
00207 dense_matvec_trans_2x2(RangeScalar alpha, const Scalar* A,
00208                        const DomainScalar* x, RangeScalar* y)
00209 {
00210   y[0] += alpha*(A[0]*x[0] + A[1]*x[1]);
00211   y[1] += alpha*(A[2]*x[0] + A[3]*x[1]);
00212 }
00213 
00215 template<class Scalar,class DomainScalar,class RangeScalar>
00216 inline KERNEL_PREFIX void 
00217 dense_matvec_trans_3x3(RangeScalar alpha, const Scalar* A,
00218                        const DomainScalar* x, RangeScalar* y)
00219 {
00220   y[0] += alpha*(A[0]*x[0] + A[1]*x[1] + A[2]*x[2]);
00221   y[1] += alpha*(A[3]*x[0] + A[4]*x[1] + A[5]*x[2]);
00222   y[2] += alpha*(A[6]*x[0] + A[7]*x[1] + A[8]*x[2]);
00223 }
00224 
00226 template<class Scalar,class DomainScalar,class RangeScalar>
00227 inline KERNEL_PREFIX void 
00228 dense_matvec_trans_4x4(RangeScalar alpha, const Scalar* A,
00229                        const DomainScalar* x, RangeScalar* y)
00230 {
00231   y[0] += alpha*(A[0]*x[0] + A[1]*x[1] + A[2]*x[2] + A[3]*x[3]);
00232   y[1] += alpha*(A[4]*x[0] + A[5]*x[1] + A[6]*x[2] + A[7]*x[3]);
00233   y[2] += alpha*(A[8]*x[0] + A[9]*x[1] + A[10]*x[2] + A[11]*x[3]);
00234   y[3] += alpha*(A[12]*x[0] + A[13]*x[1] + A[14]*x[2] + A[15]*x[3]);
00235 }
00236 
00237 template <class Scalar, class Ordinal, class DomainScalar, class RangeScalar>
00238 struct DefaultBlockSparseMultiplyOp1 {
00239   // mat data
00240   const Ordinal *rptr, *cptr;
00241   const size_t *bptr;
00242   const Ordinal *bindx, *indx;
00243   const Scalar  *vals;
00244   // matvec params
00245   RangeScalar        alpha, beta;
00246   size_t numBlockRows;
00247   // mv data
00248   const DomainScalar  *x;
00249   RangeScalar         *y;
00250   size_t xstride, ystride;
00251 
00252   inline KERNEL_PREFIX void execute(size_t myIter) {
00253     RangeScalar zero = Teuchos::ScalarTraits<RangeScalar>::zero();
00254     const size_t myBlockRow = myIter % numBlockRows;
00255     const size_t myRHS = (myIter - myBlockRow) / numBlockRows;
00256     const Ordinal numRowsInBlock = rptr[myBlockRow+1]-rptr[myBlockRow];
00257     const DomainScalar* xvec = x+myRHS*xstride;
00258     RangeScalar*        yvec = y+myRHS*ystride;
00259     RangeScalar* yy = &yvec[rptr[myBlockRow]];
00260 
00261     // init my block of y to beta*y
00262     if (beta == zero) for(Ordinal i=0; i<numRowsInBlock; ++i) yy[i] = zero;
00263     else for(Ordinal i=0; i<numRowsInBlock; ++i) yy[i] = beta*yy[i];
00264 
00265     // loop over the block in my row and do the multiplication
00266     for (size_t b=bptr[myBlockRow]; b<bptr[myBlockRow+1]; ++b) {
00267       // get pointers into A and x
00268       const Ordinal col = bindx[b];
00269       const Ordinal numColsInBlock = cptr[col+1]-cptr[col];
00270       const Scalar* A = &vals[indx[b]];
00271       const DomainScalar* xx = &xvec[cptr[col]];
00272       // do the GEMV
00273       if (numRowsInBlock == numColsInBlock) {
00274         switch(numRowsInBlock) {
00275           case 1: dense_matvec_1x1(alpha, A, xx, yy); break;
00276           case 2: dense_matvec_2x2(alpha, A, xx, yy); break;
00277           case 3: dense_matvec_3x3(alpha, A, xx, yy); break;
00278           case 4: dense_matvec_4x4(alpha, A, xx, yy); break;
00279           default: densematvec(numRowsInBlock, numColsInBlock, alpha, A, xx, yy);
00280         }
00281       }
00282       else {
00283         densematvec(numRowsInBlock,numColsInBlock,alpha,A,xx,yy);
00284       }
00285     }
00286   }
00287 };
00288 
00289 template <class Scalar, class Ordinal, class DomainScalar, class RangeScalar>
00290 struct DefaultBlockSparseMultiplyOp1Transpose {
00291   // mat data
00292   const Ordinal *rptr, *cptr;
00293   const size_t *bptr;
00294   const Ordinal *bindx, *indx;
00295   const Scalar  *vals;
00296   // matvec params
00297   RangeScalar        alpha;
00298   size_t numBlockRows;
00299   // mv data
00300   const DomainScalar  *x;
00301   RangeScalar         *y;
00302   size_t xstride, ystride;
00303 
00304   inline KERNEL_PREFIX void execute(const size_t myRHS) {
00305     // get pointers into X and Y for my assigned RHS
00306     const DomainScalar* xvec = x+myRHS*xstride;
00307     RangeScalar*        yvec = y+myRHS*ystride;
00308     for (size_t bR=0; bR<numBlockRows; ++bR) {
00309       // accumulate bR-th block bR (transposed) times xvec[bR] block entry into yvec
00310       const DomainScalar* xx = &xvec[rptr[bR]];
00311       const Ordinal Nrows = rptr[bR+1]-rptr[bR];
00312   
00313       for (size_t b=bptr[bR]; b<bptr[bR+1]; ++b) {
00314         const Ordinal col = bindx[b];
00315         const Ordinal Ncols = cptr[col+1]-cptr[col];
00316   
00317         const Scalar* A = &vals[indx[b]];
00318         RangeScalar* yy = &yvec[cptr[col]];
00319   
00320         if (Nrows == Ncols) {
00321           switch(Nrows) {
00322           case 1: dense_matvec_1x1(alpha, A, xx, yy); break;
00323           case 2: dense_matvec_trans_2x2(alpha, A, xx, yy); break;
00324           case 3: dense_matvec_trans_3x3(alpha, A, xx, yy); break;
00325           case 4: dense_matvec_trans_4x4(alpha, A, xx, yy); break;
00326           default: densematvec_trans(Nrows, Ncols, alpha, A, xx, yy);
00327           }
00328         }
00329         else {
00330           densematvec_trans(Nrows,Ncols,alpha,A,xx,yy);
00331         }
00332       }
00333     }
00334   }
00335 };
00336 
00337 template <class Scalar, class Ordinal, class DomainScalar, class RangeScalar>
00338 struct DefaultBlockSparseSolveOp1 {
00339   // mat data
00340   const Ordinal *rptr, *cptr;
00341   const size_t *bptr;
00342   const Ordinal *bindx, *indx;
00343   const Scalar  *vals;
00344   // solve params
00345   bool unitDiag, upper;
00346   size_t numBlockRows;
00347   // multivector data
00348   const RangeScalar  *y;
00349   DomainScalar       *x;
00350   size_t xstride, ystride;
00351 
00352   //find x such that A*x = y
00353 
00354   inline KERNEL_PREFIX void execute(size_t i) {
00355     //solve for i-th vector of multivector
00356     const RangeScalar* yvec = y+i*ystride;
00357     DomainScalar*      xvec = x+i*xstride;
00358     RangeScalar one = Teuchos::ScalarTraits<RangeScalar>::one();
00359 
00360     //copy y into x:
00361     Ordinal numPointRows = rptr[numBlockRows];
00362     for(Ordinal ptrow=0; ptrow<numPointRows; ++ptrow) xvec[ptrow] = yvec[ptrow];
00363 
00364     for(size_t r = 0; r < numBlockRows; ++r) {
00365       //if upper-triangular then loop from row N to 0 (bottom to top), but
00366       //if lower-triangular then loop from 0 to N (top to bottom):
00367       Ordinal row = upper ? numBlockRows-r-1 : r;
00368 
00369       const Ordinal numRowsInBlock = rptr[row+1]-rptr[row];
00370       DomainScalar* xx = &xvec[rptr[row]];
00371 
00372       // loop over the blocks in this row and do the multiplication
00373       for (size_t b=bptr[row]; b<bptr[row+1]; ++b) {
00374         //if upper-triangular then loop right-to-left (backwards) across
00375         //the row; but if lower-triangular then loop left-to-right:
00376         size_t blk = upper ? bptr[row+1]-(b-bptr[row])-1 : b;
00377 
00378         // get pointers into A and y
00379         const Ordinal col = bindx[blk];
00380         const Ordinal numColsInBlock = cptr[col+1]-cptr[col];
00381         const Scalar* A = &vals[indx[blk]];
00382         DomainScalar* yy = &xvec[cptr[col]];
00383         if (col==row) {
00384           if (unitDiag) {
00385             densematvec_skipdiag(numRowsInBlock, -one, A, yy, xx);
00386           }
00387           else {
00388             if (upper) solve_block_upper(numRowsInBlock, A, xx);
00389             else solve_block_lower(numRowsInBlock, A, xx);
00390           }
00391         }
00392         else {
00393           // do the GEMV
00394           if (numRowsInBlock == numColsInBlock) {
00395             switch(numRowsInBlock) {
00396               case 1: dense_matvec_1x1(-one, A, yy, xx); break;
00397               case 2: dense_matvec_2x2(-one, A, yy, xx); break;
00398               case 3: dense_matvec_3x3(-one, A, yy, xx); break;
00399               case 4: dense_matvec_4x4(-one, A, yy, xx); break;
00400               default: densematvec(numRowsInBlock, numColsInBlock, -one, A, yy, xx);
00401             }
00402           }
00403           else {
00404             densematvec(numRowsInBlock,numColsInBlock,-one,A,yy,xx);
00405           }
00406         }
00407       }
00408     }
00409   }
00410 };
00411 
00412 template <class Scalar, class Ordinal, class DomainScalar, class RangeScalar>
00413 struct DefaultBlockSparseTransposeSolveOp1 {
00414   // mat data
00415   const Ordinal *rptr, *cptr;
00416   const size_t *bptr;
00417   const Ordinal *bindx, *indx;
00418   const Scalar  *vals;
00419   // solve params
00420   bool unitDiag, upper;
00421   size_t numBlockRows;
00422   // multivector data
00423   const RangeScalar  *y;
00424   DomainScalar       *x;
00425   size_t xstride, ystride;
00426 
00427   //find x such that A^T*x = y
00428 
00429   inline KERNEL_PREFIX void execute(size_t i) {
00430     //solve for i-th vector of multivector
00431     const RangeScalar* yvec = y+i*ystride;
00432     DomainScalar*      xvec = x+i*xstride;
00433     RangeScalar one = Teuchos::ScalarTraits<RangeScalar>::one();
00434 
00435     //copy y into x:
00436     Ordinal numPointRows = rptr[numBlockRows];
00437     for(Ordinal ptrow=0; ptrow<numPointRows; ++ptrow) xvec[ptrow] = yvec[ptrow];
00438 
00439     for(size_t r = 0; r < numBlockRows; ++r) {
00440       //for the transpose solve, if upper-triangular then
00441       //loop from row 0 to row N, but if lower-triangular then
00442       //loop from row N to row 0:
00443       Ordinal row = upper ? r : numBlockRows-r-1;
00444 
00445       const Ordinal numRowsInBlock = rptr[row+1]-rptr[row];
00446       DomainScalar* xx = &xvec[rptr[row]];
00447 
00448       // loop over the blocks in this row and do the multiplication
00449       for (size_t b=bptr[row]; b<bptr[row+1]; ++b) {
00450         //for the tranpose solve, if upper-triangular then loop
00451         //left-to-right across the row, but if lower-triangular then
00452         //loop right-to-left across the row:
00453         size_t blk = upper ? b : bptr[row+1]-(b-bptr[row])-1;
00454 
00455         // get pointers into A and y
00456         const Ordinal col = bindx[blk];
00457         const Ordinal numColsInBlock = cptr[col+1]-cptr[col];
00458         const Scalar* A = &vals[indx[blk]];
00459         DomainScalar* yy = &xvec[cptr[col]];
00460         if (col==row) {
00461           if (unitDiag) {
00462             densematvec_skipdiag(numRowsInBlock, -one, A, yy, xx);
00463           }
00464           else {
00465             if (upper) solve_block_upper_transpose(numRowsInBlock, A, xx);
00466             else solve_block_lower_transpose(numRowsInBlock, A, xx);
00467           }
00468         }
00469         else {
00470           // do the GEMV
00471           if (numRowsInBlock == numColsInBlock) {
00472             switch(numRowsInBlock) {
00473               case 1: dense_matvec_1x1(-one, A, xx, yy); break;
00474               case 2: dense_matvec_2x2(-one, A, xx, yy); break;
00475               case 3: dense_matvec_3x3(-one, A, xx, yy); break;
00476               case 4: dense_matvec_4x4(-one, A, xx, yy); break;
00477               default: densematvec(numRowsInBlock, numColsInBlock, -one, A, xx, yy);
00478             }
00479           }
00480           else {
00481             densematvec(numRowsInBlock,numColsInBlock,-one,A,xx,yy);
00482           }
00483         }
00484       }
00485     }
00486   }
00487 };
00488 
00489 } // namespace Kokkos
00490 
00491 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends