Kokkos Node API and Local Linear Algebra Kernels (Version of the Day)
Kokkos_DefaultBlockSparseKernelOps.hpp
//@HEADER
// ************************************************************************
//
//          Kokkos: Node API and Parallel Node Kernels
//              Copyright (2008) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
//
// ************************************************************************
//@HEADER

#ifndef KOKKOS_DEFAULTBLOCKSPARSEKERNELOPS_HPP
#define KOKKOS_DEFAULTBLOCKSPARSEKERNELOPS_HPP

#ifndef KERNEL_PREFIX
#define KERNEL_PREFIX
#endif

#ifdef __CUDACC__
#include <Teuchos_ScalarTraitsCUDA.hpp>
#else
#include <Teuchos_ScalarTraits.hpp>
#endif


// FINISH TODO: This code is missing the calls to ScalarTraits::conjugate()
//              necessary for the transposed kernels to work for complex-valued scalars.
//              Note that this will also require adding conjugate functionality to the 1x1 kernel.

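// Illustrative sketch only (not part of the original header): one hedged example
// of where the ScalarTraits::conjugate() calls mentioned in the TODO above could
// go, written as a hypothetical conjugate-transposed 2x2 kernel mirroring
// dense_matvec_trans_2x2 below. The name dense_matvec_conjtrans_2x2 is an
// assumption; A is assumed column-major, as in the other kernels in this file.
template<class Scalar,class DomainScalar,class RangeScalar>
inline KERNEL_PREFIX void
dense_matvec_conjtrans_2x2(RangeScalar alpha, const Scalar* A,
                           const DomainScalar* x, RangeScalar* y)
{
  typedef Teuchos::ScalarTraits<Scalar> ST;
  // Conjugate each entry of A while applying the transpose: y += alpha*conj(A)^T*x.
  y[0] += alpha*(ST::conjugate(A[0])*x[0] + ST::conjugate(A[1])*x[1]);
  y[1] += alpha*(ST::conjugate(A[2])*x[0] + ST::conjugate(A[3])*x[1]);
}
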
namespace Kokkos {

// Dense matrix-vector product y += alpha*A*x, where the Nrows x Ncols block A
// is stored column-major.
template<class Scalar,class Ordinal,class DomainScalar,class RangeScalar>
inline KERNEL_PREFIX void
densematvec(Ordinal Nrows, Ordinal Ncols,
            RangeScalar alpha, const Scalar* A,
            const DomainScalar* x, RangeScalar* y)
{
  unsigned offset = 0;
  for(Ordinal c=0; c<Ncols; ++c) {
    for(Ordinal r=0; r<Nrows; ++r) {
      y[r] += alpha*A[offset++]*x[c];
    }
  }
}

// Dense matrix-vector product y += alpha*A*x using only the off-diagonal
// entries of the N x N block A (stored column-major).
template<class Scalar,class Ordinal,class DomainScalar,class RangeScalar>
inline KERNEL_PREFIX void
densematvec_skipdiag(Ordinal N,
                     RangeScalar alpha, const Scalar* A,
                     const DomainScalar* x, RangeScalar* y)
{
  unsigned offset = 0;
  for(Ordinal c=0; c<N; ++c) {
    for(Ordinal r=0; r<N; ++r) {
      if (r != c) {
        y[r] += alpha*A[offset]*x[c];
      }
      ++offset;
    }
  }
}

// Solve an N x N upper-triangular system in place (back-substitution),
// overwriting y with A^{-1}*y; A is stored column-major.
template<class Scalar,class Ordinal,class RangeScalar>
inline KERNEL_PREFIX void
solve_block_upper(Ordinal N, const Scalar* A, RangeScalar* y)
{
  int n = N;
  for(int r=n-1; r>=0; --r) {
    for(int c=n-1; c>r; --c) {
      y[r] -= A[c*n+r]*y[c];
    }
    y[r] /= A[r*n+r];
  }
}

// Solve the transpose of an N x N upper-triangular system in place
// (forward-substitution), overwriting y with A^{-T}*y; A is stored column-major.
template<class Scalar,class Ordinal,class RangeScalar>
inline KERNEL_PREFIX void
solve_block_upper_transpose(Ordinal N, const Scalar* A, RangeScalar* y)
{
  int n = N;
  for(int c=0; c<n; ++c) {
    for(int r=0; r<c; ++r) {
      y[c] -= A[c*n+r]*y[r];
    }
    y[c] /= A[c*n+c];
  }
}

// Solve an N x N lower-triangular system in place (forward-substitution),
// overwriting y with A^{-1}*y; A is stored column-major.
template<class Scalar,class Ordinal,class RangeScalar>
inline KERNEL_PREFIX void
solve_block_lower(Ordinal N, const Scalar* A, RangeScalar* y)
{
  int n = N;
  for(int r=0; r<n; ++r) {
    for(int c=0; c<r; ++c) {
      y[r] -= A[c*n+r]*y[c];
    }
    y[r] /= A[r*n+r];
  }
}

// Solve the transpose of an N x N lower-triangular system in place
// (back-substitution), overwriting y with A^{-T}*y; A is stored column-major.
template<class Scalar,class Ordinal,class RangeScalar>
inline KERNEL_PREFIX void
solve_block_lower_transpose(Ordinal N, const Scalar* A, RangeScalar* y)
{
  int n = N;
  for(int c=n-1; c>=0; --c) {
    for(int r=n-1; r>c; --r) {
      y[c] -= A[c*n+r]*y[r];
    }
    y[c] /= A[c*n+c];
  }
}

// Dense transposed matrix-vector product y += alpha*A^T*x, where the
// Nrows x Ncols block A is stored column-major.
template<class Scalar,class Ordinal,class DomainScalar,class RangeScalar>
inline KERNEL_PREFIX void
densematvec_trans(Ordinal Nrows, Ordinal Ncols,
                  RangeScalar alpha, const Scalar* A,
                  const DomainScalar* x, RangeScalar* y)
{
  unsigned offset = 0;
  for(Ordinal c=0; c<Ncols; ++c) {
    for(Ordinal r=0; r<Nrows; ++r) {
      y[c] += alpha*A[offset++]*x[r];
    }
  }
}

// Hard-coded 1x1 dense matvec: y += alpha*A*x.
template<class Scalar,class DomainScalar,class RangeScalar>
inline KERNEL_PREFIX void
dense_matvec_1x1( RangeScalar alpha, const Scalar* A,
                  const DomainScalar* x, RangeScalar* y)
{
  y[0] += alpha*A[0]*x[0];
}

// Hard-coded 2x2 dense matvec: y += alpha*A*x, A column-major.
template<class Scalar,class DomainScalar,class RangeScalar>
inline KERNEL_PREFIX void
dense_matvec_2x2(RangeScalar alpha, const Scalar* A,
                 const DomainScalar* x, RangeScalar* y)
{
  y[0] += alpha*(A[0]*x[0] + A[2]*x[1]);
  y[1] += alpha*(A[1]*x[0] + A[3]*x[1]);
}

// Hard-coded 3x3 dense matvec: y += alpha*A*x, A column-major.
template<class Scalar,class DomainScalar,class RangeScalar>
inline KERNEL_PREFIX void
dense_matvec_3x3(RangeScalar alpha, const Scalar* A,
                 const DomainScalar* x, RangeScalar* y)
{
  y[0] += alpha*(A[0]*x[0] + A[3]*x[1] + A[6]*x[2]);
  y[1] += alpha*(A[1]*x[0] + A[4]*x[1] + A[7]*x[2]);
  y[2] += alpha*(A[2]*x[0] + A[5]*x[1] + A[8]*x[2]);
}

// Hard-coded 4x4 dense matvec: y += alpha*A*x, A column-major.
template<class Scalar,class DomainScalar,class RangeScalar>
inline KERNEL_PREFIX void
dense_matvec_4x4(RangeScalar alpha, const Scalar* A,
                 const DomainScalar* x, RangeScalar* y)
{
  y[0] += alpha*(A[0]*x[0] + A[4]*x[1] + A[8]*x[2] + A[12]*x[3]);
  y[1] += alpha*(A[1]*x[0] + A[5]*x[1] + A[9]*x[2] + A[13]*x[3]);
  y[2] += alpha*(A[2]*x[0] + A[6]*x[1] + A[10]*x[2] + A[14]*x[3]);
  y[3] += alpha*(A[3]*x[0] + A[7]*x[1] + A[11]*x[2] + A[15]*x[3]);
}

// Hard-coded 2x2 transposed dense matvec: y += alpha*A^T*x, A column-major.
template<class Scalar,class DomainScalar,class RangeScalar>
inline KERNEL_PREFIX void
dense_matvec_trans_2x2(RangeScalar alpha, const Scalar* A,
                       const DomainScalar* x, RangeScalar* y)
{
  y[0] += alpha*(A[0]*x[0] + A[1]*x[1]);
  y[1] += alpha*(A[2]*x[0] + A[3]*x[1]);
}

// Hard-coded 3x3 transposed dense matvec: y += alpha*A^T*x, A column-major.
template<class Scalar,class DomainScalar,class RangeScalar>
inline KERNEL_PREFIX void
dense_matvec_trans_3x3(RangeScalar alpha, const Scalar* A,
                       const DomainScalar* x, RangeScalar* y)
{
  y[0] += alpha*(A[0]*x[0] + A[1]*x[1] + A[2]*x[2]);
  y[1] += alpha*(A[3]*x[0] + A[4]*x[1] + A[5]*x[2]);
  y[2] += alpha*(A[6]*x[0] + A[7]*x[1] + A[8]*x[2]);
}

// Hard-coded 4x4 transposed dense matvec: y += alpha*A^T*x, A column-major.
template<class Scalar,class DomainScalar,class RangeScalar>
inline KERNEL_PREFIX void
dense_matvec_trans_4x4(RangeScalar alpha, const Scalar* A,
                       const DomainScalar* x, RangeScalar* y)
{
  y[0] += alpha*(A[0]*x[0] + A[1]*x[1] + A[2]*x[2] + A[3]*x[3]);
  y[1] += alpha*(A[4]*x[0] + A[5]*x[1] + A[6]*x[2] + A[7]*x[3]);
  y[2] += alpha*(A[8]*x[0] + A[9]*x[1] + A[10]*x[2] + A[11]*x[3]);
  y[3] += alpha*(A[12]*x[0] + A[13]*x[1] + A[14]*x[2] + A[15]*x[3]);
}

// Functor for the block-sparse matrix-vector multiply Y = alpha*A*X + beta*Y.
// Each call to execute() handles one (block row, right-hand-side) pair.
template <class Scalar, class Ordinal, class DomainScalar, class RangeScalar>
struct DefaultBlockSparseMultiplyOp1 {
  // mat data
  const Ordinal *rptr, *cptr;
  const size_t *bptr;
  const Ordinal *bindx, *indx;
  const Scalar  *vals;
  // matvec params
  RangeScalar        alpha, beta;
  size_t numBlockRows;
  // mv data
  const DomainScalar  *x;
  RangeScalar         *y;
  size_t xstride, ystride;

  inline KERNEL_PREFIX void execute(size_t myIter) {
    RangeScalar zero = Teuchos::ScalarTraits<RangeScalar>::zero();
    const size_t myBlockRow = myIter % numBlockRows;
    const size_t myRHS = (myIter - myBlockRow) / numBlockRows;
    const Ordinal numRowsInBlock = rptr[myBlockRow+1]-rptr[myBlockRow];
    const DomainScalar* xvec = x+myRHS*xstride;
    RangeScalar*        yvec = y+myRHS*ystride;
    RangeScalar* yy = &yvec[rptr[myBlockRow]];

    // init my block of y to beta*y
    if (beta == zero) for(Ordinal i=0; i<numRowsInBlock; ++i) yy[i] = zero;
    else for(Ordinal i=0; i<numRowsInBlock; ++i) yy[i] = beta*yy[i];

    // loop over the blocks in my row and do the multiplication
    for (size_t b=bptr[myBlockRow]; b<bptr[myBlockRow+1]; ++b) {
      // get pointers into A and x
      const Ordinal col = bindx[b];
      const Ordinal numColsInBlock = cptr[col+1]-cptr[col];
      const Scalar* A = &vals[indx[b]];
      const DomainScalar* xx = &xvec[cptr[col]];
      // do the GEMV
      if (numRowsInBlock == numColsInBlock) {
        switch(numRowsInBlock) {
          case 1: dense_matvec_1x1(alpha, A, xx, yy); break;
          case 2: dense_matvec_2x2(alpha, A, xx, yy); break;
          case 3: dense_matvec_3x3(alpha, A, xx, yy); break;
          case 4: dense_matvec_4x4(alpha, A, xx, yy); break;
          default: densematvec(numRowsInBlock, numColsInBlock, alpha, A, xx, yy);
        }
      }
      else {
        densematvec(numRowsInBlock,numColsInBlock,alpha,A,xx,yy);
      }
    }
  }
};
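
// A hypothetical example (not taken from the original header) of the
// block-compressed arrays these functors consume, inferred from how the
// members above are used. Consider a matrix with 2 block rows/columns, where
// block row/column 0 spans 1 point row/column and block row/column 1 spans 2,
// with nonzero blocks (0,0), (0,1), and (1,1):
//
//   rptr  = {0, 1, 3}   // block row    -> first point row
//   cptr  = {0, 1, 3}   // block column -> first point column
//   bptr  = {0, 2, 3}   // block row    -> first entry in bindx/indx
//   bindx = {0, 1, 1}   // block-column index of each stored block
//   indx  = {0, 1, 3}   // offset of each block's column-major values in vals
//   vals  holds 1 + 2 + 4 = 7 scalars: block (0,0), then (0,1), then (1,1)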

// Functor for the transposed block-sparse multiply; accumulates alpha*A^T*X
// into Y. Each call to execute() handles one right-hand side (one column of X and Y).
template <class Scalar, class Ordinal, class DomainScalar, class RangeScalar>
struct DefaultBlockSparseMultiplyOp1Transpose {
  // mat data
  const Ordinal *rptr, *cptr;
  const size_t *bptr;
  const Ordinal *bindx, *indx;
  const Scalar  *vals;
  // matvec params
  RangeScalar        alpha;
  size_t numBlockRows;
  // mv data
  const DomainScalar  *x;
  RangeScalar         *y;
  size_t xstride, ystride;

  inline KERNEL_PREFIX void execute(const size_t myRHS) {
    // get pointers into X and Y for my assigned RHS
    const DomainScalar* xvec = x+myRHS*xstride;
    RangeScalar*        yvec = y+myRHS*ystride;
    for (size_t bR=0; bR<numBlockRows; ++bR) {
      // accumulate the transposed blocks of block-row bR times the bR-th block of xvec into yvec
      const DomainScalar* xx = &xvec[rptr[bR]];
      const Ordinal Nrows = rptr[bR+1]-rptr[bR];

      for (size_t b=bptr[bR]; b<bptr[bR+1]; ++b) {
        const Ordinal col = bindx[b];
        const Ordinal Ncols = cptr[col+1]-cptr[col];

        const Scalar* A = &vals[indx[b]];
        RangeScalar* yy = &yvec[cptr[col]];

        if (Nrows == Ncols) {
          switch(Nrows) {
          case 1: dense_matvec_1x1(alpha, A, xx, yy); break;
          case 2: dense_matvec_trans_2x2(alpha, A, xx, yy); break;
          case 3: dense_matvec_trans_3x3(alpha, A, xx, yy); break;
          case 4: dense_matvec_trans_4x4(alpha, A, xx, yy); break;
          default: densematvec_trans(Nrows, Ncols, alpha, A, xx, yy);
          }
        }
        else {
          densematvec_trans(Nrows,Ncols,alpha,A,xx,yy);
        }
      }
    }
  }
};

// Functor for the block-sparse triangular solve: find x such that A*x = y,
// where A is block lower- or upper-triangular. Each call to execute() solves
// for one vector of the multivector.
template <class Scalar, class Ordinal, class DomainScalar, class RangeScalar>
struct DefaultBlockSparseSolveOp1 {
  // mat data
  const Ordinal *rptr, *cptr;
  const size_t *bptr;
  const Ordinal *bindx, *indx;
  const Scalar  *vals;
  // solve params
  bool unitDiag, upper;
  size_t numBlockRows;
  // multivector data
  const RangeScalar  *y;
  DomainScalar       *x;
  size_t xstride, ystride;

  //find x such that A*x = y

  inline KERNEL_PREFIX void execute(size_t i) {
    //solve for i-th vector of multivector
    const RangeScalar* yvec = y+i*ystride;
    DomainScalar*      xvec = x+i*xstride;
    RangeScalar one = Teuchos::ScalarTraits<RangeScalar>::one();

    //copy y into x:
    Ordinal numPointRows = rptr[numBlockRows];
    for(Ordinal ptrow=0; ptrow<numPointRows; ++ptrow) xvec[ptrow] = yvec[ptrow];

    for(size_t r = 0; r < numBlockRows; ++r) {
      //if upper-triangular then loop from row N to 0 (bottom to top), but
      //if lower-triangular then loop from 0 to N (top to bottom):
      Ordinal row = upper ? numBlockRows-r-1 : r;

      const Ordinal numRowsInBlock = rptr[row+1]-rptr[row];
      DomainScalar* xx = &xvec[rptr[row]];

      // loop over the blocks in this row and do the multiplication
      for (size_t b=bptr[row]; b<bptr[row+1]; ++b) {
        //if upper-triangular then loop right-to-left (backwards) across
        //the row; but if lower-triangular then loop left-to-right:
        size_t blk = upper ? bptr[row+1]-(b-bptr[row])-1 : b;

        // get pointers into A and into the partially-solved x
        const Ordinal col = bindx[blk];
        const Ordinal numColsInBlock = cptr[col+1]-cptr[col];
        const Scalar* A = &vals[indx[blk]];
        DomainScalar* yy = &xvec[cptr[col]];
        if (col==row) {
          if (unitDiag) {
            densematvec_skipdiag(numRowsInBlock, -one, A, yy, xx);
          }
          else {
            if (upper) solve_block_upper(numRowsInBlock, A, xx);
            else solve_block_lower(numRowsInBlock, A, xx);
          }
        }
        else {
          // do the GEMV
          if (numRowsInBlock == numColsInBlock) {
            switch(numRowsInBlock) {
              case 1: dense_matvec_1x1(-one, A, yy, xx); break;
              case 2: dense_matvec_2x2(-one, A, yy, xx); break;
              case 3: dense_matvec_3x3(-one, A, yy, xx); break;
              case 4: dense_matvec_4x4(-one, A, yy, xx); break;
              default: densematvec(numRowsInBlock, numColsInBlock, -one, A, yy, xx);
            }
          }
          else {
            densematvec(numRowsInBlock,numColsInBlock,-one,A,yy,xx);
          }
        }
      }
    }
  }
};

// Functor for the transposed block-sparse triangular solve: find x such that
// A^T*x = y, where A is block lower- or upper-triangular. Each call to
// execute() solves for one vector of the multivector.
template <class Scalar, class Ordinal, class DomainScalar, class RangeScalar>
struct DefaultBlockSparseTransposeSolveOp1 {
  // mat data
  const Ordinal *rptr, *cptr;
  const size_t *bptr;
  const Ordinal *bindx, *indx;
  const Scalar  *vals;
  // solve params
  bool unitDiag, upper;
  size_t numBlockRows;
  // multivector data
  const RangeScalar  *y;
  DomainScalar       *x;
  size_t xstride, ystride;

  //find x such that A^T*x = y

  inline KERNEL_PREFIX void execute(size_t i) {
    //solve for i-th vector of multivector
    const RangeScalar* yvec = y+i*ystride;
    DomainScalar*      xvec = x+i*xstride;
    RangeScalar one = Teuchos::ScalarTraits<RangeScalar>::one();

    //copy y into x:
    Ordinal numPointRows = rptr[numBlockRows];
    for(Ordinal ptrow=0; ptrow<numPointRows; ++ptrow) xvec[ptrow] = yvec[ptrow];

    for(size_t r = 0; r < numBlockRows; ++r) {
      //for the transpose solve, if upper-triangular then
      //loop from row 0 to row N, but if lower-triangular then
      //loop from row N to row 0:
      Ordinal row = upper ? r : numBlockRows-r-1;

      const Ordinal numRowsInBlock = rptr[row+1]-rptr[row];
      DomainScalar* xx = &xvec[rptr[row]];

      // loop over the blocks in this row and do the multiplication
      for (size_t b=bptr[row]; b<bptr[row+1]; ++b) {
        //for the transpose solve, if upper-triangular then loop
        //left-to-right across the row, but if lower-triangular then
        //loop right-to-left across the row:
        size_t blk = upper ? b : bptr[row+1]-(b-bptr[row])-1;

        // get pointers into A and into the partially-solved x
        const Ordinal col = bindx[blk];
        const Ordinal numColsInBlock = cptr[col+1]-cptr[col];
        const Scalar* A = &vals[indx[blk]];
        DomainScalar* yy = &xvec[cptr[col]];
        if (col==row) {
          if (unitDiag) {
            densematvec_skipdiag(numRowsInBlock, -one, A, yy, xx);
          }
          else {
            if (upper) solve_block_upper_transpose(numRowsInBlock, A, xx);
            else solve_block_lower_transpose(numRowsInBlock, A, xx);
          }
        }
        else {
          // do the GEMV
          if (numRowsInBlock == numColsInBlock) {
            switch(numRowsInBlock) {
              case 1: dense_matvec_1x1(-one, A, xx, yy); break;
              case 2: dense_matvec_2x2(-one, A, xx, yy); break;
              case 3: dense_matvec_3x3(-one, A, xx, yy); break;
              case 4: dense_matvec_4x4(-one, A, xx, yy); break;
              default: densematvec(numRowsInBlock, numColsInBlock, -one, A, xx, yy);
            }
          }
          else {
            densematvec(numRowsInBlock,numColsInBlock,-one,A,xx,yy);
          }
        }
      }
    }
  }
};

} // namespace Kokkos

#endif