// Kokkos Node API and Local Linear Algebra Kernels: Version of the Day
// Kokkos_DefaultRelaxationKernelOps.hpp
00001 //@HEADER
00002 // ************************************************************************
00003 // 
00004 //          Kokkos: Node API and Parallel Node Kernels
00005 //              Copyright (2009) Sandia Corporation
00006 // 
00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
00008 // license for use of this work by or on behalf of the U.S. Government.
00009 // 
00010 // This library is free software; you can redistribute it and/or modify
00011 // it under the terms of the GNU Lesser General Public License as
00012 // published by the Free Software Foundation; either version 2.1 of the
00013 // License, or (at your option) any later version.
00014 //  
00015 // This library is distributed in the hope that it will be useful, but
00016 // WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 // Lesser General Public License for more details.
00019 //  
00020 // You should have received a copy of the GNU Lesser General Public
00021 // License along with this library; if not, write to the Free Software
00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00023 // USA
00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
00025 // 
00026 // ************************************************************************
00027 //@HEADER
00028 
00029 #ifndef KOKKOS_DEFAULTRELAXATION_KERNELOPS_HPP
00030 #define KOKKOS_DEFAULTRELAXATION_KERNELOPS_HPP
00031 
00032 #ifndef KERNEL_PREFIX
00033 #define KERNEL_PREFIX
00034 #endif
00035 
00036 #ifdef __CUDACC__
00037 #include <Teuchos_ScalarTraitsCUDA.hpp>
00038 #else
00039 #include <Teuchos_ScalarTraits.hpp>
00040 #endif
00041 
// TODO: Replace the Scalar divisions by diag[] below with multiplication by the
// appropriate precomputed inverse (see Teuchos::ScalarTraits) to avoid per-entry division.
00043 
00044 namespace Kokkos {
00045 
00046   // Extract Matrix Diagonal for Type 1 storage
00047   template <class Scalar, class Ordinal>
00048   struct ExtractDiagonalOp1 {
00049 
00050     // mat data
00051     const size_t  *offsets;
00052     const Ordinal *inds;
00053     const Scalar  *vals;
00054     Scalar * diag;
00055     size_t numRows;
00056 
00057 
00058     inline KERNEL_PREFIX void execute(size_t row) {
00059       for (size_t c=offsets[row];c<offsets[row+1];c++) {
00060         if(row==(size_t)inds[c]) {
00061           diag[row]=vals[c];
00062           break;
00063         }
00064       }
00065     }
00066   };
00067 
00068   // Extract Matrix Diagonal for Type 2 storage
00069   template <class Scalar, class Ordinal>
00070   struct ExtractDiagonalOp2 {
00071 
00072     // mat data
00073     const Ordinal * const * inds_beg;
00074     const Scalar  * const * vals_beg;
00075     const size_t  *         numEntries;
00076     Scalar * diag;
00077     size_t numRows;
00078 
00079     inline KERNEL_PREFIX void execute(size_t row) {
00080       const Scalar  *curval = vals_beg[row];
00081       const Ordinal *curind = inds_beg[row];
00082       for (size_t j=0;j<numEntries[row];j++) {
00083         if(row==(size_t)curind[j]){
00084           diag[row]=curval[j];
00085           break;
00086         }
00087       }
00088     }
00089   };
00090 
00091 
00092   /************************************************************************************/
00093   /********************************* Jacobi Kernels ***********************************/
00094   /************************************************************************************/
00095   // Jacobi for Type 1 storage.
00096   template <class Scalar, class Ordinal>
00097   struct DefaultJacobiOp1 {
00098     const size_t  *offsets;
00099     const Ordinal *inds;
00100     const Scalar  *vals;
00101     const Scalar  *diag;
00102     size_t numRows;
00103     // vector data (including multiple rhs)
00104     Scalar       *x;
00105     const Scalar *x0;
00106     const Scalar *b;
00107     Scalar damping_factor;
00108     size_t xstride, bstride;
00109 
00110     inline KERNEL_PREFIX void execute(size_t i) {
00111       const size_t row  = i % numRows;
00112       const size_t rhs  = (i - row) / numRows;
00113       Scalar       *xj  = x + rhs * xstride;
00114       const Scalar *x0j = x0 + rhs * xstride;
00115       const Scalar *bj  = b + rhs * bstride;
00116 
00117       Scalar tmp = bj[row];
00118       for (size_t c=offsets[row];c<offsets[row+1];c++) {
00119         tmp -= vals[c] * x0j[inds[c]];
00120       }
00121       xj[row]=x0j[row]+damping_factor*tmp/diag[row];
00122     }
00123   };
00124 
00125 
00126   // Jacobi for Type 2 storage.
00127   template <class Scalar, class Ordinal>
00128   struct DefaultJacobiOp2 {
00129     // mat data
00130     const Ordinal * const * inds_beg;
00131     const Scalar  * const * vals_beg;
00132     const size_t  *         numEntries;
00133     const Scalar  *diag;
00134     size_t numRows;
00135     // vector data (including multiple rhs)    
00136     Scalar        *x;
00137     const Scalar *x0;
00138     const Scalar  *b;
00139     Scalar damping_factor;
00140     size_t xstride, bstride;
00141 
00142     inline KERNEL_PREFIX void execute(size_t i) {
00143       const size_t row = i % numRows;
00144       const size_t rhs = (i - row) / numRows;
00145       Scalar       *xj = x + rhs * xstride;
00146       const Scalar *x0j = x0 + rhs * xstride;
00147       const Scalar *bj = b + rhs * bstride;
00148       Scalar tmp = bj[row];
00149       const Scalar  *curval = vals_beg[row];
00150       const Ordinal *curind = inds_beg[row];
00151       for (size_t j=0; j != numEntries[row]; ++j) {
00152         tmp -= (curval[j]) * x0j[curind[j]];
00153       }
00154       xj[row]=x0j[row]+damping_factor*tmp/diag[row];
00155     }
00156   };
00157 
00158   /************************************************************************************/
00159   /************************ Fine-Grain Gauss-Seidel Kernels ***************************/
00160   /************************************************************************************/
00161 
00162   // Fine-grain "hybrid" Gauss-Seidel for Type 1 storage.
00163   // Note: This is actually real Gauss-Seidel for a serial node, and hybrid for almost any other kind of node.
00164   template <class Scalar, class Ordinal>
00165   struct DefaultFineGrainHybridGaussSeidelOp1 {
00166     const size_t  *offsets;
00167     const Ordinal *inds;
00168     const Scalar  *vals;
00169     const Scalar  *diag;
00170     size_t numRows;
00171     // vector data (including multiple rhs)
00172     Scalar       *x;
00173     const Scalar *b;
00174     Scalar damping_factor;
00175     size_t xstride, bstride;
00176 
00177     inline KERNEL_PREFIX void execute(size_t i) {
00178       const size_t row = i % numRows;
00179       const size_t rhs = (i - row) / numRows;
00180       Scalar       *xj = x + rhs * xstride;
00181       const Scalar *bj = b + rhs * bstride;
00182       Scalar tmp = bj[row];
00183       for (size_t c=offsets[row];c<offsets[row+1];c++) {
00184         tmp -= vals[c] * xj[inds[c]];
00185       }
00186       xj[row]+=damping_factor*tmp/diag[row];
00187     }
00188   };
00189 
00190 
00191   // Fine-grain "hybrid" Gauss-Seidel for Type 2 storage.
00192   // Note: This is actually real Gauss-Seidel for a serial node, and hybrid for almost any other kind of node.
00193   template <class Scalar, class Ordinal>
00194   struct DefaultFineGrainHybridGaussSeidelOp2 {
00195     // mat data
00196     const Ordinal * const * inds_beg;
00197     const Scalar  * const * vals_beg;
00198     const size_t  *         numEntries;
00199     const Scalar  *diag;
00200     size_t numRows;
00201     // vector data (including multiple rhs)    
00202     Scalar        *x;
00203     const Scalar  *b;
00204     Scalar damping_factor;
00205     size_t xstride, bstride;
00206 
00207     inline KERNEL_PREFIX void execute(size_t i) {
00208       const size_t row = i % numRows;
00209       const size_t rhs = (i - row) / numRows;
00210       Scalar       *xj = x + rhs * xstride;
00211       const Scalar *bj = b + rhs * bstride;
00212       Scalar tmp = bj[row];
00213       const Scalar  *curval = vals_beg[row];
00214       const Ordinal *curind = inds_beg[row];
00215       for (size_t j=0; j != numEntries[row]; ++j) {
00216         tmp -= (curval[j]) * xj[curind[j]];
00217       }
00218       xj[row]+=damping_factor*tmp/diag[row];
00219     }
00220   };
00221 
00222 
00223   /************************************************************************************/
00224   /************************ Coarse-Grain Gauss-Seidel Kernels *************************/
00225   /************************************************************************************/
00226 
00227   // Coarse-grain "hybrid" Gauss-Seidel for Type 1 storage.
00228   // Note: This is actually real Gauss-Seidel for a serial node, and hybrid for almost any other kind of node.
00229   template <class Scalar, class Ordinal>
00230   struct DefaultCoarseGrainHybridGaussSeidelOp1 {
00231     const size_t  *offsets;
00232     const Ordinal *inds;
00233     const Scalar  *vals;
00234     const Scalar  *diag;
00235     size_t numRows;
00236     size_t numChunks;
00237     // vector data (including multiple rhs)
00238     Scalar       *x;
00239     const Scalar *b;
00240     Scalar damping_factor;
00241     size_t xstride, bstride;
00242 
00243     inline KERNEL_PREFIX void execute(size_t i) {
00244       const size_t chunk = i % numChunks;
00245       const size_t rhs = (i - chunk) / numChunks;
00246       const size_t start_r = chunk * numRows / numChunks;
00247       const size_t stop_r  = TEUCHOS_MIN((chunk+1)*numRows/numChunks,numRows);
00248       Scalar       *xj = x + rhs * xstride;
00249       const Scalar *bj = b + rhs * bstride;
00250       for (size_t row=start_r;row<stop_r;row++){
00251         Scalar tmp = bj[row];
00252         for (size_t c=offsets[row];c<offsets[row+1];c++) {
00253           tmp -= vals[c] * xj[inds[c]];
00254         }
00255         xj[row]+=damping_factor*tmp/diag[row];
00256       }
00257     }
00258   };
00259 
00260 
00261   // Coarse-grain "hybrid" Gauss-Seidel for Type 2 storage.
00262   // Note: This is actually real Gauss-Seidel for a serial node, and hybrid for almost any other kind of node.
00263   template <class Scalar, class Ordinal>
00264   struct DefaultCoarseGrainHybridGaussSeidelOp2 {
00265     // mat data
00266     const Ordinal * const * inds_beg;
00267     const Scalar  * const * vals_beg;
00268     const size_t  *         numEntries;
00269     const Scalar  *diag;
00270     size_t numRows;
00271     size_t numChunks;
00272     // vector data (including multiple rhs)    
00273     Scalar        *x;
00274     const Scalar  *b;
00275     Scalar damping_factor;
00276     size_t xstride, bstride;
00277 
00278     inline KERNEL_PREFIX void execute(size_t i) {
00279       const size_t chunk = i % numChunks;
00280       const size_t rhs = (i - chunk) / numChunks;
00281       const size_t start_r = chunk * numRows / numChunks;
00282       const size_t stop_r  = TEUCHOS_MIN((chunk+1)*numRows/numChunks,numRows);
00283       Scalar       *xj = x + rhs * xstride;
00284       const Scalar *bj = b + rhs * bstride;
00285       for (size_t row=start_r;row<stop_r;row++){
00286         Scalar tmp = bj[row];
00287         const Scalar  *curval = vals_beg[row];
00288         const Ordinal *curind = inds_beg[row];
00289         for (size_t j=0; j!=numEntries[row];j++) {
00290           tmp -= (curval[j]) * xj[curind[j]];
00291         }
00292         xj[row]+=damping_factor*tmp/diag[row];
00293       }
00294     }
00295   };
00296  
00297   /************************************************************************************/
00298   /******************************** Chebyshev Kernels *********************************/
00299   /************************************************************************************/
00300 
00301   template <class Scalar, class Ordinal>
00302   struct DefaultChebyshevOp1 {
00303     const size_t  *offsets;
00304     const Ordinal *inds;
00305     const Scalar  *vals;
00306     const Scalar  *diag;
00307     size_t numRows;
00308     // vector data (including multiple rhs)
00309     Scalar       *x,*w;
00310     const Scalar *x0,*b;
00311     Scalar oneOverTheta,dtemp1,dtemp2;
00312     size_t stride;
00313     bool first_step;
00314     bool zero_initial_guess;
00315 
00316     inline KERNEL_PREFIX void execute(size_t i) {
00317       const size_t row  = i % numRows;
00318       const size_t rhs  = (i - row) / numRows;
00319       Scalar       *xj  = x + rhs * stride;
00320       const Scalar *x0j = x0 + rhs * stride;
00321       Scalar       *wj  = w + rhs * stride;      
00322       const Scalar *bj  = b + rhs * stride;
00323       Scalar        vj;
00324 
00325       if(first_step){
00326   if(zero_initial_guess)
00327     // x= theta^{-1} D^{-1} b
00328     xj[row]=wj[row]=bj[row] / diag[row] *oneOverTheta;
00329   else{
00330     // v=Ax
00331     vj=Teuchos::ScalarTraits<Scalar>::zero();
00332     for (size_t c=offsets[row];c<offsets[row+1];c++) {
00333       vj += vals[c] * x0j[inds[c]];
00334     }
00335     // w=theta^{-1} D^{-1} (b -Ax)
00336     wj[row]=(bj[row]-vj)/diag[row]*oneOverTheta;
00337     // x+=w
00338     xj[row]+=wj[row];
00339   }
00340       }
00341       else{
00342   //v=Ax
00343   vj=Teuchos::ScalarTraits<Scalar>::zero();
00344   for (size_t c=offsets[row];c<offsets[row+1];c++) {
00345     vj += vals[c] * x0j[inds[c]];
00346   }
00347   // w=dtemp1*w +  D^{-1}*dtemp2*(b-Ax)
00348   wj[row]=dtemp1*wj[row]+dtemp2*(bj[row]-vj)/diag[row];
00349   // x+=w
00350   xj[row]+=wj[row];
00351       }
00352 
00353       //      printf("[%3d-%d] x=%11.4e v=%11.4e w=%11.4e x0=%11.4e\n",row,first_step,xj[row],vj,wj[row],x0j[row]);
00354 
00355     }
00356 
00357   };
00358 
00359 }// namespace Kokkos
00360 
00361 #endif /* KOKKOS_DEFAULTRELAXATION_KERNELOPS_HPP */