Kokkos Node API and Local Linear Algebra Kernels
Kokkos_DefaultSparseMultiplyKernelOps.hpp
//@HEADER
// ************************************************************************
//
//          Kokkos: Node API and Parallel Node Kernels
//              Copyright (2008) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
//
// ************************************************************************
//@HEADER

#ifndef KOKKOS_DEFAULTSPARSEMULTIPLY_KERNELOPS_HPP
#define KOKKOS_DEFAULTSPARSEMULTIPLY_KERNELOPS_HPP

#ifndef KERNEL_PREFIX
#define KERNEL_PREFIX
#endif

#include <Teuchos_ScalarTraits.hpp>

namespace Kokkos {

  template <class Scalar, class OffsetType, class Ordinal, class DomainScalar, class RangeScalar, int NO_BETA_AND_OVERWRITE>
  struct DefaultSparseMultiplyOp {
    // mat data
    const OffsetType *offs;
    const Ordinal    *inds;
    const Scalar     *vals;
    // matvec params
    RangeScalar        alpha, beta;
    Ordinal numRows;
    // mv data
    const DomainScalar  *x;
    RangeScalar         *y;
    Ordinal numRHS, xstride, ystride;

    // CrT 27Jul12: performance improvement 2x-8x.
    // Changed the "while" loop to a "for" loop,
    // Reversed the order of the loops (the outer loop is now over RHS vectors, the inner loop over the row),
    // Introduced temporary variables,
    // Unrolled the loop over RHS vectors by 4, and
    // Added specific cases for 1, 2, 3, and 4 RHS vectors.
    //
    // Unrolling and specialization are invisible to code that calls
    // DefaultSparseMultiplyOp, since these operations are handled via
    // an if-else within the execute function.  Without unrolling one
    // would lose about 1/3 in performance (on Intel Sandy Bridge).
    //
    // When using the Intel compiler, the code below may be vectorized
    // through the "always vectorize" directive #pragma simd (see
    // comments in the code).  The resulting performance gain on Intel
    // Sandy Bridge is 10-20%.  However, this currently only works if
    // the temporary variable tmp (of type RangeScalar) is made a
    // double/float explicitly.  This could be because the Intel
    // compiler (most recent version, not yet released) does not know
    // to ignore the SIMD reduction directive when the template
    // parameter type does not support the SIMD vector instructions.
    // This results in a compile error.  One workaround would be to
    // specialize DefaultSparseMultiplyOp for various RangeScalar
    // types, and only insert the pragma in the specialized versions.
    //
    // mfh (27 Jul 2012): Note that with the Intel compiler, #pragma
    // simd vectorizes the loop according to the "fast" floating-point
    // model (as in -fp-model=fast), even if that is not the
    // prevailing model of the rest of the compilation unit.  Sparse
    // matrix-vector multiply is a less sensitive kernel than others,
    // so this should not be a risky optimization in terms of
    // accuracy.  The main concern here is that the loop may flush
    // denormal floating-point values to zero.  For a discussion, see,
    // e.g.,
    // http://software.intel.com/en-us/articles/consistency-of-floating-point-results-using-the-intel-compiler/
    //
    // mfh (27 Jul 2012): Reviewed and briefly edited changes and comments.
    inline KERNEL_PREFIX void execute(size_t row) {
      const OffsetType start = offs[row];
      const OffsetType end = offs[row+1];

      // CrT: Unroll by 4 over numRHS; specialize for numRHS <= 4.
      if (numRHS > 4) {
        Ordinal j = 0;
        if (alpha == Teuchos::ScalarTraits<RangeScalar>::one()) {
          // Strip-mined portion of the loop over RHS vectors.
          for (; j < numRHS - 3; j+=4) {
            RangeScalar* const ys = y+j*ystride + row;
            RangeScalar tmp[4] = {
              Teuchos::ScalarTraits<RangeScalar>::zero(),
              Teuchos::ScalarTraits<RangeScalar>::zero(),
              Teuchos::ScalarTraits<RangeScalar>::zero(),
              Teuchos::ScalarTraits<RangeScalar>::zero()
            };

            // CrT: Adding the pragma below improves performance by
            // 15% on Intel SandyBridge, but with the (new beta)
            // version of the Intel compiler that I tested, this
            // doesn't work when RangeScalar is a template parameter.
            // You have to change "RangeScalar tmp;" to "double tmp;".
            //
            //#pragma simd reduction(+: tmp)
            for (OffsetType i = start; i < end; i++) {
              const RangeScalar Aij = (RangeScalar)vals[i];
              const DomainScalar* const xs = x+j*xstride + inds[i];
              tmp[0] += Aij * (RangeScalar)xs[0 * xstride];
              tmp[1] += Aij * (RangeScalar)xs[1 * xstride];
              tmp[2] += Aij * (RangeScalar)xs[2 * xstride];
              tmp[3] += Aij * (RangeScalar)xs[3 * xstride];
            }
            // The compiler should remove the branch, since the test
            // value is a compile-time constant.
            if (NO_BETA_AND_OVERWRITE) {
              ys[0 * ystride] = tmp[0];
              ys[1 * ystride] = tmp[1];
              ys[2 * ystride] = tmp[2];
              ys[3 * ystride] = tmp[3];
            } else {
              ys[0 * ystride] = tmp[0] + beta * ys[0 * ystride];
              ys[1 * ystride] = tmp[1] + beta * ys[1 * ystride];
              ys[2 * ystride] = tmp[2] + beta * ys[2 * ystride];
              ys[3 * ystride] = tmp[3] + beta * ys[3 * ystride];
            }
          }
          // Clean-up portion of the loop over RHS vectors.
          for (; j < numRHS; ++j) {
            RangeScalar* const ys = y+j*ystride + row;
            const DomainScalar* const xs = x+j*xstride;
            RangeScalar tmp = Teuchos::ScalarTraits<RangeScalar>::zero();

            // CrT: Adding the pragma below improves performance by
            // 15% on Intel SandyBridge, but with the (new beta)
            // version of the Intel compiler that I tested, this
            // doesn't work when RangeScalar is a template parameter.
            // You have to change "RangeScalar tmp;" to "double tmp;".
            //
            //#pragma simd reduction(+: tmp)
            for (OffsetType i = start; i < end; i++) {
              tmp += (RangeScalar)vals[i] * (RangeScalar)xs[inds[i]];
            }
            // The compiler should remove the branch, since the test
            // value is a compile-time constant.
            if (NO_BETA_AND_OVERWRITE)
              ys[0] = tmp;
            else
              ys[0] = tmp + beta * ys[0];
          }
        }
        else { // alpha != one
          for (; j < numRHS - 3; j += 4) {
            RangeScalar* const ys = y+j*ystride + row;
            RangeScalar tmp[4] = {
              Teuchos::ScalarTraits<RangeScalar>::zero(),
              Teuchos::ScalarTraits<RangeScalar>::zero(),
              Teuchos::ScalarTraits<RangeScalar>::zero(),
              Teuchos::ScalarTraits<RangeScalar>::zero()
            };

            // CrT: +15% on SandyBridge but only if you change "RangeScalar tmp;" to "double tmp;"
            //#pragma simd reduction(+: tmp)
            for (OffsetType i = start; i < end; i++) {
              const RangeScalar Aij = (RangeScalar)vals[i];
              const DomainScalar* const xs = x+j*xstride + inds[i];
              tmp[0] += Aij * (RangeScalar)xs[0 * xstride];
              tmp[1] += Aij * (RangeScalar)xs[1 * xstride];
              tmp[2] += Aij * (RangeScalar)xs[2 * xstride];
              tmp[3] += Aij * (RangeScalar)xs[3 * xstride];
            }

            tmp[0] *= alpha;
            tmp[1] *= alpha;
            tmp[2] *= alpha;
            tmp[3] *= alpha;

            if (NO_BETA_AND_OVERWRITE) {
              ys[0 * ystride] = tmp[0];
              ys[1 * ystride] = tmp[1];
              ys[2 * ystride] = tmp[2];
              ys[3 * ystride] = tmp[3];
            } else {
              ys[0 * ystride] = tmp[0] + beta * ys[0 * ystride];
              ys[1 * ystride] = tmp[1] + beta * ys[1 * ystride];
              ys[2 * ystride] = tmp[2] + beta * ys[2 * ystride];
              ys[3 * ystride] = tmp[3] + beta * ys[3 * ystride];
            }
          }
          for (; j < numRHS; ++j) {
            RangeScalar* const ys = y+j*ystride + row;
            const DomainScalar* const xs = x+j*xstride;
            RangeScalar tmp = Teuchos::ScalarTraits<RangeScalar>::zero();

            // CrT: +15% on SandyBridge but only if you change "RangeScalar tmp;" to "double tmp;"
            //#pragma simd reduction(+: tmp)
            for (OffsetType i = start; i < end; i++) {
              tmp += (RangeScalar)vals[i] * (RangeScalar)xs[inds[i]];
            }
            tmp *= alpha;
            if (NO_BETA_AND_OVERWRITE)
              ys[0] = tmp;
            else
              ys[0] = tmp + beta * ys[0];
          }
        }
      }
      else if (numRHS == 1) {
        RangeScalar tmp = Teuchos::ScalarTraits<RangeScalar>::zero();

        //+15% on SandyBridge but only if you change "RangeScalar tmp;" to "double tmp;"
        //#pragma simd reduction(+: tmp)
        for (OffsetType i = start; i < end; i++) {
          tmp += (RangeScalar)vals[i] * (RangeScalar)x[inds[i]];
        }
        if (alpha != Teuchos::ScalarTraits<RangeScalar>::one())
          tmp *= alpha;
        if (NO_BETA_AND_OVERWRITE)
          y[row] = tmp;
        else
          y[row] = tmp + beta * y[row];
      }
      else if (numRHS == 2) {
        RangeScalar* const ys = y + row;
        RangeScalar tmp[2] = {
          Teuchos::ScalarTraits<RangeScalar>::zero(),
          Teuchos::ScalarTraits<RangeScalar>::zero()
        };

        //+15% on SandyBridge but only if you change "RangeScalar tmp;" to "double tmp;"
        //#pragma simd reduction(+: tmp)
        for (OffsetType i = start; i < end; i++) {
          const RangeScalar Aij = (RangeScalar)vals[i];
          const DomainScalar* const xs = x + inds[i];
          tmp[0] += Aij * (RangeScalar)xs[0 * xstride];
          tmp[1] += Aij * (RangeScalar)xs[1 * xstride];
        }
        if (alpha != Teuchos::ScalarTraits<RangeScalar>::one()) {
          tmp[0] *= alpha;
          tmp[1] *= alpha;
        }
        if (NO_BETA_AND_OVERWRITE) {
          ys[0 * ystride] = tmp[0];
          ys[1 * ystride] = tmp[1];
        } else {
          ys[0 * ystride] = tmp[0] + beta * ys[0 * ystride];
          ys[1 * ystride] = tmp[1] + beta * ys[1 * ystride];
        }
      }
      else if (numRHS == 3) {
        RangeScalar* const ys = y + row;
        RangeScalar tmp[3] = {
          Teuchos::ScalarTraits<RangeScalar>::zero(),
          Teuchos::ScalarTraits<RangeScalar>::zero(),
          Teuchos::ScalarTraits<RangeScalar>::zero()
        };

        //+15% on SandyBridge but only if you change "RangeScalar tmp;" to "double tmp;"
        //#pragma simd reduction(+: tmp)
        for (OffsetType i = start; i < end; i++) {
          const RangeScalar Aij = (RangeScalar)vals[i];
          const DomainScalar* const xs = x + inds[i];
          tmp[0] += Aij * (RangeScalar)xs[0 * xstride];
          tmp[1] += Aij * (RangeScalar)xs[1 * xstride];
          tmp[2] += Aij * (RangeScalar)xs[2 * xstride];
        }
        if (alpha != Teuchos::ScalarTraits<RangeScalar>::one()) {
          tmp[0] *= alpha;
          tmp[1] *= alpha;
          tmp[2] *= alpha;
        }
        if (NO_BETA_AND_OVERWRITE) {
          ys[0 * ystride] = tmp[0];
          ys[1 * ystride] = tmp[1];
          ys[2 * ystride] = tmp[2];
        } else {
          ys[0 * ystride] = tmp[0] + beta * ys[0 * ystride];
          ys[1 * ystride] = tmp[1] + beta * ys[1 * ystride];
          ys[2 * ystride] = tmp[2] + beta * ys[2 * ystride];
        }
      }
      else { // if (numRHS == 4)
        RangeScalar* const ys = y + row;
        RangeScalar tmp[4] = {
          Teuchos::ScalarTraits<RangeScalar>::zero(),
          Teuchos::ScalarTraits<RangeScalar>::zero(),
          Teuchos::ScalarTraits<RangeScalar>::zero(),
          Teuchos::ScalarTraits<RangeScalar>::zero()
        };

        //+15% on SandyBridge but only if you change "RangeScalar tmp;" to "double tmp;"
        //#pragma simd reduction(+: tmp)
        for (OffsetType i = start; i < end; i++) {
          const RangeScalar Aij = (RangeScalar)vals[i];
          const DomainScalar* const xs = x + inds[i];
          tmp[0] += Aij * (RangeScalar)xs[0 * xstride];
          tmp[1] += Aij * (RangeScalar)xs[1 * xstride];
          tmp[2] += Aij * (RangeScalar)xs[2 * xstride];
          tmp[3] += Aij * (RangeScalar)xs[3 * xstride];
        }
        if (alpha != Teuchos::ScalarTraits<RangeScalar>::one()) {
          tmp[0] *= alpha;
          tmp[1] *= alpha;
          tmp[2] *= alpha;
          tmp[3] *= alpha;
        }
        if (NO_BETA_AND_OVERWRITE) {
          ys[0 * ystride] = tmp[0];
          ys[1 * ystride] = tmp[1];
          ys[2 * ystride] = tmp[2];
          ys[3 * ystride] = tmp[3];
        } else {
          ys[0 * ystride] = tmp[0] + beta * ys[0 * ystride];
          ys[1 * ystride] = tmp[1] + beta * ys[1 * ystride];
          ys[2 * ystride] = tmp[2] + beta * ys[2 * ystride];
          ys[3 * ystride] = tmp[3] + beta * ys[3 * ystride];
        }
      }
    }
  };
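
  // A minimal usage sketch of DefaultSparseMultiplyOp, assuming a caller that
  // invokes execute() once per row (the Node API normally does this from a
  // node-parallel loop over rows).  The CSR data, vectors, and the choice of
  // template arguments below are a hypothetical example only, not part of
  // this file's interface.
  //
  //   // A = [[2, 0], [1, 3]] in CSR form; compute y = A * x with one RHS.
  //   const size_t offs[3] = {0, 1, 3};
  //   const int    inds[3] = {0, 0, 1};
  //   const double vals[3] = {2.0, 1.0, 3.0};
  //   const double x[2] = {1.0, 1.0};
  //   double       y[2] = {0.0, 0.0};
  //
  //   Kokkos::DefaultSparseMultiplyOp<double, size_t, int, double, double, 1> op;
  //   op.offs = offs;  op.inds = inds;  op.vals = vals;
  //   op.alpha = 1.0;  op.beta = 0.0;   op.numRows = 2;
  //   op.x = x;  op.y = y;  op.numRHS = 1;  op.xstride = 2;  op.ystride = 2;
  //   for (size_t row = 0; row < 2; ++row) {
  //     op.execute (row);  // y becomes {2.0, 4.0}
  //   }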


  // mfh 15 June 2012: I added the ScalarIsComplex Boolean template parameter to
  // avoid build errors due to attempts to cast from Scalar=int to
  // RangeScalar=std::complex<double>.  This additional template parameter is a
  // standard technique to specialize code for the real or complex arithmetic
  // case, when attempts to write a single implementation for both cases would
  // result in syntax errors.  The template parameter refers to RangeScalar,
  // because we have to decide whether the RangeScalar constructor takes one or
  // two arguments.
  //
  // If you wish to optimize further, you might like to add another Boolean
  // template parameter for whether Scalar is real or complex.  First, this
  // would let you avoid calling Teuchos::ScalarTraits<Scalar>::conjugate().
  // This should inline to nothing, but not all compilers are good at inlining.
  // Second, this would simplify the code for the case when RangeScalar and
  // Scalar are both complex.  However, this second Boolean template parameter
  // is not necessary for correctness.
  template <class Scalar, // the type of entries in the sparse matrix
            class OffsetType, // the type of the row offsets
            class Ordinal, // the type of indices in the sparse matrix
            class DomainScalar, // the type of entries in the input (multi)vector
            class RangeScalar, // the type of entries in the output (multi)vector
            int NO_BETA_AND_OVERWRITE,
            bool RangeScalarIsComplex=Teuchos::ScalarTraits<RangeScalar>::isComplex> // whether RangeScalar is complex valued.
  struct DefaultSparseTransposeMultiplyOp;
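
  // A brief sketch of how the default Boolean argument selects one of the two
  // partial specializations below.  The instantiations shown are hypothetical
  // examples, assuming Teuchos::ScalarTraits is specialized for the types used:
  //
  //   // RangeScalar = double: ScalarTraits<double>::isComplex is false,
  //   // so this names the real-valued specialization.
  //   Kokkos::DefaultSparseTransposeMultiplyOp<double, size_t, int, double, double, 1> opReal;
  //
  //   // RangeScalar = std::complex<double>: isComplex is true,
  //   // so this names the complex-valued specialization.
  //   Kokkos::DefaultSparseTransposeMultiplyOp<double, size_t, int,
  //     std::complex<double>, std::complex<double>, 1> opCplx;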


  // Partial specialization for when RangeScalar is real (not complex) valued.
  template <class Scalar, class OffsetType, class Ordinal, class DomainScalar, class RangeScalar, int NO_BETA_AND_OVERWRITE>
  struct DefaultSparseTransposeMultiplyOp<Scalar, OffsetType, Ordinal, DomainScalar, RangeScalar, NO_BETA_AND_OVERWRITE, false> {
    // mat data
    const OffsetType *offs;
    const Ordinal    *inds;
    const Scalar     *vals;
    // matvec params
    RangeScalar        alpha, beta;
    Ordinal numRows, numCols;
    // mv data
    const DomainScalar  *x;
    RangeScalar         *y;
    Ordinal numRHS, xstride, ystride;

    inline void execute() {
      using Teuchos::ScalarTraits;

      if (NO_BETA_AND_OVERWRITE) {
        for (Ordinal j=0; j<numRHS; ++j) {
          RangeScalar *yp = y+j*ystride;
          for (Ordinal row=0; row<numCols; ++row) {
            yp[row] = ScalarTraits<RangeScalar>::zero();
          }
        }
      }
      else {
        for (Ordinal j=0; j<numRHS; ++j) {
          RangeScalar *yp = y+j*ystride;
          for (Ordinal row=0; row<numCols; ++row) {
            yp[row] *= beta;
          }
        }
      }
      // save the extra multiplication if possible
      if (alpha == ScalarTraits<RangeScalar>::one()) {
        for (Ordinal colAt=0; colAt < numRows; ++colAt) {
          const Scalar  *v  = vals + offs[colAt];
          const Ordinal *i  = inds + offs[colAt];
          const Ordinal *ie = inds + offs[colAt+1];
          // sparse outer product: AT[:,colAt] * X[ind]
          while (i != ie) {
            const  Scalar val = ScalarTraits<Scalar>::conjugate (*v++);
            const Ordinal ind = *i++;
            for (Ordinal j = 0; j < numRHS; ++j) {
              // mfh 15 June 2012: Casting Scalar to RangeScalar may produce a
              // build warning, e.g., if Scalar is int and RangeScalar is
              // double.  The way to get it to work is not to rely on casting
              // Scalar to RangeScalar.  Instead, rely on C++'s type promotion
              // rules.  For now, we just cast the input vector's value to
              // RangeScalar, to handle the common iterative refinement case of
              // an output vector with higher precision.
              y[j*ystride+ind] += val * RangeScalar (x[j*xstride+colAt]);
            }
          }
        }
      }
      else { // alpha != one
        for (Ordinal colAt=0; colAt < numRows; ++colAt) {
          const Scalar  *v  = vals + offs[colAt];
          const Ordinal *i  = inds + offs[colAt];
          const Ordinal *ie = inds + offs[colAt+1];
          // sparse outer product: AT[:,colAt] * X[ind]
          while (i != ie) {
            const  Scalar val = ScalarTraits<Scalar>::conjugate (*v++);
            const Ordinal ind = *i++;
            for (Ordinal j=0; j<numRHS; ++j) {
              // mfh 15 June 2012: See notes above about build warnings when
              // casting val from Scalar to RangeScalar.
              y[j*ystride+ind] += alpha * val * RangeScalar (x[j*xstride+colAt]);
            }
          }
        }
      }
    }
  };
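
  // A minimal usage sketch of the real-valued specialization, assuming the
  // caller invokes execute() once for the whole product: unlike
  // DefaultSparseMultiplyOp, this op loops over all rows internally and
  // scatters into y, so it is not called per row.  It reuses the hypothetical
  // CSR data (offs, inds, vals, x, y) from the sketch after
  // DefaultSparseMultiplyOp above; the example is illustrative only.
  //
  //   // Same A = [[2, 0], [1, 3]]; compute y = A^T * x with one RHS.
  //   Kokkos::DefaultSparseTransposeMultiplyOp<double, size_t, int, double, double, 1> op;
  //   op.offs = offs;  op.inds = inds;  op.vals = vals;
  //   op.alpha = 1.0;  op.beta = 0.0;
  //   op.numRows = 2;  op.numCols = 2;
  //   op.x = x;  op.y = y;  op.numRHS = 1;  op.xstride = 2;  op.ystride = 2;
  //   op.execute ();  // y becomes A^T * x = {3.0, 3.0}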


  // Partial specialization for when RangeScalar is complex valued.  The most
  // common case is that RangeScalar is a specialization of std::complex, but
  // this is not necessarily the case.  For example, CUDA has its own complex
  // arithmetic type, which is necessary because std::complex's methods are not
  // marked as device methods.  This code assumes that RangeScalar, being
  // complex valued, has a constructor which takes two arguments, each of which
  // can be converted to Teuchos::ScalarTraits<RangeScalar>::magnitudeType.
  template <class Scalar, class OffsetType, class Ordinal, class DomainScalar, class RangeScalar, int NO_BETA_AND_OVERWRITE>
  struct DefaultSparseTransposeMultiplyOp<Scalar, OffsetType, Ordinal, DomainScalar, RangeScalar, NO_BETA_AND_OVERWRITE, true> {
    // mat data
    const OffsetType *offs;
    const Ordinal    *inds;
    const Scalar     *vals;
    // matvec params
    RangeScalar        alpha, beta;
    Ordinal numRows, numCols;
    // mv data
    const DomainScalar  *x;
    RangeScalar         *y;
    Ordinal numRHS, xstride, ystride;

    inline void execute() {
      using Teuchos::ScalarTraits;
      typedef typename ScalarTraits<RangeScalar>::magnitudeType RSMT;
      if (NO_BETA_AND_OVERWRITE) {
        for (Ordinal j=0; j<numRHS; ++j) {
          RangeScalar *yp = y+j*ystride;
          for (Ordinal row=0; row<numCols; ++row) {
            yp[row] = ScalarTraits<RangeScalar>::zero();
          }
        }
      }
      else {
        for (Ordinal j=0; j<numRHS; ++j) {
          RangeScalar *yp = y+j*ystride;
          for (Ordinal row=0; row<numCols; ++row) {
            yp[row] *= beta;
          }
        }
      }
      // save the extra multiplication if possible
      if (alpha == ScalarTraits<RangeScalar>::one()) {
        for (Ordinal colAt=0; colAt < numRows; ++colAt) {
          const Scalar  *v  = vals + offs[colAt];
          const Ordinal *i  = inds + offs[colAt];
          const Ordinal *ie = inds + offs[colAt+1];
          // sparse outer product: AT[:,colAt] * X[ind]
          while (i != ie) {
            const  Scalar val = ScalarTraits<Scalar>::conjugate (*v++);
            const Ordinal ind = *i++;
            for (Ordinal j = 0; j < numRHS; ++j) {
              // mfh 15 June 2012: Casting Scalar to RangeScalar via a
              // static_cast won't work if Scalar is int and RangeScalar is
              // std::complex<double>.  (This is valid code.)  This is because
              // std::complex<double> doesn't have a constructor that takes one
              // int argument.  Furthermore, the C++ standard library doesn't
              // define operator*(int, std::complex<double>), so we can't rely
              // on C++'s type promotion rules.  However, any reasonable complex
              // arithmetic type should have a two-argument constructor that
              // takes arguments convertible to RSMT
              // (ScalarTraits<RangeScalar>::magnitudeType), and we should also
              // be able to cast from Scalar to RSMT.
              //
              // The mess with taking the real and imaginary components of val
              // is because Scalar could be either real or complex, but the
              // two-argument constructor of RangeScalar expects two real
              // arguments.  You can get rid of this by adding another template
              // parameter for whether Scalar is real or complex.
              y[j*ystride+ind] += RangeScalar (RSMT (ScalarTraits<Scalar>::real (val)),
                                               RSMT (ScalarTraits<Scalar>::imag (val))) *
                RangeScalar (x[j*xstride+colAt]);
            }
          }
        }
      }
      else { // alpha != one
        for (Ordinal colAt=0; colAt < numRows; ++colAt) {
          const Scalar  *v  = vals + offs[colAt];
          const Ordinal *i  = inds + offs[colAt];
          const Ordinal *ie = inds + offs[colAt+1];
          // sparse outer product: AT[:,colAt] * X[ind]
          while (i != ie) {
            const  Scalar val = ScalarTraits<Scalar>::conjugate (*v++);
            const Ordinal ind = *i++;
            for (Ordinal j=0; j<numRHS; ++j) {
              // mfh 15 June 2012: See notes above about it sometimes being
              // invalid to cast val from Scalar to RangeScalar.
              y[j*ystride+ind] += alpha *
                RangeScalar (RSMT (ScalarTraits<Scalar>::real (val)),
                             RSMT (ScalarTraits<Scalar>::imag (val))) *
                RangeScalar (x[j*xstride+colAt]);
            }
          }
        }
      }
    }
  };

} // namespace Kokkos

#endif