Kokkos Node API and Local Linear Algebra Kernels Version of the Day
Kokkos_Raw_SparseMatVec_def.hpp
Go to the documentation of this file.
00001 //@HEADER
00002 // ************************************************************************
00003 // 
00004 //          Kokkos: Node API and Parallel Node Kernels
00005 //              Copyright (2008) Sandia Corporation
00006 // 
00007 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
00008 // the U.S. Government retains certain rights in this software.
00009 // 
00010 // Redistribution and use in source and binary forms, with or without
00011 // modification, are permitted provided that the following conditions are
00012 // met:
00013 //
00014 // 1. Redistributions of source code must retain the above copyright
00015 // notice, this list of conditions and the following disclaimer.
00016 //
00017 // 2. Redistributions in binary form must reproduce the above copyright
00018 // notice, this list of conditions and the following disclaimer in the
00019 // documentation and/or other materials provided with the distribution.
00020 //
00021 // 3. Neither the name of the Corporation nor the names of the
00022 // contributors may be used to endorse or promote products derived from
00023 // this software without specific prior written permission.
00024 //
00025 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
00026 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00027 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00028 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
00029 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00030 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00031 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00032 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00033 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00034 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00035 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00036 //
00037 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
00038 // 
00039 // ************************************************************************
00040 //@HEADER
00041 
00042 #ifndef __Kokkos_Raw_SparseMatVec_def_hpp
00043 #define __Kokkos_Raw_SparseMatVec_def_hpp
00044 
00050 
00051 namespace Kokkos {
00052 namespace Raw {
00053 
00054 template<class Ordinal,
00055          class MatrixScalar,
00056          class DomainScalar,
00057          class RangeScalar>
00058 void
00059 matVecCscColMajor (
00060   const Ordinal numRows,
00061   const Ordinal numCols,
00062   const Ordinal numVecs,
00063   const RangeScalar& beta,
00064   RangeScalar Y[],
00065   const Ordinal colStrideY,
00066   const RangeScalar alpha,
00067   const Ordinal ptr[],
00068   const Ordinal ind[],
00069   const MatrixScalar val[],
00070   const DomainScalar X[],
00071   const Ordinal colStrideX)
00072 {
00073   typedef Teuchos::ScalarTraits<RangeScalar> STS;
00074 
00075   // Prescale: Y := beta * Y.
00076   if (beta == STS::zero()) {
00077     for (Ordinal j = 0; j < numVecs; ++j) {
00078       RangeScalar* const Y_j = &Y[j*colStrideY];
00079       for (Ordinal i = 0; i < numRows; ++i) {
00080         // Follow the Sparse BLAS convention for beta == 0. 
00081         Y_j[i] = STS::zero();
00082       }
00083     }
00084   }
00085   else if (beta != STS::one()) {
00086     for (Ordinal j = 0; j < numVecs; ++j) {
00087       RangeScalar* const Y_j = &Y[j*colStrideY];
00088       for (Ordinal i = 0; i < numRows; ++i) {
00089         Y_j[i] = beta * Y_j[i];
00090       }
00091     }
00092   }
00093   Ordinal j = 0;
00094   if (alpha == STS::zero()) {
00095     return; // Our work is done!
00096   }
00097   const Ordinal nnz = ptr[numCols];
00098   if (alpha == STS::one()) {
00099     for (Ordinal k = 0; k < nnz; ++k) {
00100       const MatrixScalar A_ij = val[k];
00101       const Ordinal i = ind[k];
00102       while (k >= ptr[j+1]) {
00103         ++j;
00104       }
00105       RangeScalar* const Y_i = &Y[i];
00106       const DomainScalar* const X_j = &X[j];
00107       for (Ordinal c = 0; c < numVecs; ++c) {
00108         Y_i[c*colStrideY] += A_ij * X_j[c*colStrideX];
00109       }
00110     }
00111   }
00112   else { // alpha != STS::one()
00113     for (Ordinal k = 0; k < nnz; ++k) {
00114       const MatrixScalar A_ij = val[k];
00115       const Ordinal i = ind[k];
00116       while (k >= ptr[j+1]) {
00117         ++j;
00118       }
00119       RangeScalar* const Y_i = &Y[i];
00120       const DomainScalar* const X_j = &X[j];
00121       for (Ordinal c = 0; c < numVecs; ++c) {
00122         Y_i[c*colStrideY] += alpha * A_ij * X_j[c*colStrideX];
00123       }
00124     }
00125   }
00126 }
00127 
00128 template<class Ordinal,
00129          class MatrixScalar,
00130          class DomainScalar,
00131          class RangeScalar>
00132 void
00133 matVecCscColMajor4Unrolled (
00134   const Ordinal numRows,
00135   const Ordinal numCols,
00136   const Ordinal numVecs,
00137   const RangeScalar& beta,
00138   RangeScalar Y[],
00139   const Ordinal colStrideY,
00140   const RangeScalar alpha,
00141   const Ordinal ptr[],
00142   const Ordinal ind[],
00143   const MatrixScalar val[],
00144   const DomainScalar X[],
00145   const Ordinal colStrideX)
00146 {
00147   typedef Teuchos::ScalarTraits<RangeScalar> STS;
00148 
00149   // Prescale: Y := beta * Y.
00150   if (beta == STS::zero()) {
00151     for (Ordinal j = 0; j < numVecs; ++j) {
00152       RangeScalar* const Y_j = &Y[j*colStrideY];
00153       for (Ordinal i = 0; i < numRows; ++i) {
00154         // Follow the Sparse BLAS convention for beta == 0. 
00155         Y_j[i] = STS::zero();
00156       }
00157     }
00158   }
00159   else if (beta != STS::one()) {
00160     for (Ordinal j = 0; j < numVecs; ++j) {
00161       RangeScalar* const Y_j = &Y[j*colStrideY];
00162       for (Ordinal i = 0; i < numRows; ++i) {
00163         Y_j[i] = beta * Y_j[i];
00164       }
00165     }
00166   }
00167   Ordinal j = 0;
00168   if (alpha == STS::zero()) {
00169     return; // Our work is done!
00170   }
00171   const Ordinal nnz = ptr[numCols];
00172   if (alpha == STS::one()) {
00173     for (Ordinal k = 0; k < nnz; ++k) {
00174       const MatrixScalar A_ij = val[k];
00175       const Ordinal i = ind[k];
00176       while (k >= ptr[j+1]) {
00177         ++j;
00178       }
00179       RangeScalar* const Y_i = &Y[i];
00180       const DomainScalar* const X_j = &X[j];
00181       Ordinal c = 0;
00182       // Extra +1 in loop bound ensures first 4 iterations get
00183       // strip-mined, but requires that Ordinal be a signed type.
00184       for ( ; c < numVecs - 3; c += 4) {
00185         Y_i[0] += A_ij * X_j[0];
00186         Y_i[colStrideY] += A_ij * X_j[colStrideX];
00187         Y_i[2*colStrideY] += A_ij * X_j[2*colStrideX];
00188         Y_i[3*colStrideY] += A_ij * X_j[3*colStrideX];
00189       }
00190       for ( ; c < numVecs; ++c) {
00191         Y_i[c*colStrideY] += A_ij * X_j[c*colStrideX];
00192       }
00193     }
00194   }
00195   else { // alpha != STS::one()
00196     for (Ordinal k = 0; k < nnz; ++k) {
00197       const MatrixScalar A_ij = val[k];
00198       const Ordinal i = ind[k];
00199       while (k >= ptr[j+1]) {
00200         ++j;
00201       }
00202       RangeScalar* const Y_i = &Y[i];
00203       const DomainScalar* const X_j = &X[j];
00204       Ordinal c = 0;
00205       // Extra +1 in loop bound ensures first 4 iterations get
00206       // strip-mined, but requires that Ordinal be a signed type.
00207       for ( ; c < numVecs - 3; c += 4) {
00208         Y_i[0] += alpha * A_ij * X_j[0];
00209         Y_i[colStrideY] += alpha * A_ij * X_j[colStrideX];
00210         Y_i[2*colStrideY] += alpha * A_ij * X_j[2*colStrideX];
00211         Y_i[3*colStrideY] += alpha * A_ij * X_j[3*colStrideX];
00212       }
00213       for ( ; c < numVecs; ++c) {
00214         Y_i[c*colStrideY] += alpha * A_ij * X_j[c*colStrideX];
00215       }
00216     }
00217   }
00218 }
00219 
00220 template<class Ordinal,
00221          class MatrixScalar,
00222          class DomainScalar,
00223          class RangeScalar>
00224 void
00225 matVecCscColMajor1Vec (
00226   const Ordinal numRows,
00227   const Ordinal numCols,
00228   const RangeScalar& beta,
00229   RangeScalar Y[],
00230   const Ordinal colStrideY,
00231   const RangeScalar alpha,
00232   const Ordinal ptr[],
00233   const Ordinal ind[],
00234   const MatrixScalar val[],
00235   const DomainScalar X[],
00236   const Ordinal colStrideX)
00237 {
00238   typedef Teuchos::ScalarTraits<RangeScalar> STS;
00239 
00240   const Ordinal numVecs = 1;
00241   // Prescale: Y := beta * Y.
00242   if (beta == STS::zero()) {
00243     for (Ordinal j = 0; j < numVecs; ++j) {
00244       RangeScalar* const Y_j = &Y[j*colStrideY];
00245       for (Ordinal i = 0; i < numRows; ++i) {
00246         // Follow the Sparse BLAS convention for beta == 0. 
00247         Y_j[i] = STS::zero();
00248       }
00249     }
00250   }
00251   else if (beta != STS::one()) {
00252     for (Ordinal j = 0; j < numVecs; ++j) {
00253       RangeScalar* const Y_j = &Y[j*colStrideY];
00254       for (Ordinal i = 0; i < numRows; ++i) {
00255         Y_j[i] = beta * Y_j[i];
00256       }
00257     }
00258   }
00259   Ordinal j = 0;
00260   if (alpha == STS::zero()) {
00261     return; // Our work is done!
00262   }
00263   const Ordinal nnz = ptr[numCols];
00264   if (alpha == STS::one()) {
00265     for (Ordinal k = 0; k < nnz; ++k) {
00266       const MatrixScalar A_ij = val[k];
00267       const Ordinal i = ind[k];
00268       while (k >= ptr[j+1]) {
00269         ++j;
00270       }
00271       Y[i] += A_ij * X[j];
00272     }
00273   }
00274   else { // alpha != STS::one()
00275     for (Ordinal k = 0; k < nnz; ++k) {
00276       const MatrixScalar A_ij = val[k];
00277       const Ordinal i = ind[k];
00278       while (k >= ptr[j+1]) {
00279         ++j;
00280       }
00281       Y[i] += alpha * A_ij * X[j];
00282     }
00283   }
00284 }
00285 
00286 template<class Ordinal,
00287          class MatrixScalar,
00288          class DomainScalar,
00289          class RangeScalar>
00290 void
00291 matVecCscColMajor2Vec (
00292   const Ordinal numRows,
00293   const Ordinal numCols,
00294   const RangeScalar& beta,
00295   RangeScalar Y[],
00296   const Ordinal colStrideY,
00297   const RangeScalar alpha,
00298   const Ordinal ptr[],
00299   const Ordinal ind[],
00300   const MatrixScalar val[],
00301   const DomainScalar X[],
00302   const Ordinal colStrideX)
00303 {
00304   typedef Teuchos::ScalarTraits<RangeScalar> STS;
00305 
00306   const Ordinal numVecs = 2;
00307   // Prescale: Y := beta * Y.
00308   if (beta == STS::zero()) {
00309     for (Ordinal j = 0; j < numVecs; ++j) {
00310       RangeScalar* const Y_j = &Y[j*colStrideY];
00311       for (Ordinal i = 0; i < numRows; ++i) {
00312         // Follow the Sparse BLAS convention for beta == 0. 
00313         Y_j[i] = STS::zero();
00314       }
00315     }
00316   }
00317   else if (beta != STS::one()) {
00318     for (Ordinal j = 0; j < numVecs; ++j) {
00319       RangeScalar* const Y_j = &Y[j*colStrideY];
00320       for (Ordinal i = 0; i < numRows; ++i) {
00321         Y_j[i] = beta * Y_j[i];
00322       }
00323     }
00324   }
00325   Ordinal j = 0;
00326   if (alpha == STS::zero()) {
00327     return; // Our work is done!
00328   }
00329   const Ordinal nnz = ptr[numCols];
00330   if (alpha == STS::one()) {
00331     for (Ordinal k = 0; k < nnz; ++k) {
00332       const MatrixScalar A_ij = val[k];
00333       const Ordinal i = ind[k];
00334       while (k >= ptr[j+1]) {
00335         ++j;
00336       }
00337       RangeScalar* const Y_i = &Y[i];
00338       const DomainScalar* const X_j = &X[j];
00339       Y_i[0] += A_ij * X_j[0];
00340       Y_i[colStrideY] += A_ij * X_j[colStrideX];
00341     }
00342   }
00343   else { // alpha != STS::one()
00344     for (Ordinal k = 0; k < nnz; ++k) {
00345       const MatrixScalar A_ij = val[k];
00346       const Ordinal i = ind[k];
00347       while (k >= ptr[j+1]) {
00348         ++j;
00349       }
00350       RangeScalar* const Y_i = &Y[i];
00351       const DomainScalar* const X_j = &X[j];
00352       Y_i[0] += alpha * A_ij * X_j[0];
00353       Y_i[colStrideY] += alpha * A_ij * X_j[colStrideX];
00354     }
00355   }
00356 }
00357 
00358 template<class Ordinal,
00359          class MatrixScalar,
00360          class DomainScalar,
00361          class RangeScalar>
00362 void
00363 matVecCscColMajor3Vec (
00364   const Ordinal numRows,
00365   const Ordinal numCols,
00366   const RangeScalar& beta,
00367   RangeScalar Y[],
00368   const Ordinal colStrideY,
00369   const RangeScalar alpha,
00370   const Ordinal ptr[],
00371   const Ordinal ind[],
00372   const MatrixScalar val[],
00373   const DomainScalar X[],
00374   const Ordinal colStrideX)
00375 {
00376   typedef Teuchos::ScalarTraits<RangeScalar> STS;
00377 
00378   const Ordinal numVecs = 3;
00379   // Prescale: Y := beta * Y.
00380   if (beta == STS::zero()) {
00381     for (Ordinal j = 0; j < numVecs; ++j) {
00382       RangeScalar* const Y_j = &Y[j*colStrideY];
00383       for (Ordinal i = 0; i < numRows; ++i) {
00384         // Follow the Sparse BLAS convention for beta == 0. 
00385         Y_j[i] = STS::zero();
00386       }
00387     }
00388   }
00389   else if (beta != STS::one()) {
00390     for (Ordinal j = 0; j < numVecs; ++j) {
00391       RangeScalar* const Y_j = &Y[j*colStrideY];
00392       for (Ordinal i = 0; i < numRows; ++i) {
00393         Y_j[i] = beta * Y_j[i];
00394       }
00395     }
00396   }
00397   Ordinal j = 0;
00398   if (alpha == STS::zero()) {
00399     return; // Our work is done!
00400   }
00401   const Ordinal nnz = ptr[numCols];
00402   if (alpha == STS::one()) {
00403     for (Ordinal k = 0; k < nnz; ++k) {
00404       const MatrixScalar A_ij = val[k];
00405       const Ordinal i = ind[k];
00406       while (k >= ptr[j+1]) {
00407         ++j;
00408       }
00409       RangeScalar* const Y_i = &Y[i];
00410       const DomainScalar* const X_j = &X[j];
00411       Y_i[0] += A_ij * X_j[0];
00412       Y_i[colStrideY] += A_ij * X_j[colStrideX];
00413       Y_i[2*colStrideY] += A_ij * X_j[2*colStrideX];
00414     }
00415   }
00416   else { // alpha != STS::one()
00417     for (Ordinal k = 0; k < nnz; ++k) {
00418       const MatrixScalar A_ij = val[k];
00419       const Ordinal i = ind[k];
00420       while (k >= ptr[j+1]) {
00421         ++j;
00422       }
00423       RangeScalar* const Y_i = &Y[i];
00424       const DomainScalar* const X_j = &X[j];
00425       Y_i[0] += alpha * A_ij * X_j[0];
00426       Y_i[colStrideY] += alpha * A_ij * X_j[colStrideX];
00427       Y_i[2*colStrideY] += alpha * A_ij * X_j[2*colStrideX];
00428     }
00429   }
00430 }
00431 
00432 template<class Ordinal,
00433          class MatrixScalar,
00434          class DomainScalar,
00435          class RangeScalar>
00436 void
00437 matVecCscColMajor4Vec (
00438   const Ordinal numRows,
00439   const Ordinal numCols,
00440   const RangeScalar& beta,
00441   RangeScalar Y[],
00442   const Ordinal colStrideY,
00443   const RangeScalar alpha,
00444   const Ordinal ptr[],
00445   const Ordinal ind[],
00446   const MatrixScalar val[],
00447   const DomainScalar X[],
00448   const Ordinal colStrideX)
00449 {
00450   typedef Teuchos::ScalarTraits<RangeScalar> STS;
00451 
00452   const Ordinal numVecs = 4;
00453   // Prescale: Y := beta * Y.
00454   if (beta == STS::zero()) {
00455     for (Ordinal j = 0; j < numVecs; ++j) {
00456       RangeScalar* const Y_j = &Y[j*colStrideY];
00457       for (Ordinal i = 0; i < numRows; ++i) {
00458         // Follow the Sparse BLAS convention for beta == 0. 
00459         Y_j[i] = STS::zero();
00460       }
00461     }
00462   }
00463   else if (beta != STS::one()) {
00464     for (Ordinal j = 0; j < numVecs; ++j) {
00465       RangeScalar* const Y_j = &Y[j*colStrideY];
00466       for (Ordinal i = 0; i < numRows; ++i) {
00467         Y_j[i] = beta * Y_j[i];
00468       }
00469     }
00470   }
00471   Ordinal j = 0;
00472   if (alpha == STS::zero()) {
00473     return; // Our work is done!
00474   }
00475   const Ordinal nnz = ptr[numCols];
00476   if (alpha == STS::one()) {
00477     for (Ordinal k = 0; k < nnz; ++k) {
00478       const MatrixScalar A_ij = val[k];
00479       const Ordinal i = ind[k];
00480       while (k >= ptr[j+1]) {
00481         ++j;
00482       }
00483       RangeScalar* const Y_i = &Y[i];
00484       const DomainScalar* const X_j = &X[j];
00485       Y_i[0] += A_ij * X_j[0];
00486       Y_i[colStrideY] += A_ij * X_j[colStrideX];
00487       Y_i[2*colStrideY] += A_ij * X_j[2*colStrideX];
00488       Y_i[3*colStrideY] += A_ij * X_j[3*colStrideX];
00489     }
00490   }
00491   else { // alpha != STS::one()
00492     for (Ordinal k = 0; k < nnz; ++k) {
00493       const MatrixScalar A_ij = val[k];
00494       const Ordinal i = ind[k];
00495       while (k >= ptr[j+1]) {
00496         ++j;
00497       }
00498       RangeScalar* const Y_i = &Y[i];
00499       const DomainScalar* const X_j = &X[j];
00500       Y_i[0] += alpha * A_ij * X_j[0];
00501       Y_i[colStrideY] += alpha * A_ij * X_j[colStrideX];
00502       Y_i[2*colStrideY] += alpha * A_ij * X_j[2*colStrideX];
00503       Y_i[3*colStrideY] += alpha * A_ij * X_j[3*colStrideX];
00504     }
00505   }
00506 }
00507 
00508 template<class Ordinal,
00509          class MatrixScalar,
00510          class DomainScalar,
00511          class RangeScalar>
00512 void
00513 matVecCsrColMajor (
00514   const Ordinal numRows,
00515   const Ordinal numCols,
00516   const Ordinal numVecs,
00517   const RangeScalar& beta,
00518   RangeScalar Y[],
00519   const Ordinal colStrideY,
00520   const RangeScalar alpha,
00521   const Ordinal ptr[],
00522   const Ordinal ind[],
00523   const MatrixScalar val[],
00524   const DomainScalar X[],
00525   const Ordinal colStrideX)
00526 {
00527   typedef Teuchos::ScalarTraits<RangeScalar> STS;
00528 
00529   Ordinal i = 0;
00530   // Special case for CSR only: Y(0,:) = 0.
00531   if (beta != STS::zero()) {
00532     for (Ordinal c = 0; c < numVecs; ++c) {
00533       Y[c*colStrideY] = beta * Y[c*colStrideY];
00534     }
00535   }
00536   else {
00537     // Follow the Sparse BLAS convention for beta == 0. 
00538     for (Ordinal c = 0; c < numVecs; ++c) {
00539       Y[c*colStrideY] = STS::zero();
00540     }
00541   }
00542   if (alpha == STS::zero()) {
00543     return; // Our work is done!
00544   }
00545   const Ordinal nnz = ptr[numRows];
00546   if (alpha == STS::one()) {
00547     for (Ordinal k = 0; k < nnz; ++k) {
00548       const MatrixScalar A_ij = val[k];
00549       const Ordinal j = ind[k];
00550       while (k >= ptr[i+1]) {
00551         ++i;
00552         // We haven't seen row i before; scale Y(i,:) by beta.
00553         RangeScalar* const Y_i = &Y[i];
00554         for (Ordinal c = 0; c < numVecs; ++c) {
00555           Y_i[c*colStrideY] = beta * Y_i[c*colStrideY];
00556         }
00557       }
00558       RangeScalar* const Y_i = &Y[i];
00559       const DomainScalar* const X_j = &X[j];
00560       for (Ordinal c = 0; c < numVecs; ++c) {
00561         Y_i[c*colStrideY] += A_ij * X_j[c*colStrideX];
00562       }
00563     }
00564   }
00565   else { // alpha != STS::one()
00566     for (Ordinal k = 0; k < nnz; ++k) {
00567       const MatrixScalar A_ij = val[k];
00568       const Ordinal j = ind[k];
00569       while (k >= ptr[i+1]) {
00570         ++i;
00571         // We haven't seen row i before; scale Y(i,:) by beta.
00572         RangeScalar* const Y_i = &Y[i];
00573         for (Ordinal c = 0; c < numVecs; ++c) {
00574           Y_i[c*colStrideY] = beta * Y_i[c*colStrideY];
00575         }
00576       }
00577       RangeScalar* const Y_i = &Y[i];
00578       const DomainScalar* const X_j = &X[j];
00579       for (Ordinal c = 0; c < numVecs; ++c) {
00580         Y_i[c*colStrideY] += alpha * A_ij * X_j[c*colStrideX];
00581       }
00582     }
00583   }
00584 }
00585 
00586 template<class Ordinal,
00587          class MatrixScalar,
00588          class DomainScalar,
00589          class RangeScalar>
00590 void
00591 matVecCsrColMajor4Unrolled (
00592   const Ordinal numRows,
00593   const Ordinal numCols,
00594   const Ordinal numVecs,
00595   const RangeScalar& beta,
00596   RangeScalar Y[],
00597   const Ordinal colStrideY,
00598   const RangeScalar alpha,
00599   const Ordinal ptr[],
00600   const Ordinal ind[],
00601   const MatrixScalar val[],
00602   const DomainScalar X[],
00603   const Ordinal colStrideX)
00604 {
00605   typedef Teuchos::ScalarTraits<RangeScalar> STS;
00606 
00607   Ordinal i = 0;
00608   // Special case for CSR only: Y(0,:) = 0.
00609   if (beta != STS::zero()) {
00610     for (Ordinal c = 0; c < numVecs; ++c) {
00611       Y[c*colStrideY] = beta * Y[c*colStrideY];
00612     }
00613   }
00614   else {
00615     // Follow the Sparse BLAS convention for beta == 0. 
00616     for (Ordinal c = 0; c < numVecs; ++c) {
00617       Y[c*colStrideY] = STS::zero();
00618     }
00619   }
00620   if (alpha == STS::zero()) {
00621     return; // Our work is done!
00622   }
00623   const Ordinal nnz = ptr[numRows];
00624   if (alpha == STS::one()) {
00625     for (Ordinal k = 0; k < nnz; ++k) {
00626       const MatrixScalar A_ij = val[k];
00627       const Ordinal j = ind[k];
00628       while (k >= ptr[i+1]) {
00629         ++i;
00630         // We haven't seen row i before; scale Y(i,:) by beta.
00631         RangeScalar* const Y_i = &Y[i];
00632         Ordinal c = 0;
00633         // Extra +1 in loop bound ensures first 4 iterations get
00634         // strip-mined, but requires that Ordinal be a signed type.
00635         for ( ; c < numVecs - 3; c += 4) {
00636           Y_i[0] *= beta;
00637           Y_i[colStrideY] *= beta;
00638           Y_i[2*colStrideY] *= beta;
00639           Y_i[3*colStrideY] *= beta;
00640         }
00641         for ( ; c < numVecs; ++c) {
00642           Y_i[c*colStrideY] *= beta;
00643         }
00644       }
00645       RangeScalar* const Y_i = &Y[i];
00646       const DomainScalar* const X_j = &X[j];
00647       Ordinal c = 0;
00648       // Extra +1 in loop bound ensures first 4 iterations get
00649       // strip-mined, but requires that Ordinal be a signed type.
00650       for ( ; c < numVecs - 3; c += 4) {
00651         Y_i[0] += A_ij * X_j[0];
00652         Y_i[colStrideY] += A_ij * X_j[colStrideX];
00653         Y_i[2*colStrideY] += A_ij * X_j[2*colStrideX];
00654         Y_i[3*colStrideY] += A_ij * X_j[3*colStrideX];
00655       }
00656       for ( ; c < numVecs; ++c) {
00657         Y_i[c*colStrideY] += A_ij * X_j[c*colStrideX];
00658       }
00659     }
00660   }
00661   else { // alpha != STS::one()
00662     for (Ordinal k = 0; k < nnz; ++k) {
00663       const MatrixScalar A_ij = val[k];
00664       const Ordinal j = ind[k];
00665       while (k >= ptr[i+1]) {
00666         ++i;
00667         // We haven't seen row i before; scale Y(i,:) by beta.
00668         RangeScalar* const Y_i = &Y[i];
00669         Ordinal c = 0;
00670         // Extra +1 in loop bound ensures first 4 iterations get
00671         // strip-mined, but requires that Ordinal be a signed type.
00672         for ( ; c < numVecs - 3; c += 4) {
00673           Y_i[0] *= beta;
00674           Y_i[colStrideY] *= beta;
00675           Y_i[2*colStrideY] *= beta;
00676           Y_i[3*colStrideY] *= beta;
00677         }
00678         for ( ; c < numVecs; ++c) {
00679           Y_i[c*colStrideY] *= beta;
00680         }
00681       }
00682       RangeScalar* const Y_i = &Y[i];
00683       const DomainScalar* const X_j = &X[j];
00684       Ordinal c = 0;
00685       // Extra +1 in loop bound ensures first 4 iterations get
00686       // strip-mined, but requires that Ordinal be a signed type.
00687       for ( ; c < numVecs - 3; c += 4) {
00688         Y_i[0] += alpha * A_ij * X_j[0];
00689         Y_i[colStrideY] += alpha * A_ij * X_j[colStrideX];
00690         Y_i[2*colStrideY] += alpha * A_ij * X_j[2*colStrideX];
00691         Y_i[3*colStrideY] += alpha * A_ij * X_j[3*colStrideX];
00692       }
00693       for ( ; c < numVecs; ++c) {
00694         Y_i[c*colStrideY] += alpha * A_ij * X_j[c*colStrideX];
00695       }
00696     }
00697   }
00698 }
00699 
00700 template<class Ordinal,
00701          class MatrixScalar,
00702          class DomainScalar,
00703          class RangeScalar>
00704 void
00705 matVecCsrColMajor1Vec (
00706   const Ordinal numRows,
00707   const Ordinal numCols,
00708   const RangeScalar& beta,
00709   RangeScalar Y[],
00710   const Ordinal colStrideY,
00711   const RangeScalar alpha,
00712   const Ordinal ptr[],
00713   const Ordinal ind[],
00714   const MatrixScalar val[],
00715   const DomainScalar X[],
00716   const Ordinal colStrideX)
00717 {
00718   typedef Teuchos::ScalarTraits<RangeScalar> STS;
00719 
00720   const Ordinal numVecs = 1;
00721   Ordinal i = 0;
00722   // Special case for CSR only: Y(0,:) = 0.
00723   if (beta != STS::zero()) {
00724     for (Ordinal c = 0; c < numVecs; ++c) {
00725       Y[c*colStrideY] = beta * Y[c*colStrideY];
00726     }
00727   }
00728   else {
00729     // Follow the Sparse BLAS convention for beta == 0. 
00730     for (Ordinal c = 0; c < numVecs; ++c) {
00731       Y[c*colStrideY] = STS::zero();
00732     }
00733   }
00734   if (alpha == STS::zero()) {
00735     return; // Our work is done!
00736   }
00737   const Ordinal nnz = ptr[numRows];
00738   if (alpha == STS::one()) {
00739     for (Ordinal k = 0; k < nnz; ++k) {
00740       const MatrixScalar A_ij = val[k];
00741       const Ordinal j = ind[k];
00742       while (k >= ptr[i+1]) {
00743         ++i;
00744         // We haven't seen row i before; scale Y(i,:) by beta.
00745         Y[i] *= beta;
00746       }
00747       Y[i] += A_ij * X[j];
00748     }
00749   }
00750   else { // alpha != STS::one()
00751     for (Ordinal k = 0; k < nnz; ++k) {
00752       const MatrixScalar A_ij = val[k];
00753       const Ordinal j = ind[k];
00754       while (k >= ptr[i+1]) {
00755         ++i;
00756         // We haven't seen row i before; scale Y(i,:) by beta.
00757         Y[i] *= beta;
00758       }
00759       Y[i] += alpha * A_ij * X[j];
00760     }
00761   }
00762 }
00763 
00764 template<class Ordinal,
00765          class MatrixScalar,
00766          class DomainScalar,
00767          class RangeScalar>
00768 void
00769 matVecCsrColMajor2Vec (
00770   const Ordinal numRows,
00771   const Ordinal numCols,
00772   const RangeScalar& beta,
00773   RangeScalar Y[],
00774   const Ordinal colStrideY,
00775   const RangeScalar alpha,
00776   const Ordinal ptr[],
00777   const Ordinal ind[],
00778   const MatrixScalar val[],
00779   const DomainScalar X[],
00780   const Ordinal colStrideX)
00781 {
00782   typedef Teuchos::ScalarTraits<RangeScalar> STS;
00783 
00784   const Ordinal numVecs = 2;
00785   Ordinal i = 0;
00786   // Special case for CSR only: Y(0,:) = 0.
00787   if (beta != STS::zero()) {
00788     for (Ordinal c = 0; c < numVecs; ++c) {
00789       Y[c*colStrideY] = beta * Y[c*colStrideY];
00790     }
00791   }
00792   else {
00793     // Follow the Sparse BLAS convention for beta == 0. 
00794     for (Ordinal c = 0; c < numVecs; ++c) {
00795       Y[c*colStrideY] = STS::zero();
00796     }
00797   }
00798   if (alpha == STS::zero()) {
00799     return; // Our work is done!
00800   }
00801   const Ordinal nnz = ptr[numRows];
00802   if (alpha == STS::one()) {
00803     for (Ordinal k = 0; k < nnz; ++k) {
00804       const MatrixScalar A_ij = val[k];
00805       const Ordinal j = ind[k];
00806       while (k >= ptr[i+1]) {
00807         ++i;
00808         // We haven't seen row i before; scale Y(i,:) by beta.
00809         RangeScalar* const Y_i = &Y[i];
00810         Y_i[0] *= beta;
00811         Y_i[colStrideY] *= beta;
00812       }
00813       RangeScalar* const Y_i = &Y[i];
00814       const DomainScalar* const X_j = &X[j];
00815       Y_i[0] += A_ij * X_j[0];
00816       Y_i[colStrideY] += A_ij * X_j[colStrideX];
00817     }
00818   }
00819   else { // alpha != STS::one()
00820     for (Ordinal k = 0; k < nnz; ++k) {
00821       const MatrixScalar A_ij = val[k];
00822       const Ordinal j = ind[k];
00823       while (k >= ptr[i+1]) {
00824         ++i;
00825         // We haven't seen row i before; scale Y(i,:) by beta.
00826         RangeScalar* const Y_i = &Y[i];
00827         Y_i[0] *= beta;
00828         Y_i[colStrideY] *= beta;
00829       }
00830       RangeScalar* const Y_i = &Y[i];
00831       const DomainScalar* const X_j = &X[j];
00832       Y_i[0] += alpha * A_ij * X_j[0];
00833       Y_i[colStrideY] += alpha * A_ij * X_j[colStrideX];
00834     }
00835   }
00836 }
00837 
00838 template<class Ordinal,
00839          class MatrixScalar,
00840          class DomainScalar,
00841          class RangeScalar>
00842 void
00843 matVecCsrColMajor3Vec (
00844   const Ordinal numRows,
00845   const Ordinal numCols,
00846   const RangeScalar& beta,
00847   RangeScalar Y[],
00848   const Ordinal colStrideY,
00849   const RangeScalar alpha,
00850   const Ordinal ptr[],
00851   const Ordinal ind[],
00852   const MatrixScalar val[],
00853   const DomainScalar X[],
00854   const Ordinal colStrideX)
00855 {
00856   typedef Teuchos::ScalarTraits<RangeScalar> STS;
00857 
00858   const Ordinal numVecs = 3;
00859   Ordinal i = 0;
00860   // Special case for CSR only: Y(0,:) = 0.
00861   if (beta != STS::zero()) {
00862     for (Ordinal c = 0; c < numVecs; ++c) {
00863       Y[c*colStrideY] = beta * Y[c*colStrideY];
00864     }
00865   }
00866   else {
00867     // Follow the Sparse BLAS convention for beta == 0. 
00868     for (Ordinal c = 0; c < numVecs; ++c) {
00869       Y[c*colStrideY] = STS::zero();
00870     }
00871   }
00872   if (alpha == STS::zero()) {
00873     return; // Our work is done!
00874   }
00875   const Ordinal nnz = ptr[numRows];
00876   if (alpha == STS::one()) {
00877     for (Ordinal k = 0; k < nnz; ++k) {
00878       const MatrixScalar A_ij = val[k];
00879       const Ordinal j = ind[k];
00880       while (k >= ptr[i+1]) {
00881         ++i;
00882         // We haven't seen row i before; scale Y(i,:) by beta.
00883         RangeScalar* const Y_i = &Y[i];
00884         Y_i[0] *= beta;
00885         Y_i[colStrideY] *= beta;
00886         Y_i[2*colStrideY] *= beta;
00887       }
00888       RangeScalar* const Y_i = &Y[i];
00889       const DomainScalar* const X_j = &X[j];
00890       Y_i[0] += A_ij * X_j[0];
00891       Y_i[colStrideY] += A_ij * X_j[colStrideX];
00892       Y_i[2*colStrideY] += A_ij * X_j[2*colStrideX];
00893     }
00894   }
00895   else { // alpha != STS::one()
00896     for (Ordinal k = 0; k < nnz; ++k) {
00897       const MatrixScalar A_ij = val[k];
00898       const Ordinal j = ind[k];
00899       while (k >= ptr[i+1]) {
00900         ++i;
00901         // We haven't seen row i before; scale Y(i,:) by beta.
00902         RangeScalar* const Y_i = &Y[i];
00903         Y_i[0] *= beta;
00904         Y_i[colStrideY] *= beta;
00905         Y_i[2*colStrideY] *= beta;
00906       }
00907       RangeScalar* const Y_i = &Y[i];
00908       const DomainScalar* const X_j = &X[j];
00909       Y_i[0] += alpha * A_ij * X_j[0];
00910       Y_i[colStrideY] += alpha * A_ij * X_j[colStrideX];
00911       Y_i[2*colStrideY] += alpha * A_ij * X_j[2*colStrideX];
00912     }
00913   }
00914 }
00915 
00916 template<class Ordinal,
00917          class MatrixScalar,
00918          class DomainScalar,
00919          class RangeScalar>
00920 void
00921 matVecCsrColMajor4Vec (
00922   const Ordinal numRows,
00923   const Ordinal numCols,
00924   const RangeScalar& beta,
00925   RangeScalar Y[],
00926   const Ordinal colStrideY,
00927   const RangeScalar alpha,
00928   const Ordinal ptr[],
00929   const Ordinal ind[],
00930   const MatrixScalar val[],
00931   const DomainScalar X[],
00932   const Ordinal colStrideX)
00933 {
00934   typedef Teuchos::ScalarTraits<RangeScalar> STS;
00935 
00936   const Ordinal numVecs = 4;
00937   Ordinal i = 0;
00938   // Special case for CSR only: Y(0,:) = 0.
00939   if (beta != STS::zero()) {
00940     for (Ordinal c = 0; c < numVecs; ++c) {
00941       Y[c*colStrideY] = beta * Y[c*colStrideY];
00942     }
00943   }
00944   else {
00945     // Follow the Sparse BLAS convention for beta == 0. 
00946     for (Ordinal c = 0; c < numVecs; ++c) {
00947       Y[c*colStrideY] = STS::zero();
00948     }
00949   }
00950   if (alpha == STS::zero()) {
00951     return; // Our work is done!
00952   }
00953   const Ordinal nnz = ptr[numRows];
00954   if (alpha == STS::one()) {
00955     for (Ordinal k = 0; k < nnz; ++k) {
00956       const MatrixScalar A_ij = val[k];
00957       const Ordinal j = ind[k];
00958       while (k >= ptr[i+1]) {
00959         ++i;
00960         // We haven't seen row i before; scale Y(i,:) by beta.
00961         RangeScalar* const Y_i = &Y[i];
00962         Y_i[0] *= beta;
00963         Y_i[colStrideY] *= beta;
00964         Y_i[2*colStrideY] *= beta;
00965         Y_i[3*colStrideY] *= beta;
00966       }
00967       RangeScalar* const Y_i = &Y[i];
00968       const DomainScalar* const X_j = &X[j];
00969       Y_i[0] += A_ij * X_j[0];
00970       Y_i[colStrideY] += A_ij * X_j[colStrideX];
00971       Y_i[2*colStrideY] += A_ij * X_j[2*colStrideX];
00972       Y_i[3*colStrideY] += A_ij * X_j[3*colStrideX];
00973     }
00974   }
00975   else { // alpha != STS::one()
00976     for (Ordinal k = 0; k < nnz; ++k) {
00977       const MatrixScalar A_ij = val[k];
00978       const Ordinal j = ind[k];
00979       while (k >= ptr[i+1]) {
00980         ++i;
00981         // We haven't seen row i before; scale Y(i,:) by beta.
00982         RangeScalar* const Y_i = &Y[i];
00983         Y_i[0] *= beta;
00984         Y_i[colStrideY] *= beta;
00985         Y_i[2*colStrideY] *= beta;
00986         Y_i[3*colStrideY] *= beta;
00987       }
00988       RangeScalar* const Y_i = &Y[i];
00989       const DomainScalar* const X_j = &X[j];
00990       Y_i[0] += alpha * A_ij * X_j[0];
00991       Y_i[colStrideY] += alpha * A_ij * X_j[colStrideX];
00992       Y_i[2*colStrideY] += alpha * A_ij * X_j[2*colStrideX];
00993       Y_i[3*colStrideY] += alpha * A_ij * X_j[3*colStrideX];
00994     }
00995   }
00996 }
00997 
00998 template<class Ordinal,
00999          class MatrixScalar,
01000          class DomainScalar,
01001          class RangeScalar>
01002 void
01003 matVecCscRowMajor (
01004   const Ordinal numRows,
01005   const Ordinal numCols,
01006   const Ordinal numVecs,
01007   const RangeScalar& beta,
01008   RangeScalar Y[],
01009   const Ordinal rowStrideY,
01010   const RangeScalar alpha,
01011   const Ordinal ptr[],
01012   const Ordinal ind[],
01013   const MatrixScalar val[],
01014   const DomainScalar X[],
01015   const Ordinal rowStrideX)
01016 {
01017   typedef Teuchos::ScalarTraits<RangeScalar> STS;
01018 
01019   // Prescale: Y := beta * Y.
01020   if (beta == STS::zero()) {
01021     for (Ordinal i = 0; i < numRows; ++i) {
01022       RangeScalar* const Y_i = &Y[i*rowStrideY];
01023       for (Ordinal j = 0; j < numVecs; ++j) {
01024         // Follow the Sparse BLAS convention for beta == 0. 
01025         Y_i[j] = STS::zero();
01026       }
01027     }
01028   }
01029   else if (beta != STS::one()) {
01030     for (Ordinal i = 0; i < numRows; ++i) {
01031       RangeScalar* const Y_i = &Y[i*rowStrideY];
01032       for (Ordinal j = 0; j < numVecs; ++j) {
01033         Y_i[j] = beta * Y_i[j];
01034       }
01035     }
01036   }
01037   Ordinal j = 0;
01038   if (alpha == STS::zero()) {
01039     return; // Our work is done!
01040   }
01041   const Ordinal nnz = ptr[numCols];
01042   if (alpha == STS::one()) {
01043     for (Ordinal k = 0; k < nnz; ++k) {
01044       const MatrixScalar A_ij = val[k];
01045       const Ordinal i = ind[k];
01046       while (k >= ptr[j+1]) {
01047         ++j;
01048       }
01049       RangeScalar* const Y_i = &Y[i*rowStrideY];
01050       const DomainScalar* const X_j = &X[j*rowStrideX];
01051       for (Ordinal c = 0; c < numVecs; ++c) {
01052         Y_i[c] += A_ij * X_j[c];
01053       }
01054     }
01055   }
01056   else { // alpha != STS::one()
01057     for (Ordinal k = 0; k < nnz; ++k) {
01058       const MatrixScalar A_ij = val[k];
01059       const Ordinal i = ind[k];
01060       while (k >= ptr[j+1]) {
01061         ++j;
01062       }
01063       RangeScalar* const Y_i = &Y[i*rowStrideY];
01064       const DomainScalar* const X_j = &X[j*rowStrideX];
01065       for (Ordinal c = 0; c < numVecs; ++c) {
01066         Y_i[c] += alpha * A_ij * X_j[c];
01067       }
01068     }
01069   }
01070 }
01071 
01072 template<class Ordinal,
01073          class MatrixScalar,
01074          class DomainScalar,
01075          class RangeScalar>
01076 void
01077 matVecCscRowMajor4Unrolled (
01078   const Ordinal numRows,
01079   const Ordinal numCols,
01080   const Ordinal numVecs,
01081   const RangeScalar& beta,
01082   RangeScalar Y[],
01083   const Ordinal rowStrideY,
01084   const RangeScalar alpha,
01085   const Ordinal ptr[],
01086   const Ordinal ind[],
01087   const MatrixScalar val[],
01088   const DomainScalar X[],
01089   const Ordinal rowStrideX)
01090 {
01091   typedef Teuchos::ScalarTraits<RangeScalar> STS;
01092 
01093   // Prescale: Y := beta * Y.
01094   if (beta == STS::zero()) {
01095     for (Ordinal i = 0; i < numRows; ++i) {
01096       RangeScalar* const Y_i = &Y[i*rowStrideY];
01097       for (Ordinal j = 0; j < numVecs; ++j) {
01098         // Follow the Sparse BLAS convention for beta == 0. 
01099         Y_i[j] = STS::zero();
01100       }
01101     }
01102   }
01103   else if (beta != STS::one()) {
01104     for (Ordinal i = 0; i < numRows; ++i) {
01105       RangeScalar* const Y_i = &Y[i*rowStrideY];
01106       for (Ordinal j = 0; j < numVecs; ++j) {
01107         Y_i[j] = beta * Y_i[j];
01108       }
01109     }
01110   }
01111   Ordinal j = 0;
01112   if (alpha == STS::zero()) {
01113     return; // Our work is done!
01114   }
01115   const Ordinal nnz = ptr[numCols];
01116   if (alpha == STS::one()) {
01117     for (Ordinal k = 0; k < nnz; ++k) {
01118       const MatrixScalar A_ij = val[k];
01119       const Ordinal i = ind[k];
01120       while (k >= ptr[j+1]) {
01121         ++j;
01122       }
01123       RangeScalar* const Y_i = &Y[i*rowStrideY];
01124       const DomainScalar* const X_j = &X[j*rowStrideX];
01125       Ordinal c = 0;
01126       // Extra +1 in loop bound ensures first 4 iterations get
01127       // strip-mined, but requires that Ordinal be a signed type.
01128       for ( ; c < numVecs - 3; c += 4) {
01129         Y_i[0] += A_ij * X_j[0];
01130         Y_i[1] += A_ij * X_j[1];
01131         Y_i[2] += A_ij * X_j[2];
01132         Y_i[3] += A_ij * X_j[3];
01133       }
01134       for ( ; c < numVecs; ++c) {
01135         Y_i[c] += A_ij * X_j[c];
01136       }
01137     }
01138   }
01139   else { // alpha != STS::one()
01140     for (Ordinal k = 0; k < nnz; ++k) {
01141       const MatrixScalar A_ij = val[k];
01142       const Ordinal i = ind[k];
01143       while (k >= ptr[j+1]) {
01144         ++j;
01145       }
01146       RangeScalar* const Y_i = &Y[i*rowStrideY];
01147       const DomainScalar* const X_j = &X[j*rowStrideX];
01148       Ordinal c = 0;
01149       // Extra +1 in loop bound ensures first 4 iterations get
01150       // strip-mined, but requires that Ordinal be a signed type.
01151       for ( ; c < numVecs - 3; c += 4) {
01152         Y_i[0] += alpha * A_ij * X_j[0];
01153         Y_i[1] += alpha * A_ij * X_j[1];
01154         Y_i[2] += alpha * A_ij * X_j[2];
01155         Y_i[3] += alpha * A_ij * X_j[3];
01156       }
01157       for ( ; c < numVecs; ++c) {
01158         Y_i[c] += alpha * A_ij * X_j[c];
01159       }
01160     }
01161   }
01162 }
01163 
01164 template<class Ordinal,
01165          class MatrixScalar,
01166          class DomainScalar,
01167          class RangeScalar>
01168 void
01169 matVecCscRowMajor1Vec (
01170   const Ordinal numRows,
01171   const Ordinal numCols,
01172   const RangeScalar& beta,
01173   RangeScalar Y[],
01174   const Ordinal rowStrideY,
01175   const RangeScalar alpha,
01176   const Ordinal ptr[],
01177   const Ordinal ind[],
01178   const MatrixScalar val[],
01179   const DomainScalar X[],
01180   const Ordinal rowStrideX)
01181 {
01182   typedef Teuchos::ScalarTraits<RangeScalar> STS;
01183 
01184   const Ordinal numVecs = 1;
01185   // Prescale: Y := beta * Y.
01186   if (beta == STS::zero()) {
01187     for (Ordinal i = 0; i < numRows; ++i) {
01188       RangeScalar* const Y_i = &Y[i*rowStrideY];
01189       for (Ordinal j = 0; j < numVecs; ++j) {
01190         // Follow the Sparse BLAS convention for beta == 0. 
01191         Y_i[j] = STS::zero();
01192       }
01193     }
01194   }
01195   else if (beta != STS::one()) {
01196     for (Ordinal i = 0; i < numRows; ++i) {
01197       RangeScalar* const Y_i = &Y[i*rowStrideY];
01198       for (Ordinal j = 0; j < numVecs; ++j) {
01199         Y_i[j] = beta * Y_i[j];
01200       }
01201     }
01202   }
01203   Ordinal j = 0;
01204   if (alpha == STS::zero()) {
01205     return; // Our work is done!
01206   }
01207   const Ordinal nnz = ptr[numCols];
01208   if (alpha == STS::one()) {
01209     for (Ordinal k = 0; k < nnz; ++k) {
01210       const MatrixScalar A_ij = val[k];
01211       const Ordinal i = ind[k];
01212       while (k >= ptr[j+1]) {
01213         ++j;
01214       }
01215       Y[i*rowStrideY] += A_ij * X[j*rowStrideX];
01216     }
01217   }
01218   else { // alpha != STS::one()
01219     for (Ordinal k = 0; k < nnz; ++k) {
01220       const MatrixScalar A_ij = val[k];
01221       const Ordinal i = ind[k];
01222       while (k >= ptr[j+1]) {
01223         ++j;
01224       }
01225       Y[i*rowStrideY] += alpha * A_ij * X[j*rowStrideX];
01226     }
01227   }
01228 }
01229 
01230 template<class Ordinal,
01231          class MatrixScalar,
01232          class DomainScalar,
01233          class RangeScalar>
01234 void
01235 matVecCscRowMajor2Vec (
01236   const Ordinal numRows,
01237   const Ordinal numCols,
01238   const RangeScalar& beta,
01239   RangeScalar Y[],
01240   const Ordinal rowStrideY,
01241   const RangeScalar alpha,
01242   const Ordinal ptr[],
01243   const Ordinal ind[],
01244   const MatrixScalar val[],
01245   const DomainScalar X[],
01246   const Ordinal rowStrideX)
01247 {
01248   typedef Teuchos::ScalarTraits<RangeScalar> STS;
01249 
01250   const Ordinal numVecs = 2;
01251   // Prescale: Y := beta * Y.
01252   if (beta == STS::zero()) {
01253     for (Ordinal i = 0; i < numRows; ++i) {
01254       RangeScalar* const Y_i = &Y[i*rowStrideY];
01255       for (Ordinal j = 0; j < numVecs; ++j) {
01256         // Follow the Sparse BLAS convention for beta == 0. 
01257         Y_i[j] = STS::zero();
01258       }
01259     }
01260   }
01261   else if (beta != STS::one()) {
01262     for (Ordinal i = 0; i < numRows; ++i) {
01263       RangeScalar* const Y_i = &Y[i*rowStrideY];
01264       for (Ordinal j = 0; j < numVecs; ++j) {
01265         Y_i[j] = beta * Y_i[j];
01266       }
01267     }
01268   }
01269   Ordinal j = 0;
01270   if (alpha == STS::zero()) {
01271     return; // Our work is done!
01272   }
01273   const Ordinal nnz = ptr[numCols];
01274   if (alpha == STS::one()) {
01275     for (Ordinal k = 0; k < nnz; ++k) {
01276       const MatrixScalar A_ij = val[k];
01277       const Ordinal i = ind[k];
01278       while (k >= ptr[j+1]) {
01279         ++j;
01280       }
01281       RangeScalar* const Y_i = &Y[i*rowStrideY];
01282       const DomainScalar* const X_j = &X[j*rowStrideX];
01283       Y_i[0] += A_ij * X_j[0];
01284       Y_i[1] += A_ij * X_j[1];
01285     }
01286   }
01287   else { // alpha != STS::one()
01288     for (Ordinal k = 0; k < nnz; ++k) {
01289       const MatrixScalar A_ij = val[k];
01290       const Ordinal i = ind[k];
01291       while (k >= ptr[j+1]) {
01292         ++j;
01293       }
01294       RangeScalar* const Y_i = &Y[i*rowStrideY];
01295       const DomainScalar* const X_j = &X[j*rowStrideX];
01296       Y_i[0] += alpha * A_ij * X_j[0];
01297       Y_i[1] += alpha * A_ij * X_j[1];
01298     }
01299   }
01300 }
01301 
01302 template<class Ordinal,
01303          class MatrixScalar,
01304          class DomainScalar,
01305          class RangeScalar>
01306 void
01307 matVecCscRowMajor3Vec (
01308   const Ordinal numRows,
01309   const Ordinal numCols,
01310   const RangeScalar& beta,
01311   RangeScalar Y[],
01312   const Ordinal rowStrideY,
01313   const RangeScalar alpha,
01314   const Ordinal ptr[],
01315   const Ordinal ind[],
01316   const MatrixScalar val[],
01317   const DomainScalar X[],
01318   const Ordinal rowStrideX)
01319 {
01320   typedef Teuchos::ScalarTraits<RangeScalar> STS;
01321 
01322   const Ordinal numVecs = 3;
01323   // Prescale: Y := beta * Y.
01324   if (beta == STS::zero()) {
01325     for (Ordinal i = 0; i < numRows; ++i) {
01326       RangeScalar* const Y_i = &Y[i*rowStrideY];
01327       for (Ordinal j = 0; j < numVecs; ++j) {
01328         // Follow the Sparse BLAS convention for beta == 0. 
01329         Y_i[j] = STS::zero();
01330       }
01331     }
01332   }
01333   else if (beta != STS::one()) {
01334     for (Ordinal i = 0; i < numRows; ++i) {
01335       RangeScalar* const Y_i = &Y[i*rowStrideY];
01336       for (Ordinal j = 0; j < numVecs; ++j) {
01337         Y_i[j] = beta * Y_i[j];
01338       }
01339     }
01340   }
01341   Ordinal j = 0;
01342   if (alpha == STS::zero()) {
01343     return; // Our work is done!
01344   }
01345   const Ordinal nnz = ptr[numCols];
01346   if (alpha == STS::one()) {
01347     for (Ordinal k = 0; k < nnz; ++k) {
01348       const MatrixScalar A_ij = val[k];
01349       const Ordinal i = ind[k];
01350       while (k >= ptr[j+1]) {
01351         ++j;
01352       }
01353       RangeScalar* const Y_i = &Y[i*rowStrideY];
01354       const DomainScalar* const X_j = &X[j*rowStrideX];
01355       Y_i[0] += A_ij * X_j[0];
01356       Y_i[1] += A_ij * X_j[1];
01357       Y_i[2] += A_ij * X_j[2];
01358     }
01359   }
01360   else { // alpha != STS::one()
01361     for (Ordinal k = 0; k < nnz; ++k) {
01362       const MatrixScalar A_ij = val[k];
01363       const Ordinal i = ind[k];
01364       while (k >= ptr[j+1]) {
01365         ++j;
01366       }
01367       RangeScalar* const Y_i = &Y[i*rowStrideY];
01368       const DomainScalar* const X_j = &X[j*rowStrideX];
01369       Y_i[0] += alpha * A_ij * X_j[0];
01370       Y_i[1] += alpha * A_ij * X_j[1];
01371       Y_i[2] += alpha * A_ij * X_j[2];
01372     }
01373   }
01374 }
01375 
01376 template<class Ordinal,
01377          class MatrixScalar,
01378          class DomainScalar,
01379          class RangeScalar>
01380 void
01381 matVecCscRowMajor4Vec (
01382   const Ordinal numRows,
01383   const Ordinal numCols,
01384   const RangeScalar& beta,
01385   RangeScalar Y[],
01386   const Ordinal rowStrideY,
01387   const RangeScalar alpha,
01388   const Ordinal ptr[],
01389   const Ordinal ind[],
01390   const MatrixScalar val[],
01391   const DomainScalar X[],
01392   const Ordinal rowStrideX)
01393 {
01394   typedef Teuchos::ScalarTraits<RangeScalar> STS;
01395 
01396   const Ordinal numVecs = 4;
01397   // Prescale: Y := beta * Y.
01398   if (beta == STS::zero()) {
01399     for (Ordinal i = 0; i < numRows; ++i) {
01400       RangeScalar* const Y_i = &Y[i*rowStrideY];
01401       for (Ordinal j = 0; j < numVecs; ++j) {
01402         // Follow the Sparse BLAS convention for beta == 0. 
01403         Y_i[j] = STS::zero();
01404       }
01405     }
01406   }
01407   else if (beta != STS::one()) {
01408     for (Ordinal i = 0; i < numRows; ++i) {
01409       RangeScalar* const Y_i = &Y[i*rowStrideY];
01410       for (Ordinal j = 0; j < numVecs; ++j) {
01411         Y_i[j] = beta * Y_i[j];
01412       }
01413     }
01414   }
01415   Ordinal j = 0;
01416   if (alpha == STS::zero()) {
01417     return; // Our work is done!
01418   }
01419   const Ordinal nnz = ptr[numCols];
01420   if (alpha == STS::one()) {
01421     for (Ordinal k = 0; k < nnz; ++k) {
01422       const MatrixScalar A_ij = val[k];
01423       const Ordinal i = ind[k];
01424       while (k >= ptr[j+1]) {
01425         ++j;
01426       }
01427       RangeScalar* const Y_i = &Y[i*rowStrideY];
01428       const DomainScalar* const X_j = &X[j*rowStrideX];
01429       Y_i[0] += A_ij * X_j[0];
01430       Y_i[1] += A_ij * X_j[1];
01431       Y_i[2] += A_ij * X_j[2];
01432       Y_i[3] += A_ij * X_j[3];
01433     }
01434   }
01435   else { // alpha != STS::one()
01436     for (Ordinal k = 0; k < nnz; ++k) {
01437       const MatrixScalar A_ij = val[k];
01438       const Ordinal i = ind[k];
01439       while (k >= ptr[j+1]) {
01440         ++j;
01441       }
01442       RangeScalar* const Y_i = &Y[i*rowStrideY];
01443       const DomainScalar* const X_j = &X[j*rowStrideX];
01444       Y_i[0] += alpha * A_ij * X_j[0];
01445       Y_i[1] += alpha * A_ij * X_j[1];
01446       Y_i[2] += alpha * A_ij * X_j[2];
01447       Y_i[3] += alpha * A_ij * X_j[3];
01448     }
01449   }
01450 }
01451 
01452 template<class Ordinal,
01453          class MatrixScalar,
01454          class DomainScalar,
01455          class RangeScalar>
01456 void
01457 matVecCsrRowMajor (
01458   const Ordinal numRows,
01459   const Ordinal numCols,
01460   const Ordinal numVecs,
01461   const RangeScalar& beta,
01462   RangeScalar Y[],
01463   const Ordinal rowStrideY,
01464   const RangeScalar alpha,
01465   const Ordinal ptr[],
01466   const Ordinal ind[],
01467   const MatrixScalar val[],
01468   const DomainScalar X[],
01469   const Ordinal rowStrideX)
01470 {
01471   typedef Teuchos::ScalarTraits<RangeScalar> STS;
01472 
01473   Ordinal i = 0;
01474   // Special case for CSR only: Y(0,:) = 0.
01475   if (beta != STS::zero()) {
01476     for (Ordinal c = 0; c < numVecs; ++c) {
01477       Y[c] = beta * Y[c];
01478     }
01479   }
01480   else {
01481     // Follow the Sparse BLAS convention for beta == 0. 
01482     for (Ordinal c = 0; c < numVecs; ++c) {
01483       Y[c] = STS::zero();
01484     }
01485   }
01486   if (alpha == STS::zero()) {
01487     return; // Our work is done!
01488   }
01489   const Ordinal nnz = ptr[numRows];
01490   if (alpha == STS::one()) {
01491     for (Ordinal k = 0; k < nnz; ++k) {
01492       const MatrixScalar A_ij = val[k];
01493       const Ordinal j = ind[k];
01494       while (k >= ptr[i+1]) {
01495         ++i;
01496         // We haven't seen row i before; scale Y(i,:) by beta.
01497         RangeScalar* const Y_i = &Y[i*rowStrideY];
01498         for (Ordinal c = 0; c < numVecs; ++c) {
01499           Y_i[c] = beta * Y_i[c];
01500         }
01501       }
01502       RangeScalar* const Y_i = &Y[i*rowStrideY];
01503       const DomainScalar* const X_j = &X[j*rowStrideX];
01504       for (Ordinal c = 0; c < numVecs; ++c) {
01505         Y_i[c] += A_ij * X_j[c];
01506       }
01507     }
01508   }
01509   else { // alpha != STS::one()
01510     for (Ordinal k = 0; k < nnz; ++k) {
01511       const MatrixScalar A_ij = val[k];
01512       const Ordinal j = ind[k];
01513       while (k >= ptr[i+1]) {
01514         ++i;
01515         // We haven't seen row i before; scale Y(i,:) by beta.
01516         RangeScalar* const Y_i = &Y[i*rowStrideY];
01517         for (Ordinal c = 0; c < numVecs; ++c) {
01518           Y_i[c] = beta * Y_i[c];
01519         }
01520       }
01521       RangeScalar* const Y_i = &Y[i*rowStrideY];
01522       const DomainScalar* const X_j = &X[j*rowStrideX];
01523       for (Ordinal c = 0; c < numVecs; ++c) {
01524         Y_i[c] += alpha * A_ij * X_j[c];
01525       }
01526     }
01527   }
01528 }
01529 
01530 template<class Ordinal,
01531          class MatrixScalar,
01532          class DomainScalar,
01533          class RangeScalar>
01534 void
01535 matVecCsrRowMajor4Unrolled (
01536   const Ordinal numRows,
01537   const Ordinal numCols,
01538   const Ordinal numVecs,
01539   const RangeScalar& beta,
01540   RangeScalar Y[],
01541   const Ordinal rowStrideY,
01542   const RangeScalar alpha,
01543   const Ordinal ptr[],
01544   const Ordinal ind[],
01545   const MatrixScalar val[],
01546   const DomainScalar X[],
01547   const Ordinal rowStrideX)
01548 {
01549   typedef Teuchos::ScalarTraits<RangeScalar> STS;
01550 
01551   Ordinal i = 0;
01552   // Special case for CSR only: Y(0,:) = 0.
01553   if (beta != STS::zero()) {
01554     for (Ordinal c = 0; c < numVecs; ++c) {
01555       Y[c] = beta * Y[c];
01556     }
01557   }
01558   else {
01559     // Follow the Sparse BLAS convention for beta == 0. 
01560     for (Ordinal c = 0; c < numVecs; ++c) {
01561       Y[c] = STS::zero();
01562     }
01563   }
01564   if (alpha == STS::zero()) {
01565     return; // Our work is done!
01566   }
01567   const Ordinal nnz = ptr[numRows];
01568   if (alpha == STS::one()) {
01569     for (Ordinal k = 0; k < nnz; ++k) {
01570       const MatrixScalar A_ij = val[k];
01571       const Ordinal j = ind[k];
01572       while (k >= ptr[i+1]) {
01573         ++i;
01574         // We haven't seen row i before; scale Y(i,:) by beta.
01575         RangeScalar* const Y_i = &Y[i*rowStrideY];
01576         Ordinal c = 0;
01577         // Extra +1 in loop bound ensures first 4 iterations get
01578         // strip-mined, but requires that Ordinal be a signed type.
01579         for ( ; c < numVecs - 3; c += 4) {
01580           Y_i[0] *= beta;
01581           Y_i[1] *= beta;
01582           Y_i[2] *= beta;
01583           Y_i[3] *= beta;
01584         }
01585         for ( ; c < numVecs; ++c) {
01586           Y_i[c] *= beta;
01587         }
01588       }
01589       RangeScalar* const Y_i = &Y[i*rowStrideY];
01590       const DomainScalar* const X_j = &X[j*rowStrideX];
01591       Ordinal c = 0;
01592       // Extra +1 in loop bound ensures first 4 iterations get
01593       // strip-mined, but requires that Ordinal be a signed type.
01594       for ( ; c < numVecs - 3; c += 4) {
01595         Y_i[0] += A_ij * X_j[0];
01596         Y_i[1] += A_ij * X_j[1];
01597         Y_i[2] += A_ij * X_j[2];
01598         Y_i[3] += A_ij * X_j[3];
01599       }
01600       for ( ; c < numVecs; ++c) {
01601         Y_i[c] += A_ij * X_j[c];
01602       }
01603     }
01604   }
01605   else { // alpha != STS::one()
01606     for (Ordinal k = 0; k < nnz; ++k) {
01607       const MatrixScalar A_ij = val[k];
01608       const Ordinal j = ind[k];
01609       while (k >= ptr[i+1]) {
01610         ++i;
01611         // We haven't seen row i before; scale Y(i,:) by beta.
01612         RangeScalar* const Y_i = &Y[i*rowStrideY];
01613         Ordinal c = 0;
01614         // Extra +1 in loop bound ensures first 4 iterations get
01615         // strip-mined, but requires that Ordinal be a signed type.
01616         for ( ; c < numVecs - 3; c += 4) {
01617           Y_i[0] *= beta;
01618           Y_i[1] *= beta;
01619           Y_i[2] *= beta;
01620           Y_i[3] *= beta;
01621         }
01622         for ( ; c < numVecs; ++c) {
01623           Y_i[c] *= beta;
01624         }
01625       }
01626       RangeScalar* const Y_i = &Y[i*rowStrideY];
01627       const DomainScalar* const X_j = &X[j*rowStrideX];
01628       Ordinal c = 0;
01629       // Extra +1 in loop bound ensures first 4 iterations get
01630       // strip-mined, but requires that Ordinal be a signed type.
01631       for ( ; c < numVecs - 3; c += 4) {
01632         Y_i[0] += alpha * A_ij * X_j[0];
01633         Y_i[1] += alpha * A_ij * X_j[1];
01634         Y_i[2] += alpha * A_ij * X_j[2];
01635         Y_i[3] += alpha * A_ij * X_j[3];
01636       }
01637       for ( ; c < numVecs; ++c) {
01638         Y_i[c] += alpha * A_ij * X_j[c];
01639       }
01640     }
01641   }
01642 }
01643 
01644 template<class Ordinal,
01645          class MatrixScalar,
01646          class DomainScalar,
01647          class RangeScalar>
01648 void
01649 matVecCsrRowMajor1Vec (
01650   const Ordinal numRows,
01651   const Ordinal numCols,
01652   const RangeScalar& beta,
01653   RangeScalar Y[],
01654   const Ordinal rowStrideY,
01655   const RangeScalar alpha,
01656   const Ordinal ptr[],
01657   const Ordinal ind[],
01658   const MatrixScalar val[],
01659   const DomainScalar X[],
01660   const Ordinal rowStrideX)
01661 {
01662   typedef Teuchos::ScalarTraits<RangeScalar> STS;
01663 
01664   const Ordinal numVecs = 1;
01665   Ordinal i = 0;
01666   // Special case for CSR only: Y(0,:) = 0.
01667   if (beta != STS::zero()) {
01668     for (Ordinal c = 0; c < numVecs; ++c) {
01669       Y[c] = beta * Y[c];
01670     }
01671   }
01672   else {
01673     // Follow the Sparse BLAS convention for beta == 0. 
01674     for (Ordinal c = 0; c < numVecs; ++c) {
01675       Y[c] = STS::zero();
01676     }
01677   }
01678   if (alpha == STS::zero()) {
01679     return; // Our work is done!
01680   }
01681   const Ordinal nnz = ptr[numRows];
01682   if (alpha == STS::one()) {
01683     for (Ordinal k = 0; k < nnz; ++k) {
01684       const MatrixScalar A_ij = val[k];
01685       const Ordinal j = ind[k];
01686       while (k >= ptr[i+1]) {
01687         ++i;
01688         // We haven't seen row i before; scale Y(i,:) by beta.
01689         Y[i*rowStrideY] *= beta;
01690       }
01691       Y[i*rowStrideY] += A_ij * X[j*rowStrideX];
01692     }
01693   }
01694   else { // alpha != STS::one()
01695     for (Ordinal k = 0; k < nnz; ++k) {
01696       const MatrixScalar A_ij = val[k];
01697       const Ordinal j = ind[k];
01698       while (k >= ptr[i+1]) {
01699         ++i;
01700         // We haven't seen row i before; scale Y(i,:) by beta.
01701         Y[i*rowStrideY] *= beta;
01702       }
01703       Y[i*rowStrideY] += alpha * A_ij * X[j*rowStrideX];
01704     }
01705   }
01706 }
01707 
01708 template<class Ordinal,
01709          class MatrixScalar,
01710          class DomainScalar,
01711          class RangeScalar>
01712 void
01713 matVecCsrRowMajor2Vec (
01714   const Ordinal numRows,
01715   const Ordinal numCols,
01716   const RangeScalar& beta,
01717   RangeScalar Y[],
01718   const Ordinal rowStrideY,
01719   const RangeScalar alpha,
01720   const Ordinal ptr[],
01721   const Ordinal ind[],
01722   const MatrixScalar val[],
01723   const DomainScalar X[],
01724   const Ordinal rowStrideX)
01725 {
01726   typedef Teuchos::ScalarTraits<RangeScalar> STS;
01727 
01728   const Ordinal numVecs = 2;
01729   Ordinal i = 0;
01730   // Special case for CSR only: Y(0,:) = 0.
01731   if (beta != STS::zero()) {
01732     for (Ordinal c = 0; c < numVecs; ++c) {
01733       Y[c] = beta * Y[c];
01734     }
01735   }
01736   else {
01737     // Follow the Sparse BLAS convention for beta == 0. 
01738     for (Ordinal c = 0; c < numVecs; ++c) {
01739       Y[c] = STS::zero();
01740     }
01741   }
01742   if (alpha == STS::zero()) {
01743     return; // Our work is done!
01744   }
01745   const Ordinal nnz = ptr[numRows];
01746   if (alpha == STS::one()) {
01747     for (Ordinal k = 0; k < nnz; ++k) {
01748       const MatrixScalar A_ij = val[k];
01749       const Ordinal j = ind[k];
01750       while (k >= ptr[i+1]) {
01751         ++i;
01752         // We haven't seen row i before; scale Y(i,:) by beta.
01753         RangeScalar* const Y_i = &Y[i*rowStrideY];
01754         Y_i[0] *= beta;
01755         Y_i[1] *= beta;
01756       }
01757       RangeScalar* const Y_i = &Y[i*rowStrideY];
01758       const DomainScalar* const X_j = &X[j*rowStrideX];
01759       Y_i[0] += A_ij * X_j[0];
01760       Y_i[1] += A_ij * X_j[1];
01761     }
01762   }
01763   else { // alpha != STS::one()
01764     for (Ordinal k = 0; k < nnz; ++k) {
01765       const MatrixScalar A_ij = val[k];
01766       const Ordinal j = ind[k];
01767       while (k >= ptr[i+1]) {
01768         ++i;
01769         // We haven't seen row i before; scale Y(i,:) by beta.
01770         RangeScalar* const Y_i = &Y[i*rowStrideY];
01771         Y_i[0] *= beta;
01772         Y_i[1] *= beta;
01773       }
01774       RangeScalar* const Y_i = &Y[i*rowStrideY];
01775       const DomainScalar* const X_j = &X[j*rowStrideX];
01776       Y_i[0] += alpha * A_ij * X_j[0];
01777       Y_i[1] += alpha * A_ij * X_j[1];
01778     }
01779   }
01780 }
01781 
01782 template<class Ordinal,
01783          class MatrixScalar,
01784          class DomainScalar,
01785          class RangeScalar>
01786 void
01787 matVecCsrRowMajor3Vec (
01788   const Ordinal numRows,
01789   const Ordinal numCols,
01790   const RangeScalar& beta,
01791   RangeScalar Y[],
01792   const Ordinal rowStrideY,
01793   const RangeScalar alpha,
01794   const Ordinal ptr[],
01795   const Ordinal ind[],
01796   const MatrixScalar val[],
01797   const DomainScalar X[],
01798   const Ordinal rowStrideX)
01799 {
01800   typedef Teuchos::ScalarTraits<RangeScalar> STS;
01801 
01802   const Ordinal numVecs = 3;
01803   Ordinal i = 0;
01804   // Special case for CSR only: Y(0,:) = 0.
01805   if (beta != STS::zero()) {
01806     for (Ordinal c = 0; c < numVecs; ++c) {
01807       Y[c] = beta * Y[c];
01808     }
01809   }
01810   else {
01811     // Follow the Sparse BLAS convention for beta == 0. 
01812     for (Ordinal c = 0; c < numVecs; ++c) {
01813       Y[c] = STS::zero();
01814     }
01815   }
01816   if (alpha == STS::zero()) {
01817     return; // Our work is done!
01818   }
01819   const Ordinal nnz = ptr[numRows];
01820   if (alpha == STS::one()) {
01821     for (Ordinal k = 0; k < nnz; ++k) {
01822       const MatrixScalar A_ij = val[k];
01823       const Ordinal j = ind[k];
01824       while (k >= ptr[i+1]) {
01825         ++i;
01826         // We haven't seen row i before; scale Y(i,:) by beta.
01827         RangeScalar* const Y_i = &Y[i*rowStrideY];
01828         Y_i[0] *= beta;
01829         Y_i[1] *= beta;
01830         Y_i[2] *= beta;
01831       }
01832       RangeScalar* const Y_i = &Y[i*rowStrideY];
01833       const DomainScalar* const X_j = &X[j*rowStrideX];
01834       Y_i[0] += A_ij * X_j[0];
01835       Y_i[1] += A_ij * X_j[1];
01836       Y_i[2] += A_ij * X_j[2];
01837     }
01838   }
01839   else { // alpha != STS::one()
01840     for (Ordinal k = 0; k < nnz; ++k) {
01841       const MatrixScalar A_ij = val[k];
01842       const Ordinal j = ind[k];
01843       while (k >= ptr[i+1]) {
01844         ++i;
01845         // We haven't seen row i before; scale Y(i,:) by beta.
01846         RangeScalar* const Y_i = &Y[i*rowStrideY];
01847         Y_i[0] *= beta;
01848         Y_i[1] *= beta;
01849         Y_i[2] *= beta;
01850       }
01851       RangeScalar* const Y_i = &Y[i*rowStrideY];
01852       const DomainScalar* const X_j = &X[j*rowStrideX];
01853       Y_i[0] += alpha * A_ij * X_j[0];
01854       Y_i[1] += alpha * A_ij * X_j[1];
01855       Y_i[2] += alpha * A_ij * X_j[2];
01856     }
01857   }
01858 }
01859 
01860 template<class Ordinal,
01861          class MatrixScalar,
01862          class DomainScalar,
01863          class RangeScalar>
01864 void
01865 matVecCsrRowMajor4Vec (
01866   const Ordinal numRows,
01867   const Ordinal numCols,
01868   const RangeScalar& beta,
01869   RangeScalar Y[],
01870   const Ordinal rowStrideY,
01871   const RangeScalar alpha,
01872   const Ordinal ptr[],
01873   const Ordinal ind[],
01874   const MatrixScalar val[],
01875   const DomainScalar X[],
01876   const Ordinal rowStrideX)
01877 {
01878   typedef Teuchos::ScalarTraits<RangeScalar> STS;
01879 
01880   const Ordinal numVecs = 4;
01881   Ordinal i = 0;
01882   // Special case for CSR only: Y(0,:) = 0.
01883   if (beta != STS::zero()) {
01884     for (Ordinal c = 0; c < numVecs; ++c) {
01885       Y[c] = beta * Y[c];
01886     }
01887   }
01888   else {
01889     // Follow the Sparse BLAS convention for beta == 0. 
01890     for (Ordinal c = 0; c < numVecs; ++c) {
01891       Y[c] = STS::zero();
01892     }
01893   }
01894   if (alpha == STS::zero()) {
01895     return; // Our work is done!
01896   }
01897   const Ordinal nnz = ptr[numRows];
01898   if (alpha == STS::one()) {
01899     for (Ordinal k = 0; k < nnz; ++k) {
01900       const MatrixScalar A_ij = val[k];
01901       const Ordinal j = ind[k];
01902       while (k >= ptr[i+1]) {
01903         ++i;
01904         // We haven't seen row i before; scale Y(i,:) by beta.
01905         RangeScalar* const Y_i = &Y[i*rowStrideY];
01906         Y_i[0] *= beta;
01907         Y_i[1] *= beta;
01908         Y_i[2] *= beta;
01909         Y_i[3] *= beta;
01910       }
01911       RangeScalar* const Y_i = &Y[i*rowStrideY];
01912       const DomainScalar* const X_j = &X[j*rowStrideX];
01913       Y_i[0] += A_ij * X_j[0];
01914       Y_i[1] += A_ij * X_j[1];
01915       Y_i[2] += A_ij * X_j[2];
01916       Y_i[3] += A_ij * X_j[3];
01917     }
01918   }
01919   else { // alpha != STS::one()
01920     for (Ordinal k = 0; k < nnz; ++k) {
01921       const MatrixScalar A_ij = val[k];
01922       const Ordinal j = ind[k];
01923       while (k >= ptr[i+1]) {
01924         ++i;
01925         // We haven't seen row i before; scale Y(i,:) by beta.
01926         RangeScalar* const Y_i = &Y[i*rowStrideY];
01927         Y_i[0] *= beta;
01928         Y_i[1] *= beta;
01929         Y_i[2] *= beta;
01930         Y_i[3] *= beta;
01931       }
01932       RangeScalar* const Y_i = &Y[i*rowStrideY];
01933       const DomainScalar* const X_j = &X[j*rowStrideX];
01934       Y_i[0] += alpha * A_ij * X_j[0];
01935       Y_i[1] += alpha * A_ij * X_j[1];
01936       Y_i[2] += alpha * A_ij * X_j[2];
01937       Y_i[3] += alpha * A_ij * X_j[3];
01938     }
01939   }
01940 }
01941 
01942 template<class Ordinal,
01943          class MatrixScalar,
01944          class DomainScalar,
01945          class RangeScalar>
01946 void
01947 matVecCscColMajorConj (
01948   const Ordinal numRows,
01949   const Ordinal numCols,
01950   const Ordinal numVecs,
01951   const RangeScalar& beta,
01952   RangeScalar Y[],
01953   const Ordinal colStrideY,
01954   const RangeScalar alpha,
01955   const Ordinal ptr[],
01956   const Ordinal ind[],
01957   const MatrixScalar val[],
01958   const DomainScalar X[],
01959   const Ordinal colStrideX)
01960 {
01961   typedef Teuchos::ScalarTraits<RangeScalar> STS;
01962 
01963   // Prescale: Y := beta * Y.
01964   if (beta == STS::zero()) {
01965     for (Ordinal j = 0; j < numVecs; ++j) {
01966       RangeScalar* const Y_j = &Y[j*colStrideY];
01967       for (Ordinal i = 0; i < numRows; ++i) {
01968         // Follow the Sparse BLAS convention for beta == 0. 
01969         Y_j[i] = STS::zero();
01970       }
01971     }
01972   }
01973   else if (beta != STS::one()) {
01974     for (Ordinal j = 0; j < numVecs; ++j) {
01975       RangeScalar* const Y_j = &Y[j*colStrideY];
01976       for (Ordinal i = 0; i < numRows; ++i) {
01977         Y_j[i] = beta * Y_j[i];
01978       }
01979     }
01980   }
01981   Ordinal j = 0;
01982   if (alpha == STS::zero()) {
01983     return; // Our work is done!
01984   }
01985   const Ordinal nnz = ptr[numCols];
01986   if (alpha == STS::one()) {
01987     for (Ordinal k = 0; k < nnz; ++k) {
01988       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
01989       const Ordinal i = ind[k];
01990       while (k >= ptr[j+1]) {
01991         ++j;
01992       }
01993       RangeScalar* const Y_i = &Y[i];
01994       const DomainScalar* const X_j = &X[j];
01995       for (Ordinal c = 0; c < numVecs; ++c) {
01996         Y_i[c*colStrideY] += A_ij * X_j[c*colStrideX];
01997       }
01998     }
01999   }
02000   else { // alpha != STS::one()
02001     for (Ordinal k = 0; k < nnz; ++k) {
02002       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
02003       const Ordinal i = ind[k];
02004       while (k >= ptr[j+1]) {
02005         ++j;
02006       }
02007       RangeScalar* const Y_i = &Y[i];
02008       const DomainScalar* const X_j = &X[j];
02009       for (Ordinal c = 0; c < numVecs; ++c) {
02010         Y_i[c*colStrideY] += alpha * A_ij * X_j[c*colStrideX];
02011       }
02012     }
02013   }
02014 }
02015 
02016 template<class Ordinal,
02017          class MatrixScalar,
02018          class DomainScalar,
02019          class RangeScalar>
02020 void
02021 matVecCscColMajorConj4Unrolled (
02022   const Ordinal numRows,
02023   const Ordinal numCols,
02024   const Ordinal numVecs,
02025   const RangeScalar& beta,
02026   RangeScalar Y[],
02027   const Ordinal colStrideY,
02028   const RangeScalar alpha,
02029   const Ordinal ptr[],
02030   const Ordinal ind[],
02031   const MatrixScalar val[],
02032   const DomainScalar X[],
02033   const Ordinal colStrideX)
02034 {
02035   typedef Teuchos::ScalarTraits<RangeScalar> STS;
02036 
02037   // Prescale: Y := beta * Y.
02038   if (beta == STS::zero()) {
02039     for (Ordinal j = 0; j < numVecs; ++j) {
02040       RangeScalar* const Y_j = &Y[j*colStrideY];
02041       for (Ordinal i = 0; i < numRows; ++i) {
02042         // Follow the Sparse BLAS convention for beta == 0. 
02043         Y_j[i] = STS::zero();
02044       }
02045     }
02046   }
02047   else if (beta != STS::one()) {
02048     for (Ordinal j = 0; j < numVecs; ++j) {
02049       RangeScalar* const Y_j = &Y[j*colStrideY];
02050       for (Ordinal i = 0; i < numRows; ++i) {
02051         Y_j[i] = beta * Y_j[i];
02052       }
02053     }
02054   }
02055   Ordinal j = 0;
02056   if (alpha == STS::zero()) {
02057     return; // Our work is done!
02058   }
02059   const Ordinal nnz = ptr[numCols];
02060   if (alpha == STS::one()) {
02061     for (Ordinal k = 0; k < nnz; ++k) {
02062       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
02063       const Ordinal i = ind[k];
02064       while (k >= ptr[j+1]) {
02065         ++j;
02066       }
02067       RangeScalar* const Y_i = &Y[i];
02068       const DomainScalar* const X_j = &X[j];
02069       Ordinal c = 0;
02070       // Extra +1 in loop bound ensures first 4 iterations get
02071       // strip-mined, but requires that Ordinal be a signed type.
02072       for ( ; c < numVecs - 3; c += 4) {
02073         Y_i[0] += A_ij * X_j[0];
02074         Y_i[colStrideY] += A_ij * X_j[colStrideX];
02075         Y_i[2*colStrideY] += A_ij * X_j[2*colStrideX];
02076         Y_i[3*colStrideY] += A_ij * X_j[3*colStrideX];
02077       }
02078       for ( ; c < numVecs; ++c) {
02079         Y_i[c*colStrideY] += A_ij * X_j[c*colStrideX];
02080       }
02081     }
02082   }
02083   else { // alpha != STS::one()
02084     for (Ordinal k = 0; k < nnz; ++k) {
02085       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
02086       const Ordinal i = ind[k];
02087       while (k >= ptr[j+1]) {
02088         ++j;
02089       }
02090       RangeScalar* const Y_i = &Y[i];
02091       const DomainScalar* const X_j = &X[j];
02092       Ordinal c = 0;
02093       // Extra +1 in loop bound ensures first 4 iterations get
02094       // strip-mined, but requires that Ordinal be a signed type.
02095       for ( ; c < numVecs - 3; c += 4) {
02096         Y_i[0] += alpha * A_ij * X_j[0];
02097         Y_i[colStrideY] += alpha * A_ij * X_j[colStrideX];
02098         Y_i[2*colStrideY] += alpha * A_ij * X_j[2*colStrideX];
02099         Y_i[3*colStrideY] += alpha * A_ij * X_j[3*colStrideX];
02100       }
02101       for ( ; c < numVecs; ++c) {
02102         Y_i[c*colStrideY] += alpha * A_ij * X_j[c*colStrideX];
02103       }
02104     }
02105   }
02106 }
02107 
02108 template<class Ordinal,
02109          class MatrixScalar,
02110          class DomainScalar,
02111          class RangeScalar>
02112 void
02113 matVecCscColMajorConj1Vec (
02114   const Ordinal numRows,
02115   const Ordinal numCols,
02116   const RangeScalar& beta,
02117   RangeScalar Y[],
02118   const Ordinal colStrideY,
02119   const RangeScalar alpha,
02120   const Ordinal ptr[],
02121   const Ordinal ind[],
02122   const MatrixScalar val[],
02123   const DomainScalar X[],
02124   const Ordinal colStrideX)
02125 {
02126   typedef Teuchos::ScalarTraits<RangeScalar> STS;
02127 
02128   const Ordinal numVecs = 1;
02129   // Prescale: Y := beta * Y.
02130   if (beta == STS::zero()) {
02131     for (Ordinal j = 0; j < numVecs; ++j) {
02132       RangeScalar* const Y_j = &Y[j*colStrideY];
02133       for (Ordinal i = 0; i < numRows; ++i) {
02134         // Follow the Sparse BLAS convention for beta == 0. 
02135         Y_j[i] = STS::zero();
02136       }
02137     }
02138   }
02139   else if (beta != STS::one()) {
02140     for (Ordinal j = 0; j < numVecs; ++j) {
02141       RangeScalar* const Y_j = &Y[j*colStrideY];
02142       for (Ordinal i = 0; i < numRows; ++i) {
02143         Y_j[i] = beta * Y_j[i];
02144       }
02145     }
02146   }
02147   Ordinal j = 0;
02148   if (alpha == STS::zero()) {
02149     return; // Our work is done!
02150   }
02151   const Ordinal nnz = ptr[numCols];
02152   if (alpha == STS::one()) {
02153     for (Ordinal k = 0; k < nnz; ++k) {
02154       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
02155       const Ordinal i = ind[k];
02156       while (k >= ptr[j+1]) {
02157         ++j;
02158       }
02159       Y[i] += A_ij * X[j];
02160     }
02161   }
02162   else { // alpha != STS::one()
02163     for (Ordinal k = 0; k < nnz; ++k) {
02164       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
02165       const Ordinal i = ind[k];
02166       while (k >= ptr[j+1]) {
02167         ++j;
02168       }
02169       Y[i] += alpha * A_ij * X[j];
02170     }
02171   }
02172 }
02173 
02174 template<class Ordinal,
02175          class MatrixScalar,
02176          class DomainScalar,
02177          class RangeScalar>
02178 void
02179 matVecCscColMajorConj2Vec (
02180   const Ordinal numRows,
02181   const Ordinal numCols,
02182   const RangeScalar& beta,
02183   RangeScalar Y[],
02184   const Ordinal colStrideY,
02185   const RangeScalar alpha,
02186   const Ordinal ptr[],
02187   const Ordinal ind[],
02188   const MatrixScalar val[],
02189   const DomainScalar X[],
02190   const Ordinal colStrideX)
02191 {
02192   typedef Teuchos::ScalarTraits<RangeScalar> STS;
02193 
02194   const Ordinal numVecs = 2;
02195   // Prescale: Y := beta * Y.
02196   if (beta == STS::zero()) {
02197     for (Ordinal j = 0; j < numVecs; ++j) {
02198       RangeScalar* const Y_j = &Y[j*colStrideY];
02199       for (Ordinal i = 0; i < numRows; ++i) {
02200         // Follow the Sparse BLAS convention for beta == 0. 
02201         Y_j[i] = STS::zero();
02202       }
02203     }
02204   }
02205   else if (beta != STS::one()) {
02206     for (Ordinal j = 0; j < numVecs; ++j) {
02207       RangeScalar* const Y_j = &Y[j*colStrideY];
02208       for (Ordinal i = 0; i < numRows; ++i) {
02209         Y_j[i] = beta * Y_j[i];
02210       }
02211     }
02212   }
02213   Ordinal j = 0;
02214   if (alpha == STS::zero()) {
02215     return; // Our work is done!
02216   }
02217   const Ordinal nnz = ptr[numCols];
02218   if (alpha == STS::one()) {
02219     for (Ordinal k = 0; k < nnz; ++k) {
02220       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
02221       const Ordinal i = ind[k];
02222       while (k >= ptr[j+1]) {
02223         ++j;
02224       }
02225       RangeScalar* const Y_i = &Y[i];
02226       const DomainScalar* const X_j = &X[j];
02227       Y_i[0] += A_ij * X_j[0];
02228       Y_i[colStrideY] += A_ij * X_j[colStrideX];
02229     }
02230   }
02231   else { // alpha != STS::one()
02232     for (Ordinal k = 0; k < nnz; ++k) {
02233       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
02234       const Ordinal i = ind[k];
02235       while (k >= ptr[j+1]) {
02236         ++j;
02237       }
02238       RangeScalar* const Y_i = &Y[i];
02239       const DomainScalar* const X_j = &X[j];
02240       Y_i[0] += alpha * A_ij * X_j[0];
02241       Y_i[colStrideY] += alpha * A_ij * X_j[colStrideX];
02242     }
02243   }
02244 }
02245 
02246 template<class Ordinal,
02247          class MatrixScalar,
02248          class DomainScalar,
02249          class RangeScalar>
02250 void
02251 matVecCscColMajorConj3Vec (
02252   const Ordinal numRows,
02253   const Ordinal numCols,
02254   const RangeScalar& beta,
02255   RangeScalar Y[],
02256   const Ordinal colStrideY,
02257   const RangeScalar alpha,
02258   const Ordinal ptr[],
02259   const Ordinal ind[],
02260   const MatrixScalar val[],
02261   const DomainScalar X[],
02262   const Ordinal colStrideX)
02263 {
02264   typedef Teuchos::ScalarTraits<RangeScalar> STS;
02265 
02266   const Ordinal numVecs = 3;
02267   // Prescale: Y := beta * Y.
02268   if (beta == STS::zero()) {
02269     for (Ordinal j = 0; j < numVecs; ++j) {
02270       RangeScalar* const Y_j = &Y[j*colStrideY];
02271       for (Ordinal i = 0; i < numRows; ++i) {
02272         // Follow the Sparse BLAS convention for beta == 0. 
02273         Y_j[i] = STS::zero();
02274       }
02275     }
02276   }
02277   else if (beta != STS::one()) {
02278     for (Ordinal j = 0; j < numVecs; ++j) {
02279       RangeScalar* const Y_j = &Y[j*colStrideY];
02280       for (Ordinal i = 0; i < numRows; ++i) {
02281         Y_j[i] = beta * Y_j[i];
02282       }
02283     }
02284   }
02285   Ordinal j = 0;
02286   if (alpha == STS::zero()) {
02287     return; // Our work is done!
02288   }
02289   const Ordinal nnz = ptr[numCols];
02290   if (alpha == STS::one()) {
02291     for (Ordinal k = 0; k < nnz; ++k) {
02292       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
02293       const Ordinal i = ind[k];
02294       while (k >= ptr[j+1]) {
02295         ++j;
02296       }
02297       RangeScalar* const Y_i = &Y[i];
02298       const DomainScalar* const X_j = &X[j];
02299       Y_i[0] += A_ij * X_j[0];
02300       Y_i[colStrideY] += A_ij * X_j[colStrideX];
02301       Y_i[2*colStrideY] += A_ij * X_j[2*colStrideX];
02302     }
02303   }
02304   else { // alpha != STS::one()
02305     for (Ordinal k = 0; k < nnz; ++k) {
02306       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
02307       const Ordinal i = ind[k];
02308       while (k >= ptr[j+1]) {
02309         ++j;
02310       }
02311       RangeScalar* const Y_i = &Y[i];
02312       const DomainScalar* const X_j = &X[j];
02313       Y_i[0] += alpha * A_ij * X_j[0];
02314       Y_i[colStrideY] += alpha * A_ij * X_j[colStrideX];
02315       Y_i[2*colStrideY] += alpha * A_ij * X_j[2*colStrideX];
02316     }
02317   }
02318 }
02319 
02320 template<class Ordinal,
02321          class MatrixScalar,
02322          class DomainScalar,
02323          class RangeScalar>
02324 void
02325 matVecCscColMajorConj4Vec (
02326   const Ordinal numRows,
02327   const Ordinal numCols,
02328   const RangeScalar& beta,
02329   RangeScalar Y[],
02330   const Ordinal colStrideY,
02331   const RangeScalar alpha,
02332   const Ordinal ptr[],
02333   const Ordinal ind[],
02334   const MatrixScalar val[],
02335   const DomainScalar X[],
02336   const Ordinal colStrideX)
02337 {
02338   typedef Teuchos::ScalarTraits<RangeScalar> STS;
02339 
02340   const Ordinal numVecs = 4;
02341   // Prescale: Y := beta * Y.
02342   if (beta == STS::zero()) {
02343     for (Ordinal j = 0; j < numVecs; ++j) {
02344       RangeScalar* const Y_j = &Y[j*colStrideY];
02345       for (Ordinal i = 0; i < numRows; ++i) {
02346         // Follow the Sparse BLAS convention for beta == 0. 
02347         Y_j[i] = STS::zero();
02348       }
02349     }
02350   }
02351   else if (beta != STS::one()) {
02352     for (Ordinal j = 0; j < numVecs; ++j) {
02353       RangeScalar* const Y_j = &Y[j*colStrideY];
02354       for (Ordinal i = 0; i < numRows; ++i) {
02355         Y_j[i] = beta * Y_j[i];
02356       }
02357     }
02358   }
02359   Ordinal j = 0;
02360   if (alpha == STS::zero()) {
02361     return; // Our work is done!
02362   }
02363   const Ordinal nnz = ptr[numCols];
02364   if (alpha == STS::one()) {
02365     for (Ordinal k = 0; k < nnz; ++k) {
02366       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
02367       const Ordinal i = ind[k];
02368       while (k >= ptr[j+1]) {
02369         ++j;
02370       }
02371       RangeScalar* const Y_i = &Y[i];
02372       const DomainScalar* const X_j = &X[j];
02373       Y_i[0] += A_ij * X_j[0];
02374       Y_i[colStrideY] += A_ij * X_j[colStrideX];
02375       Y_i[2*colStrideY] += A_ij * X_j[2*colStrideX];
02376       Y_i[3*colStrideY] += A_ij * X_j[3*colStrideX];
02377     }
02378   }
02379   else { // alpha != STS::one()
02380     for (Ordinal k = 0; k < nnz; ++k) {
02381       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
02382       const Ordinal i = ind[k];
02383       while (k >= ptr[j+1]) {
02384         ++j;
02385       }
02386       RangeScalar* const Y_i = &Y[i];
02387       const DomainScalar* const X_j = &X[j];
02388       Y_i[0] += alpha * A_ij * X_j[0];
02389       Y_i[colStrideY] += alpha * A_ij * X_j[colStrideX];
02390       Y_i[2*colStrideY] += alpha * A_ij * X_j[2*colStrideX];
02391       Y_i[3*colStrideY] += alpha * A_ij * X_j[3*colStrideX];
02392     }
02393   }
02394 }
02395 
02396 template<class Ordinal,
02397          class MatrixScalar,
02398          class DomainScalar,
02399          class RangeScalar>
02400 void
02401 matVecCsrColMajorConj (
02402   const Ordinal numRows,
02403   const Ordinal numCols,
02404   const Ordinal numVecs,
02405   const RangeScalar& beta,
02406   RangeScalar Y[],
02407   const Ordinal colStrideY,
02408   const RangeScalar alpha,
02409   const Ordinal ptr[],
02410   const Ordinal ind[],
02411   const MatrixScalar val[],
02412   const DomainScalar X[],
02413   const Ordinal colStrideX)
02414 {
02415   typedef Teuchos::ScalarTraits<RangeScalar> STS;
02416 
02417   Ordinal i = 0;
02418   // Special case for CSR only: Y(0,:) = 0.
02419   if (beta != STS::zero()) {
02420     for (Ordinal c = 0; c < numVecs; ++c) {
02421       Y[c*colStrideY] = beta * Y[c*colStrideY];
02422     }
02423   }
02424   else {
02425     // Follow the Sparse BLAS convention for beta == 0. 
02426     for (Ordinal c = 0; c < numVecs; ++c) {
02427       Y[c*colStrideY] = STS::zero();
02428     }
02429   }
02430   if (alpha == STS::zero()) {
02431     return; // Our work is done!
02432   }
02433   const Ordinal nnz = ptr[numRows];
02434   if (alpha == STS::one()) {
02435     for (Ordinal k = 0; k < nnz; ++k) {
02436       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
02437       const Ordinal j = ind[k];
02438       while (k >= ptr[i+1]) {
02439         ++i;
02440         // We haven't seen row i before; scale Y(i,:) by beta.
02441         RangeScalar* const Y_i = &Y[i];
02442         for (Ordinal c = 0; c < numVecs; ++c) {
02443           Y_i[c*colStrideY] = beta * Y_i[c*colStrideY];
02444         }
02445       }
02446       RangeScalar* const Y_i = &Y[i];
02447       const DomainScalar* const X_j = &X[j];
02448       for (Ordinal c = 0; c < numVecs; ++c) {
02449         Y_i[c*colStrideY] += A_ij * X_j[c*colStrideX];
02450       }
02451     }
02452   }
02453   else { // alpha != STS::one()
02454     for (Ordinal k = 0; k < nnz; ++k) {
02455       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
02456       const Ordinal j = ind[k];
02457       while (k >= ptr[i+1]) {
02458         ++i;
02459         // We haven't seen row i before; scale Y(i,:) by beta.
02460         RangeScalar* const Y_i = &Y[i];
02461         for (Ordinal c = 0; c < numVecs; ++c) {
02462           Y_i[c*colStrideY] = beta * Y_i[c*colStrideY];
02463         }
02464       }
02465       RangeScalar* const Y_i = &Y[i];
02466       const DomainScalar* const X_j = &X[j];
02467       for (Ordinal c = 0; c < numVecs; ++c) {
02468         Y_i[c*colStrideY] += alpha * A_ij * X_j[c*colStrideX];
02469       }
02470     }
02471   }
02472 }
02473 
02474 template<class Ordinal,
02475          class MatrixScalar,
02476          class DomainScalar,
02477          class RangeScalar>
02478 void
02479 matVecCsrColMajorConj4Unrolled (
02480   const Ordinal numRows,
02481   const Ordinal numCols,
02482   const Ordinal numVecs,
02483   const RangeScalar& beta,
02484   RangeScalar Y[],
02485   const Ordinal colStrideY,
02486   const RangeScalar alpha,
02487   const Ordinal ptr[],
02488   const Ordinal ind[],
02489   const MatrixScalar val[],
02490   const DomainScalar X[],
02491   const Ordinal colStrideX)
02492 {
02493   typedef Teuchos::ScalarTraits<RangeScalar> STS;
02494 
02495   Ordinal i = 0;
02496   // Special case for CSR only: Y(0,:) = 0.
02497   if (beta != STS::zero()) {
02498     for (Ordinal c = 0; c < numVecs; ++c) {
02499       Y[c*colStrideY] = beta * Y[c*colStrideY];
02500     }
02501   }
02502   else {
02503     // Follow the Sparse BLAS convention for beta == 0. 
02504     for (Ordinal c = 0; c < numVecs; ++c) {
02505       Y[c*colStrideY] = STS::zero();
02506     }
02507   }
02508   if (alpha == STS::zero()) {
02509     return; // Our work is done!
02510   }
02511   const Ordinal nnz = ptr[numRows];
02512   if (alpha == STS::one()) {
02513     for (Ordinal k = 0; k < nnz; ++k) {
02514       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
02515       const Ordinal j = ind[k];
02516       while (k >= ptr[i+1]) {
02517         ++i;
02518         // We haven't seen row i before; scale Y(i,:) by beta.
02519         RangeScalar* const Y_i = &Y[i];
02520         Ordinal c = 0;
02521         // Extra +1 in loop bound ensures first 4 iterations get
02522         // strip-mined, but requires that Ordinal be a signed type.
02523         for ( ; c < numVecs - 3; c += 4) {
02524           Y_i[0] *= beta;
02525           Y_i[colStrideY] *= beta;
02526           Y_i[2*colStrideY] *= beta;
02527           Y_i[3*colStrideY] *= beta;
02528         }
02529         for ( ; c < numVecs; ++c) {
02530           Y_i[c*colStrideY] *= beta;
02531         }
02532       }
02533       RangeScalar* const Y_i = &Y[i];
02534       const DomainScalar* const X_j = &X[j];
02535       Ordinal c = 0;
02536       // Extra +1 in loop bound ensures first 4 iterations get
02537       // strip-mined, but requires that Ordinal be a signed type.
02538       for ( ; c < numVecs - 3; c += 4) {
02539         Y_i[0] += A_ij * X_j[0];
02540         Y_i[colStrideY] += A_ij * X_j[colStrideX];
02541         Y_i[2*colStrideY] += A_ij * X_j[2*colStrideX];
02542         Y_i[3*colStrideY] += A_ij * X_j[3*colStrideX];
02543       }
02544       for ( ; c < numVecs; ++c) {
02545         Y_i[c*colStrideY] += A_ij * X_j[c*colStrideX];
02546       }
02547     }
02548   }
02549   else { // alpha != STS::one()
02550     for (Ordinal k = 0; k < nnz; ++k) {
02551       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
02552       const Ordinal j = ind[k];
02553       while (k >= ptr[i+1]) {
02554         ++i;
02555         // We haven't seen row i before; scale Y(i,:) by beta.
02556         RangeScalar* const Y_i = &Y[i];
02557         Ordinal c = 0;
02558         // Extra +1 in loop bound ensures first 4 iterations get
02559         // strip-mined, but requires that Ordinal be a signed type.
02560         for ( ; c < numVecs - 3; c += 4) {
02561           Y_i[0] *= beta;
02562           Y_i[colStrideY] *= beta;
02563           Y_i[2*colStrideY] *= beta;
02564           Y_i[3*colStrideY] *= beta;
02565         }
02566         for ( ; c < numVecs; ++c) {
02567           Y_i[c*colStrideY] *= beta;
02568         }
02569       }
02570       RangeScalar* const Y_i = &Y[i];
02571       const DomainScalar* const X_j = &X[j];
02572       Ordinal c = 0;
02573       // Extra +1 in loop bound ensures first 4 iterations get
02574       // strip-mined, but requires that Ordinal be a signed type.
02575       for ( ; c < numVecs - 3; c += 4) {
02576         Y_i[0] += alpha * A_ij * X_j[0];
02577         Y_i[colStrideY] += alpha * A_ij * X_j[colStrideX];
02578         Y_i[2*colStrideY] += alpha * A_ij * X_j[2*colStrideX];
02579         Y_i[3*colStrideY] += alpha * A_ij * X_j[3*colStrideX];
02580       }
02581       for ( ; c < numVecs; ++c) {
02582         Y_i[c*colStrideY] += alpha * A_ij * X_j[c*colStrideX];
02583       }
02584     }
02585   }
02586 }
02587 
02588 template<class Ordinal,
02589          class MatrixScalar,
02590          class DomainScalar,
02591          class RangeScalar>
02592 void
02593 matVecCsrColMajorConj1Vec (
02594   const Ordinal numRows,
02595   const Ordinal numCols,
02596   const RangeScalar& beta,
02597   RangeScalar Y[],
02598   const Ordinal colStrideY,
02599   const RangeScalar alpha,
02600   const Ordinal ptr[],
02601   const Ordinal ind[],
02602   const MatrixScalar val[],
02603   const DomainScalar X[],
02604   const Ordinal colStrideX)
02605 {
02606   typedef Teuchos::ScalarTraits<RangeScalar> STS;
02607 
02608   const Ordinal numVecs = 1;
02609   Ordinal i = 0;
02610   // Special case for CSR only: Y(0,:) = 0.
02611   if (beta != STS::zero()) {
02612     for (Ordinal c = 0; c < numVecs; ++c) {
02613       Y[c*colStrideY] = beta * Y[c*colStrideY];
02614     }
02615   }
02616   else {
02617     // Follow the Sparse BLAS convention for beta == 0. 
02618     for (Ordinal c = 0; c < numVecs; ++c) {
02619       Y[c*colStrideY] = STS::zero();
02620     }
02621   }
02622   if (alpha == STS::zero()) {
02623     return; // Our work is done!
02624   }
02625   const Ordinal nnz = ptr[numRows];
02626   if (alpha == STS::one()) {
02627     for (Ordinal k = 0; k < nnz; ++k) {
02628       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
02629       const Ordinal j = ind[k];
02630       while (k >= ptr[i+1]) {
02631         ++i;
02632         // We haven't seen row i before; scale Y(i,:) by beta.
02633         Y[i] *= beta;
02634       }
02635       Y[i] += A_ij * X[j];
02636     }
02637   }
02638   else { // alpha != STS::one()
02639     for (Ordinal k = 0; k < nnz; ++k) {
02640       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
02641       const Ordinal j = ind[k];
02642       while (k >= ptr[i+1]) {
02643         ++i;
02644         // We haven't seen row i before; scale Y(i,:) by beta.
02645         Y[i] *= beta;
02646       }
02647       Y[i] += alpha * A_ij * X[j];
02648     }
02649   }
02650 }
02651 
02652 template<class Ordinal,
02653          class MatrixScalar,
02654          class DomainScalar,
02655          class RangeScalar>
02656 void
02657 matVecCsrColMajorConj2Vec (
02658   const Ordinal numRows,
02659   const Ordinal numCols,
02660   const RangeScalar& beta,
02661   RangeScalar Y[],
02662   const Ordinal colStrideY,
02663   const RangeScalar alpha,
02664   const Ordinal ptr[],
02665   const Ordinal ind[],
02666   const MatrixScalar val[],
02667   const DomainScalar X[],
02668   const Ordinal colStrideX)
02669 {
02670   typedef Teuchos::ScalarTraits<RangeScalar> STS;
02671 
02672   const Ordinal numVecs = 2;
02673   Ordinal i = 0;
02674   // Special case for CSR only: Y(0,:) = 0.
02675   if (beta != STS::zero()) {
02676     for (Ordinal c = 0; c < numVecs; ++c) {
02677       Y[c*colStrideY] = beta * Y[c*colStrideY];
02678     }
02679   }
02680   else {
02681     // Follow the Sparse BLAS convention for beta == 0. 
02682     for (Ordinal c = 0; c < numVecs; ++c) {
02683       Y[c*colStrideY] = STS::zero();
02684     }
02685   }
02686   if (alpha == STS::zero()) {
02687     return; // Our work is done!
02688   }
02689   const Ordinal nnz = ptr[numRows];
02690   if (alpha == STS::one()) {
02691     for (Ordinal k = 0; k < nnz; ++k) {
02692       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
02693       const Ordinal j = ind[k];
02694       while (k >= ptr[i+1]) {
02695         ++i;
02696         // We haven't seen row i before; scale Y(i,:) by beta.
02697         RangeScalar* const Y_i = &Y[i];
02698         Y_i[0] *= beta;
02699         Y_i[colStrideY] *= beta;
02700       }
02701       RangeScalar* const Y_i = &Y[i];
02702       const DomainScalar* const X_j = &X[j];
02703       Y_i[0] += A_ij * X_j[0];
02704       Y_i[colStrideY] += A_ij * X_j[colStrideX];
02705     }
02706   }
02707   else { // alpha != STS::one()
02708     for (Ordinal k = 0; k < nnz; ++k) {
02709       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
02710       const Ordinal j = ind[k];
02711       while (k >= ptr[i+1]) {
02712         ++i;
02713         // We haven't seen row i before; scale Y(i,:) by beta.
02714         RangeScalar* const Y_i = &Y[i];
02715         Y_i[0] *= beta;
02716         Y_i[colStrideY] *= beta;
02717       }
02718       RangeScalar* const Y_i = &Y[i];
02719       const DomainScalar* const X_j = &X[j];
02720       Y_i[0] += alpha * A_ij * X_j[0];
02721       Y_i[colStrideY] += alpha * A_ij * X_j[colStrideX];
02722     }
02723   }
02724 }
02725 
02726 template<class Ordinal,
02727          class MatrixScalar,
02728          class DomainScalar,
02729          class RangeScalar>
02730 void
02731 matVecCsrColMajorConj3Vec (
02732   const Ordinal numRows,
02733   const Ordinal numCols,
02734   const RangeScalar& beta,
02735   RangeScalar Y[],
02736   const Ordinal colStrideY,
02737   const RangeScalar alpha,
02738   const Ordinal ptr[],
02739   const Ordinal ind[],
02740   const MatrixScalar val[],
02741   const DomainScalar X[],
02742   const Ordinal colStrideX)
02743 {
02744   typedef Teuchos::ScalarTraits<RangeScalar> STS;
02745 
02746   const Ordinal numVecs = 3;
02747   Ordinal i = 0;
02748   // Special case for CSR only: Y(0,:) = 0.
02749   if (beta != STS::zero()) {
02750     for (Ordinal c = 0; c < numVecs; ++c) {
02751       Y[c*colStrideY] = beta * Y[c*colStrideY];
02752     }
02753   }
02754   else {
02755     // Follow the Sparse BLAS convention for beta == 0. 
02756     for (Ordinal c = 0; c < numVecs; ++c) {
02757       Y[c*colStrideY] = STS::zero();
02758     }
02759   }
02760   if (alpha == STS::zero()) {
02761     return; // Our work is done!
02762   }
02763   const Ordinal nnz = ptr[numRows];
02764   if (alpha == STS::one()) {
02765     for (Ordinal k = 0; k < nnz; ++k) {
02766       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
02767       const Ordinal j = ind[k];
02768       while (k >= ptr[i+1]) {
02769         ++i;
02770         // We haven't seen row i before; scale Y(i,:) by beta.
02771         RangeScalar* const Y_i = &Y[i];
02772         Y_i[0] *= beta;
02773         Y_i[colStrideY] *= beta;
02774         Y_i[2*colStrideY] *= beta;
02775       }
02776       RangeScalar* const Y_i = &Y[i];
02777       const DomainScalar* const X_j = &X[j];
02778       Y_i[0] += A_ij * X_j[0];
02779       Y_i[colStrideY] += A_ij * X_j[colStrideX];
02780       Y_i[2*colStrideY] += A_ij * X_j[2*colStrideX];
02781     }
02782   }
02783   else { // alpha != STS::one()
02784     for (Ordinal k = 0; k < nnz; ++k) {
02785       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
02786       const Ordinal j = ind[k];
02787       while (k >= ptr[i+1]) {
02788         ++i;
02789         // We haven't seen row i before; scale Y(i,:) by beta.
02790         RangeScalar* const Y_i = &Y[i];
02791         Y_i[0] *= beta;
02792         Y_i[colStrideY] *= beta;
02793         Y_i[2*colStrideY] *= beta;
02794       }
02795       RangeScalar* const Y_i = &Y[i];
02796       const DomainScalar* const X_j = &X[j];
02797       Y_i[0] += alpha * A_ij * X_j[0];
02798       Y_i[colStrideY] += alpha * A_ij * X_j[colStrideX];
02799       Y_i[2*colStrideY] += alpha * A_ij * X_j[2*colStrideX];
02800     }
02801   }
02802 }
02803 
02804 template<class Ordinal,
02805          class MatrixScalar,
02806          class DomainScalar,
02807          class RangeScalar>
02808 void
02809 matVecCsrColMajorConj4Vec (
02810   const Ordinal numRows,
02811   const Ordinal numCols,
02812   const RangeScalar& beta,
02813   RangeScalar Y[],
02814   const Ordinal colStrideY,
02815   const RangeScalar alpha,
02816   const Ordinal ptr[],
02817   const Ordinal ind[],
02818   const MatrixScalar val[],
02819   const DomainScalar X[],
02820   const Ordinal colStrideX)
02821 {
02822   typedef Teuchos::ScalarTraits<RangeScalar> STS;
02823 
02824   const Ordinal numVecs = 4;
02825   Ordinal i = 0;
02826   // Special case for CSR only: Y(0,:) = 0.
02827   if (beta != STS::zero()) {
02828     for (Ordinal c = 0; c < numVecs; ++c) {
02829       Y[c*colStrideY] = beta * Y[c*colStrideY];
02830     }
02831   }
02832   else {
02833     // Follow the Sparse BLAS convention for beta == 0. 
02834     for (Ordinal c = 0; c < numVecs; ++c) {
02835       Y[c*colStrideY] = STS::zero();
02836     }
02837   }
02838   if (alpha == STS::zero()) {
02839     return; // Our work is done!
02840   }
02841   const Ordinal nnz = ptr[numRows];
02842   if (alpha == STS::one()) {
02843     for (Ordinal k = 0; k < nnz; ++k) {
02844       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
02845       const Ordinal j = ind[k];
02846       while (k >= ptr[i+1]) {
02847         ++i;
02848         // We haven't seen row i before; scale Y(i,:) by beta.
02849         RangeScalar* const Y_i = &Y[i];
02850         Y_i[0] *= beta;
02851         Y_i[colStrideY] *= beta;
02852         Y_i[2*colStrideY] *= beta;
02853         Y_i[3*colStrideY] *= beta;
02854       }
02855       RangeScalar* const Y_i = &Y[i];
02856       const DomainScalar* const X_j = &X[j];
02857       Y_i[0] += A_ij * X_j[0];
02858       Y_i[colStrideY] += A_ij * X_j[colStrideX];
02859       Y_i[2*colStrideY] += A_ij * X_j[2*colStrideX];
02860       Y_i[3*colStrideY] += A_ij * X_j[3*colStrideX];
02861     }
02862   }
02863   else { // alpha != STS::one()
02864     for (Ordinal k = 0; k < nnz; ++k) {
02865       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
02866       const Ordinal j = ind[k];
02867       while (k >= ptr[i+1]) {
02868         ++i;
02869         // We haven't seen row i before; scale Y(i,:) by beta.
02870         RangeScalar* const Y_i = &Y[i];
02871         Y_i[0] *= beta;
02872         Y_i[colStrideY] *= beta;
02873         Y_i[2*colStrideY] *= beta;
02874         Y_i[3*colStrideY] *= beta;
02875       }
02876       RangeScalar* const Y_i = &Y[i];
02877       const DomainScalar* const X_j = &X[j];
02878       Y_i[0] += alpha * A_ij * X_j[0];
02879       Y_i[colStrideY] += alpha * A_ij * X_j[colStrideX];
02880       Y_i[2*colStrideY] += alpha * A_ij * X_j[2*colStrideX];
02881       Y_i[3*colStrideY] += alpha * A_ij * X_j[3*colStrideX];
02882     }
02883   }
02884 }
02885 
02886 template<class Ordinal,
02887          class MatrixScalar,
02888          class DomainScalar,
02889          class RangeScalar>
02890 void
02891 matVecCscRowMajorConj (
02892   const Ordinal numRows,
02893   const Ordinal numCols,
02894   const Ordinal numVecs,
02895   const RangeScalar& beta,
02896   RangeScalar Y[],
02897   const Ordinal rowStrideY,
02898   const RangeScalar alpha,
02899   const Ordinal ptr[],
02900   const Ordinal ind[],
02901   const MatrixScalar val[],
02902   const DomainScalar X[],
02903   const Ordinal rowStrideX)
02904 {
02905   typedef Teuchos::ScalarTraits<RangeScalar> STS;
02906 
02907   // Prescale: Y := beta * Y.
02908   if (beta == STS::zero()) {
02909     for (Ordinal i = 0; i < numRows; ++i) {
02910       RangeScalar* const Y_i = &Y[i*rowStrideY];
02911       for (Ordinal j = 0; j < numVecs; ++j) {
02912         // Follow the Sparse BLAS convention for beta == 0. 
02913         Y_i[j] = STS::zero();
02914       }
02915     }
02916   }
02917   else if (beta != STS::one()) {
02918     for (Ordinal i = 0; i < numRows; ++i) {
02919       RangeScalar* const Y_i = &Y[i*rowStrideY];
02920       for (Ordinal j = 0; j < numVecs; ++j) {
02921         Y_i[j] = beta * Y_i[j];
02922       }
02923     }
02924   }
02925   Ordinal j = 0;
02926   if (alpha == STS::zero()) {
02927     return; // Our work is done!
02928   }
02929   const Ordinal nnz = ptr[numCols];
02930   if (alpha == STS::one()) {
02931     for (Ordinal k = 0; k < nnz; ++k) {
02932       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
02933       const Ordinal i = ind[k];
02934       while (k >= ptr[j+1]) {
02935         ++j;
02936       }
02937       RangeScalar* const Y_i = &Y[i*rowStrideY];
02938       const DomainScalar* const X_j = &X[j*rowStrideX];
02939       for (Ordinal c = 0; c < numVecs; ++c) {
02940         Y_i[c] += A_ij * X_j[c];
02941       }
02942     }
02943   }
02944   else { // alpha != STS::one()
02945     for (Ordinal k = 0; k < nnz; ++k) {
02946       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
02947       const Ordinal i = ind[k];
02948       while (k >= ptr[j+1]) {
02949         ++j;
02950       }
02951       RangeScalar* const Y_i = &Y[i*rowStrideY];
02952       const DomainScalar* const X_j = &X[j*rowStrideX];
02953       for (Ordinal c = 0; c < numVecs; ++c) {
02954         Y_i[c] += alpha * A_ij * X_j[c];
02955       }
02956     }
02957   }
02958 }
02959 
02960 template<class Ordinal,
02961          class MatrixScalar,
02962          class DomainScalar,
02963          class RangeScalar>
02964 void
02965 matVecCscRowMajorConj4Unrolled (
02966   const Ordinal numRows,
02967   const Ordinal numCols,
02968   const Ordinal numVecs,
02969   const RangeScalar& beta,
02970   RangeScalar Y[],
02971   const Ordinal rowStrideY,
02972   const RangeScalar alpha,
02973   const Ordinal ptr[],
02974   const Ordinal ind[],
02975   const MatrixScalar val[],
02976   const DomainScalar X[],
02977   const Ordinal rowStrideX)
02978 {
02979   typedef Teuchos::ScalarTraits<RangeScalar> STS;
02980 
02981   // Prescale: Y := beta * Y.
02982   if (beta == STS::zero()) {
02983     for (Ordinal i = 0; i < numRows; ++i) {
02984       RangeScalar* const Y_i = &Y[i*rowStrideY];
02985       for (Ordinal j = 0; j < numVecs; ++j) {
02986         // Follow the Sparse BLAS convention for beta == 0. 
02987         Y_i[j] = STS::zero();
02988       }
02989     }
02990   }
02991   else if (beta != STS::one()) {
02992     for (Ordinal i = 0; i < numRows; ++i) {
02993       RangeScalar* const Y_i = &Y[i*rowStrideY];
02994       for (Ordinal j = 0; j < numVecs; ++j) {
02995         Y_i[j] = beta * Y_i[j];
02996       }
02997     }
02998   }
02999   Ordinal j = 0;
03000   if (alpha == STS::zero()) {
03001     return; // Our work is done!
03002   }
03003   const Ordinal nnz = ptr[numCols];
03004   if (alpha == STS::one()) {
03005     for (Ordinal k = 0; k < nnz; ++k) {
03006       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
03007       const Ordinal i = ind[k];
03008       while (k >= ptr[j+1]) {
03009         ++j;
03010       }
03011       RangeScalar* const Y_i = &Y[i*rowStrideY];
03012       const DomainScalar* const X_j = &X[j*rowStrideX];
03013       Ordinal c = 0;
03014       // Extra +1 in loop bound ensures first 4 iterations get
03015       // strip-mined, but requires that Ordinal be a signed type.
03016       for ( ; c < numVecs - 3; c += 4) {
03017         Y_i[0] += A_ij * X_j[0];
03018         Y_i[1] += A_ij * X_j[1];
03019         Y_i[2] += A_ij * X_j[2];
03020         Y_i[3] += A_ij * X_j[3];
03021       }
03022       for ( ; c < numVecs; ++c) {
03023         Y_i[c] += A_ij * X_j[c];
03024       }
03025     }
03026   }
03027   else { // alpha != STS::one()
03028     for (Ordinal k = 0; k < nnz; ++k) {
03029       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
03030       const Ordinal i = ind[k];
03031       while (k >= ptr[j+1]) {
03032         ++j;
03033       }
03034       RangeScalar* const Y_i = &Y[i*rowStrideY];
03035       const DomainScalar* const X_j = &X[j*rowStrideX];
03036       Ordinal c = 0;
03037       // Extra +1 in loop bound ensures first 4 iterations get
03038       // strip-mined, but requires that Ordinal be a signed type.
03039       for ( ; c < numVecs - 3; c += 4) {
03040         Y_i[0] += alpha * A_ij * X_j[0];
03041         Y_i[1] += alpha * A_ij * X_j[1];
03042         Y_i[2] += alpha * A_ij * X_j[2];
03043         Y_i[3] += alpha * A_ij * X_j[3];
03044       }
03045       for ( ; c < numVecs; ++c) {
03046         Y_i[c] += alpha * A_ij * X_j[c];
03047       }
03048     }
03049   }
03050 }
03051 
03052 template<class Ordinal,
03053          class MatrixScalar,
03054          class DomainScalar,
03055          class RangeScalar>
03056 void
03057 matVecCscRowMajorConj1Vec (
03058   const Ordinal numRows,
03059   const Ordinal numCols,
03060   const RangeScalar& beta,
03061   RangeScalar Y[],
03062   const Ordinal rowStrideY,
03063   const RangeScalar alpha,
03064   const Ordinal ptr[],
03065   const Ordinal ind[],
03066   const MatrixScalar val[],
03067   const DomainScalar X[],
03068   const Ordinal rowStrideX)
03069 {
03070   typedef Teuchos::ScalarTraits<RangeScalar> STS;
03071 
03072   const Ordinal numVecs = 1;
03073   // Prescale: Y := beta * Y.
03074   if (beta == STS::zero()) {
03075     for (Ordinal i = 0; i < numRows; ++i) {
03076       RangeScalar* const Y_i = &Y[i*rowStrideY];
03077       for (Ordinal j = 0; j < numVecs; ++j) {
03078         // Follow the Sparse BLAS convention for beta == 0. 
03079         Y_i[j] = STS::zero();
03080       }
03081     }
03082   }
03083   else if (beta != STS::one()) {
03084     for (Ordinal i = 0; i < numRows; ++i) {
03085       RangeScalar* const Y_i = &Y[i*rowStrideY];
03086       for (Ordinal j = 0; j < numVecs; ++j) {
03087         Y_i[j] = beta * Y_i[j];
03088       }
03089     }
03090   }
03091   Ordinal j = 0;
03092   if (alpha == STS::zero()) {
03093     return; // Our work is done!
03094   }
03095   const Ordinal nnz = ptr[numCols];
03096   if (alpha == STS::one()) {
03097     for (Ordinal k = 0; k < nnz; ++k) {
03098       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
03099       const Ordinal i = ind[k];
03100       while (k >= ptr[j+1]) {
03101         ++j;
03102       }
03103       Y[i*rowStrideY] += A_ij * X[j*rowStrideX];
03104     }
03105   }
03106   else { // alpha != STS::one()
03107     for (Ordinal k = 0; k < nnz; ++k) {
03108       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
03109       const Ordinal i = ind[k];
03110       while (k >= ptr[j+1]) {
03111         ++j;
03112       }
03113       Y[i*rowStrideY] += alpha * A_ij * X[j*rowStrideX];
03114     }
03115   }
03116 }
03117 
03118 template<class Ordinal,
03119          class MatrixScalar,
03120          class DomainScalar,
03121          class RangeScalar>
03122 void
03123 matVecCscRowMajorConj2Vec (
03124   const Ordinal numRows,
03125   const Ordinal numCols,
03126   const RangeScalar& beta,
03127   RangeScalar Y[],
03128   const Ordinal rowStrideY,
03129   const RangeScalar alpha,
03130   const Ordinal ptr[],
03131   const Ordinal ind[],
03132   const MatrixScalar val[],
03133   const DomainScalar X[],
03134   const Ordinal rowStrideX)
03135 {
03136   typedef Teuchos::ScalarTraits<RangeScalar> STS;
03137 
03138   const Ordinal numVecs = 2;
03139   // Prescale: Y := beta * Y.
03140   if (beta == STS::zero()) {
03141     for (Ordinal i = 0; i < numRows; ++i) {
03142       RangeScalar* const Y_i = &Y[i*rowStrideY];
03143       for (Ordinal j = 0; j < numVecs; ++j) {
03144         // Follow the Sparse BLAS convention for beta == 0. 
03145         Y_i[j] = STS::zero();
03146       }
03147     }
03148   }
03149   else if (beta != STS::one()) {
03150     for (Ordinal i = 0; i < numRows; ++i) {
03151       RangeScalar* const Y_i = &Y[i*rowStrideY];
03152       for (Ordinal j = 0; j < numVecs; ++j) {
03153         Y_i[j] = beta * Y_i[j];
03154       }
03155     }
03156   }
03157   Ordinal j = 0;
03158   if (alpha == STS::zero()) {
03159     return; // Our work is done!
03160   }
03161   const Ordinal nnz = ptr[numCols];
03162   if (alpha == STS::one()) {
03163     for (Ordinal k = 0; k < nnz; ++k) {
03164       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
03165       const Ordinal i = ind[k];
03166       while (k >= ptr[j+1]) {
03167         ++j;
03168       }
03169       RangeScalar* const Y_i = &Y[i*rowStrideY];
03170       const DomainScalar* const X_j = &X[j*rowStrideX];
03171       Y_i[0] += A_ij * X_j[0];
03172       Y_i[1] += A_ij * X_j[1];
03173     }
03174   }
03175   else { // alpha != STS::one()
03176     for (Ordinal k = 0; k < nnz; ++k) {
03177       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
03178       const Ordinal i = ind[k];
03179       while (k >= ptr[j+1]) {
03180         ++j;
03181       }
03182       RangeScalar* const Y_i = &Y[i*rowStrideY];
03183       const DomainScalar* const X_j = &X[j*rowStrideX];
03184       Y_i[0] += alpha * A_ij * X_j[0];
03185       Y_i[1] += alpha * A_ij * X_j[1];
03186     }
03187   }
03188 }
03189 
03190 template<class Ordinal,
03191          class MatrixScalar,
03192          class DomainScalar,
03193          class RangeScalar>
03194 void
03195 matVecCscRowMajorConj3Vec (
03196   const Ordinal numRows,
03197   const Ordinal numCols,
03198   const RangeScalar& beta,
03199   RangeScalar Y[],
03200   const Ordinal rowStrideY,
03201   const RangeScalar alpha,
03202   const Ordinal ptr[],
03203   const Ordinal ind[],
03204   const MatrixScalar val[],
03205   const DomainScalar X[],
03206   const Ordinal rowStrideX)
03207 {
03208   typedef Teuchos::ScalarTraits<RangeScalar> STS;
03209 
03210   const Ordinal numVecs = 3;
03211   // Prescale: Y := beta * Y.
03212   if (beta == STS::zero()) {
03213     for (Ordinal i = 0; i < numRows; ++i) {
03214       RangeScalar* const Y_i = &Y[i*rowStrideY];
03215       for (Ordinal j = 0; j < numVecs; ++j) {
03216         // Follow the Sparse BLAS convention for beta == 0. 
03217         Y_i[j] = STS::zero();
03218       }
03219     }
03220   }
03221   else if (beta != STS::one()) {
03222     for (Ordinal i = 0; i < numRows; ++i) {
03223       RangeScalar* const Y_i = &Y[i*rowStrideY];
03224       for (Ordinal j = 0; j < numVecs; ++j) {
03225         Y_i[j] = beta * Y_i[j];
03226       }
03227     }
03228   }
03229   Ordinal j = 0;
03230   if (alpha == STS::zero()) {
03231     return; // Our work is done!
03232   }
03233   const Ordinal nnz = ptr[numCols];
03234   if (alpha == STS::one()) {
03235     for (Ordinal k = 0; k < nnz; ++k) {
03236       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
03237       const Ordinal i = ind[k];
03238       while (k >= ptr[j+1]) {
03239         ++j;
03240       }
03241       RangeScalar* const Y_i = &Y[i*rowStrideY];
03242       const DomainScalar* const X_j = &X[j*rowStrideX];
03243       Y_i[0] += A_ij * X_j[0];
03244       Y_i[1] += A_ij * X_j[1];
03245       Y_i[2] += A_ij * X_j[2];
03246     }
03247   }
03248   else { // alpha != STS::one()
03249     for (Ordinal k = 0; k < nnz; ++k) {
03250       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
03251       const Ordinal i = ind[k];
03252       while (k >= ptr[j+1]) {
03253         ++j;
03254       }
03255       RangeScalar* const Y_i = &Y[i*rowStrideY];
03256       const DomainScalar* const X_j = &X[j*rowStrideX];
03257       Y_i[0] += alpha * A_ij * X_j[0];
03258       Y_i[1] += alpha * A_ij * X_j[1];
03259       Y_i[2] += alpha * A_ij * X_j[2];
03260     }
03261   }
03262 }
03263 
03264 template<class Ordinal,
03265          class MatrixScalar,
03266          class DomainScalar,
03267          class RangeScalar>
03268 void
03269 matVecCscRowMajorConj4Vec (
03270   const Ordinal numRows,
03271   const Ordinal numCols,
03272   const RangeScalar& beta,
03273   RangeScalar Y[],
03274   const Ordinal rowStrideY,
03275   const RangeScalar alpha,
03276   const Ordinal ptr[],
03277   const Ordinal ind[],
03278   const MatrixScalar val[],
03279   const DomainScalar X[],
03280   const Ordinal rowStrideX)
03281 {
03282   typedef Teuchos::ScalarTraits<RangeScalar> STS;
03283 
03284   const Ordinal numVecs = 4;
03285   // Prescale: Y := beta * Y.
03286   if (beta == STS::zero()) {
03287     for (Ordinal i = 0; i < numRows; ++i) {
03288       RangeScalar* const Y_i = &Y[i*rowStrideY];
03289       for (Ordinal j = 0; j < numVecs; ++j) {
03290         // Follow the Sparse BLAS convention for beta == 0. 
03291         Y_i[j] = STS::zero();
03292       }
03293     }
03294   }
03295   else if (beta != STS::one()) {
03296     for (Ordinal i = 0; i < numRows; ++i) {
03297       RangeScalar* const Y_i = &Y[i*rowStrideY];
03298       for (Ordinal j = 0; j < numVecs; ++j) {
03299         Y_i[j] = beta * Y_i[j];
03300       }
03301     }
03302   }
03303   Ordinal j = 0;
03304   if (alpha == STS::zero()) {
03305     return; // Our work is done!
03306   }
03307   const Ordinal nnz = ptr[numCols];
03308   if (alpha == STS::one()) {
03309     for (Ordinal k = 0; k < nnz; ++k) {
03310       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
03311       const Ordinal i = ind[k];
03312       while (k >= ptr[j+1]) {
03313         ++j;
03314       }
03315       RangeScalar* const Y_i = &Y[i*rowStrideY];
03316       const DomainScalar* const X_j = &X[j*rowStrideX];
03317       Y_i[0] += A_ij * X_j[0];
03318       Y_i[1] += A_ij * X_j[1];
03319       Y_i[2] += A_ij * X_j[2];
03320       Y_i[3] += A_ij * X_j[3];
03321     }
03322   }
03323   else { // alpha != STS::one()
03324     for (Ordinal k = 0; k < nnz; ++k) {
03325       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
03326       const Ordinal i = ind[k];
03327       while (k >= ptr[j+1]) {
03328         ++j;
03329       }
03330       RangeScalar* const Y_i = &Y[i*rowStrideY];
03331       const DomainScalar* const X_j = &X[j*rowStrideX];
03332       Y_i[0] += alpha * A_ij * X_j[0];
03333       Y_i[1] += alpha * A_ij * X_j[1];
03334       Y_i[2] += alpha * A_ij * X_j[2];
03335       Y_i[3] += alpha * A_ij * X_j[3];
03336     }
03337   }
03338 }
03339 
03340 template<class Ordinal,
03341          class MatrixScalar,
03342          class DomainScalar,
03343          class RangeScalar>
03344 void
03345 matVecCsrRowMajorConj (
03346   const Ordinal numRows,
03347   const Ordinal numCols,
03348   const Ordinal numVecs,
03349   const RangeScalar& beta,
03350   RangeScalar Y[],
03351   const Ordinal rowStrideY,
03352   const RangeScalar alpha,
03353   const Ordinal ptr[],
03354   const Ordinal ind[],
03355   const MatrixScalar val[],
03356   const DomainScalar X[],
03357   const Ordinal rowStrideX)
03358 {
03359   typedef Teuchos::ScalarTraits<RangeScalar> STS;
03360 
03361   Ordinal i = 0;
03362   // Special case for CSR only: Y(0,:) = 0.
03363   if (beta != STS::zero()) {
03364     for (Ordinal c = 0; c < numVecs; ++c) {
03365       Y[c] = beta * Y[c];
03366     }
03367   }
03368   else {
03369     // Follow the Sparse BLAS convention for beta == 0. 
03370     for (Ordinal c = 0; c < numVecs; ++c) {
03371       Y[c] = STS::zero();
03372     }
03373   }
03374   if (alpha == STS::zero()) {
03375     return; // Our work is done!
03376   }
03377   const Ordinal nnz = ptr[numRows];
03378   if (alpha == STS::one()) {
03379     for (Ordinal k = 0; k < nnz; ++k) {
03380       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
03381       const Ordinal j = ind[k];
03382       while (k >= ptr[i+1]) {
03383         ++i;
03384         // We haven't seen row i before; scale Y(i,:) by beta.
03385         RangeScalar* const Y_i = &Y[i*rowStrideY];
03386         for (Ordinal c = 0; c < numVecs; ++c) {
03387           Y_i[c] = beta * Y_i[c];
03388         }
03389       }
03390       RangeScalar* const Y_i = &Y[i*rowStrideY];
03391       const DomainScalar* const X_j = &X[j*rowStrideX];
03392       for (Ordinal c = 0; c < numVecs; ++c) {
03393         Y_i[c] += A_ij * X_j[c];
03394       }
03395     }
03396   }
03397   else { // alpha != STS::one()
03398     for (Ordinal k = 0; k < nnz; ++k) {
03399       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
03400       const Ordinal j = ind[k];
03401       while (k >= ptr[i+1]) {
03402         ++i;
03403         // We haven't seen row i before; scale Y(i,:) by beta.
03404         RangeScalar* const Y_i = &Y[i*rowStrideY];
03405         for (Ordinal c = 0; c < numVecs; ++c) {
03406           Y_i[c] = beta * Y_i[c];
03407         }
03408       }
03409       RangeScalar* const Y_i = &Y[i*rowStrideY];
03410       const DomainScalar* const X_j = &X[j*rowStrideX];
03411       for (Ordinal c = 0; c < numVecs; ++c) {
03412         Y_i[c] += alpha * A_ij * X_j[c];
03413       }
03414     }
03415   }
03416 }
03417 
03418 template<class Ordinal,
03419          class MatrixScalar,
03420          class DomainScalar,
03421          class RangeScalar>
03422 void
03423 matVecCsrRowMajorConj4Unrolled (
03424   const Ordinal numRows,
03425   const Ordinal numCols,
03426   const Ordinal numVecs,
03427   const RangeScalar& beta,
03428   RangeScalar Y[],
03429   const Ordinal rowStrideY,
03430   const RangeScalar alpha,
03431   const Ordinal ptr[],
03432   const Ordinal ind[],
03433   const MatrixScalar val[],
03434   const DomainScalar X[],
03435   const Ordinal rowStrideX)
03436 {
03437   typedef Teuchos::ScalarTraits<RangeScalar> STS;
03438 
03439   Ordinal i = 0;
03440   // Special case for CSR only: Y(0,:) = 0.
03441   if (beta != STS::zero()) {
03442     for (Ordinal c = 0; c < numVecs; ++c) {
03443       Y[c] = beta * Y[c];
03444     }
03445   }
03446   else {
03447     // Follow the Sparse BLAS convention for beta == 0. 
03448     for (Ordinal c = 0; c < numVecs; ++c) {
03449       Y[c] = STS::zero();
03450     }
03451   }
03452   if (alpha == STS::zero()) {
03453     return; // Our work is done!
03454   }
03455   const Ordinal nnz = ptr[numRows];
03456   if (alpha == STS::one()) {
03457     for (Ordinal k = 0; k < nnz; ++k) {
03458       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
03459       const Ordinal j = ind[k];
03460       while (k >= ptr[i+1]) {
03461         ++i;
03462         // We haven't seen row i before; scale Y(i,:) by beta.
03463         RangeScalar* const Y_i = &Y[i*rowStrideY];
03464         Ordinal c = 0;
03465         // Extra +1 in loop bound ensures first 4 iterations get
03466         // strip-mined, but requires that Ordinal be a signed type.
03467         for ( ; c < numVecs - 3; c += 4) {
03468           Y_i[0] *= beta;
03469           Y_i[1] *= beta;
03470           Y_i[2] *= beta;
03471           Y_i[3] *= beta;
03472         }
03473         for ( ; c < numVecs; ++c) {
03474           Y_i[c] *= beta;
03475         }
03476       }
03477       RangeScalar* const Y_i = &Y[i*rowStrideY];
03478       const DomainScalar* const X_j = &X[j*rowStrideX];
03479       Ordinal c = 0;
03480       // Extra +1 in loop bound ensures first 4 iterations get
03481       // strip-mined, but requires that Ordinal be a signed type.
03482       for ( ; c < numVecs - 3; c += 4) {
03483         Y_i[0] += A_ij * X_j[0];
03484         Y_i[1] += A_ij * X_j[1];
03485         Y_i[2] += A_ij * X_j[2];
03486         Y_i[3] += A_ij * X_j[3];
03487       }
03488       for ( ; c < numVecs; ++c) {
03489         Y_i[c] += A_ij * X_j[c];
03490       }
03491     }
03492   }
03493   else { // alpha != STS::one()
03494     for (Ordinal k = 0; k < nnz; ++k) {
03495       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
03496       const Ordinal j = ind[k];
03497       while (k >= ptr[i+1]) {
03498         ++i;
03499         // We haven't seen row i before; scale Y(i,:) by beta.
03500         RangeScalar* const Y_i = &Y[i*rowStrideY];
03501         Ordinal c = 0;
03502         // Extra +1 in loop bound ensures first 4 iterations get
03503         // strip-mined, but requires that Ordinal be a signed type.
03504         for ( ; c < numVecs - 3; c += 4) {
03505           Y_i[0] *= beta;
03506           Y_i[1] *= beta;
03507           Y_i[2] *= beta;
03508           Y_i[3] *= beta;
03509         }
03510         for ( ; c < numVecs; ++c) {
03511           Y_i[c] *= beta;
03512         }
03513       }
03514       RangeScalar* const Y_i = &Y[i*rowStrideY];
03515       const DomainScalar* const X_j = &X[j*rowStrideX];
03516       Ordinal c = 0;
03517       // Extra +1 in loop bound ensures first 4 iterations get
03518       // strip-mined, but requires that Ordinal be a signed type.
03519       for ( ; c < numVecs - 3; c += 4) {
03520         Y_i[0] += alpha * A_ij * X_j[0];
03521         Y_i[1] += alpha * A_ij * X_j[1];
03522         Y_i[2] += alpha * A_ij * X_j[2];
03523         Y_i[3] += alpha * A_ij * X_j[3];
03524       }
03525       for ( ; c < numVecs; ++c) {
03526         Y_i[c] += alpha * A_ij * X_j[c];
03527       }
03528     }
03529   }
03530 }
03531 
03532 template<class Ordinal,
03533          class MatrixScalar,
03534          class DomainScalar,
03535          class RangeScalar>
03536 void
03537 matVecCsrRowMajorConj1Vec (
03538   const Ordinal numRows,
03539   const Ordinal numCols,
03540   const RangeScalar& beta,
03541   RangeScalar Y[],
03542   const Ordinal rowStrideY,
03543   const RangeScalar alpha,
03544   const Ordinal ptr[],
03545   const Ordinal ind[],
03546   const MatrixScalar val[],
03547   const DomainScalar X[],
03548   const Ordinal rowStrideX)
03549 {
03550   typedef Teuchos::ScalarTraits<RangeScalar> STS;
03551 
03552   const Ordinal numVecs = 1;
03553   Ordinal i = 0;
03554   // Special case for CSR only: Y(0,:) = 0.
03555   if (beta != STS::zero()) {
03556     for (Ordinal c = 0; c < numVecs; ++c) {
03557       Y[c] = beta * Y[c];
03558     }
03559   }
03560   else {
03561     // Follow the Sparse BLAS convention for beta == 0. 
03562     for (Ordinal c = 0; c < numVecs; ++c) {
03563       Y[c] = STS::zero();
03564     }
03565   }
03566   if (alpha == STS::zero()) {
03567     return; // Our work is done!
03568   }
03569   const Ordinal nnz = ptr[numRows];
03570   if (alpha == STS::one()) {
03571     for (Ordinal k = 0; k < nnz; ++k) {
03572       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
03573       const Ordinal j = ind[k];
03574       while (k >= ptr[i+1]) {
03575         ++i;
03576         // We haven't seen row i before; scale Y(i,:) by beta.
03577         Y[i*rowStrideY] *= beta;
03578       }
03579       Y[i*rowStrideY] += A_ij * X[j*rowStrideX];
03580     }
03581   }
03582   else { // alpha != STS::one()
03583     for (Ordinal k = 0; k < nnz; ++k) {
03584       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
03585       const Ordinal j = ind[k];
03586       while (k >= ptr[i+1]) {
03587         ++i;
03588         // We haven't seen row i before; scale Y(i,:) by beta.
03589         Y[i*rowStrideY] *= beta;
03590       }
03591       Y[i*rowStrideY] += alpha * A_ij * X[j*rowStrideX];
03592     }
03593   }
03594 }
03595 
03596 template<class Ordinal,
03597          class MatrixScalar,
03598          class DomainScalar,
03599          class RangeScalar>
03600 void
03601 matVecCsrRowMajorConj2Vec (
03602   const Ordinal numRows,
03603   const Ordinal numCols,
03604   const RangeScalar& beta,
03605   RangeScalar Y[],
03606   const Ordinal rowStrideY,
03607   const RangeScalar alpha,
03608   const Ordinal ptr[],
03609   const Ordinal ind[],
03610   const MatrixScalar val[],
03611   const DomainScalar X[],
03612   const Ordinal rowStrideX)
03613 {
03614   typedef Teuchos::ScalarTraits<RangeScalar> STS;
03615 
03616   const Ordinal numVecs = 2;
03617   Ordinal i = 0;
03618   // Special case for CSR only: Y(0,:) = 0.
03619   if (beta != STS::zero()) {
03620     for (Ordinal c = 0; c < numVecs; ++c) {
03621       Y[c] = beta * Y[c];
03622     }
03623   }
03624   else {
03625     // Follow the Sparse BLAS convention for beta == 0. 
03626     for (Ordinal c = 0; c < numVecs; ++c) {
03627       Y[c] = STS::zero();
03628     }
03629   }
03630   if (alpha == STS::zero()) {
03631     return; // Our work is done!
03632   }
03633   const Ordinal nnz = ptr[numRows];
03634   if (alpha == STS::one()) {
03635     for (Ordinal k = 0; k < nnz; ++k) {
03636       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
03637       const Ordinal j = ind[k];
03638       while (k >= ptr[i+1]) {
03639         ++i;
03640         // We haven't seen row i before; scale Y(i,:) by beta.
03641         RangeScalar* const Y_i = &Y[i*rowStrideY];
03642         Y_i[0] *= beta;
03643         Y_i[1] *= beta;
03644       }
03645       RangeScalar* const Y_i = &Y[i*rowStrideY];
03646       const DomainScalar* const X_j = &X[j*rowStrideX];
03647       Y_i[0] += A_ij * X_j[0];
03648       Y_i[1] += A_ij * X_j[1];
03649     }
03650   }
03651   else { // alpha != STS::one()
03652     for (Ordinal k = 0; k < nnz; ++k) {
03653       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
03654       const Ordinal j = ind[k];
03655       while (k >= ptr[i+1]) {
03656         ++i;
03657         // We haven't seen row i before; scale Y(i,:) by beta.
03658         RangeScalar* const Y_i = &Y[i*rowStrideY];
03659         Y_i[0] *= beta;
03660         Y_i[1] *= beta;
03661       }
03662       RangeScalar* const Y_i = &Y[i*rowStrideY];
03663       const DomainScalar* const X_j = &X[j*rowStrideX];
03664       Y_i[0] += alpha * A_ij * X_j[0];
03665       Y_i[1] += alpha * A_ij * X_j[1];
03666     }
03667   }
03668 }
03669 
03670 template<class Ordinal,
03671          class MatrixScalar,
03672          class DomainScalar,
03673          class RangeScalar>
03674 void
03675 matVecCsrRowMajorConj3Vec (
03676   const Ordinal numRows,
03677   const Ordinal numCols,
03678   const RangeScalar& beta,
03679   RangeScalar Y[],
03680   const Ordinal rowStrideY,
03681   const RangeScalar alpha,
03682   const Ordinal ptr[],
03683   const Ordinal ind[],
03684   const MatrixScalar val[],
03685   const DomainScalar X[],
03686   const Ordinal rowStrideX)
03687 {
03688   typedef Teuchos::ScalarTraits<RangeScalar> STS;
03689 
03690   const Ordinal numVecs = 3;
03691   Ordinal i = 0;
03692   // Special case for CSR only: Y(0,:) = 0.
03693   if (beta != STS::zero()) {
03694     for (Ordinal c = 0; c < numVecs; ++c) {
03695       Y[c] = beta * Y[c];
03696     }
03697   }
03698   else {
03699     // Follow the Sparse BLAS convention for beta == 0. 
03700     for (Ordinal c = 0; c < numVecs; ++c) {
03701       Y[c] = STS::zero();
03702     }
03703   }
03704   if (alpha == STS::zero()) {
03705     return; // Our work is done!
03706   }
03707   const Ordinal nnz = ptr[numRows];
03708   if (alpha == STS::one()) {
03709     for (Ordinal k = 0; k < nnz; ++k) {
03710       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
03711       const Ordinal j = ind[k];
03712       while (k >= ptr[i+1]) {
03713         ++i;
03714         // We haven't seen row i before; scale Y(i,:) by beta.
03715         RangeScalar* const Y_i = &Y[i*rowStrideY];
03716         Y_i[0] *= beta;
03717         Y_i[1] *= beta;
03718         Y_i[2] *= beta;
03719       }
03720       RangeScalar* const Y_i = &Y[i*rowStrideY];
03721       const DomainScalar* const X_j = &X[j*rowStrideX];
03722       Y_i[0] += A_ij * X_j[0];
03723       Y_i[1] += A_ij * X_j[1];
03724       Y_i[2] += A_ij * X_j[2];
03725     }
03726   }
03727   else { // alpha != STS::one()
03728     for (Ordinal k = 0; k < nnz; ++k) {
03729       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
03730       const Ordinal j = ind[k];
03731       while (k >= ptr[i+1]) {
03732         ++i;
03733         // We haven't seen row i before; scale Y(i,:) by beta.
03734         RangeScalar* const Y_i = &Y[i*rowStrideY];
03735         Y_i[0] *= beta;
03736         Y_i[1] *= beta;
03737         Y_i[2] *= beta;
03738       }
03739       RangeScalar* const Y_i = &Y[i*rowStrideY];
03740       const DomainScalar* const X_j = &X[j*rowStrideX];
03741       Y_i[0] += alpha * A_ij * X_j[0];
03742       Y_i[1] += alpha * A_ij * X_j[1];
03743       Y_i[2] += alpha * A_ij * X_j[2];
03744     }
03745   }
03746 }
03747 
03748 template<class Ordinal,
03749          class MatrixScalar,
03750          class DomainScalar,
03751          class RangeScalar>
03752 void
03753 matVecCsrRowMajorConj4Vec (
03754   const Ordinal numRows,
03755   const Ordinal numCols,
03756   const RangeScalar& beta,
03757   RangeScalar Y[],
03758   const Ordinal rowStrideY,
03759   const RangeScalar alpha,
03760   const Ordinal ptr[],
03761   const Ordinal ind[],
03762   const MatrixScalar val[],
03763   const DomainScalar X[],
03764   const Ordinal rowStrideX)
03765 {
03766   typedef Teuchos::ScalarTraits<RangeScalar> STS;
03767 
03768   const Ordinal numVecs = 4;
03769   Ordinal i = 0;
03770   // Special case for CSR only: Y(0,:) = 0.
03771   if (beta != STS::zero()) {
03772     for (Ordinal c = 0; c < numVecs; ++c) {
03773       Y[c] = beta * Y[c];
03774     }
03775   }
03776   else {
03777     // Follow the Sparse BLAS convention for beta == 0. 
03778     for (Ordinal c = 0; c < numVecs; ++c) {
03779       Y[c] = STS::zero();
03780     }
03781   }
03782   if (alpha == STS::zero()) {
03783     return; // Our work is done!
03784   }
03785   const Ordinal nnz = ptr[numRows];
03786   if (alpha == STS::one()) {
03787     for (Ordinal k = 0; k < nnz; ++k) {
03788       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
03789       const Ordinal j = ind[k];
03790       while (k >= ptr[i+1]) {
03791         ++i;
03792         // We haven't seen row i before; scale Y(i,:) by beta.
03793         RangeScalar* const Y_i = &Y[i*rowStrideY];
03794         Y_i[0] *= beta;
03795         Y_i[1] *= beta;
03796         Y_i[2] *= beta;
03797         Y_i[3] *= beta;
03798       }
03799       RangeScalar* const Y_i = &Y[i*rowStrideY];
03800       const DomainScalar* const X_j = &X[j*rowStrideX];
03801       Y_i[0] += A_ij * X_j[0];
03802       Y_i[1] += A_ij * X_j[1];
03803       Y_i[2] += A_ij * X_j[2];
03804       Y_i[3] += A_ij * X_j[3];
03805     }
03806   }
03807   else { // alpha != STS::one()
03808     for (Ordinal k = 0; k < nnz; ++k) {
03809       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
03810       const Ordinal j = ind[k];
03811       while (k >= ptr[i+1]) {
03812         ++i;
03813         // We haven't seen row i before; scale Y(i,:) by beta.
03814         RangeScalar* const Y_i = &Y[i*rowStrideY];
03815         Y_i[0] *= beta;
03816         Y_i[1] *= beta;
03817         Y_i[2] *= beta;
03818         Y_i[3] *= beta;
03819       }
03820       RangeScalar* const Y_i = &Y[i*rowStrideY];
03821       const DomainScalar* const X_j = &X[j*rowStrideX];
03822       Y_i[0] += alpha * A_ij * X_j[0];
03823       Y_i[1] += alpha * A_ij * X_j[1];
03824       Y_i[2] += alpha * A_ij * X_j[2];
03825       Y_i[3] += alpha * A_ij * X_j[3];
03826     }
03827   }
03828 }
03829 
03830 } // namespace Raw
03831 } // namespace Kokkos
03832 
03833 #endif // #ifndef __Kokkos_Raw_SparseMatVec_def_hpp
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends