Kokkos Node API and Local Linear Algebra Kernels Version of the Day
Kokkos_Raw_SparseMatVec_def.hpp
Go to the documentation of this file.
00001 //@HEADER
00002 // ************************************************************************
00003 // 
00004 //          Kokkos: Node API and Parallel Node Kernels
00005 //              Copyright (2008) Sandia Corporation
00006 // 
00007 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
00008 // the U.S. Government retains certain rights in this software.
00009 // 
00010 // Redistribution and use in source and binary forms, with or without
00011 // modification, are permitted provided that the following conditions are
00012 // met:
00013 //
00014 // 1. Redistributions of source code must retain the above copyright
00015 // notice, this list of conditions and the following disclaimer.
00016 //
00017 // 2. Redistributions in binary form must reproduce the above copyright
00018 // notice, this list of conditions and the following disclaimer in the
00019 // documentation and/or other materials provided with the distribution.
00020 //
00021 // 3. Neither the name of the Corporation nor the names of the
00022 // contributors may be used to endorse or promote products derived from
00023 // this software without specific prior written permission.
00024 //
00025 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
00026 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00027 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00028 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
00029 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00030 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00031 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00032 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00033 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00034 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00035 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00036 //
00037 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
00038 // 
00039 // ************************************************************************
00040 //@HEADER
00041 
00042 #ifndef __Kokkos_Raw_SparseMatVec_def_hpp
00043 #define __Kokkos_Raw_SparseMatVec_def_hpp
00044 
00051 
00052 namespace Kokkos {
00053 namespace Raw {
00054 
00055 template<class Ordinal,
00056          class MatrixScalar,
00057          class DomainScalar,
00058          class RangeScalar>
00059 void
00060 matVecCscColMajorForfor (
00061   const Ordinal numRows,
00062   const Ordinal numCols,
00063   const Ordinal numVecs,
00064   const RangeScalar& beta,
00065   RangeScalar Y[],
00066   const Ordinal colStrideY,
00067   const RangeScalar& alpha,
00068   const size_t  ptr[],
00069   const Ordinal ind[],
00070   const MatrixScalar val[],
00071   const DomainScalar X[],
00072   const Ordinal colStrideX)
00073 {
00074   typedef Teuchos::ScalarTraits<RangeScalar> STS;
00075 
00076   // Prescale: Y := beta * Y.
00077   if (beta == STS::zero()) {
00078     for (Ordinal j = 0; j < numVecs; ++j) {
00079       RangeScalar* const Y_j = &Y[j*colStrideY];
00080       for (Ordinal i = 0; i < numRows; ++i) {
00081         // Follow the Sparse BLAS convention for beta == 0. 
00082         Y_j[i] = STS::zero();
00083       }
00084     }
00085   }
00086   else if (beta != STS::one()) {
00087     for (Ordinal j = 0; j < numVecs; ++j) {
00088       RangeScalar* const Y_j = &Y[j*colStrideY];
00089       for (Ordinal i = 0; i < numRows; ++i) {
00090         Y_j[i] = beta * Y_j[i];
00091       }
00092     }
00093   }
00094   // Outer for loop preface:
00095   if (alpha == STS::zero()) {
00096     return; // Our work is done!
00097   }
00098   if (alpha == STS::one()) {
00099     for (Ordinal j = 0; j < numCols; ++j) {
00100       for (Ordinal c = 0; c < numVecs; ++c) {
00101         const DomainScalar tmp = X[j + c*colStrideX];
00102 
00103         for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
00104           const MatrixScalar A_ij = val[k];
00105           const Ordinal i = ind[k];
00106           Y[i + c*colStrideY] += A_ij * tmp;
00107         }
00108       }
00109     }
00110   }
00111   else if (alpha == -STS::one()) {
00112     for (Ordinal j = 0; j < numCols; ++j) {
00113       for (Ordinal c = 0; c < numVecs; ++c) {
00114         const DomainScalar tmp = X[j + c*colStrideX];
00115 
00116         for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
00117           const MatrixScalar A_ij = val[k];
00118           const Ordinal i = ind[k];
00119           Y[i + c*colStrideY] -= A_ij * tmp;
00120         }
00121       }
00122     }
00123   }
00124   else { // alpha != 1 && alpha != -1
00125     for (Ordinal j = 0; j < numCols; ++j) {
00126       for (Ordinal c = 0; c < numVecs; ++c) {
00127         const DomainScalar tmp = X[j + c*colStrideX];
00128 
00129         for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
00130           const MatrixScalar A_ij = val[k];
00131           const Ordinal i = ind[k];
00132           Y[i + c*colStrideY] += alpha * A_ij * tmp;
00133         }
00134       }
00135     }
00136   }
00137 }
00138 
00139 template<class Ordinal,
00140          class MatrixScalar,
00141          class DomainScalar,
00142          class RangeScalar>
00143 void
00144 matVecCscColMajorForfor4Unrolled (
00145   const Ordinal numRows,
00146   const Ordinal numCols,
00147   const Ordinal numVecs,
00148   const RangeScalar& beta,
00149   RangeScalar Y[],
00150   const Ordinal colStrideY,
00151   const RangeScalar& alpha,
00152   const size_t  ptr[],
00153   const Ordinal ind[],
00154   const MatrixScalar val[],
00155   const DomainScalar X[],
00156   const Ordinal colStrideX)
00157 {
00158   typedef Teuchos::ScalarTraits<RangeScalar> STS;
00159 
00160   // Prescale: Y := beta * Y.
00161   if (beta == STS::zero()) {
00162     for (Ordinal j = 0; j < numVecs; ++j) {
00163       RangeScalar* const Y_j = &Y[j*colStrideY];
00164       for (Ordinal i = 0; i < numRows; ++i) {
00165         // Follow the Sparse BLAS convention for beta == 0. 
00166         Y_j[i] = STS::zero();
00167       }
00168     }
00169   }
00170   else if (beta != STS::one()) {
00171     for (Ordinal j = 0; j < numVecs; ++j) {
00172       RangeScalar* const Y_j = &Y[j*colStrideY];
00173       for (Ordinal i = 0; i < numRows; ++i) {
00174         Y_j[i] = beta * Y_j[i];
00175       }
00176     }
00177   }
00178   // Outer for loop preface:
00179   if (alpha == STS::zero()) {
00180     return; // Our work is done!
00181   }
00182   if (alpha == STS::one()) {
00183     for (Ordinal j = 0; j < numCols; ++j) {
00184       // Extra +1 in loop bound ensures first 4 iterations get
00185       // strip-mined, but requires that Ordinal be a signed type.
00186       Ordinal c = 0;
00187       for ( ; c < numVecs - 3; c += 4) {
00188         const DomainScalar* const X_j = &X[j + c*colStrideX];
00189         const DomainScalar tmp[4] = {X_j[0], X_j[colStrideX], X_j[2*colStrideX], X_j[3*colStrideX]};
00190 
00191         for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
00192           const MatrixScalar A_ij = val[k];
00193           const Ordinal i = ind[k];
00194           RangeScalar* const Y_i = &Y[i + c*colStrideY];
00195           Y_i[0] += A_ij * tmp[0];
00196           Y_i[colStrideY] += A_ij * tmp[1];
00197           Y_i[2*colStrideY] += A_ij * tmp[2];
00198           Y_i[3*colStrideY] += A_ij * tmp[3];
00199         }
00200       }
00201       // Mop up left-over iterations over multivector columns.
00202       for ( ; c < numVecs; ++c) {
00203         const DomainScalar tmp = X[j + c*colStrideX];
00204 
00205         for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
00206           const MatrixScalar A_ij = val[k];
00207           const Ordinal i = ind[k];
00208           Y[i + c*colStrideY] += A_ij * tmp;
00209         }
00210       }
00211     }
00212   }
00213   else if (alpha == -STS::one()) {
00214     for (Ordinal j = 0; j < numCols; ++j) {
00215       // Extra +1 in loop bound ensures first 4 iterations get
00216       // strip-mined, but requires that Ordinal be a signed type.
00217       Ordinal c = 0;
00218       for ( ; c < numVecs - 3; c += 4) {
00219         const DomainScalar* const X_j = &X[j + c*colStrideX];
00220         const DomainScalar tmp[4] = {X_j[0], X_j[colStrideX], X_j[2*colStrideX], X_j[3*colStrideX]};
00221 
00222         for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
00223           const MatrixScalar A_ij = val[k];
00224           const Ordinal i = ind[k];
00225           RangeScalar* const Y_i = &Y[i + c*colStrideY];
00226           Y_i[0] -= A_ij * tmp[0];
00227           Y_i[colStrideY] -= A_ij * tmp[1];
00228           Y_i[2*colStrideY] -= A_ij * tmp[2];
00229           Y_i[3*colStrideY] -= A_ij * tmp[3];
00230         }
00231       }
00232       // Mop up left-over iterations over multivector columns.
00233       for ( ; c < numVecs; ++c) {
00234         const DomainScalar tmp = X[j + c*colStrideX];
00235 
00236         for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
00237           const MatrixScalar A_ij = val[k];
00238           const Ordinal i = ind[k];
00239           Y[i + c*colStrideY] -= A_ij * tmp;
00240         }
00241       }
00242     }
00243   }
00244   else { // alpha != 1 && alpha != -1
00245     for (Ordinal j = 0; j < numCols; ++j) {
00246       // Extra +1 in loop bound ensures first 4 iterations get
00247       // strip-mined, but requires that Ordinal be a signed type.
00248       Ordinal c = 0;
00249       for ( ; c < numVecs - 3; c += 4) {
00250         const DomainScalar* const X_j = &X[j + c*colStrideX];
00251         const DomainScalar tmp[4] = {X_j[0], X_j[colStrideX], X_j[2*colStrideX], X_j[3*colStrideX]};
00252 
00253         for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
00254           const MatrixScalar A_ij = val[k];
00255           const Ordinal i = ind[k];
00256           RangeScalar* const Y_i = &Y[i + c*colStrideY];
00257           Y_i[0] += alpha * A_ij * tmp[0];
00258           Y_i[colStrideY] += alpha * A_ij * tmp[1];
00259           Y_i[2*colStrideY] += alpha * A_ij * tmp[2];
00260           Y_i[3*colStrideY] += alpha * A_ij * tmp[3];
00261         }
00262       }
00263       // Mop up left-over iterations over multivector columns.
00264       for ( ; c < numVecs; ++c) {
00265         const DomainScalar tmp = X[j + c*colStrideX];
00266 
00267         for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
00268           const MatrixScalar A_ij = val[k];
00269           const Ordinal i = ind[k];
00270           Y[i + c*colStrideY] += alpha * A_ij * tmp;
00271         }
00272       }
00273     }
00274   }
00275 }
00276 
00277 template<class Ordinal,
00278          class MatrixScalar,
00279          class DomainScalar,
00280          class RangeScalar>
00281 void
00282 matVecCscColMajorForfor1Vec (
00283   const Ordinal numRows,
00284   const Ordinal numCols,
00285   const Ordinal numVecs,
00286   const RangeScalar& beta,
00287   RangeScalar Y[],
00288   const Ordinal colStrideY,
00289   const RangeScalar& alpha,
00290   const size_t  ptr[],
00291   const Ordinal ind[],
00292   const MatrixScalar val[],
00293   const DomainScalar X[],
00294   const Ordinal colStrideX)
00295 {
00296   typedef Teuchos::ScalarTraits<RangeScalar> STS;
00297 
00298   // Prescale: Y := beta * Y.
00299   if (beta == STS::zero()) {
00300     for (Ordinal j = 0; j < numVecs; ++j) {
00301       RangeScalar* const Y_j = &Y[j*colStrideY];
00302       for (Ordinal i = 0; i < numRows; ++i) {
00303         // Follow the Sparse BLAS convention for beta == 0. 
00304         Y_j[i] = STS::zero();
00305       }
00306     }
00307   }
00308   else if (beta != STS::one()) {
00309     for (Ordinal j = 0; j < numVecs; ++j) {
00310       RangeScalar* const Y_j = &Y[j*colStrideY];
00311       for (Ordinal i = 0; i < numRows; ++i) {
00312         Y_j[i] = beta * Y_j[i];
00313       }
00314     }
00315   }
00316   // Outer for loop preface:
00317   if (alpha == STS::zero()) {
00318     return; // Our work is done!
00319   }
00320   if (alpha == STS::one()) {
00321     for (Ordinal j = 0; j < numCols; ++j) {
00322       const DomainScalar tmp = X[j];
00323 
00324       for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
00325         const MatrixScalar A_ij = val[k];
00326         const Ordinal i = ind[k];
00327         Y[i] += A_ij * tmp;
00328       }
00329     }
00330   }
00331   else if (alpha == -STS::one()) {
00332     for (Ordinal j = 0; j < numCols; ++j) {
00333       const DomainScalar tmp = X[j];
00334 
00335       for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
00336         const MatrixScalar A_ij = val[k];
00337         const Ordinal i = ind[k];
00338         Y[i] -= A_ij * tmp;
00339       }
00340     }
00341   }
00342   else { // alpha != 1 && alpha != -1
00343     for (Ordinal j = 0; j < numCols; ++j) {
00344       const DomainScalar tmp = X[j];
00345 
00346       for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
00347         const MatrixScalar A_ij = val[k];
00348         const Ordinal i = ind[k];
00349         Y[i] += alpha * A_ij * tmp;
00350       }
00351     }
00352   }
00353 }
00354 
00355 template<class Ordinal,
00356          class MatrixScalar,
00357          class DomainScalar,
00358          class RangeScalar>
00359 void
00360 matVecCscColMajorForfor2Vec (
00361   const Ordinal numRows,
00362   const Ordinal numCols,
00363   const Ordinal numVecs,
00364   const RangeScalar& beta,
00365   RangeScalar Y[],
00366   const Ordinal colStrideY,
00367   const RangeScalar& alpha,
00368   const size_t  ptr[],
00369   const Ordinal ind[],
00370   const MatrixScalar val[],
00371   const DomainScalar X[],
00372   const Ordinal colStrideX)
00373 {
00374   typedef Teuchos::ScalarTraits<RangeScalar> STS;
00375 
00376   // Prescale: Y := beta * Y.
00377   if (beta == STS::zero()) {
00378     for (Ordinal j = 0; j < numVecs; ++j) {
00379       RangeScalar* const Y_j = &Y[j*colStrideY];
00380       for (Ordinal i = 0; i < numRows; ++i) {
00381         // Follow the Sparse BLAS convention for beta == 0. 
00382         Y_j[i] = STS::zero();
00383       }
00384     }
00385   }
00386   else if (beta != STS::one()) {
00387     for (Ordinal j = 0; j < numVecs; ++j) {
00388       RangeScalar* const Y_j = &Y[j*colStrideY];
00389       for (Ordinal i = 0; i < numRows; ++i) {
00390         Y_j[i] = beta * Y_j[i];
00391       }
00392     }
00393   }
00394   // Outer for loop preface:
00395   if (alpha == STS::zero()) {
00396     return; // Our work is done!
00397   }
00398   if (alpha == STS::one()) {
00399     for (Ordinal j = 0; j < numCols; ++j) {
00400       const DomainScalar* const X_j = &X[j];
00401       const DomainScalar tmp[2] = {X_j[0], X_j[colStrideX]};
00402 
00403       for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
00404         const MatrixScalar A_ij = val[k];
00405         const Ordinal i = ind[k];
00406         RangeScalar* const Y_i = &Y[i];
00407         Y_i[0] += A_ij * tmp[0];
00408         Y_i[colStrideY] += A_ij * tmp[1];
00409       }
00410     }
00411   }
00412   else if (alpha == -STS::one()) {
00413     for (Ordinal j = 0; j < numCols; ++j) {
00414       const DomainScalar* const X_j = &X[j];
00415       const DomainScalar tmp[2] = {X_j[0], X_j[colStrideX]};
00416 
00417       for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
00418         const MatrixScalar A_ij = val[k];
00419         const Ordinal i = ind[k];
00420         RangeScalar* const Y_i = &Y[i];
00421         Y_i[0] -= A_ij * tmp[0];
00422         Y_i[colStrideY] -= A_ij * tmp[1];
00423       }
00424     }
00425   }
00426   else { // alpha != 1 && alpha != -1
00427     for (Ordinal j = 0; j < numCols; ++j) {
00428       const DomainScalar* const X_j = &X[j];
00429       const DomainScalar tmp[2] = {X_j[0], X_j[colStrideX]};
00430 
00431       for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
00432         const MatrixScalar A_ij = val[k];
00433         const Ordinal i = ind[k];
00434         RangeScalar* const Y_i = &Y[i];
00435         Y_i[0] += alpha * A_ij * tmp[0];
00436         Y_i[colStrideY] += alpha * A_ij * tmp[1];
00437       }
00438     }
00439   }
00440 }
00441 
00442 template<class Ordinal,
00443          class MatrixScalar,
00444          class DomainScalar,
00445          class RangeScalar>
00446 void
00447 matVecCscColMajorForfor3Vec (
00448   const Ordinal numRows,
00449   const Ordinal numCols,
00450   const Ordinal numVecs,
00451   const RangeScalar& beta,
00452   RangeScalar Y[],
00453   const Ordinal colStrideY,
00454   const RangeScalar& alpha,
00455   const size_t  ptr[],
00456   const Ordinal ind[],
00457   const MatrixScalar val[],
00458   const DomainScalar X[],
00459   const Ordinal colStrideX)
00460 {
00461   typedef Teuchos::ScalarTraits<RangeScalar> STS;
00462 
00463   // Prescale: Y := beta * Y.
00464   if (beta == STS::zero()) {
00465     for (Ordinal j = 0; j < numVecs; ++j) {
00466       RangeScalar* const Y_j = &Y[j*colStrideY];
00467       for (Ordinal i = 0; i < numRows; ++i) {
00468         // Follow the Sparse BLAS convention for beta == 0. 
00469         Y_j[i] = STS::zero();
00470       }
00471     }
00472   }
00473   else if (beta != STS::one()) {
00474     for (Ordinal j = 0; j < numVecs; ++j) {
00475       RangeScalar* const Y_j = &Y[j*colStrideY];
00476       for (Ordinal i = 0; i < numRows; ++i) {
00477         Y_j[i] = beta * Y_j[i];
00478       }
00479     }
00480   }
00481   // Outer for loop preface:
00482   if (alpha == STS::zero()) {
00483     return; // Our work is done!
00484   }
00485   if (alpha == STS::one()) {
00486     for (Ordinal j = 0; j < numCols; ++j) {
00487       const DomainScalar* const X_j = &X[j];
00488       const DomainScalar tmp[3] = {X_j[0], X_j[colStrideX], X_j[2*colStrideX]};
00489 
00490       for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
00491         const MatrixScalar A_ij = val[k];
00492         const Ordinal i = ind[k];
00493         RangeScalar* const Y_i = &Y[i];
00494         Y_i[0] += A_ij * tmp[0];
00495         Y_i[colStrideY] += A_ij * tmp[1];
00496         Y_i[2*colStrideY] += A_ij * tmp[2];
00497       }
00498     }
00499   }
00500   else if (alpha == -STS::one()) {
00501     for (Ordinal j = 0; j < numCols; ++j) {
00502       const DomainScalar* const X_j = &X[j];
00503       const DomainScalar tmp[3] = {X_j[0], X_j[colStrideX], X_j[2*colStrideX]};
00504 
00505       for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
00506         const MatrixScalar A_ij = val[k];
00507         const Ordinal i = ind[k];
00508         RangeScalar* const Y_i = &Y[i];
00509         Y_i[0] -= A_ij * tmp[0];
00510         Y_i[colStrideY] -= A_ij * tmp[1];
00511         Y_i[2*colStrideY] -= A_ij * tmp[2];
00512       }
00513     }
00514   }
00515   else { // alpha != 1 && alpha != -1
00516     for (Ordinal j = 0; j < numCols; ++j) {
00517       const DomainScalar* const X_j = &X[j];
00518       const DomainScalar tmp[3] = {X_j[0], X_j[colStrideX], X_j[2*colStrideX]};
00519 
00520       for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
00521         const MatrixScalar A_ij = val[k];
00522         const Ordinal i = ind[k];
00523         RangeScalar* const Y_i = &Y[i];
00524         Y_i[0] += alpha * A_ij * tmp[0];
00525         Y_i[colStrideY] += alpha * A_ij * tmp[1];
00526         Y_i[2*colStrideY] += alpha * A_ij * tmp[2];
00527       }
00528     }
00529   }
00530 }
00531 
00532 template<class Ordinal,
00533          class MatrixScalar,
00534          class DomainScalar,
00535          class RangeScalar>
00536 void
00537 matVecCscColMajorForfor4Vec (
00538   const Ordinal numRows,
00539   const Ordinal numCols,
00540   const Ordinal numVecs,
00541   const RangeScalar& beta,
00542   RangeScalar Y[],
00543   const Ordinal colStrideY,
00544   const RangeScalar& alpha,
00545   const size_t  ptr[],
00546   const Ordinal ind[],
00547   const MatrixScalar val[],
00548   const DomainScalar X[],
00549   const Ordinal colStrideX)
00550 {
00551   typedef Teuchos::ScalarTraits<RangeScalar> STS;
00552 
00553   // Prescale: Y := beta * Y.
00554   if (beta == STS::zero()) {
00555     for (Ordinal j = 0; j < numVecs; ++j) {
00556       RangeScalar* const Y_j = &Y[j*colStrideY];
00557       for (Ordinal i = 0; i < numRows; ++i) {
00558         // Follow the Sparse BLAS convention for beta == 0. 
00559         Y_j[i] = STS::zero();
00560       }
00561     }
00562   }
00563   else if (beta != STS::one()) {
00564     for (Ordinal j = 0; j < numVecs; ++j) {
00565       RangeScalar* const Y_j = &Y[j*colStrideY];
00566       for (Ordinal i = 0; i < numRows; ++i) {
00567         Y_j[i] = beta * Y_j[i];
00568       }
00569     }
00570   }
00571   // Outer for loop preface:
00572   if (alpha == STS::zero()) {
00573     return; // Our work is done!
00574   }
00575   if (alpha == STS::one()) {
00576     for (Ordinal j = 0; j < numCols; ++j) {
00577       const DomainScalar* const X_j = &X[j];
00578       const DomainScalar tmp[4] = {X_j[0], X_j[colStrideX], X_j[2*colStrideX], X_j[3*colStrideX]};
00579 
00580       for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
00581         const MatrixScalar A_ij = val[k];
00582         const Ordinal i = ind[k];
00583         RangeScalar* const Y_i = &Y[i];
00584         Y_i[0] += A_ij * tmp[0];
00585         Y_i[colStrideY] += A_ij * tmp[1];
00586         Y_i[2*colStrideY] += A_ij * tmp[2];
00587         Y_i[3*colStrideY] += A_ij * tmp[3];
00588       }
00589     }
00590   }
00591   else if (alpha == -STS::one()) {
00592     for (Ordinal j = 0; j < numCols; ++j) {
00593       const DomainScalar* const X_j = &X[j];
00594       const DomainScalar tmp[4] = {X_j[0], X_j[colStrideX], X_j[2*colStrideX], X_j[3*colStrideX]};
00595 
00596       for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
00597         const MatrixScalar A_ij = val[k];
00598         const Ordinal i = ind[k];
00599         RangeScalar* const Y_i = &Y[i];
00600         Y_i[0] -= A_ij * tmp[0];
00601         Y_i[colStrideY] -= A_ij * tmp[1];
00602         Y_i[2*colStrideY] -= A_ij * tmp[2];
00603         Y_i[3*colStrideY] -= A_ij * tmp[3];
00604       }
00605     }
00606   }
00607   else { // alpha != 1 && alpha != -1
00608     for (Ordinal j = 0; j < numCols; ++j) {
00609       const DomainScalar* const X_j = &X[j];
00610       const DomainScalar tmp[4] = {X_j[0], X_j[colStrideX], X_j[2*colStrideX], X_j[3*colStrideX]};
00611 
00612       for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
00613         const MatrixScalar A_ij = val[k];
00614         const Ordinal i = ind[k];
00615         RangeScalar* const Y_i = &Y[i];
00616         Y_i[0] += alpha * A_ij * tmp[0];
00617         Y_i[colStrideY] += alpha * A_ij * tmp[1];
00618         Y_i[2*colStrideY] += alpha * A_ij * tmp[2];
00619         Y_i[3*colStrideY] += alpha * A_ij * tmp[3];
00620       }
00621     }
00622   }
00623 }
00624 
00625 template<class Ordinal,
00626          class MatrixScalar,
00627          class DomainScalar,
00628          class RangeScalar>
00629 void
00630 matVecCscColMajorForwhile1Vec (
00631   const Ordinal numRows,
00632   const Ordinal numCols,
00633   const Ordinal numVecs,
00634   const RangeScalar& beta,
00635   RangeScalar Y[],
00636   const Ordinal colStrideY,
00637   const RangeScalar& alpha,
00638   const size_t  ptr[],
00639   const Ordinal ind[],
00640   const MatrixScalar val[],
00641   const DomainScalar X[],
00642   const Ordinal colStrideX)
00643 {
00644   typedef Teuchos::ScalarTraits<RangeScalar> STS;
00645 
00646   // Prescale: Y := beta * Y.
00647   if (beta == STS::zero()) {
00648     for (Ordinal j = 0; j < numVecs; ++j) {
00649       RangeScalar* const Y_j = &Y[j*colStrideY];
00650       for (Ordinal i = 0; i < numRows; ++i) {
00651         // Follow the Sparse BLAS convention for beta == 0. 
00652         Y_j[i] = STS::zero();
00653       }
00654     }
00655   }
00656   else if (beta != STS::one()) {
00657     for (Ordinal j = 0; j < numVecs; ++j) {
00658       RangeScalar* const Y_j = &Y[j*colStrideY];
00659       for (Ordinal i = 0; i < numRows; ++i) {
00660         Y_j[i] = beta * Y_j[i];
00661       }
00662     }
00663   }
00664   // Outer for loop preface:
00665   if (alpha == STS::zero()) {
00666     return; // Our work is done!
00667   }
00668   const size_t nnz = ptr[numCols];
00669   if (alpha == STS::one()) {
00670     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
00671     // Initializing tmp here isn't necessary for correctness, but it
00672     // makes compilers stop complaining about uninitialized variables.
00673     DomainScalar tmp = Teuchos::ScalarTraits<DomainScalar>::zero();
00674     Ordinal j = 0;
00675     for (size_t k = 0; k < nnz; ++k) {
00676       const MatrixScalar A_ij = val[k];
00677       const Ordinal i = ind[k];
00678       while (k >= ptr[j+1]) {
00679         ++j;
00680         tmp = X[j + 0*colStrideX];
00681       }
00682       Y[i] += A_ij * tmp;
00683     }
00684   }
00685   else if (alpha == -STS::one()) {
00686     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
00687     // Initializing tmp here isn't necessary for correctness, but it
00688     // makes compilers stop complaining about uninitialized variables.
00689     DomainScalar tmp = Teuchos::ScalarTraits<DomainScalar>::zero();
00690     Ordinal j = 0;
00691     for (size_t k = 0; k < nnz; ++k) {
00692       const MatrixScalar A_ij = val[k];
00693       const Ordinal i = ind[k];
00694       while (k >= ptr[j+1]) {
00695         ++j;
00696         tmp = X[j + 0*colStrideX];
00697       }
00698       Y[i] -= A_ij * tmp;
00699     }
00700   }
00701   else { // alpha != 1 && alpha != -1
00702     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
00703     // Initializing tmp here isn't necessary for correctness, but it
00704     // makes compilers stop complaining about uninitialized variables.
00705     DomainScalar tmp = Teuchos::ScalarTraits<DomainScalar>::zero();
00706     Ordinal j = 0;
00707     for (size_t k = 0; k < nnz; ++k) {
00708       const MatrixScalar A_ij = val[k];
00709       const Ordinal i = ind[k];
00710       while (k >= ptr[j+1]) {
00711         ++j;
00712         tmp = X[j + 0*colStrideX];
00713       }
00714       Y[i] += alpha * A_ij * tmp;
00715     }
00716   }
00717 }
00718 
00719 template<class Ordinal,
00720          class MatrixScalar,
00721          class DomainScalar,
00722          class RangeScalar>
00723 void
00724 matVecCscColMajorForwhile2Vec (
00725   const Ordinal numRows,
00726   const Ordinal numCols,
00727   const Ordinal numVecs,
00728   const RangeScalar& beta,
00729   RangeScalar Y[],
00730   const Ordinal colStrideY,
00731   const RangeScalar& alpha,
00732   const size_t  ptr[],
00733   const Ordinal ind[],
00734   const MatrixScalar val[],
00735   const DomainScalar X[],
00736   const Ordinal colStrideX)
00737 {
00738   typedef Teuchos::ScalarTraits<RangeScalar> STS;
00739 
00740   // Prescale: Y := beta * Y.
00741   if (beta == STS::zero()) {
00742     for (Ordinal j = 0; j < numVecs; ++j) {
00743       RangeScalar* const Y_j = &Y[j*colStrideY];
00744       for (Ordinal i = 0; i < numRows; ++i) {
00745         // Follow the Sparse BLAS convention for beta == 0. 
00746         Y_j[i] = STS::zero();
00747       }
00748     }
00749   }
00750   else if (beta != STS::one()) {
00751     for (Ordinal j = 0; j < numVecs; ++j) {
00752       RangeScalar* const Y_j = &Y[j*colStrideY];
00753       for (Ordinal i = 0; i < numRows; ++i) {
00754         Y_j[i] = beta * Y_j[i];
00755       }
00756     }
00757   }
00758   // Outer for loop preface:
00759   if (alpha == STS::zero()) {
00760     return; // Our work is done!
00761   }
00762   const size_t nnz = ptr[numCols];
00763   if (alpha == STS::one()) {
00764     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
00765     // Initializing tmp here isn't necessary for correctness, but it
00766     // makes compilers stop complaining about uninitialized variables.
00767     DomainScalar tmp[2];
00768     tmp[0] = Teuchos::ScalarTraits<DomainScalar>::zero();
00769     tmp[1] = Teuchos::ScalarTraits<DomainScalar>::zero();
00770 
00771     Ordinal j = 0;
00772     for (size_t k = 0; k < nnz; ++k) {
00773       const MatrixScalar A_ij = val[k];
00774       const Ordinal i = ind[k];
00775       while (k >= ptr[j+1]) {
00776         ++j;
00777         const DomainScalar* const X_j = &X[j];
00778         tmp[0] = X_j[0];
00779         tmp[1] = X_j[colStrideX];
00780       }
00781       RangeScalar* const Y_i = &Y[i];
00782       Y_i[0] += A_ij * tmp[0];
00783       Y_i[colStrideY] += A_ij * tmp[1];
00784     }
00785   }
00786   else if (alpha == -STS::one()) {
00787     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
00788     // Initializing tmp here isn't necessary for correctness, but it
00789     // makes compilers stop complaining about uninitialized variables.
00790     DomainScalar tmp[2];
00791     tmp[0] = Teuchos::ScalarTraits<DomainScalar>::zero();
00792     tmp[1] = Teuchos::ScalarTraits<DomainScalar>::zero();
00793 
00794     Ordinal j = 0;
00795     for (size_t k = 0; k < nnz; ++k) {
00796       const MatrixScalar A_ij = val[k];
00797       const Ordinal i = ind[k];
00798       while (k >= ptr[j+1]) {
00799         ++j;
00800         const DomainScalar* const X_j = &X[j];
00801         tmp[0] = X_j[0];
00802         tmp[1] = X_j[colStrideX];
00803       }
00804       RangeScalar* const Y_i = &Y[i];
00805       Y_i[0] -= A_ij * tmp[0];
00806       Y_i[colStrideY] -= A_ij * tmp[1];
00807     }
00808   }
00809   else { // alpha != 1 && alpha != -1
00810     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
00811     // Initializing tmp here isn't necessary for correctness, but it
00812     // makes compilers stop complaining about uninitialized variables.
00813     DomainScalar tmp[2];
00814     tmp[0] = Teuchos::ScalarTraits<DomainScalar>::zero();
00815     tmp[1] = Teuchos::ScalarTraits<DomainScalar>::zero();
00816 
00817     Ordinal j = 0;
00818     for (size_t k = 0; k < nnz; ++k) {
00819       const MatrixScalar A_ij = val[k];
00820       const Ordinal i = ind[k];
00821       while (k >= ptr[j+1]) {
00822         ++j;
00823         const DomainScalar* const X_j = &X[j];
00824         tmp[0] = X_j[0];
00825         tmp[1] = X_j[colStrideX];
00826       }
00827       RangeScalar* const Y_i = &Y[i];
00828       Y_i[0] += alpha * A_ij * tmp[0];
00829       Y_i[colStrideY] += alpha * A_ij * tmp[1];
00830     }
00831   }
00832 }
00833 
00834 template<class Ordinal,
00835          class MatrixScalar,
00836          class DomainScalar,
00837          class RangeScalar>
00838 void
00839 matVecCscColMajorForwhile3Vec (
00840   const Ordinal numRows,
00841   const Ordinal numCols,
00842   const Ordinal numVecs,
00843   const RangeScalar& beta,
00844   RangeScalar Y[],
00845   const Ordinal colStrideY,
00846   const RangeScalar& alpha,
00847   const size_t  ptr[],
00848   const Ordinal ind[],
00849   const MatrixScalar val[],
00850   const DomainScalar X[],
00851   const Ordinal colStrideX)
00852 {
00853   typedef Teuchos::ScalarTraits<RangeScalar> STS;
00854 
00855   // Prescale: Y := beta * Y.
00856   if (beta == STS::zero()) {
00857     for (Ordinal j = 0; j < numVecs; ++j) {
00858       RangeScalar* const Y_j = &Y[j*colStrideY];
00859       for (Ordinal i = 0; i < numRows; ++i) {
00860         // Follow the Sparse BLAS convention for beta == 0. 
00861         Y_j[i] = STS::zero();
00862       }
00863     }
00864   }
00865   else if (beta != STS::one()) {
00866     for (Ordinal j = 0; j < numVecs; ++j) {
00867       RangeScalar* const Y_j = &Y[j*colStrideY];
00868       for (Ordinal i = 0; i < numRows; ++i) {
00869         Y_j[i] = beta * Y_j[i];
00870       }
00871     }
00872   }
00873   // Outer for loop preface:
00874   if (alpha == STS::zero()) {
00875     return; // Our work is done!
00876   }
00877   const size_t nnz = ptr[numCols];
00878   if (alpha == STS::one()) {
00879     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
00880     // Initializing tmp here isn't necessary for correctness, but it
00881     // makes compilers stop complaining about uninitialized variables.
00882     DomainScalar tmp[3];
00883     tmp[0] = Teuchos::ScalarTraits<DomainScalar>::zero();
00884     tmp[1] = Teuchos::ScalarTraits<DomainScalar>::zero();
00885     tmp[2] = Teuchos::ScalarTraits<DomainScalar>::zero();
00886 
00887     Ordinal j = 0;
00888     for (size_t k = 0; k < nnz; ++k) {
00889       const MatrixScalar A_ij = val[k];
00890       const Ordinal i = ind[k];
00891       while (k >= ptr[j+1]) {
00892         ++j;
00893         const DomainScalar* const X_j = &X[j];
00894         tmp[0] = X_j[0];
00895         tmp[1] = X_j[colStrideX];
00896         tmp[2] = X_j[2*colStrideX];
00897       }
00898       RangeScalar* const Y_i = &Y[i];
00899       Y_i[0] += A_ij * tmp[0];
00900       Y_i[colStrideY] += A_ij * tmp[1];
00901       Y_i[2*colStrideY] += A_ij * tmp[2];
00902     }
00903   }
00904   else if (alpha == -STS::one()) {
00905     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
00906     // Initializing tmp here isn't necessary for correctness, but it
00907     // makes compilers stop complaining about uninitialized variables.
00908     DomainScalar tmp[3];
00909     tmp[0] = Teuchos::ScalarTraits<DomainScalar>::zero();
00910     tmp[1] = Teuchos::ScalarTraits<DomainScalar>::zero();
00911     tmp[2] = Teuchos::ScalarTraits<DomainScalar>::zero();
00912 
00913     Ordinal j = 0;
00914     for (size_t k = 0; k < nnz; ++k) {
00915       const MatrixScalar A_ij = val[k];
00916       const Ordinal i = ind[k];
00917       while (k >= ptr[j+1]) {
00918         ++j;
00919         const DomainScalar* const X_j = &X[j];
00920         tmp[0] = X_j[0];
00921         tmp[1] = X_j[colStrideX];
00922         tmp[2] = X_j[2*colStrideX];
00923       }
00924       RangeScalar* const Y_i = &Y[i];
00925       Y_i[0] -= A_ij * tmp[0];
00926       Y_i[colStrideY] -= A_ij * tmp[1];
00927       Y_i[2*colStrideY] -= A_ij * tmp[2];
00928     }
00929   }
00930   else { // alpha != 1 && alpha != -1
00931     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
00932     // Initializing tmp here isn't necessary for correctness, but it
00933     // makes compilers stop complaining about uninitialized variables.
00934     DomainScalar tmp[3];
00935     tmp[0] = Teuchos::ScalarTraits<DomainScalar>::zero();
00936     tmp[1] = Teuchos::ScalarTraits<DomainScalar>::zero();
00937     tmp[2] = Teuchos::ScalarTraits<DomainScalar>::zero();
00938 
00939     Ordinal j = 0;
00940     for (size_t k = 0; k < nnz; ++k) {
00941       const MatrixScalar A_ij = val[k];
00942       const Ordinal i = ind[k];
00943       while (k >= ptr[j+1]) {
00944         ++j;
00945         const DomainScalar* const X_j = &X[j];
00946         tmp[0] = X_j[0];
00947         tmp[1] = X_j[colStrideX];
00948         tmp[2] = X_j[2*colStrideX];
00949       }
00950       RangeScalar* const Y_i = &Y[i];
00951       Y_i[0] += alpha * A_ij * tmp[0];
00952       Y_i[colStrideY] += alpha * A_ij * tmp[1];
00953       Y_i[2*colStrideY] += alpha * A_ij * tmp[2];
00954     }
00955   }
00956 }
00957 
00958 template<class Ordinal,
00959          class MatrixScalar,
00960          class DomainScalar,
00961          class RangeScalar>
00962 void
00963 matVecCscColMajorForwhile4Vec (
00964   const Ordinal numRows,
00965   const Ordinal numCols,
00966   const Ordinal numVecs,
00967   const RangeScalar& beta,
00968   RangeScalar Y[],
00969   const Ordinal colStrideY,
00970   const RangeScalar& alpha,
00971   const size_t  ptr[],
00972   const Ordinal ind[],
00973   const MatrixScalar val[],
00974   const DomainScalar X[],
00975   const Ordinal colStrideX)
00976 {
00977   typedef Teuchos::ScalarTraits<RangeScalar> STS;
00978 
00979   // Prescale: Y := beta * Y.
00980   if (beta == STS::zero()) {
00981     for (Ordinal j = 0; j < numVecs; ++j) {
00982       RangeScalar* const Y_j = &Y[j*colStrideY];
00983       for (Ordinal i = 0; i < numRows; ++i) {
00984         // Follow the Sparse BLAS convention for beta == 0. 
00985         Y_j[i] = STS::zero();
00986       }
00987     }
00988   }
00989   else if (beta != STS::one()) {
00990     for (Ordinal j = 0; j < numVecs; ++j) {
00991       RangeScalar* const Y_j = &Y[j*colStrideY];
00992       for (Ordinal i = 0; i < numRows; ++i) {
00993         Y_j[i] = beta * Y_j[i];
00994       }
00995     }
00996   }
00997   // Outer for loop preface:
00998   if (alpha == STS::zero()) {
00999     return; // Our work is done!
01000   }
01001   const size_t nnz = ptr[numCols];
01002   if (alpha == STS::one()) {
01003     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
01004     // Initializing tmp here isn't necessary for correctness, but it
01005     // makes compilers stop complaining about uninitialized variables.
01006     DomainScalar tmp[4];
01007     tmp[0] = Teuchos::ScalarTraits<DomainScalar>::zero();
01008     tmp[1] = Teuchos::ScalarTraits<DomainScalar>::zero();
01009     tmp[2] = Teuchos::ScalarTraits<DomainScalar>::zero();
01010     tmp[3] = Teuchos::ScalarTraits<DomainScalar>::zero();
01011 
01012     Ordinal j = 0;
01013     for (size_t k = 0; k < nnz; ++k) {
01014       const MatrixScalar A_ij = val[k];
01015       const Ordinal i = ind[k];
01016       while (k >= ptr[j+1]) {
01017         ++j;
01018         const DomainScalar* const X_j = &X[j];
01019         tmp[0] = X_j[0];
01020         tmp[1] = X_j[colStrideX];
01021         tmp[2] = X_j[2*colStrideX];
01022         tmp[3] = X_j[3*colStrideX];
01023       }
01024       RangeScalar* const Y_i = &Y[i];
01025       Y_i[0] += A_ij * tmp[0];
01026       Y_i[colStrideY] += A_ij * tmp[1];
01027       Y_i[2*colStrideY] += A_ij * tmp[2];
01028       Y_i[3*colStrideY] += A_ij * tmp[3];
01029     }
01030   }
01031   else if (alpha == -STS::one()) {
01032     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
01033     // Initializing tmp here isn't necessary for correctness, but it
01034     // makes compilers stop complaining about uninitialized variables.
01035     DomainScalar tmp[4];
01036     tmp[0] = Teuchos::ScalarTraits<DomainScalar>::zero();
01037     tmp[1] = Teuchos::ScalarTraits<DomainScalar>::zero();
01038     tmp[2] = Teuchos::ScalarTraits<DomainScalar>::zero();
01039     tmp[3] = Teuchos::ScalarTraits<DomainScalar>::zero();
01040 
01041     Ordinal j = 0;
01042     for (size_t k = 0; k < nnz; ++k) {
01043       const MatrixScalar A_ij = val[k];
01044       const Ordinal i = ind[k];
01045       while (k >= ptr[j+1]) {
01046         ++j;
01047         const DomainScalar* const X_j = &X[j];
01048         tmp[0] = X_j[0];
01049         tmp[1] = X_j[colStrideX];
01050         tmp[2] = X_j[2*colStrideX];
01051         tmp[3] = X_j[3*colStrideX];
01052       }
01053       RangeScalar* const Y_i = &Y[i];
01054       Y_i[0] -= A_ij * tmp[0];
01055       Y_i[colStrideY] -= A_ij * tmp[1];
01056       Y_i[2*colStrideY] -= A_ij * tmp[2];
01057       Y_i[3*colStrideY] -= A_ij * tmp[3];
01058     }
01059   }
01060   else { // alpha != 1 && alpha != -1
01061     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
01062     // Initializing tmp here isn't necessary for correctness, but it
01063     // makes compilers stop complaining about uninitialized variables.
01064     DomainScalar tmp[4];
01065     tmp[0] = Teuchos::ScalarTraits<DomainScalar>::zero();
01066     tmp[1] = Teuchos::ScalarTraits<DomainScalar>::zero();
01067     tmp[2] = Teuchos::ScalarTraits<DomainScalar>::zero();
01068     tmp[3] = Teuchos::ScalarTraits<DomainScalar>::zero();
01069 
01070     Ordinal j = 0;
01071     for (size_t k = 0; k < nnz; ++k) {
01072       const MatrixScalar A_ij = val[k];
01073       const Ordinal i = ind[k];
01074       while (k >= ptr[j+1]) {
01075         ++j;
01076         const DomainScalar* const X_j = &X[j];
01077         tmp[0] = X_j[0];
01078         tmp[1] = X_j[colStrideX];
01079         tmp[2] = X_j[2*colStrideX];
01080         tmp[3] = X_j[3*colStrideX];
01081       }
01082       RangeScalar* const Y_i = &Y[i];
01083       Y_i[0] += alpha * A_ij * tmp[0];
01084       Y_i[colStrideY] += alpha * A_ij * tmp[1];
01085       Y_i[2*colStrideY] += alpha * A_ij * tmp[2];
01086       Y_i[3*colStrideY] += alpha * A_ij * tmp[3];
01087     }
01088   }
01089 }
01090 
01091 template<class Ordinal,
01092          class MatrixScalar,
01093          class DomainScalar,
01094          class RangeScalar>
01095 void
01096 matVecCscColMajorForif1Vec (
01097   const Ordinal numRows,
01098   const Ordinal numCols,
01099   const Ordinal numVecs,
01100   const RangeScalar& beta,
01101   RangeScalar Y[],
01102   const Ordinal colStrideY,
01103   const RangeScalar& alpha,
01104   const size_t  ptr[],
01105   const Ordinal ind[],
01106   const MatrixScalar val[],
01107   const DomainScalar X[],
01108   const Ordinal colStrideX)
01109 {
01110   typedef Teuchos::ScalarTraits<RangeScalar> STS;
01111 
01112   // Prescale: Y := beta * Y.
01113   if (beta == STS::zero()) {
01114     for (Ordinal j = 0; j < numVecs; ++j) {
01115       RangeScalar* const Y_j = &Y[j*colStrideY];
01116       for (Ordinal i = 0; i < numRows; ++i) {
01117         // Follow the Sparse BLAS convention for beta == 0. 
01118         Y_j[i] = STS::zero();
01119       }
01120     }
01121   }
01122   else if (beta != STS::one()) {
01123     for (Ordinal j = 0; j < numVecs; ++j) {
01124       RangeScalar* const Y_j = &Y[j*colStrideY];
01125       for (Ordinal i = 0; i < numRows; ++i) {
01126         Y_j[i] = beta * Y_j[i];
01127       }
01128     }
01129   }
01130   // Outer for loop preface:
01131   if (alpha == STS::zero()) {
01132     return; // Our work is done!
01133   }
01134   const size_t nnz = ptr[numCols];
01135   if (alpha == STS::one()) {
01136     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
01137     // Initializing tmp here isn't necessary for correctness, but it
01138     // makes compilers stop complaining about uninitialized variables.
01139     DomainScalar tmp = Teuchos::ScalarTraits<DomainScalar>::zero();
01140     Ordinal j = 0;
01141     for (size_t k = 0; k < nnz; ++k) {
01142       const MatrixScalar A_ij = val[k];
01143       const Ordinal i = ind[k];
01144       // NOTE: "if" instead of "while" here is only valid
01145       // if the matrix contains no empty rows.
01146       if (k >= ptr[j+1]) {
01147         ++j;
01148         tmp = X[j + 0*colStrideX];
01149       }
01150       Y[i] += A_ij * tmp;
01151     }
01152   }
01153   else if (alpha == -STS::one()) {
01154     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
01155     // Initializing tmp here isn't necessary for correctness, but it
01156     // makes compilers stop complaining about uninitialized variables.
01157     DomainScalar tmp = Teuchos::ScalarTraits<DomainScalar>::zero();
01158     Ordinal j = 0;
01159     for (size_t k = 0; k < nnz; ++k) {
01160       const MatrixScalar A_ij = val[k];
01161       const Ordinal i = ind[k];
01162       // NOTE: "if" instead of "while" here is only valid
01163       // if the matrix contains no empty rows.
01164       if (k >= ptr[j+1]) {
01165         ++j;
01166         tmp = X[j + 0*colStrideX];
01167       }
01168       Y[i] -= A_ij * tmp;
01169     }
01170   }
01171   else { // alpha != 1 && alpha != -1
01172     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
01173     // Initializing tmp here isn't necessary for correctness, but it
01174     // makes compilers stop complaining about uninitialized variables.
01175     DomainScalar tmp = Teuchos::ScalarTraits<DomainScalar>::zero();
01176     Ordinal j = 0;
01177     for (size_t k = 0; k < nnz; ++k) {
01178       const MatrixScalar A_ij = val[k];
01179       const Ordinal i = ind[k];
01180       // NOTE: "if" instead of "while" here is only valid
01181       // if the matrix contains no empty rows.
01182       if (k >= ptr[j+1]) {
01183         ++j;
01184         tmp = X[j + 0*colStrideX];
01185       }
01186       Y[i] += alpha * A_ij * tmp;
01187     }
01188   }
01189 }
01190 
01191 template<class Ordinal,
01192          class MatrixScalar,
01193          class DomainScalar,
01194          class RangeScalar>
01195 void
01196 matVecCscColMajorForif2Vec (
01197   const Ordinal numRows,
01198   const Ordinal numCols,
01199   const Ordinal numVecs,
01200   const RangeScalar& beta,
01201   RangeScalar Y[],
01202   const Ordinal colStrideY,
01203   const RangeScalar& alpha,
01204   const size_t  ptr[],
01205   const Ordinal ind[],
01206   const MatrixScalar val[],
01207   const DomainScalar X[],
01208   const Ordinal colStrideX)
01209 {
01210   typedef Teuchos::ScalarTraits<RangeScalar> STS;
01211 
01212   // Prescale: Y := beta * Y.
01213   if (beta == STS::zero()) {
01214     for (Ordinal j = 0; j < numVecs; ++j) {
01215       RangeScalar* const Y_j = &Y[j*colStrideY];
01216       for (Ordinal i = 0; i < numRows; ++i) {
01217         // Follow the Sparse BLAS convention for beta == 0. 
01218         Y_j[i] = STS::zero();
01219       }
01220     }
01221   }
01222   else if (beta != STS::one()) {
01223     for (Ordinal j = 0; j < numVecs; ++j) {
01224       RangeScalar* const Y_j = &Y[j*colStrideY];
01225       for (Ordinal i = 0; i < numRows; ++i) {
01226         Y_j[i] = beta * Y_j[i];
01227       }
01228     }
01229   }
01230   // Outer for loop preface:
01231   if (alpha == STS::zero()) {
01232     return; // Our work is done!
01233   }
01234   const size_t nnz = ptr[numCols];
01235   if (alpha == STS::one()) {
01236     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
01237     // Initializing tmp here isn't necessary for correctness, but it
01238     // makes compilers stop complaining about uninitialized variables.
01239     DomainScalar tmp[2];
01240     tmp[0] = Teuchos::ScalarTraits<DomainScalar>::zero();
01241     tmp[1] = Teuchos::ScalarTraits<DomainScalar>::zero();
01242 
01243     Ordinal j = 0;
01244     for (size_t k = 0; k < nnz; ++k) {
01245       const MatrixScalar A_ij = val[k];
01246       const Ordinal i = ind[k];
01247       // NOTE: "if" instead of "while" here is only valid
01248       // if the matrix contains no empty rows.
01249       if (k >= ptr[j+1]) {
01250         ++j;
01251         const DomainScalar* const X_j = &X[j];
01252         tmp[0] = X_j[0];
01253         tmp[1] = X_j[colStrideX];
01254       }
01255       RangeScalar* const Y_i = &Y[i];
01256       Y_i[0] += A_ij * tmp[0];
01257       Y_i[colStrideY] += A_ij * tmp[1];
01258     }
01259   }
01260   else if (alpha == -STS::one()) {
01261     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
01262     // Initializing tmp here isn't necessary for correctness, but it
01263     // makes compilers stop complaining about uninitialized variables.
01264     DomainScalar tmp[2];
01265     tmp[0] = Teuchos::ScalarTraits<DomainScalar>::zero();
01266     tmp[1] = Teuchos::ScalarTraits<DomainScalar>::zero();
01267 
01268     Ordinal j = 0;
01269     for (size_t k = 0; k < nnz; ++k) {
01270       const MatrixScalar A_ij = val[k];
01271       const Ordinal i = ind[k];
01272       // NOTE: "if" instead of "while" here is only valid
01273       // if the matrix contains no empty rows.
01274       if (k >= ptr[j+1]) {
01275         ++j;
01276         const DomainScalar* const X_j = &X[j];
01277         tmp[0] = X_j[0];
01278         tmp[1] = X_j[colStrideX];
01279       }
01280       RangeScalar* const Y_i = &Y[i];
01281       Y_i[0] -= A_ij * tmp[0];
01282       Y_i[colStrideY] -= A_ij * tmp[1];
01283     }
01284   }
01285   else { // alpha != 1 && alpha != -1
01286     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
01287     // Initializing tmp here isn't necessary for correctness, but it
01288     // makes compilers stop complaining about uninitialized variables.
01289     DomainScalar tmp[2];
01290     tmp[0] = Teuchos::ScalarTraits<DomainScalar>::zero();
01291     tmp[1] = Teuchos::ScalarTraits<DomainScalar>::zero();
01292 
01293     Ordinal j = 0;
01294     for (size_t k = 0; k < nnz; ++k) {
01295       const MatrixScalar A_ij = val[k];
01296       const Ordinal i = ind[k];
01297       // NOTE: "if" instead of "while" here is only valid
01298       // if the matrix contains no empty rows.
01299       if (k >= ptr[j+1]) {
01300         ++j;
01301         const DomainScalar* const X_j = &X[j];
01302         tmp[0] = X_j[0];
01303         tmp[1] = X_j[colStrideX];
01304       }
01305       RangeScalar* const Y_i = &Y[i];
01306       Y_i[0] += alpha * A_ij * tmp[0];
01307       Y_i[colStrideY] += alpha * A_ij * tmp[1];
01308     }
01309   }
01310 }
01311 
01312 template<class Ordinal,
01313          class MatrixScalar,
01314          class DomainScalar,
01315          class RangeScalar>
01316 void
01317 matVecCscColMajorForif3Vec (
01318   const Ordinal numRows,
01319   const Ordinal numCols,
01320   const Ordinal numVecs,
01321   const RangeScalar& beta,
01322   RangeScalar Y[],
01323   const Ordinal colStrideY,
01324   const RangeScalar& alpha,
01325   const size_t  ptr[],
01326   const Ordinal ind[],
01327   const MatrixScalar val[],
01328   const DomainScalar X[],
01329   const Ordinal colStrideX)
01330 {
01331   typedef Teuchos::ScalarTraits<RangeScalar> STS;
01332 
01333   // Prescale: Y := beta * Y.
01334   if (beta == STS::zero()) {
01335     for (Ordinal j = 0; j < numVecs; ++j) {
01336       RangeScalar* const Y_j = &Y[j*colStrideY];
01337       for (Ordinal i = 0; i < numRows; ++i) {
01338         // Follow the Sparse BLAS convention for beta == 0. 
01339         Y_j[i] = STS::zero();
01340       }
01341     }
01342   }
01343   else if (beta != STS::one()) {
01344     for (Ordinal j = 0; j < numVecs; ++j) {
01345       RangeScalar* const Y_j = &Y[j*colStrideY];
01346       for (Ordinal i = 0; i < numRows; ++i) {
01347         Y_j[i] = beta * Y_j[i];
01348       }
01349     }
01350   }
01351   // Outer for loop preface:
01352   if (alpha == STS::zero()) {
01353     return; // Our work is done!
01354   }
01355   const size_t nnz = ptr[numCols];
01356   if (alpha == STS::one()) {
01357     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
01358     // Initializing tmp here isn't necessary for correctness, but it
01359     // makes compilers stop complaining about uninitialized variables.
01360     DomainScalar tmp[3];
01361     tmp[0] = Teuchos::ScalarTraits<DomainScalar>::zero();
01362     tmp[1] = Teuchos::ScalarTraits<DomainScalar>::zero();
01363     tmp[2] = Teuchos::ScalarTraits<DomainScalar>::zero();
01364 
01365     Ordinal j = 0;
01366     for (size_t k = 0; k < nnz; ++k) {
01367       const MatrixScalar A_ij = val[k];
01368       const Ordinal i = ind[k];
01369       // NOTE: "if" instead of "while" here is only valid
01370       // if the matrix contains no empty rows.
01371       if (k >= ptr[j+1]) {
01372         ++j;
01373         const DomainScalar* const X_j = &X[j];
01374         tmp[0] = X_j[0];
01375         tmp[1] = X_j[colStrideX];
01376         tmp[2] = X_j[2*colStrideX];
01377       }
01378       RangeScalar* const Y_i = &Y[i];
01379       Y_i[0] += A_ij * tmp[0];
01380       Y_i[colStrideY] += A_ij * tmp[1];
01381       Y_i[2*colStrideY] += A_ij * tmp[2];
01382     }
01383   }
01384   else if (alpha == -STS::one()) {
01385     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
01386     // Initializing tmp here isn't necessary for correctness, but it
01387     // makes compilers stop complaining about uninitialized variables.
01388     DomainScalar tmp[3];
01389     tmp[0] = Teuchos::ScalarTraits<DomainScalar>::zero();
01390     tmp[1] = Teuchos::ScalarTraits<DomainScalar>::zero();
01391     tmp[2] = Teuchos::ScalarTraits<DomainScalar>::zero();
01392 
01393     Ordinal j = 0;
01394     for (size_t k = 0; k < nnz; ++k) {
01395       const MatrixScalar A_ij = val[k];
01396       const Ordinal i = ind[k];
01397       // NOTE: "if" instead of "while" here is only valid
01398       // if the matrix contains no empty rows.
01399       if (k >= ptr[j+1]) {
01400         ++j;
01401         const DomainScalar* const X_j = &X[j];
01402         tmp[0] = X_j[0];
01403         tmp[1] = X_j[colStrideX];
01404         tmp[2] = X_j[2*colStrideX];
01405       }
01406       RangeScalar* const Y_i = &Y[i];
01407       Y_i[0] -= A_ij * tmp[0];
01408       Y_i[colStrideY] -= A_ij * tmp[1];
01409       Y_i[2*colStrideY] -= A_ij * tmp[2];
01410     }
01411   }
01412   else { // alpha != 1 && alpha != -1
01413     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
01414     // Initializing tmp here isn't necessary for correctness, but it
01415     // makes compilers stop complaining about uninitialized variables.
01416     DomainScalar tmp[3];
01417     tmp[0] = Teuchos::ScalarTraits<DomainScalar>::zero();
01418     tmp[1] = Teuchos::ScalarTraits<DomainScalar>::zero();
01419     tmp[2] = Teuchos::ScalarTraits<DomainScalar>::zero();
01420 
01421     Ordinal j = 0;
01422     for (size_t k = 0; k < nnz; ++k) {
01423       const MatrixScalar A_ij = val[k];
01424       const Ordinal i = ind[k];
01425       // NOTE: "if" instead of "while" here is only valid
01426       // if the matrix contains no empty rows.
01427       if (k >= ptr[j+1]) {
01428         ++j;
01429         const DomainScalar* const X_j = &X[j];
01430         tmp[0] = X_j[0];
01431         tmp[1] = X_j[colStrideX];
01432         tmp[2] = X_j[2*colStrideX];
01433       }
01434       RangeScalar* const Y_i = &Y[i];
01435       Y_i[0] += alpha * A_ij * tmp[0];
01436       Y_i[colStrideY] += alpha * A_ij * tmp[1];
01437       Y_i[2*colStrideY] += alpha * A_ij * tmp[2];
01438     }
01439   }
01440 }
01441 
01442 template<class Ordinal,
01443          class MatrixScalar,
01444          class DomainScalar,
01445          class RangeScalar>
01446 void
01447 matVecCscColMajorForif4Vec (
01448   const Ordinal numRows,
01449   const Ordinal numCols,
01450   const Ordinal numVecs,
01451   const RangeScalar& beta,
01452   RangeScalar Y[],
01453   const Ordinal colStrideY,
01454   const RangeScalar& alpha,
01455   const size_t  ptr[],
01456   const Ordinal ind[],
01457   const MatrixScalar val[],
01458   const DomainScalar X[],
01459   const Ordinal colStrideX)
01460 {
01461   typedef Teuchos::ScalarTraits<RangeScalar> STS;
01462 
01463   // Prescale: Y := beta * Y.
01464   if (beta == STS::zero()) {
01465     for (Ordinal j = 0; j < numVecs; ++j) {
01466       RangeScalar* const Y_j = &Y[j*colStrideY];
01467       for (Ordinal i = 0; i < numRows; ++i) {
01468         // Follow the Sparse BLAS convention for beta == 0. 
01469         Y_j[i] = STS::zero();
01470       }
01471     }
01472   }
01473   else if (beta != STS::one()) {
01474     for (Ordinal j = 0; j < numVecs; ++j) {
01475       RangeScalar* const Y_j = &Y[j*colStrideY];
01476       for (Ordinal i = 0; i < numRows; ++i) {
01477         Y_j[i] = beta * Y_j[i];
01478       }
01479     }
01480   }
01481   // Outer for loop preface:
01482   if (alpha == STS::zero()) {
01483     return; // Our work is done!
01484   }
01485   const size_t nnz = ptr[numCols];
01486   if (alpha == STS::one()) {
01487     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
01488     // Initializing tmp here isn't necessary for correctness, but it
01489     // makes compilers stop complaining about uninitialized variables.
01490     DomainScalar tmp[4];
01491     tmp[0] = Teuchos::ScalarTraits<DomainScalar>::zero();
01492     tmp[1] = Teuchos::ScalarTraits<DomainScalar>::zero();
01493     tmp[2] = Teuchos::ScalarTraits<DomainScalar>::zero();
01494     tmp[3] = Teuchos::ScalarTraits<DomainScalar>::zero();
01495 
01496     Ordinal j = 0;
01497     for (size_t k = 0; k < nnz; ++k) {
01498       const MatrixScalar A_ij = val[k];
01499       const Ordinal i = ind[k];
01500       // NOTE: "if" instead of "while" here is only valid
01501       // if the matrix contains no empty rows.
01502       if (k >= ptr[j+1]) {
01503         ++j;
01504         const DomainScalar* const X_j = &X[j];
01505         tmp[0] = X_j[0];
01506         tmp[1] = X_j[colStrideX];
01507         tmp[2] = X_j[2*colStrideX];
01508         tmp[3] = X_j[3*colStrideX];
01509       }
01510       RangeScalar* const Y_i = &Y[i];
01511       Y_i[0] += A_ij * tmp[0];
01512       Y_i[colStrideY] += A_ij * tmp[1];
01513       Y_i[2*colStrideY] += A_ij * tmp[2];
01514       Y_i[3*colStrideY] += A_ij * tmp[3];
01515     }
01516   }
01517   else if (alpha == -STS::one()) {
01518     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
01519     // Initializing tmp here isn't necessary for correctness, but it
01520     // makes compilers stop complaining about uninitialized variables.
01521     DomainScalar tmp[4];
01522     tmp[0] = Teuchos::ScalarTraits<DomainScalar>::zero();
01523     tmp[1] = Teuchos::ScalarTraits<DomainScalar>::zero();
01524     tmp[2] = Teuchos::ScalarTraits<DomainScalar>::zero();
01525     tmp[3] = Teuchos::ScalarTraits<DomainScalar>::zero();
01526 
01527     Ordinal j = 0;
01528     for (size_t k = 0; k < nnz; ++k) {
01529       const MatrixScalar A_ij = val[k];
01530       const Ordinal i = ind[k];
01531       // NOTE: "if" instead of "while" here is only valid
01532       // if the matrix contains no empty rows.
01533       if (k >= ptr[j+1]) {
01534         ++j;
01535         const DomainScalar* const X_j = &X[j];
01536         tmp[0] = X_j[0];
01537         tmp[1] = X_j[colStrideX];
01538         tmp[2] = X_j[2*colStrideX];
01539         tmp[3] = X_j[3*colStrideX];
01540       }
01541       RangeScalar* const Y_i = &Y[i];
01542       Y_i[0] -= A_ij * tmp[0];
01543       Y_i[colStrideY] -= A_ij * tmp[1];
01544       Y_i[2*colStrideY] -= A_ij * tmp[2];
01545       Y_i[3*colStrideY] -= A_ij * tmp[3];
01546     }
01547   }
01548   else { // alpha != 1 && alpha != -1
01549     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
01550     // Initializing tmp here isn't necessary for correctness, but it
01551     // makes compilers stop complaining about uninitialized variables.
01552     DomainScalar tmp[4];
01553     tmp[0] = Teuchos::ScalarTraits<DomainScalar>::zero();
01554     tmp[1] = Teuchos::ScalarTraits<DomainScalar>::zero();
01555     tmp[2] = Teuchos::ScalarTraits<DomainScalar>::zero();
01556     tmp[3] = Teuchos::ScalarTraits<DomainScalar>::zero();
01557 
01558     Ordinal j = 0;
01559     for (size_t k = 0; k < nnz; ++k) {
01560       const MatrixScalar A_ij = val[k];
01561       const Ordinal i = ind[k];
01562       // NOTE: "if" instead of "while" here is only valid
01563       // if the matrix contains no empty rows.
01564       if (k >= ptr[j+1]) {
01565         ++j;
01566         const DomainScalar* const X_j = &X[j];
01567         tmp[0] = X_j[0];
01568         tmp[1] = X_j[colStrideX];
01569         tmp[2] = X_j[2*colStrideX];
01570         tmp[3] = X_j[3*colStrideX];
01571       }
01572       RangeScalar* const Y_i = &Y[i];
01573       Y_i[0] += alpha * A_ij * tmp[0];
01574       Y_i[colStrideY] += alpha * A_ij * tmp[1];
01575       Y_i[2*colStrideY] += alpha * A_ij * tmp[2];
01576       Y_i[3*colStrideY] += alpha * A_ij * tmp[3];
01577     }
01578   }
01579 }
01580 
01581 template<class Ordinal,
01582          class MatrixScalar,
01583          class DomainScalar,
01584          class RangeScalar>
01585 void
01586 matVecCsrColMajorForfor (
01587   const Ordinal numRows,
01588   const Ordinal numCols,
01589   const Ordinal numVecs,
01590   const RangeScalar& beta,
01591   RangeScalar Y[],
01592   const Ordinal colStrideY,
01593   const RangeScalar& alpha,
01594   const size_t  ptr[],
01595   const Ordinal ind[],
01596   const MatrixScalar val[],
01597   const DomainScalar X[],
01598   const Ordinal colStrideX)
01599 {
01600   typedef Teuchos::ScalarTraits<RangeScalar> STS;
01601 
01602   // With CSR for alpha == 0, scale Y by beta and return.
01603   if (alpha == STS::zero()) {
01604     // Prescale: Y := beta * Y.
01605     if (beta == STS::zero()) {
01606       for (Ordinal j = 0; j < numVecs; ++j) {
01607         RangeScalar* const Y_j = &Y[j*colStrideY];
01608         for (Ordinal i = 0; i < numRows; ++i) {
01609           // Follow the Sparse BLAS convention for beta == 0. 
01610           Y_j[i] = STS::zero();
01611         }
01612       }
01613     }
01614     else if (beta != STS::one()) {
01615       for (Ordinal j = 0; j < numVecs; ++j) {
01616         RangeScalar* const Y_j = &Y[j*colStrideY];
01617         for (Ordinal i = 0; i < numRows; ++i) {
01618           Y_j[i] = beta * Y_j[i];
01619         }
01620       }
01621     }
01622     return; // Our work is done!
01623   }
01624   if (alpha == STS::one()) {
01625     if (beta == -STS::one()) {
01626       for (Ordinal i = 0; i < numRows; ++i) {
01627         // Initialize temporary values to -Y(i,:).
01628         for (Ordinal c = 0; c < numVecs; ++c) {
01629           RangeScalar tmp = -Y[i + c*colStrideY];
01630 
01631           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
01632             const MatrixScalar A_ij = val[k];
01633             const Ordinal j = ind[k];
01634             tmp += A_ij * X[j + c*colStrideX];
01635           }
01636           // Copy temporary values into output vector.
01637           Y[i + c*colStrideY] = tmp;
01638         }
01639       }
01640     }
01641     else if (beta == STS::zero()) {
01642       for (Ordinal i = 0; i < numRows; ++i) {
01643         // Initialize temporary values to 0.
01644         for (Ordinal c = 0; c < numVecs; ++c) {
01645           RangeScalar tmp = STS::zero();
01646 
01647           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
01648             const MatrixScalar A_ij = val[k];
01649             const Ordinal j = ind[k];
01650             tmp += A_ij * X[j + c*colStrideX];
01651           }
01652           // Copy temporary values into output vector.
01653           Y[i + c*colStrideY] = tmp;
01654         }
01655       }
01656     }
01657     else if (beta == STS::one()) {
01658       for (Ordinal i = 0; i < numRows; ++i) {
01659         // Initialize temporary values to Y(i,:).
01660         for (Ordinal c = 0; c < numVecs; ++c) {
01661           RangeScalar tmp = Y[i + c*colStrideY];
01662 
01663           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
01664             const MatrixScalar A_ij = val[k];
01665             const Ordinal j = ind[k];
01666             tmp += A_ij * X[j + c*colStrideX];
01667           }
01668           // Copy temporary values into output vector.
01669           Y[i + c*colStrideY] = tmp;
01670         }
01671       }
01672     }
01673     else { // beta != -1 && beta != 0 && beta != 1
01674       for (Ordinal i = 0; i < numRows; ++i) {
01675         // Initialize temporary values to Y(i,:) * beta.
01676         for (Ordinal c = 0; c < numVecs; ++c) {
01677           RangeScalar tmp = beta * Y[i + c*colStrideY];
01678 
01679           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
01680             const MatrixScalar A_ij = val[k];
01681             const Ordinal j = ind[k];
01682             tmp += A_ij * X[j + c*colStrideX];
01683           }
01684           // Copy temporary values into output vector.
01685           Y[i + c*colStrideY] = tmp;
01686         }
01687       }
01688     }
01689   }
01690   else if (alpha == -STS::one()) {
01691     if (beta == -STS::one()) {
01692       for (Ordinal i = 0; i < numRows; ++i) {
01693         // Initialize temporary values to -Y(i,:).
01694         for (Ordinal c = 0; c < numVecs; ++c) {
01695           RangeScalar tmp = -Y[i + c*colStrideY];
01696 
01697           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
01698             const MatrixScalar A_ij = val[k];
01699             const Ordinal j = ind[k];
01700             tmp -= A_ij * X[j + c*colStrideX];
01701           }
01702           // Copy temporary values into output vector.
01703           Y[i + c*colStrideY] = tmp;
01704         }
01705       }
01706     }
01707     else if (beta == STS::zero()) {
01708       for (Ordinal i = 0; i < numRows; ++i) {
01709         // Initialize temporary values to 0.
01710         for (Ordinal c = 0; c < numVecs; ++c) {
01711           RangeScalar tmp = STS::zero();
01712 
01713           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
01714             const MatrixScalar A_ij = val[k];
01715             const Ordinal j = ind[k];
01716             tmp -= A_ij * X[j + c*colStrideX];
01717           }
01718           // Copy temporary values into output vector.
01719           Y[i + c*colStrideY] = tmp;
01720         }
01721       }
01722     }
01723     else if (beta == STS::one()) {
01724       for (Ordinal i = 0; i < numRows; ++i) {
01725         // Initialize temporary values to Y(i,:).
01726         for (Ordinal c = 0; c < numVecs; ++c) {
01727           RangeScalar tmp = Y[i + c*colStrideY];
01728 
01729           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
01730             const MatrixScalar A_ij = val[k];
01731             const Ordinal j = ind[k];
01732             tmp -= A_ij * X[j + c*colStrideX];
01733           }
01734           // Copy temporary values into output vector.
01735           Y[i + c*colStrideY] = tmp;
01736         }
01737       }
01738     }
01739     else { // beta != -1 && beta != 0 && beta != 1
01740       for (Ordinal i = 0; i < numRows; ++i) {
01741         // Initialize temporary values to Y(i,:) * beta.
01742         for (Ordinal c = 0; c < numVecs; ++c) {
01743           RangeScalar tmp = beta * Y[i + c*colStrideY];
01744 
01745           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
01746             const MatrixScalar A_ij = val[k];
01747             const Ordinal j = ind[k];
01748             tmp -= A_ij * X[j + c*colStrideX];
01749           }
01750           // Copy temporary values into output vector.
01751           Y[i + c*colStrideY] = tmp;
01752         }
01753       }
01754     }
01755   }
01756   else { // alpha != 1 && alpha != -1
01757     if (beta == -STS::one()) {
01758       for (Ordinal i = 0; i < numRows; ++i) {
01759         // Initialize temporary values to -Y(i,:).
01760         for (Ordinal c = 0; c < numVecs; ++c) {
01761           RangeScalar tmp = -Y[i + c*colStrideY];
01762 
01763           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
01764             const MatrixScalar A_ij = val[k];
01765             const Ordinal j = ind[k];
01766             tmp += alpha * A_ij * X[j + c*colStrideX];
01767           }
01768           // Copy temporary values into output vector.
01769           Y[i + c*colStrideY] = tmp;
01770         }
01771       }
01772     }
01773     else if (beta == STS::zero()) {
01774       for (Ordinal i = 0; i < numRows; ++i) {
01775         // Initialize temporary values to 0.
01776         for (Ordinal c = 0; c < numVecs; ++c) {
01777           RangeScalar tmp = STS::zero();
01778 
01779           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
01780             const MatrixScalar A_ij = val[k];
01781             const Ordinal j = ind[k];
01782             tmp += alpha * A_ij * X[j + c*colStrideX];
01783           }
01784           // Copy temporary values into output vector.
01785           Y[i + c*colStrideY] = tmp;
01786         }
01787       }
01788     }
01789     else if (beta == STS::one()) {
01790       for (Ordinal i = 0; i < numRows; ++i) {
01791         // Initialize temporary values to Y(i,:).
01792         for (Ordinal c = 0; c < numVecs; ++c) {
01793           RangeScalar tmp = Y[i + c*colStrideY];
01794 
01795           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
01796             const MatrixScalar A_ij = val[k];
01797             const Ordinal j = ind[k];
01798             tmp += alpha * A_ij * X[j + c*colStrideX];
01799           }
01800           // Copy temporary values into output vector.
01801           Y[i + c*colStrideY] = tmp;
01802         }
01803       }
01804     }
01805     else { // beta != -1 && beta != 0 && beta != 1
01806       for (Ordinal i = 0; i < numRows; ++i) {
01807         // Initialize temporary values to Y(i,:) * beta.
01808         for (Ordinal c = 0; c < numVecs; ++c) {
01809           RangeScalar tmp = beta * Y[i + c*colStrideY];
01810 
01811           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
01812             const MatrixScalar A_ij = val[k];
01813             const Ordinal j = ind[k];
01814             tmp += alpha * A_ij * X[j + c*colStrideX];
01815           }
01816           // Copy temporary values into output vector.
01817           Y[i + c*colStrideY] = tmp;
01818         }
01819       }
01820     }
01821   }
01822 }
01823 
01824 template<class Ordinal,
01825          class MatrixScalar,
01826          class DomainScalar,
01827          class RangeScalar>
01828 void
01829 matVecCsrColMajorForforOmp (
01830   const Ordinal numRows,
01831   const Ordinal numCols,
01832   const Ordinal numVecs,
01833   const RangeScalar& beta,
01834   RangeScalar Y[],
01835   const Ordinal colStrideY,
01836   const RangeScalar& alpha,
01837   const size_t  ptr[],
01838   const Ordinal ind[],
01839   const MatrixScalar val[],
01840   const DomainScalar X[],
01841   const Ordinal colStrideX)
01842 {
01843   typedef Teuchos::ScalarTraits<RangeScalar> STS;
01844 
01845   // With CSR for alpha == 0, scale Y by beta and return.
01846   if (alpha == STS::zero()) {
01847     // Prescale: Y := beta * Y.
01848     if (beta == STS::zero()) {
01849       for (Ordinal j = 0; j < numVecs; ++j) {
01850         RangeScalar* const Y_j = &Y[j*colStrideY];
01851         #pragma omp parallel for
01852         for (Ordinal i = 0; i < numRows; ++i) {
01853           // Follow the Sparse BLAS convention for beta == 0. 
01854           Y_j[i] = STS::zero();
01855         }
01856       }
01857     }
01858     else if (beta != STS::one()) {
01859       for (Ordinal j = 0; j < numVecs; ++j) {
01860         RangeScalar* const Y_j = &Y[j*colStrideY];
01861         #pragma omp parallel for
01862         for (Ordinal i = 0; i < numRows; ++i) {
01863           Y_j[i] = beta * Y_j[i];
01864         }
01865       }
01866     }
01867     return; // Our work is done!
01868   }
01869   if (alpha == STS::one()) {
01870     if (beta == -STS::one()) {
01871       #pragma omp parallel for
01872       for (Ordinal i = 0; i < numRows; ++i) {
01873         // Initialize temporary values to -Y(i,:).
01874         for (Ordinal c = 0; c < numVecs; ++c) {
01875           RangeScalar tmp = -Y[i + c*colStrideY];
01876 
01877           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
01878             const MatrixScalar A_ij = val[k];
01879             const Ordinal j = ind[k];
01880             tmp += A_ij * X[j + c*colStrideX];
01881           }
01882           // Copy temporary values into output vector.
01883           Y[i + c*colStrideY] = tmp;
01884         }
01885       }
01886     }
01887     else if (beta == STS::zero()) {
01888       #pragma omp parallel for
01889       for (Ordinal i = 0; i < numRows; ++i) {
01890         // Initialize temporary values to 0.
01891         for (Ordinal c = 0; c < numVecs; ++c) {
01892           RangeScalar tmp = STS::zero();
01893 
01894           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
01895             const MatrixScalar A_ij = val[k];
01896             const Ordinal j = ind[k];
01897             tmp += A_ij * X[j + c*colStrideX];
01898           }
01899           // Copy temporary values into output vector.
01900           Y[i + c*colStrideY] = tmp;
01901         }
01902       }
01903     }
01904     else if (beta == STS::one()) {
01905       #pragma omp parallel for
01906       for (Ordinal i = 0; i < numRows; ++i) {
01907         // Initialize temporary values to Y(i,:).
01908         for (Ordinal c = 0; c < numVecs; ++c) {
01909           RangeScalar tmp = Y[i + c*colStrideY];
01910 
01911           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
01912             const MatrixScalar A_ij = val[k];
01913             const Ordinal j = ind[k];
01914             tmp += A_ij * X[j + c*colStrideX];
01915           }
01916           // Copy temporary values into output vector.
01917           Y[i + c*colStrideY] = tmp;
01918         }
01919       }
01920     }
01921     else { // beta != -1 && beta != 0 && beta != 1
01922       #pragma omp parallel for
01923       for (Ordinal i = 0; i < numRows; ++i) {
01924         // Initialize temporary values to Y(i,:) * beta.
01925         for (Ordinal c = 0; c < numVecs; ++c) {
01926           RangeScalar tmp = beta * Y[i + c*colStrideY];
01927 
01928           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
01929             const MatrixScalar A_ij = val[k];
01930             const Ordinal j = ind[k];
01931             tmp += A_ij * X[j + c*colStrideX];
01932           }
01933           // Copy temporary values into output vector.
01934           Y[i + c*colStrideY] = tmp;
01935         }
01936       }
01937     }
01938   }
01939   else if (alpha == -STS::one()) {
01940     if (beta == -STS::one()) {
01941       #pragma omp parallel for
01942       for (Ordinal i = 0; i < numRows; ++i) {
01943         // Initialize temporary values to -Y(i,:).
01944         for (Ordinal c = 0; c < numVecs; ++c) {
01945           RangeScalar tmp = -Y[i + c*colStrideY];
01946 
01947           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
01948             const MatrixScalar A_ij = val[k];
01949             const Ordinal j = ind[k];
01950             tmp -= A_ij * X[j + c*colStrideX];
01951           }
01952           // Copy temporary values into output vector.
01953           Y[i + c*colStrideY] = tmp;
01954         }
01955       }
01956     }
01957     else if (beta == STS::zero()) {
01958       #pragma omp parallel for
01959       for (Ordinal i = 0; i < numRows; ++i) {
01960         // Initialize temporary values to 0.
01961         for (Ordinal c = 0; c < numVecs; ++c) {
01962           RangeScalar tmp = STS::zero();
01963 
01964           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
01965             const MatrixScalar A_ij = val[k];
01966             const Ordinal j = ind[k];
01967             tmp -= A_ij * X[j + c*colStrideX];
01968           }
01969           // Copy temporary values into output vector.
01970           Y[i + c*colStrideY] = tmp;
01971         }
01972       }
01973     }
01974     else if (beta == STS::one()) {
01975       #pragma omp parallel for
01976       for (Ordinal i = 0; i < numRows; ++i) {
01977         // Initialize temporary values to Y(i,:).
01978         for (Ordinal c = 0; c < numVecs; ++c) {
01979           RangeScalar tmp = Y[i + c*colStrideY];
01980 
01981           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
01982             const MatrixScalar A_ij = val[k];
01983             const Ordinal j = ind[k];
01984             tmp -= A_ij * X[j + c*colStrideX];
01985           }
01986           // Copy temporary values into output vector.
01987           Y[i + c*colStrideY] = tmp;
01988         }
01989       }
01990     }
01991     else { // beta != -1 && beta != 0 && beta != 1
01992       #pragma omp parallel for
01993       for (Ordinal i = 0; i < numRows; ++i) {
01994         // Initialize temporary values to Y(i,:) * beta.
01995         for (Ordinal c = 0; c < numVecs; ++c) {
01996           RangeScalar tmp = beta * Y[i + c*colStrideY];
01997 
01998           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
01999             const MatrixScalar A_ij = val[k];
02000             const Ordinal j = ind[k];
02001             tmp -= A_ij * X[j + c*colStrideX];
02002           }
02003           // Copy temporary values into output vector.
02004           Y[i + c*colStrideY] = tmp;
02005         }
02006       }
02007     }
02008   }
02009   else { // alpha != 1 && alpha != -1
02010     if (beta == -STS::one()) {
02011       #pragma omp parallel for
02012       for (Ordinal i = 0; i < numRows; ++i) {
02013         // Initialize temporary values to -Y(i,:).
02014         for (Ordinal c = 0; c < numVecs; ++c) {
02015           RangeScalar tmp = -Y[i + c*colStrideY];
02016 
02017           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02018             const MatrixScalar A_ij = val[k];
02019             const Ordinal j = ind[k];
02020             tmp += alpha * A_ij * X[j + c*colStrideX];
02021           }
02022           // Copy temporary values into output vector.
02023           Y[i + c*colStrideY] = tmp;
02024         }
02025       }
02026     }
02027     else if (beta == STS::zero()) {
02028       #pragma omp parallel for
02029       for (Ordinal i = 0; i < numRows; ++i) {
02030         // Initialize temporary values to 0.
02031         for (Ordinal c = 0; c < numVecs; ++c) {
02032           RangeScalar tmp = STS::zero();
02033 
02034           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02035             const MatrixScalar A_ij = val[k];
02036             const Ordinal j = ind[k];
02037             tmp += alpha * A_ij * X[j + c*colStrideX];
02038           }
02039           // Copy temporary values into output vector.
02040           Y[i + c*colStrideY] = tmp;
02041         }
02042       }
02043     }
02044     else if (beta == STS::one()) {
02045       #pragma omp parallel for
02046       for (Ordinal i = 0; i < numRows; ++i) {
02047         // Initialize temporary values to Y(i,:).
02048         for (Ordinal c = 0; c < numVecs; ++c) {
02049           RangeScalar tmp = Y[i + c*colStrideY];
02050 
02051           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02052             const MatrixScalar A_ij = val[k];
02053             const Ordinal j = ind[k];
02054             tmp += alpha * A_ij * X[j + c*colStrideX];
02055           }
02056           // Copy temporary values into output vector.
02057           Y[i + c*colStrideY] = tmp;
02058         }
02059       }
02060     }
02061     else { // beta != -1 && beta != 0 && beta != 1
02062       #pragma omp parallel for
02063       for (Ordinal i = 0; i < numRows; ++i) {
02064         // Initialize temporary values to Y(i,:) * beta.
02065         for (Ordinal c = 0; c < numVecs; ++c) {
02066           RangeScalar tmp = beta * Y[i + c*colStrideY];
02067 
02068           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02069             const MatrixScalar A_ij = val[k];
02070             const Ordinal j = ind[k];
02071             tmp += alpha * A_ij * X[j + c*colStrideX];
02072           }
02073           // Copy temporary values into output vector.
02074           Y[i + c*colStrideY] = tmp;
02075         }
02076       }
02077     }
02078   }
02079 }
02080 
02081 template<class Ordinal,
02082          class MatrixScalar,
02083          class DomainScalar,
02084          class RangeScalar>
02085 void
02086 matVecCsrColMajorForfor4Unrolled (
02087   const Ordinal numRows,
02088   const Ordinal numCols,
02089   const Ordinal numVecs,
02090   const RangeScalar& beta,
02091   RangeScalar Y[],
02092   const Ordinal colStrideY,
02093   const RangeScalar& alpha,
02094   const size_t  ptr[],
02095   const Ordinal ind[],
02096   const MatrixScalar val[],
02097   const DomainScalar X[],
02098   const Ordinal colStrideX)
02099 {
02100   typedef Teuchos::ScalarTraits<RangeScalar> STS;
02101 
02102   // With CSR for alpha == 0, scale Y by beta and return.
02103   if (alpha == STS::zero()) {
02104     // Prescale: Y := beta * Y.
02105     if (beta == STS::zero()) {
02106       for (Ordinal j = 0; j < numVecs; ++j) {
02107         RangeScalar* const Y_j = &Y[j*colStrideY];
02108         for (Ordinal i = 0; i < numRows; ++i) {
02109           // Follow the Sparse BLAS convention for beta == 0. 
02110           Y_j[i] = STS::zero();
02111         }
02112       }
02113     }
02114     else if (beta != STS::one()) {
02115       for (Ordinal j = 0; j < numVecs; ++j) {
02116         RangeScalar* const Y_j = &Y[j*colStrideY];
02117         for (Ordinal i = 0; i < numRows; ++i) {
02118           Y_j[i] = beta * Y_j[i];
02119         }
02120       }
02121     }
02122     return; // Our work is done!
02123   }
02124   if (alpha == STS::one()) {
02125     if (beta == -STS::one()) {
02126       for (Ordinal i = 0; i < numRows; ++i) {
02127         // Initialize temporary values to -Y(i,:).
02128         // Extra +1 in loop bound ensures first 4 iterations get
02129         // strip-mined, but requires that Ordinal be a signed type.
02130         Ordinal c = 0;
02131         for ( ; c < numVecs - 3; c += 4) {
02132           RangeScalar* const Y_i = &Y[i + c*colStrideY];
02133           RangeScalar tmp[4] = {-Y_i[0], -Y_i[colStrideY], -Y_i[2*colStrideY], -Y_i[3*colStrideY]};
02134 
02135           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02136             const MatrixScalar A_ij = val[k];
02137             const Ordinal j = ind[k];
02138             const DomainScalar* const X_j = &X[j + c*colStrideX];
02139             tmp[0] += A_ij * X_j[0];
02140             tmp[1] += A_ij * X_j[colStrideX];
02141             tmp[2] += A_ij * X_j[2*colStrideX];
02142             tmp[3] += A_ij * X_j[3*colStrideX];
02143           }
02144           // Copy temporary values into output vector.
02145           Y_i[0] = tmp[0];
02146           Y_i[colStrideY] = tmp[1];
02147           Y_i[2*colStrideY] = tmp[2];
02148           Y_i[3*colStrideY] = tmp[3];
02149         }
02150         // Mop up left-over iterations over multivector columns.
02151         for ( ; c < numVecs; ++c) {
02152           RangeScalar tmp = -Y[i + c*colStrideY];
02153 
02154           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02155             const MatrixScalar A_ij = val[k];
02156             const Ordinal j = ind[k];
02157             tmp += A_ij * X[j + c*colStrideX];
02158           }
02159           Y[i + c*colStrideY] = tmp;
02160         }
02161       }
02162     }
02163     else if (beta == STS::zero()) {
02164       for (Ordinal i = 0; i < numRows; ++i) {
02165         // Initialize temporary values to 0.
02166         // Extra +1 in loop bound ensures first 4 iterations get
02167         // strip-mined, but requires that Ordinal be a signed type.
02168         Ordinal c = 0;
02169         for ( ; c < numVecs - 3; c += 4) {
02170           RangeScalar* const Y_i = &Y[i + c*colStrideY];
02171           RangeScalar tmp[4] = {STS::zero(), STS::zero(), STS::zero(), STS::zero()};
02172 
02173           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02174             const MatrixScalar A_ij = val[k];
02175             const Ordinal j = ind[k];
02176             const DomainScalar* const X_j = &X[j + c*colStrideX];
02177             tmp[0] += A_ij * X_j[0];
02178             tmp[1] += A_ij * X_j[colStrideX];
02179             tmp[2] += A_ij * X_j[2*colStrideX];
02180             tmp[3] += A_ij * X_j[3*colStrideX];
02181           }
02182           // Copy temporary values into output vector.
02183           Y_i[0] = tmp[0];
02184           Y_i[colStrideY] = tmp[1];
02185           Y_i[2*colStrideY] = tmp[2];
02186           Y_i[3*colStrideY] = tmp[3];
02187         }
02188         // Mop up left-over iterations over multivector columns.
02189         for ( ; c < numVecs; ++c) {
02190           RangeScalar tmp = STS::zero();
02191 
02192           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02193             const MatrixScalar A_ij = val[k];
02194             const Ordinal j = ind[k];
02195             tmp += A_ij * X[j + c*colStrideX];
02196           }
02197           Y[i + c*colStrideY] = tmp;
02198         }
02199       }
02200     }
02201     else if (beta == STS::one()) {
02202       for (Ordinal i = 0; i < numRows; ++i) {
02203         // Initialize temporary values to Y(i,:).
02204         // Extra +1 in loop bound ensures first 4 iterations get
02205         // strip-mined, but requires that Ordinal be a signed type.
02206         Ordinal c = 0;
02207         for ( ; c < numVecs - 3; c += 4) {
02208           RangeScalar* const Y_i = &Y[i + c*colStrideY];
02209           RangeScalar tmp[4] = {Y_i[0], Y_i[colStrideY], Y_i[2*colStrideY], Y_i[3*colStrideY]};
02210 
02211           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02212             const MatrixScalar A_ij = val[k];
02213             const Ordinal j = ind[k];
02214             const DomainScalar* const X_j = &X[j + c*colStrideX];
02215             tmp[0] += A_ij * X_j[0];
02216             tmp[1] += A_ij * X_j[colStrideX];
02217             tmp[2] += A_ij * X_j[2*colStrideX];
02218             tmp[3] += A_ij * X_j[3*colStrideX];
02219           }
02220           // Copy temporary values into output vector.
02221           Y_i[0] = tmp[0];
02222           Y_i[colStrideY] = tmp[1];
02223           Y_i[2*colStrideY] = tmp[2];
02224           Y_i[3*colStrideY] = tmp[3];
02225         }
02226         // Mop up left-over iterations over multivector columns.
02227         for ( ; c < numVecs; ++c) {
02228           RangeScalar tmp = Y[i + c*colStrideY];
02229 
02230           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02231             const MatrixScalar A_ij = val[k];
02232             const Ordinal j = ind[k];
02233             tmp += A_ij * X[j + c*colStrideX];
02234           }
02235           Y[i + c*colStrideY] = tmp;
02236         }
02237       }
02238     }
02239     else { // beta != -1 && beta != 0 && beta != 1
02240       for (Ordinal i = 0; i < numRows; ++i) {
02241         // Initialize temporary values to Y(i,:) * beta.
02242         // Extra +1 in loop bound ensures first 4 iterations get
02243         // strip-mined, but requires that Ordinal be a signed type.
02244         Ordinal c = 0;
02245         for ( ; c < numVecs - 3; c += 4) {
02246           RangeScalar* const Y_i = &Y[i + c*colStrideY];
02247           RangeScalar tmp[4] = {beta * Y_i[0], beta * Y_i[colStrideY], beta * Y_i[2*colStrideY], beta * Y_i[3*colStrideY]};
02248 
02249           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02250             const MatrixScalar A_ij = val[k];
02251             const Ordinal j = ind[k];
02252             const DomainScalar* const X_j = &X[j + c*colStrideX];
02253             tmp[0] += A_ij * X_j[0];
02254             tmp[1] += A_ij * X_j[colStrideX];
02255             tmp[2] += A_ij * X_j[2*colStrideX];
02256             tmp[3] += A_ij * X_j[3*colStrideX];
02257           }
02258           // Copy temporary values into output vector.
02259           Y_i[0] = tmp[0];
02260           Y_i[colStrideY] = tmp[1];
02261           Y_i[2*colStrideY] = tmp[2];
02262           Y_i[3*colStrideY] = tmp[3];
02263         }
02264         // Mop up left-over iterations over multivector columns.
02265         for ( ; c < numVecs; ++c) {
02266           RangeScalar tmp = beta * Y[i + c*colStrideY];
02267 
02268           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02269             const MatrixScalar A_ij = val[k];
02270             const Ordinal j = ind[k];
02271             tmp += A_ij * X[j + c*colStrideX];
02272           }
02273           Y[i + c*colStrideY] = tmp;
02274         }
02275       }
02276     }
02277   }
02278   else if (alpha == -STS::one()) {
02279     if (beta == -STS::one()) {
02280       for (Ordinal i = 0; i < numRows; ++i) {
02281         // Initialize temporary values to -Y(i,:).
02282         // Extra +1 in loop bound ensures first 4 iterations get
02283         // strip-mined, but requires that Ordinal be a signed type.
02284         Ordinal c = 0;
02285         for ( ; c < numVecs - 3; c += 4) {
02286           RangeScalar* const Y_i = &Y[i + c*colStrideY];
02287           RangeScalar tmp[4] = {-Y_i[0], -Y_i[colStrideY], -Y_i[2*colStrideY], -Y_i[3*colStrideY]};
02288 
02289           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02290             const MatrixScalar A_ij = val[k];
02291             const Ordinal j = ind[k];
02292             const DomainScalar* const X_j = &X[j + c*colStrideX];
02293             tmp[0] -= A_ij * X_j[0];
02294             tmp[1] -= A_ij * X_j[colStrideX];
02295             tmp[2] -= A_ij * X_j[2*colStrideX];
02296             tmp[3] -= A_ij * X_j[3*colStrideX];
02297           }
02298           // Copy temporary values into output vector.
02299           Y_i[0] = tmp[0];
02300           Y_i[colStrideY] = tmp[1];
02301           Y_i[2*colStrideY] = tmp[2];
02302           Y_i[3*colStrideY] = tmp[3];
02303         }
02304         // Mop up left-over iterations over multivector columns.
02305         for ( ; c < numVecs; ++c) {
02306           RangeScalar tmp = -Y[i + c*colStrideY];
02307 
02308           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02309             const MatrixScalar A_ij = val[k];
02310             const Ordinal j = ind[k];
02311             tmp -= A_ij * X[j + c*colStrideX];
02312           }
02313           Y[i + c*colStrideY] = tmp;
02314         }
02315       }
02316     }
02317     else if (beta == STS::zero()) {
02318       for (Ordinal i = 0; i < numRows; ++i) {
02319         // Initialize temporary values to 0.
02320         // Extra +1 in loop bound ensures first 4 iterations get
02321         // strip-mined, but requires that Ordinal be a signed type.
02322         Ordinal c = 0;
02323         for ( ; c < numVecs - 3; c += 4) {
02324           RangeScalar* const Y_i = &Y[i + c*colStrideY];
02325           RangeScalar tmp[4] = {STS::zero(), STS::zero(), STS::zero(), STS::zero()};
02326 
02327           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02328             const MatrixScalar A_ij = val[k];
02329             const Ordinal j = ind[k];
02330             const DomainScalar* const X_j = &X[j + c*colStrideX];
02331             tmp[0] -= A_ij * X_j[0];
02332             tmp[1] -= A_ij * X_j[colStrideX];
02333             tmp[2] -= A_ij * X_j[2*colStrideX];
02334             tmp[3] -= A_ij * X_j[3*colStrideX];
02335           }
02336           // Copy temporary values into output vector.
02337           Y_i[0] = tmp[0];
02338           Y_i[colStrideY] = tmp[1];
02339           Y_i[2*colStrideY] = tmp[2];
02340           Y_i[3*colStrideY] = tmp[3];
02341         }
02342         // Mop up left-over iterations over multivector columns.
02343         for ( ; c < numVecs; ++c) {
02344           RangeScalar tmp = STS::zero();
02345 
02346           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02347             const MatrixScalar A_ij = val[k];
02348             const Ordinal j = ind[k];
02349             tmp -= A_ij * X[j + c*colStrideX];
02350           }
02351           Y[i + c*colStrideY] = tmp;
02352         }
02353       }
02354     }
02355     else if (beta == STS::one()) {
02356       for (Ordinal i = 0; i < numRows; ++i) {
02357         // Initialize temporary values to Y(i,:).
02358         // Extra +1 in loop bound ensures first 4 iterations get
02359         // strip-mined, but requires that Ordinal be a signed type.
02360         Ordinal c = 0;
02361         for ( ; c < numVecs - 3; c += 4) {
02362           RangeScalar* const Y_i = &Y[i + c*colStrideY];
02363           RangeScalar tmp[4] = {Y_i[0], Y_i[colStrideY], Y_i[2*colStrideY], Y_i[3*colStrideY]};
02364 
02365           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02366             const MatrixScalar A_ij = val[k];
02367             const Ordinal j = ind[k];
02368             const DomainScalar* const X_j = &X[j + c*colStrideX];
02369             tmp[0] -= A_ij * X_j[0];
02370             tmp[1] -= A_ij * X_j[colStrideX];
02371             tmp[2] -= A_ij * X_j[2*colStrideX];
02372             tmp[3] -= A_ij * X_j[3*colStrideX];
02373           }
02374           // Copy temporary values into output vector.
02375           Y_i[0] = tmp[0];
02376           Y_i[colStrideY] = tmp[1];
02377           Y_i[2*colStrideY] = tmp[2];
02378           Y_i[3*colStrideY] = tmp[3];
02379         }
02380         // Mop up left-over iterations over multivector columns.
02381         for ( ; c < numVecs; ++c) {
02382           RangeScalar tmp = Y[i + c*colStrideY];
02383 
02384           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02385             const MatrixScalar A_ij = val[k];
02386             const Ordinal j = ind[k];
02387             tmp -= A_ij * X[j + c*colStrideX];
02388           }
02389           Y[i + c*colStrideY] = tmp;
02390         }
02391       }
02392     }
02393     else { // beta != -1 && beta != 0 && beta != 1
02394       for (Ordinal i = 0; i < numRows; ++i) {
02395         // Initialize temporary values to Y(i,:) * beta.
02396         // Extra +1 in loop bound ensures first 4 iterations get
02397         // strip-mined, but requires that Ordinal be a signed type.
02398         Ordinal c = 0;
02399         for ( ; c < numVecs - 3; c += 4) {
02400           RangeScalar* const Y_i = &Y[i + c*colStrideY];
02401           RangeScalar tmp[4] = {beta * Y_i[0], beta * Y_i[colStrideY], beta * Y_i[2*colStrideY], beta * Y_i[3*colStrideY]};
02402 
02403           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02404             const MatrixScalar A_ij = val[k];
02405             const Ordinal j = ind[k];
02406             const DomainScalar* const X_j = &X[j + c*colStrideX];
02407             tmp[0] -= A_ij * X_j[0];
02408             tmp[1] -= A_ij * X_j[colStrideX];
02409             tmp[2] -= A_ij * X_j[2*colStrideX];
02410             tmp[3] -= A_ij * X_j[3*colStrideX];
02411           }
02412           // Copy temporary values into output vector.
02413           Y_i[0] = tmp[0];
02414           Y_i[colStrideY] = tmp[1];
02415           Y_i[2*colStrideY] = tmp[2];
02416           Y_i[3*colStrideY] = tmp[3];
02417         }
02418         // Mop up left-over iterations over multivector columns.
02419         for ( ; c < numVecs; ++c) {
02420           RangeScalar tmp = beta * Y[i + c*colStrideY];
02421 
02422           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02423             const MatrixScalar A_ij = val[k];
02424             const Ordinal j = ind[k];
02425             tmp -= A_ij * X[j + c*colStrideX];
02426           }
02427           Y[i + c*colStrideY] = tmp;
02428         }
02429       }
02430     }
02431   }
02432   else { // alpha != 1 && alpha != -1
02433     if (beta == -STS::one()) {
02434       for (Ordinal i = 0; i < numRows; ++i) {
02435         // Initialize temporary values to -Y(i,:).
02436         // Extra +1 in loop bound ensures first 4 iterations get
02437         // strip-mined, but requires that Ordinal be a signed type.
02438         Ordinal c = 0;
02439         for ( ; c < numVecs - 3; c += 4) {
02440           RangeScalar* const Y_i = &Y[i + c*colStrideY];
02441           RangeScalar tmp[4] = {-Y_i[0], -Y_i[colStrideY], -Y_i[2*colStrideY], -Y_i[3*colStrideY]};
02442 
02443           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02444             const MatrixScalar A_ij = val[k];
02445             const Ordinal j = ind[k];
02446             const DomainScalar* const X_j = &X[j + c*colStrideX];
02447             tmp[0] += alpha * A_ij * X_j[0];
02448             tmp[1] += alpha * A_ij * X_j[colStrideX];
02449             tmp[2] += alpha * A_ij * X_j[2*colStrideX];
02450             tmp[3] += alpha * A_ij * X_j[3*colStrideX];
02451           }
02452           // Copy temporary values into output vector.
02453           Y_i[0] = tmp[0];
02454           Y_i[colStrideY] = tmp[1];
02455           Y_i[2*colStrideY] = tmp[2];
02456           Y_i[3*colStrideY] = tmp[3];
02457         }
02458         // Mop up left-over iterations over multivector columns.
02459         for ( ; c < numVecs; ++c) {
02460           RangeScalar tmp = -Y[i + c*colStrideY];
02461 
02462           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02463             const MatrixScalar A_ij = val[k];
02464             const Ordinal j = ind[k];
02465             tmp += alpha * A_ij * X[j + c*colStrideX];
02466           }
02467           Y[i + c*colStrideY] = tmp;
02468         }
02469       }
02470     }
02471     else if (beta == STS::zero()) {
02472       for (Ordinal i = 0; i < numRows; ++i) {
02473         // Initialize temporary values to 0.
02474         // Extra +1 in loop bound ensures first 4 iterations get
02475         // strip-mined, but requires that Ordinal be a signed type.
02476         Ordinal c = 0;
02477         for ( ; c < numVecs - 3; c += 4) {
02478           RangeScalar* const Y_i = &Y[i + c*colStrideY];
02479           RangeScalar tmp[4] = {STS::zero(), STS::zero(), STS::zero(), STS::zero()};
02480 
02481           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02482             const MatrixScalar A_ij = val[k];
02483             const Ordinal j = ind[k];
02484             const DomainScalar* const X_j = &X[j + c*colStrideX];
02485             tmp[0] += alpha * A_ij * X_j[0];
02486             tmp[1] += alpha * A_ij * X_j[colStrideX];
02487             tmp[2] += alpha * A_ij * X_j[2*colStrideX];
02488             tmp[3] += alpha * A_ij * X_j[3*colStrideX];
02489           }
02490           // Copy temporary values into output vector.
02491           Y_i[0] = tmp[0];
02492           Y_i[colStrideY] = tmp[1];
02493           Y_i[2*colStrideY] = tmp[2];
02494           Y_i[3*colStrideY] = tmp[3];
02495         }
02496         // Mop up left-over iterations over multivector columns.
02497         for ( ; c < numVecs; ++c) {
02498           RangeScalar tmp = STS::zero();
02499 
02500           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02501             const MatrixScalar A_ij = val[k];
02502             const Ordinal j = ind[k];
02503             tmp += alpha * A_ij * X[j + c*colStrideX];
02504           }
02505           Y[i + c*colStrideY] = tmp;
02506         }
02507       }
02508     }
02509     else if (beta == STS::one()) {
02510       for (Ordinal i = 0; i < numRows; ++i) {
02511         // Initialize temporary values to Y(i,:).
02512         // Extra +1 in loop bound ensures first 4 iterations get
02513         // strip-mined, but requires that Ordinal be a signed type.
02514         Ordinal c = 0;
02515         for ( ; c < numVecs - 3; c += 4) {
02516           RangeScalar* const Y_i = &Y[i + c*colStrideY];
02517           RangeScalar tmp[4] = {Y_i[0], Y_i[colStrideY], Y_i[2*colStrideY], Y_i[3*colStrideY]};
02518 
02519           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02520             const MatrixScalar A_ij = val[k];
02521             const Ordinal j = ind[k];
02522             const DomainScalar* const X_j = &X[j + c*colStrideX];
02523             tmp[0] += alpha * A_ij * X_j[0];
02524             tmp[1] += alpha * A_ij * X_j[colStrideX];
02525             tmp[2] += alpha * A_ij * X_j[2*colStrideX];
02526             tmp[3] += alpha * A_ij * X_j[3*colStrideX];
02527           }
02528           // Copy temporary values into output vector.
02529           Y_i[0] = tmp[0];
02530           Y_i[colStrideY] = tmp[1];
02531           Y_i[2*colStrideY] = tmp[2];
02532           Y_i[3*colStrideY] = tmp[3];
02533         }
02534         // Mop up left-over iterations over multivector columns.
02535         for ( ; c < numVecs; ++c) {
02536           RangeScalar tmp = Y[i + c*colStrideY];
02537 
02538           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02539             const MatrixScalar A_ij = val[k];
02540             const Ordinal j = ind[k];
02541             tmp += alpha * A_ij * X[j + c*colStrideX];
02542           }
02543           Y[i + c*colStrideY] = tmp;
02544         }
02545       }
02546     }
02547     else { // beta != -1 && beta != 0 && beta != 1
02548       for (Ordinal i = 0; i < numRows; ++i) {
02549         // Initialize temporary values to Y(i,:) * beta.
02550         // Extra +1 in loop bound ensures first 4 iterations get
02551         // strip-mined, but requires that Ordinal be a signed type.
02552         Ordinal c = 0;
02553         for ( ; c < numVecs - 3; c += 4) {
02554           RangeScalar* const Y_i = &Y[i + c*colStrideY];
02555           RangeScalar tmp[4] = {beta * Y_i[0], beta * Y_i[colStrideY], beta * Y_i[2*colStrideY], beta * Y_i[3*colStrideY]};
02556 
02557           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02558             const MatrixScalar A_ij = val[k];
02559             const Ordinal j = ind[k];
02560             const DomainScalar* const X_j = &X[j + c*colStrideX];
02561             tmp[0] += alpha * A_ij * X_j[0];
02562             tmp[1] += alpha * A_ij * X_j[colStrideX];
02563             tmp[2] += alpha * A_ij * X_j[2*colStrideX];
02564             tmp[3] += alpha * A_ij * X_j[3*colStrideX];
02565           }
02566           // Copy temporary values into output vector.
02567           Y_i[0] = tmp[0];
02568           Y_i[colStrideY] = tmp[1];
02569           Y_i[2*colStrideY] = tmp[2];
02570           Y_i[3*colStrideY] = tmp[3];
02571         }
02572         // Mop up left-over iterations over multivector columns.
02573         for ( ; c < numVecs; ++c) {
02574           RangeScalar tmp = beta * Y[i + c*colStrideY];
02575 
02576           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02577             const MatrixScalar A_ij = val[k];
02578             const Ordinal j = ind[k];
02579             tmp += alpha * A_ij * X[j + c*colStrideX];
02580           }
02581           Y[i + c*colStrideY] = tmp;
02582         }
02583       }
02584     }
02585   }
02586 }
02587 
02588 template<class Ordinal,
02589          class MatrixScalar,
02590          class DomainScalar,
02591          class RangeScalar>
02592 void
02593 matVecCsrColMajorForfor4UnrolledOmp (
02594   const Ordinal numRows,
02595   const Ordinal numCols,
02596   const Ordinal numVecs,
02597   const RangeScalar& beta,
02598   RangeScalar Y[],
02599   const Ordinal colStrideY,
02600   const RangeScalar& alpha,
02601   const size_t  ptr[],
02602   const Ordinal ind[],
02603   const MatrixScalar val[],
02604   const DomainScalar X[],
02605   const Ordinal colStrideX)
02606 {
02607   typedef Teuchos::ScalarTraits<RangeScalar> STS;
02608 
02609   // With CSR for alpha == 0, scale Y by beta and return.
02610   if (alpha == STS::zero()) {
02611     // Prescale: Y := beta * Y.
02612     if (beta == STS::zero()) {
02613       for (Ordinal j = 0; j < numVecs; ++j) {
02614         RangeScalar* const Y_j = &Y[j*colStrideY];
02615         #pragma omp parallel for
02616         for (Ordinal i = 0; i < numRows; ++i) {
02617           // Follow the Sparse BLAS convention for beta == 0. 
02618           Y_j[i] = STS::zero();
02619         }
02620       }
02621     }
02622     else if (beta != STS::one()) {
02623       for (Ordinal j = 0; j < numVecs; ++j) {
02624         RangeScalar* const Y_j = &Y[j*colStrideY];
02625         #pragma omp parallel for
02626         for (Ordinal i = 0; i < numRows; ++i) {
02627           Y_j[i] = beta * Y_j[i];
02628         }
02629       }
02630     }
02631     return; // Our work is done!
02632   }
02633   if (alpha == STS::one()) {
02634     if (beta == -STS::one()) {
02635       #pragma omp parallel for
02636       for (Ordinal i = 0; i < numRows; ++i) {
02637         // Initialize temporary values to -Y(i,:).
02638         // Extra +1 in loop bound ensures first 4 iterations get
02639         // strip-mined, but requires that Ordinal be a signed type.
02640         Ordinal c = 0;
02641         for ( ; c < numVecs - 3; c += 4) {
02642           RangeScalar* const Y_i = &Y[i + c*colStrideY];
02643           RangeScalar tmp[4] = {-Y_i[0], -Y_i[colStrideY], -Y_i[2*colStrideY], -Y_i[3*colStrideY]};
02644 
02645           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02646             const MatrixScalar A_ij = val[k];
02647             const Ordinal j = ind[k];
02648             const DomainScalar* const X_j = &X[j + c*colStrideX];
02649             tmp[0] += A_ij * X_j[0];
02650             tmp[1] += A_ij * X_j[colStrideX];
02651             tmp[2] += A_ij * X_j[2*colStrideX];
02652             tmp[3] += A_ij * X_j[3*colStrideX];
02653           }
02654           // Copy temporary values into output vector.
02655           Y_i[0] = tmp[0];
02656           Y_i[colStrideY] = tmp[1];
02657           Y_i[2*colStrideY] = tmp[2];
02658           Y_i[3*colStrideY] = tmp[3];
02659         }
02660         // Mop up left-over iterations over multivector columns.
02661         for ( ; c < numVecs; ++c) {
02662           RangeScalar tmp = -Y[i + c*colStrideY];
02663 
02664           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02665             const MatrixScalar A_ij = val[k];
02666             const Ordinal j = ind[k];
02667             tmp += A_ij * X[j + c*colStrideX];
02668           }
02669           Y[i + c*colStrideY] = tmp;
02670         }
02671       }
02672     }
02673     else if (beta == STS::zero()) {
02674       #pragma omp parallel for
02675       for (Ordinal i = 0; i < numRows; ++i) {
02676         // Initialize temporary values to 0.
02677         // Extra +1 in loop bound ensures first 4 iterations get
02678         // strip-mined, but requires that Ordinal be a signed type.
02679         Ordinal c = 0;
02680         for ( ; c < numVecs - 3; c += 4) {
02681           RangeScalar* const Y_i = &Y[i + c*colStrideY];
02682           RangeScalar tmp[4] = {STS::zero(), STS::zero(), STS::zero(), STS::zero()};
02683 
02684           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02685             const MatrixScalar A_ij = val[k];
02686             const Ordinal j = ind[k];
02687             const DomainScalar* const X_j = &X[j + c*colStrideX];
02688             tmp[0] += A_ij * X_j[0];
02689             tmp[1] += A_ij * X_j[colStrideX];
02690             tmp[2] += A_ij * X_j[2*colStrideX];
02691             tmp[3] += A_ij * X_j[3*colStrideX];
02692           }
02693           // Copy temporary values into output vector.
02694           Y_i[0] = tmp[0];
02695           Y_i[colStrideY] = tmp[1];
02696           Y_i[2*colStrideY] = tmp[2];
02697           Y_i[3*colStrideY] = tmp[3];
02698         }
02699         // Mop up left-over iterations over multivector columns.
02700         for ( ; c < numVecs; ++c) {
02701           RangeScalar tmp = STS::zero();
02702 
02703           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02704             const MatrixScalar A_ij = val[k];
02705             const Ordinal j = ind[k];
02706             tmp += A_ij * X[j + c*colStrideX];
02707           }
02708           Y[i + c*colStrideY] = tmp;
02709         }
02710       }
02711     }
02712     else if (beta == STS::one()) {
02713       #pragma omp parallel for
02714       for (Ordinal i = 0; i < numRows; ++i) {
02715         // Initialize temporary values to Y(i,:).
02716         // Extra +1 in loop bound ensures first 4 iterations get
02717         // strip-mined, but requires that Ordinal be a signed type.
02718         Ordinal c = 0;
02719         for ( ; c < numVecs - 3; c += 4) {
02720           RangeScalar* const Y_i = &Y[i + c*colStrideY];
02721           RangeScalar tmp[4] = {Y_i[0], Y_i[colStrideY], Y_i[2*colStrideY], Y_i[3*colStrideY]};
02722 
02723           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02724             const MatrixScalar A_ij = val[k];
02725             const Ordinal j = ind[k];
02726             const DomainScalar* const X_j = &X[j + c*colStrideX];
02727             tmp[0] += A_ij * X_j[0];
02728             tmp[1] += A_ij * X_j[colStrideX];
02729             tmp[2] += A_ij * X_j[2*colStrideX];
02730             tmp[3] += A_ij * X_j[3*colStrideX];
02731           }
02732           // Copy temporary values into output vector.
02733           Y_i[0] = tmp[0];
02734           Y_i[colStrideY] = tmp[1];
02735           Y_i[2*colStrideY] = tmp[2];
02736           Y_i[3*colStrideY] = tmp[3];
02737         }
02738         // Mop up left-over iterations over multivector columns.
02739         for ( ; c < numVecs; ++c) {
02740           RangeScalar tmp = Y[i + c*colStrideY];
02741 
02742           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02743             const MatrixScalar A_ij = val[k];
02744             const Ordinal j = ind[k];
02745             tmp += A_ij * X[j + c*colStrideX];
02746           }
02747           Y[i + c*colStrideY] = tmp;
02748         }
02749       }
02750     }
02751     else { // beta != -1 && beta != 0 && beta != 1
02752       #pragma omp parallel for
02753       for (Ordinal i = 0; i < numRows; ++i) {
02754         // Initialize temporary values to Y(i,:) * beta.
02755         // Extra +1 in loop bound ensures first 4 iterations get
02756         // strip-mined, but requires that Ordinal be a signed type.
02757         Ordinal c = 0;
02758         for ( ; c < numVecs - 3; c += 4) {
02759           RangeScalar* const Y_i = &Y[i + c*colStrideY];
02760           RangeScalar tmp[4] = {beta * Y_i[0], beta * Y_i[colStrideY], beta * Y_i[2*colStrideY], beta * Y_i[3*colStrideY]};
02761 
02762           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02763             const MatrixScalar A_ij = val[k];
02764             const Ordinal j = ind[k];
02765             const DomainScalar* const X_j = &X[j + c*colStrideX];
02766             tmp[0] += A_ij * X_j[0];
02767             tmp[1] += A_ij * X_j[colStrideX];
02768             tmp[2] += A_ij * X_j[2*colStrideX];
02769             tmp[3] += A_ij * X_j[3*colStrideX];
02770           }
02771           // Copy temporary values into output vector.
02772           Y_i[0] = tmp[0];
02773           Y_i[colStrideY] = tmp[1];
02774           Y_i[2*colStrideY] = tmp[2];
02775           Y_i[3*colStrideY] = tmp[3];
02776         }
02777         // Mop up left-over iterations over multivector columns.
02778         for ( ; c < numVecs; ++c) {
02779           RangeScalar tmp = beta * Y[i + c*colStrideY];
02780 
02781           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02782             const MatrixScalar A_ij = val[k];
02783             const Ordinal j = ind[k];
02784             tmp += A_ij * X[j + c*colStrideX];
02785           }
02786           Y[i + c*colStrideY] = tmp;
02787         }
02788       }
02789     }
02790   }
02791   else if (alpha == -STS::one()) {
02792     if (beta == -STS::one()) {
02793       #pragma omp parallel for
02794       for (Ordinal i = 0; i < numRows; ++i) {
02795         // Initialize temporary values to -Y(i,:).
02796         // Extra +1 in loop bound ensures first 4 iterations get
02797         // strip-mined, but requires that Ordinal be a signed type.
02798         Ordinal c = 0;
02799         for ( ; c < numVecs - 3; c += 4) {
02800           RangeScalar* const Y_i = &Y[i + c*colStrideY];
02801           RangeScalar tmp[4] = {-Y_i[0], -Y_i[colStrideY], -Y_i[2*colStrideY], -Y_i[3*colStrideY]};
02802 
02803           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02804             const MatrixScalar A_ij = val[k];
02805             const Ordinal j = ind[k];
02806             const DomainScalar* const X_j = &X[j + c*colStrideX];
02807             tmp[0] -= A_ij * X_j[0];
02808             tmp[1] -= A_ij * X_j[colStrideX];
02809             tmp[2] -= A_ij * X_j[2*colStrideX];
02810             tmp[3] -= A_ij * X_j[3*colStrideX];
02811           }
02812           // Copy temporary values into output vector.
02813           Y_i[0] = tmp[0];
02814           Y_i[colStrideY] = tmp[1];
02815           Y_i[2*colStrideY] = tmp[2];
02816           Y_i[3*colStrideY] = tmp[3];
02817         }
02818         // Mop up left-over iterations over multivector columns.
02819         for ( ; c < numVecs; ++c) {
02820           RangeScalar tmp = -Y[i + c*colStrideY];
02821 
02822           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02823             const MatrixScalar A_ij = val[k];
02824             const Ordinal j = ind[k];
02825             tmp -= A_ij * X[j + c*colStrideX];
02826           }
02827           Y[i + c*colStrideY] = tmp;
02828         }
02829       }
02830     }
02831     else if (beta == STS::zero()) {
02832       #pragma omp parallel for
02833       for (Ordinal i = 0; i < numRows; ++i) {
02834         // Initialize temporary values to 0.
02835         // Extra +1 in loop bound ensures first 4 iterations get
02836         // strip-mined, but requires that Ordinal be a signed type.
02837         Ordinal c = 0;
02838         for ( ; c < numVecs - 3; c += 4) {
02839           RangeScalar* const Y_i = &Y[i + c*colStrideY];
02840           RangeScalar tmp[4] = {STS::zero(), STS::zero(), STS::zero(), STS::zero()};
02841 
02842           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02843             const MatrixScalar A_ij = val[k];
02844             const Ordinal j = ind[k];
02845             const DomainScalar* const X_j = &X[j + c*colStrideX];
02846             tmp[0] -= A_ij * X_j[0];
02847             tmp[1] -= A_ij * X_j[colStrideX];
02848             tmp[2] -= A_ij * X_j[2*colStrideX];
02849             tmp[3] -= A_ij * X_j[3*colStrideX];
02850           }
02851           // Copy temporary values into output vector.
02852           Y_i[0] = tmp[0];
02853           Y_i[colStrideY] = tmp[1];
02854           Y_i[2*colStrideY] = tmp[2];
02855           Y_i[3*colStrideY] = tmp[3];
02856         }
02857         // Mop up left-over iterations over multivector columns.
02858         for ( ; c < numVecs; ++c) {
02859           RangeScalar tmp = STS::zero();
02860 
02861           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02862             const MatrixScalar A_ij = val[k];
02863             const Ordinal j = ind[k];
02864             tmp -= A_ij * X[j + c*colStrideX];
02865           }
02866           Y[i + c*colStrideY] = tmp;
02867         }
02868       }
02869     }
02870     else if (beta == STS::one()) {
02871       #pragma omp parallel for
02872       for (Ordinal i = 0; i < numRows; ++i) {
02873         // Initialize temporary values to Y(i,:).
02874         // Extra +1 in loop bound ensures first 4 iterations get
02875         // strip-mined, but requires that Ordinal be a signed type.
02876         Ordinal c = 0;
02877         for ( ; c < numVecs - 3; c += 4) {
02878           RangeScalar* const Y_i = &Y[i + c*colStrideY];
02879           RangeScalar tmp[4] = {Y_i[0], Y_i[colStrideY], Y_i[2*colStrideY], Y_i[3*colStrideY]};
02880 
02881           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02882             const MatrixScalar A_ij = val[k];
02883             const Ordinal j = ind[k];
02884             const DomainScalar* const X_j = &X[j + c*colStrideX];
02885             tmp[0] -= A_ij * X_j[0];
02886             tmp[1] -= A_ij * X_j[colStrideX];
02887             tmp[2] -= A_ij * X_j[2*colStrideX];
02888             tmp[3] -= A_ij * X_j[3*colStrideX];
02889           }
02890           // Copy temporary values into output vector.
02891           Y_i[0] = tmp[0];
02892           Y_i[colStrideY] = tmp[1];
02893           Y_i[2*colStrideY] = tmp[2];
02894           Y_i[3*colStrideY] = tmp[3];
02895         }
02896         // Mop up left-over iterations over multivector columns.
02897         for ( ; c < numVecs; ++c) {
02898           RangeScalar tmp = Y[i + c*colStrideY];
02899 
02900           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02901             const MatrixScalar A_ij = val[k];
02902             const Ordinal j = ind[k];
02903             tmp -= A_ij * X[j + c*colStrideX];
02904           }
02905           Y[i + c*colStrideY] = tmp;
02906         }
02907       }
02908     }
02909     else { // beta != -1 && beta != 0 && beta != 1
02910       #pragma omp parallel for
02911       for (Ordinal i = 0; i < numRows; ++i) {
02912         // Initialize temporary values to Y(i,:) * beta.
02913         // Extra +1 in loop bound ensures first 4 iterations get
02914         // strip-mined, but requires that Ordinal be a signed type.
02915         Ordinal c = 0;
02916         for ( ; c < numVecs - 3; c += 4) {
02917           RangeScalar* const Y_i = &Y[i + c*colStrideY];
02918           RangeScalar tmp[4] = {beta * Y_i[0], beta * Y_i[colStrideY], beta * Y_i[2*colStrideY], beta * Y_i[3*colStrideY]};
02919 
02920           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02921             const MatrixScalar A_ij = val[k];
02922             const Ordinal j = ind[k];
02923             const DomainScalar* const X_j = &X[j + c*colStrideX];
02924             tmp[0] -= A_ij * X_j[0];
02925             tmp[1] -= A_ij * X_j[colStrideX];
02926             tmp[2] -= A_ij * X_j[2*colStrideX];
02927             tmp[3] -= A_ij * X_j[3*colStrideX];
02928           }
02929           // Copy temporary values into output vector.
02930           Y_i[0] = tmp[0];
02931           Y_i[colStrideY] = tmp[1];
02932           Y_i[2*colStrideY] = tmp[2];
02933           Y_i[3*colStrideY] = tmp[3];
02934         }
02935         // Mop up left-over iterations over multivector columns.
02936         for ( ; c < numVecs; ++c) {
02937           RangeScalar tmp = beta * Y[i + c*colStrideY];
02938 
02939           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02940             const MatrixScalar A_ij = val[k];
02941             const Ordinal j = ind[k];
02942             tmp -= A_ij * X[j + c*colStrideX];
02943           }
02944           Y[i + c*colStrideY] = tmp;
02945         }
02946       }
02947     }
02948   }
02949   else { // alpha != 1 && alpha != -1
02950     if (beta == -STS::one()) {
02951       #pragma omp parallel for
02952       for (Ordinal i = 0; i < numRows; ++i) {
02953         // Initialize temporary values to -Y(i,:).
02954         // Extra +1 in loop bound ensures first 4 iterations get
02955         // strip-mined, but requires that Ordinal be a signed type.
02956         Ordinal c = 0;
02957         for ( ; c < numVecs - 3; c += 4) {
02958           RangeScalar* const Y_i = &Y[i + c*colStrideY];
02959           RangeScalar tmp[4] = {-Y_i[0], -Y_i[colStrideY], -Y_i[2*colStrideY], -Y_i[3*colStrideY]};
02960 
02961           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02962             const MatrixScalar A_ij = val[k];
02963             const Ordinal j = ind[k];
02964             const DomainScalar* const X_j = &X[j + c*colStrideX];
02965             tmp[0] += alpha * A_ij * X_j[0];
02966             tmp[1] += alpha * A_ij * X_j[colStrideX];
02967             tmp[2] += alpha * A_ij * X_j[2*colStrideX];
02968             tmp[3] += alpha * A_ij * X_j[3*colStrideX];
02969           }
02970           // Copy temporary values into output vector.
02971           Y_i[0] = tmp[0];
02972           Y_i[colStrideY] = tmp[1];
02973           Y_i[2*colStrideY] = tmp[2];
02974           Y_i[3*colStrideY] = tmp[3];
02975         }
02976         // Mop up left-over iterations over multivector columns.
02977         for ( ; c < numVecs; ++c) {
02978           RangeScalar tmp = -Y[i + c*colStrideY];
02979 
02980           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
02981             const MatrixScalar A_ij = val[k];
02982             const Ordinal j = ind[k];
02983             tmp += alpha * A_ij * X[j + c*colStrideX];
02984           }
02985           Y[i + c*colStrideY] = tmp;
02986         }
02987       }
02988     }
02989     else if (beta == STS::zero()) {
02990       #pragma omp parallel for
02991       for (Ordinal i = 0; i < numRows; ++i) {
02992         // Initialize temporary values to 0.
02993         // Extra +1 in loop bound ensures first 4 iterations get
02994         // strip-mined, but requires that Ordinal be a signed type.
02995         Ordinal c = 0;
02996         for ( ; c < numVecs - 3; c += 4) {
02997           RangeScalar* const Y_i = &Y[i + c*colStrideY];
02998           RangeScalar tmp[4] = {STS::zero(), STS::zero(), STS::zero(), STS::zero()};
02999 
03000           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03001             const MatrixScalar A_ij = val[k];
03002             const Ordinal j = ind[k];
03003             const DomainScalar* const X_j = &X[j + c*colStrideX];
03004             tmp[0] += alpha * A_ij * X_j[0];
03005             tmp[1] += alpha * A_ij * X_j[colStrideX];
03006             tmp[2] += alpha * A_ij * X_j[2*colStrideX];
03007             tmp[3] += alpha * A_ij * X_j[3*colStrideX];
03008           }
03009           // Copy temporary values into output vector.
03010           Y_i[0] = tmp[0];
03011           Y_i[colStrideY] = tmp[1];
03012           Y_i[2*colStrideY] = tmp[2];
03013           Y_i[3*colStrideY] = tmp[3];
03014         }
03015         // Mop up left-over iterations over multivector columns.
03016         for ( ; c < numVecs; ++c) {
03017           RangeScalar tmp = STS::zero();
03018 
03019           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03020             const MatrixScalar A_ij = val[k];
03021             const Ordinal j = ind[k];
03022             tmp += alpha * A_ij * X[j + c*colStrideX];
03023           }
03024           Y[i + c*colStrideY] = tmp;
03025         }
03026       }
03027     }
03028     else if (beta == STS::one()) {
03029       #pragma omp parallel for
03030       for (Ordinal i = 0; i < numRows; ++i) {
03031         // Initialize temporary values to Y(i,:).
03032         // Extra +1 in loop bound ensures first 4 iterations get
03033         // strip-mined, but requires that Ordinal be a signed type.
03034         Ordinal c = 0;
03035         for ( ; c < numVecs - 3; c += 4) {
03036           RangeScalar* const Y_i = &Y[i + c*colStrideY];
03037           RangeScalar tmp[4] = {Y_i[0], Y_i[colStrideY], Y_i[2*colStrideY], Y_i[3*colStrideY]};
03038 
03039           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03040             const MatrixScalar A_ij = val[k];
03041             const Ordinal j = ind[k];
03042             const DomainScalar* const X_j = &X[j + c*colStrideX];
03043             tmp[0] += alpha * A_ij * X_j[0];
03044             tmp[1] += alpha * A_ij * X_j[colStrideX];
03045             tmp[2] += alpha * A_ij * X_j[2*colStrideX];
03046             tmp[3] += alpha * A_ij * X_j[3*colStrideX];
03047           }
03048           // Copy temporary values into output vector.
03049           Y_i[0] = tmp[0];
03050           Y_i[colStrideY] = tmp[1];
03051           Y_i[2*colStrideY] = tmp[2];
03052           Y_i[3*colStrideY] = tmp[3];
03053         }
03054         // Mop up left-over iterations over multivector columns.
03055         for ( ; c < numVecs; ++c) {
03056           RangeScalar tmp = Y[i + c*colStrideY];
03057 
03058           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03059             const MatrixScalar A_ij = val[k];
03060             const Ordinal j = ind[k];
03061             tmp += alpha * A_ij * X[j + c*colStrideX];
03062           }
03063           Y[i + c*colStrideY] = tmp;
03064         }
03065       }
03066     }
03067     else { // beta != -1 && beta != 0 && beta != 1
03068       #pragma omp parallel for
03069       for (Ordinal i = 0; i < numRows; ++i) {
03070         // Initialize temporary values to Y(i,:) * beta.
03071         // Extra +1 in loop bound ensures first 4 iterations get
03072         // strip-mined, but requires that Ordinal be a signed type.
03073         Ordinal c = 0;
03074         for ( ; c < numVecs - 3; c += 4) {
03075           RangeScalar* const Y_i = &Y[i + c*colStrideY];
03076           RangeScalar tmp[4] = {beta * Y_i[0], beta * Y_i[colStrideY], beta * Y_i[2*colStrideY], beta * Y_i[3*colStrideY]};
03077 
03078           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03079             const MatrixScalar A_ij = val[k];
03080             const Ordinal j = ind[k];
03081             const DomainScalar* const X_j = &X[j + c*colStrideX];
03082             tmp[0] += alpha * A_ij * X_j[0];
03083             tmp[1] += alpha * A_ij * X_j[colStrideX];
03084             tmp[2] += alpha * A_ij * X_j[2*colStrideX];
03085             tmp[3] += alpha * A_ij * X_j[3*colStrideX];
03086           }
03087           // Copy temporary values into output vector.
03088           Y_i[0] = tmp[0];
03089           Y_i[colStrideY] = tmp[1];
03090           Y_i[2*colStrideY] = tmp[2];
03091           Y_i[3*colStrideY] = tmp[3];
03092         }
03093         // Mop up left-over iterations over multivector columns.
03094         for ( ; c < numVecs; ++c) {
03095           RangeScalar tmp = beta * Y[i + c*colStrideY];
03096 
03097           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03098             const MatrixScalar A_ij = val[k];
03099             const Ordinal j = ind[k];
03100             tmp += alpha * A_ij * X[j + c*colStrideX];
03101           }
03102           Y[i + c*colStrideY] = tmp;
03103         }
03104       }
03105     }
03106   }
03107 }
03108 
03109 template<class Ordinal,
03110          class MatrixScalar,
03111          class DomainScalar,
03112          class RangeScalar>
03113 void
03114 matVecCsrColMajorForfor1Vec (
03115   const Ordinal numRows,
03116   const Ordinal numCols,
03117   const Ordinal numVecs,
03118   const RangeScalar& beta,
03119   RangeScalar Y[],
03120   const Ordinal colStrideY,
03121   const RangeScalar& alpha,
03122   const size_t  ptr[],
03123   const Ordinal ind[],
03124   const MatrixScalar val[],
03125   const DomainScalar X[],
03126   const Ordinal colStrideX)
03127 {
03128   typedef Teuchos::ScalarTraits<RangeScalar> STS;
03129 
03130   // With CSR for alpha == 0, scale Y by beta and return.
03131   if (alpha == STS::zero()) {
03132     // Prescale: Y := beta * Y.
03133     if (beta == STS::zero()) {
03134       for (Ordinal j = 0; j < numVecs; ++j) {
03135         RangeScalar* const Y_j = &Y[j*colStrideY];
03136         for (Ordinal i = 0; i < numRows; ++i) {
03137           // Follow the Sparse BLAS convention for beta == 0. 
03138           Y_j[i] = STS::zero();
03139         }
03140       }
03141     }
03142     else if (beta != STS::one()) {
03143       for (Ordinal j = 0; j < numVecs; ++j) {
03144         RangeScalar* const Y_j = &Y[j*colStrideY];
03145         for (Ordinal i = 0; i < numRows; ++i) {
03146           Y_j[i] = beta * Y_j[i];
03147         }
03148       }
03149     }
03150     return; // Our work is done!
03151   }
03152   if (alpha == STS::one()) {
03153     if (beta == -STS::one()) {
03154       for (Ordinal i = 0; i < numRows; ++i) {
03155         // Initialize temporary values to -Y(i,:).
03156         RangeScalar tmp = -Y[i];
03157 
03158         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03159           const MatrixScalar A_ij = val[k];
03160           const Ordinal j = ind[k];
03161           tmp += A_ij * X[j];
03162         }
03163         // Copy temporary values into output vector.
03164         Y[i] = tmp;
03165       }
03166     }
03167     else if (beta == STS::zero()) {
03168       for (Ordinal i = 0; i < numRows; ++i) {
03169         // Initialize temporary values to 0.
03170         RangeScalar tmp = STS::zero();
03171 
03172         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03173           const MatrixScalar A_ij = val[k];
03174           const Ordinal j = ind[k];
03175           tmp += A_ij * X[j];
03176         }
03177         // Copy temporary values into output vector.
03178         Y[i] = tmp;
03179       }
03180     }
03181     else if (beta == STS::one()) {
03182       for (Ordinal i = 0; i < numRows; ++i) {
03183         // Initialize temporary values to Y(i,:).
03184         RangeScalar tmp = Y[i];
03185 
03186         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03187           const MatrixScalar A_ij = val[k];
03188           const Ordinal j = ind[k];
03189           tmp += A_ij * X[j];
03190         }
03191         // Copy temporary values into output vector.
03192         Y[i] = tmp;
03193       }
03194     }
03195     else { // beta != -1 && beta != 0 && beta != 1
03196       for (Ordinal i = 0; i < numRows; ++i) {
03197         // Initialize temporary values to Y(i,:) * beta.
03198         RangeScalar tmp = beta * Y[i];
03199 
03200         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03201           const MatrixScalar A_ij = val[k];
03202           const Ordinal j = ind[k];
03203           tmp += A_ij * X[j];
03204         }
03205         // Copy temporary values into output vector.
03206         Y[i] = tmp;
03207       }
03208     }
03209   }
03210   else if (alpha == -STS::one()) {
03211     if (beta == -STS::one()) {
03212       for (Ordinal i = 0; i < numRows; ++i) {
03213         // Initialize temporary values to -Y(i,:).
03214         RangeScalar tmp = -Y[i];
03215 
03216         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03217           const MatrixScalar A_ij = val[k];
03218           const Ordinal j = ind[k];
03219           tmp -= A_ij * X[j];
03220         }
03221         // Copy temporary values into output vector.
03222         Y[i] = tmp;
03223       }
03224     }
03225     else if (beta == STS::zero()) {
03226       for (Ordinal i = 0; i < numRows; ++i) {
03227         // Initialize temporary values to 0.
03228         RangeScalar tmp = STS::zero();
03229 
03230         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03231           const MatrixScalar A_ij = val[k];
03232           const Ordinal j = ind[k];
03233           tmp -= A_ij * X[j];
03234         }
03235         // Copy temporary values into output vector.
03236         Y[i] = tmp;
03237       }
03238     }
03239     else if (beta == STS::one()) {
03240       for (Ordinal i = 0; i < numRows; ++i) {
03241         // Initialize temporary values to Y(i,:).
03242         RangeScalar tmp = Y[i];
03243 
03244         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03245           const MatrixScalar A_ij = val[k];
03246           const Ordinal j = ind[k];
03247           tmp -= A_ij * X[j];
03248         }
03249         // Copy temporary values into output vector.
03250         Y[i] = tmp;
03251       }
03252     }
03253     else { // beta != -1 && beta != 0 && beta != 1
03254       for (Ordinal i = 0; i < numRows; ++i) {
03255         // Initialize temporary values to Y(i,:) * beta.
03256         RangeScalar tmp = beta * Y[i];
03257 
03258         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03259           const MatrixScalar A_ij = val[k];
03260           const Ordinal j = ind[k];
03261           tmp -= A_ij * X[j];
03262         }
03263         // Copy temporary values into output vector.
03264         Y[i] = tmp;
03265       }
03266     }
03267   }
03268   else { // alpha != 1 && alpha != -1
03269     if (beta == -STS::one()) {
03270       for (Ordinal i = 0; i < numRows; ++i) {
03271         // Initialize temporary values to -Y(i,:).
03272         RangeScalar tmp = -Y[i];
03273 
03274         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03275           const MatrixScalar A_ij = val[k];
03276           const Ordinal j = ind[k];
03277           tmp += alpha * A_ij * X[j];
03278         }
03279         // Copy temporary values into output vector.
03280         Y[i] = tmp;
03281       }
03282     }
03283     else if (beta == STS::zero()) {
03284       for (Ordinal i = 0; i < numRows; ++i) {
03285         // Initialize temporary values to 0.
03286         RangeScalar tmp = STS::zero();
03287 
03288         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03289           const MatrixScalar A_ij = val[k];
03290           const Ordinal j = ind[k];
03291           tmp += alpha * A_ij * X[j];
03292         }
03293         // Copy temporary values into output vector.
03294         Y[i] = tmp;
03295       }
03296     }
03297     else if (beta == STS::one()) {
03298       for (Ordinal i = 0; i < numRows; ++i) {
03299         // Initialize temporary values to Y(i,:).
03300         RangeScalar tmp = Y[i];
03301 
03302         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03303           const MatrixScalar A_ij = val[k];
03304           const Ordinal j = ind[k];
03305           tmp += alpha * A_ij * X[j];
03306         }
03307         // Copy temporary values into output vector.
03308         Y[i] = tmp;
03309       }
03310     }
03311     else { // beta != -1 && beta != 0 && beta != 1
03312       for (Ordinal i = 0; i < numRows; ++i) {
03313         // Initialize temporary values to Y(i,:) * beta.
03314         RangeScalar tmp = beta * Y[i];
03315 
03316         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03317           const MatrixScalar A_ij = val[k];
03318           const Ordinal j = ind[k];
03319           tmp += alpha * A_ij * X[j];
03320         }
03321         // Copy temporary values into output vector.
03322         Y[i] = tmp;
03323       }
03324     }
03325   }
03326 }
03327 
03328 template<class Ordinal,
03329          class MatrixScalar,
03330          class DomainScalar,
03331          class RangeScalar>
03332 void
03333 matVecCsrColMajorForfor1VecOmp (
03334   const Ordinal numRows,
03335   const Ordinal numCols,
03336   const Ordinal numVecs,
03337   const RangeScalar& beta,
03338   RangeScalar Y[],
03339   const Ordinal colStrideY,
03340   const RangeScalar& alpha,
03341   const size_t  ptr[],
03342   const Ordinal ind[],
03343   const MatrixScalar val[],
03344   const DomainScalar X[],
03345   const Ordinal colStrideX)
03346 {
03347   typedef Teuchos::ScalarTraits<RangeScalar> STS;
03348 
03349   // With CSR for alpha == 0, scale Y by beta and return.
03350   if (alpha == STS::zero()) {
03351     // Prescale: Y := beta * Y.
03352     if (beta == STS::zero()) {
03353       for (Ordinal j = 0; j < numVecs; ++j) {
03354         RangeScalar* const Y_j = &Y[j*colStrideY];
03355         #pragma omp parallel for
03356         for (Ordinal i = 0; i < numRows; ++i) {
03357           // Follow the Sparse BLAS convention for beta == 0. 
03358           Y_j[i] = STS::zero();
03359         }
03360       }
03361     }
03362     else if (beta != STS::one()) {
03363       for (Ordinal j = 0; j < numVecs; ++j) {
03364         RangeScalar* const Y_j = &Y[j*colStrideY];
03365         #pragma omp parallel for
03366         for (Ordinal i = 0; i < numRows; ++i) {
03367           Y_j[i] = beta * Y_j[i];
03368         }
03369       }
03370     }
03371     return; // Our work is done!
03372   }
03373   if (alpha == STS::one()) {
03374     if (beta == -STS::one()) {
03375       #pragma omp parallel for
03376       for (Ordinal i = 0; i < numRows; ++i) {
03377         // Initialize temporary values to -Y(i,:).
03378         RangeScalar tmp = -Y[i];
03379 
03380         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03381           const MatrixScalar A_ij = val[k];
03382           const Ordinal j = ind[k];
03383           tmp += A_ij * X[j];
03384         }
03385         // Copy temporary values into output vector.
03386         Y[i] = tmp;
03387       }
03388     }
03389     else if (beta == STS::zero()) {
03390       #pragma omp parallel for
03391       for (Ordinal i = 0; i < numRows; ++i) {
03392         // Initialize temporary values to 0.
03393         RangeScalar tmp = STS::zero();
03394 
03395         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03396           const MatrixScalar A_ij = val[k];
03397           const Ordinal j = ind[k];
03398           tmp += A_ij * X[j];
03399         }
03400         // Copy temporary values into output vector.
03401         Y[i] = tmp;
03402       }
03403     }
03404     else if (beta == STS::one()) {
03405       #pragma omp parallel for
03406       for (Ordinal i = 0; i < numRows; ++i) {
03407         // Initialize temporary values to Y(i,:).
03408         RangeScalar tmp = Y[i];
03409 
03410         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03411           const MatrixScalar A_ij = val[k];
03412           const Ordinal j = ind[k];
03413           tmp += A_ij * X[j];
03414         }
03415         // Copy temporary values into output vector.
03416         Y[i] = tmp;
03417       }
03418     }
03419     else { // beta != -1 && beta != 0 && beta != 1
03420       #pragma omp parallel for
03421       for (Ordinal i = 0; i < numRows; ++i) {
03422         // Initialize temporary values to Y(i,:) * beta.
03423         RangeScalar tmp = beta * Y[i];
03424 
03425         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03426           const MatrixScalar A_ij = val[k];
03427           const Ordinal j = ind[k];
03428           tmp += A_ij * X[j];
03429         }
03430         // Copy temporary values into output vector.
03431         Y[i] = tmp;
03432       }
03433     }
03434   }
03435   else if (alpha == -STS::one()) {
03436     if (beta == -STS::one()) {
03437       #pragma omp parallel for
03438       for (Ordinal i = 0; i < numRows; ++i) {
03439         // Initialize temporary values to -Y(i,:).
03440         RangeScalar tmp = -Y[i];
03441 
03442         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03443           const MatrixScalar A_ij = val[k];
03444           const Ordinal j = ind[k];
03445           tmp -= A_ij * X[j];
03446         }
03447         // Copy temporary values into output vector.
03448         Y[i] = tmp;
03449       }
03450     }
03451     else if (beta == STS::zero()) {
03452       #pragma omp parallel for
03453       for (Ordinal i = 0; i < numRows; ++i) {
03454         // Initialize temporary values to 0.
03455         RangeScalar tmp = STS::zero();
03456 
03457         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03458           const MatrixScalar A_ij = val[k];
03459           const Ordinal j = ind[k];
03460           tmp -= A_ij * X[j];
03461         }
03462         // Copy temporary values into output vector.
03463         Y[i] = tmp;
03464       }
03465     }
03466     else if (beta == STS::one()) {
03467       #pragma omp parallel for
03468       for (Ordinal i = 0; i < numRows; ++i) {
03469         // Initialize temporary values to Y(i,:).
03470         RangeScalar tmp = Y[i];
03471 
03472         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03473           const MatrixScalar A_ij = val[k];
03474           const Ordinal j = ind[k];
03475           tmp -= A_ij * X[j];
03476         }
03477         // Copy temporary values into output vector.
03478         Y[i] = tmp;
03479       }
03480     }
03481     else { // beta != -1 && beta != 0 && beta != 1
03482       #pragma omp parallel for
03483       for (Ordinal i = 0; i < numRows; ++i) {
03484         // Initialize temporary values to Y(i,:) * beta.
03485         RangeScalar tmp = beta * Y[i];
03486 
03487         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03488           const MatrixScalar A_ij = val[k];
03489           const Ordinal j = ind[k];
03490           tmp -= A_ij * X[j];
03491         }
03492         // Copy temporary values into output vector.
03493         Y[i] = tmp;
03494       }
03495     }
03496   }
03497   else { // alpha != 1 && alpha != -1
03498     if (beta == -STS::one()) {
03499       #pragma omp parallel for
03500       for (Ordinal i = 0; i < numRows; ++i) {
03501         // Initialize temporary values to -Y(i,:).
03502         RangeScalar tmp = -Y[i];
03503 
03504         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03505           const MatrixScalar A_ij = val[k];
03506           const Ordinal j = ind[k];
03507           tmp += alpha * A_ij * X[j];
03508         }
03509         // Copy temporary values into output vector.
03510         Y[i] = tmp;
03511       }
03512     }
03513     else if (beta == STS::zero()) {
03514       #pragma omp parallel for
03515       for (Ordinal i = 0; i < numRows; ++i) {
03516         // Initialize temporary values to 0.
03517         RangeScalar tmp = STS::zero();
03518 
03519         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03520           const MatrixScalar A_ij = val[k];
03521           const Ordinal j = ind[k];
03522           tmp += alpha * A_ij * X[j];
03523         }
03524         // Copy temporary values into output vector.
03525         Y[i] = tmp;
03526       }
03527     }
03528     else if (beta == STS::one()) {
03529       #pragma omp parallel for
03530       for (Ordinal i = 0; i < numRows; ++i) {
03531         // Initialize temporary values to Y(i,:).
03532         RangeScalar tmp = Y[i];
03533 
03534         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03535           const MatrixScalar A_ij = val[k];
03536           const Ordinal j = ind[k];
03537           tmp += alpha * A_ij * X[j];
03538         }
03539         // Copy temporary values into output vector.
03540         Y[i] = tmp;
03541       }
03542     }
03543     else { // beta != -1 && beta != 0 && beta != 1
03544       #pragma omp parallel for
03545       for (Ordinal i = 0; i < numRows; ++i) {
03546         // Initialize temporary values to Y(i,:) * beta.
03547         RangeScalar tmp = beta * Y[i];
03548 
03549         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03550           const MatrixScalar A_ij = val[k];
03551           const Ordinal j = ind[k];
03552           tmp += alpha * A_ij * X[j];
03553         }
03554         // Copy temporary values into output vector.
03555         Y[i] = tmp;
03556       }
03557     }
03558   }
03559 }
03560 
03561 template<class Ordinal,
03562          class MatrixScalar,
03563          class DomainScalar,
03564          class RangeScalar>
03565 void
03566 matVecCsrColMajorForfor2Vec (
03567   const Ordinal numRows,
03568   const Ordinal numCols,
03569   const Ordinal numVecs,
03570   const RangeScalar& beta,
03571   RangeScalar Y[],
03572   const Ordinal colStrideY,
03573   const RangeScalar& alpha,
03574   const size_t  ptr[],
03575   const Ordinal ind[],
03576   const MatrixScalar val[],
03577   const DomainScalar X[],
03578   const Ordinal colStrideX)
03579 {
03580   typedef Teuchos::ScalarTraits<RangeScalar> STS;
03581 
03582   // With CSR for alpha == 0, scale Y by beta and return.
03583   if (alpha == STS::zero()) {
03584     // Prescale: Y := beta * Y.
03585     if (beta == STS::zero()) {
03586       for (Ordinal j = 0; j < numVecs; ++j) {
03587         RangeScalar* const Y_j = &Y[j*colStrideY];
03588         for (Ordinal i = 0; i < numRows; ++i) {
03589           // Follow the Sparse BLAS convention for beta == 0. 
03590           Y_j[i] = STS::zero();
03591         }
03592       }
03593     }
03594     else if (beta != STS::one()) {
03595       for (Ordinal j = 0; j < numVecs; ++j) {
03596         RangeScalar* const Y_j = &Y[j*colStrideY];
03597         for (Ordinal i = 0; i < numRows; ++i) {
03598           Y_j[i] = beta * Y_j[i];
03599         }
03600       }
03601     }
03602     return; // Our work is done!
03603   }
03604   if (alpha == STS::one()) {
03605     if (beta == -STS::one()) {
03606       for (Ordinal i = 0; i < numRows; ++i) {
03607         // Initialize temporary values to -Y(i,:).
03608         RangeScalar* const Y_i = &Y[i];
03609         RangeScalar tmp[2] = {-Y_i[0], -Y_i[colStrideY]};
03610 
03611         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03612           const MatrixScalar A_ij = val[k];
03613           const Ordinal j = ind[k];
03614           const DomainScalar* const X_j = &X[j];
03615           tmp[0] += A_ij * X_j[0];
03616           tmp[1] += A_ij * X_j[colStrideX];
03617         }
03618         // Copy temporary values into output vector.
03619         Y_i[0] = tmp[0];
03620         Y_i[colStrideY] = tmp[1];
03621       }
03622     }
03623     else if (beta == STS::zero()) {
03624       for (Ordinal i = 0; i < numRows; ++i) {
03625         // Initialize temporary values to 0.
03626         RangeScalar* const Y_i = &Y[i];
03627         RangeScalar tmp[2] = {STS::zero(), STS::zero()};
03628 
03629         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03630           const MatrixScalar A_ij = val[k];
03631           const Ordinal j = ind[k];
03632           const DomainScalar* const X_j = &X[j];
03633           tmp[0] += A_ij * X_j[0];
03634           tmp[1] += A_ij * X_j[colStrideX];
03635         }
03636         // Copy temporary values into output vector.
03637         Y_i[0] = tmp[0];
03638         Y_i[colStrideY] = tmp[1];
03639       }
03640     }
03641     else if (beta == STS::one()) {
03642       for (Ordinal i = 0; i < numRows; ++i) {
03643         // Initialize temporary values to Y(i,:).
03644         RangeScalar* const Y_i = &Y[i];
03645         RangeScalar tmp[2] = {Y_i[0], Y_i[colStrideY]};
03646 
03647         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03648           const MatrixScalar A_ij = val[k];
03649           const Ordinal j = ind[k];
03650           const DomainScalar* const X_j = &X[j];
03651           tmp[0] += A_ij * X_j[0];
03652           tmp[1] += A_ij * X_j[colStrideX];
03653         }
03654         // Copy temporary values into output vector.
03655         Y_i[0] = tmp[0];
03656         Y_i[colStrideY] = tmp[1];
03657       }
03658     }
03659     else { // beta != -1 && beta != 0 && beta != 1
03660       for (Ordinal i = 0; i < numRows; ++i) {
03661         // Initialize temporary values to Y(i,:) * beta.
03662         RangeScalar* const Y_i = &Y[i];
03663         RangeScalar tmp[2] = {beta * Y_i[0], beta * Y_i[colStrideY]};
03664 
03665         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03666           const MatrixScalar A_ij = val[k];
03667           const Ordinal j = ind[k];
03668           const DomainScalar* const X_j = &X[j];
03669           tmp[0] += A_ij * X_j[0];
03670           tmp[1] += A_ij * X_j[colStrideX];
03671         }
03672         // Copy temporary values into output vector.
03673         Y_i[0] = tmp[0];
03674         Y_i[colStrideY] = tmp[1];
03675       }
03676     }
03677   }
03678   else if (alpha == -STS::one()) {
03679     if (beta == -STS::one()) {
03680       for (Ordinal i = 0; i < numRows; ++i) {
03681         // Initialize temporary values to -Y(i,:).
03682         RangeScalar* const Y_i = &Y[i];
03683         RangeScalar tmp[2] = {-Y_i[0], -Y_i[colStrideY]};
03684 
03685         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03686           const MatrixScalar A_ij = val[k];
03687           const Ordinal j = ind[k];
03688           const DomainScalar* const X_j = &X[j];
03689           tmp[0] -= A_ij * X_j[0];
03690           tmp[1] -= A_ij * X_j[colStrideX];
03691         }
03692         // Copy temporary values into output vector.
03693         Y_i[0] = tmp[0];
03694         Y_i[colStrideY] = tmp[1];
03695       }
03696     }
03697     else if (beta == STS::zero()) {
03698       for (Ordinal i = 0; i < numRows; ++i) {
03699         // Initialize temporary values to 0.
03700         RangeScalar* const Y_i = &Y[i];
03701         RangeScalar tmp[2] = {STS::zero(), STS::zero()};
03702 
03703         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03704           const MatrixScalar A_ij = val[k];
03705           const Ordinal j = ind[k];
03706           const DomainScalar* const X_j = &X[j];
03707           tmp[0] -= A_ij * X_j[0];
03708           tmp[1] -= A_ij * X_j[colStrideX];
03709         }
03710         // Copy temporary values into output vector.
03711         Y_i[0] = tmp[0];
03712         Y_i[colStrideY] = tmp[1];
03713       }
03714     }
03715     else if (beta == STS::one()) {
03716       for (Ordinal i = 0; i < numRows; ++i) {
03717         // Initialize temporary values to Y(i,:).
03718         RangeScalar* const Y_i = &Y[i];
03719         RangeScalar tmp[2] = {Y_i[0], Y_i[colStrideY]};
03720 
03721         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03722           const MatrixScalar A_ij = val[k];
03723           const Ordinal j = ind[k];
03724           const DomainScalar* const X_j = &X[j];
03725           tmp[0] -= A_ij * X_j[0];
03726           tmp[1] -= A_ij * X_j[colStrideX];
03727         }
03728         // Copy temporary values into output vector.
03729         Y_i[0] = tmp[0];
03730         Y_i[colStrideY] = tmp[1];
03731       }
03732     }
03733     else { // beta != -1 && beta != 0 && beta != 1
03734       for (Ordinal i = 0; i < numRows; ++i) {
03735         // Initialize temporary values to Y(i,:) * beta.
03736         RangeScalar* const Y_i = &Y[i];
03737         RangeScalar tmp[2] = {beta * Y_i[0], beta * Y_i[colStrideY]};
03738 
03739         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03740           const MatrixScalar A_ij = val[k];
03741           const Ordinal j = ind[k];
03742           const DomainScalar* const X_j = &X[j];
03743           tmp[0] -= A_ij * X_j[0];
03744           tmp[1] -= A_ij * X_j[colStrideX];
03745         }
03746         // Copy temporary values into output vector.
03747         Y_i[0] = tmp[0];
03748         Y_i[colStrideY] = tmp[1];
03749       }
03750     }
03751   }
03752   else { // alpha != 1 && alpha != -1
03753     if (beta == -STS::one()) {
03754       for (Ordinal i = 0; i < numRows; ++i) {
03755         // Initialize temporary values to -Y(i,:).
03756         RangeScalar* const Y_i = &Y[i];
03757         RangeScalar tmp[2] = {-Y_i[0], -Y_i[colStrideY]};
03758 
03759         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03760           const MatrixScalar A_ij = val[k];
03761           const Ordinal j = ind[k];
03762           const DomainScalar* const X_j = &X[j];
03763           tmp[0] += alpha * A_ij * X_j[0];
03764           tmp[1] += alpha * A_ij * X_j[colStrideX];
03765         }
03766         // Copy temporary values into output vector.
03767         Y_i[0] = tmp[0];
03768         Y_i[colStrideY] = tmp[1];
03769       }
03770     }
03771     else if (beta == STS::zero()) {
03772       for (Ordinal i = 0; i < numRows; ++i) {
03773         // Initialize temporary values to 0.
03774         RangeScalar* const Y_i = &Y[i];
03775         RangeScalar tmp[2] = {STS::zero(), STS::zero()};
03776 
03777         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03778           const MatrixScalar A_ij = val[k];
03779           const Ordinal j = ind[k];
03780           const DomainScalar* const X_j = &X[j];
03781           tmp[0] += alpha * A_ij * X_j[0];
03782           tmp[1] += alpha * A_ij * X_j[colStrideX];
03783         }
03784         // Copy temporary values into output vector.
03785         Y_i[0] = tmp[0];
03786         Y_i[colStrideY] = tmp[1];
03787       }
03788     }
03789     else if (beta == STS::one()) {
03790       for (Ordinal i = 0; i < numRows; ++i) {
03791         // Initialize temporary values to Y(i,:).
03792         RangeScalar* const Y_i = &Y[i];
03793         RangeScalar tmp[2] = {Y_i[0], Y_i[colStrideY]};
03794 
03795         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03796           const MatrixScalar A_ij = val[k];
03797           const Ordinal j = ind[k];
03798           const DomainScalar* const X_j = &X[j];
03799           tmp[0] += alpha * A_ij * X_j[0];
03800           tmp[1] += alpha * A_ij * X_j[colStrideX];
03801         }
03802         // Copy temporary values into output vector.
03803         Y_i[0] = tmp[0];
03804         Y_i[colStrideY] = tmp[1];
03805       }
03806     }
03807     else { // beta != -1 && beta != 0 && beta != 1
03808       for (Ordinal i = 0; i < numRows; ++i) {
03809         // Initialize temporary values to Y(i,:) * beta.
03810         RangeScalar* const Y_i = &Y[i];
03811         RangeScalar tmp[2] = {beta * Y_i[0], beta * Y_i[colStrideY]};
03812 
03813         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03814           const MatrixScalar A_ij = val[k];
03815           const Ordinal j = ind[k];
03816           const DomainScalar* const X_j = &X[j];
03817           tmp[0] += alpha * A_ij * X_j[0];
03818           tmp[1] += alpha * A_ij * X_j[colStrideX];
03819         }
03820         // Copy temporary values into output vector.
03821         Y_i[0] = tmp[0];
03822         Y_i[colStrideY] = tmp[1];
03823       }
03824     }
03825   }
03826 }
03827 
03828 template<class Ordinal,
03829          class MatrixScalar,
03830          class DomainScalar,
03831          class RangeScalar>
03832 void
03833 matVecCsrColMajorForfor2VecOmp (
03834   const Ordinal numRows,
03835   const Ordinal numCols,
03836   const Ordinal numVecs,
03837   const RangeScalar& beta,
03838   RangeScalar Y[],
03839   const Ordinal colStrideY,
03840   const RangeScalar& alpha,
03841   const size_t  ptr[],
03842   const Ordinal ind[],
03843   const MatrixScalar val[],
03844   const DomainScalar X[],
03845   const Ordinal colStrideX)
03846 {
03847   typedef Teuchos::ScalarTraits<RangeScalar> STS;
03848 
03849   // With CSR for alpha == 0, scale Y by beta and return.
03850   if (alpha == STS::zero()) {
03851     // Prescale: Y := beta * Y.
03852     if (beta == STS::zero()) {
03853       for (Ordinal j = 0; j < numVecs; ++j) {
03854         RangeScalar* const Y_j = &Y[j*colStrideY];
03855         #pragma omp parallel for
03856         for (Ordinal i = 0; i < numRows; ++i) {
03857           // Follow the Sparse BLAS convention for beta == 0. 
03858           Y_j[i] = STS::zero();
03859         }
03860       }
03861     }
03862     else if (beta != STS::one()) {
03863       for (Ordinal j = 0; j < numVecs; ++j) {
03864         RangeScalar* const Y_j = &Y[j*colStrideY];
03865         #pragma omp parallel for
03866         for (Ordinal i = 0; i < numRows; ++i) {
03867           Y_j[i] = beta * Y_j[i];
03868         }
03869       }
03870     }
03871     return; // Our work is done!
03872   }
03873   if (alpha == STS::one()) {
03874     if (beta == -STS::one()) {
03875       #pragma omp parallel for
03876       for (Ordinal i = 0; i < numRows; ++i) {
03877         // Initialize temporary values to -Y(i,:).
03878         RangeScalar* const Y_i = &Y[i];
03879         RangeScalar tmp[2] = {-Y_i[0], -Y_i[colStrideY]};
03880 
03881         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03882           const MatrixScalar A_ij = val[k];
03883           const Ordinal j = ind[k];
03884           const DomainScalar* const X_j = &X[j];
03885           tmp[0] += A_ij * X_j[0];
03886           tmp[1] += A_ij * X_j[colStrideX];
03887         }
03888         // Copy temporary values into output vector.
03889         Y_i[0] = tmp[0];
03890         Y_i[colStrideY] = tmp[1];
03891       }
03892     }
03893     else if (beta == STS::zero()) {
03894       #pragma omp parallel for
03895       for (Ordinal i = 0; i < numRows; ++i) {
03896         // Initialize temporary values to 0.
03897         RangeScalar* const Y_i = &Y[i];
03898         RangeScalar tmp[2] = {STS::zero(), STS::zero()};
03899 
03900         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03901           const MatrixScalar A_ij = val[k];
03902           const Ordinal j = ind[k];
03903           const DomainScalar* const X_j = &X[j];
03904           tmp[0] += A_ij * X_j[0];
03905           tmp[1] += A_ij * X_j[colStrideX];
03906         }
03907         // Copy temporary values into output vector.
03908         Y_i[0] = tmp[0];
03909         Y_i[colStrideY] = tmp[1];
03910       }
03911     }
03912     else if (beta == STS::one()) {
03913       #pragma omp parallel for
03914       for (Ordinal i = 0; i < numRows; ++i) {
03915         // Initialize temporary values to Y(i,:).
03916         RangeScalar* const Y_i = &Y[i];
03917         RangeScalar tmp[2] = {Y_i[0], Y_i[colStrideY]};
03918 
03919         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03920           const MatrixScalar A_ij = val[k];
03921           const Ordinal j = ind[k];
03922           const DomainScalar* const X_j = &X[j];
03923           tmp[0] += A_ij * X_j[0];
03924           tmp[1] += A_ij * X_j[colStrideX];
03925         }
03926         // Copy temporary values into output vector.
03927         Y_i[0] = tmp[0];
03928         Y_i[colStrideY] = tmp[1];
03929       }
03930     }
03931     else { // beta != -1 && beta != 0 && beta != 1
03932       #pragma omp parallel for
03933       for (Ordinal i = 0; i < numRows; ++i) {
03934         // Initialize temporary values to Y(i,:) * beta.
03935         RangeScalar* const Y_i = &Y[i];
03936         RangeScalar tmp[2] = {beta * Y_i[0], beta * Y_i[colStrideY]};
03937 
03938         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03939           const MatrixScalar A_ij = val[k];
03940           const Ordinal j = ind[k];
03941           const DomainScalar* const X_j = &X[j];
03942           tmp[0] += A_ij * X_j[0];
03943           tmp[1] += A_ij * X_j[colStrideX];
03944         }
03945         // Copy temporary values into output vector.
03946         Y_i[0] = tmp[0];
03947         Y_i[colStrideY] = tmp[1];
03948       }
03949     }
03950   }
03951   else if (alpha == -STS::one()) {
03952     if (beta == -STS::one()) {
03953       #pragma omp parallel for
03954       for (Ordinal i = 0; i < numRows; ++i) {
03955         // Initialize temporary values to -Y(i,:).
03956         RangeScalar* const Y_i = &Y[i];
03957         RangeScalar tmp[2] = {-Y_i[0], -Y_i[colStrideY]};
03958 
03959         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03960           const MatrixScalar A_ij = val[k];
03961           const Ordinal j = ind[k];
03962           const DomainScalar* const X_j = &X[j];
03963           tmp[0] -= A_ij * X_j[0];
03964           tmp[1] -= A_ij * X_j[colStrideX];
03965         }
03966         // Copy temporary values into output vector.
03967         Y_i[0] = tmp[0];
03968         Y_i[colStrideY] = tmp[1];
03969       }
03970     }
03971     else if (beta == STS::zero()) {
03972       #pragma omp parallel for
03973       for (Ordinal i = 0; i < numRows; ++i) {
03974         // Initialize temporary values to 0.
03975         RangeScalar* const Y_i = &Y[i];
03976         RangeScalar tmp[2] = {STS::zero(), STS::zero()};
03977 
03978         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03979           const MatrixScalar A_ij = val[k];
03980           const Ordinal j = ind[k];
03981           const DomainScalar* const X_j = &X[j];
03982           tmp[0] -= A_ij * X_j[0];
03983           tmp[1] -= A_ij * X_j[colStrideX];
03984         }
03985         // Copy temporary values into output vector.
03986         Y_i[0] = tmp[0];
03987         Y_i[colStrideY] = tmp[1];
03988       }
03989     }
03990     else if (beta == STS::one()) {
03991       #pragma omp parallel for
03992       for (Ordinal i = 0; i < numRows; ++i) {
03993         // Initialize temporary values to Y(i,:).
03994         RangeScalar* const Y_i = &Y[i];
03995         RangeScalar tmp[2] = {Y_i[0], Y_i[colStrideY]};
03996 
03997         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
03998           const MatrixScalar A_ij = val[k];
03999           const Ordinal j = ind[k];
04000           const DomainScalar* const X_j = &X[j];
04001           tmp[0] -= A_ij * X_j[0];
04002           tmp[1] -= A_ij * X_j[colStrideX];
04003         }
04004         // Copy temporary values into output vector.
04005         Y_i[0] = tmp[0];
04006         Y_i[colStrideY] = tmp[1];
04007       }
04008     }
04009     else { // beta != -1 && beta != 0 && beta != 1
04010       #pragma omp parallel for
04011       for (Ordinal i = 0; i < numRows; ++i) {
04012         // Initialize temporary values to Y(i,:) * beta.
04013         RangeScalar* const Y_i = &Y[i];
04014         RangeScalar tmp[2] = {beta * Y_i[0], beta * Y_i[colStrideY]};
04015 
04016         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04017           const MatrixScalar A_ij = val[k];
04018           const Ordinal j = ind[k];
04019           const DomainScalar* const X_j = &X[j];
04020           tmp[0] -= A_ij * X_j[0];
04021           tmp[1] -= A_ij * X_j[colStrideX];
04022         }
04023         // Copy temporary values into output vector.
04024         Y_i[0] = tmp[0];
04025         Y_i[colStrideY] = tmp[1];
04026       }
04027     }
04028   }
04029   else { // alpha != 1 && alpha != -1
04030     if (beta == -STS::one()) {
04031       #pragma omp parallel for
04032       for (Ordinal i = 0; i < numRows; ++i) {
04033         // Initialize temporary values to -Y(i,:).
04034         RangeScalar* const Y_i = &Y[i];
04035         RangeScalar tmp[2] = {-Y_i[0], -Y_i[colStrideY]};
04036 
04037         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04038           const MatrixScalar A_ij = val[k];
04039           const Ordinal j = ind[k];
04040           const DomainScalar* const X_j = &X[j];
04041           tmp[0] += alpha * A_ij * X_j[0];
04042           tmp[1] += alpha * A_ij * X_j[colStrideX];
04043         }
04044         // Copy temporary values into output vector.
04045         Y_i[0] = tmp[0];
04046         Y_i[colStrideY] = tmp[1];
04047       }
04048     }
04049     else if (beta == STS::zero()) {
04050       #pragma omp parallel for
04051       for (Ordinal i = 0; i < numRows; ++i) {
04052         // Initialize temporary values to 0.
04053         RangeScalar* const Y_i = &Y[i];
04054         RangeScalar tmp[2] = {STS::zero(), STS::zero()};
04055 
04056         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04057           const MatrixScalar A_ij = val[k];
04058           const Ordinal j = ind[k];
04059           const DomainScalar* const X_j = &X[j];
04060           tmp[0] += alpha * A_ij * X_j[0];
04061           tmp[1] += alpha * A_ij * X_j[colStrideX];
04062         }
04063         // Copy temporary values into output vector.
04064         Y_i[0] = tmp[0];
04065         Y_i[colStrideY] = tmp[1];
04066       }
04067     }
04068     else if (beta == STS::one()) {
04069       #pragma omp parallel for
04070       for (Ordinal i = 0; i < numRows; ++i) {
04071         // Initialize temporary values to Y(i,:).
04072         RangeScalar* const Y_i = &Y[i];
04073         RangeScalar tmp[2] = {Y_i[0], Y_i[colStrideY]};
04074 
04075         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04076           const MatrixScalar A_ij = val[k];
04077           const Ordinal j = ind[k];
04078           const DomainScalar* const X_j = &X[j];
04079           tmp[0] += alpha * A_ij * X_j[0];
04080           tmp[1] += alpha * A_ij * X_j[colStrideX];
04081         }
04082         // Copy temporary values into output vector.
04083         Y_i[0] = tmp[0];
04084         Y_i[colStrideY] = tmp[1];
04085       }
04086     }
04087     else { // beta != -1 && beta != 0 && beta != 1
04088       #pragma omp parallel for
04089       for (Ordinal i = 0; i < numRows; ++i) {
04090         // Initialize temporary values to Y(i,:) * beta.
04091         RangeScalar* const Y_i = &Y[i];
04092         RangeScalar tmp[2] = {beta * Y_i[0], beta * Y_i[colStrideY]};
04093 
04094         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04095           const MatrixScalar A_ij = val[k];
04096           const Ordinal j = ind[k];
04097           const DomainScalar* const X_j = &X[j];
04098           tmp[0] += alpha * A_ij * X_j[0];
04099           tmp[1] += alpha * A_ij * X_j[colStrideX];
04100         }
04101         // Copy temporary values into output vector.
04102         Y_i[0] = tmp[0];
04103         Y_i[colStrideY] = tmp[1];
04104       }
04105     }
04106   }
04107 }
04108 
04109 template<class Ordinal,
04110          class MatrixScalar,
04111          class DomainScalar,
04112          class RangeScalar>
04113 void
04114 matVecCsrColMajorForfor3Vec (
04115   const Ordinal numRows,
04116   const Ordinal numCols,
04117   const Ordinal numVecs,
04118   const RangeScalar& beta,
04119   RangeScalar Y[],
04120   const Ordinal colStrideY,
04121   const RangeScalar& alpha,
04122   const size_t  ptr[],
04123   const Ordinal ind[],
04124   const MatrixScalar val[],
04125   const DomainScalar X[],
04126   const Ordinal colStrideX)
04127 {
04128   typedef Teuchos::ScalarTraits<RangeScalar> STS;
04129 
04130   // With CSR for alpha == 0, scale Y by beta and return.
04131   if (alpha == STS::zero()) {
04132     // Prescale: Y := beta * Y.
04133     if (beta == STS::zero()) {
04134       for (Ordinal j = 0; j < numVecs; ++j) {
04135         RangeScalar* const Y_j = &Y[j*colStrideY];
04136         for (Ordinal i = 0; i < numRows; ++i) {
04137           // Follow the Sparse BLAS convention for beta == 0. 
04138           Y_j[i] = STS::zero();
04139         }
04140       }
04141     }
04142     else if (beta != STS::one()) {
04143       for (Ordinal j = 0; j < numVecs; ++j) {
04144         RangeScalar* const Y_j = &Y[j*colStrideY];
04145         for (Ordinal i = 0; i < numRows; ++i) {
04146           Y_j[i] = beta * Y_j[i];
04147         }
04148       }
04149     }
04150     return; // Our work is done!
04151   }
04152   if (alpha == STS::one()) {
04153     if (beta == -STS::one()) {
04154       for (Ordinal i = 0; i < numRows; ++i) {
04155         // Initialize temporary values to -Y(i,:).
04156         RangeScalar* const Y_i = &Y[i];
04157         RangeScalar tmp[3] = {-Y_i[0], -Y_i[colStrideY], -Y_i[2*colStrideY]};
04158 
04159         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04160           const MatrixScalar A_ij = val[k];
04161           const Ordinal j = ind[k];
04162           const DomainScalar* const X_j = &X[j];
04163           tmp[0] += A_ij * X_j[0];
04164           tmp[1] += A_ij * X_j[colStrideX];
04165           tmp[2] += A_ij * X_j[2*colStrideX];
04166         }
04167         // Copy temporary values into output vector.
04168         Y_i[0] = tmp[0];
04169         Y_i[colStrideY] = tmp[1];
04170         Y_i[2*colStrideY] = tmp[2];
04171       }
04172     }
04173     else if (beta == STS::zero()) {
04174       for (Ordinal i = 0; i < numRows; ++i) {
04175         // Initialize temporary values to 0.
04176         RangeScalar* const Y_i = &Y[i];
04177         RangeScalar tmp[3] = {STS::zero(), STS::zero(), STS::zero()};
04178 
04179         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04180           const MatrixScalar A_ij = val[k];
04181           const Ordinal j = ind[k];
04182           const DomainScalar* const X_j = &X[j];
04183           tmp[0] += A_ij * X_j[0];
04184           tmp[1] += A_ij * X_j[colStrideX];
04185           tmp[2] += A_ij * X_j[2*colStrideX];
04186         }
04187         // Copy temporary values into output vector.
04188         Y_i[0] = tmp[0];
04189         Y_i[colStrideY] = tmp[1];
04190         Y_i[2*colStrideY] = tmp[2];
04191       }
04192     }
04193     else if (beta == STS::one()) {
04194       for (Ordinal i = 0; i < numRows; ++i) {
04195         // Initialize temporary values to Y(i,:).
04196         RangeScalar* const Y_i = &Y[i];
04197         RangeScalar tmp[3] = {Y_i[0], Y_i[colStrideY], Y_i[2*colStrideY]};
04198 
04199         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04200           const MatrixScalar A_ij = val[k];
04201           const Ordinal j = ind[k];
04202           const DomainScalar* const X_j = &X[j];
04203           tmp[0] += A_ij * X_j[0];
04204           tmp[1] += A_ij * X_j[colStrideX];
04205           tmp[2] += A_ij * X_j[2*colStrideX];
04206         }
04207         // Copy temporary values into output vector.
04208         Y_i[0] = tmp[0];
04209         Y_i[colStrideY] = tmp[1];
04210         Y_i[2*colStrideY] = tmp[2];
04211       }
04212     }
04213     else { // beta != -1 && beta != 0 && beta != 1
04214       for (Ordinal i = 0; i < numRows; ++i) {
04215         // Initialize temporary values to Y(i,:) * beta.
04216         RangeScalar* const Y_i = &Y[i];
04217         RangeScalar tmp[3] = {beta * Y_i[0], beta * Y_i[colStrideY], beta * Y_i[2*colStrideY]};
04218 
04219         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04220           const MatrixScalar A_ij = val[k];
04221           const Ordinal j = ind[k];
04222           const DomainScalar* const X_j = &X[j];
04223           tmp[0] += A_ij * X_j[0];
04224           tmp[1] += A_ij * X_j[colStrideX];
04225           tmp[2] += A_ij * X_j[2*colStrideX];
04226         }
04227         // Copy temporary values into output vector.
04228         Y_i[0] = tmp[0];
04229         Y_i[colStrideY] = tmp[1];
04230         Y_i[2*colStrideY] = tmp[2];
04231       }
04232     }
04233   }
04234   else if (alpha == -STS::one()) {
04235     if (beta == -STS::one()) {
04236       for (Ordinal i = 0; i < numRows; ++i) {
04237         // Initialize temporary values to -Y(i,:).
04238         RangeScalar* const Y_i = &Y[i];
04239         RangeScalar tmp[3] = {-Y_i[0], -Y_i[colStrideY], -Y_i[2*colStrideY]};
04240 
04241         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04242           const MatrixScalar A_ij = val[k];
04243           const Ordinal j = ind[k];
04244           const DomainScalar* const X_j = &X[j];
04245           tmp[0] -= A_ij * X_j[0];
04246           tmp[1] -= A_ij * X_j[colStrideX];
04247           tmp[2] -= A_ij * X_j[2*colStrideX];
04248         }
04249         // Copy temporary values into output vector.
04250         Y_i[0] = tmp[0];
04251         Y_i[colStrideY] = tmp[1];
04252         Y_i[2*colStrideY] = tmp[2];
04253       }
04254     }
04255     else if (beta == STS::zero()) {
04256       for (Ordinal i = 0; i < numRows; ++i) {
04257         // Initialize temporary values to 0.
04258         RangeScalar* const Y_i = &Y[i];
04259         RangeScalar tmp[3] = {STS::zero(), STS::zero(), STS::zero()};
04260 
04261         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04262           const MatrixScalar A_ij = val[k];
04263           const Ordinal j = ind[k];
04264           const DomainScalar* const X_j = &X[j];
04265           tmp[0] -= A_ij * X_j[0];
04266           tmp[1] -= A_ij * X_j[colStrideX];
04267           tmp[2] -= A_ij * X_j[2*colStrideX];
04268         }
04269         // Copy temporary values into output vector.
04270         Y_i[0] = tmp[0];
04271         Y_i[colStrideY] = tmp[1];
04272         Y_i[2*colStrideY] = tmp[2];
04273       }
04274     }
04275     else if (beta == STS::one()) {
04276       for (Ordinal i = 0; i < numRows; ++i) {
04277         // Initialize temporary values to Y(i,:).
04278         RangeScalar* const Y_i = &Y[i];
04279         RangeScalar tmp[3] = {Y_i[0], Y_i[colStrideY], Y_i[2*colStrideY]};
04280 
04281         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04282           const MatrixScalar A_ij = val[k];
04283           const Ordinal j = ind[k];
04284           const DomainScalar* const X_j = &X[j];
04285           tmp[0] -= A_ij * X_j[0];
04286           tmp[1] -= A_ij * X_j[colStrideX];
04287           tmp[2] -= A_ij * X_j[2*colStrideX];
04288         }
04289         // Copy temporary values into output vector.
04290         Y_i[0] = tmp[0];
04291         Y_i[colStrideY] = tmp[1];
04292         Y_i[2*colStrideY] = tmp[2];
04293       }
04294     }
04295     else { // beta != -1 && beta != 0 && beta != 1
04296       for (Ordinal i = 0; i < numRows; ++i) {
04297         // Initialize temporary values to Y(i,:) * beta.
04298         RangeScalar* const Y_i = &Y[i];
04299         RangeScalar tmp[3] = {beta * Y_i[0], beta * Y_i[colStrideY], beta * Y_i[2*colStrideY]};
04300 
04301         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04302           const MatrixScalar A_ij = val[k];
04303           const Ordinal j = ind[k];
04304           const DomainScalar* const X_j = &X[j];
04305           tmp[0] -= A_ij * X_j[0];
04306           tmp[1] -= A_ij * X_j[colStrideX];
04307           tmp[2] -= A_ij * X_j[2*colStrideX];
04308         }
04309         // Copy temporary values into output vector.
04310         Y_i[0] = tmp[0];
04311         Y_i[colStrideY] = tmp[1];
04312         Y_i[2*colStrideY] = tmp[2];
04313       }
04314     }
04315   }
04316   else { // alpha != 1 && alpha != -1
04317     if (beta == -STS::one()) {
04318       for (Ordinal i = 0; i < numRows; ++i) {
04319         // Initialize temporary values to -Y(i,:).
04320         RangeScalar* const Y_i = &Y[i];
04321         RangeScalar tmp[3] = {-Y_i[0], -Y_i[colStrideY], -Y_i[2*colStrideY]};
04322 
04323         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04324           const MatrixScalar A_ij = val[k];
04325           const Ordinal j = ind[k];
04326           const DomainScalar* const X_j = &X[j];
04327           tmp[0] += alpha * A_ij * X_j[0];
04328           tmp[1] += alpha * A_ij * X_j[colStrideX];
04329           tmp[2] += alpha * A_ij * X_j[2*colStrideX];
04330         }
04331         // Copy temporary values into output vector.
04332         Y_i[0] = tmp[0];
04333         Y_i[colStrideY] = tmp[1];
04334         Y_i[2*colStrideY] = tmp[2];
04335       }
04336     }
04337     else if (beta == STS::zero()) {
04338       for (Ordinal i = 0; i < numRows; ++i) {
04339         // Initialize temporary values to 0.
04340         RangeScalar* const Y_i = &Y[i];
04341         RangeScalar tmp[3] = {STS::zero(), STS::zero(), STS::zero()};
04342 
04343         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04344           const MatrixScalar A_ij = val[k];
04345           const Ordinal j = ind[k];
04346           const DomainScalar* const X_j = &X[j];
04347           tmp[0] += alpha * A_ij * X_j[0];
04348           tmp[1] += alpha * A_ij * X_j[colStrideX];
04349           tmp[2] += alpha * A_ij * X_j[2*colStrideX];
04350         }
04351         // Copy temporary values into output vector.
04352         Y_i[0] = tmp[0];
04353         Y_i[colStrideY] = tmp[1];
04354         Y_i[2*colStrideY] = tmp[2];
04355       }
04356     }
04357     else if (beta == STS::one()) {
04358       for (Ordinal i = 0; i < numRows; ++i) {
04359         // Initialize temporary values to Y(i,:).
04360         RangeScalar* const Y_i = &Y[i];
04361         RangeScalar tmp[3] = {Y_i[0], Y_i[colStrideY], Y_i[2*colStrideY]};
04362 
04363         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04364           const MatrixScalar A_ij = val[k];
04365           const Ordinal j = ind[k];
04366           const DomainScalar* const X_j = &X[j];
04367           tmp[0] += alpha * A_ij * X_j[0];
04368           tmp[1] += alpha * A_ij * X_j[colStrideX];
04369           tmp[2] += alpha * A_ij * X_j[2*colStrideX];
04370         }
04371         // Copy temporary values into output vector.
04372         Y_i[0] = tmp[0];
04373         Y_i[colStrideY] = tmp[1];
04374         Y_i[2*colStrideY] = tmp[2];
04375       }
04376     }
04377     else { // beta != -1 && beta != 0 && beta != 1
04378       for (Ordinal i = 0; i < numRows; ++i) {
04379         // Initialize temporary values to Y(i,:) * beta.
04380         RangeScalar* const Y_i = &Y[i];
04381         RangeScalar tmp[3] = {beta * Y_i[0], beta * Y_i[colStrideY], beta * Y_i[2*colStrideY]};
04382 
04383         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04384           const MatrixScalar A_ij = val[k];
04385           const Ordinal j = ind[k];
04386           const DomainScalar* const X_j = &X[j];
04387           tmp[0] += alpha * A_ij * X_j[0];
04388           tmp[1] += alpha * A_ij * X_j[colStrideX];
04389           tmp[2] += alpha * A_ij * X_j[2*colStrideX];
04390         }
04391         // Copy temporary values into output vector.
04392         Y_i[0] = tmp[0];
04393         Y_i[colStrideY] = tmp[1];
04394         Y_i[2*colStrideY] = tmp[2];
04395       }
04396     }
04397   }
04398 }
04399 
04400 template<class Ordinal,
04401          class MatrixScalar,
04402          class DomainScalar,
04403          class RangeScalar>
04404 void
04405 matVecCsrColMajorForfor3VecOmp (
04406   const Ordinal numRows,
04407   const Ordinal numCols,
04408   const Ordinal numVecs,
04409   const RangeScalar& beta,
04410   RangeScalar Y[],
04411   const Ordinal colStrideY,
04412   const RangeScalar& alpha,
04413   const size_t  ptr[],
04414   const Ordinal ind[],
04415   const MatrixScalar val[],
04416   const DomainScalar X[],
04417   const Ordinal colStrideX)
04418 {
04419   typedef Teuchos::ScalarTraits<RangeScalar> STS;
04420 
04421   // With CSR for alpha == 0, scale Y by beta and return.
04422   if (alpha == STS::zero()) {
04423     // Prescale: Y := beta * Y.
04424     if (beta == STS::zero()) {
04425       for (Ordinal j = 0; j < numVecs; ++j) {
04426         RangeScalar* const Y_j = &Y[j*colStrideY];
04427         #pragma omp parallel for
04428         for (Ordinal i = 0; i < numRows; ++i) {
04429           // Follow the Sparse BLAS convention for beta == 0. 
04430           Y_j[i] = STS::zero();
04431         }
04432       }
04433     }
04434     else if (beta != STS::one()) {
04435       for (Ordinal j = 0; j < numVecs; ++j) {
04436         RangeScalar* const Y_j = &Y[j*colStrideY];
04437         #pragma omp parallel for
04438         for (Ordinal i = 0; i < numRows; ++i) {
04439           Y_j[i] = beta * Y_j[i];
04440         }
04441       }
04442     }
04443     return; // Our work is done!
04444   }
04445   if (alpha == STS::one()) {
04446     if (beta == -STS::one()) {
04447       #pragma omp parallel for
04448       for (Ordinal i = 0; i < numRows; ++i) {
04449         // Initialize temporary values to -Y(i,:).
04450         RangeScalar* const Y_i = &Y[i];
04451         RangeScalar tmp[3] = {-Y_i[0], -Y_i[colStrideY], -Y_i[2*colStrideY]};
04452 
04453         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04454           const MatrixScalar A_ij = val[k];
04455           const Ordinal j = ind[k];
04456           const DomainScalar* const X_j = &X[j];
04457           tmp[0] += A_ij * X_j[0];
04458           tmp[1] += A_ij * X_j[colStrideX];
04459           tmp[2] += A_ij * X_j[2*colStrideX];
04460         }
04461         // Copy temporary values into output vector.
04462         Y_i[0] = tmp[0];
04463         Y_i[colStrideY] = tmp[1];
04464         Y_i[2*colStrideY] = tmp[2];
04465       }
04466     }
04467     else if (beta == STS::zero()) {
04468       #pragma omp parallel for
04469       for (Ordinal i = 0; i < numRows; ++i) {
04470         // Initialize temporary values to 0.
04471         RangeScalar* const Y_i = &Y[i];
04472         RangeScalar tmp[3] = {STS::zero(), STS::zero(), STS::zero()};
04473 
04474         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04475           const MatrixScalar A_ij = val[k];
04476           const Ordinal j = ind[k];
04477           const DomainScalar* const X_j = &X[j];
04478           tmp[0] += A_ij * X_j[0];
04479           tmp[1] += A_ij * X_j[colStrideX];
04480           tmp[2] += A_ij * X_j[2*colStrideX];
04481         }
04482         // Copy temporary values into output vector.
04483         Y_i[0] = tmp[0];
04484         Y_i[colStrideY] = tmp[1];
04485         Y_i[2*colStrideY] = tmp[2];
04486       }
04487     }
04488     else if (beta == STS::one()) {
04489       #pragma omp parallel for
04490       for (Ordinal i = 0; i < numRows; ++i) {
04491         // Initialize temporary values to Y(i,:).
04492         RangeScalar* const Y_i = &Y[i];
04493         RangeScalar tmp[3] = {Y_i[0], Y_i[colStrideY], Y_i[2*colStrideY]};
04494 
04495         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04496           const MatrixScalar A_ij = val[k];
04497           const Ordinal j = ind[k];
04498           const DomainScalar* const X_j = &X[j];
04499           tmp[0] += A_ij * X_j[0];
04500           tmp[1] += A_ij * X_j[colStrideX];
04501           tmp[2] += A_ij * X_j[2*colStrideX];
04502         }
04503         // Copy temporary values into output vector.
04504         Y_i[0] = tmp[0];
04505         Y_i[colStrideY] = tmp[1];
04506         Y_i[2*colStrideY] = tmp[2];
04507       }
04508     }
04509     else { // beta != -1 && beta != 0 && beta != 1
04510       #pragma omp parallel for
04511       for (Ordinal i = 0; i < numRows; ++i) {
04512         // Initialize temporary values to Y(i,:) * beta.
04513         RangeScalar* const Y_i = &Y[i];
04514         RangeScalar tmp[3] = {beta * Y_i[0], beta * Y_i[colStrideY], beta * Y_i[2*colStrideY]};
04515 
04516         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04517           const MatrixScalar A_ij = val[k];
04518           const Ordinal j = ind[k];
04519           const DomainScalar* const X_j = &X[j];
04520           tmp[0] += A_ij * X_j[0];
04521           tmp[1] += A_ij * X_j[colStrideX];
04522           tmp[2] += A_ij * X_j[2*colStrideX];
04523         }
04524         // Copy temporary values into output vector.
04525         Y_i[0] = tmp[0];
04526         Y_i[colStrideY] = tmp[1];
04527         Y_i[2*colStrideY] = tmp[2];
04528       }
04529     }
04530   }
04531   else if (alpha == -STS::one()) {
04532     if (beta == -STS::one()) {
04533       #pragma omp parallel for
04534       for (Ordinal i = 0; i < numRows; ++i) {
04535         // Initialize temporary values to -Y(i,:).
04536         RangeScalar* const Y_i = &Y[i];
04537         RangeScalar tmp[3] = {-Y_i[0], -Y_i[colStrideY], -Y_i[2*colStrideY]};
04538 
04539         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04540           const MatrixScalar A_ij = val[k];
04541           const Ordinal j = ind[k];
04542           const DomainScalar* const X_j = &X[j];
04543           tmp[0] -= A_ij * X_j[0];
04544           tmp[1] -= A_ij * X_j[colStrideX];
04545           tmp[2] -= A_ij * X_j[2*colStrideX];
04546         }
04547         // Copy temporary values into output vector.
04548         Y_i[0] = tmp[0];
04549         Y_i[colStrideY] = tmp[1];
04550         Y_i[2*colStrideY] = tmp[2];
04551       }
04552     }
04553     else if (beta == STS::zero()) {
04554       #pragma omp parallel for
04555       for (Ordinal i = 0; i < numRows; ++i) {
04556         // Initialize temporary values to 0.
04557         RangeScalar* const Y_i = &Y[i];
04558         RangeScalar tmp[3] = {STS::zero(), STS::zero(), STS::zero()};
04559 
04560         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04561           const MatrixScalar A_ij = val[k];
04562           const Ordinal j = ind[k];
04563           const DomainScalar* const X_j = &X[j];
04564           tmp[0] -= A_ij * X_j[0];
04565           tmp[1] -= A_ij * X_j[colStrideX];
04566           tmp[2] -= A_ij * X_j[2*colStrideX];
04567         }
04568         // Copy temporary values into output vector.
04569         Y_i[0] = tmp[0];
04570         Y_i[colStrideY] = tmp[1];
04571         Y_i[2*colStrideY] = tmp[2];
04572       }
04573     }
04574     else if (beta == STS::one()) {
04575       #pragma omp parallel for
04576       for (Ordinal i = 0; i < numRows; ++i) {
04577         // Initialize temporary values to Y(i,:).
04578         RangeScalar* const Y_i = &Y[i];
04579         RangeScalar tmp[3] = {Y_i[0], Y_i[colStrideY], Y_i[2*colStrideY]};
04580 
04581         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04582           const MatrixScalar A_ij = val[k];
04583           const Ordinal j = ind[k];
04584           const DomainScalar* const X_j = &X[j];
04585           tmp[0] -= A_ij * X_j[0];
04586           tmp[1] -= A_ij * X_j[colStrideX];
04587           tmp[2] -= A_ij * X_j[2*colStrideX];
04588         }
04589         // Copy temporary values into output vector.
04590         Y_i[0] = tmp[0];
04591         Y_i[colStrideY] = tmp[1];
04592         Y_i[2*colStrideY] = tmp[2];
04593       }
04594     }
04595     else { // beta != -1 && beta != 0 && beta != 1
04596       #pragma omp parallel for
04597       for (Ordinal i = 0; i < numRows; ++i) {
04598         // Initialize temporary values to Y(i,:) * beta.
04599         RangeScalar* const Y_i = &Y[i];
04600         RangeScalar tmp[3] = {beta * Y_i[0], beta * Y_i[colStrideY], beta * Y_i[2*colStrideY]};
04601 
04602         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04603           const MatrixScalar A_ij = val[k];
04604           const Ordinal j = ind[k];
04605           const DomainScalar* const X_j = &X[j];
04606           tmp[0] -= A_ij * X_j[0];
04607           tmp[1] -= A_ij * X_j[colStrideX];
04608           tmp[2] -= A_ij * X_j[2*colStrideX];
04609         }
04610         // Copy temporary values into output vector.
04611         Y_i[0] = tmp[0];
04612         Y_i[colStrideY] = tmp[1];
04613         Y_i[2*colStrideY] = tmp[2];
04614       }
04615     }
04616   }
04617   else { // alpha != 1 && alpha != -1
04618     if (beta == -STS::one()) {
04619       #pragma omp parallel for
04620       for (Ordinal i = 0; i < numRows; ++i) {
04621         // Initialize temporary values to -Y(i,:).
04622         RangeScalar* const Y_i = &Y[i];
04623         RangeScalar tmp[3] = {-Y_i[0], -Y_i[colStrideY], -Y_i[2*colStrideY]};
04624 
04625         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04626           const MatrixScalar A_ij = val[k];
04627           const Ordinal j = ind[k];
04628           const DomainScalar* const X_j = &X[j];
04629           tmp[0] += alpha * A_ij * X_j[0];
04630           tmp[1] += alpha * A_ij * X_j[colStrideX];
04631           tmp[2] += alpha * A_ij * X_j[2*colStrideX];
04632         }
04633         // Copy temporary values into output vector.
04634         Y_i[0] = tmp[0];
04635         Y_i[colStrideY] = tmp[1];
04636         Y_i[2*colStrideY] = tmp[2];
04637       }
04638     }
04639     else if (beta == STS::zero()) {
04640       #pragma omp parallel for
04641       for (Ordinal i = 0; i < numRows; ++i) {
04642         // Initialize temporary values to 0.
04643         RangeScalar* const Y_i = &Y[i];
04644         RangeScalar tmp[3] = {STS::zero(), STS::zero(), STS::zero()};
04645 
04646         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04647           const MatrixScalar A_ij = val[k];
04648           const Ordinal j = ind[k];
04649           const DomainScalar* const X_j = &X[j];
04650           tmp[0] += alpha * A_ij * X_j[0];
04651           tmp[1] += alpha * A_ij * X_j[colStrideX];
04652           tmp[2] += alpha * A_ij * X_j[2*colStrideX];
04653         }
04654         // Copy temporary values into output vector.
04655         Y_i[0] = tmp[0];
04656         Y_i[colStrideY] = tmp[1];
04657         Y_i[2*colStrideY] = tmp[2];
04658       }
04659     }
04660     else if (beta == STS::one()) {
04661       #pragma omp parallel for
04662       for (Ordinal i = 0; i < numRows; ++i) {
04663         // Initialize temporary values to Y(i,:).
04664         RangeScalar* const Y_i = &Y[i];
04665         RangeScalar tmp[3] = {Y_i[0], Y_i[colStrideY], Y_i[2*colStrideY]};
04666 
04667         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04668           const MatrixScalar A_ij = val[k];
04669           const Ordinal j = ind[k];
04670           const DomainScalar* const X_j = &X[j];
04671           tmp[0] += alpha * A_ij * X_j[0];
04672           tmp[1] += alpha * A_ij * X_j[colStrideX];
04673           tmp[2] += alpha * A_ij * X_j[2*colStrideX];
04674         }
04675         // Copy temporary values into output vector.
04676         Y_i[0] = tmp[0];
04677         Y_i[colStrideY] = tmp[1];
04678         Y_i[2*colStrideY] = tmp[2];
04679       }
04680     }
04681     else { // beta != -1 && beta != 0 && beta != 1
04682       #pragma omp parallel for
04683       for (Ordinal i = 0; i < numRows; ++i) {
04684         // Initialize temporary values to Y(i,:) * beta.
04685         RangeScalar* const Y_i = &Y[i];
04686         RangeScalar tmp[3] = {beta * Y_i[0], beta * Y_i[colStrideY], beta * Y_i[2*colStrideY]};
04687 
04688         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04689           const MatrixScalar A_ij = val[k];
04690           const Ordinal j = ind[k];
04691           const DomainScalar* const X_j = &X[j];
04692           tmp[0] += alpha * A_ij * X_j[0];
04693           tmp[1] += alpha * A_ij * X_j[colStrideX];
04694           tmp[2] += alpha * A_ij * X_j[2*colStrideX];
04695         }
04696         // Copy temporary values into output vector.
04697         Y_i[0] = tmp[0];
04698         Y_i[colStrideY] = tmp[1];
04699         Y_i[2*colStrideY] = tmp[2];
04700       }
04701     }
04702   }
04703 }
04704 
04705 template<class Ordinal,
04706          class MatrixScalar,
04707          class DomainScalar,
04708          class RangeScalar>
04709 void
04710 matVecCsrColMajorForfor4Vec (
04711   const Ordinal numRows,
04712   const Ordinal numCols,
04713   const Ordinal numVecs,
04714   const RangeScalar& beta,
04715   RangeScalar Y[],
04716   const Ordinal colStrideY,
04717   const RangeScalar& alpha,
04718   const size_t  ptr[],
04719   const Ordinal ind[],
04720   const MatrixScalar val[],
04721   const DomainScalar X[],
04722   const Ordinal colStrideX)
04723 {
04724   typedef Teuchos::ScalarTraits<RangeScalar> STS;
04725 
04726   // With CSR for alpha == 0, scale Y by beta and return.
04727   if (alpha == STS::zero()) {
04728     // Prescale: Y := beta * Y.
04729     if (beta == STS::zero()) {
04730       for (Ordinal j = 0; j < numVecs; ++j) {
04731         RangeScalar* const Y_j = &Y[j*colStrideY];
04732         for (Ordinal i = 0; i < numRows; ++i) {
04733           // Follow the Sparse BLAS convention for beta == 0. 
04734           Y_j[i] = STS::zero();
04735         }
04736       }
04737     }
04738     else if (beta != STS::one()) {
04739       for (Ordinal j = 0; j < numVecs; ++j) {
04740         RangeScalar* const Y_j = &Y[j*colStrideY];
04741         for (Ordinal i = 0; i < numRows; ++i) {
04742           Y_j[i] = beta * Y_j[i];
04743         }
04744       }
04745     }
04746     return; // Our work is done!
04747   }
04748   if (alpha == STS::one()) {
04749     if (beta == -STS::one()) {
04750       for (Ordinal i = 0; i < numRows; ++i) {
04751         // Initialize temporary values to -Y(i,:).
04752         RangeScalar* const Y_i = &Y[i];
04753         RangeScalar tmp[4] = {-Y_i[0], -Y_i[colStrideY], -Y_i[2*colStrideY], -Y_i[3*colStrideY]};
04754 
04755         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04756           const MatrixScalar A_ij = val[k];
04757           const Ordinal j = ind[k];
04758           const DomainScalar* const X_j = &X[j];
04759           tmp[0] += A_ij * X_j[0];
04760           tmp[1] += A_ij * X_j[colStrideX];
04761           tmp[2] += A_ij * X_j[2*colStrideX];
04762           tmp[3] += A_ij * X_j[3*colStrideX];
04763         }
04764         // Copy temporary values into output vector.
04765         Y_i[0] = tmp[0];
04766         Y_i[colStrideY] = tmp[1];
04767         Y_i[2*colStrideY] = tmp[2];
04768         Y_i[3*colStrideY] = tmp[3];
04769       }
04770     }
04771     else if (beta == STS::zero()) {
04772       for (Ordinal i = 0; i < numRows; ++i) {
04773         // Initialize temporary values to 0.
04774         RangeScalar* const Y_i = &Y[i];
04775         RangeScalar tmp[4] = {STS::zero(), STS::zero(), STS::zero(), STS::zero()};
04776 
04777         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04778           const MatrixScalar A_ij = val[k];
04779           const Ordinal j = ind[k];
04780           const DomainScalar* const X_j = &X[j];
04781           tmp[0] += A_ij * X_j[0];
04782           tmp[1] += A_ij * X_j[colStrideX];
04783           tmp[2] += A_ij * X_j[2*colStrideX];
04784           tmp[3] += A_ij * X_j[3*colStrideX];
04785         }
04786         // Copy temporary values into output vector.
04787         Y_i[0] = tmp[0];
04788         Y_i[colStrideY] = tmp[1];
04789         Y_i[2*colStrideY] = tmp[2];
04790         Y_i[3*colStrideY] = tmp[3];
04791       }
04792     }
04793     else if (beta == STS::one()) {
04794       for (Ordinal i = 0; i < numRows; ++i) {
04795         // Initialize temporary values to Y(i,:).
04796         RangeScalar* const Y_i = &Y[i];
04797         RangeScalar tmp[4] = {Y_i[0], Y_i[colStrideY], Y_i[2*colStrideY], Y_i[3*colStrideY]};
04798 
04799         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04800           const MatrixScalar A_ij = val[k];
04801           const Ordinal j = ind[k];
04802           const DomainScalar* const X_j = &X[j];
04803           tmp[0] += A_ij * X_j[0];
04804           tmp[1] += A_ij * X_j[colStrideX];
04805           tmp[2] += A_ij * X_j[2*colStrideX];
04806           tmp[3] += A_ij * X_j[3*colStrideX];
04807         }
04808         // Copy temporary values into output vector.
04809         Y_i[0] = tmp[0];
04810         Y_i[colStrideY] = tmp[1];
04811         Y_i[2*colStrideY] = tmp[2];
04812         Y_i[3*colStrideY] = tmp[3];
04813       }
04814     }
04815     else { // beta != -1 && beta != 0 && beta != 1
04816       for (Ordinal i = 0; i < numRows; ++i) {
04817         // Initialize temporary values to Y(i,:) * beta.
04818         RangeScalar* const Y_i = &Y[i];
04819         RangeScalar tmp[4] = {beta * Y_i[0], beta * Y_i[colStrideY], beta * Y_i[2*colStrideY], beta * Y_i[3*colStrideY]};
04820 
04821         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04822           const MatrixScalar A_ij = val[k];
04823           const Ordinal j = ind[k];
04824           const DomainScalar* const X_j = &X[j];
04825           tmp[0] += A_ij * X_j[0];
04826           tmp[1] += A_ij * X_j[colStrideX];
04827           tmp[2] += A_ij * X_j[2*colStrideX];
04828           tmp[3] += A_ij * X_j[3*colStrideX];
04829         }
04830         // Copy temporary values into output vector.
04831         Y_i[0] = tmp[0];
04832         Y_i[colStrideY] = tmp[1];
04833         Y_i[2*colStrideY] = tmp[2];
04834         Y_i[3*colStrideY] = tmp[3];
04835       }
04836     }
04837   }
04838   else if (alpha == -STS::one()) {
04839     if (beta == -STS::one()) {
04840       for (Ordinal i = 0; i < numRows; ++i) {
04841         // Initialize temporary values to -Y(i,:).
04842         RangeScalar* const Y_i = &Y[i];
04843         RangeScalar tmp[4] = {-Y_i[0], -Y_i[colStrideY], -Y_i[2*colStrideY], -Y_i[3*colStrideY]};
04844 
04845         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04846           const MatrixScalar A_ij = val[k];
04847           const Ordinal j = ind[k];
04848           const DomainScalar* const X_j = &X[j];
04849           tmp[0] -= A_ij * X_j[0];
04850           tmp[1] -= A_ij * X_j[colStrideX];
04851           tmp[2] -= A_ij * X_j[2*colStrideX];
04852           tmp[3] -= A_ij * X_j[3*colStrideX];
04853         }
04854         // Copy temporary values into output vector.
04855         Y_i[0] = tmp[0];
04856         Y_i[colStrideY] = tmp[1];
04857         Y_i[2*colStrideY] = tmp[2];
04858         Y_i[3*colStrideY] = tmp[3];
04859       }
04860     }
04861     else if (beta == STS::zero()) {
04862       for (Ordinal i = 0; i < numRows; ++i) {
04863         // Initialize temporary values to 0.
04864         RangeScalar* const Y_i = &Y[i];
04865         RangeScalar tmp[4] = {STS::zero(), STS::zero(), STS::zero(), STS::zero()};
04866 
04867         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04868           const MatrixScalar A_ij = val[k];
04869           const Ordinal j = ind[k];
04870           const DomainScalar* const X_j = &X[j];
04871           tmp[0] -= A_ij * X_j[0];
04872           tmp[1] -= A_ij * X_j[colStrideX];
04873           tmp[2] -= A_ij * X_j[2*colStrideX];
04874           tmp[3] -= A_ij * X_j[3*colStrideX];
04875         }
04876         // Copy temporary values into output vector.
04877         Y_i[0] = tmp[0];
04878         Y_i[colStrideY] = tmp[1];
04879         Y_i[2*colStrideY] = tmp[2];
04880         Y_i[3*colStrideY] = tmp[3];
04881       }
04882     }
04883     else if (beta == STS::one()) {
04884       for (Ordinal i = 0; i < numRows; ++i) {
04885         // Initialize temporary values to Y(i,:).
04886         RangeScalar* const Y_i = &Y[i];
04887         RangeScalar tmp[4] = {Y_i[0], Y_i[colStrideY], Y_i[2*colStrideY], Y_i[3*colStrideY]};
04888 
04889         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04890           const MatrixScalar A_ij = val[k];
04891           const Ordinal j = ind[k];
04892           const DomainScalar* const X_j = &X[j];
04893           tmp[0] -= A_ij * X_j[0];
04894           tmp[1] -= A_ij * X_j[colStrideX];
04895           tmp[2] -= A_ij * X_j[2*colStrideX];
04896           tmp[3] -= A_ij * X_j[3*colStrideX];
04897         }
04898         // Copy temporary values into output vector.
04899         Y_i[0] = tmp[0];
04900         Y_i[colStrideY] = tmp[1];
04901         Y_i[2*colStrideY] = tmp[2];
04902         Y_i[3*colStrideY] = tmp[3];
04903       }
04904     }
04905     else { // beta != -1 && beta != 0 && beta != 1
04906       for (Ordinal i = 0; i < numRows; ++i) {
04907         // Initialize temporary values to Y(i,:) * beta.
04908         RangeScalar* const Y_i = &Y[i];
04909         RangeScalar tmp[4] = {beta * Y_i[0], beta * Y_i[colStrideY], beta * Y_i[2*colStrideY], beta * Y_i[3*colStrideY]};
04910 
04911         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04912           const MatrixScalar A_ij = val[k];
04913           const Ordinal j = ind[k];
04914           const DomainScalar* const X_j = &X[j];
04915           tmp[0] -= A_ij * X_j[0];
04916           tmp[1] -= A_ij * X_j[colStrideX];
04917           tmp[2] -= A_ij * X_j[2*colStrideX];
04918           tmp[3] -= A_ij * X_j[3*colStrideX];
04919         }
04920         // Copy temporary values into output vector.
04921         Y_i[0] = tmp[0];
04922         Y_i[colStrideY] = tmp[1];
04923         Y_i[2*colStrideY] = tmp[2];
04924         Y_i[3*colStrideY] = tmp[3];
04925       }
04926     }
04927   }
04928   else { // alpha != 1 && alpha != -1
04929     if (beta == -STS::one()) {
04930       for (Ordinal i = 0; i < numRows; ++i) {
04931         // Initialize temporary values to -Y(i,:).
04932         RangeScalar* const Y_i = &Y[i];
04933         RangeScalar tmp[4] = {-Y_i[0], -Y_i[colStrideY], -Y_i[2*colStrideY], -Y_i[3*colStrideY]};
04934 
04935         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04936           const MatrixScalar A_ij = val[k];
04937           const Ordinal j = ind[k];
04938           const DomainScalar* const X_j = &X[j];
04939           tmp[0] += alpha * A_ij * X_j[0];
04940           tmp[1] += alpha * A_ij * X_j[colStrideX];
04941           tmp[2] += alpha * A_ij * X_j[2*colStrideX];
04942           tmp[3] += alpha * A_ij * X_j[3*colStrideX];
04943         }
04944         // Copy temporary values into output vector.
04945         Y_i[0] = tmp[0];
04946         Y_i[colStrideY] = tmp[1];
04947         Y_i[2*colStrideY] = tmp[2];
04948         Y_i[3*colStrideY] = tmp[3];
04949       }
04950     }
04951     else if (beta == STS::zero()) {
04952       for (Ordinal i = 0; i < numRows; ++i) {
04953         // Initialize temporary values to 0.
04954         RangeScalar* const Y_i = &Y[i];
04955         RangeScalar tmp[4] = {STS::zero(), STS::zero(), STS::zero(), STS::zero()};
04956 
04957         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04958           const MatrixScalar A_ij = val[k];
04959           const Ordinal j = ind[k];
04960           const DomainScalar* const X_j = &X[j];
04961           tmp[0] += alpha * A_ij * X_j[0];
04962           tmp[1] += alpha * A_ij * X_j[colStrideX];
04963           tmp[2] += alpha * A_ij * X_j[2*colStrideX];
04964           tmp[3] += alpha * A_ij * X_j[3*colStrideX];
04965         }
04966         // Copy temporary values into output vector.
04967         Y_i[0] = tmp[0];
04968         Y_i[colStrideY] = tmp[1];
04969         Y_i[2*colStrideY] = tmp[2];
04970         Y_i[3*colStrideY] = tmp[3];
04971       }
04972     }
04973     else if (beta == STS::one()) {
04974       for (Ordinal i = 0; i < numRows; ++i) {
04975         // Initialize temporary values to Y(i,:).
04976         RangeScalar* const Y_i = &Y[i];
04977         RangeScalar tmp[4] = {Y_i[0], Y_i[colStrideY], Y_i[2*colStrideY], Y_i[3*colStrideY]};
04978 
04979         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
04980           const MatrixScalar A_ij = val[k];
04981           const Ordinal j = ind[k];
04982           const DomainScalar* const X_j = &X[j];
04983           tmp[0] += alpha * A_ij * X_j[0];
04984           tmp[1] += alpha * A_ij * X_j[colStrideX];
04985           tmp[2] += alpha * A_ij * X_j[2*colStrideX];
04986           tmp[3] += alpha * A_ij * X_j[3*colStrideX];
04987         }
04988         // Copy temporary values into output vector.
04989         Y_i[0] = tmp[0];
04990         Y_i[colStrideY] = tmp[1];
04991         Y_i[2*colStrideY] = tmp[2];
04992         Y_i[3*colStrideY] = tmp[3];
04993       }
04994     }
04995     else { // beta != -1 && beta != 0 && beta != 1
04996       for (Ordinal i = 0; i < numRows; ++i) {
04997         // Initialize temporary values to Y(i,:) * beta.
04998         RangeScalar* const Y_i = &Y[i];
04999         RangeScalar tmp[4] = {beta * Y_i[0], beta * Y_i[colStrideY], beta * Y_i[2*colStrideY], beta * Y_i[3*colStrideY]};
05000 
05001         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
05002           const MatrixScalar A_ij = val[k];
05003           const Ordinal j = ind[k];
05004           const DomainScalar* const X_j = &X[j];
05005           tmp[0] += alpha * A_ij * X_j[0];
05006           tmp[1] += alpha * A_ij * X_j[colStrideX];
05007           tmp[2] += alpha * A_ij * X_j[2*colStrideX];
05008           tmp[3] += alpha * A_ij * X_j[3*colStrideX];
05009         }
05010         // Copy temporary values into output vector.
05011         Y_i[0] = tmp[0];
05012         Y_i[colStrideY] = tmp[1];
05013         Y_i[2*colStrideY] = tmp[2];
05014         Y_i[3*colStrideY] = tmp[3];
05015       }
05016     }
05017   }
05018 }
05019 
05020 template<class Ordinal,
05021          class MatrixScalar,
05022          class DomainScalar,
05023          class RangeScalar>
05024 void
05025 matVecCsrColMajorForfor4VecOmp (
05026   const Ordinal numRows,
05027   const Ordinal numCols,
05028   const Ordinal numVecs,
05029   const RangeScalar& beta,
05030   RangeScalar Y[],
05031   const Ordinal colStrideY,
05032   const RangeScalar& alpha,
05033   const size_t  ptr[],
05034   const Ordinal ind[],
05035   const MatrixScalar val[],
05036   const DomainScalar X[],
05037   const Ordinal colStrideX)
05038 {
05039   typedef Teuchos::ScalarTraits<RangeScalar> STS;
05040 
05041   // With CSR for alpha == 0, scale Y by beta and return.
05042   if (alpha == STS::zero()) {
05043     // Prescale: Y := beta * Y.
05044     if (beta == STS::zero()) {
05045       for (Ordinal j = 0; j < numVecs; ++j) {
05046         RangeScalar* const Y_j = &Y[j*colStrideY];
05047         #pragma omp parallel for
05048         for (Ordinal i = 0; i < numRows; ++i) {
05049           // Follow the Sparse BLAS convention for beta == 0. 
05050           Y_j[i] = STS::zero();
05051         }
05052       }
05053     }
05054     else if (beta != STS::one()) {
05055       for (Ordinal j = 0; j < numVecs; ++j) {
05056         RangeScalar* const Y_j = &Y[j*colStrideY];
05057         #pragma omp parallel for
05058         for (Ordinal i = 0; i < numRows; ++i) {
05059           Y_j[i] = beta * Y_j[i];
05060         }
05061       }
05062     }
05063     return; // Our work is done!
05064   }
05065   if (alpha == STS::one()) {
05066     if (beta == -STS::one()) {
05067       #pragma omp parallel for
05068       for (Ordinal i = 0; i < numRows; ++i) {
05069         // Initialize temporary values to -Y(i,:).
05070         RangeScalar* const Y_i = &Y[i];
05071         RangeScalar tmp[4] = {-Y_i[0], -Y_i[colStrideY], -Y_i[2*colStrideY], -Y_i[3*colStrideY]};
05072 
05073         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
05074           const MatrixScalar A_ij = val[k];
05075           const Ordinal j = ind[k];
05076           const DomainScalar* const X_j = &X[j];
05077           tmp[0] += A_ij * X_j[0];
05078           tmp[1] += A_ij * X_j[colStrideX];
05079           tmp[2] += A_ij * X_j[2*colStrideX];
05080           tmp[3] += A_ij * X_j[3*colStrideX];
05081         }
05082         // Copy temporary values into output vector.
05083         Y_i[0] = tmp[0];
05084         Y_i[colStrideY] = tmp[1];
05085         Y_i[2*colStrideY] = tmp[2];
05086         Y_i[3*colStrideY] = tmp[3];
05087       }
05088     }
05089     else if (beta == STS::zero()) {
05090       #pragma omp parallel for
05091       for (Ordinal i = 0; i < numRows; ++i) {
05092         // Initialize temporary values to 0.
05093         RangeScalar* const Y_i = &Y[i];
05094         RangeScalar tmp[4] = {STS::zero(), STS::zero(), STS::zero(), STS::zero()};
05095 
05096         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
05097           const MatrixScalar A_ij = val[k];
05098           const Ordinal j = ind[k];
05099           const DomainScalar* const X_j = &X[j];
05100           tmp[0] += A_ij * X_j[0];
05101           tmp[1] += A_ij * X_j[colStrideX];
05102           tmp[2] += A_ij * X_j[2*colStrideX];
05103           tmp[3] += A_ij * X_j[3*colStrideX];
05104         }
05105         // Copy temporary values into output vector.
05106         Y_i[0] = tmp[0];
05107         Y_i[colStrideY] = tmp[1];
05108         Y_i[2*colStrideY] = tmp[2];
05109         Y_i[3*colStrideY] = tmp[3];
05110       }
05111     }
05112     else if (beta == STS::one()) {
05113       #pragma omp parallel for
05114       for (Ordinal i = 0; i < numRows; ++i) {
05115         // Initialize temporary values to Y(i,:).
05116         RangeScalar* const Y_i = &Y[i];
05117         RangeScalar tmp[4] = {Y_i[0], Y_i[colStrideY], Y_i[2*colStrideY], Y_i[3*colStrideY]};
05118 
05119         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
05120           const MatrixScalar A_ij = val[k];
05121           const Ordinal j = ind[k];
05122           const DomainScalar* const X_j = &X[j];
05123           tmp[0] += A_ij * X_j[0];
05124           tmp[1] += A_ij * X_j[colStrideX];
05125           tmp[2] += A_ij * X_j[2*colStrideX];
05126           tmp[3] += A_ij * X_j[3*colStrideX];
05127         }
05128         // Copy temporary values into output vector.
05129         Y_i[0] = tmp[0];
05130         Y_i[colStrideY] = tmp[1];
05131         Y_i[2*colStrideY] = tmp[2];
05132         Y_i[3*colStrideY] = tmp[3];
05133       }
05134     }
05135     else { // beta != -1 && beta != 0 && beta != 1
05136       #pragma omp parallel for
05137       for (Ordinal i = 0; i < numRows; ++i) {
05138         // Initialize temporary values to Y(i,:) * beta.
05139         RangeScalar* const Y_i = &Y[i];
05140         RangeScalar tmp[4] = {beta * Y_i[0], beta * Y_i[colStrideY], beta * Y_i[2*colStrideY], beta * Y_i[3*colStrideY]};
05141 
05142         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
05143           const MatrixScalar A_ij = val[k];
05144           const Ordinal j = ind[k];
05145           const DomainScalar* const X_j = &X[j];
05146           tmp[0] += A_ij * X_j[0];
05147           tmp[1] += A_ij * X_j[colStrideX];
05148           tmp[2] += A_ij * X_j[2*colStrideX];
05149           tmp[3] += A_ij * X_j[3*colStrideX];
05150         }
05151         // Copy temporary values into output vector.
05152         Y_i[0] = tmp[0];
05153         Y_i[colStrideY] = tmp[1];
05154         Y_i[2*colStrideY] = tmp[2];
05155         Y_i[3*colStrideY] = tmp[3];
05156       }
05157     }
05158   }
05159   else if (alpha == -STS::one()) {
05160     if (beta == -STS::one()) {
05161       #pragma omp parallel for
05162       for (Ordinal i = 0; i < numRows; ++i) {
05163         // Initialize temporary values to -Y(i,:).
05164         RangeScalar* const Y_i = &Y[i];
05165         RangeScalar tmp[4] = {-Y_i[0], -Y_i[colStrideY], -Y_i[2*colStrideY], -Y_i[3*colStrideY]};
05166 
05167         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
05168           const MatrixScalar A_ij = val[k];
05169           const Ordinal j = ind[k];
05170           const DomainScalar* const X_j = &X[j];
05171           tmp[0] -= A_ij * X_j[0];
05172           tmp[1] -= A_ij * X_j[colStrideX];
05173           tmp[2] -= A_ij * X_j[2*colStrideX];
05174           tmp[3] -= A_ij * X_j[3*colStrideX];
05175         }
05176         // Copy temporary values into output vector.
05177         Y_i[0] = tmp[0];
05178         Y_i[colStrideY] = tmp[1];
05179         Y_i[2*colStrideY] = tmp[2];
05180         Y_i[3*colStrideY] = tmp[3];
05181       }
05182     }
05183     else if (beta == STS::zero()) {
05184       #pragma omp parallel for
05185       for (Ordinal i = 0; i < numRows; ++i) {
05186         // Initialize temporary values to 0.
05187         RangeScalar* const Y_i = &Y[i];
05188         RangeScalar tmp[4] = {STS::zero(), STS::zero(), STS::zero(), STS::zero()};
05189 
05190         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
05191           const MatrixScalar A_ij = val[k];
05192           const Ordinal j = ind[k];
05193           const DomainScalar* const X_j = &X[j];
05194           tmp[0] -= A_ij * X_j[0];
05195           tmp[1] -= A_ij * X_j[colStrideX];
05196           tmp[2] -= A_ij * X_j[2*colStrideX];
05197           tmp[3] -= A_ij * X_j[3*colStrideX];
05198         }
05199         // Copy temporary values into output vector.
05200         Y_i[0] = tmp[0];
05201         Y_i[colStrideY] = tmp[1];
05202         Y_i[2*colStrideY] = tmp[2];
05203         Y_i[3*colStrideY] = tmp[3];
05204       }
05205     }
05206     else if (beta == STS::one()) {
05207       #pragma omp parallel for
05208       for (Ordinal i = 0; i < numRows; ++i) {
05209         // Initialize temporary values to Y(i,:).
05210         RangeScalar* const Y_i = &Y[i];
05211         RangeScalar tmp[4] = {Y_i[0], Y_i[colStrideY], Y_i[2*colStrideY], Y_i[3*colStrideY]};
05212 
05213         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
05214           const MatrixScalar A_ij = val[k];
05215           const Ordinal j = ind[k];
05216           const DomainScalar* const X_j = &X[j];
05217           tmp[0] -= A_ij * X_j[0];
05218           tmp[1] -= A_ij * X_j[colStrideX];
05219           tmp[2] -= A_ij * X_j[2*colStrideX];
05220           tmp[3] -= A_ij * X_j[3*colStrideX];
05221         }
05222         // Copy temporary values into output vector.
05223         Y_i[0] = tmp[0];
05224         Y_i[colStrideY] = tmp[1];
05225         Y_i[2*colStrideY] = tmp[2];
05226         Y_i[3*colStrideY] = tmp[3];
05227       }
05228     }
05229     else { // beta != -1 && beta != 0 && beta != 1
05230       #pragma omp parallel for
05231       for (Ordinal i = 0; i < numRows; ++i) {
05232         // Initialize temporary values to Y(i,:) * beta.
05233         RangeScalar* const Y_i = &Y[i];
05234         RangeScalar tmp[4] = {beta * Y_i[0], beta * Y_i[colStrideY], beta * Y_i[2*colStrideY], beta * Y_i[3*colStrideY]};
05235 
05236         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
05237           const MatrixScalar A_ij = val[k];
05238           const Ordinal j = ind[k];
05239           const DomainScalar* const X_j = &X[j];
05240           tmp[0] -= A_ij * X_j[0];
05241           tmp[1] -= A_ij * X_j[colStrideX];
05242           tmp[2] -= A_ij * X_j[2*colStrideX];
05243           tmp[3] -= A_ij * X_j[3*colStrideX];
05244         }
05245         // Copy temporary values into output vector.
05246         Y_i[0] = tmp[0];
05247         Y_i[colStrideY] = tmp[1];
05248         Y_i[2*colStrideY] = tmp[2];
05249         Y_i[3*colStrideY] = tmp[3];
05250       }
05251     }
05252   }
05253   else { // alpha != 1 && alpha != -1
05254     if (beta == -STS::one()) {
05255       #pragma omp parallel for
05256       for (Ordinal i = 0; i < numRows; ++i) {
05257         // Initialize temporary values to -Y(i,:).
05258         RangeScalar* const Y_i = &Y[i];
05259         RangeScalar tmp[4] = {-Y_i[0], -Y_i[colStrideY], -Y_i[2*colStrideY], -Y_i[3*colStrideY]};
05260 
05261         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
05262           const MatrixScalar A_ij = val[k];
05263           const Ordinal j = ind[k];
05264           const DomainScalar* const X_j = &X[j];
05265           tmp[0] += alpha * A_ij * X_j[0];
05266           tmp[1] += alpha * A_ij * X_j[colStrideX];
05267           tmp[2] += alpha * A_ij * X_j[2*colStrideX];
05268           tmp[3] += alpha * A_ij * X_j[3*colStrideX];
05269         }
05270         // Copy temporary values into output vector.
05271         Y_i[0] = tmp[0];
05272         Y_i[colStrideY] = tmp[1];
05273         Y_i[2*colStrideY] = tmp[2];
05274         Y_i[3*colStrideY] = tmp[3];
05275       }
05276     }
05277     else if (beta == STS::zero()) {
05278       #pragma omp parallel for
05279       for (Ordinal i = 0; i < numRows; ++i) {
05280         // Initialize temporary values to 0.
05281         RangeScalar* const Y_i = &Y[i];
05282         RangeScalar tmp[4] = {STS::zero(), STS::zero(), STS::zero(), STS::zero()};
05283 
05284         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
05285           const MatrixScalar A_ij = val[k];
05286           const Ordinal j = ind[k];
05287           const DomainScalar* const X_j = &X[j];
05288           tmp[0] += alpha * A_ij * X_j[0];
05289           tmp[1] += alpha * A_ij * X_j[colStrideX];
05290           tmp[2] += alpha * A_ij * X_j[2*colStrideX];
05291           tmp[3] += alpha * A_ij * X_j[3*colStrideX];
05292         }
05293         // Copy temporary values into output vector.
05294         Y_i[0] = tmp[0];
05295         Y_i[colStrideY] = tmp[1];
05296         Y_i[2*colStrideY] = tmp[2];
05297         Y_i[3*colStrideY] = tmp[3];
05298       }
05299     }
05300     else if (beta == STS::one()) {
05301       #pragma omp parallel for
05302       for (Ordinal i = 0; i < numRows; ++i) {
05303         // Initialize temporary values to Y(i,:).
05304         RangeScalar* const Y_i = &Y[i];
05305         RangeScalar tmp[4] = {Y_i[0], Y_i[colStrideY], Y_i[2*colStrideY], Y_i[3*colStrideY]};
05306 
05307         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
05308           const MatrixScalar A_ij = val[k];
05309           const Ordinal j = ind[k];
05310           const DomainScalar* const X_j = &X[j];
05311           tmp[0] += alpha * A_ij * X_j[0];
05312           tmp[1] += alpha * A_ij * X_j[colStrideX];
05313           tmp[2] += alpha * A_ij * X_j[2*colStrideX];
05314           tmp[3] += alpha * A_ij * X_j[3*colStrideX];
05315         }
05316         // Copy temporary values into output vector.
05317         Y_i[0] = tmp[0];
05318         Y_i[colStrideY] = tmp[1];
05319         Y_i[2*colStrideY] = tmp[2];
05320         Y_i[3*colStrideY] = tmp[3];
05321       }
05322     }
05323     else { // beta != -1 && beta != 0 && beta != 1
05324       #pragma omp parallel for
05325       for (Ordinal i = 0; i < numRows; ++i) {
05326         // Initialize temporary values to Y(i,:) * beta.
05327         RangeScalar* const Y_i = &Y[i];
05328         RangeScalar tmp[4] = {beta * Y_i[0], beta * Y_i[colStrideY], beta * Y_i[2*colStrideY], beta * Y_i[3*colStrideY]};
05329 
05330         for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
05331           const MatrixScalar A_ij = val[k];
05332           const Ordinal j = ind[k];
05333           const DomainScalar* const X_j = &X[j];
05334           tmp[0] += alpha * A_ij * X_j[0];
05335           tmp[1] += alpha * A_ij * X_j[colStrideX];
05336           tmp[2] += alpha * A_ij * X_j[2*colStrideX];
05337           tmp[3] += alpha * A_ij * X_j[3*colStrideX];
05338         }
05339         // Copy temporary values into output vector.
05340         Y_i[0] = tmp[0];
05341         Y_i[colStrideY] = tmp[1];
05342         Y_i[2*colStrideY] = tmp[2];
05343         Y_i[3*colStrideY] = tmp[3];
05344       }
05345     }
05346   }
05347 }
05348 
05349 template<class Ordinal,
05350          class MatrixScalar,
05351          class DomainScalar,
05352          class RangeScalar>
05353 void
05354 matVecCsrColMajorForwhile1Vec (
05355   const Ordinal numRows,
05356   const Ordinal numCols,
05357   const Ordinal numVecs,
05358   const RangeScalar& beta,
05359   RangeScalar Y[],
05360   const Ordinal colStrideY,
05361   const RangeScalar& alpha,
05362   const size_t  ptr[],
05363   const Ordinal ind[],
05364   const MatrixScalar val[],
05365   const DomainScalar X[],
05366   const Ordinal colStrideX)
05367 {
05368   typedef Teuchos::ScalarTraits<RangeScalar> STS;
05369 
05370   // Algorithm variants 'for-while' and 'for-if' need to set
05371   // Y(0,:) = 0, but only for the special case of CSR.
05372   if (beta != STS::zero()) {
05373     for (Ordinal c = 0; c < numVecs; ++c) {
05374       Y[c*colStrideY] = beta * Y[c*colStrideY];
05375     }
05376   }
05377   else {
05378     // Follow the Sparse BLAS convention for beta == 0. 
05379     for (Ordinal c = 0; c < numVecs; ++c) {
05380       Y[c*colStrideY] = STS::zero();
05381     }
05382   }
05383   if (alpha == STS::zero()) {
05384     // Prescale: Y := beta * Y.
05385     if (beta == STS::zero()) {
05386       for (Ordinal j = 0; j < numVecs; ++j) {
05387         RangeScalar* const Y_j = &Y[j*colStrideY];
05388         for (Ordinal i = 0; i < numRows; ++i) {
05389           // Follow the Sparse BLAS convention for beta == 0. 
05390           Y_j[i] = STS::zero();
05391         }
05392       }
05393     }
05394     else if (beta != STS::one()) {
05395       for (Ordinal j = 0; j < numVecs; ++j) {
05396         RangeScalar* const Y_j = &Y[j*colStrideY];
05397         for (Ordinal i = 0; i < numRows; ++i) {
05398           Y_j[i] = beta * Y_j[i];
05399         }
05400       }
05401     }
05402     return; // Our work is done!
05403   }
05404   const size_t nnz = ptr[numRows];
05405   if (alpha == STS::one()) {
05406     if (beta == -STS::one()) {
05407       RangeScalar tmp = Teuchos::ScalarTraits<RangeScalar>::zero();
05408       RangeScalar* Y_i = Y;
05409       Ordinal i = 0;
05410       for (size_t k = 0; k < nnz; ++k) {
05411         const MatrixScalar A_ij = val[k];
05412         const Ordinal j = ind[k];
05413         while (k >= ptr[i+1]) {
05414           // Write temp output from last iteration(s) to Y,
05415           // before incrementing the current row index.
05416           if (k > 0) {
05417             Y_i[0] = tmp;
05418           }
05419           ++i;
05420           // We haven't seen row i before; set Y(i,:) to -Y(i,:).
05421           Y_i = &Y[i];
05422           tmp = -Y[i + 0*colStrideY];
05423         }
05424         tmp += A_ij * X[j];
05425       }
05426       Y_i[0] = tmp;
05427     }
05428     else if (beta == STS::zero()) {
05429       RangeScalar tmp = Teuchos::ScalarTraits<RangeScalar>::zero();
05430       RangeScalar* Y_i = Y;
05431       Ordinal i = 0;
05432       for (size_t k = 0; k < nnz; ++k) {
05433         const MatrixScalar A_ij = val[k];
05434         const Ordinal j = ind[k];
05435         while (k >= ptr[i+1]) {
05436           // Write temp output from last iteration(s) to Y,
05437           // before incrementing the current row index.
05438           if (k > 0) {
05439             Y_i[0] = tmp;
05440           }
05441           ++i;
05442           // We haven't seen row i before; set Y(i,:) to 0.
05443           Y_i = &Y[i];
05444           tmp = STS::zero();
05445         }
05446         tmp += A_ij * X[j];
05447       }
05448       Y_i[0] = tmp;
05449     }
05450     else if (beta == STS::one()) {
05451       RangeScalar tmp = Teuchos::ScalarTraits<RangeScalar>::zero();
05452       RangeScalar* Y_i = Y;
05453       Ordinal i = 0;
05454       for (size_t k = 0; k < nnz; ++k) {
05455         const MatrixScalar A_ij = val[k];
05456         const Ordinal j = ind[k];
05457         while (k >= ptr[i+1]) {
05458           // Write temp output from last iteration(s) to Y,
05459           // before incrementing the current row index.
05460           if (k > 0) {
05461             Y_i[0] = tmp;
05462           }
05463           ++i;
05464           // We don't have to set Y(i,:) here, since beta == 1.
05465           Y_i = &Y[i];
05466           tmp = Y[i + 0*colStrideY];
05467         }
05468         tmp += A_ij * X[j];
05469       }
05470       Y_i[0] = tmp;
05471     }
05472     else { // beta != -1 && beta != 0 && beta != 1
05473       RangeScalar tmp = Teuchos::ScalarTraits<RangeScalar>::zero();
05474       RangeScalar* Y_i = Y;
05475       Ordinal i = 0;
05476       for (size_t k = 0; k < nnz; ++k) {
05477         const MatrixScalar A_ij = val[k];
05478         const Ordinal j = ind[k];
05479         while (k >= ptr[i+1]) {
05480           // Write temp output from last iteration(s) to Y,
05481           // before incrementing the current row index.
05482           if (k > 0) {
05483             Y_i[0] = tmp;
05484           }
05485           ++i;
05486           // We haven't seen row i before; scale Y(i,:) by beta.
05487           Y_i = &Y[i];
05488           tmp = beta * Y[i + 0*colStrideY];
05489         }
05490         tmp += A_ij * X[j];
05491       }
05492       Y_i[0] = tmp;
05493     }
05494   }
05495   else if (alpha == -STS::one()) {
05496     if (beta == -STS::one()) {
05497       RangeScalar tmp = Teuchos::ScalarTraits<RangeScalar>::zero();
05498       RangeScalar* Y_i = Y;
05499       Ordinal i = 0;
05500       for (size_t k = 0; k < nnz; ++k) {
05501         const MatrixScalar A_ij = val[k];
05502         const Ordinal j = ind[k];
05503         while (k >= ptr[i+1]) {
05504           // Write temp output from last iteration(s) to Y,
05505           // before incrementing the current row index.
05506           if (k > 0) {
05507             Y_i[0] = tmp;
05508           }
05509           ++i;
05510           // We haven't seen row i before; set Y(i,:) to -Y(i,:).
05511           Y_i = &Y[i];
05512           tmp = -Y[i + 0*colStrideY];
05513         }
05514         tmp -= A_ij * X[j];
05515       }
05516       Y_i[0] = tmp;
05517     }
05518     else if (beta == STS::zero()) {
05519       RangeScalar tmp = Teuchos::ScalarTraits<RangeScalar>::zero();
05520       RangeScalar* Y_i = Y;
05521       Ordinal i = 0;
05522       for (size_t k = 0; k < nnz; ++k) {
05523         const MatrixScalar A_ij = val[k];
05524         const Ordinal j = ind[k];
05525         while (k >= ptr[i+1]) {
05526           // Write temp output from last iteration(s) to Y,
05527           // before incrementing the current row index.
05528           if (k > 0) {
05529             Y_i[0] = tmp;
05530           }
05531           ++i;
05532           // We haven't seen row i before; set Y(i,:) to 0.
05533           Y_i = &Y[i];
05534           tmp = STS::zero();
05535         }
05536         tmp -= A_ij * X[j];
05537       }
05538       Y_i[0] = tmp;
05539     }
05540     else if (beta == STS::one()) {
05541       RangeScalar tmp = Teuchos::ScalarTraits<RangeScalar>::zero();
05542       RangeScalar* Y_i = Y;
05543       Ordinal i = 0;
05544       for (size_t k = 0; k < nnz; ++k) {
05545         const MatrixScalar A_ij = val[k];
05546         const Ordinal j = ind[k];
05547         while (k >= ptr[i+1]) {
05548           // Write temp output from last iteration(s) to Y,
05549           // before incrementing the current row index.
05550           if (k > 0) {
05551             Y_i[0] = tmp;
05552           }
05553           ++i;
05554           // We don't have to set Y(i,:) here, since beta == 1.
05555           Y_i = &Y[i];
05556           tmp = Y[i + 0*colStrideY];
05557         }
05558         tmp -= A_ij * X[j];
05559       }
05560       Y_i[0] = tmp;
05561     }
05562     else { // beta != -1 && beta != 0 && beta != 1
05563       RangeScalar tmp = Teuchos::ScalarTraits<RangeScalar>::zero();
05564       RangeScalar* Y_i = Y;
05565       Ordinal i = 0;
05566       for (size_t k = 0; k < nnz; ++k) {
05567         const MatrixScalar A_ij = val[k];
05568         const Ordinal j = ind[k];
05569         while (k >= ptr[i+1]) {
05570           // Write temp output from last iteration(s) to Y,
05571           // before incrementing the current row index.
05572           if (k > 0) {
05573             Y_i[0] = tmp;
05574           }
05575           ++i;
05576           // We haven't seen row i before; scale Y(i,:) by beta.
05577           Y_i = &Y[i];
05578           tmp = beta * Y[i + 0*colStrideY];
05579         }
05580         tmp -= A_ij * X[j];
05581       }
05582       Y_i[0] = tmp;
05583     }
05584   }
05585   else { // alpha != 1 && alpha != -1
05586     if (beta == -STS::one()) {
05587       RangeScalar tmp = Teuchos::ScalarTraits<RangeScalar>::zero();
05588       RangeScalar* Y_i = Y;
05589       Ordinal i = 0;
05590       for (size_t k = 0; k < nnz; ++k) {
05591         const MatrixScalar A_ij = val[k];
05592         const Ordinal j = ind[k];
05593         while (k >= ptr[i+1]) {
05594           // Write temp output from last iteration(s) to Y,
05595           // before incrementing the current row index.
05596           if (k > 0) {
05597             Y_i[0] = tmp;
05598           }
05599           ++i;
05600           // We haven't seen row i before; set Y(i,:) to -Y(i,:).
05601           Y_i = &Y[i];
05602           tmp = -Y[i + 0*colStrideY];
05603         }
05604         tmp += alpha * A_ij * X[j];
05605       }
05606       Y_i[0] = tmp;
05607     }
05608     else if (beta == STS::zero()) {
05609       RangeScalar tmp = Teuchos::ScalarTraits<RangeScalar>::zero();
05610       RangeScalar* Y_i = Y;
05611       Ordinal i = 0;
05612       for (size_t k = 0; k < nnz; ++k) {
05613         const MatrixScalar A_ij = val[k];
05614         const Ordinal j = ind[k];
05615         while (k >= ptr[i+1]) {
05616           // Write temp output from last iteration(s) to Y,
05617           // before incrementing the current row index.
05618           if (k > 0) {
05619             Y_i[0] = tmp;
05620           }
05621           ++i;
05622           // We haven't seen row i before; set Y(i,:) to 0.
05623           Y_i = &Y[i];
05624           tmp = STS::zero();
05625         }
05626         tmp += alpha * A_ij * X[j];
05627       }
05628       Y_i[0] = tmp;
05629     }
05630     else if (beta == STS::one()) {
05631       RangeScalar tmp = Teuchos::ScalarTraits<RangeScalar>::zero();
05632       RangeScalar* Y_i = Y;
05633       Ordinal i = 0;
05634       for (size_t k = 0; k < nnz; ++k) {
05635         const MatrixScalar A_ij = val[k];
05636         const Ordinal j = ind[k];
05637         while (k >= ptr[i+1]) {
05638           // Write temp output from last iteration(s) to Y,
05639           // before incrementing the current row index.
05640           if (k > 0) {
05641             Y_i[0] = tmp;
05642           }
05643           ++i;
05644           // We don't have to set Y(i,:) here, since beta == 1.
05645           Y_i = &Y[i];
05646           tmp = Y[i + 0*colStrideY];
05647         }
05648         tmp += alpha * A_ij * X[j];
05649       }
05650       Y_i[0] = tmp;
05651     }
05652     else { // beta != -1 && beta != 0 && beta != 1
05653       RangeScalar tmp = Teuchos::ScalarTraits<RangeScalar>::zero();
05654       RangeScalar* Y_i = Y;
05655       Ordinal i = 0;
05656       for (size_t k = 0; k < nnz; ++k) {
05657         const MatrixScalar A_ij = val[k];
05658         const Ordinal j = ind[k];
05659         while (k >= ptr[i+1]) {
05660           // Write temp output from last iteration(s) to Y,
05661           // before incrementing the current row index.
05662           if (k > 0) {
05663             Y_i[0] = tmp;
05664           }
05665           ++i;
05666           // We haven't seen row i before; scale Y(i,:) by beta.
05667           Y_i = &Y[i];
05668           tmp = beta * Y[i + 0*colStrideY];
05669         }
05670         tmp += alpha * A_ij * X[j];
05671       }
05672       Y_i[0] = tmp;
05673     }
05674   }
05675 }
05676 
05677 template<class Ordinal,
05678          class MatrixScalar,
05679          class DomainScalar,
05680          class RangeScalar>
05681 void
05682 matVecCsrColMajorForwhile2Vec (
05683   const Ordinal numRows,
05684   const Ordinal numCols,
05685   const Ordinal numVecs,
05686   const RangeScalar& beta,
05687   RangeScalar Y[],
05688   const Ordinal colStrideY,
05689   const RangeScalar& alpha,
05690   const size_t  ptr[],
05691   const Ordinal ind[],
05692   const MatrixScalar val[],
05693   const DomainScalar X[],
05694   const Ordinal colStrideX)
05695 {
05696   typedef Teuchos::ScalarTraits<RangeScalar> STS;
05697 
05698   // Algorithm variants 'for-while' and 'for-if' need to set
05699   // Y(0,:) = 0, but only for the special case of CSR.
05700   if (beta != STS::zero()) {
05701     for (Ordinal c = 0; c < numVecs; ++c) {
05702       Y[c*colStrideY] = beta * Y[c*colStrideY];
05703     }
05704   }
05705   else {
05706     // Follow the Sparse BLAS convention for beta == 0. 
05707     for (Ordinal c = 0; c < numVecs; ++c) {
05708       Y[c*colStrideY] = STS::zero();
05709     }
05710   }
05711   if (alpha == STS::zero()) {
05712     // Prescale: Y := beta * Y.
05713     if (beta == STS::zero()) {
05714       for (Ordinal j = 0; j < numVecs; ++j) {
05715         RangeScalar* const Y_j = &Y[j*colStrideY];
05716         for (Ordinal i = 0; i < numRows; ++i) {
05717           // Follow the Sparse BLAS convention for beta == 0. 
05718           Y_j[i] = STS::zero();
05719         }
05720       }
05721     }
05722     else if (beta != STS::one()) {
05723       for (Ordinal j = 0; j < numVecs; ++j) {
05724         RangeScalar* const Y_j = &Y[j*colStrideY];
05725         for (Ordinal i = 0; i < numRows; ++i) {
05726           Y_j[i] = beta * Y_j[i];
05727         }
05728       }
05729     }
05730     return; // Our work is done!
05731   }
05732   const size_t nnz = ptr[numRows];
05733   if (alpha == STS::one()) {
05734     if (beta == -STS::one()) {
05735       RangeScalar tmp[2];
05736       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
05737       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
05738 
05739       RangeScalar* Y_i = Y;
05740       Ordinal i = 0;
05741       for (size_t k = 0; k < nnz; ++k) {
05742         const MatrixScalar A_ij = val[k];
05743         const Ordinal j = ind[k];
05744         while (k >= ptr[i+1]) {
05745           // Write temp output from last iteration(s) to Y,
05746           // before incrementing the current row index.
05747           if (k > 0) {
05748             Y_i[0] = tmp[0];
05749             Y_i[colStrideY] = tmp[1];
05750           }
05751           ++i;
05752           // We haven't seen row i before; set Y(i,:) to -Y(i,:).
05753           Y_i = &Y[i];
05754           tmp[0] = -Y_i[0];
05755           tmp[1] = -Y_i[colStrideY];
05756         }
05757         const DomainScalar* const X_j = &X[j];
05758         tmp[0] += A_ij * X_j[0];
05759         tmp[1] += A_ij * X_j[colStrideX];
05760       }
05761       Y_i[0] = tmp[0];
05762       Y_i[colStrideY] = tmp[1];
05763     }
05764     else if (beta == STS::zero()) {
05765       RangeScalar tmp[2];
05766       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
05767       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
05768 
05769       RangeScalar* Y_i = Y;
05770       Ordinal i = 0;
05771       for (size_t k = 0; k < nnz; ++k) {
05772         const MatrixScalar A_ij = val[k];
05773         const Ordinal j = ind[k];
05774         while (k >= ptr[i+1]) {
05775           // Write temp output from last iteration(s) to Y,
05776           // before incrementing the current row index.
05777           if (k > 0) {
05778             Y_i[0] = tmp[0];
05779             Y_i[colStrideY] = tmp[1];
05780           }
05781           ++i;
05782           // We haven't seen row i before; set Y(i,:) to 0.
05783           Y_i = &Y[i];
05784           tmp[0] = STS::zero();
05785           tmp[1] = STS::zero();
05786         }
05787         const DomainScalar* const X_j = &X[j];
05788         tmp[0] += A_ij * X_j[0];
05789         tmp[1] += A_ij * X_j[colStrideX];
05790       }
05791       Y_i[0] = tmp[0];
05792       Y_i[colStrideY] = tmp[1];
05793     }
05794     else if (beta == STS::one()) {
05795       RangeScalar tmp[2];
05796       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
05797       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
05798 
05799       RangeScalar* Y_i = Y;
05800       Ordinal i = 0;
05801       for (size_t k = 0; k < nnz; ++k) {
05802         const MatrixScalar A_ij = val[k];
05803         const Ordinal j = ind[k];
05804         while (k >= ptr[i+1]) {
05805           // Write temp output from last iteration(s) to Y,
05806           // before incrementing the current row index.
05807           if (k > 0) {
05808             Y_i[0] = tmp[0];
05809             Y_i[colStrideY] = tmp[1];
05810           }
05811           ++i;
05812           // We don't have to set Y(i,:) here, since beta == 1.
05813           Y_i = &Y[i];
05814           tmp[0] = Y_i[0];
05815           tmp[1] = Y_i[colStrideY];
05816         }
05817         const DomainScalar* const X_j = &X[j];
05818         tmp[0] += A_ij * X_j[0];
05819         tmp[1] += A_ij * X_j[colStrideX];
05820       }
05821       Y_i[0] = tmp[0];
05822       Y_i[colStrideY] = tmp[1];
05823     }
05824     else { // beta != -1 && beta != 0 && beta != 1
05825       RangeScalar tmp[2];
05826       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
05827       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
05828 
05829       RangeScalar* Y_i = Y;
05830       Ordinal i = 0;
05831       for (size_t k = 0; k < nnz; ++k) {
05832         const MatrixScalar A_ij = val[k];
05833         const Ordinal j = ind[k];
05834         while (k >= ptr[i+1]) {
05835           // Write temp output from last iteration(s) to Y,
05836           // before incrementing the current row index.
05837           if (k > 0) {
05838             Y_i[0] = tmp[0];
05839             Y_i[colStrideY] = tmp[1];
05840           }
05841           ++i;
05842           // We haven't seen row i before; scale Y(i,:) by beta.
05843           Y_i = &Y[i];
05844           tmp[0] = beta * Y_i[0];
05845           tmp[1] = beta * Y_i[colStrideY];
05846         }
05847         const DomainScalar* const X_j = &X[j];
05848         tmp[0] += A_ij * X_j[0];
05849         tmp[1] += A_ij * X_j[colStrideX];
05850       }
05851       Y_i[0] = tmp[0];
05852       Y_i[colStrideY] = tmp[1];
05853     }
05854   }
05855   else if (alpha == -STS::one()) {
05856     if (beta == -STS::one()) {
05857       RangeScalar tmp[2];
05858       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
05859       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
05860 
05861       RangeScalar* Y_i = Y;
05862       Ordinal i = 0;
05863       for (size_t k = 0; k < nnz; ++k) {
05864         const MatrixScalar A_ij = val[k];
05865         const Ordinal j = ind[k];
05866         while (k >= ptr[i+1]) {
05867           // Write temp output from last iteration(s) to Y,
05868           // before incrementing the current row index.
05869           if (k > 0) {
05870             Y_i[0] = tmp[0];
05871             Y_i[colStrideY] = tmp[1];
05872           }
05873           ++i;
05874           // We haven't seen row i before; set Y(i,:) to -Y(i,:).
05875           Y_i = &Y[i];
05876           tmp[0] = -Y_i[0];
05877           tmp[1] = -Y_i[colStrideY];
05878         }
05879         const DomainScalar* const X_j = &X[j];
05880         tmp[0] -= A_ij * X_j[0];
05881         tmp[1] -= A_ij * X_j[colStrideX];
05882       }
05883       Y_i[0] = tmp[0];
05884       Y_i[colStrideY] = tmp[1];
05885     }
05886     else if (beta == STS::zero()) {
05887       RangeScalar tmp[2];
05888       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
05889       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
05890 
05891       RangeScalar* Y_i = Y;
05892       Ordinal i = 0;
05893       for (size_t k = 0; k < nnz; ++k) {
05894         const MatrixScalar A_ij = val[k];
05895         const Ordinal j = ind[k];
05896         while (k >= ptr[i+1]) {
05897           // Write temp output from last iteration(s) to Y,
05898           // before incrementing the current row index.
05899           if (k > 0) {
05900             Y_i[0] = tmp[0];
05901             Y_i[colStrideY] = tmp[1];
05902           }
05903           ++i;
05904           // We haven't seen row i before; set Y(i,:) to 0.
05905           Y_i = &Y[i];
05906           tmp[0] = STS::zero();
05907           tmp[1] = STS::zero();
05908         }
05909         const DomainScalar* const X_j = &X[j];
05910         tmp[0] -= A_ij * X_j[0];
05911         tmp[1] -= A_ij * X_j[colStrideX];
05912       }
05913       Y_i[0] = tmp[0];
05914       Y_i[colStrideY] = tmp[1];
05915     }
05916     else if (beta == STS::one()) {
05917       RangeScalar tmp[2];
05918       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
05919       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
05920 
05921       RangeScalar* Y_i = Y;
05922       Ordinal i = 0;
05923       for (size_t k = 0; k < nnz; ++k) {
05924         const MatrixScalar A_ij = val[k];
05925         const Ordinal j = ind[k];
05926         while (k >= ptr[i+1]) {
05927           // Write temp output from last iteration(s) to Y,
05928           // before incrementing the current row index.
05929           if (k > 0) {
05930             Y_i[0] = tmp[0];
05931             Y_i[colStrideY] = tmp[1];
05932           }
05933           ++i;
05934           // We don't have to set Y(i,:) here, since beta == 1.
05935           Y_i = &Y[i];
05936           tmp[0] = Y_i[0];
05937           tmp[1] = Y_i[colStrideY];
05938         }
05939         const DomainScalar* const X_j = &X[j];
05940         tmp[0] -= A_ij * X_j[0];
05941         tmp[1] -= A_ij * X_j[colStrideX];
05942       }
05943       Y_i[0] = tmp[0];
05944       Y_i[colStrideY] = tmp[1];
05945     }
05946     else { // beta != -1 && beta != 0 && beta != 1
05947       RangeScalar tmp[2];
05948       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
05949       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
05950 
05951       RangeScalar* Y_i = Y;
05952       Ordinal i = 0;
05953       for (size_t k = 0; k < nnz; ++k) {
05954         const MatrixScalar A_ij = val[k];
05955         const Ordinal j = ind[k];
05956         while (k >= ptr[i+1]) {
05957           // Write temp output from last iteration(s) to Y,
05958           // before incrementing the current row index.
05959           if (k > 0) {
05960             Y_i[0] = tmp[0];
05961             Y_i[colStrideY] = tmp[1];
05962           }
05963           ++i;
05964           // We haven't seen row i before; scale Y(i,:) by beta.
05965           Y_i = &Y[i];
05966           tmp[0] = beta * Y_i[0];
05967           tmp[1] = beta * Y_i[colStrideY];
05968         }
05969         const DomainScalar* const X_j = &X[j];
05970         tmp[0] -= A_ij * X_j[0];
05971         tmp[1] -= A_ij * X_j[colStrideX];
05972       }
05973       Y_i[0] = tmp[0];
05974       Y_i[colStrideY] = tmp[1];
05975     }
05976   }
05977   else { // alpha != 1 && alpha != -1
05978     if (beta == -STS::one()) {
05979       RangeScalar tmp[2];
05980       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
05981       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
05982 
05983       RangeScalar* Y_i = Y;
05984       Ordinal i = 0;
05985       for (size_t k = 0; k < nnz; ++k) {
05986         const MatrixScalar A_ij = val[k];
05987         const Ordinal j = ind[k];
05988         while (k >= ptr[i+1]) {
05989           // Write temp output from last iteration(s) to Y,
05990           // before incrementing the current row index.
05991           if (k > 0) {
05992             Y_i[0] = tmp[0];
05993             Y_i[colStrideY] = tmp[1];
05994           }
05995           ++i;
05996           // We haven't seen row i before; set Y(i,:) to -Y(i,:).
05997           Y_i = &Y[i];
05998           tmp[0] = -Y_i[0];
05999           tmp[1] = -Y_i[colStrideY];
06000         }
06001         const DomainScalar* const X_j = &X[j];
06002         tmp[0] += alpha * A_ij * X_j[0];
06003         tmp[1] += alpha * A_ij * X_j[colStrideX];
06004       }
06005       Y_i[0] = tmp[0];
06006       Y_i[colStrideY] = tmp[1];
06007     }
06008     else if (beta == STS::zero()) {
06009       RangeScalar tmp[2];
06010       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
06011       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
06012 
06013       RangeScalar* Y_i = Y;
06014       Ordinal i = 0;
06015       for (size_t k = 0; k < nnz; ++k) {
06016         const MatrixScalar A_ij = val[k];
06017         const Ordinal j = ind[k];
06018         while (k >= ptr[i+1]) {
06019           // Write temp output from last iteration(s) to Y,
06020           // before incrementing the current row index.
06021           if (k > 0) {
06022             Y_i[0] = tmp[0];
06023             Y_i[colStrideY] = tmp[1];
06024           }
06025           ++i;
06026           // We haven't seen row i before; set Y(i,:) to 0.
06027           Y_i = &Y[i];
06028           tmp[0] = STS::zero();
06029           tmp[1] = STS::zero();
06030         }
06031         const DomainScalar* const X_j = &X[j];
06032         tmp[0] += alpha * A_ij * X_j[0];
06033         tmp[1] += alpha * A_ij * X_j[colStrideX];
06034       }
06035       Y_i[0] = tmp[0];
06036       Y_i[colStrideY] = tmp[1];
06037     }
06038     else if (beta == STS::one()) {
06039       RangeScalar tmp[2];
06040       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
06041       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
06042 
06043       RangeScalar* Y_i = Y;
06044       Ordinal i = 0;
06045       for (size_t k = 0; k < nnz; ++k) {
06046         const MatrixScalar A_ij = val[k];
06047         const Ordinal j = ind[k];
06048         while (k >= ptr[i+1]) {
06049           // Write temp output from last iteration(s) to Y,
06050           // before incrementing the current row index.
06051           if (k > 0) {
06052             Y_i[0] = tmp[0];
06053             Y_i[colStrideY] = tmp[1];
06054           }
06055           ++i;
06056           // We don't have to set Y(i,:) here, since beta == 1.
06057           Y_i = &Y[i];
06058           tmp[0] = Y_i[0];
06059           tmp[1] = Y_i[colStrideY];
06060         }
06061         const DomainScalar* const X_j = &X[j];
06062         tmp[0] += alpha * A_ij * X_j[0];
06063         tmp[1] += alpha * A_ij * X_j[colStrideX];
06064       }
06065       Y_i[0] = tmp[0];
06066       Y_i[colStrideY] = tmp[1];
06067     }
06068     else { // beta != -1 && beta != 0 && beta != 1
06069       RangeScalar tmp[2];
06070       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
06071       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
06072 
06073       RangeScalar* Y_i = Y;
06074       Ordinal i = 0;
06075       for (size_t k = 0; k < nnz; ++k) {
06076         const MatrixScalar A_ij = val[k];
06077         const Ordinal j = ind[k];
06078         while (k >= ptr[i+1]) {
06079           // Write temp output from last iteration(s) to Y,
06080           // before incrementing the current row index.
06081           if (k > 0) {
06082             Y_i[0] = tmp[0];
06083             Y_i[colStrideY] = tmp[1];
06084           }
06085           ++i;
06086           // We haven't seen row i before; scale Y(i,:) by beta.
06087           Y_i = &Y[i];
06088           tmp[0] = beta * Y_i[0];
06089           tmp[1] = beta * Y_i[colStrideY];
06090         }
06091         const DomainScalar* const X_j = &X[j];
06092         tmp[0] += alpha * A_ij * X_j[0];
06093         tmp[1] += alpha * A_ij * X_j[colStrideX];
06094       }
06095       Y_i[0] = tmp[0];
06096       Y_i[colStrideY] = tmp[1];
06097     }
06098   }
06099 }
06100 
06101 template<class Ordinal,
06102          class MatrixScalar,
06103          class DomainScalar,
06104          class RangeScalar>
06105 void
06106 matVecCsrColMajorForwhile3Vec (
06107   const Ordinal numRows,
06108   const Ordinal numCols,
06109   const Ordinal numVecs,
06110   const RangeScalar& beta,
06111   RangeScalar Y[],
06112   const Ordinal colStrideY,
06113   const RangeScalar& alpha,
06114   const size_t  ptr[],
06115   const Ordinal ind[],
06116   const MatrixScalar val[],
06117   const DomainScalar X[],
06118   const Ordinal colStrideX)
06119 {
06120   typedef Teuchos::ScalarTraits<RangeScalar> STS;
06121 
06122   // Algorithm variants 'for-while' and 'for-if' need to set
06123   // Y(0,:) = 0, but only for the special case of CSR.
06124   if (beta != STS::zero()) {
06125     for (Ordinal c = 0; c < numVecs; ++c) {
06126       Y[c*colStrideY] = beta * Y[c*colStrideY];
06127     }
06128   }
06129   else {
06130     // Follow the Sparse BLAS convention for beta == 0. 
06131     for (Ordinal c = 0; c < numVecs; ++c) {
06132       Y[c*colStrideY] = STS::zero();
06133     }
06134   }
06135   if (alpha == STS::zero()) {
06136     // Prescale: Y := beta * Y.
06137     if (beta == STS::zero()) {
06138       for (Ordinal j = 0; j < numVecs; ++j) {
06139         RangeScalar* const Y_j = &Y[j*colStrideY];
06140         for (Ordinal i = 0; i < numRows; ++i) {
06141           // Follow the Sparse BLAS convention for beta == 0. 
06142           Y_j[i] = STS::zero();
06143         }
06144       }
06145     }
06146     else if (beta != STS::one()) {
06147       for (Ordinal j = 0; j < numVecs; ++j) {
06148         RangeScalar* const Y_j = &Y[j*colStrideY];
06149         for (Ordinal i = 0; i < numRows; ++i) {
06150           Y_j[i] = beta * Y_j[i];
06151         }
06152       }
06153     }
06154     return; // Our work is done!
06155   }
06156   const size_t nnz = ptr[numRows];
06157   if (alpha == STS::one()) {
06158     if (beta == -STS::one()) {
06159       RangeScalar tmp[3];
06160       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
06161       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
06162       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
06163 
06164       RangeScalar* Y_i = Y;
06165       Ordinal i = 0;
06166       for (size_t k = 0; k < nnz; ++k) {
06167         const MatrixScalar A_ij = val[k];
06168         const Ordinal j = ind[k];
06169         while (k >= ptr[i+1]) {
06170           // Write temp output from last iteration(s) to Y,
06171           // before incrementing the current row index.
06172           if (k > 0) {
06173             Y_i[0] = tmp[0];
06174             Y_i[colStrideY] = tmp[1];
06175             Y_i[2*colStrideY] = tmp[2];
06176           }
06177           ++i;
06178           // We haven't seen row i before; set Y(i,:) to -Y(i,:).
06179           Y_i = &Y[i];
06180           tmp[0] = -Y_i[0];
06181           tmp[1] = -Y_i[colStrideY];
06182           tmp[2] = -Y_i[2*colStrideY];
06183         }
06184         const DomainScalar* const X_j = &X[j];
06185         tmp[0] += A_ij * X_j[0];
06186         tmp[1] += A_ij * X_j[colStrideX];
06187         tmp[2] += A_ij * X_j[2*colStrideX];
06188       }
06189       Y_i[0] = tmp[0];
06190       Y_i[colStrideY] = tmp[1];
06191       Y_i[2*colStrideY] = tmp[2];
06192     }
06193     else if (beta == STS::zero()) {
06194       RangeScalar tmp[3];
06195       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
06196       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
06197       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
06198 
06199       RangeScalar* Y_i = Y;
06200       Ordinal i = 0;
06201       for (size_t k = 0; k < nnz; ++k) {
06202         const MatrixScalar A_ij = val[k];
06203         const Ordinal j = ind[k];
06204         while (k >= ptr[i+1]) {
06205           // Write temp output from last iteration(s) to Y,
06206           // before incrementing the current row index.
06207           if (k > 0) {
06208             Y_i[0] = tmp[0];
06209             Y_i[colStrideY] = tmp[1];
06210             Y_i[2*colStrideY] = tmp[2];
06211           }
06212           ++i;
06213           // We haven't seen row i before; set Y(i,:) to 0.
06214           Y_i = &Y[i];
06215           tmp[0] = STS::zero();
06216           tmp[1] = STS::zero();
06217           tmp[2] = STS::zero();
06218         }
06219         const DomainScalar* const X_j = &X[j];
06220         tmp[0] += A_ij * X_j[0];
06221         tmp[1] += A_ij * X_j[colStrideX];
06222         tmp[2] += A_ij * X_j[2*colStrideX];
06223       }
06224       Y_i[0] = tmp[0];
06225       Y_i[colStrideY] = tmp[1];
06226       Y_i[2*colStrideY] = tmp[2];
06227     }
06228     else if (beta == STS::one()) {
06229       RangeScalar tmp[3];
06230       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
06231       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
06232       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
06233 
06234       RangeScalar* Y_i = Y;
06235       Ordinal i = 0;
06236       for (size_t k = 0; k < nnz; ++k) {
06237         const MatrixScalar A_ij = val[k];
06238         const Ordinal j = ind[k];
06239         while (k >= ptr[i+1]) {
06240           // Write temp output from last iteration(s) to Y,
06241           // before incrementing the current row index.
06242           if (k > 0) {
06243             Y_i[0] = tmp[0];
06244             Y_i[colStrideY] = tmp[1];
06245             Y_i[2*colStrideY] = tmp[2];
06246           }
06247           ++i;
06248           // We don't have to set Y(i,:) here, since beta == 1.
06249           Y_i = &Y[i];
06250           tmp[0] = Y_i[0];
06251           tmp[1] = Y_i[colStrideY];
06252           tmp[2] = Y_i[2*colStrideY];
06253         }
06254         const DomainScalar* const X_j = &X[j];
06255         tmp[0] += A_ij * X_j[0];
06256         tmp[1] += A_ij * X_j[colStrideX];
06257         tmp[2] += A_ij * X_j[2*colStrideX];
06258       }
06259       Y_i[0] = tmp[0];
06260       Y_i[colStrideY] = tmp[1];
06261       Y_i[2*colStrideY] = tmp[2];
06262     }
06263     else { // beta != -1 && beta != 0 && beta != 1
06264       RangeScalar tmp[3];
06265       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
06266       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
06267       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
06268 
06269       RangeScalar* Y_i = Y;
06270       Ordinal i = 0;
06271       for (size_t k = 0; k < nnz; ++k) {
06272         const MatrixScalar A_ij = val[k];
06273         const Ordinal j = ind[k];
06274         while (k >= ptr[i+1]) {
06275           // Write temp output from last iteration(s) to Y,
06276           // before incrementing the current row index.
06277           if (k > 0) {
06278             Y_i[0] = tmp[0];
06279             Y_i[colStrideY] = tmp[1];
06280             Y_i[2*colStrideY] = tmp[2];
06281           }
06282           ++i;
06283           // We haven't seen row i before; scale Y(i,:) by beta.
06284           Y_i = &Y[i];
06285           tmp[0] = beta * Y_i[0];
06286           tmp[1] = beta * Y_i[colStrideY];
06287           tmp[2] = beta * Y_i[2*colStrideY];
06288         }
06289         const DomainScalar* const X_j = &X[j];
06290         tmp[0] += A_ij * X_j[0];
06291         tmp[1] += A_ij * X_j[colStrideX];
06292         tmp[2] += A_ij * X_j[2*colStrideX];
06293       }
06294       Y_i[0] = tmp[0];
06295       Y_i[colStrideY] = tmp[1];
06296       Y_i[2*colStrideY] = tmp[2];
06297     }
06298   }
06299   else if (alpha == -STS::one()) {
06300     if (beta == -STS::one()) {
06301       RangeScalar tmp[3];
06302       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
06303       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
06304       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
06305 
06306       RangeScalar* Y_i = Y;
06307       Ordinal i = 0;
06308       for (size_t k = 0; k < nnz; ++k) {
06309         const MatrixScalar A_ij = val[k];
06310         const Ordinal j = ind[k];
06311         while (k >= ptr[i+1]) {
06312           // Write temp output from last iteration(s) to Y,
06313           // before incrementing the current row index.
06314           if (k > 0) {
06315             Y_i[0] = tmp[0];
06316             Y_i[colStrideY] = tmp[1];
06317             Y_i[2*colStrideY] = tmp[2];
06318           }
06319           ++i;
06320           // We haven't seen row i before; set Y(i,:) to -Y(i,:).
06321           Y_i = &Y[i];
06322           tmp[0] = -Y_i[0];
06323           tmp[1] = -Y_i[colStrideY];
06324           tmp[2] = -Y_i[2*colStrideY];
06325         }
06326         const DomainScalar* const X_j = &X[j];
06327         tmp[0] -= A_ij * X_j[0];
06328         tmp[1] -= A_ij * X_j[colStrideX];
06329         tmp[2] -= A_ij * X_j[2*colStrideX];
06330       }
06331       Y_i[0] = tmp[0];
06332       Y_i[colStrideY] = tmp[1];
06333       Y_i[2*colStrideY] = tmp[2];
06334     }
06335     else if (beta == STS::zero()) {
06336       RangeScalar tmp[3];
06337       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
06338       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
06339       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
06340 
06341       RangeScalar* Y_i = Y;
06342       Ordinal i = 0;
06343       for (size_t k = 0; k < nnz; ++k) {
06344         const MatrixScalar A_ij = val[k];
06345         const Ordinal j = ind[k];
06346         while (k >= ptr[i+1]) {
06347           // Write temp output from last iteration(s) to Y,
06348           // before incrementing the current row index.
06349           if (k > 0) {
06350             Y_i[0] = tmp[0];
06351             Y_i[colStrideY] = tmp[1];
06352             Y_i[2*colStrideY] = tmp[2];
06353           }
06354           ++i;
06355           // We haven't seen row i before; set Y(i,:) to 0.
06356           Y_i = &Y[i];
06357           tmp[0] = STS::zero();
06358           tmp[1] = STS::zero();
06359           tmp[2] = STS::zero();
06360         }
06361         const DomainScalar* const X_j = &X[j];
06362         tmp[0] -= A_ij * X_j[0];
06363         tmp[1] -= A_ij * X_j[colStrideX];
06364         tmp[2] -= A_ij * X_j[2*colStrideX];
06365       }
06366       Y_i[0] = tmp[0];
06367       Y_i[colStrideY] = tmp[1];
06368       Y_i[2*colStrideY] = tmp[2];
06369     }
06370     else if (beta == STS::one()) {
06371       RangeScalar tmp[3];
06372       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
06373       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
06374       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
06375 
06376       RangeScalar* Y_i = Y;
06377       Ordinal i = 0;
06378       for (size_t k = 0; k < nnz; ++k) {
06379         const MatrixScalar A_ij = val[k];
06380         const Ordinal j = ind[k];
06381         while (k >= ptr[i+1]) {
06382           // Write temp output from last iteration(s) to Y,
06383           // before incrementing the current row index.
06384           if (k > 0) {
06385             Y_i[0] = tmp[0];
06386             Y_i[colStrideY] = tmp[1];
06387             Y_i[2*colStrideY] = tmp[2];
06388           }
06389           ++i;
06390           // We don't have to set Y(i,:) here, since beta == 1.
06391           Y_i = &Y[i];
06392           tmp[0] = Y_i[0];
06393           tmp[1] = Y_i[colStrideY];
06394           tmp[2] = Y_i[2*colStrideY];
06395         }
06396         const DomainScalar* const X_j = &X[j];
06397         tmp[0] -= A_ij * X_j[0];
06398         tmp[1] -= A_ij * X_j[colStrideX];
06399         tmp[2] -= A_ij * X_j[2*colStrideX];
06400       }
06401       Y_i[0] = tmp[0];
06402       Y_i[colStrideY] = tmp[1];
06403       Y_i[2*colStrideY] = tmp[2];
06404     }
06405     else { // beta != -1 && beta != 0 && beta != 1
06406       RangeScalar tmp[3];
06407       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
06408       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
06409       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
06410 
06411       RangeScalar* Y_i = Y;
06412       Ordinal i = 0;
06413       for (size_t k = 0; k < nnz; ++k) {
06414         const MatrixScalar A_ij = val[k];
06415         const Ordinal j = ind[k];
06416         while (k >= ptr[i+1]) {
06417           // Write temp output from last iteration(s) to Y,
06418           // before incrementing the current row index.
06419           if (k > 0) {
06420             Y_i[0] = tmp[0];
06421             Y_i[colStrideY] = tmp[1];
06422             Y_i[2*colStrideY] = tmp[2];
06423           }
06424           ++i;
06425           // We haven't seen row i before; scale Y(i,:) by beta.
06426           Y_i = &Y[i];
06427           tmp[0] = beta * Y_i[0];
06428           tmp[1] = beta * Y_i[colStrideY];
06429           tmp[2] = beta * Y_i[2*colStrideY];
06430         }
06431         const DomainScalar* const X_j = &X[j];
06432         tmp[0] -= A_ij * X_j[0];
06433         tmp[1] -= A_ij * X_j[colStrideX];
06434         tmp[2] -= A_ij * X_j[2*colStrideX];
06435       }
06436       Y_i[0] = tmp[0];
06437       Y_i[colStrideY] = tmp[1];
06438       Y_i[2*colStrideY] = tmp[2];
06439     }
06440   }
06441   else { // alpha != 1 && alpha != -1
06442     if (beta == -STS::one()) {
06443       RangeScalar tmp[3];
06444       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
06445       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
06446       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
06447 
06448       RangeScalar* Y_i = Y;
06449       Ordinal i = 0;
06450       for (size_t k = 0; k < nnz; ++k) {
06451         const MatrixScalar A_ij = val[k];
06452         const Ordinal j = ind[k];
06453         while (k >= ptr[i+1]) {
06454           // Write temp output from last iteration(s) to Y,
06455           // before incrementing the current row index.
06456           if (k > 0) {
06457             Y_i[0] = tmp[0];
06458             Y_i[colStrideY] = tmp[1];
06459             Y_i[2*colStrideY] = tmp[2];
06460           }
06461           ++i;
06462           // We haven't seen row i before; set Y(i,:) to -Y(i,:).
06463           Y_i = &Y[i];
06464           tmp[0] = -Y_i[0];
06465           tmp[1] = -Y_i[colStrideY];
06466           tmp[2] = -Y_i[2*colStrideY];
06467         }
06468         const DomainScalar* const X_j = &X[j];
06469         tmp[0] += alpha * A_ij * X_j[0];
06470         tmp[1] += alpha * A_ij * X_j[colStrideX];
06471         tmp[2] += alpha * A_ij * X_j[2*colStrideX];
06472       }
06473       Y_i[0] = tmp[0];
06474       Y_i[colStrideY] = tmp[1];
06475       Y_i[2*colStrideY] = tmp[2];
06476     }
06477     else if (beta == STS::zero()) {
06478       RangeScalar tmp[3];
06479       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
06480       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
06481       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
06482 
06483       RangeScalar* Y_i = Y;
06484       Ordinal i = 0;
06485       for (size_t k = 0; k < nnz; ++k) {
06486         const MatrixScalar A_ij = val[k];
06487         const Ordinal j = ind[k];
06488         while (k >= ptr[i+1]) {
06489           // Write temp output from last iteration(s) to Y,
06490           // before incrementing the current row index.
06491           if (k > 0) {
06492             Y_i[0] = tmp[0];
06493             Y_i[colStrideY] = tmp[1];
06494             Y_i[2*colStrideY] = tmp[2];
06495           }
06496           ++i;
06497           // We haven't seen row i before; set Y(i,:) to 0.
06498           Y_i = &Y[i];
06499           tmp[0] = STS::zero();
06500           tmp[1] = STS::zero();
06501           tmp[2] = STS::zero();
06502         }
06503         const DomainScalar* const X_j = &X[j];
06504         tmp[0] += alpha * A_ij * X_j[0];
06505         tmp[1] += alpha * A_ij * X_j[colStrideX];
06506         tmp[2] += alpha * A_ij * X_j[2*colStrideX];
06507       }
06508       Y_i[0] = tmp[0];
06509       Y_i[colStrideY] = tmp[1];
06510       Y_i[2*colStrideY] = tmp[2];
06511     }
06512     else if (beta == STS::one()) {
06513       RangeScalar tmp[3];
06514       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
06515       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
06516       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
06517 
06518       RangeScalar* Y_i = Y;
06519       Ordinal i = 0;
06520       for (size_t k = 0; k < nnz; ++k) {
06521         const MatrixScalar A_ij = val[k];
06522         const Ordinal j = ind[k];
06523         while (k >= ptr[i+1]) {
06524           // Write temp output from last iteration(s) to Y,
06525           // before incrementing the current row index.
06526           if (k > 0) {
06527             Y_i[0] = tmp[0];
06528             Y_i[colStrideY] = tmp[1];
06529             Y_i[2*colStrideY] = tmp[2];
06530           }
06531           ++i;
06532           // We don't have to set Y(i,:) here, since beta == 1.
06533           Y_i = &Y[i];
06534           tmp[0] = Y_i[0];
06535           tmp[1] = Y_i[colStrideY];
06536           tmp[2] = Y_i[2*colStrideY];
06537         }
06538         const DomainScalar* const X_j = &X[j];
06539         tmp[0] += alpha * A_ij * X_j[0];
06540         tmp[1] += alpha * A_ij * X_j[colStrideX];
06541         tmp[2] += alpha * A_ij * X_j[2*colStrideX];
06542       }
06543       Y_i[0] = tmp[0];
06544       Y_i[colStrideY] = tmp[1];
06545       Y_i[2*colStrideY] = tmp[2];
06546     }
06547     else { // beta != -1 && beta != 0 && beta != 1
06548       RangeScalar tmp[3];
06549       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
06550       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
06551       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
06552 
06553       RangeScalar* Y_i = Y;
06554       Ordinal i = 0;
06555       for (size_t k = 0; k < nnz; ++k) {
06556         const MatrixScalar A_ij = val[k];
06557         const Ordinal j = ind[k];
06558         while (k >= ptr[i+1]) {
06559           // Write temp output from last iteration(s) to Y,
06560           // before incrementing the current row index.
06561           if (k > 0) {
06562             Y_i[0] = tmp[0];
06563             Y_i[colStrideY] = tmp[1];
06564             Y_i[2*colStrideY] = tmp[2];
06565           }
06566           ++i;
06567           // We haven't seen row i before; scale Y(i,:) by beta.
06568           Y_i = &Y[i];
06569           tmp[0] = beta * Y_i[0];
06570           tmp[1] = beta * Y_i[colStrideY];
06571           tmp[2] = beta * Y_i[2*colStrideY];
06572         }
06573         const DomainScalar* const X_j = &X[j];
06574         tmp[0] += alpha * A_ij * X_j[0];
06575         tmp[1] += alpha * A_ij * X_j[colStrideX];
06576         tmp[2] += alpha * A_ij * X_j[2*colStrideX];
06577       }
06578       Y_i[0] = tmp[0];
06579       Y_i[colStrideY] = tmp[1];
06580       Y_i[2*colStrideY] = tmp[2];
06581     }
06582   }
06583 }
06584 
06585 template<class Ordinal,
06586          class MatrixScalar,
06587          class DomainScalar,
06588          class RangeScalar>
06589 void
06590 matVecCsrColMajorForwhile4Vec (
06591   const Ordinal numRows,
06592   const Ordinal numCols,
06593   const Ordinal numVecs,
06594   const RangeScalar& beta,
06595   RangeScalar Y[],
06596   const Ordinal colStrideY,
06597   const RangeScalar& alpha,
06598   const size_t  ptr[],
06599   const Ordinal ind[],
06600   const MatrixScalar val[],
06601   const DomainScalar X[],
06602   const Ordinal colStrideX)
06603 {
06604   typedef Teuchos::ScalarTraits<RangeScalar> STS;
06605 
06606   // Algorithm variants 'for-while' and 'for-if' need to set
06607   // Y(0,:) = 0, but only for the special case of CSR.
06608   if (beta != STS::zero()) {
06609     for (Ordinal c = 0; c < numVecs; ++c) {
06610       Y[c*colStrideY] = beta * Y[c*colStrideY];
06611     }
06612   }
06613   else {
06614     // Follow the Sparse BLAS convention for beta == 0. 
06615     for (Ordinal c = 0; c < numVecs; ++c) {
06616       Y[c*colStrideY] = STS::zero();
06617     }
06618   }
06619   if (alpha == STS::zero()) {
06620     // Prescale: Y := beta * Y.
06621     if (beta == STS::zero()) {
06622       for (Ordinal j = 0; j < numVecs; ++j) {
06623         RangeScalar* const Y_j = &Y[j*colStrideY];
06624         for (Ordinal i = 0; i < numRows; ++i) {
06625           // Follow the Sparse BLAS convention for beta == 0. 
06626           Y_j[i] = STS::zero();
06627         }
06628       }
06629     }
06630     else if (beta != STS::one()) {
06631       for (Ordinal j = 0; j < numVecs; ++j) {
06632         RangeScalar* const Y_j = &Y[j*colStrideY];
06633         for (Ordinal i = 0; i < numRows; ++i) {
06634           Y_j[i] = beta * Y_j[i];
06635         }
06636       }
06637     }
06638     return; // Our work is done!
06639   }
06640   const size_t nnz = ptr[numRows];
06641   if (alpha == STS::one()) {
06642     if (beta == -STS::one()) {
06643       RangeScalar tmp[4];
06644       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
06645       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
06646       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
06647       tmp[3] = Teuchos::ScalarTraits<RangeScalar>::zero();
06648 
06649       RangeScalar* Y_i = Y;
06650       Ordinal i = 0;
06651       for (size_t k = 0; k < nnz; ++k) {
06652         const MatrixScalar A_ij = val[k];
06653         const Ordinal j = ind[k];
06654         while (k >= ptr[i+1]) {
06655           // Write temp output from last iteration(s) to Y,
06656           // before incrementing the current row index.
06657           if (k > 0) {
06658             Y_i[0] = tmp[0];
06659             Y_i[colStrideY] = tmp[1];
06660             Y_i[2*colStrideY] = tmp[2];
06661             Y_i[3*colStrideY] = tmp[3];
06662           }
06663           ++i;
06664           // We haven't seen row i before; set Y(i,:) to -Y(i,:).
06665           Y_i = &Y[i];
06666           tmp[0] = -Y_i[0];
06667           tmp[1] = -Y_i[colStrideY];
06668           tmp[2] = -Y_i[2*colStrideY];
06669           tmp[3] = -Y_i[3*colStrideY];
06670         }
06671         const DomainScalar* const X_j = &X[j];
06672         tmp[0] += A_ij * X_j[0];
06673         tmp[1] += A_ij * X_j[colStrideX];
06674         tmp[2] += A_ij * X_j[2*colStrideX];
06675         tmp[3] += A_ij * X_j[3*colStrideX];
06676       }
06677       Y_i[0] = tmp[0];
06678       Y_i[colStrideY] = tmp[1];
06679       Y_i[2*colStrideY] = tmp[2];
06680       Y_i[3*colStrideY] = tmp[3];
06681     }
06682     else if (beta == STS::zero()) {
06683       RangeScalar tmp[4];
06684       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
06685       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
06686       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
06687       tmp[3] = Teuchos::ScalarTraits<RangeScalar>::zero();
06688 
06689       RangeScalar* Y_i = Y;
06690       Ordinal i = 0;
06691       for (size_t k = 0; k < nnz; ++k) {
06692         const MatrixScalar A_ij = val[k];
06693         const Ordinal j = ind[k];
06694         while (k >= ptr[i+1]) {
06695           // Write temp output from last iteration(s) to Y,
06696           // before incrementing the current row index.
06697           if (k > 0) {
06698             Y_i[0] = tmp[0];
06699             Y_i[colStrideY] = tmp[1];
06700             Y_i[2*colStrideY] = tmp[2];
06701             Y_i[3*colStrideY] = tmp[3];
06702           }
06703           ++i;
06704           // We haven't seen row i before; set Y(i,:) to 0.
06705           Y_i = &Y[i];
06706           tmp[0] = STS::zero();
06707           tmp[1] = STS::zero();
06708           tmp[2] = STS::zero();
06709           tmp[3] = STS::zero();
06710         }
06711         const DomainScalar* const X_j = &X[j];
06712         tmp[0] += A_ij * X_j[0];
06713         tmp[1] += A_ij * X_j[colStrideX];
06714         tmp[2] += A_ij * X_j[2*colStrideX];
06715         tmp[3] += A_ij * X_j[3*colStrideX];
06716       }
06717       Y_i[0] = tmp[0];
06718       Y_i[colStrideY] = tmp[1];
06719       Y_i[2*colStrideY] = tmp[2];
06720       Y_i[3*colStrideY] = tmp[3];
06721     }
06722     else if (beta == STS::one()) {
06723       RangeScalar tmp[4];
06724       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
06725       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
06726       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
06727       tmp[3] = Teuchos::ScalarTraits<RangeScalar>::zero();
06728 
06729       RangeScalar* Y_i = Y;
06730       Ordinal i = 0;
06731       for (size_t k = 0; k < nnz; ++k) {
06732         const MatrixScalar A_ij = val[k];
06733         const Ordinal j = ind[k];
06734         while (k >= ptr[i+1]) {
06735           // Write temp output from last iteration(s) to Y,
06736           // before incrementing the current row index.
06737           if (k > 0) {
06738             Y_i[0] = tmp[0];
06739             Y_i[colStrideY] = tmp[1];
06740             Y_i[2*colStrideY] = tmp[2];
06741             Y_i[3*colStrideY] = tmp[3];
06742           }
06743           ++i;
06744           // We don't have to set Y(i,:) here, since beta == 1.
06745           Y_i = &Y[i];
06746           tmp[0] = Y_i[0];
06747           tmp[1] = Y_i[colStrideY];
06748           tmp[2] = Y_i[2*colStrideY];
06749           tmp[3] = Y_i[3*colStrideY];
06750         }
06751         const DomainScalar* const X_j = &X[j];
06752         tmp[0] += A_ij * X_j[0];
06753         tmp[1] += A_ij * X_j[colStrideX];
06754         tmp[2] += A_ij * X_j[2*colStrideX];
06755         tmp[3] += A_ij * X_j[3*colStrideX];
06756       }
06757       Y_i[0] = tmp[0];
06758       Y_i[colStrideY] = tmp[1];
06759       Y_i[2*colStrideY] = tmp[2];
06760       Y_i[3*colStrideY] = tmp[3];
06761     }
06762     else { // beta != -1 && beta != 0 && beta != 1
06763       RangeScalar tmp[4];
06764       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
06765       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
06766       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
06767       tmp[3] = Teuchos::ScalarTraits<RangeScalar>::zero();
06768 
06769       RangeScalar* Y_i = Y;
06770       Ordinal i = 0;
06771       for (size_t k = 0; k < nnz; ++k) {
06772         const MatrixScalar A_ij = val[k];
06773         const Ordinal j = ind[k];
06774         while (k >= ptr[i+1]) {
06775           // Write temp output from last iteration(s) to Y,
06776           // before incrementing the current row index.
06777           if (k > 0) {
06778             Y_i[0] = tmp[0];
06779             Y_i[colStrideY] = tmp[1];
06780             Y_i[2*colStrideY] = tmp[2];
06781             Y_i[3*colStrideY] = tmp[3];
06782           }
06783           ++i;
06784           // We haven't seen row i before; scale Y(i,:) by beta.
06785           Y_i = &Y[i];
06786           tmp[0] = beta * Y_i[0];
06787           tmp[1] = beta * Y_i[colStrideY];
06788           tmp[2] = beta * Y_i[2*colStrideY];
06789           tmp[3] = beta * Y_i[3*colStrideY];
06790         }
06791         const DomainScalar* const X_j = &X[j];
06792         tmp[0] += A_ij * X_j[0];
06793         tmp[1] += A_ij * X_j[colStrideX];
06794         tmp[2] += A_ij * X_j[2*colStrideX];
06795         tmp[3] += A_ij * X_j[3*colStrideX];
06796       }
06797       Y_i[0] = tmp[0];
06798       Y_i[colStrideY] = tmp[1];
06799       Y_i[2*colStrideY] = tmp[2];
06800       Y_i[3*colStrideY] = tmp[3];
06801     }
06802   }
06803   else if (alpha == -STS::one()) {
06804     if (beta == -STS::one()) {
06805       RangeScalar tmp[4];
06806       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
06807       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
06808       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
06809       tmp[3] = Teuchos::ScalarTraits<RangeScalar>::zero();
06810 
06811       RangeScalar* Y_i = Y;
06812       Ordinal i = 0;
06813       for (size_t k = 0; k < nnz; ++k) {
06814         const MatrixScalar A_ij = val[k];
06815         const Ordinal j = ind[k];
06816         while (k >= ptr[i+1]) {
06817           // Write temp output from last iteration(s) to Y,
06818           // before incrementing the current row index.
06819           if (k > 0) {
06820             Y_i[0] = tmp[0];
06821             Y_i[colStrideY] = tmp[1];
06822             Y_i[2*colStrideY] = tmp[2];
06823             Y_i[3*colStrideY] = tmp[3];
06824           }
06825           ++i;
06826           // We haven't seen row i before; set Y(i,:) to -Y(i,:).
06827           Y_i = &Y[i];
06828           tmp[0] = -Y_i[0];
06829           tmp[1] = -Y_i[colStrideY];
06830           tmp[2] = -Y_i[2*colStrideY];
06831           tmp[3] = -Y_i[3*colStrideY];
06832         }
06833         const DomainScalar* const X_j = &X[j];
06834         tmp[0] -= A_ij * X_j[0];
06835         tmp[1] -= A_ij * X_j[colStrideX];
06836         tmp[2] -= A_ij * X_j[2*colStrideX];
06837         tmp[3] -= A_ij * X_j[3*colStrideX];
06838       }
06839       Y_i[0] = tmp[0];
06840       Y_i[colStrideY] = tmp[1];
06841       Y_i[2*colStrideY] = tmp[2];
06842       Y_i[3*colStrideY] = tmp[3];
06843     }
06844     else if (beta == STS::zero()) {
06845       RangeScalar tmp[4];
06846       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
06847       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
06848       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
06849       tmp[3] = Teuchos::ScalarTraits<RangeScalar>::zero();
06850 
06851       RangeScalar* Y_i = Y;
06852       Ordinal i = 0;
06853       for (size_t k = 0; k < nnz; ++k) {
06854         const MatrixScalar A_ij = val[k];
06855         const Ordinal j = ind[k];
06856         while (k >= ptr[i+1]) {
06857           // Write temp output from last iteration(s) to Y,
06858           // before incrementing the current row index.
06859           if (k > 0) {
06860             Y_i[0] = tmp[0];
06861             Y_i[colStrideY] = tmp[1];
06862             Y_i[2*colStrideY] = tmp[2];
06863             Y_i[3*colStrideY] = tmp[3];
06864           }
06865           ++i;
06866           // We haven't seen row i before; set Y(i,:) to 0.
06867           Y_i = &Y[i];
06868           tmp[0] = STS::zero();
06869           tmp[1] = STS::zero();
06870           tmp[2] = STS::zero();
06871           tmp[3] = STS::zero();
06872         }
06873         const DomainScalar* const X_j = &X[j];
06874         tmp[0] -= A_ij * X_j[0];
06875         tmp[1] -= A_ij * X_j[colStrideX];
06876         tmp[2] -= A_ij * X_j[2*colStrideX];
06877         tmp[3] -= A_ij * X_j[3*colStrideX];
06878       }
06879       Y_i[0] = tmp[0];
06880       Y_i[colStrideY] = tmp[1];
06881       Y_i[2*colStrideY] = tmp[2];
06882       Y_i[3*colStrideY] = tmp[3];
06883     }
06884     else if (beta == STS::one()) {
06885       RangeScalar tmp[4];
06886       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
06887       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
06888       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
06889       tmp[3] = Teuchos::ScalarTraits<RangeScalar>::zero();
06890 
06891       RangeScalar* Y_i = Y;
06892       Ordinal i = 0;
06893       for (size_t k = 0; k < nnz; ++k) {
06894         const MatrixScalar A_ij = val[k];
06895         const Ordinal j = ind[k];
06896         while (k >= ptr[i+1]) {
06897           // Write temp output from last iteration(s) to Y,
06898           // before incrementing the current row index.
06899           if (k > 0) {
06900             Y_i[0] = tmp[0];
06901             Y_i[colStrideY] = tmp[1];
06902             Y_i[2*colStrideY] = tmp[2];
06903             Y_i[3*colStrideY] = tmp[3];
06904           }
06905           ++i;
06906           // We don't have to set Y(i,:) here, since beta == 1.
06907           Y_i = &Y[i];
06908           tmp[0] = Y_i[0];
06909           tmp[1] = Y_i[colStrideY];
06910           tmp[2] = Y_i[2*colStrideY];
06911           tmp[3] = Y_i[3*colStrideY];
06912         }
06913         const DomainScalar* const X_j = &X[j];
06914         tmp[0] -= A_ij * X_j[0];
06915         tmp[1] -= A_ij * X_j[colStrideX];
06916         tmp[2] -= A_ij * X_j[2*colStrideX];
06917         tmp[3] -= A_ij * X_j[3*colStrideX];
06918       }
06919       Y_i[0] = tmp[0];
06920       Y_i[colStrideY] = tmp[1];
06921       Y_i[2*colStrideY] = tmp[2];
06922       Y_i[3*colStrideY] = tmp[3];
06923     }
06924     else { // beta != -1 && beta != 0 && beta != 1
06925       RangeScalar tmp[4];
06926       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
06927       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
06928       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
06929       tmp[3] = Teuchos::ScalarTraits<RangeScalar>::zero();
06930 
06931       RangeScalar* Y_i = Y;
06932       Ordinal i = 0;
06933       for (size_t k = 0; k < nnz; ++k) {
06934         const MatrixScalar A_ij = val[k];
06935         const Ordinal j = ind[k];
06936         while (k >= ptr[i+1]) {
06937           // Write temp output from last iteration(s) to Y,
06938           // before incrementing the current row index.
06939           if (k > 0) {
06940             Y_i[0] = tmp[0];
06941             Y_i[colStrideY] = tmp[1];
06942             Y_i[2*colStrideY] = tmp[2];
06943             Y_i[3*colStrideY] = tmp[3];
06944           }
06945           ++i;
06946           // We haven't seen row i before; scale Y(i,:) by beta.
06947           Y_i = &Y[i];
06948           tmp[0] = beta * Y_i[0];
06949           tmp[1] = beta * Y_i[colStrideY];
06950           tmp[2] = beta * Y_i[2*colStrideY];
06951           tmp[3] = beta * Y_i[3*colStrideY];
06952         }
06953         const DomainScalar* const X_j = &X[j];
06954         tmp[0] -= A_ij * X_j[0];
06955         tmp[1] -= A_ij * X_j[colStrideX];
06956         tmp[2] -= A_ij * X_j[2*colStrideX];
06957         tmp[3] -= A_ij * X_j[3*colStrideX];
06958       }
06959       Y_i[0] = tmp[0];
06960       Y_i[colStrideY] = tmp[1];
06961       Y_i[2*colStrideY] = tmp[2];
06962       Y_i[3*colStrideY] = tmp[3];
06963     }
06964   }
06965   else { // alpha != 1 && alpha != -1
06966     if (beta == -STS::one()) {
06967       RangeScalar tmp[4];
06968       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
06969       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
06970       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
06971       tmp[3] = Teuchos::ScalarTraits<RangeScalar>::zero();
06972 
06973       RangeScalar* Y_i = Y;
06974       Ordinal i = 0;
06975       for (size_t k = 0; k < nnz; ++k) {
06976         const MatrixScalar A_ij = val[k];
06977         const Ordinal j = ind[k];
06978         while (k >= ptr[i+1]) {
06979           // Write temp output from last iteration(s) to Y,
06980           // before incrementing the current row index.
06981           if (k > 0) {
06982             Y_i[0] = tmp[0];
06983             Y_i[colStrideY] = tmp[1];
06984             Y_i[2*colStrideY] = tmp[2];
06985             Y_i[3*colStrideY] = tmp[3];
06986           }
06987           ++i;
06988           // We haven't seen row i before; set Y(i,:) to -Y(i,:).
06989           Y_i = &Y[i];
06990           tmp[0] = -Y_i[0];
06991           tmp[1] = -Y_i[colStrideY];
06992           tmp[2] = -Y_i[2*colStrideY];
06993           tmp[3] = -Y_i[3*colStrideY];
06994         }
06995         const DomainScalar* const X_j = &X[j];
06996         tmp[0] += alpha * A_ij * X_j[0];
06997         tmp[1] += alpha * A_ij * X_j[colStrideX];
06998         tmp[2] += alpha * A_ij * X_j[2*colStrideX];
06999         tmp[3] += alpha * A_ij * X_j[3*colStrideX];
07000       }
07001       Y_i[0] = tmp[0];
07002       Y_i[colStrideY] = tmp[1];
07003       Y_i[2*colStrideY] = tmp[2];
07004       Y_i[3*colStrideY] = tmp[3];
07005     }
07006     else if (beta == STS::zero()) {
07007       RangeScalar tmp[4];
07008       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
07009       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
07010       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
07011       tmp[3] = Teuchos::ScalarTraits<RangeScalar>::zero();
07012 
07013       RangeScalar* Y_i = Y;
07014       Ordinal i = 0;
07015       for (size_t k = 0; k < nnz; ++k) {
07016         const MatrixScalar A_ij = val[k];
07017         const Ordinal j = ind[k];
07018         while (k >= ptr[i+1]) {
07019           // Write temp output from last iteration(s) to Y,
07020           // before incrementing the current row index.
07021           if (k > 0) {
07022             Y_i[0] = tmp[0];
07023             Y_i[colStrideY] = tmp[1];
07024             Y_i[2*colStrideY] = tmp[2];
07025             Y_i[3*colStrideY] = tmp[3];
07026           }
07027           ++i;
07028           // We haven't seen row i before; set Y(i,:) to 0.
07029           Y_i = &Y[i];
07030           tmp[0] = STS::zero();
07031           tmp[1] = STS::zero();
07032           tmp[2] = STS::zero();
07033           tmp[3] = STS::zero();
07034         }
07035         const DomainScalar* const X_j = &X[j];
07036         tmp[0] += alpha * A_ij * X_j[0];
07037         tmp[1] += alpha * A_ij * X_j[colStrideX];
07038         tmp[2] += alpha * A_ij * X_j[2*colStrideX];
07039         tmp[3] += alpha * A_ij * X_j[3*colStrideX];
07040       }
07041       Y_i[0] = tmp[0];
07042       Y_i[colStrideY] = tmp[1];
07043       Y_i[2*colStrideY] = tmp[2];
07044       Y_i[3*colStrideY] = tmp[3];
07045     }
07046     else if (beta == STS::one()) {
07047       RangeScalar tmp[4];
07048       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
07049       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
07050       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
07051       tmp[3] = Teuchos::ScalarTraits<RangeScalar>::zero();
07052 
07053       RangeScalar* Y_i = Y;
07054       Ordinal i = 0;
07055       for (size_t k = 0; k < nnz; ++k) {
07056         const MatrixScalar A_ij = val[k];
07057         const Ordinal j = ind[k];
07058         while (k >= ptr[i+1]) {
07059           // Write temp output from last iteration(s) to Y,
07060           // before incrementing the current row index.
07061           if (k > 0) {
07062             Y_i[0] = tmp[0];
07063             Y_i[colStrideY] = tmp[1];
07064             Y_i[2*colStrideY] = tmp[2];
07065             Y_i[3*colStrideY] = tmp[3];
07066           }
07067           ++i;
07068           // We don't have to set Y(i,:) here, since beta == 1.
07069           Y_i = &Y[i];
07070           tmp[0] = Y_i[0];
07071           tmp[1] = Y_i[colStrideY];
07072           tmp[2] = Y_i[2*colStrideY];
07073           tmp[3] = Y_i[3*colStrideY];
07074         }
07075         const DomainScalar* const X_j = &X[j];
07076         tmp[0] += alpha * A_ij * X_j[0];
07077         tmp[1] += alpha * A_ij * X_j[colStrideX];
07078         tmp[2] += alpha * A_ij * X_j[2*colStrideX];
07079         tmp[3] += alpha * A_ij * X_j[3*colStrideX];
07080       }
07081       Y_i[0] = tmp[0];
07082       Y_i[colStrideY] = tmp[1];
07083       Y_i[2*colStrideY] = tmp[2];
07084       Y_i[3*colStrideY] = tmp[3];
07085     }
07086     else { // beta != -1 && beta != 0 && beta != 1
07087       RangeScalar tmp[4];
07088       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
07089       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
07090       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
07091       tmp[3] = Teuchos::ScalarTraits<RangeScalar>::zero();
07092 
07093       RangeScalar* Y_i = Y;
07094       Ordinal i = 0;
07095       for (size_t k = 0; k < nnz; ++k) {
07096         const MatrixScalar A_ij = val[k];
07097         const Ordinal j = ind[k];
07098         while (k >= ptr[i+1]) {
07099           // Write temp output from last iteration(s) to Y,
07100           // before incrementing the current row index.
07101           if (k > 0) {
07102             Y_i[0] = tmp[0];
07103             Y_i[colStrideY] = tmp[1];
07104             Y_i[2*colStrideY] = tmp[2];
07105             Y_i[3*colStrideY] = tmp[3];
07106           }
07107           ++i;
07108           // We haven't seen row i before; scale Y(i,:) by beta.
07109           Y_i = &Y[i];
07110           tmp[0] = beta * Y_i[0];
07111           tmp[1] = beta * Y_i[colStrideY];
07112           tmp[2] = beta * Y_i[2*colStrideY];
07113           tmp[3] = beta * Y_i[3*colStrideY];
07114         }
07115         const DomainScalar* const X_j = &X[j];
07116         tmp[0] += alpha * A_ij * X_j[0];
07117         tmp[1] += alpha * A_ij * X_j[colStrideX];
07118         tmp[2] += alpha * A_ij * X_j[2*colStrideX];
07119         tmp[3] += alpha * A_ij * X_j[3*colStrideX];
07120       }
07121       Y_i[0] = tmp[0];
07122       Y_i[colStrideY] = tmp[1];
07123       Y_i[2*colStrideY] = tmp[2];
07124       Y_i[3*colStrideY] = tmp[3];
07125     }
07126   }
07127 }
07128 
07129 template<class Ordinal,
07130          class MatrixScalar,
07131          class DomainScalar,
07132          class RangeScalar>
07133 void
07134 matVecCsrColMajorForif1Vec (
07135   const Ordinal numRows,
07136   const Ordinal numCols,
07137   const Ordinal numVecs,
07138   const RangeScalar& beta,
07139   RangeScalar Y[],
07140   const Ordinal colStrideY,
07141   const RangeScalar& alpha,
07142   const size_t  ptr[],
07143   const Ordinal ind[],
07144   const MatrixScalar val[],
07145   const DomainScalar X[],
07146   const Ordinal colStrideX)
07147 {
07148   typedef Teuchos::ScalarTraits<RangeScalar> STS;
07149 
07150   // Algorithm variants 'for-while' and 'for-if' need to set
07151   // Y(0,:) = 0, but only for the special case of CSR.
07152   if (beta != STS::zero()) {
07153     for (Ordinal c = 0; c < numVecs; ++c) {
07154       Y[c*colStrideY] = beta * Y[c*colStrideY];
07155     }
07156   }
07157   else {
07158     // Follow the Sparse BLAS convention for beta == 0. 
07159     for (Ordinal c = 0; c < numVecs; ++c) {
07160       Y[c*colStrideY] = STS::zero();
07161     }
07162   }
07163   if (alpha == STS::zero()) {
07164     // Prescale: Y := beta * Y.
07165     if (beta == STS::zero()) {
07166       for (Ordinal j = 0; j < numVecs; ++j) {
07167         RangeScalar* const Y_j = &Y[j*colStrideY];
07168         for (Ordinal i = 0; i < numRows; ++i) {
07169           // Follow the Sparse BLAS convention for beta == 0. 
07170           Y_j[i] = STS::zero();
07171         }
07172       }
07173     }
07174     else if (beta != STS::one()) {
07175       for (Ordinal j = 0; j < numVecs; ++j) {
07176         RangeScalar* const Y_j = &Y[j*colStrideY];
07177         for (Ordinal i = 0; i < numRows; ++i) {
07178           Y_j[i] = beta * Y_j[i];
07179         }
07180       }
07181     }
07182     return; // Our work is done!
07183   }
07184   const size_t nnz = ptr[numRows];
07185   if (alpha == STS::one()) {
07186     if (beta == -STS::one()) {
07187       RangeScalar tmp = Teuchos::ScalarTraits<RangeScalar>::zero();
07188       RangeScalar* Y_i = Y;
07189       Ordinal i = 0;
07190       for (size_t k = 0; k < nnz; ++k) {
07191         const MatrixScalar A_ij = val[k];
07192         const Ordinal j = ind[k];
07193         // NOTE: "if" instead of "while" here is only valid
07194         // if the matrix contains no empty columns.
07195         if (k >= ptr[i+1]) {
07196           // Write temp output from last iteration(s) to Y,
07197           // before incrementing the current row index.
07198           if (k > 0) {
07199             Y_i[0] = tmp;
07200           }
07201           ++i;
07202           // We haven't seen row i before; set Y(i,:) to -Y(i,:).
07203           Y_i = &Y[i];
07204           tmp = -Y[i + 0*colStrideY];
07205         }
07206         tmp += A_ij * X[j];
07207       }
07208       Y_i[0] = tmp;
07209     }
07210     else if (beta == STS::zero()) {
07211       RangeScalar tmp = Teuchos::ScalarTraits<RangeScalar>::zero();
07212       RangeScalar* Y_i = Y;
07213       Ordinal i = 0;
07214       for (size_t k = 0; k < nnz; ++k) {
07215         const MatrixScalar A_ij = val[k];
07216         const Ordinal j = ind[k];
07217         // NOTE: "if" instead of "while" here is only valid
07218         // if the matrix contains no empty columns.
07219         if (k >= ptr[i+1]) {
07220           // Write temp output from last iteration(s) to Y,
07221           // before incrementing the current row index.
07222           if (k > 0) {
07223             Y_i[0] = tmp;
07224           }
07225           ++i;
07226           // We haven't seen row i before; set Y(i,:) to 0.
07227           Y_i = &Y[i];
07228           tmp = STS::zero();
07229         }
07230         tmp += A_ij * X[j];
07231       }
07232       Y_i[0] = tmp;
07233     }
07234     else if (beta == STS::one()) {
07235       RangeScalar tmp = Teuchos::ScalarTraits<RangeScalar>::zero();
07236       RangeScalar* Y_i = Y;
07237       Ordinal i = 0;
07238       for (size_t k = 0; k < nnz; ++k) {
07239         const MatrixScalar A_ij = val[k];
07240         const Ordinal j = ind[k];
07241         // NOTE: "if" instead of "while" here is only valid
07242         // if the matrix contains no empty columns.
07243         if (k >= ptr[i+1]) {
07244           // Write temp output from last iteration(s) to Y,
07245           // before incrementing the current row index.
07246           if (k > 0) {
07247             Y_i[0] = tmp;
07248           }
07249           ++i;
07250           // We don't have to set Y(i,:) here, since beta == 1.
07251           Y_i = &Y[i];
07252           tmp = Y[i + 0*colStrideY];
07253         }
07254         tmp += A_ij * X[j];
07255       }
07256       Y_i[0] = tmp;
07257     }
07258     else { // beta != -1 && beta != 0 && beta != 1
07259       RangeScalar tmp = Teuchos::ScalarTraits<RangeScalar>::zero();
07260       RangeScalar* Y_i = Y;
07261       Ordinal i = 0;
07262       for (size_t k = 0; k < nnz; ++k) {
07263         const MatrixScalar A_ij = val[k];
07264         const Ordinal j = ind[k];
07265         // NOTE: "if" instead of "while" here is only valid
07266         // if the matrix contains no empty columns.
07267         if (k >= ptr[i+1]) {
07268           // Write temp output from last iteration(s) to Y,
07269           // before incrementing the current row index.
07270           if (k > 0) {
07271             Y_i[0] = tmp;
07272           }
07273           ++i;
07274           // We haven't seen row i before; scale Y(i,:) by beta.
07275           Y_i = &Y[i];
07276           tmp = beta * Y[i + 0*colStrideY];
07277         }
07278         tmp += A_ij * X[j];
07279       }
07280       Y_i[0] = tmp;
07281     }
07282   }
07283   else if (alpha == -STS::one()) {
07284     if (beta == -STS::one()) {
07285       RangeScalar tmp = Teuchos::ScalarTraits<RangeScalar>::zero();
07286       RangeScalar* Y_i = Y;
07287       Ordinal i = 0;
07288       for (size_t k = 0; k < nnz; ++k) {
07289         const MatrixScalar A_ij = val[k];
07290         const Ordinal j = ind[k];
07291         // NOTE: "if" instead of "while" here is only valid
07292         // if the matrix contains no empty columns.
07293         if (k >= ptr[i+1]) {
07294           // Write temp output from last iteration(s) to Y,
07295           // before incrementing the current row index.
07296           if (k > 0) {
07297             Y_i[0] = tmp;
07298           }
07299           ++i;
07300           // We haven't seen row i before; set Y(i,:) to -Y(i,:).
07301           Y_i = &Y[i];
07302           tmp = -Y[i + 0*colStrideY];
07303         }
07304         tmp -= A_ij * X[j];
07305       }
07306       Y_i[0] = tmp;
07307     }
07308     else if (beta == STS::zero()) {
07309       RangeScalar tmp = Teuchos::ScalarTraits<RangeScalar>::zero();
07310       RangeScalar* Y_i = Y;
07311       Ordinal i = 0;
07312       for (size_t k = 0; k < nnz; ++k) {
07313         const MatrixScalar A_ij = val[k];
07314         const Ordinal j = ind[k];
07315         // NOTE: "if" instead of "while" here is only valid
07316         // if the matrix contains no empty columns.
07317         if (k >= ptr[i+1]) {
07318           // Write temp output from last iteration(s) to Y,
07319           // before incrementing the current row index.
07320           if (k > 0) {
07321             Y_i[0] = tmp;
07322           }
07323           ++i;
07324           // We haven't seen row i before; set Y(i,:) to 0.
07325           Y_i = &Y[i];
07326           tmp = STS::zero();
07327         }
07328         tmp -= A_ij * X[j];
07329       }
07330       Y_i[0] = tmp;
07331     }
07332     else if (beta == STS::one()) {
07333       RangeScalar tmp = Teuchos::ScalarTraits<RangeScalar>::zero();
07334       RangeScalar* Y_i = Y;
07335       Ordinal i = 0;
07336       for (size_t k = 0; k < nnz; ++k) {
07337         const MatrixScalar A_ij = val[k];
07338         const Ordinal j = ind[k];
07339         // NOTE: "if" instead of "while" here is only valid
07340         // if the matrix contains no empty columns.
07341         if (k >= ptr[i+1]) {
07342           // Write temp output from last iteration(s) to Y,
07343           // before incrementing the current row index.
07344           if (k > 0) {
07345             Y_i[0] = tmp;
07346           }
07347           ++i;
07348           // We don't have to set Y(i,:) here, since beta == 1.
07349           Y_i = &Y[i];
07350           tmp = Y[i + 0*colStrideY];
07351         }
07352         tmp -= A_ij * X[j];
07353       }
07354       Y_i[0] = tmp;
07355     }
07356     else { // beta != -1 && beta != 0 && beta != 1
07357       RangeScalar tmp = Teuchos::ScalarTraits<RangeScalar>::zero();
07358       RangeScalar* Y_i = Y;
07359       Ordinal i = 0;
07360       for (size_t k = 0; k < nnz; ++k) {
07361         const MatrixScalar A_ij = val[k];
07362         const Ordinal j = ind[k];
07363         // NOTE: "if" instead of "while" here is only valid
07364         // if the matrix contains no empty columns.
07365         if (k >= ptr[i+1]) {
07366           // Write temp output from last iteration(s) to Y,
07367           // before incrementing the current row index.
07368           if (k > 0) {
07369             Y_i[0] = tmp;
07370           }
07371           ++i;
07372           // We haven't seen row i before; scale Y(i,:) by beta.
07373           Y_i = &Y[i];
07374           tmp = beta * Y[i + 0*colStrideY];
07375         }
07376         tmp -= A_ij * X[j];
07377       }
07378       Y_i[0] = tmp;
07379     }
07380   }
07381   else { // alpha != 1 && alpha != -1
07382     if (beta == -STS::one()) {
07383       RangeScalar tmp = Teuchos::ScalarTraits<RangeScalar>::zero();
07384       RangeScalar* Y_i = Y;
07385       Ordinal i = 0;
07386       for (size_t k = 0; k < nnz; ++k) {
07387         const MatrixScalar A_ij = val[k];
07388         const Ordinal j = ind[k];
07389         // NOTE: "if" instead of "while" here is only valid
07390         // if the matrix contains no empty columns.
07391         if (k >= ptr[i+1]) {
07392           // Write temp output from last iteration(s) to Y,
07393           // before incrementing the current row index.
07394           if (k > 0) {
07395             Y_i[0] = tmp;
07396           }
07397           ++i;
07398           // We haven't seen row i before; set Y(i,:) to -Y(i,:).
07399           Y_i = &Y[i];
07400           tmp = -Y[i + 0*colStrideY];
07401         }
07402         tmp += alpha * A_ij * X[j];
07403       }
07404       Y_i[0] = tmp;
07405     }
07406     else if (beta == STS::zero()) {
07407       RangeScalar tmp = Teuchos::ScalarTraits<RangeScalar>::zero();
07408       RangeScalar* Y_i = Y;
07409       Ordinal i = 0;
07410       for (size_t k = 0; k < nnz; ++k) {
07411         const MatrixScalar A_ij = val[k];
07412         const Ordinal j = ind[k];
07413         // NOTE: "if" instead of "while" here is only valid
07414         // if the matrix contains no empty columns.
07415         if (k >= ptr[i+1]) {
07416           // Write temp output from last iteration(s) to Y,
07417           // before incrementing the current row index.
07418           if (k > 0) {
07419             Y_i[0] = tmp;
07420           }
07421           ++i;
07422           // We haven't seen row i before; set Y(i,:) to 0.
07423           Y_i = &Y[i];
07424           tmp = STS::zero();
07425         }
07426         tmp += alpha * A_ij * X[j];
07427       }
07428       Y_i[0] = tmp;
07429     }
07430     else if (beta == STS::one()) {
07431       RangeScalar tmp = Teuchos::ScalarTraits<RangeScalar>::zero();
07432       RangeScalar* Y_i = Y;
07433       Ordinal i = 0;
07434       for (size_t k = 0; k < nnz; ++k) {
07435         const MatrixScalar A_ij = val[k];
07436         const Ordinal j = ind[k];
07437         // NOTE: "if" instead of "while" here is only valid
07438         // if the matrix contains no empty columns.
07439         if (k >= ptr[i+1]) {
07440           // Write temp output from last iteration(s) to Y,
07441           // before incrementing the current row index.
07442           if (k > 0) {
07443             Y_i[0] = tmp;
07444           }
07445           ++i;
07446           // We don't have to set Y(i,:) here, since beta == 1.
07447           Y_i = &Y[i];
07448           tmp = Y[i + 0*colStrideY];
07449         }
07450         tmp += alpha * A_ij * X[j];
07451       }
07452       Y_i[0] = tmp;
07453     }
07454     else { // beta != -1 && beta != 0 && beta != 1
07455       RangeScalar tmp = Teuchos::ScalarTraits<RangeScalar>::zero();
07456       RangeScalar* Y_i = Y;
07457       Ordinal i = 0;
07458       for (size_t k = 0; k < nnz; ++k) {
07459         const MatrixScalar A_ij = val[k];
07460         const Ordinal j = ind[k];
07461         // NOTE: "if" instead of "while" here is only valid
07462         // if the matrix contains no empty columns.
07463         if (k >= ptr[i+1]) {
07464           // Write temp output from last iteration(s) to Y,
07465           // before incrementing the current row index.
07466           if (k > 0) {
07467             Y_i[0] = tmp;
07468           }
07469           ++i;
07470           // We haven't seen row i before; scale Y(i,:) by beta.
07471           Y_i = &Y[i];
07472           tmp = beta * Y[i + 0*colStrideY];
07473         }
07474         tmp += alpha * A_ij * X[j];
07475       }
07476       Y_i[0] = tmp;
07477     }
07478   }
07479 }
07480 
07481 template<class Ordinal,
07482          class MatrixScalar,
07483          class DomainScalar,
07484          class RangeScalar>
07485 void
07486 matVecCsrColMajorForif2Vec (
07487   const Ordinal numRows,
07488   const Ordinal numCols,
07489   const Ordinal numVecs,
07490   const RangeScalar& beta,
07491   RangeScalar Y[],
07492   const Ordinal colStrideY,
07493   const RangeScalar& alpha,
07494   const size_t  ptr[],
07495   const Ordinal ind[],
07496   const MatrixScalar val[],
07497   const DomainScalar X[],
07498   const Ordinal colStrideX)
07499 {
07500   typedef Teuchos::ScalarTraits<RangeScalar> STS;
07501 
07502   // Algorithm variants 'for-while' and 'for-if' need to set
07503   // Y(0,:) = 0, but only for the special case of CSR.
07504   if (beta != STS::zero()) {
07505     for (Ordinal c = 0; c < numVecs; ++c) {
07506       Y[c*colStrideY] = beta * Y[c*colStrideY];
07507     }
07508   }
07509   else {
07510     // Follow the Sparse BLAS convention for beta == 0. 
07511     for (Ordinal c = 0; c < numVecs; ++c) {
07512       Y[c*colStrideY] = STS::zero();
07513     }
07514   }
07515   if (alpha == STS::zero()) {
07516     // Prescale: Y := beta * Y.
07517     if (beta == STS::zero()) {
07518       for (Ordinal j = 0; j < numVecs; ++j) {
07519         RangeScalar* const Y_j = &Y[j*colStrideY];
07520         for (Ordinal i = 0; i < numRows; ++i) {
07521           // Follow the Sparse BLAS convention for beta == 0. 
07522           Y_j[i] = STS::zero();
07523         }
07524       }
07525     }
07526     else if (beta != STS::one()) {
07527       for (Ordinal j = 0; j < numVecs; ++j) {
07528         RangeScalar* const Y_j = &Y[j*colStrideY];
07529         for (Ordinal i = 0; i < numRows; ++i) {
07530           Y_j[i] = beta * Y_j[i];
07531         }
07532       }
07533     }
07534     return; // Our work is done!
07535   }
07536   const size_t nnz = ptr[numRows];
07537   if (alpha == STS::one()) {
07538     if (beta == -STS::one()) {
07539       RangeScalar tmp[2];
07540       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
07541       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
07542 
07543       RangeScalar* Y_i = Y;
07544       Ordinal i = 0;
07545       for (size_t k = 0; k < nnz; ++k) {
07546         const MatrixScalar A_ij = val[k];
07547         const Ordinal j = ind[k];
07548         // NOTE: "if" instead of "while" here is only valid
07549         // if the matrix contains no empty columns.
07550         if (k >= ptr[i+1]) {
07551           // Write temp output from last iteration(s) to Y,
07552           // before incrementing the current row index.
07553           if (k > 0) {
07554             Y_i[0] = tmp[0];
07555             Y_i[colStrideY] = tmp[1];
07556           }
07557           ++i;
07558           // We haven't seen row i before; set Y(i,:) to -Y(i,:).
07559           Y_i = &Y[i];
07560           tmp[0] = -Y_i[0];
07561           tmp[1] = -Y_i[colStrideY];
07562         }
07563         const DomainScalar* const X_j = &X[j];
07564         tmp[0] += A_ij * X_j[0];
07565         tmp[1] += A_ij * X_j[colStrideX];
07566       }
07567       Y_i[0] = tmp[0];
07568       Y_i[colStrideY] = tmp[1];
07569     }
07570     else if (beta == STS::zero()) {
07571       RangeScalar tmp[2];
07572       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
07573       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
07574 
07575       RangeScalar* Y_i = Y;
07576       Ordinal i = 0;
07577       for (size_t k = 0; k < nnz; ++k) {
07578         const MatrixScalar A_ij = val[k];
07579         const Ordinal j = ind[k];
07580         // NOTE: "if" instead of "while" here is only valid
07581         // if the matrix contains no empty columns.
07582         if (k >= ptr[i+1]) {
07583           // Write temp output from last iteration(s) to Y,
07584           // before incrementing the current row index.
07585           if (k > 0) {
07586             Y_i[0] = tmp[0];
07587             Y_i[colStrideY] = tmp[1];
07588           }
07589           ++i;
07590           // We haven't seen row i before; set Y(i,:) to 0.
07591           Y_i = &Y[i];
07592           tmp[0] = STS::zero();
07593           tmp[1] = STS::zero();
07594         }
07595         const DomainScalar* const X_j = &X[j];
07596         tmp[0] += A_ij * X_j[0];
07597         tmp[1] += A_ij * X_j[colStrideX];
07598       }
07599       Y_i[0] = tmp[0];
07600       Y_i[colStrideY] = tmp[1];
07601     }
07602     else if (beta == STS::one()) {
07603       RangeScalar tmp[2];
07604       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
07605       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
07606 
07607       RangeScalar* Y_i = Y;
07608       Ordinal i = 0;
07609       for (size_t k = 0; k < nnz; ++k) {
07610         const MatrixScalar A_ij = val[k];
07611         const Ordinal j = ind[k];
07612         // NOTE: "if" instead of "while" here is only valid
07613         // if the matrix contains no empty columns.
07614         if (k >= ptr[i+1]) {
07615           // Write temp output from last iteration(s) to Y,
07616           // before incrementing the current row index.
07617           if (k > 0) {
07618             Y_i[0] = tmp[0];
07619             Y_i[colStrideY] = tmp[1];
07620           }
07621           ++i;
07622           // We don't have to set Y(i,:) here, since beta == 1.
07623           Y_i = &Y[i];
07624           tmp[0] = Y_i[0];
07625           tmp[1] = Y_i[colStrideY];
07626         }
07627         const DomainScalar* const X_j = &X[j];
07628         tmp[0] += A_ij * X_j[0];
07629         tmp[1] += A_ij * X_j[colStrideX];
07630       }
07631       Y_i[0] = tmp[0];
07632       Y_i[colStrideY] = tmp[1];
07633     }
07634     else { // beta != -1 && beta != 0 && beta != 1
07635       RangeScalar tmp[2];
07636       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
07637       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
07638 
07639       RangeScalar* Y_i = Y;
07640       Ordinal i = 0;
07641       for (size_t k = 0; k < nnz; ++k) {
07642         const MatrixScalar A_ij = val[k];
07643         const Ordinal j = ind[k];
07644         // NOTE: "if" instead of "while" here is only valid
07645         // if the matrix contains no empty columns.
07646         if (k >= ptr[i+1]) {
07647           // Write temp output from last iteration(s) to Y,
07648           // before incrementing the current row index.
07649           if (k > 0) {
07650             Y_i[0] = tmp[0];
07651             Y_i[colStrideY] = tmp[1];
07652           }
07653           ++i;
07654           // We haven't seen row i before; scale Y(i,:) by beta.
07655           Y_i = &Y[i];
07656           tmp[0] = beta * Y_i[0];
07657           tmp[1] = beta * Y_i[colStrideY];
07658         }
07659         const DomainScalar* const X_j = &X[j];
07660         tmp[0] += A_ij * X_j[0];
07661         tmp[1] += A_ij * X_j[colStrideX];
07662       }
07663       Y_i[0] = tmp[0];
07664       Y_i[colStrideY] = tmp[1];
07665     }
07666   }
07667   else if (alpha == -STS::one()) {
07668     if (beta == -STS::one()) {
07669       RangeScalar tmp[2];
07670       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
07671       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
07672 
07673       RangeScalar* Y_i = Y;
07674       Ordinal i = 0;
07675       for (size_t k = 0; k < nnz; ++k) {
07676         const MatrixScalar A_ij = val[k];
07677         const Ordinal j = ind[k];
07678         // NOTE: "if" instead of "while" here is only valid
07679         // if the matrix contains no empty columns.
07680         if (k >= ptr[i+1]) {
07681           // Write temp output from last iteration(s) to Y,
07682           // before incrementing the current row index.
07683           if (k > 0) {
07684             Y_i[0] = tmp[0];
07685             Y_i[colStrideY] = tmp[1];
07686           }
07687           ++i;
07688           // We haven't seen row i before; set Y(i,:) to -Y(i,:).
07689           Y_i = &Y[i];
07690           tmp[0] = -Y_i[0];
07691           tmp[1] = -Y_i[colStrideY];
07692         }
07693         const DomainScalar* const X_j = &X[j];
07694         tmp[0] -= A_ij * X_j[0];
07695         tmp[1] -= A_ij * X_j[colStrideX];
07696       }
07697       Y_i[0] = tmp[0];
07698       Y_i[colStrideY] = tmp[1];
07699     }
07700     else if (beta == STS::zero()) {
07701       RangeScalar tmp[2];
07702       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
07703       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
07704 
07705       RangeScalar* Y_i = Y;
07706       Ordinal i = 0;
07707       for (size_t k = 0; k < nnz; ++k) {
07708         const MatrixScalar A_ij = val[k];
07709         const Ordinal j = ind[k];
07710         // NOTE: "if" instead of "while" here is only valid
07711         // if the matrix contains no empty columns.
07712         if (k >= ptr[i+1]) {
07713           // Write temp output from last iteration(s) to Y,
07714           // before incrementing the current row index.
07715           if (k > 0) {
07716             Y_i[0] = tmp[0];
07717             Y_i[colStrideY] = tmp[1];
07718           }
07719           ++i;
07720           // We haven't seen row i before; set Y(i,:) to 0.
07721           Y_i = &Y[i];
07722           tmp[0] = STS::zero();
07723           tmp[1] = STS::zero();
07724         }
07725         const DomainScalar* const X_j = &X[j];
07726         tmp[0] -= A_ij * X_j[0];
07727         tmp[1] -= A_ij * X_j[colStrideX];
07728       }
07729       Y_i[0] = tmp[0];
07730       Y_i[colStrideY] = tmp[1];
07731     }
07732     else if (beta == STS::one()) {
07733       RangeScalar tmp[2];
07734       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
07735       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
07736 
07737       RangeScalar* Y_i = Y;
07738       Ordinal i = 0;
07739       for (size_t k = 0; k < nnz; ++k) {
07740         const MatrixScalar A_ij = val[k];
07741         const Ordinal j = ind[k];
07742         // NOTE: "if" instead of "while" here is only valid
07743         // if the matrix contains no empty columns.
07744         if (k >= ptr[i+1]) {
07745           // Write temp output from last iteration(s) to Y,
07746           // before incrementing the current row index.
07747           if (k > 0) {
07748             Y_i[0] = tmp[0];
07749             Y_i[colStrideY] = tmp[1];
07750           }
07751           ++i;
07752           // We don't have to set Y(i,:) here, since beta == 1.
07753           Y_i = &Y[i];
07754           tmp[0] = Y_i[0];
07755           tmp[1] = Y_i[colStrideY];
07756         }
07757         const DomainScalar* const X_j = &X[j];
07758         tmp[0] -= A_ij * X_j[0];
07759         tmp[1] -= A_ij * X_j[colStrideX];
07760       }
07761       Y_i[0] = tmp[0];
07762       Y_i[colStrideY] = tmp[1];
07763     }
07764     else { // beta != -1 && beta != 0 && beta != 1
07765       RangeScalar tmp[2];
07766       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
07767       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
07768 
07769       RangeScalar* Y_i = Y;
07770       Ordinal i = 0;
07771       for (size_t k = 0; k < nnz; ++k) {
07772         const MatrixScalar A_ij = val[k];
07773         const Ordinal j = ind[k];
07774         // NOTE: "if" instead of "while" here is only valid
07775         // if the matrix contains no empty columns.
07776         if (k >= ptr[i+1]) {
07777           // Write temp output from last iteration(s) to Y,
07778           // before incrementing the current row index.
07779           if (k > 0) {
07780             Y_i[0] = tmp[0];
07781             Y_i[colStrideY] = tmp[1];
07782           }
07783           ++i;
07784           // We haven't seen row i before; scale Y(i,:) by beta.
07785           Y_i = &Y[i];
07786           tmp[0] = beta * Y_i[0];
07787           tmp[1] = beta * Y_i[colStrideY];
07788         }
07789         const DomainScalar* const X_j = &X[j];
07790         tmp[0] -= A_ij * X_j[0];
07791         tmp[1] -= A_ij * X_j[colStrideX];
07792       }
07793       Y_i[0] = tmp[0];
07794       Y_i[colStrideY] = tmp[1];
07795     }
07796   }
07797   else { // alpha != 1 && alpha != -1
07798     if (beta == -STS::one()) {
07799       RangeScalar tmp[2];
07800       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
07801       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
07802 
07803       RangeScalar* Y_i = Y;
07804       Ordinal i = 0;
07805       for (size_t k = 0; k < nnz; ++k) {
07806         const MatrixScalar A_ij = val[k];
07807         const Ordinal j = ind[k];
07808         // NOTE: "if" instead of "while" here is only valid
07809         // if the matrix contains no empty columns.
07810         if (k >= ptr[i+1]) {
07811           // Write temp output from last iteration(s) to Y,
07812           // before incrementing the current row index.
07813           if (k > 0) {
07814             Y_i[0] = tmp[0];
07815             Y_i[colStrideY] = tmp[1];
07816           }
07817           ++i;
07818           // We haven't seen row i before; set Y(i,:) to -Y(i,:).
07819           Y_i = &Y[i];
07820           tmp[0] = -Y_i[0];
07821           tmp[1] = -Y_i[colStrideY];
07822         }
07823         const DomainScalar* const X_j = &X[j];
07824         tmp[0] += alpha * A_ij * X_j[0];
07825         tmp[1] += alpha * A_ij * X_j[colStrideX];
07826       }
07827       Y_i[0] = tmp[0];
07828       Y_i[colStrideY] = tmp[1];
07829     }
07830     else if (beta == STS::zero()) {
07831       RangeScalar tmp[2];
07832       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
07833       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
07834 
07835       RangeScalar* Y_i = Y;
07836       Ordinal i = 0;
07837       for (size_t k = 0; k < nnz; ++k) {
07838         const MatrixScalar A_ij = val[k];
07839         const Ordinal j = ind[k];
07840         // NOTE: "if" instead of "while" here is only valid
07841         // if the matrix contains no empty columns.
07842         if (k >= ptr[i+1]) {
07843           // Write temp output from last iteration(s) to Y,
07844           // before incrementing the current row index.
07845           if (k > 0) {
07846             Y_i[0] = tmp[0];
07847             Y_i[colStrideY] = tmp[1];
07848           }
07849           ++i;
07850           // We haven't seen row i before; set Y(i,:) to 0.
07851           Y_i = &Y[i];
07852           tmp[0] = STS::zero();
07853           tmp[1] = STS::zero();
07854         }
07855         const DomainScalar* const X_j = &X[j];
07856         tmp[0] += alpha * A_ij * X_j[0];
07857         tmp[1] += alpha * A_ij * X_j[colStrideX];
07858       }
07859       Y_i[0] = tmp[0];
07860       Y_i[colStrideY] = tmp[1];
07861     }
07862     else if (beta == STS::one()) {
07863       RangeScalar tmp[2];
07864       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
07865       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
07866 
07867       RangeScalar* Y_i = Y;
07868       Ordinal i = 0;
07869       for (size_t k = 0; k < nnz; ++k) {
07870         const MatrixScalar A_ij = val[k];
07871         const Ordinal j = ind[k];
07872         // NOTE: "if" instead of "while" here is only valid
07873         // if the matrix contains no empty columns.
07874         if (k >= ptr[i+1]) {
07875           // Write temp output from last iteration(s) to Y,
07876           // before incrementing the current row index.
07877           if (k > 0) {
07878             Y_i[0] = tmp[0];
07879             Y_i[colStrideY] = tmp[1];
07880           }
07881           ++i;
07882           // We don't have to set Y(i,:) here, since beta == 1.
07883           Y_i = &Y[i];
07884           tmp[0] = Y_i[0];
07885           tmp[1] = Y_i[colStrideY];
07886         }
07887         const DomainScalar* const X_j = &X[j];
07888         tmp[0] += alpha * A_ij * X_j[0];
07889         tmp[1] += alpha * A_ij * X_j[colStrideX];
07890       }
07891       Y_i[0] = tmp[0];
07892       Y_i[colStrideY] = tmp[1];
07893     }
07894     else { // beta != -1 && beta != 0 && beta != 1
07895       RangeScalar tmp[2];
07896       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
07897       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
07898 
07899       RangeScalar* Y_i = Y;
07900       Ordinal i = 0;
07901       for (size_t k = 0; k < nnz; ++k) {
07902         const MatrixScalar A_ij = val[k];
07903         const Ordinal j = ind[k];
07904         // NOTE: "if" instead of "while" here is only valid
07905         // if the matrix contains no empty columns.
07906         if (k >= ptr[i+1]) {
07907           // Write temp output from last iteration(s) to Y,
07908           // before incrementing the current row index.
07909           if (k > 0) {
07910             Y_i[0] = tmp[0];
07911             Y_i[colStrideY] = tmp[1];
07912           }
07913           ++i;
07914           // We haven't seen row i before; scale Y(i,:) by beta.
07915           Y_i = &Y[i];
07916           tmp[0] = beta * Y_i[0];
07917           tmp[1] = beta * Y_i[colStrideY];
07918         }
07919         const DomainScalar* const X_j = &X[j];
07920         tmp[0] += alpha * A_ij * X_j[0];
07921         tmp[1] += alpha * A_ij * X_j[colStrideX];
07922       }
07923       Y_i[0] = tmp[0];
07924       Y_i[colStrideY] = tmp[1];
07925     }
07926   }
07927 }
07928 
07929 template<class Ordinal,
07930          class MatrixScalar,
07931          class DomainScalar,
07932          class RangeScalar>
07933 void
07934 matVecCsrColMajorForif3Vec (
07935   const Ordinal numRows,
07936   const Ordinal numCols,
07937   const Ordinal numVecs,
07938   const RangeScalar& beta,
07939   RangeScalar Y[],
07940   const Ordinal colStrideY,
07941   const RangeScalar& alpha,
07942   const size_t  ptr[],
07943   const Ordinal ind[],
07944   const MatrixScalar val[],
07945   const DomainScalar X[],
07946   const Ordinal colStrideX)
07947 {
07948   typedef Teuchos::ScalarTraits<RangeScalar> STS;
07949 
07950   // Algorithm variants 'for-while' and 'for-if' need to set
07951   // Y(0,:) = 0, but only for the special case of CSR.
07952   if (beta != STS::zero()) {
07953     for (Ordinal c = 0; c < numVecs; ++c) {
07954       Y[c*colStrideY] = beta * Y[c*colStrideY];
07955     }
07956   }
07957   else {
07958     // Follow the Sparse BLAS convention for beta == 0. 
07959     for (Ordinal c = 0; c < numVecs; ++c) {
07960       Y[c*colStrideY] = STS::zero();
07961     }
07962   }
07963   if (alpha == STS::zero()) {
07964     // Prescale: Y := beta * Y.
07965     if (beta == STS::zero()) {
07966       for (Ordinal j = 0; j < numVecs; ++j) {
07967         RangeScalar* const Y_j = &Y[j*colStrideY];
07968         for (Ordinal i = 0; i < numRows; ++i) {
07969           // Follow the Sparse BLAS convention for beta == 0. 
07970           Y_j[i] = STS::zero();
07971         }
07972       }
07973     }
07974     else if (beta != STS::one()) {
07975       for (Ordinal j = 0; j < numVecs; ++j) {
07976         RangeScalar* const Y_j = &Y[j*colStrideY];
07977         for (Ordinal i = 0; i < numRows; ++i) {
07978           Y_j[i] = beta * Y_j[i];
07979         }
07980       }
07981     }
07982     return; // Our work is done!
07983   }
07984   const size_t nnz = ptr[numRows];
07985   if (alpha == STS::one()) {
07986     if (beta == -STS::one()) {
07987       RangeScalar tmp[3];
07988       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
07989       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
07990       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
07991 
07992       RangeScalar* Y_i = Y;
07993       Ordinal i = 0;
07994       for (size_t k = 0; k < nnz; ++k) {
07995         const MatrixScalar A_ij = val[k];
07996         const Ordinal j = ind[k];
07997         // NOTE: "if" instead of "while" here is only valid
07998         // if the matrix contains no empty columns.
07999         if (k >= ptr[i+1]) {
08000           // Write temp output from last iteration(s) to Y,
08001           // before incrementing the current row index.
08002           if (k > 0) {
08003             Y_i[0] = tmp[0];
08004             Y_i[colStrideY] = tmp[1];
08005             Y_i[2*colStrideY] = tmp[2];
08006           }
08007           ++i;
08008           // We haven't seen row i before; set Y(i,:) to -Y(i,:).
08009           Y_i = &Y[i];
08010           tmp[0] = -Y_i[0];
08011           tmp[1] = -Y_i[colStrideY];
08012           tmp[2] = -Y_i[2*colStrideY];
08013         }
08014         const DomainScalar* const X_j = &X[j];
08015         tmp[0] += A_ij * X_j[0];
08016         tmp[1] += A_ij * X_j[colStrideX];
08017         tmp[2] += A_ij * X_j[2*colStrideX];
08018       }
08019       Y_i[0] = tmp[0];
08020       Y_i[colStrideY] = tmp[1];
08021       Y_i[2*colStrideY] = tmp[2];
08022     }
08023     else if (beta == STS::zero()) {
08024       RangeScalar tmp[3];
08025       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
08026       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
08027       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
08028 
08029       RangeScalar* Y_i = Y;
08030       Ordinal i = 0;
08031       for (size_t k = 0; k < nnz; ++k) {
08032         const MatrixScalar A_ij = val[k];
08033         const Ordinal j = ind[k];
08034         // NOTE: "if" instead of "while" here is only valid
08035         // if the matrix contains no empty columns.
08036         if (k >= ptr[i+1]) {
08037           // Write temp output from last iteration(s) to Y,
08038           // before incrementing the current row index.
08039           if (k > 0) {
08040             Y_i[0] = tmp[0];
08041             Y_i[colStrideY] = tmp[1];
08042             Y_i[2*colStrideY] = tmp[2];
08043           }
08044           ++i;
08045           // We haven't seen row i before; set Y(i,:) to 0.
08046           Y_i = &Y[i];
08047           tmp[0] = STS::zero();
08048           tmp[1] = STS::zero();
08049           tmp[2] = STS::zero();
08050         }
08051         const DomainScalar* const X_j = &X[j];
08052         tmp[0] += A_ij * X_j[0];
08053         tmp[1] += A_ij * X_j[colStrideX];
08054         tmp[2] += A_ij * X_j[2*colStrideX];
08055       }
08056       Y_i[0] = tmp[0];
08057       Y_i[colStrideY] = tmp[1];
08058       Y_i[2*colStrideY] = tmp[2];
08059     }
08060     else if (beta == STS::one()) {
08061       RangeScalar tmp[3];
08062       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
08063       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
08064       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
08065 
08066       RangeScalar* Y_i = Y;
08067       Ordinal i = 0;
08068       for (size_t k = 0; k < nnz; ++k) {
08069         const MatrixScalar A_ij = val[k];
08070         const Ordinal j = ind[k];
08071         // NOTE: "if" instead of "while" here is only valid
08072         // if the matrix contains no empty columns.
08073         if (k >= ptr[i+1]) {
08074           // Write temp output from last iteration(s) to Y,
08075           // before incrementing the current row index.
08076           if (k > 0) {
08077             Y_i[0] = tmp[0];
08078             Y_i[colStrideY] = tmp[1];
08079             Y_i[2*colStrideY] = tmp[2];
08080           }
08081           ++i;
08082           // We don't have to set Y(i,:) here, since beta == 1.
08083           Y_i = &Y[i];
08084           tmp[0] = Y_i[0];
08085           tmp[1] = Y_i[colStrideY];
08086           tmp[2] = Y_i[2*colStrideY];
08087         }
08088         const DomainScalar* const X_j = &X[j];
08089         tmp[0] += A_ij * X_j[0];
08090         tmp[1] += A_ij * X_j[colStrideX];
08091         tmp[2] += A_ij * X_j[2*colStrideX];
08092       }
08093       Y_i[0] = tmp[0];
08094       Y_i[colStrideY] = tmp[1];
08095       Y_i[2*colStrideY] = tmp[2];
08096     }
08097     else { // beta != -1 && beta != 0 && beta != 1
08098       RangeScalar tmp[3];
08099       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
08100       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
08101       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
08102 
08103       RangeScalar* Y_i = Y;
08104       Ordinal i = 0;
08105       for (size_t k = 0; k < nnz; ++k) {
08106         const MatrixScalar A_ij = val[k];
08107         const Ordinal j = ind[k];
08108         // NOTE: "if" instead of "while" here is only valid
08109         // if the matrix contains no empty columns.
08110         if (k >= ptr[i+1]) {
08111           // Write temp output from last iteration(s) to Y,
08112           // before incrementing the current row index.
08113           if (k > 0) {
08114             Y_i[0] = tmp[0];
08115             Y_i[colStrideY] = tmp[1];
08116             Y_i[2*colStrideY] = tmp[2];
08117           }
08118           ++i;
08119           // We haven't seen row i before; scale Y(i,:) by beta.
08120           Y_i = &Y[i];
08121           tmp[0] = beta * Y_i[0];
08122           tmp[1] = beta * Y_i[colStrideY];
08123           tmp[2] = beta * Y_i[2*colStrideY];
08124         }
08125         const DomainScalar* const X_j = &X[j];
08126         tmp[0] += A_ij * X_j[0];
08127         tmp[1] += A_ij * X_j[colStrideX];
08128         tmp[2] += A_ij * X_j[2*colStrideX];
08129       }
08130       Y_i[0] = tmp[0];
08131       Y_i[colStrideY] = tmp[1];
08132       Y_i[2*colStrideY] = tmp[2];
08133     }
08134   }
08135   else if (alpha == -STS::one()) {
08136     if (beta == -STS::one()) {
08137       RangeScalar tmp[3];
08138       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
08139       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
08140       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
08141 
08142       RangeScalar* Y_i = Y;
08143       Ordinal i = 0;
08144       for (size_t k = 0; k < nnz; ++k) {
08145         const MatrixScalar A_ij = val[k];
08146         const Ordinal j = ind[k];
08147         // NOTE: "if" instead of "while" here is only valid
08148         // if the matrix contains no empty columns.
08149         if (k >= ptr[i+1]) {
08150           // Write temp output from last iteration(s) to Y,
08151           // before incrementing the current row index.
08152           if (k > 0) {
08153             Y_i[0] = tmp[0];
08154             Y_i[colStrideY] = tmp[1];
08155             Y_i[2*colStrideY] = tmp[2];
08156           }
08157           ++i;
08158           // We haven't seen row i before; set Y(i,:) to -Y(i,:).
08159           Y_i = &Y[i];
08160           tmp[0] = -Y_i[0];
08161           tmp[1] = -Y_i[colStrideY];
08162           tmp[2] = -Y_i[2*colStrideY];
08163         }
08164         const DomainScalar* const X_j = &X[j];
08165         tmp[0] -= A_ij * X_j[0];
08166         tmp[1] -= A_ij * X_j[colStrideX];
08167         tmp[2] -= A_ij * X_j[2*colStrideX];
08168       }
08169       Y_i[0] = tmp[0];
08170       Y_i[colStrideY] = tmp[1];
08171       Y_i[2*colStrideY] = tmp[2];
08172     }
08173     else if (beta == STS::zero()) {
08174       RangeScalar tmp[3];
08175       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
08176       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
08177       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
08178 
08179       RangeScalar* Y_i = Y;
08180       Ordinal i = 0;
08181       for (size_t k = 0; k < nnz; ++k) {
08182         const MatrixScalar A_ij = val[k];
08183         const Ordinal j = ind[k];
08184         // NOTE: "if" instead of "while" here is only valid
08185         // if the matrix contains no empty columns.
08186         if (k >= ptr[i+1]) {
08187           // Write temp output from last iteration(s) to Y,
08188           // before incrementing the current row index.
08189           if (k > 0) {
08190             Y_i[0] = tmp[0];
08191             Y_i[colStrideY] = tmp[1];
08192             Y_i[2*colStrideY] = tmp[2];
08193           }
08194           ++i;
08195           // We haven't seen row i before; set Y(i,:) to 0.
08196           Y_i = &Y[i];
08197           tmp[0] = STS::zero();
08198           tmp[1] = STS::zero();
08199           tmp[2] = STS::zero();
08200         }
08201         const DomainScalar* const X_j = &X[j];
08202         tmp[0] -= A_ij * X_j[0];
08203         tmp[1] -= A_ij * X_j[colStrideX];
08204         tmp[2] -= A_ij * X_j[2*colStrideX];
08205       }
08206       Y_i[0] = tmp[0];
08207       Y_i[colStrideY] = tmp[1];
08208       Y_i[2*colStrideY] = tmp[2];
08209     }
08210     else if (beta == STS::one()) {
08211       RangeScalar tmp[3];
08212       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
08213       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
08214       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
08215 
08216       RangeScalar* Y_i = Y;
08217       Ordinal i = 0;
08218       for (size_t k = 0; k < nnz; ++k) {
08219         const MatrixScalar A_ij = val[k];
08220         const Ordinal j = ind[k];
08221         // NOTE: "if" instead of "while" here is only valid
08222         // if the matrix contains no empty columns.
08223         if (k >= ptr[i+1]) {
08224           // Write temp output from last iteration(s) to Y,
08225           // before incrementing the current row index.
08226           if (k > 0) {
08227             Y_i[0] = tmp[0];
08228             Y_i[colStrideY] = tmp[1];
08229             Y_i[2*colStrideY] = tmp[2];
08230           }
08231           ++i;
08232           // We don't have to set Y(i,:) here, since beta == 1.
08233           Y_i = &Y[i];
08234           tmp[0] = Y_i[0];
08235           tmp[1] = Y_i[colStrideY];
08236           tmp[2] = Y_i[2*colStrideY];
08237         }
08238         const DomainScalar* const X_j = &X[j];
08239         tmp[0] -= A_ij * X_j[0];
08240         tmp[1] -= A_ij * X_j[colStrideX];
08241         tmp[2] -= A_ij * X_j[2*colStrideX];
08242       }
08243       Y_i[0] = tmp[0];
08244       Y_i[colStrideY] = tmp[1];
08245       Y_i[2*colStrideY] = tmp[2];
08246     }
08247     else { // beta != -1 && beta != 0 && beta != 1
08248       RangeScalar tmp[3];
08249       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
08250       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
08251       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
08252 
08253       RangeScalar* Y_i = Y;
08254       Ordinal i = 0;
08255       for (size_t k = 0; k < nnz; ++k) {
08256         const MatrixScalar A_ij = val[k];
08257         const Ordinal j = ind[k];
08258         // NOTE: "if" instead of "while" here is only valid
08259         // if the matrix contains no empty columns.
08260         if (k >= ptr[i+1]) {
08261           // Write temp output from last iteration(s) to Y,
08262           // before incrementing the current row index.
08263           if (k > 0) {
08264             Y_i[0] = tmp[0];
08265             Y_i[colStrideY] = tmp[1];
08266             Y_i[2*colStrideY] = tmp[2];
08267           }
08268           ++i;
08269           // We haven't seen row i before; scale Y(i,:) by beta.
08270           Y_i = &Y[i];
08271           tmp[0] = beta * Y_i[0];
08272           tmp[1] = beta * Y_i[colStrideY];
08273           tmp[2] = beta * Y_i[2*colStrideY];
08274         }
08275         const DomainScalar* const X_j = &X[j];
08276         tmp[0] -= A_ij * X_j[0];
08277         tmp[1] -= A_ij * X_j[colStrideX];
08278         tmp[2] -= A_ij * X_j[2*colStrideX];
08279       }
08280       Y_i[0] = tmp[0];
08281       Y_i[colStrideY] = tmp[1];
08282       Y_i[2*colStrideY] = tmp[2];
08283     }
08284   }
08285   else { // alpha != 1 && alpha != -1
08286     if (beta == -STS::one()) {
08287       RangeScalar tmp[3];
08288       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
08289       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
08290       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
08291 
08292       RangeScalar* Y_i = Y;
08293       Ordinal i = 0;
08294       for (size_t k = 0; k < nnz; ++k) {
08295         const MatrixScalar A_ij = val[k];
08296         const Ordinal j = ind[k];
08297         // NOTE: "if" instead of "while" here is only valid
08298         // if the matrix contains no empty columns.
08299         if (k >= ptr[i+1]) {
08300           // Write temp output from last iteration(s) to Y,
08301           // before incrementing the current row index.
08302           if (k > 0) {
08303             Y_i[0] = tmp[0];
08304             Y_i[colStrideY] = tmp[1];
08305             Y_i[2*colStrideY] = tmp[2];
08306           }
08307           ++i;
08308           // We haven't seen row i before; set Y(i,:) to -Y(i,:).
08309           Y_i = &Y[i];
08310           tmp[0] = -Y_i[0];
08311           tmp[1] = -Y_i[colStrideY];
08312           tmp[2] = -Y_i[2*colStrideY];
08313         }
08314         const DomainScalar* const X_j = &X[j];
08315         tmp[0] += alpha * A_ij * X_j[0];
08316         tmp[1] += alpha * A_ij * X_j[colStrideX];
08317         tmp[2] += alpha * A_ij * X_j[2*colStrideX];
08318       }
08319       Y_i[0] = tmp[0];
08320       Y_i[colStrideY] = tmp[1];
08321       Y_i[2*colStrideY] = tmp[2];
08322     }
08323     else if (beta == STS::zero()) {
08324       RangeScalar tmp[3];
08325       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
08326       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
08327       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
08328 
08329       RangeScalar* Y_i = Y;
08330       Ordinal i = 0;
08331       for (size_t k = 0; k < nnz; ++k) {
08332         const MatrixScalar A_ij = val[k];
08333         const Ordinal j = ind[k];
08334         // NOTE: "if" instead of "while" here is only valid
08335         // if the matrix contains no empty columns.
08336         if (k >= ptr[i+1]) {
08337           // Write temp output from last iteration(s) to Y,
08338           // before incrementing the current row index.
08339           if (k > 0) {
08340             Y_i[0] = tmp[0];
08341             Y_i[colStrideY] = tmp[1];
08342             Y_i[2*colStrideY] = tmp[2];
08343           }
08344           ++i;
08345           // We haven't seen row i before; set Y(i,:) to 0.
08346           Y_i = &Y[i];
08347           tmp[0] = STS::zero();
08348           tmp[1] = STS::zero();
08349           tmp[2] = STS::zero();
08350         }
08351         const DomainScalar* const X_j = &X[j];
08352         tmp[0] += alpha * A_ij * X_j[0];
08353         tmp[1] += alpha * A_ij * X_j[colStrideX];
08354         tmp[2] += alpha * A_ij * X_j[2*colStrideX];
08355       }
08356       Y_i[0] = tmp[0];
08357       Y_i[colStrideY] = tmp[1];
08358       Y_i[2*colStrideY] = tmp[2];
08359     }
08360     else if (beta == STS::one()) {
08361       RangeScalar tmp[3];
08362       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
08363       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
08364       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
08365 
08366       RangeScalar* Y_i = Y;
08367       Ordinal i = 0;
08368       for (size_t k = 0; k < nnz; ++k) {
08369         const MatrixScalar A_ij = val[k];
08370         const Ordinal j = ind[k];
08371         // NOTE: "if" instead of "while" here is only valid
08372         // if the matrix contains no empty columns.
08373         if (k >= ptr[i+1]) {
08374           // Write temp output from last iteration(s) to Y,
08375           // before incrementing the current row index.
08376           if (k > 0) {
08377             Y_i[0] = tmp[0];
08378             Y_i[colStrideY] = tmp[1];
08379             Y_i[2*colStrideY] = tmp[2];
08380           }
08381           ++i;
08382           // We don't have to set Y(i,:) here, since beta == 1.
08383           Y_i = &Y[i];
08384           tmp[0] = Y_i[0];
08385           tmp[1] = Y_i[colStrideY];
08386           tmp[2] = Y_i[2*colStrideY];
08387         }
08388         const DomainScalar* const X_j = &X[j];
08389         tmp[0] += alpha * A_ij * X_j[0];
08390         tmp[1] += alpha * A_ij * X_j[colStrideX];
08391         tmp[2] += alpha * A_ij * X_j[2*colStrideX];
08392       }
08393       Y_i[0] = tmp[0];
08394       Y_i[colStrideY] = tmp[1];
08395       Y_i[2*colStrideY] = tmp[2];
08396     }
08397     else { // beta != -1 && beta != 0 && beta != 1
08398       RangeScalar tmp[3];
08399       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
08400       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
08401       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
08402 
08403       RangeScalar* Y_i = Y;
08404       Ordinal i = 0;
08405       for (size_t k = 0; k < nnz; ++k) {
08406         const MatrixScalar A_ij = val[k];
08407         const Ordinal j = ind[k];
08408         // NOTE: "if" instead of "while" here is only valid
08409         // if the matrix contains no empty columns.
08410         if (k >= ptr[i+1]) {
08411           // Write temp output from last iteration(s) to Y,
08412           // before incrementing the current row index.
08413           if (k > 0) {
08414             Y_i[0] = tmp[0];
08415             Y_i[colStrideY] = tmp[1];
08416             Y_i[2*colStrideY] = tmp[2];
08417           }
08418           ++i;
08419           // We haven't seen row i before; scale Y(i,:) by beta.
08420           Y_i = &Y[i];
08421           tmp[0] = beta * Y_i[0];
08422           tmp[1] = beta * Y_i[colStrideY];
08423           tmp[2] = beta * Y_i[2*colStrideY];
08424         }
08425         const DomainScalar* const X_j = &X[j];
08426         tmp[0] += alpha * A_ij * X_j[0];
08427         tmp[1] += alpha * A_ij * X_j[colStrideX];
08428         tmp[2] += alpha * A_ij * X_j[2*colStrideX];
08429       }
08430       Y_i[0] = tmp[0];
08431       Y_i[colStrideY] = tmp[1];
08432       Y_i[2*colStrideY] = tmp[2];
08433     }
08434   }
08435 }
08436 
08437 template<class Ordinal,
08438          class MatrixScalar,
08439          class DomainScalar,
08440          class RangeScalar>
08441 void
08442 matVecCsrColMajorForif4Vec (
08443   const Ordinal numRows,
08444   const Ordinal numCols,
08445   const Ordinal numVecs,
08446   const RangeScalar& beta,
08447   RangeScalar Y[],
08448   const Ordinal colStrideY,
08449   const RangeScalar& alpha,
08450   const size_t  ptr[],
08451   const Ordinal ind[],
08452   const MatrixScalar val[],
08453   const DomainScalar X[],
08454   const Ordinal colStrideX)
08455 {
08456   typedef Teuchos::ScalarTraits<RangeScalar> STS;
08457 
08458   // Algorithm variants 'for-while' and 'for-if' need to set
08459   // Y(0,:) = 0, but only for the special case of CSR.
08460   if (beta != STS::zero()) {
08461     for (Ordinal c = 0; c < numVecs; ++c) {
08462       Y[c*colStrideY] = beta * Y[c*colStrideY];
08463     }
08464   }
08465   else {
08466     // Follow the Sparse BLAS convention for beta == 0. 
08467     for (Ordinal c = 0; c < numVecs; ++c) {
08468       Y[c*colStrideY] = STS::zero();
08469     }
08470   }
08471   if (alpha == STS::zero()) {
08472     // Prescale: Y := beta * Y.
08473     if (beta == STS::zero()) {
08474       for (Ordinal j = 0; j < numVecs; ++j) {
08475         RangeScalar* const Y_j = &Y[j*colStrideY];
08476         for (Ordinal i = 0; i < numRows; ++i) {
08477           // Follow the Sparse BLAS convention for beta == 0. 
08478           Y_j[i] = STS::zero();
08479         }
08480       }
08481     }
08482     else if (beta != STS::one()) {
08483       for (Ordinal j = 0; j < numVecs; ++j) {
08484         RangeScalar* const Y_j = &Y[j*colStrideY];
08485         for (Ordinal i = 0; i < numRows; ++i) {
08486           Y_j[i] = beta * Y_j[i];
08487         }
08488       }
08489     }
08490     return; // Our work is done!
08491   }
08492   const size_t nnz = ptr[numRows];
08493   if (alpha == STS::one()) {
08494     if (beta == -STS::one()) {
08495       RangeScalar tmp[4];
08496       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
08497       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
08498       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
08499       tmp[3] = Teuchos::ScalarTraits<RangeScalar>::zero();
08500 
08501       RangeScalar* Y_i = Y;
08502       Ordinal i = 0;
08503       for (size_t k = 0; k < nnz; ++k) {
08504         const MatrixScalar A_ij = val[k];
08505         const Ordinal j = ind[k];
08506         // NOTE: "if" instead of "while" here is only valid
08507         // if the matrix contains no empty columns.
08508         if (k >= ptr[i+1]) {
08509           // Write temp output from last iteration(s) to Y,
08510           // before incrementing the current row index.
08511           if (k > 0) {
08512             Y_i[0] = tmp[0];
08513             Y_i[colStrideY] = tmp[1];
08514             Y_i[2*colStrideY] = tmp[2];
08515             Y_i[3*colStrideY] = tmp[3];
08516           }
08517           ++i;
08518           // We haven't seen row i before; set Y(i,:) to -Y(i,:).
08519           Y_i = &Y[i];
08520           tmp[0] = -Y_i[0];
08521           tmp[1] = -Y_i[colStrideY];
08522           tmp[2] = -Y_i[2*colStrideY];
08523           tmp[3] = -Y_i[3*colStrideY];
08524         }
08525         const DomainScalar* const X_j = &X[j];
08526         tmp[0] += A_ij * X_j[0];
08527         tmp[1] += A_ij * X_j[colStrideX];
08528         tmp[2] += A_ij * X_j[2*colStrideX];
08529         tmp[3] += A_ij * X_j[3*colStrideX];
08530       }
08531       Y_i[0] = tmp[0];
08532       Y_i[colStrideY] = tmp[1];
08533       Y_i[2*colStrideY] = tmp[2];
08534       Y_i[3*colStrideY] = tmp[3];
08535     }
08536     else if (beta == STS::zero()) {
08537       RangeScalar tmp[4];
08538       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
08539       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
08540       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
08541       tmp[3] = Teuchos::ScalarTraits<RangeScalar>::zero();
08542 
08543       RangeScalar* Y_i = Y;
08544       Ordinal i = 0;
08545       for (size_t k = 0; k < nnz; ++k) {
08546         const MatrixScalar A_ij = val[k];
08547         const Ordinal j = ind[k];
08548         // NOTE: "if" instead of "while" here is only valid
08549         // if the matrix contains no empty columns.
08550         if (k >= ptr[i+1]) {
08551           // Write temp output from last iteration(s) to Y,
08552           // before incrementing the current row index.
08553           if (k > 0) {
08554             Y_i[0] = tmp[0];
08555             Y_i[colStrideY] = tmp[1];
08556             Y_i[2*colStrideY] = tmp[2];
08557             Y_i[3*colStrideY] = tmp[3];
08558           }
08559           ++i;
08560           // We haven't seen row i before; set Y(i,:) to 0.
08561           Y_i = &Y[i];
08562           tmp[0] = STS::zero();
08563           tmp[1] = STS::zero();
08564           tmp[2] = STS::zero();
08565           tmp[3] = STS::zero();
08566         }
08567         const DomainScalar* const X_j = &X[j];
08568         tmp[0] += A_ij * X_j[0];
08569         tmp[1] += A_ij * X_j[colStrideX];
08570         tmp[2] += A_ij * X_j[2*colStrideX];
08571         tmp[3] += A_ij * X_j[3*colStrideX];
08572       }
08573       Y_i[0] = tmp[0];
08574       Y_i[colStrideY] = tmp[1];
08575       Y_i[2*colStrideY] = tmp[2];
08576       Y_i[3*colStrideY] = tmp[3];
08577     }
08578     else if (beta == STS::one()) {
08579       RangeScalar tmp[4];
08580       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
08581       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
08582       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
08583       tmp[3] = Teuchos::ScalarTraits<RangeScalar>::zero();
08584 
08585       RangeScalar* Y_i = Y;
08586       Ordinal i = 0;
08587       for (size_t k = 0; k < nnz; ++k) {
08588         const MatrixScalar A_ij = val[k];
08589         const Ordinal j = ind[k];
08590         // NOTE: "if" instead of "while" here is only valid
08591         // if the matrix contains no empty columns.
08592         if (k >= ptr[i+1]) {
08593           // Write temp output from last iteration(s) to Y,
08594           // before incrementing the current row index.
08595           if (k > 0) {
08596             Y_i[0] = tmp[0];
08597             Y_i[colStrideY] = tmp[1];
08598             Y_i[2*colStrideY] = tmp[2];
08599             Y_i[3*colStrideY] = tmp[3];
08600           }
08601           ++i;
08602           // We don't have to set Y(i,:) here, since beta == 1.
08603           Y_i = &Y[i];
08604           tmp[0] = Y_i[0];
08605           tmp[1] = Y_i[colStrideY];
08606           tmp[2] = Y_i[2*colStrideY];
08607           tmp[3] = Y_i[3*colStrideY];
08608         }
08609         const DomainScalar* const X_j = &X[j];
08610         tmp[0] += A_ij * X_j[0];
08611         tmp[1] += A_ij * X_j[colStrideX];
08612         tmp[2] += A_ij * X_j[2*colStrideX];
08613         tmp[3] += A_ij * X_j[3*colStrideX];
08614       }
08615       Y_i[0] = tmp[0];
08616       Y_i[colStrideY] = tmp[1];
08617       Y_i[2*colStrideY] = tmp[2];
08618       Y_i[3*colStrideY] = tmp[3];
08619     }
08620     else { // beta != -1 && beta != 0 && beta != 1
08621       RangeScalar tmp[4];
08622       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
08623       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
08624       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
08625       tmp[3] = Teuchos::ScalarTraits<RangeScalar>::zero();
08626 
08627       RangeScalar* Y_i = Y;
08628       Ordinal i = 0;
08629       for (size_t k = 0; k < nnz; ++k) {
08630         const MatrixScalar A_ij = val[k];
08631         const Ordinal j = ind[k];
08632         // NOTE: "if" instead of "while" here is only valid
08633         // if the matrix contains no empty columns.
08634         if (k >= ptr[i+1]) {
08635           // Write temp output from last iteration(s) to Y,
08636           // before incrementing the current row index.
08637           if (k > 0) {
08638             Y_i[0] = tmp[0];
08639             Y_i[colStrideY] = tmp[1];
08640             Y_i[2*colStrideY] = tmp[2];
08641             Y_i[3*colStrideY] = tmp[3];
08642           }
08643           ++i;
08644           // We haven't seen row i before; scale Y(i,:) by beta.
08645           Y_i = &Y[i];
08646           tmp[0] = beta * Y_i[0];
08647           tmp[1] = beta * Y_i[colStrideY];
08648           tmp[2] = beta * Y_i[2*colStrideY];
08649           tmp[3] = beta * Y_i[3*colStrideY];
08650         }
08651         const DomainScalar* const X_j = &X[j];
08652         tmp[0] += A_ij * X_j[0];
08653         tmp[1] += A_ij * X_j[colStrideX];
08654         tmp[2] += A_ij * X_j[2*colStrideX];
08655         tmp[3] += A_ij * X_j[3*colStrideX];
08656       }
08657       Y_i[0] = tmp[0];
08658       Y_i[colStrideY] = tmp[1];
08659       Y_i[2*colStrideY] = tmp[2];
08660       Y_i[3*colStrideY] = tmp[3];
08661     }
08662   }
08663   else if (alpha == -STS::one()) {
08664     if (beta == -STS::one()) {
08665       RangeScalar tmp[4];
08666       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
08667       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
08668       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
08669       tmp[3] = Teuchos::ScalarTraits<RangeScalar>::zero();
08670 
08671       RangeScalar* Y_i = Y;
08672       Ordinal i = 0;
08673       for (size_t k = 0; k < nnz; ++k) {
08674         const MatrixScalar A_ij = val[k];
08675         const Ordinal j = ind[k];
08676         // NOTE: "if" instead of "while" here is only valid
08677         // if the matrix contains no empty columns.
08678         if (k >= ptr[i+1]) {
08679           // Write temp output from last iteration(s) to Y,
08680           // before incrementing the current row index.
08681           if (k > 0) {
08682             Y_i[0] = tmp[0];
08683             Y_i[colStrideY] = tmp[1];
08684             Y_i[2*colStrideY] = tmp[2];
08685             Y_i[3*colStrideY] = tmp[3];
08686           }
08687           ++i;
08688           // We haven't seen row i before; set Y(i,:) to -Y(i,:).
08689           Y_i = &Y[i];
08690           tmp[0] = -Y_i[0];
08691           tmp[1] = -Y_i[colStrideY];
08692           tmp[2] = -Y_i[2*colStrideY];
08693           tmp[3] = -Y_i[3*colStrideY];
08694         }
08695         const DomainScalar* const X_j = &X[j];
08696         tmp[0] -= A_ij * X_j[0];
08697         tmp[1] -= A_ij * X_j[colStrideX];
08698         tmp[2] -= A_ij * X_j[2*colStrideX];
08699         tmp[3] -= A_ij * X_j[3*colStrideX];
08700       }
08701       Y_i[0] = tmp[0];
08702       Y_i[colStrideY] = tmp[1];
08703       Y_i[2*colStrideY] = tmp[2];
08704       Y_i[3*colStrideY] = tmp[3];
08705     }
08706     else if (beta == STS::zero()) {
08707       RangeScalar tmp[4];
08708       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
08709       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
08710       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
08711       tmp[3] = Teuchos::ScalarTraits<RangeScalar>::zero();
08712 
08713       RangeScalar* Y_i = Y;
08714       Ordinal i = 0;
08715       for (size_t k = 0; k < nnz; ++k) {
08716         const MatrixScalar A_ij = val[k];
08717         const Ordinal j = ind[k];
08718         // NOTE: "if" instead of "while" here is only valid
08719         // if the matrix contains no empty columns.
08720         if (k >= ptr[i+1]) {
08721           // Write temp output from last iteration(s) to Y,
08722           // before incrementing the current row index.
08723           if (k > 0) {
08724             Y_i[0] = tmp[0];
08725             Y_i[colStrideY] = tmp[1];
08726             Y_i[2*colStrideY] = tmp[2];
08727             Y_i[3*colStrideY] = tmp[3];
08728           }
08729           ++i;
08730           // We haven't seen row i before; set Y(i,:) to 0.
08731           Y_i = &Y[i];
08732           tmp[0] = STS::zero();
08733           tmp[1] = STS::zero();
08734           tmp[2] = STS::zero();
08735           tmp[3] = STS::zero();
08736         }
08737         const DomainScalar* const X_j = &X[j];
08738         tmp[0] -= A_ij * X_j[0];
08739         tmp[1] -= A_ij * X_j[colStrideX];
08740         tmp[2] -= A_ij * X_j[2*colStrideX];
08741         tmp[3] -= A_ij * X_j[3*colStrideX];
08742       }
08743       Y_i[0] = tmp[0];
08744       Y_i[colStrideY] = tmp[1];
08745       Y_i[2*colStrideY] = tmp[2];
08746       Y_i[3*colStrideY] = tmp[3];
08747     }
08748     else if (beta == STS::one()) {
08749       RangeScalar tmp[4];
08750       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
08751       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
08752       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
08753       tmp[3] = Teuchos::ScalarTraits<RangeScalar>::zero();
08754 
08755       RangeScalar* Y_i = Y;
08756       Ordinal i = 0;
08757       for (size_t k = 0; k < nnz; ++k) {
08758         const MatrixScalar A_ij = val[k];
08759         const Ordinal j = ind[k];
08760         // NOTE: "if" instead of "while" here is only valid
08761         // if the matrix contains no empty columns.
08762         if (k >= ptr[i+1]) {
08763           // Write temp output from last iteration(s) to Y,
08764           // before incrementing the current row index.
08765           if (k > 0) {
08766             Y_i[0] = tmp[0];
08767             Y_i[colStrideY] = tmp[1];
08768             Y_i[2*colStrideY] = tmp[2];
08769             Y_i[3*colStrideY] = tmp[3];
08770           }
08771           ++i;
08772           // We don't have to set Y(i,:) here, since beta == 1.
08773           Y_i = &Y[i];
08774           tmp[0] = Y_i[0];
08775           tmp[1] = Y_i[colStrideY];
08776           tmp[2] = Y_i[2*colStrideY];
08777           tmp[3] = Y_i[3*colStrideY];
08778         }
08779         const DomainScalar* const X_j = &X[j];
08780         tmp[0] -= A_ij * X_j[0];
08781         tmp[1] -= A_ij * X_j[colStrideX];
08782         tmp[2] -= A_ij * X_j[2*colStrideX];
08783         tmp[3] -= A_ij * X_j[3*colStrideX];
08784       }
08785       Y_i[0] = tmp[0];
08786       Y_i[colStrideY] = tmp[1];
08787       Y_i[2*colStrideY] = tmp[2];
08788       Y_i[3*colStrideY] = tmp[3];
08789     }
08790     else { // beta != -1 && beta != 0 && beta != 1
08791       RangeScalar tmp[4];
08792       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
08793       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
08794       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
08795       tmp[3] = Teuchos::ScalarTraits<RangeScalar>::zero();
08796 
08797       RangeScalar* Y_i = Y;
08798       Ordinal i = 0;
08799       for (size_t k = 0; k < nnz; ++k) {
08800         const MatrixScalar A_ij = val[k];
08801         const Ordinal j = ind[k];
08802         // NOTE: "if" instead of "while" here is only valid
08803         // if the matrix contains no empty columns.
08804         if (k >= ptr[i+1]) {
08805           // Write temp output from last iteration(s) to Y,
08806           // before incrementing the current row index.
08807           if (k > 0) {
08808             Y_i[0] = tmp[0];
08809             Y_i[colStrideY] = tmp[1];
08810             Y_i[2*colStrideY] = tmp[2];
08811             Y_i[3*colStrideY] = tmp[3];
08812           }
08813           ++i;
08814           // We haven't seen row i before; scale Y(i,:) by beta.
08815           Y_i = &Y[i];
08816           tmp[0] = beta * Y_i[0];
08817           tmp[1] = beta * Y_i[colStrideY];
08818           tmp[2] = beta * Y_i[2*colStrideY];
08819           tmp[3] = beta * Y_i[3*colStrideY];
08820         }
08821         const DomainScalar* const X_j = &X[j];
08822         tmp[0] -= A_ij * X_j[0];
08823         tmp[1] -= A_ij * X_j[colStrideX];
08824         tmp[2] -= A_ij * X_j[2*colStrideX];
08825         tmp[3] -= A_ij * X_j[3*colStrideX];
08826       }
08827       Y_i[0] = tmp[0];
08828       Y_i[colStrideY] = tmp[1];
08829       Y_i[2*colStrideY] = tmp[2];
08830       Y_i[3*colStrideY] = tmp[3];
08831     }
08832   }
08833   else { // alpha != 1 && alpha != -1
08834     if (beta == -STS::one()) {
08835       RangeScalar tmp[4];
08836       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
08837       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
08838       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
08839       tmp[3] = Teuchos::ScalarTraits<RangeScalar>::zero();
08840 
08841       RangeScalar* Y_i = Y;
08842       Ordinal i = 0;
08843       for (size_t k = 0; k < nnz; ++k) {
08844         const MatrixScalar A_ij = val[k];
08845         const Ordinal j = ind[k];
08846         // NOTE: "if" instead of "while" here is only valid
08847         // if the matrix contains no empty columns.
08848         if (k >= ptr[i+1]) {
08849           // Write temp output from last iteration(s) to Y,
08850           // before incrementing the current row index.
08851           if (k > 0) {
08852             Y_i[0] = tmp[0];
08853             Y_i[colStrideY] = tmp[1];
08854             Y_i[2*colStrideY] = tmp[2];
08855             Y_i[3*colStrideY] = tmp[3];
08856           }
08857           ++i;
08858           // We haven't seen row i before; set Y(i,:) to -Y(i,:).
08859           Y_i = &Y[i];
08860           tmp[0] = -Y_i[0];
08861           tmp[1] = -Y_i[colStrideY];
08862           tmp[2] = -Y_i[2*colStrideY];
08863           tmp[3] = -Y_i[3*colStrideY];
08864         }
08865         const DomainScalar* const X_j = &X[j];
08866         tmp[0] += alpha * A_ij * X_j[0];
08867         tmp[1] += alpha * A_ij * X_j[colStrideX];
08868         tmp[2] += alpha * A_ij * X_j[2*colStrideX];
08869         tmp[3] += alpha * A_ij * X_j[3*colStrideX];
08870       }
08871       Y_i[0] = tmp[0];
08872       Y_i[colStrideY] = tmp[1];
08873       Y_i[2*colStrideY] = tmp[2];
08874       Y_i[3*colStrideY] = tmp[3];
08875     }
08876     else if (beta == STS::zero()) {
08877       RangeScalar tmp[4];
08878       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
08879       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
08880       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
08881       tmp[3] = Teuchos::ScalarTraits<RangeScalar>::zero();
08882 
08883       RangeScalar* Y_i = Y;
08884       Ordinal i = 0;
08885       for (size_t k = 0; k < nnz; ++k) {
08886         const MatrixScalar A_ij = val[k];
08887         const Ordinal j = ind[k];
08888         // NOTE: "if" instead of "while" here is only valid
08889         // if the matrix contains no empty columns.
08890         if (k >= ptr[i+1]) {
08891           // Write temp output from last iteration(s) to Y,
08892           // before incrementing the current row index.
08893           if (k > 0) {
08894             Y_i[0] = tmp[0];
08895             Y_i[colStrideY] = tmp[1];
08896             Y_i[2*colStrideY] = tmp[2];
08897             Y_i[3*colStrideY] = tmp[3];
08898           }
08899           ++i;
08900           // We haven't seen row i before; set Y(i,:) to 0.
08901           Y_i = &Y[i];
08902           tmp[0] = STS::zero();
08903           tmp[1] = STS::zero();
08904           tmp[2] = STS::zero();
08905           tmp[3] = STS::zero();
08906         }
08907         const DomainScalar* const X_j = &X[j];
08908         tmp[0] += alpha * A_ij * X_j[0];
08909         tmp[1] += alpha * A_ij * X_j[colStrideX];
08910         tmp[2] += alpha * A_ij * X_j[2*colStrideX];
08911         tmp[3] += alpha * A_ij * X_j[3*colStrideX];
08912       }
08913       Y_i[0] = tmp[0];
08914       Y_i[colStrideY] = tmp[1];
08915       Y_i[2*colStrideY] = tmp[2];
08916       Y_i[3*colStrideY] = tmp[3];
08917     }
08918     else if (beta == STS::one()) {
08919       RangeScalar tmp[4];
08920       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
08921       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
08922       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
08923       tmp[3] = Teuchos::ScalarTraits<RangeScalar>::zero();
08924 
08925       RangeScalar* Y_i = Y;
08926       Ordinal i = 0;
08927       for (size_t k = 0; k < nnz; ++k) {
08928         const MatrixScalar A_ij = val[k];
08929         const Ordinal j = ind[k];
08930         // NOTE: "if" instead of "while" here is only valid
08931         // if the matrix contains no empty columns.
08932         if (k >= ptr[i+1]) {
08933           // Write temp output from last iteration(s) to Y,
08934           // before incrementing the current row index.
08935           if (k > 0) {
08936             Y_i[0] = tmp[0];
08937             Y_i[colStrideY] = tmp[1];
08938             Y_i[2*colStrideY] = tmp[2];
08939             Y_i[3*colStrideY] = tmp[3];
08940           }
08941           ++i;
08942           // We don't have to set Y(i,:) here, since beta == 1.
08943           Y_i = &Y[i];
08944           tmp[0] = Y_i[0];
08945           tmp[1] = Y_i[colStrideY];
08946           tmp[2] = Y_i[2*colStrideY];
08947           tmp[3] = Y_i[3*colStrideY];
08948         }
08949         const DomainScalar* const X_j = &X[j];
08950         tmp[0] += alpha * A_ij * X_j[0];
08951         tmp[1] += alpha * A_ij * X_j[colStrideX];
08952         tmp[2] += alpha * A_ij * X_j[2*colStrideX];
08953         tmp[3] += alpha * A_ij * X_j[3*colStrideX];
08954       }
08955       Y_i[0] = tmp[0];
08956       Y_i[colStrideY] = tmp[1];
08957       Y_i[2*colStrideY] = tmp[2];
08958       Y_i[3*colStrideY] = tmp[3];
08959     }
08960     else { // beta != -1 && beta != 0 && beta != 1
08961       RangeScalar tmp[4];
08962       tmp[0] = Teuchos::ScalarTraits<RangeScalar>::zero();
08963       tmp[1] = Teuchos::ScalarTraits<RangeScalar>::zero();
08964       tmp[2] = Teuchos::ScalarTraits<RangeScalar>::zero();
08965       tmp[3] = Teuchos::ScalarTraits<RangeScalar>::zero();
08966 
08967       RangeScalar* Y_i = Y;
08968       Ordinal i = 0;
08969       for (size_t k = 0; k < nnz; ++k) {
08970         const MatrixScalar A_ij = val[k];
08971         const Ordinal j = ind[k];
08972         // NOTE: "if" instead of "while" here is only valid
08973         // if the matrix contains no empty columns.
08974         if (k >= ptr[i+1]) {
08975           // Write temp output from last iteration(s) to Y,
08976           // before incrementing the current row index.
08977           if (k > 0) {
08978             Y_i[0] = tmp[0];
08979             Y_i[colStrideY] = tmp[1];
08980             Y_i[2*colStrideY] = tmp[2];
08981             Y_i[3*colStrideY] = tmp[3];
08982           }
08983           ++i;
08984           // We haven't seen row i before; scale Y(i,:) by beta.
08985           Y_i = &Y[i];
08986           tmp[0] = beta * Y_i[0];
08987           tmp[1] = beta * Y_i[colStrideY];
08988           tmp[2] = beta * Y_i[2*colStrideY];
08989           tmp[3] = beta * Y_i[3*colStrideY];
08990         }
08991         const DomainScalar* const X_j = &X[j];
08992         tmp[0] += alpha * A_ij * X_j[0];
08993         tmp[1] += alpha * A_ij * X_j[colStrideX];
08994         tmp[2] += alpha * A_ij * X_j[2*colStrideX];
08995         tmp[3] += alpha * A_ij * X_j[3*colStrideX];
08996       }
08997       Y_i[0] = tmp[0];
08998       Y_i[colStrideY] = tmp[1];
08999       Y_i[2*colStrideY] = tmp[2];
09000       Y_i[3*colStrideY] = tmp[3];
09001     }
09002   }
09003 }
09004 
09005 template<class Ordinal,
09006          class MatrixScalar,
09007          class DomainScalar,
09008          class RangeScalar>
09009 void
09010 matVecCscColMajorForforConj (
09011   const Ordinal numRows,
09012   const Ordinal numCols,
09013   const Ordinal numVecs,
09014   const RangeScalar& beta,
09015   RangeScalar Y[],
09016   const Ordinal colStrideY,
09017   const RangeScalar& alpha,
09018   const size_t  ptr[],
09019   const Ordinal ind[],
09020   const MatrixScalar val[],
09021   const DomainScalar X[],
09022   const Ordinal colStrideX)
09023 {
09024   typedef Teuchos::ScalarTraits<RangeScalar> STS;
09025 
09026   // Prescale: Y := beta * Y.
09027   if (beta == STS::zero()) {
09028     for (Ordinal j = 0; j < numVecs; ++j) {
09029       RangeScalar* const Y_j = &Y[j*colStrideY];
09030       for (Ordinal i = 0; i < numRows; ++i) {
09031         // Follow the Sparse BLAS convention for beta == 0. 
09032         Y_j[i] = STS::zero();
09033       }
09034     }
09035   }
09036   else if (beta != STS::one()) {
09037     for (Ordinal j = 0; j < numVecs; ++j) {
09038       RangeScalar* const Y_j = &Y[j*colStrideY];
09039       for (Ordinal i = 0; i < numRows; ++i) {
09040         Y_j[i] = beta * Y_j[i];
09041       }
09042     }
09043   }
09044   // Outer for loop preface:
09045   if (alpha == STS::zero()) {
09046     return; // Our work is done!
09047   }
09048   if (alpha == STS::one()) {
09049     for (Ordinal j = 0; j < numCols; ++j) {
09050       for (Ordinal c = 0; c < numVecs; ++c) {
09051         const DomainScalar tmp = X[j + c*colStrideX];
09052 
09053         for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
09054           const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
09055           const Ordinal i = ind[k];
09056           Y[i + c*colStrideY] += A_ij * tmp;
09057         }
09058       }
09059     }
09060   }
09061   else if (alpha == -STS::one()) {
09062     for (Ordinal j = 0; j < numCols; ++j) {
09063       for (Ordinal c = 0; c < numVecs; ++c) {
09064         const DomainScalar tmp = X[j + c*colStrideX];
09065 
09066         for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
09067           const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
09068           const Ordinal i = ind[k];
09069           Y[i + c*colStrideY] -= A_ij * tmp;
09070         }
09071       }
09072     }
09073   }
09074   else { // alpha != 1 && alpha != -1
09075     for (Ordinal j = 0; j < numCols; ++j) {
09076       for (Ordinal c = 0; c < numVecs; ++c) {
09077         const DomainScalar tmp = X[j + c*colStrideX];
09078 
09079         for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
09080           const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
09081           const Ordinal i = ind[k];
09082           Y[i + c*colStrideY] += alpha * A_ij * tmp;
09083         }
09084       }
09085     }
09086   }
09087 }
09088 
09089 template<class Ordinal,
09090          class MatrixScalar,
09091          class DomainScalar,
09092          class RangeScalar>
09093 void
09094 matVecCscColMajorForforConj4Unrolled (
09095   const Ordinal numRows,
09096   const Ordinal numCols,
09097   const Ordinal numVecs,
09098   const RangeScalar& beta,
09099   RangeScalar Y[],
09100   const Ordinal colStrideY,
09101   const RangeScalar& alpha,
09102   const size_t  ptr[],
09103   const Ordinal ind[],
09104   const MatrixScalar val[],
09105   const DomainScalar X[],
09106   const Ordinal colStrideX)
09107 {
09108   typedef Teuchos::ScalarTraits<RangeScalar> STS;
09109 
09110   // Prescale: Y := beta * Y.
09111   if (beta == STS::zero()) {
09112     for (Ordinal j = 0; j < numVecs; ++j) {
09113       RangeScalar* const Y_j = &Y[j*colStrideY];
09114       for (Ordinal i = 0; i < numRows; ++i) {
09115         // Follow the Sparse BLAS convention for beta == 0. 
09116         Y_j[i] = STS::zero();
09117       }
09118     }
09119   }
09120   else if (beta != STS::one()) {
09121     for (Ordinal j = 0; j < numVecs; ++j) {
09122       RangeScalar* const Y_j = &Y[j*colStrideY];
09123       for (Ordinal i = 0; i < numRows; ++i) {
09124         Y_j[i] = beta * Y_j[i];
09125       }
09126     }
09127   }
09128   // Outer for loop preface:
09129   if (alpha == STS::zero()) {
09130     return; // Our work is done!
09131   }
09132   if (alpha == STS::one()) {
09133     for (Ordinal j = 0; j < numCols; ++j) {
09134       // Extra +1 in loop bound ensures first 4 iterations get
09135       // strip-mined, but requires that Ordinal be a signed type.
09136       Ordinal c = 0;
09137       for ( ; c < numVecs - 3; c += 4) {
09138         const DomainScalar* const X_j = &X[j + c*colStrideX];
09139         const DomainScalar tmp[4] = {X_j[0], X_j[colStrideX], X_j[2*colStrideX], X_j[3*colStrideX]};
09140 
09141         for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
09142           const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
09143           const Ordinal i = ind[k];
09144           RangeScalar* const Y_i = &Y[i + c*colStrideY];
09145           Y_i[0] += A_ij * tmp[0];
09146           Y_i[colStrideY] += A_ij * tmp[1];
09147           Y_i[2*colStrideY] += A_ij * tmp[2];
09148           Y_i[3*colStrideY] += A_ij * tmp[3];
09149         }
09150       }
09151       // Mop up left-over iterations over multivector columns.
09152       for ( ; c < numVecs; ++c) {
09153         const DomainScalar tmp = X[j + c*colStrideX];
09154 
09155         for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
09156           const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
09157           const Ordinal i = ind[k];
09158           Y[i + c*colStrideY] += A_ij * tmp;
09159         }
09160       }
09161     }
09162   }
09163   else if (alpha == -STS::one()) {
09164     for (Ordinal j = 0; j < numCols; ++j) {
09165       // Extra +1 in loop bound ensures first 4 iterations get
09166       // strip-mined, but requires that Ordinal be a signed type.
09167       Ordinal c = 0;
09168       for ( ; c < numVecs - 3; c += 4) {
09169         const DomainScalar* const X_j = &X[j + c*colStrideX];
09170         const DomainScalar tmp[4] = {X_j[0], X_j[colStrideX], X_j[2*colStrideX], X_j[3*colStrideX]};
09171 
09172         for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
09173           const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
09174           const Ordinal i = ind[k];
09175           RangeScalar* const Y_i = &Y[i + c*colStrideY];
09176           Y_i[0] -= A_ij * tmp[0];
09177           Y_i[colStrideY] -= A_ij * tmp[1];
09178           Y_i[2*colStrideY] -= A_ij * tmp[2];
09179           Y_i[3*colStrideY] -= A_ij * tmp[3];
09180         }
09181       }
09182       // Mop up left-over iterations over multivector columns.
09183       for ( ; c < numVecs; ++c) {
09184         const DomainScalar tmp = X[j + c*colStrideX];
09185 
09186         for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
09187           const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
09188           const Ordinal i = ind[k];
09189           Y[i + c*colStrideY] -= A_ij * tmp;
09190         }
09191       }
09192     }
09193   }
09194   else { // alpha != 1 && alpha != -1
09195     for (Ordinal j = 0; j < numCols; ++j) {
09196       // Extra +1 in loop bound ensures first 4 iterations get
09197       // strip-mined, but requires that Ordinal be a signed type.
09198       Ordinal c = 0;
09199       for ( ; c < numVecs - 3; c += 4) {
09200         const DomainScalar* const X_j = &X[j + c*colStrideX];
09201         const DomainScalar tmp[4] = {X_j[0], X_j[colStrideX], X_j[2*colStrideX], X_j[3*colStrideX]};
09202 
09203         for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
09204           const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
09205           const Ordinal i = ind[k];
09206           RangeScalar* const Y_i = &Y[i + c*colStrideY];
09207           Y_i[0] += alpha * A_ij * tmp[0];
09208           Y_i[colStrideY] += alpha * A_ij * tmp[1];
09209           Y_i[2*colStrideY] += alpha * A_ij * tmp[2];
09210           Y_i[3*colStrideY] += alpha * A_ij * tmp[3];
09211         }
09212       }
09213       // Mop up left-over iterations over multivector columns.
09214       for ( ; c < numVecs; ++c) {
09215         const DomainScalar tmp = X[j + c*colStrideX];
09216 
09217         for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
09218           const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
09219           const Ordinal i = ind[k];
09220           Y[i + c*colStrideY] += alpha * A_ij * tmp;
09221         }
09222       }
09223     }
09224   }
09225 }
09226 
09227 template<class Ordinal,
09228          class MatrixScalar,
09229          class DomainScalar,
09230          class RangeScalar>
09231 void
09232 matVecCscColMajorForforConj1Vec (
09233   const Ordinal numRows,
09234   const Ordinal numCols,
09235   const Ordinal numVecs,
09236   const RangeScalar& beta,
09237   RangeScalar Y[],
09238   const Ordinal colStrideY,
09239   const RangeScalar& alpha,
09240   const size_t  ptr[],
09241   const Ordinal ind[],
09242   const MatrixScalar val[],
09243   const DomainScalar X[],
09244   const Ordinal colStrideX)
09245 {
09246   typedef Teuchos::ScalarTraits<RangeScalar> STS;
09247 
09248   // Prescale: Y := beta * Y.
09249   if (beta == STS::zero()) {
09250     for (Ordinal j = 0; j < numVecs; ++j) {
09251       RangeScalar* const Y_j = &Y[j*colStrideY];
09252       for (Ordinal i = 0; i < numRows; ++i) {
09253         // Follow the Sparse BLAS convention for beta == 0. 
09254         Y_j[i] = STS::zero();
09255       }
09256     }
09257   }
09258   else if (beta != STS::one()) {
09259     for (Ordinal j = 0; j < numVecs; ++j) {
09260       RangeScalar* const Y_j = &Y[j*colStrideY];
09261       for (Ordinal i = 0; i < numRows; ++i) {
09262         Y_j[i] = beta * Y_j[i];
09263       }
09264     }
09265   }
09266   // Outer for loop preface:
09267   if (alpha == STS::zero()) {
09268     return; // Our work is done!
09269   }
09270   if (alpha == STS::one()) {
09271     for (Ordinal j = 0; j < numCols; ++j) {
09272       const DomainScalar tmp = X[j];
09273 
09274       for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
09275         const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
09276         const Ordinal i = ind[k];
09277         Y[i] += A_ij * tmp;
09278       }
09279     }
09280   }
09281   else if (alpha == -STS::one()) {
09282     for (Ordinal j = 0; j < numCols; ++j) {
09283       const DomainScalar tmp = X[j];
09284 
09285       for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
09286         const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
09287         const Ordinal i = ind[k];
09288         Y[i] -= A_ij * tmp;
09289       }
09290     }
09291   }
09292   else { // alpha != 1 && alpha != -1
09293     for (Ordinal j = 0; j < numCols; ++j) {
09294       const DomainScalar tmp = X[j];
09295 
09296       for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
09297         const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
09298         const Ordinal i = ind[k];
09299         Y[i] += alpha * A_ij * tmp;
09300       }
09301     }
09302   }
09303 }
09304 
09305 template<class Ordinal,
09306          class MatrixScalar,
09307          class DomainScalar,
09308          class RangeScalar>
09309 void
09310 matVecCscColMajorForforConj2Vec (
09311   const Ordinal numRows,
09312   const Ordinal numCols,
09313   const Ordinal numVecs,
09314   const RangeScalar& beta,
09315   RangeScalar Y[],
09316   const Ordinal colStrideY,
09317   const RangeScalar& alpha,
09318   const size_t  ptr[],
09319   const Ordinal ind[],
09320   const MatrixScalar val[],
09321   const DomainScalar X[],
09322   const Ordinal colStrideX)
09323 {
09324   typedef Teuchos::ScalarTraits<RangeScalar> STS;
09325 
09326   // Prescale: Y := beta * Y.
09327   if (beta == STS::zero()) {
09328     for (Ordinal j = 0; j < numVecs; ++j) {
09329       RangeScalar* const Y_j = &Y[j*colStrideY];
09330       for (Ordinal i = 0; i < numRows; ++i) {
09331         // Follow the Sparse BLAS convention for beta == 0. 
09332         Y_j[i] = STS::zero();
09333       }
09334     }
09335   }
09336   else if (beta != STS::one()) {
09337     for (Ordinal j = 0; j < numVecs; ++j) {
09338       RangeScalar* const Y_j = &Y[j*colStrideY];
09339       for (Ordinal i = 0; i < numRows; ++i) {
09340         Y_j[i] = beta * Y_j[i];
09341       }
09342     }
09343   }
09344   // Outer for loop preface:
09345   if (alpha == STS::zero()) {
09346     return; // Our work is done!
09347   }
09348   if (alpha == STS::one()) {
09349     for (Ordinal j = 0; j < numCols; ++j) {
09350       const DomainScalar* const X_j = &X[j];
09351       const DomainScalar tmp[2] = {X_j[0], X_j[colStrideX]};
09352 
09353       for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
09354         const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
09355         const Ordinal i = ind[k];
09356         RangeScalar* const Y_i = &Y[i];
09357         Y_i[0] += A_ij * tmp[0];
09358         Y_i[colStrideY] += A_ij * tmp[1];
09359       }
09360     }
09361   }
09362   else if (alpha == -STS::one()) {
09363     for (Ordinal j = 0; j < numCols; ++j) {
09364       const DomainScalar* const X_j = &X[j];
09365       const DomainScalar tmp[2] = {X_j[0], X_j[colStrideX]};
09366 
09367       for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
09368         const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
09369         const Ordinal i = ind[k];
09370         RangeScalar* const Y_i = &Y[i];
09371         Y_i[0] -= A_ij * tmp[0];
09372         Y_i[colStrideY] -= A_ij * tmp[1];
09373       }
09374     }
09375   }
09376   else { // alpha != 1 && alpha != -1
09377     for (Ordinal j = 0; j < numCols; ++j) {
09378       const DomainScalar* const X_j = &X[j];
09379       const DomainScalar tmp[2] = {X_j[0], X_j[colStrideX]};
09380 
09381       for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
09382         const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
09383         const Ordinal i = ind[k];
09384         RangeScalar* const Y_i = &Y[i];
09385         Y_i[0] += alpha * A_ij * tmp[0];
09386         Y_i[colStrideY] += alpha * A_ij * tmp[1];
09387       }
09388     }
09389   }
09390 }
09391 
09392 template<class Ordinal,
09393          class MatrixScalar,
09394          class DomainScalar,
09395          class RangeScalar>
09396 void
09397 matVecCscColMajorForforConj3Vec (
09398   const Ordinal numRows,
09399   const Ordinal numCols,
09400   const Ordinal numVecs,
09401   const RangeScalar& beta,
09402   RangeScalar Y[],
09403   const Ordinal colStrideY,
09404   const RangeScalar& alpha,
09405   const size_t  ptr[],
09406   const Ordinal ind[],
09407   const MatrixScalar val[],
09408   const DomainScalar X[],
09409   const Ordinal colStrideX)
09410 {
09411   typedef Teuchos::ScalarTraits<RangeScalar> STS;
09412 
09413   // Prescale: Y := beta * Y.
09414   if (beta == STS::zero()) {
09415     for (Ordinal j = 0; j < numVecs; ++j) {
09416       RangeScalar* const Y_j = &Y[j*colStrideY];
09417       for (Ordinal i = 0; i < numRows; ++i) {
09418         // Follow the Sparse BLAS convention for beta == 0. 
09419         Y_j[i] = STS::zero();
09420       }
09421     }
09422   }
09423   else if (beta != STS::one()) {
09424     for (Ordinal j = 0; j < numVecs; ++j) {
09425       RangeScalar* const Y_j = &Y[j*colStrideY];
09426       for (Ordinal i = 0; i < numRows; ++i) {
09427         Y_j[i] = beta * Y_j[i];
09428       }
09429     }
09430   }
09431   // Outer for loop preface:
09432   if (alpha == STS::zero()) {
09433     return; // Our work is done!
09434   }
09435   if (alpha == STS::one()) {
09436     for (Ordinal j = 0; j < numCols; ++j) {
09437       const DomainScalar* const X_j = &X[j];
09438       const DomainScalar tmp[3] = {X_j[0], X_j[colStrideX], X_j[2*colStrideX]};
09439 
09440       for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
09441         const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
09442         const Ordinal i = ind[k];
09443         RangeScalar* const Y_i = &Y[i];
09444         Y_i[0] += A_ij * tmp[0];
09445         Y_i[colStrideY] += A_ij * tmp[1];
09446         Y_i[2*colStrideY] += A_ij * tmp[2];
09447       }
09448     }
09449   }
09450   else if (alpha == -STS::one()) {
09451     for (Ordinal j = 0; j < numCols; ++j) {
09452       const DomainScalar* const X_j = &X[j];
09453       const DomainScalar tmp[3] = {X_j[0], X_j[colStrideX], X_j[2*colStrideX]};
09454 
09455       for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
09456         const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
09457         const Ordinal i = ind[k];
09458         RangeScalar* const Y_i = &Y[i];
09459         Y_i[0] -= A_ij * tmp[0];
09460         Y_i[colStrideY] -= A_ij * tmp[1];
09461         Y_i[2*colStrideY] -= A_ij * tmp[2];
09462       }
09463     }
09464   }
09465   else { // alpha != 1 && alpha != -1
09466     for (Ordinal j = 0; j < numCols; ++j) {
09467       const DomainScalar* const X_j = &X[j];
09468       const DomainScalar tmp[3] = {X_j[0], X_j[colStrideX], X_j[2*colStrideX]};
09469 
09470       for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
09471         const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
09472         const Ordinal i = ind[k];
09473         RangeScalar* const Y_i = &Y[i];
09474         Y_i[0] += alpha * A_ij * tmp[0];
09475         Y_i[colStrideY] += alpha * A_ij * tmp[1];
09476         Y_i[2*colStrideY] += alpha * A_ij * tmp[2];
09477       }
09478     }
09479   }
09480 }
09481 
09482 template<class Ordinal,
09483          class MatrixScalar,
09484          class DomainScalar,
09485          class RangeScalar>
09486 void
09487 matVecCscColMajorForforConj4Vec (
09488   const Ordinal numRows,
09489   const Ordinal numCols,
09490   const Ordinal numVecs,
09491   const RangeScalar& beta,
09492   RangeScalar Y[],
09493   const Ordinal colStrideY,
09494   const RangeScalar& alpha,
09495   const size_t  ptr[],
09496   const Ordinal ind[],
09497   const MatrixScalar val[],
09498   const DomainScalar X[],
09499   const Ordinal colStrideX)
09500 {
09501   typedef Teuchos::ScalarTraits<RangeScalar> STS;
09502 
09503   // Prescale: Y := beta * Y.
09504   if (beta == STS::zero()) {
09505     for (Ordinal j = 0; j < numVecs; ++j) {
09506       RangeScalar* const Y_j = &Y[j*colStrideY];
09507       for (Ordinal i = 0; i < numRows; ++i) {
09508         // Follow the Sparse BLAS convention for beta == 0. 
09509         Y_j[i] = STS::zero();
09510       }
09511     }
09512   }
09513   else if (beta != STS::one()) {
09514     for (Ordinal j = 0; j < numVecs; ++j) {
09515       RangeScalar* const Y_j = &Y[j*colStrideY];
09516       for (Ordinal i = 0; i < numRows; ++i) {
09517         Y_j[i] = beta * Y_j[i];
09518       }
09519     }
09520   }
09521   // Outer for loop preface:
09522   if (alpha == STS::zero()) {
09523     return; // Our work is done!
09524   }
09525   if (alpha == STS::one()) {
09526     for (Ordinal j = 0; j < numCols; ++j) {
09527       const DomainScalar* const X_j = &X[j];
09528       const DomainScalar tmp[4] = {X_j[0], X_j[colStrideX], X_j[2*colStrideX], X_j[3*colStrideX]};
09529 
09530       for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
09531         const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
09532         const Ordinal i = ind[k];
09533         RangeScalar* const Y_i = &Y[i];
09534         Y_i[0] += A_ij * tmp[0];
09535         Y_i[colStrideY] += A_ij * tmp[1];
09536         Y_i[2*colStrideY] += A_ij * tmp[2];
09537         Y_i[3*colStrideY] += A_ij * tmp[3];
09538       }
09539     }
09540   }
09541   else if (alpha == -STS::one()) {
09542     for (Ordinal j = 0; j < numCols; ++j) {
09543       const DomainScalar* const X_j = &X[j];
09544       const DomainScalar tmp[4] = {X_j[0], X_j[colStrideX], X_j[2*colStrideX], X_j[3*colStrideX]};
09545 
09546       for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
09547         const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
09548         const Ordinal i = ind[k];
09549         RangeScalar* const Y_i = &Y[i];
09550         Y_i[0] -= A_ij * tmp[0];
09551         Y_i[colStrideY] -= A_ij * tmp[1];
09552         Y_i[2*colStrideY] -= A_ij * tmp[2];
09553         Y_i[3*colStrideY] -= A_ij * tmp[3];
09554       }
09555     }
09556   }
09557   else { // alpha != 1 && alpha != -1
09558     for (Ordinal j = 0; j < numCols; ++j) {
09559       const DomainScalar* const X_j = &X[j];
09560       const DomainScalar tmp[4] = {X_j[0], X_j[colStrideX], X_j[2*colStrideX], X_j[3*colStrideX]};
09561 
09562       for (size_t k = ptr[j]; k < ptr[j+1]; ++k) {
09563         const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
09564         const Ordinal i = ind[k];
09565         RangeScalar* const Y_i = &Y[i];
09566         Y_i[0] += alpha * A_ij * tmp[0];
09567         Y_i[colStrideY] += alpha * A_ij * tmp[1];
09568         Y_i[2*colStrideY] += alpha * A_ij * tmp[2];
09569         Y_i[3*colStrideY] += alpha * A_ij * tmp[3];
09570       }
09571     }
09572   }
09573 }
09574 
09575 template<class Ordinal,
09576          class MatrixScalar,
09577          class DomainScalar,
09578          class RangeScalar>
09579 void
09580 matVecCscColMajorForwhileConj1Vec (
09581   const Ordinal numRows,
09582   const Ordinal numCols,
09583   const Ordinal numVecs,
09584   const RangeScalar& beta,
09585   RangeScalar Y[],
09586   const Ordinal colStrideY,
09587   const RangeScalar& alpha,
09588   const size_t  ptr[],
09589   const Ordinal ind[],
09590   const MatrixScalar val[],
09591   const DomainScalar X[],
09592   const Ordinal colStrideX)
09593 {
09594   typedef Teuchos::ScalarTraits<RangeScalar> STS;
09595 
09596   // Prescale: Y := beta * Y.
09597   if (beta == STS::zero()) {
09598     for (Ordinal j = 0; j < numVecs; ++j) {
09599       RangeScalar* const Y_j = &Y[j*colStrideY];
09600       for (Ordinal i = 0; i < numRows; ++i) {
09601         // Follow the Sparse BLAS convention for beta == 0. 
09602         Y_j[i] = STS::zero();
09603       }
09604     }
09605   }
09606   else if (beta != STS::one()) {
09607     for (Ordinal j = 0; j < numVecs; ++j) {
09608       RangeScalar* const Y_j = &Y[j*colStrideY];
09609       for (Ordinal i = 0; i < numRows; ++i) {
09610         Y_j[i] = beta * Y_j[i];
09611       }
09612     }
09613   }
09614   // Outer for loop preface:
09615   if (alpha == STS::zero()) {
09616     return; // Our work is done!
09617   }
09618   const size_t nnz = ptr[numCols];
09619   if (alpha == STS::one()) {
09620     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
09621     // Initializing tmp here isn't necessary for correctness, but it
09622     // makes compilers stop complaining about uninitialized variables.
09623     DomainScalar tmp = Teuchos::ScalarTraits<DomainScalar>::zero();
09624     Ordinal j = 0;
09625     for (size_t k = 0; k < nnz; ++k) {
09626       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
09627       const Ordinal i = ind[k];
09628       while (k >= ptr[j+1]) {
09629         ++j;
09630         tmp = X[j + 0*colStrideX];
09631       }
09632       Y[i] += A_ij * tmp;
09633     }
09634   }
09635   else if (alpha == -STS::one()) {
09636     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
09637     // Initializing tmp here isn't necessary for correctness, but it
09638     // makes compilers stop complaining about uninitialized variables.
09639     DomainScalar tmp = Teuchos::ScalarTraits<DomainScalar>::zero();
09640     Ordinal j = 0;
09641     for (size_t k = 0; k < nnz; ++k) {
09642       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
09643       const Ordinal i = ind[k];
09644       while (k >= ptr[j+1]) {
09645         ++j;
09646         tmp = X[j + 0*colStrideX];
09647       }
09648       Y[i] -= A_ij * tmp;
09649     }
09650   }
09651   else { // alpha != 1 && alpha != -1
09652     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
09653     // Initializing tmp here isn't necessary for correctness, but it
09654     // makes compilers stop complaining about uninitialized variables.
09655     DomainScalar tmp = Teuchos::ScalarTraits<DomainScalar>::zero();
09656     Ordinal j = 0;
09657     for (size_t k = 0; k < nnz; ++k) {
09658       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
09659       const Ordinal i = ind[k];
09660       while (k >= ptr[j+1]) {
09661         ++j;
09662         tmp = X[j + 0*colStrideX];
09663       }
09664       Y[i] += alpha * A_ij * tmp;
09665     }
09666   }
09667 }
09668 
09669 template<class Ordinal,
09670          class MatrixScalar,
09671          class DomainScalar,
09672          class RangeScalar>
09673 void
09674 matVecCscColMajorForwhileConj2Vec (
09675   const Ordinal numRows,
09676   const Ordinal numCols,
09677   const Ordinal numVecs,
09678   const RangeScalar& beta,
09679   RangeScalar Y[],
09680   const Ordinal colStrideY,
09681   const RangeScalar& alpha,
09682   const size_t  ptr[],
09683   const Ordinal ind[],
09684   const MatrixScalar val[],
09685   const DomainScalar X[],
09686   const Ordinal colStrideX)
09687 {
09688   typedef Teuchos::ScalarTraits<RangeScalar> STS;
09689 
09690   // Prescale: Y := beta * Y.
09691   if (beta == STS::zero()) {
09692     for (Ordinal j = 0; j < numVecs; ++j) {
09693       RangeScalar* const Y_j = &Y[j*colStrideY];
09694       for (Ordinal i = 0; i < numRows; ++i) {
09695         // Follow the Sparse BLAS convention for beta == 0. 
09696         Y_j[i] = STS::zero();
09697       }
09698     }
09699   }
09700   else if (beta != STS::one()) {
09701     for (Ordinal j = 0; j < numVecs; ++j) {
09702       RangeScalar* const Y_j = &Y[j*colStrideY];
09703       for (Ordinal i = 0; i < numRows; ++i) {
09704         Y_j[i] = beta * Y_j[i];
09705       }
09706     }
09707   }
09708   // Outer for loop preface:
09709   if (alpha == STS::zero()) {
09710     return; // Our work is done!
09711   }
09712   const size_t nnz = ptr[numCols];
09713   if (alpha == STS::one()) {
09714     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
09715     // Initializing tmp here isn't necessary for correctness, but it
09716     // makes compilers stop complaining about uninitialized variables.
09717     DomainScalar tmp[2];
09718     tmp[0] = Teuchos::ScalarTraits<DomainScalar>::zero();
09719     tmp[1] = Teuchos::ScalarTraits<DomainScalar>::zero();
09720 
09721     Ordinal j = 0;
09722     for (size_t k = 0; k < nnz; ++k) {
09723       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
09724       const Ordinal i = ind[k];
09725       while (k >= ptr[j+1]) {
09726         ++j;
09727         const DomainScalar* const X_j = &X[j];
09728         tmp[0] = X_j[0];
09729         tmp[1] = X_j[colStrideX];
09730       }
09731       RangeScalar* const Y_i = &Y[i];
09732       Y_i[0] += A_ij * tmp[0];
09733       Y_i[colStrideY] += A_ij * tmp[1];
09734     }
09735   }
09736   else if (alpha == -STS::one()) {
09737     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
09738     // Initializing tmp here isn't necessary for correctness, but it
09739     // makes compilers stop complaining about uninitialized variables.
09740     DomainScalar tmp[2];
09741     tmp[0] = Teuchos::ScalarTraits<DomainScalar>::zero();
09742     tmp[1] = Teuchos::ScalarTraits<DomainScalar>::zero();
09743 
09744     Ordinal j = 0;
09745     for (size_t k = 0; k < nnz; ++k) {
09746       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
09747       const Ordinal i = ind[k];
09748       while (k >= ptr[j+1]) {
09749         ++j;
09750         const DomainScalar* const X_j = &X[j];
09751         tmp[0] = X_j[0];
09752         tmp[1] = X_j[colStrideX];
09753       }
09754       RangeScalar* const Y_i = &Y[i];
09755       Y_i[0] -= A_ij * tmp[0];
09756       Y_i[colStrideY] -= A_ij * tmp[1];
09757     }
09758   }
09759   else { // alpha != 1 && alpha != -1
09760     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
09761     // Initializing tmp here isn't necessary for correctness, but it
09762     // makes compilers stop complaining about uninitialized variables.
09763     DomainScalar tmp[2];
09764     tmp[0] = Teuchos::ScalarTraits<DomainScalar>::zero();
09765     tmp[1] = Teuchos::ScalarTraits<DomainScalar>::zero();
09766 
09767     Ordinal j = 0;
09768     for (size_t k = 0; k < nnz; ++k) {
09769       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
09770       const Ordinal i = ind[k];
09771       while (k >= ptr[j+1]) {
09772         ++j;
09773         const DomainScalar* const X_j = &X[j];
09774         tmp[0] = X_j[0];
09775         tmp[1] = X_j[colStrideX];
09776       }
09777       RangeScalar* const Y_i = &Y[i];
09778       Y_i[0] += alpha * A_ij * tmp[0];
09779       Y_i[colStrideY] += alpha * A_ij * tmp[1];
09780     }
09781   }
09782 }
09783 
09784 template<class Ordinal,
09785          class MatrixScalar,
09786          class DomainScalar,
09787          class RangeScalar>
09788 void
09789 matVecCscColMajorForwhileConj3Vec (
09790   const Ordinal numRows,
09791   const Ordinal numCols,
09792   const Ordinal numVecs,
09793   const RangeScalar& beta,
09794   RangeScalar Y[],
09795   const Ordinal colStrideY,
09796   const RangeScalar& alpha,
09797   const size_t  ptr[],
09798   const Ordinal ind[],
09799   const MatrixScalar val[],
09800   const DomainScalar X[],
09801   const Ordinal colStrideX)
09802 {
09803   typedef Teuchos::ScalarTraits<RangeScalar> STS;
09804 
09805   // Prescale: Y := beta * Y.
09806   if (beta == STS::zero()) {
09807     for (Ordinal j = 0; j < numVecs; ++j) {
09808       RangeScalar* const Y_j = &Y[j*colStrideY];
09809       for (Ordinal i = 0; i < numRows; ++i) {
09810         // Follow the Sparse BLAS convention for beta == 0. 
09811         Y_j[i] = STS::zero();
09812       }
09813     }
09814   }
09815   else if (beta != STS::one()) {
09816     for (Ordinal j = 0; j < numVecs; ++j) {
09817       RangeScalar* const Y_j = &Y[j*colStrideY];
09818       for (Ordinal i = 0; i < numRows; ++i) {
09819         Y_j[i] = beta * Y_j[i];
09820       }
09821     }
09822   }
09823   // Outer for loop preface:
09824   if (alpha == STS::zero()) {
09825     return; // Our work is done!
09826   }
09827   const size_t nnz = ptr[numCols];
09828   if (alpha == STS::one()) {
09829     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
09830     // Initializing tmp here isn't necessary for correctness, but it
09831     // makes compilers stop complaining about uninitialized variables.
09832     DomainScalar tmp[3];
09833     tmp[0] = Teuchos::ScalarTraits<DomainScalar>::zero();
09834     tmp[1] = Teuchos::ScalarTraits<DomainScalar>::zero();
09835     tmp[2] = Teuchos::ScalarTraits<DomainScalar>::zero();
09836 
09837     Ordinal j = 0;
09838     for (size_t k = 0; k < nnz; ++k) {
09839       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
09840       const Ordinal i = ind[k];
09841       while (k >= ptr[j+1]) {
09842         ++j;
09843         const DomainScalar* const X_j = &X[j];
09844         tmp[0] = X_j[0];
09845         tmp[1] = X_j[colStrideX];
09846         tmp[2] = X_j[2*colStrideX];
09847       }
09848       RangeScalar* const Y_i = &Y[i];
09849       Y_i[0] += A_ij * tmp[0];
09850       Y_i[colStrideY] += A_ij * tmp[1];
09851       Y_i[2*colStrideY] += A_ij * tmp[2];
09852     }
09853   }
09854   else if (alpha == -STS::one()) {
09855     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
09856     // Initializing tmp here isn't necessary for correctness, but it
09857     // makes compilers stop complaining about uninitialized variables.
09858     DomainScalar tmp[3];
09859     tmp[0] = Teuchos::ScalarTraits<DomainScalar>::zero();
09860     tmp[1] = Teuchos::ScalarTraits<DomainScalar>::zero();
09861     tmp[2] = Teuchos::ScalarTraits<DomainScalar>::zero();
09862 
09863     Ordinal j = 0;
09864     for (size_t k = 0; k < nnz; ++k) {
09865       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
09866       const Ordinal i = ind[k];
09867       while (k >= ptr[j+1]) {
09868         ++j;
09869         const DomainScalar* const X_j = &X[j];
09870         tmp[0] = X_j[0];
09871         tmp[1] = X_j[colStrideX];
09872         tmp[2] = X_j[2*colStrideX];
09873       }
09874       RangeScalar* const Y_i = &Y[i];
09875       Y_i[0] -= A_ij * tmp[0];
09876       Y_i[colStrideY] -= A_ij * tmp[1];
09877       Y_i[2*colStrideY] -= A_ij * tmp[2];
09878     }
09879   }
09880   else { // alpha != 1 && alpha != -1
09881     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
09882     // Initializing tmp here isn't necessary for correctness, but it
09883     // makes compilers stop complaining about uninitialized variables.
09884     DomainScalar tmp[3];
09885     tmp[0] = Teuchos::ScalarTraits<DomainScalar>::zero();
09886     tmp[1] = Teuchos::ScalarTraits<DomainScalar>::zero();
09887     tmp[2] = Teuchos::ScalarTraits<DomainScalar>::zero();
09888 
09889     Ordinal j = 0;
09890     for (size_t k = 0; k < nnz; ++k) {
09891       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
09892       const Ordinal i = ind[k];
09893       while (k >= ptr[j+1]) {
09894         ++j;
09895         const DomainScalar* const X_j = &X[j];
09896         tmp[0] = X_j[0];
09897         tmp[1] = X_j[colStrideX];
09898         tmp[2] = X_j[2*colStrideX];
09899       }
09900       RangeScalar* const Y_i = &Y[i];
09901       Y_i[0] += alpha * A_ij * tmp[0];
09902       Y_i[colStrideY] += alpha * A_ij * tmp[1];
09903       Y_i[2*colStrideY] += alpha * A_ij * tmp[2];
09904     }
09905   }
09906 }
09907 
09908 template<class Ordinal,
09909          class MatrixScalar,
09910          class DomainScalar,
09911          class RangeScalar>
09912 void
09913 matVecCscColMajorForwhileConj4Vec (
09914   const Ordinal numRows,
09915   const Ordinal numCols,
09916   const Ordinal numVecs,
09917   const RangeScalar& beta,
09918   RangeScalar Y[],
09919   const Ordinal colStrideY,
09920   const RangeScalar& alpha,
09921   const size_t  ptr[],
09922   const Ordinal ind[],
09923   const MatrixScalar val[],
09924   const DomainScalar X[],
09925   const Ordinal colStrideX)
09926 {
09927   typedef Teuchos::ScalarTraits<RangeScalar> STS;
09928 
09929   // Prescale: Y := beta * Y.
09930   if (beta == STS::zero()) {
09931     for (Ordinal j = 0; j < numVecs; ++j) {
09932       RangeScalar* const Y_j = &Y[j*colStrideY];
09933       for (Ordinal i = 0; i < numRows; ++i) {
09934         // Follow the Sparse BLAS convention for beta == 0. 
09935         Y_j[i] = STS::zero();
09936       }
09937     }
09938   }
09939   else if (beta != STS::one()) {
09940     for (Ordinal j = 0; j < numVecs; ++j) {
09941       RangeScalar* const Y_j = &Y[j*colStrideY];
09942       for (Ordinal i = 0; i < numRows; ++i) {
09943         Y_j[i] = beta * Y_j[i];
09944       }
09945     }
09946   }
09947   // Outer for loop preface:
09948   if (alpha == STS::zero()) {
09949     return; // Our work is done!
09950   }
09951   const size_t nnz = ptr[numCols];
09952   if (alpha == STS::one()) {
09953     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
09954     // Initializing tmp here isn't necessary for correctness, but it
09955     // makes compilers stop complaining about uninitialized variables.
09956     DomainScalar tmp[4];
09957     tmp[0] = Teuchos::ScalarTraits<DomainScalar>::zero();
09958     tmp[1] = Teuchos::ScalarTraits<DomainScalar>::zero();
09959     tmp[2] = Teuchos::ScalarTraits<DomainScalar>::zero();
09960     tmp[3] = Teuchos::ScalarTraits<DomainScalar>::zero();
09961 
09962     Ordinal j = 0;
09963     for (size_t k = 0; k < nnz; ++k) {
09964       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
09965       const Ordinal i = ind[k];
09966       while (k >= ptr[j+1]) {
09967         ++j;
09968         const DomainScalar* const X_j = &X[j];
09969         tmp[0] = X_j[0];
09970         tmp[1] = X_j[colStrideX];
09971         tmp[2] = X_j[2*colStrideX];
09972         tmp[3] = X_j[3*colStrideX];
09973       }
09974       RangeScalar* const Y_i = &Y[i];
09975       Y_i[0] += A_ij * tmp[0];
09976       Y_i[colStrideY] += A_ij * tmp[1];
09977       Y_i[2*colStrideY] += A_ij * tmp[2];
09978       Y_i[3*colStrideY] += A_ij * tmp[3];
09979     }
09980   }
09981   else if (alpha == -STS::one()) {
09982     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
09983     // Initializing tmp here isn't necessary for correctness, but it
09984     // makes compilers stop complaining about uninitialized variables.
09985     DomainScalar tmp[4];
09986     tmp[0] = Teuchos::ScalarTraits<DomainScalar>::zero();
09987     tmp[1] = Teuchos::ScalarTraits<DomainScalar>::zero();
09988     tmp[2] = Teuchos::ScalarTraits<DomainScalar>::zero();
09989     tmp[3] = Teuchos::ScalarTraits<DomainScalar>::zero();
09990 
09991     Ordinal j = 0;
09992     for (size_t k = 0; k < nnz; ++k) {
09993       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
09994       const Ordinal i = ind[k];
09995       while (k >= ptr[j+1]) {
09996         ++j;
09997         const DomainScalar* const X_j = &X[j];
09998         tmp[0] = X_j[0];
09999         tmp[1] = X_j[colStrideX];
10000         tmp[2] = X_j[2*colStrideX];
10001         tmp[3] = X_j[3*colStrideX];
10002       }
10003       RangeScalar* const Y_i = &Y[i];
10004       Y_i[0] -= A_ij * tmp[0];
10005       Y_i[colStrideY] -= A_ij * tmp[1];
10006       Y_i[2*colStrideY] -= A_ij * tmp[2];
10007       Y_i[3*colStrideY] -= A_ij * tmp[3];
10008     }
10009   }
10010   else { // alpha != 1 && alpha != -1
10011     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
10012     // Initializing tmp here isn't necessary for correctness, but it
10013     // makes compilers stop complaining about uninitialized variables.
10014     DomainScalar tmp[4];
10015     tmp[0] = Teuchos::ScalarTraits<DomainScalar>::zero();
10016     tmp[1] = Teuchos::ScalarTraits<DomainScalar>::zero();
10017     tmp[2] = Teuchos::ScalarTraits<DomainScalar>::zero();
10018     tmp[3] = Teuchos::ScalarTraits<DomainScalar>::zero();
10019 
10020     Ordinal j = 0;
10021     for (size_t k = 0; k < nnz; ++k) {
10022       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
10023       const Ordinal i = ind[k];
10024       while (k >= ptr[j+1]) {
10025         ++j;
10026         const DomainScalar* const X_j = &X[j];
10027         tmp[0] = X_j[0];
10028         tmp[1] = X_j[colStrideX];
10029         tmp[2] = X_j[2*colStrideX];
10030         tmp[3] = X_j[3*colStrideX];
10031       }
10032       RangeScalar* const Y_i = &Y[i];
10033       Y_i[0] += alpha * A_ij * tmp[0];
10034       Y_i[colStrideY] += alpha * A_ij * tmp[1];
10035       Y_i[2*colStrideY] += alpha * A_ij * tmp[2];
10036       Y_i[3*colStrideY] += alpha * A_ij * tmp[3];
10037     }
10038   }
10039 }
10040 
10041 template<class Ordinal,
10042          class MatrixScalar,
10043          class DomainScalar,
10044          class RangeScalar>
10045 void
10046 matVecCscColMajorForifConj1Vec (
10047   const Ordinal numRows,
10048   const Ordinal numCols,
10049   const Ordinal numVecs,
10050   const RangeScalar& beta,
10051   RangeScalar Y[],
10052   const Ordinal colStrideY,
10053   const RangeScalar& alpha,
10054   const size_t  ptr[],
10055   const Ordinal ind[],
10056   const MatrixScalar val[],
10057   const DomainScalar X[],
10058   const Ordinal colStrideX)
10059 {
10060   typedef Teuchos::ScalarTraits<RangeScalar> STS;
10061 
10062   // Prescale: Y := beta * Y.
10063   if (beta == STS::zero()) {
10064     for (Ordinal j = 0; j < numVecs; ++j) {
10065       RangeScalar* const Y_j = &Y[j*colStrideY];
10066       for (Ordinal i = 0; i < numRows; ++i) {
10067         // Follow the Sparse BLAS convention for beta == 0. 
10068         Y_j[i] = STS::zero();
10069       }
10070     }
10071   }
10072   else if (beta != STS::one()) {
10073     for (Ordinal j = 0; j < numVecs; ++j) {
10074       RangeScalar* const Y_j = &Y[j*colStrideY];
10075       for (Ordinal i = 0; i < numRows; ++i) {
10076         Y_j[i] = beta * Y_j[i];
10077       }
10078     }
10079   }
10080   // Outer for loop preface:
10081   if (alpha == STS::zero()) {
10082     return; // Our work is done!
10083   }
10084   const size_t nnz = ptr[numCols];
10085   if (alpha == STS::one()) {
10086     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
10087     // Initializing tmp here isn't necessary for correctness, but it
10088     // makes compilers stop complaining about uninitialized variables.
10089     DomainScalar tmp = Teuchos::ScalarTraits<DomainScalar>::zero();
10090     Ordinal j = 0;
10091     for (size_t k = 0; k < nnz; ++k) {
10092       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
10093       const Ordinal i = ind[k];
10094       // NOTE: "if" instead of "while" here is only valid
10095       // if the matrix contains no empty rows.
10096       if (k >= ptr[j+1]) {
10097         ++j;
10098         tmp = X[j + 0*colStrideX];
10099       }
10100       Y[i] += A_ij * tmp;
10101     }
10102   }
10103   else if (alpha == -STS::one()) {
10104     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
10105     // Initializing tmp here isn't necessary for correctness, but it
10106     // makes compilers stop complaining about uninitialized variables.
10107     DomainScalar tmp = Teuchos::ScalarTraits<DomainScalar>::zero();
10108     Ordinal j = 0;
10109     for (size_t k = 0; k < nnz; ++k) {
10110       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
10111       const Ordinal i = ind[k];
10112       // NOTE: "if" instead of "while" here is only valid
10113       // if the matrix contains no empty rows.
10114       if (k >= ptr[j+1]) {
10115         ++j;
10116         tmp = X[j + 0*colStrideX];
10117       }
10118       Y[i] -= A_ij * tmp;
10119     }
10120   }
10121   else { // alpha != 1 && alpha != -1
10122     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
10123     // Initializing tmp here isn't necessary for correctness, but it
10124     // makes compilers stop complaining about uninitialized variables.
10125     DomainScalar tmp = Teuchos::ScalarTraits<DomainScalar>::zero();
10126     Ordinal j = 0;
10127     for (size_t k = 0; k < nnz; ++k) {
10128       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
10129       const Ordinal i = ind[k];
10130       // NOTE: "if" instead of "while" here is only valid
10131       // if the matrix contains no empty rows.
10132       if (k >= ptr[j+1]) {
10133         ++j;
10134         tmp = X[j + 0*colStrideX];
10135       }
10136       Y[i] += alpha * A_ij * tmp;
10137     }
10138   }
10139 }
10140 
10141 template<class Ordinal,
10142          class MatrixScalar,
10143          class DomainScalar,
10144          class RangeScalar>
10145 void
10146 matVecCscColMajorForifConj2Vec (
10147   const Ordinal numRows,
10148   const Ordinal numCols,
10149   const Ordinal numVecs,
10150   const RangeScalar& beta,
10151   RangeScalar Y[],
10152   const Ordinal colStrideY,
10153   const RangeScalar& alpha,
10154   const size_t  ptr[],
10155   const Ordinal ind[],
10156   const MatrixScalar val[],
10157   const DomainScalar X[],
10158   const Ordinal colStrideX)
10159 {
10160   typedef Teuchos::ScalarTraits<RangeScalar> STS;
10161 
10162   // Prescale: Y := beta * Y.
10163   if (beta == STS::zero()) {
10164     for (Ordinal j = 0; j < numVecs; ++j) {
10165       RangeScalar* const Y_j = &Y[j*colStrideY];
10166       for (Ordinal i = 0; i < numRows; ++i) {
10167         // Follow the Sparse BLAS convention for beta == 0. 
10168         Y_j[i] = STS::zero();
10169       }
10170     }
10171   }
10172   else if (beta != STS::one()) {
10173     for (Ordinal j = 0; j < numVecs; ++j) {
10174       RangeScalar* const Y_j = &Y[j*colStrideY];
10175       for (Ordinal i = 0; i < numRows; ++i) {
10176         Y_j[i] = beta * Y_j[i];
10177       }
10178     }
10179   }
10180   // Outer for loop preface:
10181   if (alpha == STS::zero()) {
10182     return; // Our work is done!
10183   }
10184   const size_t nnz = ptr[numCols];
10185   if (alpha == STS::one()) {
10186     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
10187     // Initializing tmp here isn't necessary for correctness, but it
10188     // makes compilers stop complaining about uninitialized variables.
10189     DomainScalar tmp[2];
10190     tmp[0] = Teuchos::ScalarTraits<DomainScalar>::zero();
10191     tmp[1] = Teuchos::ScalarTraits<DomainScalar>::zero();
10192 
10193     Ordinal j = 0;
10194     for (size_t k = 0; k < nnz; ++k) {
10195       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
10196       const Ordinal i = ind[k];
10197       // NOTE: "if" instead of "while" here is only valid
10198       // if the matrix contains no empty rows.
10199       if (k >= ptr[j+1]) {
10200         ++j;
10201         const DomainScalar* const X_j = &X[j];
10202         tmp[0] = X_j[0];
10203         tmp[1] = X_j[colStrideX];
10204       }
10205       RangeScalar* const Y_i = &Y[i];
10206       Y_i[0] += A_ij * tmp[0];
10207       Y_i[colStrideY] += A_ij * tmp[1];
10208     }
10209   }
10210   else if (alpha == -STS::one()) {
10211     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
10212     // Initializing tmp here isn't necessary for correctness, but it
10213     // makes compilers stop complaining about uninitialized variables.
10214     DomainScalar tmp[2];
10215     tmp[0] = Teuchos::ScalarTraits<DomainScalar>::zero();
10216     tmp[1] = Teuchos::ScalarTraits<DomainScalar>::zero();
10217 
10218     Ordinal j = 0;
10219     for (size_t k = 0; k < nnz; ++k) {
10220       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
10221       const Ordinal i = ind[k];
10222       // NOTE: "if" instead of "while" here is only valid
10223       // if the matrix contains no empty rows.
10224       if (k >= ptr[j+1]) {
10225         ++j;
10226         const DomainScalar* const X_j = &X[j];
10227         tmp[0] = X_j[0];
10228         tmp[1] = X_j[colStrideX];
10229       }
10230       RangeScalar* const Y_i = &Y[i];
10231       Y_i[0] -= A_ij * tmp[0];
10232       Y_i[colStrideY] -= A_ij * tmp[1];
10233     }
10234   }
10235   else { // alpha != 1 && alpha != -1
10236     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
10237     // Initializing tmp here isn't necessary for correctness, but it
10238     // makes compilers stop complaining about uninitialized variables.
10239     DomainScalar tmp[2];
10240     tmp[0] = Teuchos::ScalarTraits<DomainScalar>::zero();
10241     tmp[1] = Teuchos::ScalarTraits<DomainScalar>::zero();
10242 
10243     Ordinal j = 0;
10244     for (size_t k = 0; k < nnz; ++k) {
10245       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
10246       const Ordinal i = ind[k];
10247       // NOTE: "if" instead of "while" here is only valid
10248       // if the matrix contains no empty rows.
10249       if (k >= ptr[j+1]) {
10250         ++j;
10251         const DomainScalar* const X_j = &X[j];
10252         tmp[0] = X_j[0];
10253         tmp[1] = X_j[colStrideX];
10254       }
10255       RangeScalar* const Y_i = &Y[i];
10256       Y_i[0] += alpha * A_ij * tmp[0];
10257       Y_i[colStrideY] += alpha * A_ij * tmp[1];
10258     }
10259   }
10260 }
10261 
10262 template<class Ordinal,
10263          class MatrixScalar,
10264          class DomainScalar,
10265          class RangeScalar>
10266 void
10267 matVecCscColMajorForifConj3Vec (
10268   const Ordinal numRows,
10269   const Ordinal numCols,
10270   const Ordinal numVecs,
10271   const RangeScalar& beta,
10272   RangeScalar Y[],
10273   const Ordinal colStrideY,
10274   const RangeScalar& alpha,
10275   const size_t  ptr[],
10276   const Ordinal ind[],
10277   const MatrixScalar val[],
10278   const DomainScalar X[],
10279   const Ordinal colStrideX)
10280 {
10281   typedef Teuchos::ScalarTraits<RangeScalar> STS;
10282 
10283   // Prescale: Y := beta * Y.
10284   if (beta == STS::zero()) {
10285     for (Ordinal j = 0; j < numVecs; ++j) {
10286       RangeScalar* const Y_j = &Y[j*colStrideY];
10287       for (Ordinal i = 0; i < numRows; ++i) {
10288         // Follow the Sparse BLAS convention for beta == 0. 
10289         Y_j[i] = STS::zero();
10290       }
10291     }
10292   }
10293   else if (beta != STS::one()) {
10294     for (Ordinal j = 0; j < numVecs; ++j) {
10295       RangeScalar* const Y_j = &Y[j*colStrideY];
10296       for (Ordinal i = 0; i < numRows; ++i) {
10297         Y_j[i] = beta * Y_j[i];
10298       }
10299     }
10300   }
10301   // Outer for loop preface:
10302   if (alpha == STS::zero()) {
10303     return; // Our work is done!
10304   }
10305   const size_t nnz = ptr[numCols];
10306   if (alpha == STS::one()) {
10307     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
10308     // Initializing tmp here isn't necessary for correctness, but it
10309     // makes compilers stop complaining about uninitialized variables.
10310     DomainScalar tmp[3];
10311     tmp[0] = Teuchos::ScalarTraits<DomainScalar>::zero();
10312     tmp[1] = Teuchos::ScalarTraits<DomainScalar>::zero();
10313     tmp[2] = Teuchos::ScalarTraits<DomainScalar>::zero();
10314 
10315     Ordinal j = 0;
10316     for (size_t k = 0; k < nnz; ++k) {
10317       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
10318       const Ordinal i = ind[k];
10319       // NOTE: "if" instead of "while" here is only valid
10320       // if the matrix contains no empty rows.
10321       if (k >= ptr[j+1]) {
10322         ++j;
10323         const DomainScalar* const X_j = &X[j];
10324         tmp[0] = X_j[0];
10325         tmp[1] = X_j[colStrideX];
10326         tmp[2] = X_j[2*colStrideX];
10327       }
10328       RangeScalar* const Y_i = &Y[i];
10329       Y_i[0] += A_ij * tmp[0];
10330       Y_i[colStrideY] += A_ij * tmp[1];
10331       Y_i[2*colStrideY] += A_ij * tmp[2];
10332     }
10333   }
10334   else if (alpha == -STS::one()) {
10335     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
10336     // Initializing tmp here isn't necessary for correctness, but it
10337     // makes compilers stop complaining about uninitialized variables.
10338     DomainScalar tmp[3];
10339     tmp[0] = Teuchos::ScalarTraits<DomainScalar>::zero();
10340     tmp[1] = Teuchos::ScalarTraits<DomainScalar>::zero();
10341     tmp[2] = Teuchos::ScalarTraits<DomainScalar>::zero();
10342 
10343     Ordinal j = 0;
10344     for (size_t k = 0; k < nnz; ++k) {
10345       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
10346       const Ordinal i = ind[k];
10347       // NOTE: "if" instead of "while" here is only valid
10348       // if the matrix contains no empty rows.
10349       if (k >= ptr[j+1]) {
10350         ++j;
10351         const DomainScalar* const X_j = &X[j];
10352         tmp[0] = X_j[0];
10353         tmp[1] = X_j[colStrideX];
10354         tmp[2] = X_j[2*colStrideX];
10355       }
10356       RangeScalar* const Y_i = &Y[i];
10357       Y_i[0] -= A_ij * tmp[0];
10358       Y_i[colStrideY] -= A_ij * tmp[1];
10359       Y_i[2*colStrideY] -= A_ij * tmp[2];
10360     }
10361   }
10362   else { // alpha != 1 && alpha != -1
10363     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
10364     // Initializing tmp here isn't necessary for correctness, but it
10365     // makes compilers stop complaining about uninitialized variables.
10366     DomainScalar tmp[3];
10367     tmp[0] = Teuchos::ScalarTraits<DomainScalar>::zero();
10368     tmp[1] = Teuchos::ScalarTraits<DomainScalar>::zero();
10369     tmp[2] = Teuchos::ScalarTraits<DomainScalar>::zero();
10370 
10371     Ordinal j = 0;
10372     for (size_t k = 0; k < nnz; ++k) {
10373       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
10374       const Ordinal i = ind[k];
10375       // NOTE: "if" instead of "while" here is only valid
10376       // if the matrix contains no empty rows.
10377       if (k >= ptr[j+1]) {
10378         ++j;
10379         const DomainScalar* const X_j = &X[j];
10380         tmp[0] = X_j[0];
10381         tmp[1] = X_j[colStrideX];
10382         tmp[2] = X_j[2*colStrideX];
10383       }
10384       RangeScalar* const Y_i = &Y[i];
10385       Y_i[0] += alpha * A_ij * tmp[0];
10386       Y_i[colStrideY] += alpha * A_ij * tmp[1];
10387       Y_i[2*colStrideY] += alpha * A_ij * tmp[2];
10388     }
10389   }
10390 }
10391 
10392 template<class Ordinal,
10393          class MatrixScalar,
10394          class DomainScalar,
10395          class RangeScalar>
10396 void
10397 matVecCscColMajorForifConj4Vec (
10398   const Ordinal numRows,
10399   const Ordinal numCols,
10400   const Ordinal numVecs,
10401   const RangeScalar& beta,
10402   RangeScalar Y[],
10403   const Ordinal colStrideY,
10404   const RangeScalar& alpha,
10405   const size_t  ptr[],
10406   const Ordinal ind[],
10407   const MatrixScalar val[],
10408   const DomainScalar X[],
10409   const Ordinal colStrideX)
10410 {
10411   typedef Teuchos::ScalarTraits<RangeScalar> STS;
10412 
10413   // Prescale: Y := beta * Y.
10414   if (beta == STS::zero()) {
10415     for (Ordinal j = 0; j < numVecs; ++j) {
10416       RangeScalar* const Y_j = &Y[j*colStrideY];
10417       for (Ordinal i = 0; i < numRows; ++i) {
10418         // Follow the Sparse BLAS convention for beta == 0. 
10419         Y_j[i] = STS::zero();
10420       }
10421     }
10422   }
10423   else if (beta != STS::one()) {
10424     for (Ordinal j = 0; j < numVecs; ++j) {
10425       RangeScalar* const Y_j = &Y[j*colStrideY];
10426       for (Ordinal i = 0; i < numRows; ++i) {
10427         Y_j[i] = beta * Y_j[i];
10428       }
10429     }
10430   }
10431   // Outer for loop preface:
10432   if (alpha == STS::zero()) {
10433     return; // Our work is done!
10434   }
10435   const size_t nnz = ptr[numCols];
10436   if (alpha == STS::one()) {
10437     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
10438     // Initializing tmp here isn't necessary for correctness, but it
10439     // makes compilers stop complaining about uninitialized variables.
10440     DomainScalar tmp[4];
10441     tmp[0] = Teuchos::ScalarTraits<DomainScalar>::zero();
10442     tmp[1] = Teuchos::ScalarTraits<DomainScalar>::zero();
10443     tmp[2] = Teuchos::ScalarTraits<DomainScalar>::zero();
10444     tmp[3] = Teuchos::ScalarTraits<DomainScalar>::zero();
10445 
10446     Ordinal j = 0;
10447     for (size_t k = 0; k < nnz; ++k) {
10448       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
10449       const Ordinal i = ind[k];
10450       // NOTE: "if" instead of "while" here is only valid
10451       // if the matrix contains no empty rows.
10452       if (k >= ptr[j+1]) {
10453         ++j;
10454         const DomainScalar* const X_j = &X[j];
10455         tmp[0] = X_j[0];
10456         tmp[1] = X_j[colStrideX];
10457         tmp[2] = X_j[2*colStrideX];
10458         tmp[3] = X_j[3*colStrideX];
10459       }
10460       RangeScalar* const Y_i = &Y[i];
10461       Y_i[0] += A_ij * tmp[0];
10462       Y_i[colStrideY] += A_ij * tmp[1];
10463       Y_i[2*colStrideY] += A_ij * tmp[2];
10464       Y_i[3*colStrideY] += A_ij * tmp[3];
10465     }
10466   }
10467   else if (alpha == -STS::one()) {
10468     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
10469     // Initializing tmp here isn't necessary for correctness, but it
10470     // makes compilers stop complaining about uninitialized variables.
10471     DomainScalar tmp[4];
10472     tmp[0] = Teuchos::ScalarTraits<DomainScalar>::zero();
10473     tmp[1] = Teuchos::ScalarTraits<DomainScalar>::zero();
10474     tmp[2] = Teuchos::ScalarTraits<DomainScalar>::zero();
10475     tmp[3] = Teuchos::ScalarTraits<DomainScalar>::zero();
10476 
10477     Ordinal j = 0;
10478     for (size_t k = 0; k < nnz; ++k) {
10479       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
10480       const Ordinal i = ind[k];
10481       // NOTE: "if" instead of "while" here is only valid
10482       // if the matrix contains no empty rows.
10483       if (k >= ptr[j+1]) {
10484         ++j;
10485         const DomainScalar* const X_j = &X[j];
10486         tmp[0] = X_j[0];
10487         tmp[1] = X_j[colStrideX];
10488         tmp[2] = X_j[2*colStrideX];
10489         tmp[3] = X_j[3*colStrideX];
10490       }
10491       RangeScalar* const Y_i = &Y[i];
10492       Y_i[0] -= A_ij * tmp[0];
10493       Y_i[colStrideY] -= A_ij * tmp[1];
10494       Y_i[2*colStrideY] -= A_ij * tmp[2];
10495       Y_i[3*colStrideY] -= A_ij * tmp[3];
10496     }
10497   }
10498   else { // alpha != 1 && alpha != -1
10499     // Invariant: Right before updating Y(i,:), tmp = X(j,:).
10500     // Initializing tmp here isn't necessary for correctness, but it
10501     // makes compilers stop complaining about uninitialized variables.
10502     DomainScalar tmp[4];
10503     tmp[0] = Teuchos::ScalarTraits<DomainScalar>::zero();
10504     tmp[1] = Teuchos::ScalarTraits<DomainScalar>::zero();
10505     tmp[2] = Teuchos::ScalarTraits<DomainScalar>::zero();
10506     tmp[3] = Teuchos::ScalarTraits<DomainScalar>::zero();
10507 
10508     Ordinal j = 0;
10509     for (size_t k = 0; k < nnz; ++k) {
10510       const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
10511       const Ordinal i = ind[k];
10512       // NOTE: "if" instead of "while" here is only valid
10513       // if the matrix contains no empty rows.
10514       if (k >= ptr[j+1]) {
10515         ++j;
10516         const DomainScalar* const X_j = &X[j];
10517         tmp[0] = X_j[0];
10518         tmp[1] = X_j[colStrideX];
10519         tmp[2] = X_j[2*colStrideX];
10520         tmp[3] = X_j[3*colStrideX];
10521       }
10522       RangeScalar* const Y_i = &Y[i];
10523       Y_i[0] += alpha * A_ij * tmp[0];
10524       Y_i[colStrideY] += alpha * A_ij * tmp[1];
10525       Y_i[2*colStrideY] += alpha * A_ij * tmp[2];
10526       Y_i[3*colStrideY] += alpha * A_ij * tmp[3];
10527     }
10528   }
10529 }
10530 
10531 template<class Ordinal,
10532          class MatrixScalar,
10533          class DomainScalar,
10534          class RangeScalar>
10535 void
10536 matVecCsrColMajorForforConj (
10537   const Ordinal numRows,
10538   const Ordinal numCols,
10539   const Ordinal numVecs,
10540   const RangeScalar& beta,
10541   RangeScalar Y[],
10542   const Ordinal colStrideY,
10543   const RangeScalar& alpha,
10544   const size_t  ptr[],
10545   const Ordinal ind[],
10546   const MatrixScalar val[],
10547   const DomainScalar X[],
10548   const Ordinal colStrideX)
10549 {
10550   typedef Teuchos::ScalarTraits<RangeScalar> STS;
10551 
10552   // With CSR for alpha == 0, scale Y by beta and return.
10553   if (alpha == STS::zero()) {
10554     // Prescale: Y := beta * Y.
10555     if (beta == STS::zero()) {
10556       for (Ordinal j = 0; j < numVecs; ++j) {
10557         RangeScalar* const Y_j = &Y[j*colStrideY];
10558         for (Ordinal i = 0; i < numRows; ++i) {
10559           // Follow the Sparse BLAS convention for beta == 0. 
10560           Y_j[i] = STS::zero();
10561         }
10562       }
10563     }
10564     else if (beta != STS::one()) {
10565       for (Ordinal j = 0; j < numVecs; ++j) {
10566         RangeScalar* const Y_j = &Y[j*colStrideY];
10567         for (Ordinal i = 0; i < numRows; ++i) {
10568           Y_j[i] = beta * Y_j[i];
10569         }
10570       }
10571     }
10572     return; // Our work is done!
10573   }
10574   if (alpha == STS::one()) {
10575     if (beta == -STS::one()) {
10576       for (Ordinal i = 0; i < numRows; ++i) {
10577         // Initialize temporary values to -Y(i,:).
10578         for (Ordinal c = 0; c < numVecs; ++c) {
10579           RangeScalar tmp = -Y[i + c*colStrideY];
10580 
10581           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
10582             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
10583             const Ordinal j = ind[k];
10584             tmp += A_ij * X[j + c*colStrideX];
10585           }
10586           // Copy temporary values into output vector.
10587           Y[i + c*colStrideY] = tmp;
10588         }
10589       }
10590     }
10591     else if (beta == STS::zero()) {
10592       for (Ordinal i = 0; i < numRows; ++i) {
10593         // Initialize temporary values to 0.
10594         for (Ordinal c = 0; c < numVecs; ++c) {
10595           RangeScalar tmp = STS::zero();
10596 
10597           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
10598             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
10599             const Ordinal j = ind[k];
10600             tmp += A_ij * X[j + c*colStrideX];
10601           }
10602           // Copy temporary values into output vector.
10603           Y[i + c*colStrideY] = tmp;
10604         }
10605       }
10606     }
10607     else if (beta == STS::one()) {
10608       for (Ordinal i = 0; i < numRows; ++i) {
10609         // Initialize temporary values to Y(i,:).
10610         for (Ordinal c = 0; c < numVecs; ++c) {
10611           RangeScalar tmp = Y[i + c*colStrideY];
10612 
10613           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
10614             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
10615             const Ordinal j = ind[k];
10616             tmp += A_ij * X[j + c*colStrideX];
10617           }
10618           // Copy temporary values into output vector.
10619           Y[i + c*colStrideY] = tmp;
10620         }
10621       }
10622     }
10623     else { // beta != -1 && beta != 0 && beta != 1
10624       for (Ordinal i = 0; i < numRows; ++i) {
10625         // Initialize temporary values to Y(i,:) * beta.
10626         for (Ordinal c = 0; c < numVecs; ++c) {
10627           RangeScalar tmp = beta * Y[i + c*colStrideY];
10628 
10629           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
10630             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
10631             const Ordinal j = ind[k];
10632             tmp += A_ij * X[j + c*colStrideX];
10633           }
10634           // Copy temporary values into output vector.
10635           Y[i + c*colStrideY] = tmp;
10636         }
10637       }
10638     }
10639   }
10640   else if (alpha == -STS::one()) {
10641     if (beta == -STS::one()) {
10642       for (Ordinal i = 0; i < numRows; ++i) {
10643         // Initialize temporary values to -Y(i,:).
10644         for (Ordinal c = 0; c < numVecs; ++c) {
10645           RangeScalar tmp = -Y[i + c*colStrideY];
10646 
10647           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
10648             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
10649             const Ordinal j = ind[k];
10650             tmp -= A_ij * X[j + c*colStrideX];
10651           }
10652           // Copy temporary values into output vector.
10653           Y[i + c*colStrideY] = tmp;
10654         }
10655       }
10656     }
10657     else if (beta == STS::zero()) {
10658       for (Ordinal i = 0; i < numRows; ++i) {
10659         // Initialize temporary values to 0.
10660         for (Ordinal c = 0; c < numVecs; ++c) {
10661           RangeScalar tmp = STS::zero();
10662 
10663           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
10664             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
10665             const Ordinal j = ind[k];
10666             tmp -= A_ij * X[j + c*colStrideX];
10667           }
10668           // Copy temporary values into output vector.
10669           Y[i + c*colStrideY] = tmp;
10670         }
10671       }
10672     }
10673     else if (beta == STS::one()) {
10674       for (Ordinal i = 0; i < numRows; ++i) {
10675         // Initialize temporary values to Y(i,:).
10676         for (Ordinal c = 0; c < numVecs; ++c) {
10677           RangeScalar tmp = Y[i + c*colStrideY];
10678 
10679           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
10680             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
10681             const Ordinal j = ind[k];
10682             tmp -= A_ij * X[j + c*colStrideX];
10683           }
10684           // Copy temporary values into output vector.
10685           Y[i + c*colStrideY] = tmp;
10686         }
10687       }
10688     }
10689     else { // beta != -1 && beta != 0 && beta != 1
10690       for (Ordinal i = 0; i < numRows; ++i) {
10691         // Initialize temporary values to Y(i,:) * beta.
10692         for (Ordinal c = 0; c < numVecs; ++c) {
10693           RangeScalar tmp = beta * Y[i + c*colStrideY];
10694 
10695           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
10696             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
10697             const Ordinal j = ind[k];
10698             tmp -= A_ij * X[j + c*colStrideX];
10699           }
10700           // Copy temporary values into output vector.
10701           Y[i + c*colStrideY] = tmp;
10702         }
10703       }
10704     }
10705   }
10706   else { // alpha != 1 && alpha != -1
10707     if (beta == -STS::one()) {
10708       for (Ordinal i = 0; i < numRows; ++i) {
10709         // Initialize temporary values to -Y(i,:).
10710         for (Ordinal c = 0; c < numVecs; ++c) {
10711           RangeScalar tmp = -Y[i + c*colStrideY];
10712 
10713           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
10714             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
10715             const Ordinal j = ind[k];
10716             tmp += alpha * A_ij * X[j + c*colStrideX];
10717           }
10718           // Copy temporary values into output vector.
10719           Y[i + c*colStrideY] = tmp;
10720         }
10721       }
10722     }
10723     else if (beta == STS::zero()) {
10724       for (Ordinal i = 0; i < numRows; ++i) {
10725         // Initialize temporary values to 0.
10726         for (Ordinal c = 0; c < numVecs; ++c) {
10727           RangeScalar tmp = STS::zero();
10728 
10729           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
10730             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
10731             const Ordinal j = ind[k];
10732             tmp += alpha * A_ij * X[j + c*colStrideX];
10733           }
10734           // Copy temporary values into output vector.
10735           Y[i + c*colStrideY] = tmp;
10736         }
10737       }
10738     }
10739     else if (beta == STS::one()) {
10740       for (Ordinal i = 0; i < numRows; ++i) {
10741         // Initialize temporary values to Y(i,:).
10742         for (Ordinal c = 0; c < numVecs; ++c) {
10743           RangeScalar tmp = Y[i + c*colStrideY];
10744 
10745           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
10746             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
10747             const Ordinal j = ind[k];
10748             tmp += alpha * A_ij * X[j + c*colStrideX];
10749           }
10750           // Copy temporary values into output vector.
10751           Y[i + c*colStrideY] = tmp;
10752         }
10753       }
10754     }
10755     else { // beta != -1 && beta != 0 && beta != 1
10756       for (Ordinal i = 0; i < numRows; ++i) {
10757         // Initialize temporary values to Y(i,:) * beta.
10758         for (Ordinal c = 0; c < numVecs; ++c) {
10759           RangeScalar tmp = beta * Y[i + c*colStrideY];
10760 
10761           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
10762             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
10763             const Ordinal j = ind[k];
10764             tmp += alpha * A_ij * X[j + c*colStrideX];
10765           }
10766           // Copy temporary values into output vector.
10767           Y[i + c*colStrideY] = tmp;
10768         }
10769       }
10770     }
10771   }
10772 }
10773 
10774 template<class Ordinal,
10775          class MatrixScalar,
10776          class DomainScalar,
10777          class RangeScalar>
10778 void
10779 matVecCsrColMajorForforConjOmp (
10780   const Ordinal numRows,
10781   const Ordinal numCols,
10782   const Ordinal numVecs,
10783   const RangeScalar& beta,
10784   RangeScalar Y[],
10785   const Ordinal colStrideY,
10786   const RangeScalar& alpha,
10787   const size_t  ptr[],
10788   const Ordinal ind[],
10789   const MatrixScalar val[],
10790   const DomainScalar X[],
10791   const Ordinal colStrideX)
10792 {
10793   typedef Teuchos::ScalarTraits<RangeScalar> STS;
10794 
10795   // With CSR for alpha == 0, scale Y by beta and return.
10796   if (alpha == STS::zero()) {
10797     // Prescale: Y := beta * Y.
10798     if (beta == STS::zero()) {
10799       for (Ordinal j = 0; j < numVecs; ++j) {
10800         RangeScalar* const Y_j = &Y[j*colStrideY];
10801         #pragma omp parallel for
10802         for (Ordinal i = 0; i < numRows; ++i) {
10803           // Follow the Sparse BLAS convention for beta == 0. 
10804           Y_j[i] = STS::zero();
10805         }
10806       }
10807     }
10808     else if (beta != STS::one()) {
10809       for (Ordinal j = 0; j < numVecs; ++j) {
10810         RangeScalar* const Y_j = &Y[j*colStrideY];
10811         #pragma omp parallel for
10812         for (Ordinal i = 0; i < numRows; ++i) {
10813           Y_j[i] = beta * Y_j[i];
10814         }
10815       }
10816     }
10817     return; // Our work is done!
10818   }
10819   if (alpha == STS::one()) {
10820     if (beta == -STS::one()) {
10821       #pragma omp parallel for
10822       for (Ordinal i = 0; i < numRows; ++i) {
10823         // Initialize temporary values to -Y(i,:).
10824         for (Ordinal c = 0; c < numVecs; ++c) {
10825           RangeScalar tmp = -Y[i + c*colStrideY];
10826 
10827           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
10828             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
10829             const Ordinal j = ind[k];
10830             tmp += A_ij * X[j + c*colStrideX];
10831           }
10832           // Copy temporary values into output vector.
10833           Y[i + c*colStrideY] = tmp;
10834         }
10835       }
10836     }
10837     else if (beta == STS::zero()) {
10838       #pragma omp parallel for
10839       for (Ordinal i = 0; i < numRows; ++i) {
10840         // Initialize temporary values to 0.
10841         for (Ordinal c = 0; c < numVecs; ++c) {
10842           RangeScalar tmp = STS::zero();
10843 
10844           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
10845             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
10846             const Ordinal j = ind[k];
10847             tmp += A_ij * X[j + c*colStrideX];
10848           }
10849           // Copy temporary values into output vector.
10850           Y[i + c*colStrideY] = tmp;
10851         }
10852       }
10853     }
10854     else if (beta == STS::one()) {
10855       #pragma omp parallel for
10856       for (Ordinal i = 0; i < numRows; ++i) {
10857         // Initialize temporary values to Y(i,:).
10858         for (Ordinal c = 0; c < numVecs; ++c) {
10859           RangeScalar tmp = Y[i + c*colStrideY];
10860 
10861           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
10862             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
10863             const Ordinal j = ind[k];
10864             tmp += A_ij * X[j + c*colStrideX];
10865           }
10866           // Copy temporary values into output vector.
10867           Y[i + c*colStrideY] = tmp;
10868         }
10869       }
10870     }
10871     else { // beta != -1 && beta != 0 && beta != 1
10872       #pragma omp parallel for
10873       for (Ordinal i = 0; i < numRows; ++i) {
10874         // Initialize temporary values to Y(i,:) * beta.
10875         for (Ordinal c = 0; c < numVecs; ++c) {
10876           RangeScalar tmp = beta * Y[i + c*colStrideY];
10877 
10878           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
10879             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
10880             const Ordinal j = ind[k];
10881             tmp += A_ij * X[j + c*colStrideX];
10882           }
10883           // Copy temporary values into output vector.
10884           Y[i + c*colStrideY] = tmp;
10885         }
10886       }
10887     }
10888   }
10889   else if (alpha == -STS::one()) {
10890     if (beta == -STS::one()) {
10891       #pragma omp parallel for
10892       for (Ordinal i = 0; i < numRows; ++i) {
10893         // Initialize temporary values to -Y(i,:).
10894         for (Ordinal c = 0; c < numVecs; ++c) {
10895           RangeScalar tmp = -Y[i + c*colStrideY];
10896 
10897           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
10898             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
10899             const Ordinal j = ind[k];
10900             tmp -= A_ij * X[j + c*colStrideX];
10901           }
10902           // Copy temporary values into output vector.
10903           Y[i + c*colStrideY] = tmp;
10904         }
10905       }
10906     }
10907     else if (beta == STS::zero()) {
10908       #pragma omp parallel for
10909       for (Ordinal i = 0; i < numRows; ++i) {
10910         // Initialize temporary values to 0.
10911         for (Ordinal c = 0; c < numVecs; ++c) {
10912           RangeScalar tmp = STS::zero();
10913 
10914           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
10915             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
10916             const Ordinal j = ind[k];
10917             tmp -= A_ij * X[j + c*colStrideX];
10918           }
10919           // Copy temporary values into output vector.
10920           Y[i + c*colStrideY] = tmp;
10921         }
10922       }
10923     }
10924     else if (beta == STS::one()) {
10925       #pragma omp parallel for
10926       for (Ordinal i = 0; i < numRows; ++i) {
10927         // Initialize temporary values to Y(i,:).
10928         for (Ordinal c = 0; c < numVecs; ++c) {
10929           RangeScalar tmp = Y[i + c*colStrideY];
10930 
10931           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
10932             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
10933             const Ordinal j = ind[k];
10934             tmp -= A_ij * X[j + c*colStrideX];
10935           }
10936           // Copy temporary values into output vector.
10937           Y[i + c*colStrideY] = tmp;
10938         }
10939       }
10940     }
10941     else { // beta != -1 && beta != 0 && beta != 1
10942       #pragma omp parallel for
10943       for (Ordinal i = 0; i < numRows; ++i) {
10944         // Initialize temporary values to Y(i,:) * beta.
10945         for (Ordinal c = 0; c < numVecs; ++c) {
10946           RangeScalar tmp = beta * Y[i + c*colStrideY];
10947 
10948           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
10949             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
10950             const Ordinal j = ind[k];
10951             tmp -= A_ij * X[j + c*colStrideX];
10952           }
10953           // Copy temporary values into output vector.
10954           Y[i + c*colStrideY] = tmp;
10955         }
10956       }
10957     }
10958   }
10959   else { // alpha != 1 && alpha != -1
10960     if (beta == -STS::one()) {
10961       #pragma omp parallel for
10962       for (Ordinal i = 0; i < numRows; ++i) {
10963         // Initialize temporary values to -Y(i,:).
10964         for (Ordinal c = 0; c < numVecs; ++c) {
10965           RangeScalar tmp = -Y[i + c*colStrideY];
10966 
10967           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
10968             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
10969             const Ordinal j = ind[k];
10970             tmp += alpha * A_ij * X[j + c*colStrideX];
10971           }
10972           // Copy temporary values into output vector.
10973           Y[i + c*colStrideY] = tmp;
10974         }
10975       }
10976     }
10977     else if (beta == STS::zero()) {
10978       #pragma omp parallel for
10979       for (Ordinal i = 0; i < numRows; ++i) {
10980         // Initialize temporary values to 0.
10981         for (Ordinal c = 0; c < numVecs; ++c) {
10982           RangeScalar tmp = STS::zero();
10983 
10984           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
10985             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
10986             const Ordinal j = ind[k];
10987             tmp += alpha * A_ij * X[j + c*colStrideX];
10988           }
10989           // Copy temporary values into output vector.
10990           Y[i + c*colStrideY] = tmp;
10991         }
10992       }
10993     }
10994     else if (beta == STS::one()) {
10995       #pragma omp parallel for
10996       for (Ordinal i = 0; i < numRows; ++i) {
10997         // Initialize temporary values to Y(i,:).
10998         for (Ordinal c = 0; c < numVecs; ++c) {
10999           RangeScalar tmp = Y[i + c*colStrideY];
11000 
11001           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11002             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11003             const Ordinal j = ind[k];
11004             tmp += alpha * A_ij * X[j + c*colStrideX];
11005           }
11006           // Copy temporary values into output vector.
11007           Y[i + c*colStrideY] = tmp;
11008         }
11009       }
11010     }
11011     else { // beta != -1 && beta != 0 && beta != 1
11012       #pragma omp parallel for
11013       for (Ordinal i = 0; i < numRows; ++i) {
11014         // Initialize temporary values to Y(i,:) * beta.
11015         for (Ordinal c = 0; c < numVecs; ++c) {
11016           RangeScalar tmp = beta * Y[i + c*colStrideY];
11017 
11018           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11019             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11020             const Ordinal j = ind[k];
11021             tmp += alpha * A_ij * X[j + c*colStrideX];
11022           }
11023           // Copy temporary values into output vector.
11024           Y[i + c*colStrideY] = tmp;
11025         }
11026       }
11027     }
11028   }
11029 }
11030 
11031 template<class Ordinal,
11032          class MatrixScalar,
11033          class DomainScalar,
11034          class RangeScalar>
11035 void
11036 matVecCsrColMajorForforConj4Unrolled (
11037   const Ordinal numRows,
11038   const Ordinal numCols,
11039   const Ordinal numVecs,
11040   const RangeScalar& beta,
11041   RangeScalar Y[],
11042   const Ordinal colStrideY,
11043   const RangeScalar& alpha,
11044   const size_t  ptr[],
11045   const Ordinal ind[],
11046   const MatrixScalar val[],
11047   const DomainScalar X[],
11048   const Ordinal colStrideX)
11049 {
11050   typedef Teuchos::ScalarTraits<RangeScalar> STS;
11051 
11052   // With CSR for alpha == 0, scale Y by beta and return.
11053   if (alpha == STS::zero()) {
11054     // Prescale: Y := beta * Y.
11055     if (beta == STS::zero()) {
11056       for (Ordinal j = 0; j < numVecs; ++j) {
11057         RangeScalar* const Y_j = &Y[j*colStrideY];
11058         for (Ordinal i = 0; i < numRows; ++i) {
11059           // Follow the Sparse BLAS convention for beta == 0. 
11060           Y_j[i] = STS::zero();
11061         }
11062       }
11063     }
11064     else if (beta != STS::one()) {
11065       for (Ordinal j = 0; j < numVecs; ++j) {
11066         RangeScalar* const Y_j = &Y[j*colStrideY];
11067         for (Ordinal i = 0; i < numRows; ++i) {
11068           Y_j[i] = beta * Y_j[i];
11069         }
11070       }
11071     }
11072     return; // Our work is done!
11073   }
11074   if (alpha == STS::one()) {
11075     if (beta == -STS::one()) {
11076       for (Ordinal i = 0; i < numRows; ++i) {
11077         // Initialize temporary values to -Y(i,:).
11078         // Extra +1 in loop bound ensures first 4 iterations get
11079         // strip-mined, but requires that Ordinal be a signed type.
11080         Ordinal c = 0;
11081         for ( ; c < numVecs - 3; c += 4) {
11082           RangeScalar* const Y_i = &Y[i + c*colStrideY];
11083           RangeScalar tmp[4] = {-Y_i[0], -Y_i[colStrideY], -Y_i[2*colStrideY], -Y_i[3*colStrideY]};
11084 
11085           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11086             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11087             const Ordinal j = ind[k];
11088             const DomainScalar* const X_j = &X[j + c*colStrideX];
11089             tmp[0] += A_ij * X_j[0];
11090             tmp[1] += A_ij * X_j[colStrideX];
11091             tmp[2] += A_ij * X_j[2*colStrideX];
11092             tmp[3] += A_ij * X_j[3*colStrideX];
11093           }
11094           // Copy temporary values into output vector.
11095           Y_i[0] = tmp[0];
11096           Y_i[colStrideY] = tmp[1];
11097           Y_i[2*colStrideY] = tmp[2];
11098           Y_i[3*colStrideY] = tmp[3];
11099         }
11100         // Mop up left-over iterations over multivector columns.
11101         for ( ; c < numVecs; ++c) {
11102           RangeScalar tmp = -Y[i + c*colStrideY];
11103 
11104           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11105             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11106             const Ordinal j = ind[k];
11107             tmp += A_ij * X[j + c*colStrideX];
11108           }
11109           Y[i + c*colStrideY] = tmp;
11110         }
11111       }
11112     }
11113     else if (beta == STS::zero()) {
11114       for (Ordinal i = 0; i < numRows; ++i) {
11115         // Initialize temporary values to 0.
11116         // Extra +1 in loop bound ensures first 4 iterations get
11117         // strip-mined, but requires that Ordinal be a signed type.
11118         Ordinal c = 0;
11119         for ( ; c < numVecs - 3; c += 4) {
11120           RangeScalar* const Y_i = &Y[i + c*colStrideY];
11121           RangeScalar tmp[4] = {STS::zero(), STS::zero(), STS::zero(), STS::zero()};
11122 
11123           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11124             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11125             const Ordinal j = ind[k];
11126             const DomainScalar* const X_j = &X[j + c*colStrideX];
11127             tmp[0] += A_ij * X_j[0];
11128             tmp[1] += A_ij * X_j[colStrideX];
11129             tmp[2] += A_ij * X_j[2*colStrideX];
11130             tmp[3] += A_ij * X_j[3*colStrideX];
11131           }
11132           // Copy temporary values into output vector.
11133           Y_i[0] = tmp[0];
11134           Y_i[colStrideY] = tmp[1];
11135           Y_i[2*colStrideY] = tmp[2];
11136           Y_i[3*colStrideY] = tmp[3];
11137         }
11138         // Mop up left-over iterations over multivector columns.
11139         for ( ; c < numVecs; ++c) {
11140           RangeScalar tmp = STS::zero();
11141 
11142           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11143             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11144             const Ordinal j = ind[k];
11145             tmp += A_ij * X[j + c*colStrideX];
11146           }
11147           Y[i + c*colStrideY] = tmp;
11148         }
11149       }
11150     }
11151     else if (beta == STS::one()) {
11152       for (Ordinal i = 0; i < numRows; ++i) {
11153         // Initialize temporary values to Y(i,:).
11154         // Extra +1 in loop bound ensures first 4 iterations get
11155         // strip-mined, but requires that Ordinal be a signed type.
11156         Ordinal c = 0;
11157         for ( ; c < numVecs - 3; c += 4) {
11158           RangeScalar* const Y_i = &Y[i + c*colStrideY];
11159           RangeScalar tmp[4] = {Y_i[0], Y_i[colStrideY], Y_i[2*colStrideY], Y_i[3*colStrideY]};
11160 
11161           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11162             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11163             const Ordinal j = ind[k];
11164             const DomainScalar* const X_j = &X[j + c*colStrideX];
11165             tmp[0] += A_ij * X_j[0];
11166             tmp[1] += A_ij * X_j[colStrideX];
11167             tmp[2] += A_ij * X_j[2*colStrideX];
11168             tmp[3] += A_ij * X_j[3*colStrideX];
11169           }
11170           // Copy temporary values into output vector.
11171           Y_i[0] = tmp[0];
11172           Y_i[colStrideY] = tmp[1];
11173           Y_i[2*colStrideY] = tmp[2];
11174           Y_i[3*colStrideY] = tmp[3];
11175         }
11176         // Mop up left-over iterations over multivector columns.
11177         for ( ; c < numVecs; ++c) {
11178           RangeScalar tmp = Y[i + c*colStrideY];
11179 
11180           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11181             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11182             const Ordinal j = ind[k];
11183             tmp += A_ij * X[j + c*colStrideX];
11184           }
11185           Y[i + c*colStrideY] = tmp;
11186         }
11187       }
11188     }
11189     else { // beta != -1 && beta != 0 && beta != 1
11190       for (Ordinal i = 0; i < numRows; ++i) {
11191         // Initialize temporary values to Y(i,:) * beta.
11192         // Extra +1 in loop bound ensures first 4 iterations get
11193         // strip-mined, but requires that Ordinal be a signed type.
11194         Ordinal c = 0;
11195         for ( ; c < numVecs - 3; c += 4) {
11196           RangeScalar* const Y_i = &Y[i + c*colStrideY];
11197           RangeScalar tmp[4] = {beta * Y_i[0], beta * Y_i[colStrideY], beta * Y_i[2*colStrideY], beta * Y_i[3*colStrideY]};
11198 
11199           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11200             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11201             const Ordinal j = ind[k];
11202             const DomainScalar* const X_j = &X[j + c*colStrideX];
11203             tmp[0] += A_ij * X_j[0];
11204             tmp[1] += A_ij * X_j[colStrideX];
11205             tmp[2] += A_ij * X_j[2*colStrideX];
11206             tmp[3] += A_ij * X_j[3*colStrideX];
11207           }
11208           // Copy temporary values into output vector.
11209           Y_i[0] = tmp[0];
11210           Y_i[colStrideY] = tmp[1];
11211           Y_i[2*colStrideY] = tmp[2];
11212           Y_i[3*colStrideY] = tmp[3];
11213         }
11214         // Mop up left-over iterations over multivector columns.
11215         for ( ; c < numVecs; ++c) {
11216           RangeScalar tmp = beta * Y[i + c*colStrideY];
11217 
11218           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11219             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11220             const Ordinal j = ind[k];
11221             tmp += A_ij * X[j + c*colStrideX];
11222           }
11223           Y[i + c*colStrideY] = tmp;
11224         }
11225       }
11226     }
11227   }
11228   else if (alpha == -STS::one()) {
11229     if (beta == -STS::one()) {
11230       for (Ordinal i = 0; i < numRows; ++i) {
11231         // Initialize temporary values to -Y(i,:).
11232         // Extra +1 in loop bound ensures first 4 iterations get
11233         // strip-mined, but requires that Ordinal be a signed type.
11234         Ordinal c = 0;
11235         for ( ; c < numVecs - 3; c += 4) {
11236           RangeScalar* const Y_i = &Y[i + c*colStrideY];
11237           RangeScalar tmp[4] = {-Y_i[0], -Y_i[colStrideY], -Y_i[2*colStrideY], -Y_i[3*colStrideY]};
11238 
11239           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11240             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11241             const Ordinal j = ind[k];
11242             const DomainScalar* const X_j = &X[j + c*colStrideX];
11243             tmp[0] -= A_ij * X_j[0];
11244             tmp[1] -= A_ij * X_j[colStrideX];
11245             tmp[2] -= A_ij * X_j[2*colStrideX];
11246             tmp[3] -= A_ij * X_j[3*colStrideX];
11247           }
11248           // Copy temporary values into output vector.
11249           Y_i[0] = tmp[0];
11250           Y_i[colStrideY] = tmp[1];
11251           Y_i[2*colStrideY] = tmp[2];
11252           Y_i[3*colStrideY] = tmp[3];
11253         }
11254         // Mop up left-over iterations over multivector columns.
11255         for ( ; c < numVecs; ++c) {
11256           RangeScalar tmp = -Y[i + c*colStrideY];
11257 
11258           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11259             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11260             const Ordinal j = ind[k];
11261             tmp -= A_ij * X[j + c*colStrideX];
11262           }
11263           Y[i + c*colStrideY] = tmp;
11264         }
11265       }
11266     }
11267     else if (beta == STS::zero()) {
11268       for (Ordinal i = 0; i < numRows; ++i) {
11269         // Initialize temporary values to 0.
11270         // Extra +1 in loop bound ensures first 4 iterations get
11271         // strip-mined, but requires that Ordinal be a signed type.
11272         Ordinal c = 0;
11273         for ( ; c < numVecs - 3; c += 4) {
11274           RangeScalar* const Y_i = &Y[i + c*colStrideY];
11275           RangeScalar tmp[4] = {STS::zero(), STS::zero(), STS::zero(), STS::zero()};
11276 
11277           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11278             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11279             const Ordinal j = ind[k];
11280             const DomainScalar* const X_j = &X[j + c*colStrideX];
11281             tmp[0] -= A_ij * X_j[0];
11282             tmp[1] -= A_ij * X_j[colStrideX];
11283             tmp[2] -= A_ij * X_j[2*colStrideX];
11284             tmp[3] -= A_ij * X_j[3*colStrideX];
11285           }
11286           // Copy temporary values into output vector.
11287           Y_i[0] = tmp[0];
11288           Y_i[colStrideY] = tmp[1];
11289           Y_i[2*colStrideY] = tmp[2];
11290           Y_i[3*colStrideY] = tmp[3];
11291         }
11292         // Mop up left-over iterations over multivector columns.
11293         for ( ; c < numVecs; ++c) {
11294           RangeScalar tmp = STS::zero();
11295 
11296           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11297             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11298             const Ordinal j = ind[k];
11299             tmp -= A_ij * X[j + c*colStrideX];
11300           }
11301           Y[i + c*colStrideY] = tmp;
11302         }
11303       }
11304     }
11305     else if (beta == STS::one()) {
11306       for (Ordinal i = 0; i < numRows; ++i) {
11307         // Initialize temporary values to Y(i,:).
11308         // Extra +1 in loop bound ensures first 4 iterations get
11309         // strip-mined, but requires that Ordinal be a signed type.
11310         Ordinal c = 0;
11311         for ( ; c < numVecs - 3; c += 4) {
11312           RangeScalar* const Y_i = &Y[i + c*colStrideY];
11313           RangeScalar tmp[4] = {Y_i[0], Y_i[colStrideY], Y_i[2*colStrideY], Y_i[3*colStrideY]};
11314 
11315           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11316             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11317             const Ordinal j = ind[k];
11318             const DomainScalar* const X_j = &X[j + c*colStrideX];
11319             tmp[0] -= A_ij * X_j[0];
11320             tmp[1] -= A_ij * X_j[colStrideX];
11321             tmp[2] -= A_ij * X_j[2*colStrideX];
11322             tmp[3] -= A_ij * X_j[3*colStrideX];
11323           }
11324           // Copy temporary values into output vector.
11325           Y_i[0] = tmp[0];
11326           Y_i[colStrideY] = tmp[1];
11327           Y_i[2*colStrideY] = tmp[2];
11328           Y_i[3*colStrideY] = tmp[3];
11329         }
11330         // Mop up left-over iterations over multivector columns.
11331         for ( ; c < numVecs; ++c) {
11332           RangeScalar tmp = Y[i + c*colStrideY];
11333 
11334           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11335             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11336             const Ordinal j = ind[k];
11337             tmp -= A_ij * X[j + c*colStrideX];
11338           }
11339           Y[i + c*colStrideY] = tmp;
11340         }
11341       }
11342     }
11343     else { // beta != -1 && beta != 0 && beta != 1
11344       for (Ordinal i = 0; i < numRows; ++i) {
11345         // Initialize temporary values to Y(i,:) * beta.
11346         // Extra +1 in loop bound ensures first 4 iterations get
11347         // strip-mined, but requires that Ordinal be a signed type.
11348         Ordinal c = 0;
11349         for ( ; c < numVecs - 3; c += 4) {
11350           RangeScalar* const Y_i = &Y[i + c*colStrideY];
11351           RangeScalar tmp[4] = {beta * Y_i[0], beta * Y_i[colStrideY], beta * Y_i[2*colStrideY], beta * Y_i[3*colStrideY]};
11352 
11353           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11354             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11355             const Ordinal j = ind[k];
11356             const DomainScalar* const X_j = &X[j + c*colStrideX];
11357             tmp[0] -= A_ij * X_j[0];
11358             tmp[1] -= A_ij * X_j[colStrideX];
11359             tmp[2] -= A_ij * X_j[2*colStrideX];
11360             tmp[3] -= A_ij * X_j[3*colStrideX];
11361           }
11362           // Copy temporary values into output vector.
11363           Y_i[0] = tmp[0];
11364           Y_i[colStrideY] = tmp[1];
11365           Y_i[2*colStrideY] = tmp[2];
11366           Y_i[3*colStrideY] = tmp[3];
11367         }
11368         // Mop up left-over iterations over multivector columns.
11369         for ( ; c < numVecs; ++c) {
11370           RangeScalar tmp = beta * Y[i + c*colStrideY];
11371 
11372           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11373             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11374             const Ordinal j = ind[k];
11375             tmp -= A_ij * X[j + c*colStrideX];
11376           }
11377           Y[i + c*colStrideY] = tmp;
11378         }
11379       }
11380     }
11381   }
11382   else { // alpha != 1 && alpha != -1
11383     if (beta == -STS::one()) {
11384       for (Ordinal i = 0; i < numRows; ++i) {
11385         // Initialize temporary values to -Y(i,:).
11386         // Extra +1 in loop bound ensures first 4 iterations get
11387         // strip-mined, but requires that Ordinal be a signed type.
11388         Ordinal c = 0;
11389         for ( ; c < numVecs - 3; c += 4) {
11390           RangeScalar* const Y_i = &Y[i + c*colStrideY];
11391           RangeScalar tmp[4] = {-Y_i[0], -Y_i[colStrideY], -Y_i[2*colStrideY], -Y_i[3*colStrideY]};
11392 
11393           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11394             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11395             const Ordinal j = ind[k];
11396             const DomainScalar* const X_j = &X[j + c*colStrideX];
11397             tmp[0] += alpha * A_ij * X_j[0];
11398             tmp[1] += alpha * A_ij * X_j[colStrideX];
11399             tmp[2] += alpha * A_ij * X_j[2*colStrideX];
11400             tmp[3] += alpha * A_ij * X_j[3*colStrideX];
11401           }
11402           // Copy temporary values into output vector.
11403           Y_i[0] = tmp[0];
11404           Y_i[colStrideY] = tmp[1];
11405           Y_i[2*colStrideY] = tmp[2];
11406           Y_i[3*colStrideY] = tmp[3];
11407         }
11408         // Mop up left-over iterations over multivector columns.
11409         for ( ; c < numVecs; ++c) {
11410           RangeScalar tmp = -Y[i + c*colStrideY];
11411 
11412           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11413             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11414             const Ordinal j = ind[k];
11415             tmp += alpha * A_ij * X[j + c*colStrideX];
11416           }
11417           Y[i + c*colStrideY] = tmp;
11418         }
11419       }
11420     }
11421     else if (beta == STS::zero()) {
11422       for (Ordinal i = 0; i < numRows; ++i) {
11423         // Initialize temporary values to 0.
11424         // Extra +1 in loop bound ensures first 4 iterations get
11425         // strip-mined, but requires that Ordinal be a signed type.
11426         Ordinal c = 0;
11427         for ( ; c < numVecs - 3; c += 4) {
11428           RangeScalar* const Y_i = &Y[i + c*colStrideY];
11429           RangeScalar tmp[4] = {STS::zero(), STS::zero(), STS::zero(), STS::zero()};
11430 
11431           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11432             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11433             const Ordinal j = ind[k];
11434             const DomainScalar* const X_j = &X[j + c*colStrideX];
11435             tmp[0] += alpha * A_ij * X_j[0];
11436             tmp[1] += alpha * A_ij * X_j[colStrideX];
11437             tmp[2] += alpha * A_ij * X_j[2*colStrideX];
11438             tmp[3] += alpha * A_ij * X_j[3*colStrideX];
11439           }
11440           // Copy temporary values into output vector.
11441           Y_i[0] = tmp[0];
11442           Y_i[colStrideY] = tmp[1];
11443           Y_i[2*colStrideY] = tmp[2];
11444           Y_i[3*colStrideY] = tmp[3];
11445         }
11446         // Mop up left-over iterations over multivector columns.
11447         for ( ; c < numVecs; ++c) {
11448           RangeScalar tmp = STS::zero();
11449 
11450           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11451             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11452             const Ordinal j = ind[k];
11453             tmp += alpha * A_ij * X[j + c*colStrideX];
11454           }
11455           Y[i + c*colStrideY] = tmp;
11456         }
11457       }
11458     }
11459     else if (beta == STS::one()) {
11460       for (Ordinal i = 0; i < numRows; ++i) {
11461         // Initialize temporary values to Y(i,:).
11462         // Extra +1 in loop bound ensures first 4 iterations get
11463         // strip-mined, but requires that Ordinal be a signed type.
11464         Ordinal c = 0;
11465         for ( ; c < numVecs - 3; c += 4) {
11466           RangeScalar* const Y_i = &Y[i + c*colStrideY];
11467           RangeScalar tmp[4] = {Y_i[0], Y_i[colStrideY], Y_i[2*colStrideY], Y_i[3*colStrideY]};
11468 
11469           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11470             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11471             const Ordinal j = ind[k];
11472             const DomainScalar* const X_j = &X[j + c*colStrideX];
11473             tmp[0] += alpha * A_ij * X_j[0];
11474             tmp[1] += alpha * A_ij * X_j[colStrideX];
11475             tmp[2] += alpha * A_ij * X_j[2*colStrideX];
11476             tmp[3] += alpha * A_ij * X_j[3*colStrideX];
11477           }
11478           // Copy temporary values into output vector.
11479           Y_i[0] = tmp[0];
11480           Y_i[colStrideY] = tmp[1];
11481           Y_i[2*colStrideY] = tmp[2];
11482           Y_i[3*colStrideY] = tmp[3];
11483         }
11484         // Mop up left-over iterations over multivector columns.
11485         for ( ; c < numVecs; ++c) {
11486           RangeScalar tmp = Y[i + c*colStrideY];
11487 
11488           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11489             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11490             const Ordinal j = ind[k];
11491             tmp += alpha * A_ij * X[j + c*colStrideX];
11492           }
11493           Y[i + c*colStrideY] = tmp;
11494         }
11495       }
11496     }
11497     else { // beta != -1 && beta != 0 && beta != 1
11498       for (Ordinal i = 0; i < numRows; ++i) {
11499         // Initialize temporary values to Y(i,:) * beta.
11500         // Extra +1 in loop bound ensures first 4 iterations get
11501         // strip-mined, but requires that Ordinal be a signed type.
11502         Ordinal c = 0;
11503         for ( ; c < numVecs - 3; c += 4) {
11504           RangeScalar* const Y_i = &Y[i + c*colStrideY];
11505           RangeScalar tmp[4] = {beta * Y_i[0], beta * Y_i[colStrideY], beta * Y_i[2*colStrideY], beta * Y_i[3*colStrideY]};
11506 
11507           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11508             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11509             const Ordinal j = ind[k];
11510             const DomainScalar* const X_j = &X[j + c*colStrideX];
11511             tmp[0] += alpha * A_ij * X_j[0];
11512             tmp[1] += alpha * A_ij * X_j[colStrideX];
11513             tmp[2] += alpha * A_ij * X_j[2*colStrideX];
11514             tmp[3] += alpha * A_ij * X_j[3*colStrideX];
11515           }
11516           // Copy temporary values into output vector.
11517           Y_i[0] = tmp[0];
11518           Y_i[colStrideY] = tmp[1];
11519           Y_i[2*colStrideY] = tmp[2];
11520           Y_i[3*colStrideY] = tmp[3];
11521         }
11522         // Mop up left-over iterations over multivector columns.
11523         for ( ; c < numVecs; ++c) {
11524           RangeScalar tmp = beta * Y[i + c*colStrideY];
11525 
11526           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11527             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11528             const Ordinal j = ind[k];
11529             tmp += alpha * A_ij * X[j + c*colStrideX];
11530           }
11531           Y[i + c*colStrideY] = tmp;
11532         }
11533       }
11534     }
11535   }
11536 }
11537 
11538 template<class Ordinal,
11539          class MatrixScalar,
11540          class DomainScalar,
11541          class RangeScalar>
11542 void
11543 matVecCsrColMajorForforConj4UnrolledOmp (
11544   const Ordinal numRows,
11545   const Ordinal numCols,
11546   const Ordinal numVecs,
11547   const RangeScalar& beta,
11548   RangeScalar Y[],
11549   const Ordinal colStrideY,
11550   const RangeScalar& alpha,
11551   const size_t  ptr[],
11552   const Ordinal ind[],
11553   const MatrixScalar val[],
11554   const DomainScalar X[],
11555   const Ordinal colStrideX)
11556 {
11557   typedef Teuchos::ScalarTraits<RangeScalar> STS;
11558 
11559   // With CSR for alpha == 0, scale Y by beta and return.
11560   if (alpha == STS::zero()) {
11561     // Prescale: Y := beta * Y.
11562     if (beta == STS::zero()) {
11563       for (Ordinal j = 0; j < numVecs; ++j) {
11564         RangeScalar* const Y_j = &Y[j*colStrideY];
11565         #pragma omp parallel for
11566         for (Ordinal i = 0; i < numRows; ++i) {
11567           // Follow the Sparse BLAS convention for beta == 0. 
11568           Y_j[i] = STS::zero();
11569         }
11570       }
11571     }
11572     else if (beta != STS::one()) {
11573       for (Ordinal j = 0; j < numVecs; ++j) {
11574         RangeScalar* const Y_j = &Y[j*colStrideY];
11575         #pragma omp parallel for
11576         for (Ordinal i = 0; i < numRows; ++i) {
11577           Y_j[i] = beta * Y_j[i];
11578         }
11579       }
11580     }
11581     return; // Our work is done!
11582   }
11583   if (alpha == STS::one()) {
11584     if (beta == -STS::one()) {
11585       #pragma omp parallel for
11586       for (Ordinal i = 0; i < numRows; ++i) {
11587         // Initialize temporary values to -Y(i,:).
11588         // Extra +1 in loop bound ensures first 4 iterations get
11589         // strip-mined, but requires that Ordinal be a signed type.
11590         Ordinal c = 0;
11591         for ( ; c < numVecs - 3; c += 4) {
11592           RangeScalar* const Y_i = &Y[i + c*colStrideY];
11593           RangeScalar tmp[4] = {-Y_i[0], -Y_i[colStrideY], -Y_i[2*colStrideY], -Y_i[3*colStrideY]};
11594 
11595           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11596             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11597             const Ordinal j = ind[k];
11598             const DomainScalar* const X_j = &X[j + c*colStrideX];
11599             tmp[0] += A_ij * X_j[0];
11600             tmp[1] += A_ij * X_j[colStrideX];
11601             tmp[2] += A_ij * X_j[2*colStrideX];
11602             tmp[3] += A_ij * X_j[3*colStrideX];
11603           }
11604           // Copy temporary values into output vector.
11605           Y_i[0] = tmp[0];
11606           Y_i[colStrideY] = tmp[1];
11607           Y_i[2*colStrideY] = tmp[2];
11608           Y_i[3*colStrideY] = tmp[3];
11609         }
11610         // Mop up left-over iterations over multivector columns.
11611         for ( ; c < numVecs; ++c) {
11612           RangeScalar tmp = -Y[i + c*colStrideY];
11613 
11614           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11615             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11616             const Ordinal j = ind[k];
11617             tmp += A_ij * X[j + c*colStrideX];
11618           }
11619           Y[i + c*colStrideY] = tmp;
11620         }
11621       }
11622     }
11623     else if (beta == STS::zero()) {
11624       #pragma omp parallel for
11625       for (Ordinal i = 0; i < numRows; ++i) {
11626         // Initialize temporary values to 0.
11627         // Extra +1 in loop bound ensures first 4 iterations get
11628         // strip-mined, but requires that Ordinal be a signed type.
11629         Ordinal c = 0;
11630         for ( ; c < numVecs - 3; c += 4) {
11631           RangeScalar* const Y_i = &Y[i + c*colStrideY];
11632           RangeScalar tmp[4] = {STS::zero(), STS::zero(), STS::zero(), STS::zero()};
11633 
11634           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11635             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11636             const Ordinal j = ind[k];
11637             const DomainScalar* const X_j = &X[j + c*colStrideX];
11638             tmp[0] += A_ij * X_j[0];
11639             tmp[1] += A_ij * X_j[colStrideX];
11640             tmp[2] += A_ij * X_j[2*colStrideX];
11641             tmp[3] += A_ij * X_j[3*colStrideX];
11642           }
11643           // Copy temporary values into output vector.
11644           Y_i[0] = tmp[0];
11645           Y_i[colStrideY] = tmp[1];
11646           Y_i[2*colStrideY] = tmp[2];
11647           Y_i[3*colStrideY] = tmp[3];
11648         }
11649         // Mop up left-over iterations over multivector columns.
11650         for ( ; c < numVecs; ++c) {
11651           RangeScalar tmp = STS::zero();
11652 
11653           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11654             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11655             const Ordinal j = ind[k];
11656             tmp += A_ij * X[j + c*colStrideX];
11657           }
11658           Y[i + c*colStrideY] = tmp;
11659         }
11660       }
11661     }
11662     else if (beta == STS::one()) {
11663       #pragma omp parallel for
11664       for (Ordinal i = 0; i < numRows; ++i) {
11665         // Initialize temporary values to Y(i,:).
11666         // Extra +1 in loop bound ensures first 4 iterations get
11667         // strip-mined, but requires that Ordinal be a signed type.
11668         Ordinal c = 0;
11669         for ( ; c < numVecs - 3; c += 4) {
11670           RangeScalar* const Y_i = &Y[i + c*colStrideY];
11671           RangeScalar tmp[4] = {Y_i[0], Y_i[colStrideY], Y_i[2*colStrideY], Y_i[3*colStrideY]};
11672 
11673           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11674             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11675             const Ordinal j = ind[k];
11676             const DomainScalar* const X_j = &X[j + c*colStrideX];
11677             tmp[0] += A_ij * X_j[0];
11678             tmp[1] += A_ij * X_j[colStrideX];
11679             tmp[2] += A_ij * X_j[2*colStrideX];
11680             tmp[3] += A_ij * X_j[3*colStrideX];
11681           }
11682           // Copy temporary values into output vector.
11683           Y_i[0] = tmp[0];
11684           Y_i[colStrideY] = tmp[1];
11685           Y_i[2*colStrideY] = tmp[2];
11686           Y_i[3*colStrideY] = tmp[3];
11687         }
11688         // Mop up left-over iterations over multivector columns.
11689         for ( ; c < numVecs; ++c) {
11690           RangeScalar tmp = Y[i + c*colStrideY];
11691 
11692           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11693             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11694             const Ordinal j = ind[k];
11695             tmp += A_ij * X[j + c*colStrideX];
11696           }
11697           Y[i + c*colStrideY] = tmp;
11698         }
11699       }
11700     }
11701     else { // beta != -1 && beta != 0 && beta != 1
11702       #pragma omp parallel for
11703       for (Ordinal i = 0; i < numRows; ++i) {
11704         // Initialize temporary values to Y(i,:) * beta.
11705         // Extra +1 in loop bound ensures first 4 iterations get
11706         // strip-mined, but requires that Ordinal be a signed type.
11707         Ordinal c = 0;
11708         for ( ; c < numVecs - 3; c += 4) {
11709           RangeScalar* const Y_i = &Y[i + c*colStrideY];
11710           RangeScalar tmp[4] = {beta * Y_i[0], beta * Y_i[colStrideY], beta * Y_i[2*colStrideY], beta * Y_i[3*colStrideY]};
11711 
11712           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11713             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11714             const Ordinal j = ind[k];
11715             const DomainScalar* const X_j = &X[j + c*colStrideX];
11716             tmp[0] += A_ij * X_j[0];
11717             tmp[1] += A_ij * X_j[colStrideX];
11718             tmp[2] += A_ij * X_j[2*colStrideX];
11719             tmp[3] += A_ij * X_j[3*colStrideX];
11720           }
11721           // Copy temporary values into output vector.
11722           Y_i[0] = tmp[0];
11723           Y_i[colStrideY] = tmp[1];
11724           Y_i[2*colStrideY] = tmp[2];
11725           Y_i[3*colStrideY] = tmp[3];
11726         }
11727         // Mop up left-over iterations over multivector columns.
11728         for ( ; c < numVecs; ++c) {
11729           RangeScalar tmp = beta * Y[i + c*colStrideY];
11730 
11731           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11732             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11733             const Ordinal j = ind[k];
11734             tmp += A_ij * X[j + c*colStrideX];
11735           }
11736           Y[i + c*colStrideY] = tmp;
11737         }
11738       }
11739     }
11740   }
11741   else if (alpha == -STS::one()) {
11742     if (beta == -STS::one()) {
11743       #pragma omp parallel for
11744       for (Ordinal i = 0; i < numRows; ++i) {
11745         // Initialize temporary values to -Y(i,:).
11746         // Extra +1 in loop bound ensures first 4 iterations get
11747         // strip-mined, but requires that Ordinal be a signed type.
11748         Ordinal c = 0;
11749         for ( ; c < numVecs - 3; c += 4) {
11750           RangeScalar* const Y_i = &Y[i + c*colStrideY];
11751           RangeScalar tmp[4] = {-Y_i[0], -Y_i[colStrideY], -Y_i[2*colStrideY], -Y_i[3*colStrideY]};
11752 
11753           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11754             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11755             const Ordinal j = ind[k];
11756             const DomainScalar* const X_j = &X[j + c*colStrideX];
11757             tmp[0] -= A_ij * X_j[0];
11758             tmp[1] -= A_ij * X_j[colStrideX];
11759             tmp[2] -= A_ij * X_j[2*colStrideX];
11760             tmp[3] -= A_ij * X_j[3*colStrideX];
11761           }
11762           // Copy temporary values into output vector.
11763           Y_i[0] = tmp[0];
11764           Y_i[colStrideY] = tmp[1];
11765           Y_i[2*colStrideY] = tmp[2];
11766           Y_i[3*colStrideY] = tmp[3];
11767         }
11768         // Mop up left-over iterations over multivector columns.
11769         for ( ; c < numVecs; ++c) {
11770           RangeScalar tmp = -Y[i + c*colStrideY];
11771 
11772           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11773             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11774             const Ordinal j = ind[k];
11775             tmp -= A_ij * X[j + c*colStrideX];
11776           }
11777           Y[i + c*colStrideY] = tmp;
11778         }
11779       }
11780     }
11781     else if (beta == STS::zero()) {
11782       #pragma omp parallel for
11783       for (Ordinal i = 0; i < numRows; ++i) {
11784         // Initialize temporary values to 0.
11785         // Extra +1 in loop bound ensures first 4 iterations get
11786         // strip-mined, but requires that Ordinal be a signed type.
11787         Ordinal c = 0;
11788         for ( ; c < numVecs - 3; c += 4) {
11789           RangeScalar* const Y_i = &Y[i + c*colStrideY];
11790           RangeScalar tmp[4] = {STS::zero(), STS::zero(), STS::zero(), STS::zero()};
11791 
11792           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11793             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11794             const Ordinal j = ind[k];
11795             const DomainScalar* const X_j = &X[j + c*colStrideX];
11796             tmp[0] -= A_ij * X_j[0];
11797             tmp[1] -= A_ij * X_j[colStrideX];
11798             tmp[2] -= A_ij * X_j[2*colStrideX];
11799             tmp[3] -= A_ij * X_j[3*colStrideX];
11800           }
11801           // Copy temporary values into output vector.
11802           Y_i[0] = tmp[0];
11803           Y_i[colStrideY] = tmp[1];
11804           Y_i[2*colStrideY] = tmp[2];
11805           Y_i[3*colStrideY] = tmp[3];
11806         }
11807         // Mop up left-over iterations over multivector columns.
11808         for ( ; c < numVecs; ++c) {
11809           RangeScalar tmp = STS::zero();
11810 
11811           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11812             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11813             const Ordinal j = ind[k];
11814             tmp -= A_ij * X[j + c*colStrideX];
11815           }
11816           Y[i + c*colStrideY] = tmp;
11817         }
11818       }
11819     }
11820     else if (beta == STS::one()) {
11821       #pragma omp parallel for
11822       for (Ordinal i = 0; i < numRows; ++i) {
11823         // Initialize temporary values to Y(i,:).
11824         // Extra +1 in loop bound ensures first 4 iterations get
11825         // strip-mined, but requires that Ordinal be a signed type.
11826         Ordinal c = 0;
11827         for ( ; c < numVecs - 3; c += 4) {
11828           RangeScalar* const Y_i = &Y[i + c*colStrideY];
11829           RangeScalar tmp[4] = {Y_i[0], Y_i[colStrideY], Y_i[2*colStrideY], Y_i[3*colStrideY]};
11830 
11831           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11832             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11833             const Ordinal j = ind[k];
11834             const DomainScalar* const X_j = &X[j + c*colStrideX];
11835             tmp[0] -= A_ij * X_j[0];
11836             tmp[1] -= A_ij * X_j[colStrideX];
11837             tmp[2] -= A_ij * X_j[2*colStrideX];
11838             tmp[3] -= A_ij * X_j[3*colStrideX];
11839           }
11840           // Copy temporary values into output vector.
11841           Y_i[0] = tmp[0];
11842           Y_i[colStrideY] = tmp[1];
11843           Y_i[2*colStrideY] = tmp[2];
11844           Y_i[3*colStrideY] = tmp[3];
11845         }
11846         // Mop up left-over iterations over multivector columns.
11847         for ( ; c < numVecs; ++c) {
11848           RangeScalar tmp = Y[i + c*colStrideY];
11849 
11850           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11851             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11852             const Ordinal j = ind[k];
11853             tmp -= A_ij * X[j + c*colStrideX];
11854           }
11855           Y[i + c*colStrideY] = tmp;
11856         }
11857       }
11858     }
11859     else { // beta != -1 && beta != 0 && beta != 1
11860       #pragma omp parallel for
11861       for (Ordinal i = 0; i < numRows; ++i) {
11862         // Initialize temporary values to Y(i,:) * beta.
11863         // Extra +1 in loop bound ensures first 4 iterations get
11864         // strip-mined, but requires that Ordinal be a signed type.
11865         Ordinal c = 0;
11866         for ( ; c < numVecs - 3; c += 4) {
11867           RangeScalar* const Y_i = &Y[i + c*colStrideY];
11868           RangeScalar tmp[4] = {beta * Y_i[0], beta * Y_i[colStrideY], beta * Y_i[2*colStrideY], beta * Y_i[3*colStrideY]};
11869 
11870           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11871             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11872             const Ordinal j = ind[k];
11873             const DomainScalar* const X_j = &X[j + c*colStrideX];
11874             tmp[0] -= A_ij * X_j[0];
11875             tmp[1] -= A_ij * X_j[colStrideX];
11876             tmp[2] -= A_ij * X_j[2*colStrideX];
11877             tmp[3] -= A_ij * X_j[3*colStrideX];
11878           }
11879           // Copy temporary values into output vector.
11880           Y_i[0] = tmp[0];
11881           Y_i[colStrideY] = tmp[1];
11882           Y_i[2*colStrideY] = tmp[2];
11883           Y_i[3*colStrideY] = tmp[3];
11884         }
11885         // Mop up left-over iterations over multivector columns.
11886         for ( ; c < numVecs; ++c) {
11887           RangeScalar tmp = beta * Y[i + c*colStrideY];
11888 
11889           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11890             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11891             const Ordinal j = ind[k];
11892             tmp -= A_ij * X[j + c*colStrideX];
11893           }
11894           Y[i + c*colStrideY] = tmp;
11895         }
11896       }
11897     }
11898   }
11899   else { // alpha != 1 && alpha != -1
11900     if (beta == -STS::one()) {
11901       #pragma omp parallel for
11902       for (Ordinal i = 0; i < numRows; ++i) {
11903         // Initialize temporary values to -Y(i,:).
11904         // Extra +1 in loop bound ensures first 4 iterations get
11905         // strip-mined, but requires that Ordinal be a signed type.
11906         Ordinal c = 0;
11907         for ( ; c < numVecs - 3; c += 4) {
11908           RangeScalar* const Y_i = &Y[i + c*colStrideY];
11909           RangeScalar tmp[4] = {-Y_i[0], -Y_i[colStrideY], -Y_i[2*colStrideY], -Y_i[3*colStrideY]};
11910 
11911           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11912             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11913             const Ordinal j = ind[k];
11914             const DomainScalar* const X_j = &X[j + c*colStrideX];
11915             tmp[0] += alpha * A_ij * X_j[0];
11916             tmp[1] += alpha * A_ij * X_j[colStrideX];
11917             tmp[2] += alpha * A_ij * X_j[2*colStrideX];
11918             tmp[3] += alpha * A_ij * X_j[3*colStrideX];
11919           }
11920           // Copy temporary values into output vector.
11921           Y_i[0] = tmp[0];
11922           Y_i[colStrideY] = tmp[1];
11923           Y_i[2*colStrideY] = tmp[2];
11924           Y_i[3*colStrideY] = tmp[3];
11925         }
11926         // Mop up left-over iterations over multivector columns.
11927         for ( ; c < numVecs; ++c) {
11928           RangeScalar tmp = -Y[i + c*colStrideY];
11929 
11930           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11931             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11932             const Ordinal j = ind[k];
11933             tmp += alpha * A_ij * X[j + c*colStrideX];
11934           }
11935           Y[i + c*colStrideY] = tmp;
11936         }
11937       }
11938     }
11939     else if (beta == STS::zero()) {
11940       #pragma omp parallel for
11941       for (Ordinal i = 0; i < numRows; ++i) {
11942         // Initialize temporary values to 0.
11943         // Extra +1 in loop bound ensures first 4 iterations get
11944         // strip-mined, but requires that Ordinal be a signed type.
11945         Ordinal c = 0;
11946         for ( ; c < numVecs - 3; c += 4) {
11947           RangeScalar* const Y_i = &Y[i + c*colStrideY];
11948           RangeScalar tmp[4] = {STS::zero(), STS::zero(), STS::zero(), STS::zero()};
11949 
11950           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11951             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11952             const Ordinal j = ind[k];
11953             const DomainScalar* const X_j = &X[j + c*colStrideX];
11954             tmp[0] += alpha * A_ij * X_j[0];
11955             tmp[1] += alpha * A_ij * X_j[colStrideX];
11956             tmp[2] += alpha * A_ij * X_j[2*colStrideX];
11957             tmp[3] += alpha * A_ij * X_j[3*colStrideX];
11958           }
11959           // Copy temporary values into output vector.
11960           Y_i[0] = tmp[0];
11961           Y_i[colStrideY] = tmp[1];
11962           Y_i[2*colStrideY] = tmp[2];
11963           Y_i[3*colStrideY] = tmp[3];
11964         }
11965         // Mop up left-over iterations over multivector columns.
11966         for ( ; c < numVecs; ++c) {
11967           RangeScalar tmp = STS::zero();
11968 
11969           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11970             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11971             const Ordinal j = ind[k];
11972             tmp += alpha * A_ij * X[j + c*colStrideX];
11973           }
11974           Y[i + c*colStrideY] = tmp;
11975         }
11976       }
11977     }
11978     else if (beta == STS::one()) {
11979       #pragma omp parallel for
11980       for (Ordinal i = 0; i < numRows; ++i) {
11981         // Initialize temporary values to Y(i,:).
11982         // Extra +1 in loop bound ensures first 4 iterations get
11983         // strip-mined, but requires that Ordinal be a signed type.
11984         Ordinal c = 0;
11985         for ( ; c < numVecs - 3; c += 4) {
11986           RangeScalar* const Y_i = &Y[i + c*colStrideY];
11987           RangeScalar tmp[4] = {Y_i[0], Y_i[colStrideY], Y_i[2*colStrideY], Y_i[3*colStrideY]};
11988 
11989           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
11990             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
11991             const Ordinal j = ind[k];
11992             const DomainScalar* const X_j = &X[j + c*colStrideX];
11993             tmp[0] += alpha * A_ij * X_j[0];
11994             tmp[1] += alpha * A_ij * X_j[colStrideX];
11995             tmp[2] += alpha * A_ij * X_j[2*colStrideX];
11996             tmp[3] += alpha * A_ij * X_j[3*colStrideX];
11997           }
11998           // Copy temporary values into output vector.
11999           Y_i[0] = tmp[0];
12000           Y_i[colStrideY] = tmp[1];
12001           Y_i[2*colStrideY] = tmp[2];
12002           Y_i[3*colStrideY] = tmp[3];
12003         }
12004         // Mop up left-over iterations over multivector columns.
12005         for ( ; c < numVecs; ++c) {
12006           RangeScalar tmp = Y[i + c*colStrideY];
12007 
12008           for (size_t k = ptr[i]; k < ptr[i+1]; ++k) {
12009             const MatrixScalar A_ij = Teuchos::ScalarTraits<MatrixScalar>::conjugate (val[k]);
12010             const Ordinal j = ind[k];
12011             tmp += alpha * A_ij * X[j + c*colStrideX];
12012           }
12013           Y[i + c*colStrideY] = tmp;
12014         }
12015       }
12016     }
12017