Kokkos Node API and Local Linear Algebra Kernels Version of the Day
Kokkos_AltSparseOps.hpp
00001 //@HEADER
00002 // ************************************************************************
00003 //
00004 //          Kokkos: Node API and Parallel Node Kernels
00005 //              Copyright (2008) Sandia Corporation
00006 //
00007 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
00008 // the U.S. Government retains certain rights in this software.
00009 //
00010 // Redistribution and use in source and binary forms, with or without
00011 // modification, are permitted provided that the following conditions are
00012 // met:
00013 //
00014 // 1. Redistributions of source code must retain the above copyright
00015 // notice, this list of conditions and the following disclaimer.
00016 //
00017 // 2. Redistributions in binary form must reproduce the above copyright
00018 // notice, this list of conditions and the following disclaimer in the
00019 // documentation and/or other materials provided with the distribution.
00020 //
00021 // 3. Neither the name of the Corporation nor the names of the
00022 // contributors may be used to endorse or promote products derived from
00023 // this software without specific prior written permission.
00024 //
00025 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
00026 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00027 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00028 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
00029 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00030 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00031 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00032 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00033 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00034 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00035 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00036 //
00037 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
00038 //
00039 // ************************************************************************
00040 //@HEADER
00041 
00042 #ifndef __Kokkos_AltSparseOps_hpp
00043 #define __Kokkos_AltSparseOps_hpp
00044 
00045 #include <Teuchos_CompileTimeAssert.hpp>
00046 #include <Teuchos_DataAccess.hpp>
00047 #include <Teuchos_Describable.hpp>
00048 #include <Teuchos_StandardParameterEntryValidators.hpp>
00049 #include <Teuchos_SerialDenseMatrix.hpp>
00050 #include <iterator>
00051 #include <stdexcept>
00052 
00053 #include "Kokkos_ConfigDefs.hpp"
00054 #include "Kokkos_CrsMatrixBase.hpp"
00055 #include "Kokkos_CrsGraphBase.hpp"
00056 
00057 #include "Kokkos_MultiVector.hpp"
00058 #include "Kokkos_DefaultArithmetic.hpp"
00059 
00060 #include "Kokkos_Raw_SparseMatVec_decl.hpp"
00061 #include "Kokkos_Raw_SparseMatVec_def.hpp"
00062 #include "Kokkos_Raw_SparseTriangularSolve_decl.hpp"
00063 #include "Kokkos_Raw_SparseTriangularSolve_def.hpp"
00064 
00065 #ifdef HAVE_KOKKOS_OPENMP
00066 #  include "Kokkos_OpenMPNode.hpp"
00067 #endif // HAVE_KOKKOS_OPENMP
00068 #include "Kokkos_SerialNode.hpp"
00069 
00070 
00071 namespace Kokkos {
00072 
00073   namespace details {
00074 
00077     template <class Ordinal, class Node>
00078     class AltSparseOpsDefaultAllocator {
00079     public:
00080       static Teuchos::ArrayRCP<size_t>
00081       allocRowPtrs (const Teuchos::RCP<Node>& node,
00082                     const Teuchos::ArrayView<const size_t>& numEntriesPerRow)
00083       {
00084         (void) node;
00085 
00086         const Ordinal numRows = numEntriesPerRow.size ();
00087 
00088         // Let the ArrayRCP constructor do the allocation.
00089         Teuchos::ArrayRCP<size_t> ptr (numRows + 1);
00090         ptr[0] = 0; // We'll fill in the rest of the entries below.
00091 
00092         if (numRows > 0) {
00093           // Fill in ptr sequentially for now.  We might parallelize
00094           // this later, though it's still only O(numRows) work.
00095           // Parallel prefix is only likely to pay off for a large
00096           // number of threads.
00097           std::partial_sum (numEntriesPerRow.getRawPtr (),
00098                             numEntriesPerRow.getRawPtr () + numRows,
00099                             ptr.getRawPtr () + 1);
00100         }
00101         return ptr;
00102       }
00103 
00104       static Teuchos::ArrayRCP<size_t>
00105       copyRowPtrs (const Teuchos::RCP<Node>& node,
00106                    const Teuchos::ArrayView<const size_t>& rowPtrs)
00107       {
00108         (void) node;
00109 
00110         // Let the ArrayRCP constructor do the allocation.
00111         Teuchos::ArrayRCP<size_t> ptr (rowPtrs.size ());
00112 
00113         // Copy rowPtrs sequentially for now.  We might parallelize
00114         // this later, though it's still only O(numRows) work.
00115         std::copy (rowPtrs.getRawPtr (),
00116                    rowPtrs.getRawPtr () + rowPtrs.size (),
00117                    ptr.getRawPtr ());
00118         return ptr;
00119       }
00120 
00121       template<class T>
00122       static Teuchos::ArrayRCP<T>
00123       allocStorage (const Teuchos::RCP<Node>& node,
00124                     const Teuchos::ArrayView<const size_t>& rowPtrs)
00125       {
00126         (void) node;
00127 
00128         using Teuchos::ArrayRCP;
00129         using Teuchos::arcp;
00130 
00131         TEUCHOS_TEST_FOR_EXCEPTION(rowPtrs.size() == 0, std::invalid_argument,
00132           "AltSparseOpsDefaultAllocator::allocStorage: The input rowPtrs array "
00133           "must have length at least one, but rowPtrs.size() = 0.  (Remember that "
00134           "rowPtrs must have exactly one more entry than the number of rows in "
00135           "the matrix.)");
00136 
00137         const Ordinal numRows = rowPtrs.size() - 1;
00138         const size_t totalNumEntries = rowPtrs[numRows];
00139         const T zero = Teuchos::ScalarTraits<T>::zero ();
00140 
00141         // Let the ArrayRCP constructor do the allocation.
00142         ArrayRCP<T> val (totalNumEntries);
00143 
00144         // Initialize the values sequentially.
00145         std::fill (val.getRawPtr (), val.getRawPtr () + totalNumEntries, zero);
00146         return val;
00147       }
00148 
00149       template<class T>
00150       static Teuchos::ArrayRCP<T>
00151       copyStorage (const Teuchos::RCP<Node>& node,
00152                    const Teuchos::ArrayView<const size_t>& rowPtrs,
00153                    const Teuchos::ArrayView<const T>& inputVals)
00154       {
00155         (void) node;
00156 
00157         using Teuchos::ArrayRCP;
00158         using Teuchos::arcp;
00159 
00160         TEUCHOS_TEST_FOR_EXCEPTION(rowPtrs.size() == 0, std::invalid_argument,
00161           "AltSparseOpsDefaultAllocator::copyStorage: The input rowPtrs array "
00162           "must have length at least one, but rowPtrs.size() = 0.  (Remember that "
00163           "rowPtrs must have exactly one more entry than the number of rows in "
00164           "the matrix.)");
00165 
00166         const Ordinal numRows = rowPtrs.size() - 1;
00167         const size_t totalNumEntries = rowPtrs[numRows];
00168 
00169         TEUCHOS_TEST_FOR_EXCEPTION(
00170           inputVals.size() != totalNumEntries, std::invalid_argument,
00171           "AltSparseOpsDefaultAllocator::copyStorage: The inputVals array "
00172           "must have as many entries as the number of entries in the local "
00173           "sparse matrix, namely " << totalNumEntries << ".");
00174 
00175         // Let the ArrayRCP constructor do the allocation.
00176         ArrayRCP<T> val (totalNumEntries);
00177 
00178         // Get the raw pointers so that the compiler doesn't have to
00179         // optimize across the ArrayView inlining.
00180         const T* const rawInputVals = inputVals.getRawPtr ();
00181         T* const rawOutputVals = val.getRawPtr ();
00182 
00183         // Copy the values sequentially.
00184         std::copy (rawInputVals, rawInputVals + totalNumEntries, rawOutputVals);
00185         return val;
00186       }
00187     };
00188 
00189 
00190     // Kokkos Kernels for generic first-touch allocation.
00191     namespace {
00192       template<class Ordinal, class T>
00193       class ZeroInitKernel {
00194       private:
00195         T* const x_;
00196 
00197       public:
00198         ZeroInitKernel (T* const x) : x_ (x) {}
00199 
00200         void execute (const Ordinal i) {
00201           x_[i] = Teuchos::ScalarTraits<T>::zero ();
00202         }
00203       };
00204 
00205       template<class Ordinal, class T>
00206       class CopyKernel {
00207       private:
00208         T* const out_;
00209         const T* const in_;
00210 
00211       public:
00212         CopyKernel (T* const out, const T* const in) :
00213           out_ (out), in_ (in)
00214         {}
00215 
00216         void execute (const int i) {
00217           out_[i] = in_[i];
00218         }
00219       };
00220 
00221       template<class Ordinal, class T>
00222       class CsrInitKernel {
00223       private:
00224         const size_t* const ptr_;
00225         T* const val_;
00226 
00227       public:
00228         CsrInitKernel (const size_t* const ptr,
00229                        T* const val) :
00230           ptr_ (ptr),
00231           val_ (val)
00232         {}
00233 
00234         void execute (const Ordinal r) {
00235           for (size_t k = ptr_[r]; k < ptr_[r+1]; ++k) {
00236             val_[k] = Teuchos::ScalarTraits<T>::zero ();
00237           }
00238         }
00239       };
00240 
00241 
00242       template<class Ordinal, class T>
00243       class CsrCopyKernel {
00244       private:
00245         const size_t* const ptr_;
00246         T* const outVal_;
00247         const T* const inVal_;
00248 
00249       public:
00250         CsrCopyKernel (const size_t* const ptr,
00251                        T* const outVal,
00252                        const T* const inVal) :
00253           ptr_ (ptr),
00254           outVal_ (outVal),
00255           inVal_ (inVal)
00256         {}
00257 
00258         void execute (const Ordinal r) {
00259           for (size_t k = ptr_[r]; k < ptr_[r+1]; ++k) {
00260             outVal_[k] = inVal_[k];
00261           }
00262         }
00263       };
00264     } // namespace (anonymous)
00265 
00266 
00272     template <class Ordinal, class Node>
00273     class AltSparseOpsFirstTouchAllocator {
00274     public:
00275       static Teuchos::ArrayRCP<size_t>
00276       allocRowPtrs (const Teuchos::RCP<Node>& node,
00277                     const Teuchos::ArrayView<const size_t>& numEntriesPerRow)
00278       {
00279         using Teuchos::ArrayRCP;
00280         using Teuchos::arcp;
00281         const Ordinal numRows = numEntriesPerRow.size ();
00282 
00283         // Allocate raw, since arcp() might initialize in debug mode.
00284         size_t* const rawPtr = new size_t [numRows + 1];
00285 
00286         // Initialize the row pointers in parallel, using the Kokkos
00287         // Node.  If the Kokkos Node's parallelization scheme respects
00288         // first-touch initialization, this should set the proper NUMA
00289         // affinity, at least at page boundaries.
00290         typedef ZeroInitKernel<Ordinal, size_t> kernel_type;
00291         node->parallel_for (0, numRows+1, kernel_type (rawPtr));
00292 
00293         // Encapsulate the raw pointer in an (owning) ArrayRCP.
00294         ArrayRCP<size_t> ptr = arcp<size_t> (rawPtr, 0, numRows+1, true);
00295 
00296         if (numRows > 0) {
00297           // Fill in ptr sequentially for now.  We might parallelize
00298           // this later, though it's still only O(numRows) work.
00299           // Parallel prefix is only likely to pay off for a large
00300           // number of threads.
00301           std::partial_sum (numEntriesPerRow.getRawPtr (),
00302                             numEntriesPerRow.getRawPtr () + numRows,
00303                             ptr.getRawPtr () + 1);
00304         }
00305         return ptr;
00306       }
00307 
00308       static Teuchos::ArrayRCP<size_t>
00309       copyRowPtrs (const Teuchos::RCP<Node>& node,
00310                    const Teuchos::ArrayView<const size_t>& rowPtrs)
00311       {
00312         using Teuchos::arcp;
00313         using Teuchos::as;
00314 
00315         TEUCHOS_TEST_FOR_EXCEPTION(rowPtrs.size() == 0, std::invalid_argument,
00316           "AltSparseOpsFirstTouchAllocator::copyRowPtrs: "
00317           "The input row offsets array rowPtrs must always have length at least one, "
00318           "even if the graph or matrix to which it belongs has zero rows.");
00319 
00320         // We require that the number of rows in the matrix fit in
00321         // Ordinal.  In a debug build, as() may check for overflow
00322         // when converting from size_t to Ordinal.
00323         Ordinal numRows = Teuchos::OrdinalTraits<Ordinal>::zero ();
00324         {
00325           const size_t numRowsSizeT = as<size_t> (rowPtrs.size() - 1);
00326           numRows = as<size_t> (numRowsSizeT);
00327         }
00328 
00329         // Extract the raw pointer, so that we don't have to make the
00330         // compiler inline ArrayView::operator[] in the loop below.
00331         const size_t* const rawRowPtrs = rowPtrs.getRawPtr ();
00332 
00333         // Allocate raw, since arcp() might initialize in debug mode.
00334         size_t* const rawPtr = new size_t [numRows + 1];
00335 
00336         // Copy the row pointers in parallel.  If first touch works,
00337         // this should set the proper affinity at page boundaries.
00338         typedef CopyKernel<Ordinal, size_t> kernel_type;
00339         node->parallel_for (0, numRows+1, kernel_type (rawPtr, rawRowPtrs));
00340 
00341         // Encapsulate the raw pointer in an (owning) ArrayRCP.
00342         return arcp<size_t> (rawPtr, 0, numRows+1, true);
00343       }
00344 
00345       template<class T>
00346       static Teuchos::ArrayRCP<T>
00347       allocStorage (const Teuchos::RCP<Node>& node,
00348                     const Teuchos::ArrayView<const size_t>& rowPtrs)
00349       {
00350         using Teuchos::ArrayRCP;
00351         using Teuchos::arcp;
00352 
00353         TEUCHOS_TEST_FOR_EXCEPTION(rowPtrs.size() == 0, std::invalid_argument,
00354           "AltSparseOpsFirstTouchAllocator::allocStorage: The input rowPtrs array "
00355           "must have length at least one, but rowPtrs.size() = 0.  (Remember that "
00356           "rowPtrs must have exactly one more entry than the number of rows in "
00357           "the matrix.)");
00358 
00359         const Ordinal numRows = rowPtrs.size() - 1;
00360         const size_t totalNumEntries = rowPtrs[numRows];
00361 
00362         // Allocate raw, since arcp() might initialize in debug mode.
00363         T* const rawVal = new T [totalNumEntries];
00364 
00365         // Get the row pointer so that the compiler doesn't have to
00366         // optimize the ArrayView.
00367         const size_t* const rawRowPtrs = rowPtrs.getRawPtr ();
00368 
00369         // Initialize the values in parallel.  If first touch works,
00370         // this should set the proper affinity at page boundaries.  We
00371         // iterate over the values using the row pointers, so that if
00372         // the Kokkos Node parallelizes in a reproducible way, it will
00373         // initialize the values using the same affinity with which
00374         // the row pointers were initialized.
00375         CsrInitKernel<Ordinal, T> kernel (rawRowPtrs, rawVal);
00376         node->parallel_for (0, numRows, kernel);
00377 
00378         // Encapsulate the raw pointer in an (owning) ArrayRCP.
00379         return arcp<T> (rawVal, 0, totalNumEntries, true);
00380       }
00381 
00382 
00383       template<class T>
00384       static Teuchos::ArrayRCP<T>
00385       copyStorage (const Teuchos::RCP<Node>& node,
00386                    const Teuchos::ArrayView<const size_t>& rowPtrs,
00387                    const Teuchos::ArrayView<const T>& inputVals)
00388       {
00389         using Teuchos::ArrayRCP;
00390         using Teuchos::arcp;
00391         using Teuchos::as;
00392 
00393         TEUCHOS_TEST_FOR_EXCEPTION(rowPtrs.size() == 0, std::invalid_argument,
00394           "AltSparseOpsFirstTouchAllocator::copyStorage: The input rowPtrs array "
00395           "must have length at least one, but rowPtrs.size() = 0.  (Remember that "
00396           "rowPtrs must have exactly one more entry than the number of rows in "
00397           "the matrix.)");
00398 
00399         const Ordinal numRows = rowPtrs.size() - 1;
00400         const size_t totalNumEntries = rowPtrs[numRows];
00401 
00402         // Casting from signed to unsigned won't cause overflow.
00403         TEUCHOS_TEST_FOR_EXCEPTION(
00404           as<size_t> (inputVals.size()) != totalNumEntries,
00405           std::invalid_argument,
00406           "AltSparseOpsFirstTouchAllocator::copyStorage: The inputVals array "
00407           "must have as many entries as the number of entries in the local "
00408           "sparse matrix, namely " << totalNumEntries << ".");
00409 
00410         // Allocate raw, since arcp() might initialize in debug mode.
00411         T* const rawOutputVals = new T [totalNumEntries];
00412 
00413         // Get the raw pointers so that the compiler doesn't have to
00414         // optimize across the ArrayView inlining.
00415         const size_t* const rawRowPtrs = rowPtrs.getRawPtr ();
00416         const T* const rawInputVals = inputVals.getRawPtr ();
00417 
00418         // Copy the values in parallel.  If first touch works, this
00419         // should set the proper affinity at page boundaries.  We
00420         // iterate over the values using the row pointers, so that if
00421         // the Kokkos Node parallelizes in a reproducible way, it will
00422         // initialize the values using the same affinity with which
00423         // the row pointers were initialized.
00424         typedef CsrCopyKernel<Ordinal, T> kernel_type;
00425         kernel_type kernel (rawRowPtrs, rawOutputVals, rawInputVals);
00426         node->parallel_for (0, numRows, kernel);
00427 
00428         // Encapsulate the raw pointer in an (owning) ArrayRCP.
00429         return arcp<T> (rawOutputVals, 0, totalNumEntries, true);
00430       }
00431     };
00432 
00433 
00434 #ifdef HAVE_KOKKOS_OPENMP
00435     // Partial speicalization of AltSparseOpsFirstTouchAllocator for
00436     // Node=Kokkos::OpenMPNode.  This just uses OpenMP pragmas
00437     // directly, avoiding any possible overhead of going through the
00438     // Kokkos Node instance.
00439     template <class Ordinal>
00440     class AltSparseOpsFirstTouchAllocator<Ordinal, OpenMPNode> {
00441     public:
00442       static Teuchos::ArrayRCP<size_t>
00443       allocRowPtrs (const Teuchos::RCP<OpenMPNode>& node,
00444                     const Teuchos::ArrayView<const size_t>& numEntriesPerRow)
00445       {
00446         (void) node;
00447 
00448         using Teuchos::ArrayRCP;
00449         using Teuchos::arcp;
00450         const Ordinal numRows = numEntriesPerRow.size ();
00451 
00452         // Allocate raw, since arcp() might initialize in debug mode.
00453         size_t* const rawPtr = new size_t [numRows + 1];
00454 
00455         // Initialize the row pointers in parallel.  If the operating
00456         // system respects first-touch initialization, this should set
00457         // the proper NUMA affinity, at least at page boundaries.  We
00458         // don't rely on the Kokkos Node for parallelization; instead,
00459         // we use OpenMP, since that's what our kernels use.  If
00460         // you're building without OpenMP support, the compiler will
00461         // simply ignore this pragma.
00462         #pragma omp parallel for
00463         for (Ordinal i = 0; i < numRows+1; ++i) {
00464           rawPtr[i] = 0;
00465         }
00466         // Encapsulate the raw pointer in an (owning) ArrayRCP.
00467         ArrayRCP<size_t> ptr = arcp<size_t> (rawPtr, 0, numRows+1, true);
00468 
00469         if (numRows > 0) {
00470           // Fill in ptr sequentially for now.  We might parallelize
00471           // this later, though it's still only O(numRows) work.
00472           // Parallel prefix is only likely to pay off for a large
00473           // number of threads.
00474           std::partial_sum (numEntriesPerRow.getRawPtr (),
00475                             numEntriesPerRow.getRawPtr () + numRows,
00476                             ptr.getRawPtr () + 1);
00477         }
00478         return ptr;
00479       }
00480 
00481       static Teuchos::ArrayRCP<size_t>
00482       copyRowPtrs (const Teuchos::RCP<OpenMPNode>& node,
00483                    const Teuchos::ArrayView<const size_t>& rowPtrs)
00484       {
00485         (void) node;
00486 
00487         using Teuchos::arcp;
00488         const Ordinal numRows = rowPtrs.size () - 1;
00489 
00490         // Don't force the compiler to inline ArrayView::operator[] in
00491         // the loop below.
00492         const size_t* const rawRowPtrs = rowPtrs.getRawPtr ();
00493 
00494         // Allocate raw, since arcp() might initialize in debug mode.
00495         size_t* const rawPtr = new size_t [numRows + 1];
00496 
00497         // Copy the row pointers in parallel.  If first touch works,
00498         // this should set the proper affinity at page boundaries.  We
00499         // don't rely on the Kokkos Node for this; instead, we use
00500         // OpenMP, since that's what our kernels use.  If you're
00501         // building without OpenMP support, the compiler will simply
00502         // ignore this pragma.
00503         #pragma omp parallel for
00504         for (Ordinal i = 0; i < numRows+1; ++i) {
00505           rawPtr[i] = rawRowPtrs[i];
00506         }
00507 
00508         // Encapsulate the raw pointer in an (owning) ArrayRCP.
00509         return arcp<size_t> (rawPtr, 0, numRows+1, true);
00510       }
00511 
00512       template<class T>
00513       static Teuchos::ArrayRCP<T>
00514       allocStorage (const Teuchos::RCP<OpenMPNode>& node,
00515                     const Teuchos::ArrayView<const size_t>& rowPtrs)
00516       {
00517         (void) node;
00518 
00519         using Teuchos::ArrayRCP;
00520         using Teuchos::arcp;
00521 
00522         TEUCHOS_TEST_FOR_EXCEPTION(rowPtrs.size() == 0, std::invalid_argument,
00523           "AltSparseOpsFirstTouchAllocator::allocStorage: The input rowPtrs array "
00524           "must have length at least one, but rowPtrs.size() = 0.  (Remember that "
00525           "rowPtrs must have exactly one more entry than the number of rows in "
00526           "the matrix.)");
00527 
00528         const Ordinal numRows = rowPtrs.size() - 1;
00529         const size_t totalNumEntries = rowPtrs[numRows];
00530         const T zero = Teuchos::ScalarTraits<T>::zero ();
00531 
00532         // Allocate raw, since arcp() might initialize in debug mode.
00533         T* const rawVal = new T [totalNumEntries];
00534 
00535         // Get the row pointer so that the compiler doesn't have to
00536         // optimize the ArrayView.
00537         const size_t* const rawRowPtrs = rowPtrs.getRawPtr ();
00538 
00539         // Initialize the values in parallel.  If first touch works,
00540         // this should set the proper affinity at page boundaries.  We
00541         // don't rely on the Kokkos Node for this; instead, we use
00542         // OpenMP, since that's what our kernels use.  We also iterate
00543         // over the values using the row pointers, so that if OpenMP
00544         // parallelizes in a reproducible way, it will initialize the
00545         // values using the same affinity with which the row pointers
00546         // were initialized.
00547         //
00548         // If you're building without OpenMP support, the compiler will
00549         // simply ignore this pragma.
00550 #pragma omp parallel for
00551         for (Ordinal i = 0; i < numRows; ++i) {
00552           for (size_t k = rawRowPtrs[i]; k < rawRowPtrs[i+1]; ++k) {
00553             rawVal[k] = zero;
00554           }
00555         }
00556 
00557         // Encapsulate the raw pointer in an (owning) ArrayRCP.
00558         return arcp<T> (rawVal, 0, totalNumEntries, true);
00559       }
00560 
00561 
00562       template<class T>
00563       static Teuchos::ArrayRCP<T>
00564       copyStorage (const Teuchos::RCP<Kokkos::OpenMPNode>& node,
00565                    const Teuchos::ArrayView<const size_t>& rowPtrs,
00566                    const Teuchos::ArrayView<const T>& inputVals)
00567       {
00568         (void) node;
00569 
00570         using Teuchos::ArrayRCP;
00571         using Teuchos::arcp;
00572 
00573         TEUCHOS_TEST_FOR_EXCEPTION(rowPtrs.size() == 0, std::invalid_argument,
00574           "AltSparseOpsFirstTouchAllocator::copyStorage: The input rowPtrs array "
00575           "must have length at least one, but rowPtrs.size() = 0.  (Remember that "
00576           "rowPtrs must have exactly one more entry than the number of rows in "
00577           "the matrix.)");
00578 
00579         const Ordinal numRows = rowPtrs.size() - 1;
00580         const size_t totalNumEntries = rowPtrs[numRows];
00581 
00582         TEUCHOS_TEST_FOR_EXCEPTION(
00583           inputVals.size() != totalNumEntries, std::invalid_argument,
00584           "AltSparseOpsFirstTouchAllocator::copyStorage: The inputVals array "
00585           "must have as many entries as the number of entries in the local "
00586           "sparse matrix, namely " << totalNumEntries << ".");
00587 
00588         // Allocate raw, since arcp() might initialize in debug mode.
00589         T* const rawOutputVals = new T [totalNumEntries];
00590 
00591         // Get the raw pointers so that the compiler doesn't have to
00592         // optimize across the ArrayView inlining.
00593         const size_t* const rawRowPtrs = rowPtrs.getRawPtr ();
00594         const T* const rawInputVals = inputVals.getRawPtr ();
00595 
00596         // Copy the values in parallel.  If first touch works, this
00597         // should set the proper affinity at page boundaries.  We
00598         // don't rely on the Kokkos Node for this; instead, we use
00599         // OpenMP, since that's what our kernels use.  We also iterate
00600         // over the values using the row pointers, so that if OpenMP
00601         // parallelizes in a reproducible way, it will initialize the
00602         // values using the same affinity with which the row pointers
00603         // were initialized.
00604         //
00605         // If you're building without OpenMP support, the compiler will
00606         // simply ignore this pragma.
00607 #pragma omp parallel for
00608         for (Ordinal i = 0; i < numRows; ++i) {
00609           for (size_t k = rawRowPtrs[i]; k < rawRowPtrs[i+1]; ++k) {
00610             rawOutputVals[k] = rawInputVals[k];
00611           }
00612         }
00613 
00614         // Encapsulate the raw pointer in an (owning) ArrayRCP.
00615         return arcp<T> (rawOutputVals, 0, totalNumEntries, true);
00616       }
00617     };
00618 #endif // HAVE_KOKKOS_OPENMP
00619 
00620   } // namespace details
00621 
00625   template <class Ordinal,class Node>
00626   class AltCrsGraph {
00627   public:
00628     typedef Ordinal ordinal_type;
00629     typedef Node node_type;
00630 
00631     AltCrsGraph (Ordinal numRows, Ordinal numCols,
00632                  const Teuchos::RCP<Node>& node,
00633                  const Teuchos::RCP<Teuchos::ParameterList>& params);
00634 
00635     Teuchos::RCP<Node> getNode() const {
00636       return node_;
00637     }
00638 
00639     Ordinal getNumRows () const {
00640       return numRows_;
00641     }
00642 
00643     Ordinal getNumCols () const {
00644       return numCols_;
00645     }
00646 
00647     Teuchos::ArrayRCP<const size_t> getPointers() const {
00648       return ptr_;
00649     }
00650 
00651     Teuchos::ArrayRCP<const Ordinal> getIndices() const {
00652       return ind_;
00653     }
00654 
00655     bool isInitialized() const {
00656       return isInitialized_;
00657     }
00658 
00663     bool isEmpty() const {
00664       return isEmpty_;
00665     }
00666 
00672     bool hasEmptyRows() const {
00673       return hasEmptyRows_;
00674     }
00675 
00676     void setStructure (const Teuchos::ArrayRCP<const size_t>& ptr,
00677                        const Teuchos::ArrayRCP<const Ordinal>& ind);
00678     void setMatDesc (Teuchos::EUplo uplo, Teuchos::EDiag diag);
00679     void getMatDesc (Teuchos::EUplo &uplo, Teuchos::EDiag &diag) const;
00680 
00681   private:
00682     Teuchos::RCP<Node> node_;
00683     Ordinal numRows_, numCols_;
00684     //Teuchos::RCP<ParameterList> params_;
00685     bool isInitialized_;
00686     bool isEmpty_;
00687     bool hasEmptyRows_;
00688     Teuchos::EUplo tri_uplo_;
00689     Teuchos::EDiag unit_diag_;
00690 
00691     Teuchos::ArrayRCP<const size_t> ptr_;
00692     Teuchos::ArrayRCP<const Ordinal> ind_;
00693   };
00694 
00695 
00702   template <class Scalar,
00703             class Ordinal,
00704             class Node>
00705   class AltCrsMatrix {
00706   public:
00707     typedef Scalar scalar_type;
00708     typedef Ordinal ordinal_type;
00709     typedef Node node_type;
00710     typedef AltCrsGraph<Ordinal,Node> graph_type;
00711 
00712     AltCrsMatrix (const Teuchos::RCP<const AltCrsGraph<Ordinal,Node> > &graph,
00713                   const Teuchos::RCP<Teuchos::ParameterList> &params);
00714 
00715     void setValues (const Teuchos::ArrayRCP<const Scalar>& val);
00716 
00717     Teuchos::ArrayRCP<const Scalar> getValues() const {
00718       return val_;
00719     }
00720 
00721     bool isInitialized() const {
00722       return isInitialized_;
00723     }
00724 
00725   private:
00726     Teuchos::RCP<const graph_type> graph_;
00727     Teuchos::ArrayRCP<const Scalar> val_;
00728     bool isInitialized_;
00729   };
00730 
00731   template <class Ordinal, class Node>
00732   AltCrsGraph<Ordinal,Node>::
00733   AltCrsGraph (Ordinal numRows, Ordinal numCols,
00734                const Teuchos::RCP<Node> &node,
00735                const Teuchos::RCP<Teuchos::ParameterList> &params) :
00736     node_ (node),
00737     numRows_ (numRows),
00738     numCols_ (numCols),
00739     isInitialized_ (false),
00740     isEmpty_ (numRows == 0), // provisional; a matrix with numRows > 0
00741                              // may still have zero entries.
00742     hasEmptyRows_ (true), // provisional
00743     tri_uplo_ (Teuchos::UNDEF_TRI),
00744     unit_diag_ (Teuchos::NON_UNIT_DIAG)
00745   {
00746     // Make sure that users only specialize for Kokkos Node types that
00747     // are host Nodes (vs. device Nodes, such as GPU Nodes).
00748     Teuchos::CompileTimeAssert<Node::isHostNode == false> cta; (void) cta;
00749 
00750     // We don't use params currently.
00751     (void) params;
00752   }
00753 
00754   template <class Ordinal, class Node>
00755   void
00756   AltCrsGraph<Ordinal,Node>::
00757   setStructure (const Teuchos::ArrayRCP<const size_t> &ptr,
00758                 const Teuchos::ArrayRCP<const Ordinal> &ind)
00759   {
00760     std::string tfecfFuncName("setStructure(ptr,ind)");
00761     const Ordinal numRows = this->getNumRows();
00762 
00763     // mfh 19 June 2012: The tests expect std::runtime_error rather
00764     // than the arguably more appropriate std::invalid_argument, so
00765     // I'll throw std::runtime_error here.  Ditto for the other checks
00766     // below.
00767     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
00768       ptr.is_null (),
00769       std::runtime_error,
00770       ": The input array 'ptr' must be nonnull, even for a matrix with zero "
00771       "rows.");
00772 
00773     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
00774       (size_t) ptr.size() != (size_t) numRows+1,
00775       std::runtime_error,
00776       ": Graph input data are not coherent:\n"
00777       "-- ptr.size() = " << ptr.size() << " != numRows+1 = "
00778       << (numRows+1) << ".");
00779 
00780     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
00781       ptr[0] != 0,
00782       std::runtime_error,
00783       ": Graph input data are not coherent:\n"
00784       "-- ptr[0] = " << ptr[0] << " != 0.");
00785 
00786     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
00787       (size_t) ind.size() != (size_t) ptr[numRows],
00788       std::runtime_error,
00789       ": Graph input data are not coherent:\n"
00790       "-- ind.size() = " << ind.size() << " != ptr[numRows="
00791       << numRows << "] = " << ptr[numRows] << ".");
00792 
00793     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
00794       isInitialized_,
00795       std::runtime_error,
00796       ": Graph has already been initialized."
00797     )
00798 
00799     const size_t numEntries = ptr[numRows];
00800     if (numRows == 0 || numEntries == 0) {
00801       isEmpty_ = true;
00802       hasEmptyRows_ = true; // trivially
00803     }
00804     else {
00805       isEmpty_ = false;
00806       // Check whether the graph has any empty rows.
00807       bool emptyRows = false;
00808       for (Ordinal i = 0; i < numRows; ++i) {
00809         if (ptr[i] == ptr[i+1]) {
00810           emptyRows = true;
00811           break;
00812         }
00813       }
00814       hasEmptyRows_ = emptyRows;
00815     }
00816     ptr_ = ptr;
00817     ind_ = ind;
00818     isInitialized_ = true;
00819   }
00820 
00821   template <class Ordinal, class Node>
00822   void
00823   AltCrsGraph<Ordinal,Node>::
00824   setMatDesc (Teuchos::EUplo uplo, Teuchos::EDiag diag)
00825   {
00826     tri_uplo_ = uplo;
00827     unit_diag_ = diag;
00828   }
00829 
00830   template <class Ordinal, class Node>
00831   void
00832   AltCrsGraph<Ordinal,Node>::
00833   getMatDesc (Teuchos::EUplo &uplo, Teuchos::EDiag &diag) const
00834   {
00835     uplo = tri_uplo_;
00836     diag = unit_diag_;
00837   }
00838 
00839   template <class Scalar, class Ordinal, class Node>
00840   AltCrsMatrix<Scalar,Ordinal,Node>::
00841   AltCrsMatrix (const Teuchos::RCP<const AltCrsGraph<Ordinal,Node> >& graph,
00842                 const Teuchos::RCP<Teuchos::ParameterList>& params) :
00843     graph_ (graph),
00844     isInitialized_ (false)
00845   {
00846     // Make sure that users only specialize for Kokkos Node types that
00847     // are host Nodes (vs. device Nodes, such as GPU Nodes).
00848     Teuchos::CompileTimeAssert<Node::isHostNode == false> cta; (void) cta;
00849 
00850     // We don't use params currently.
00851     (void) params;
00852   }
00853 
00854   template <class Scalar, class Ordinal, class Node>
00855   void
00856   AltCrsMatrix<Scalar,Ordinal,Node>::
00857   setValues (const Teuchos::ArrayRCP<const Scalar> &val)
00858   {
00859     std::string tfecfFuncName("setValues(val)");
00860     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
00861       isInitialized_, std::runtime_error, ": The matrix is already initialized."
00862     );
00863     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
00864       graph_.is_null() && ! val.is_null(),
00865       std::runtime_error,
00866       ": The matrix has a null graph, but you're trying to give it a nonnull "
00867       "array of values."
00868     );
00869     val_ = val;
00870     if (val_.is_null ()) {
00871       isInitialized_ = false;
00872     }
00873     else {
00874       isInitialized_ = true;
00875     }
00876   }
00877 
00931   template <class Scalar,
00932             class Ordinal,
00933             class Node,
00934             class Allocator = details::AltSparseOpsFirstTouchAllocator<Ordinal, Node> >
00935   class AltSparseOps : public Teuchos::Describable {
00936   public:
00938 
00939 
00941     typedef Scalar scalar_type;
00943     typedef Ordinal ordinal_type;
00945     typedef Node node_type;
00947     typedef Allocator allocator_type;
00949     typedef AltSparseOps<Scalar, Ordinal, Node, Allocator> sparse_ops_type;
00950 
00952     template <class O, class N>
00953     struct graph {
00954       typedef AltCrsGraph<O,N> graph_type;
00955     };
00956 
00958     template <class S, class O, class N>
00959     struct matrix {
00960       typedef AltCrsMatrix<S,O,N> matrix_type;
00961     };
00962 
00981     template <class S2>
00982     struct bind_scalar {
00983       typedef AltSparseOps<S2, Ordinal, Node> other_type;
00984     };
00985 
01001     template <class O2>
01002     struct bind_ordinal {
01003       typedef DefaultHostSparseOps<Scalar,O2,Node,Allocator> other_type;
01004     };
01005 
01007 
01008 
01009 
01019     AltSparseOps (const Teuchos::RCP<Node>& node);
01020 
01035     AltSparseOps (const Teuchos::RCP<Node>& node,
01036                   Teuchos::ParameterList& plist);
01037 
01039     ~AltSparseOps();
01040 
01049     static Teuchos::RCP<const Teuchos::ParameterList>
01050     getDefaultParameters ()
01051     {
01052       using Teuchos::ParameterList;
01053       using Teuchos::parameterList;
01054       using Teuchos::RCP;
01055       using Teuchos::rcp_const_cast;
01056 
01057       RCP<ParameterList> plist = parameterList ("AltSparseOps");
01058       setDefaultParameters (*plist);
01059       return rcp_const_cast<const ParameterList> (plist);
01060     }
01061 
01063 
01064 
01065 
01067     std::string description () const {
01068       using Teuchos::TypeNameTraits;
01069       std::ostringstream os;
01070       os <<  "Kokkos::AltSparseOps<"
01071          << "Scalar=" << TypeNameTraits<Scalar>::name()
01072          << ", Ordinal=" << TypeNameTraits<Ordinal>::name()
01073          << ", Node=" << TypeNameTraits<Node>::name()
01074          << ">";
01075       return os.str();
01076     }
01077 
01079     void
01080     describe (Teuchos::FancyOStream& out,
01081               const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const
01082     {
01083       using Teuchos::EVerbosityLevel;
01084       using Teuchos::includesVerbLevel;
01085       using Teuchos::OSTab;
01086       using Teuchos::rcpFromRef;
01087       using Teuchos::VERB_DEFAULT;
01088       using Teuchos::VERB_NONE;
01089       using Teuchos::VERB_LOW;
01090       using Teuchos::VERB_MEDIUM;
01091       using Teuchos::VERB_HIGH;
01092       using Teuchos::VERB_EXTREME;
01093       using std::endl;
01094 
01095       // Interpret the default verbosity level as VERB_MEDIUM.
01096       const EVerbosityLevel vl =
01097         (verbLevel == VERB_DEFAULT) ? VERB_MEDIUM : verbLevel;
01098 
01099       if (vl == VERB_NONE) {
01100         return;
01101       }
01102       else if (includesVerbLevel (vl, VERB_LOW)) { // vl >= VERB_LOW
01103         out << this->description();
01104 
01105         if (includesVerbLevel (vl, VERB_MEDIUM)) { // vl >= VERB_MEDIUM
01106           out << ":" << endl;
01107           OSTab tab1 (rcpFromRef (out));
01108 
01109           out << "matVecVariant_ = " << matVecVariant_ << endl
01110               << "unroll_ = " << unroll_ << endl
01111               << "isInitialized_ = " << isInitialized_ << endl;
01112           if (isInitialized_) {
01113             std::string triUplo ("INVALID");
01114             if (tri_uplo_ == Teuchos::UNDEF_TRI) {
01115               triUplo = "UNDEF_TRI";
01116             }
01117             else if (tri_uplo_ == Teuchos::LOWER_TRI) {
01118               triUplo = "LOWER_TRI";
01119             }
01120             else if (tri_uplo_ == Teuchos::UPPER_TRI) {
01121               triUplo = "UPPER_TRI";
01122             }
01123             std::string unitDiag ("INVALID");
01124             if (unit_diag_ == Teuchos::NON_UNIT_DIAG) {
01125               unitDiag = "NON_UNIT_DIAG";
01126             }
01127             else if (unit_diag_ == Teuchos::UNIT_DIAG) {
01128               unitDiag = "UNIT_DIAG";
01129             }
01130 
01131             out << "numRows_ = " << numRows_ << endl
01132                 << "numCols_ = " << numCols_ << endl
01133                 << "isEmpty_ = " << isEmpty_ << endl
01134                 << "hasEmptyRows_ = " << hasEmptyRows_ << endl
01135                 << "tri_uplo_ = " << triUplo << endl
01136                 << "unit_diag_ = " << unitDiag << endl;
01137             if (ptr_.size() > 0) {
01138               out << "numEntries = " << ptr_[ptr_.size()-1] << endl;
01139             }
01140             else {
01141               out << "numEntries = 0" << endl;
01142             }
01143 
01144             if (includesVerbLevel (vl, VERB_EXTREME)) { // vl >= VERB_EXTREME
01145               // Only print out all the sparse matrix's data in
01146               // extreme verbosity mode.
01147               out << "ptr_ = [";
01148               std::copy (ptr_.begin(), ptr_.end(),
01149                          std::ostream_iterator<Ordinal> (out, " "));
01150               out << "]" << endl << "ind_ = [";
01151               std::copy (ind_.begin(), ind_.end(),
01152                          std::ostream_iterator<Ordinal> (out, " "));
01153               out << "]" << endl << "val_ = [";
01154               std::copy (val_.begin(), val_.end(),
01155                          std::ostream_iterator<Scalar> (out, " "));
01156               out << "]" << endl;
01157             } // vl >= VERB_EXTREME
01158           } // if is initialized
01159         } // vl >= VERB_MEDIUM
01160       } // vl >= VERB_LOW
01161     }
01162 
01173     Teuchos::RCP<Teuchos::SerialDenseMatrix<int, scalar_type> >
01174     asDenseMatrix () const
01175     {
01176       using Teuchos::RCP;
01177       using Teuchos::rcp;
01178       typedef Teuchos::OrdinalTraits<ordinal_type> OTO;
01179       typedef Teuchos::ScalarTraits<scalar_type> STS;
01180       typedef Teuchos::SerialDenseMatrix<int, scalar_type> dense_matrix_type;
01181 
01182       RCP<dense_matrix_type> A_ptr =
01183         rcp (new dense_matrix_type (numRows_, numCols_));
01184       dense_matrix_type& A = *A_ptr; // for notational convenience
01185 
01186       for (ordinal_type i = OTO::zero(); i < numRows_; ++i) {
01187         for (size_t k = ptr_[i]; k < ptr_[i+1]; ++k) {
01188           const ordinal_type j = ind_[k];
01189           const scalar_type A_ij = val_[k];
01190           A(i,j) += A_ij;
01191         }
01192         if (unit_diag_ == Teuchos::UNIT_DIAG) {
01193           // Respect whatever is in the sparse matrix, even if it is wrong.
01194           // This is helpful for debugging.
01195           A(i,i) += STS::one ();
01196         }
01197       }
01198       return A_ptr;
01199     }
01200 
01202 
01203 
01204 
01206     Teuchos::RCP<Node> getNode () const;
01207 
01209 
01210 
01211 
01217     static Teuchos::ArrayRCP<size_t>
01218     allocRowPtrs (const Teuchos::RCP<Node>& node,
01219                   const ArrayView<const size_t>& numEntriesPerRow);
01220 
01229     static Teuchos::ArrayRCP<size_t>
01230     copyRowPtrs (const Teuchos::RCP<Node>& node,
01231                  const Teuchos::ArrayView<const size_t>& rowPtrs);
01232 
01239     template <class T>
01240     static Teuchos::ArrayRCP<T>
01241     allocStorage (const Teuchos::RCP<Node>& node,
01242                   const Teuchos::ArrayView<const size_t>& rowPtrs);
01243 
01256     template <class T>
01257     static Teuchos::ArrayRCP<T>
01258     copyStorage (const Teuchos::RCP<Node>& node,
01259                  const Teuchos::ArrayView<const size_t>& rowPtrs,
01260                  const Teuchos::ArrayView<const T>& inputVals);
01261 
01263     static void
01264     finalizeGraph (Teuchos::EUplo uplo,
01265                    Teuchos::EDiag diag,
01266                    AltCrsGraph<Ordinal, Node>& graph,
01267                    const Teuchos::RCP<Teuchos::ParameterList> &params);
01268 
01270     static void
01271     finalizeMatrix (const AltCrsGraph<Ordinal, Node>& graph,
01272                     AltCrsMatrix<Scalar, Ordinal, Node>& matrix,
01273                     const Teuchos::RCP<Teuchos::ParameterList>& params);
01274 
01283     static void
01284     finalizeGraphAndMatrix (Teuchos::EUplo uplo,
01285                             Teuchos::EDiag diag,
01286                             AltCrsGraph<Ordinal, Node>& graph,
01287                             AltCrsMatrix<Scalar, Ordinal, Node>& matrix,
01288                             const Teuchos::RCP<Teuchos::ParameterList>& params);
01289 
01291     void
01292     setGraphAndMatrix (const Teuchos::RCP<const AltCrsGraph<Ordinal,Node> > &graph,
01293                        const Teuchos::RCP<const AltCrsMatrix<Scalar,Ordinal,Node> > &matrix);
01294 
01296 
01297 
01298 
01324     template <class DomainScalar, class RangeScalar>
01325     void
01326     multiply (Teuchos::ETransp trans,
01327               RangeScalar alpha,
01328               const MultiVector<DomainScalar,Node> &X,
01329               MultiVector<RangeScalar,Node> &Y) const;
01330 
01360     template <class DomainScalar, class RangeScalar>
01361     void
01362     multiply (Teuchos::ETransp trans,
01363               RangeScalar alpha,
01364               const MultiVector<DomainScalar,Node> &X,
01365               RangeScalar beta,
01366               MultiVector<RangeScalar,Node> &Y) const;
01367 
01388     template <class DomainScalar, class RangeScalar>
01389     void
01390     solve (Teuchos::ETransp trans,
01391            const MultiVector<DomainScalar,Node> &Y,
01392            MultiVector<RangeScalar,Node> &X) const;
01393 
01395 
01396   private:
01416     enum EMatVecVariant {
01417       FOR_FOR,
01418       FOR_WHILE,
01419       FOR_IF
01420     };
01421 
01423     static void
01424     setDefaultParameters (Teuchos::ParameterList& plist);
01425 
01430     static void
01431     setDefaultMatVecVariantParameter (Teuchos::ParameterList& plist);
01432 
01434     static void
01435     setDefaultUnrollParameter (Teuchos::ParameterList& plist);
01436 
01438     static void
01439     setDefaultFirstTouchParameter (Teuchos::ParameterList& plist);
01440 
01442     AltSparseOps (const AltSparseOps& source);
01443 
01445     Teuchos::RCP<Node> node_;
01446 
01448 
01449 
01462     ArrayRCP<const size_t>  ptr_;
01464     ArrayRCP<const Ordinal> ind_;
01466     ArrayRCP<const Scalar>  val_;
01468 
01470     Teuchos::EUplo  tri_uplo_;
01472     Teuchos::EDiag unit_diag_;
01473 
01475 
01476 
01477     EMatVecVariant matVecVariant_;
01480     bool unroll_;
01482     bool firstTouchAllocation_;
01484 
01486     Ordinal numRows_;
01488     Ordinal numCols_;
01490     bool isInitialized_;
01492     bool isEmpty_;
01494     bool hasEmptyRows_;
01495   };
01496 
01497   template <class Scalar, class Ordinal, class Node, class Allocator>
01498   void
01499   AltSparseOps<Scalar, Ordinal, Node, Allocator>::
01500   finalizeGraph (Teuchos::EUplo uplo,
01501                  Teuchos::EDiag diag,
01502                  AltCrsGraph<Ordinal,Node>& graph,
01503                  const Teuchos::RCP<Teuchos::ParameterList>& params)
01504   {
01505     TEUCHOS_TEST_FOR_EXCEPTION(
01506       ! graph.isInitialized(), std::runtime_error,
01507       "Kokkos::AltSparseOps::finalizeGraph: Graph has not yet been initialized."
01508     );
01509     graph.setMatDesc (uplo, diag);
01510   }
01511 
01512   template <class Scalar, class Ordinal, class Node, class Allocator>
01513   void
01514   AltSparseOps<Scalar, Ordinal, Node, Allocator>::
01515   finalizeMatrix (const AltCrsGraph<Ordinal,Node> &graph,
01516                   AltCrsMatrix<Scalar,Ordinal,Node> &matrix,
01517                   const Teuchos::RCP<Teuchos::ParameterList> &params)
01518   {
01519     TEUCHOS_TEST_FOR_EXCEPTION(
01520       ! matrix.isInitialized(), std::runtime_error,
01521       "Kokkos::AltSparseOps::finalizeMatrix(graph,matrix,params): "
01522       "Matrix has not yet been initialized."
01523     );
01524   }
01525 
01526   template <class Scalar, class Ordinal, class Node, class Allocator>
01527   void
01528   AltSparseOps<Scalar, Ordinal, Node, Allocator>::
01529   finalizeGraphAndMatrix (Teuchos::EUplo uplo,
01530                           Teuchos::EDiag diag,
01531                           AltCrsGraph<Ordinal,Node>& graph,
01532                           AltCrsMatrix<Scalar,Ordinal,Node>& matrix,
01533                           const Teuchos::RCP<Teuchos::ParameterList>& params)
01534   {
01535     TEUCHOS_TEST_FOR_EXCEPTION(
01536       ! graph.isInitialized(), std::runtime_error,
01537       "Kokkos::AltSparseOps::finalizeGraphAndMatrix(graph,matrix,params): "
01538       "Graph has not yet been initialized."
01539     );
01540     TEUCHOS_TEST_FOR_EXCEPTION(
01541       ! matrix.isInitialized(), std::runtime_error,
01542       "Kokkos::AltSparseOps::finalizeGraphAndMatrix(graph,matrix,params): "
01543       "Matrix has not yet been initialized."
01544     );
01545     graph.setMatDesc (uplo, diag);
01546   }
01547 
01548 
01549   template <class Scalar, class Ordinal, class Node, class Allocator>
01550   AltSparseOps<Scalar, Ordinal, Node, Allocator>::
01551   AltSparseOps (const Teuchos::RCP<Node>& node) :
01552     node_ (node),
01553     tri_uplo_ (Teuchos::UNDEF_TRI),      // Provisionally
01554     unit_diag_ (Teuchos::NON_UNIT_DIAG), // Provisionally
01555     matVecVariant_ (AltSparseOps<Scalar, Ordinal, Node, Allocator>::FOR_FOR),
01556     unroll_ (true),
01557     firstTouchAllocation_ (false),
01558     numRows_ (Teuchos::OrdinalTraits<Ordinal>::zero ()),
01559     numCols_ (Teuchos::OrdinalTraits<Ordinal>::zero ()),
01560     isInitialized_ (false),
01561     isEmpty_ (true),                     // Provisionally
01562     hasEmptyRows_ (true)                 // Provisionally
01563   {
01564     // Make sure that users only specialize AltSparseOps for Kokkos
01565     // Node types that are host Nodes (vs. device Nodes, such as GPU
01566     // Nodes).
01567     Teuchos::CompileTimeAssert<Node::isHostNode == false> cta; (void)cta;
01568   }
01569 
01570   template <class Scalar, class Ordinal, class Node, class Allocator>
01571   AltSparseOps<Scalar, Ordinal, Node, Allocator>::
01572   AltSparseOps (const Teuchos::RCP<Node>& node,
01573                 Teuchos::ParameterList& params) :
01574     node_ (node),
01575     tri_uplo_ (Teuchos::UNDEF_TRI),      // Provisionally
01576     unit_diag_ (Teuchos::NON_UNIT_DIAG), // Provisionally
01577     matVecVariant_ (AltSparseOps<Scalar, Ordinal, Node, Allocator>::FOR_FOR),
01578     unroll_ (true),
01579     firstTouchAllocation_ (false),
01580     numRows_ (Teuchos::OrdinalTraits<Ordinal>::zero ()),
01581     numCols_ (Teuchos::OrdinalTraits<Ordinal>::zero ()),
01582     isInitialized_ (false),
01583     isEmpty_ (true),                     // Provisionally
01584     hasEmptyRows_ (true)                 // Provisionally
01585   {
01586     using Teuchos::getIntegralValue;
01587     using Teuchos::ParameterList;
01588     using Teuchos::RCP;
01589 
01590     // Make sure that users only specialize AltSparseOps for Kokkos
01591     // Node types that are host Nodes (vs. device Nodes, such as GPU
01592     // Nodes).
01593     Teuchos::CompileTimeAssert<Node::isHostNode == false> cta; (void)cta;
01594 
01595     params.validateParametersAndSetDefaults (*getDefaultParameters ());
01596 
01597     const std::string varParamName ("Sparse matrix-vector multiply variant");
01598     const std::string unrollParamName ("Unroll across multivectors");
01599     const std::string firstTouchParamName ("Force first-touch allocation");
01600 
01601     matVecVariant_ = getIntegralValue<EMatVecVariant> (params, varParamName);
01602     unroll_ = params.get<bool> (unrollParamName);
01603     firstTouchAllocation_ = params.get<bool> (firstTouchParamName);
01604   }
01605 
01606   template <class Scalar, class Ordinal, class Node, class Allocator>
01607   AltSparseOps<Scalar, Ordinal, Node, Allocator>::~AltSparseOps() {
01608   }
01609 
01610   template <class Scalar, class Ordinal, class Node, class Allocator>
01611   void
01612   AltSparseOps<Scalar, Ordinal, Node, Allocator>::
01613   setDefaultParameters (Teuchos::ParameterList& plist)
01614   {
01615     setDefaultMatVecVariantParameter (plist);
01616     setDefaultUnrollParameter (plist);
01617     setDefaultFirstTouchParameter (plist);
01618   }
01619 
01620   template <class Scalar, class Ordinal, class Node, class Allocator>
01621   void
01622   AltSparseOps<Scalar, Ordinal, Node, Allocator>::
01623   setDefaultUnrollParameter (Teuchos::ParameterList& plist)
01624   {
01625     const bool unroll = true;
01626     plist.set ("Unroll across multivectors", unroll, "Whether to unroll reads "
01627                "and writes of multivectors across columns of the input and "
01628                "ouput multivectors");
01629   }
01630 
01631   template <class Scalar, class Ordinal, class Node, class Allocator>
01632   void
01633   AltSparseOps<Scalar, Ordinal, Node, Allocator>::
01634   setDefaultFirstTouchParameter (Teuchos::ParameterList& plist)
01635   {
01636     const bool firstTouchAllocation = false;
01637     plist.set ("Force first-touch allocation", firstTouchAllocation,
01638                "Whether to copy all the input data and initialize them in "
01639                "parallel (using OpenMP if available), to ensure first-touch "
01640                "initialization");
01641   }
01642 
01643   template <class Scalar, class Ordinal, class Node, class Allocator>
01644   void
01645   AltSparseOps<Scalar, Ordinal, Node, Allocator>::
01646   setDefaultMatVecVariantParameter (Teuchos::ParameterList& plist)
01647   {
01648     using Teuchos::Array;
01649     using Teuchos::ParameterList;
01650     using Teuchos::stringToIntegralParameterEntryValidator;
01651 
01652     Array<std::string> strs (3);
01653     strs[0] = "for-for";
01654     strs[1] = "for-while";
01655     strs[2] = "for-if";
01656 
01657     Array<std::string> docs (3);
01658     docs[0] = "Two nested for loops (textbook algorithm)";
01659     docs[1] = "Outer for loop, inner while loop";
01660     docs[2] = "Outer for loop, inner if statement";
01661 
01662     Array<EMatVecVariant> vals (3);
01663     vals[0] = FOR_FOR;
01664     vals[1] = FOR_WHILE;
01665     vals[2] = FOR_IF;
01666 
01667     const std::string paramName ("Sparse matrix-vector multiply variant");
01668     const std::string paramDoc ("Which algorithm variant to use for sparse "
01669                                 "matrix-vector multiply");
01670     plist.set (paramName, strs[0], paramDoc,
01671                stringToIntegralParameterEntryValidator<EMatVecVariant> (strs(), docs(), vals(), strs[0]));
01672   }
01673 
01674   template <class Scalar, class Ordinal, class Node, class Allocator>
01675   RCP<Node> AltSparseOps<Scalar, Ordinal, Node, Allocator>::getNode() const {
01676     return node_;
01677   }
01678 
01679   template <class Scalar, class Ordinal, class Node, class Allocator>
01680   void
01681   AltSparseOps<Scalar, Ordinal, Node, Allocator>::
01682   setGraphAndMatrix (const Teuchos::RCP<const AltCrsGraph<Ordinal,Node> > &opgraph,
01683                      const Teuchos::RCP<const AltCrsMatrix<Scalar,Ordinal,Node> > &opmatrix)
01684   {
01685     using Teuchos::ArrayRCP;
01686     using Teuchos::arcp;
01687     using Teuchos::arcp_const_cast;
01688     using Teuchos::null;
01689     // using std::cerr;
01690     // using std::endl;
01691 
01692     std::string tfecfFuncName("setGraphAndMatrix(uplo,diag,graph,matrix)");
01693     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01694       isInitialized_, std::runtime_error, " operators already initialized."
01695     );
01696 
01697     ArrayRCP<const  size_t> ptr = opgraph->getPointers ();
01698     ArrayRCP<const Ordinal> ind = opgraph->getIndices ();
01699     ArrayRCP<const Scalar> val = opmatrix->getValues ();
01700     const Ordinal numRows = opgraph->getNumRows ();
01701     const Ordinal numCols = opgraph->getNumCols ();
01702 
01703     // Verify the input data before setting internal state.
01704     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01705       (size_t) ptr.size() != (size_t) numRows + 1,
01706       std::invalid_argument,
01707       ": ptr.size() = " << ptr.size() << " != numRows+1 = " << (numRows + 1) << ".");
01708     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01709       ind.size() != val.size(),
01710       std::invalid_argument,
01711       ": ind.size() = " << ind.size() << " != val.size() = " << val.size()
01712       << ", for ptr = opgraph->getPointers() and ind = opgraph->getIndices().");
01713     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01714       (size_t) ptr[numRows] != (size_t) ind.size(),
01715       std::invalid_argument,
01716       ": ptr[numRows = " << numRows << "] = " << ptr[numRows]
01717       << " != ind.size() = " << ind.size() << ", for ptr = "
01718       "opgraph->getPointers() and ind = opgraph->getIndices().");
01719 
01720     numRows_ = numRows;
01721     numCols_ = numCols;
01722     hasEmptyRows_ = opgraph->hasEmptyRows ();
01723 
01724     if (opgraph->isEmpty () || numRows_ == 0) {
01725       isEmpty_ = true;
01726       // We have to go through a little trouble because ptr_ is an
01727       // array of const size_t, but we need to set its first entry
01728       // here.
01729       ArrayRCP<size_t> myPtr = arcp<size_t> (1);
01730       myPtr[0] = 0;
01731       ptr_ = arcp_const_cast<const size_t> (myPtr);
01732       myPtr = null;
01733 
01734       // The matrix is empty, so ind_ and val_ have zero entries.
01735       ind_ = null;
01736       val_ = null;
01737     }
01738     else {
01739       isEmpty_ = false;
01740 
01741       if (firstTouchAllocation_) {
01742         // Override the input data's allocation.  This assumes that
01743         // the input graph and matrix were not allocated using
01744         // first-touch allocation.  Make copies, initializing them
01745         // using the first-touch procedure.  This also overrides the
01746         // Allocator template parameter.
01747         typedef details::AltSparseOpsFirstTouchAllocator<Ordinal, Node> alloc_type;
01748 
01749         ptr_ = alloc_type::copyRowPtrs (node_, ptr ());
01750         ind_ = alloc_type::template copyStorage<Ordinal> (node_, ptr (), ind ());
01751         val_ = alloc_type::template copyStorage<Scalar> (node_, ptr (), val ());
01752       }
01753       else { // No first touch allocation.
01754         // We can just use these arrays directly.
01755         ptr_ = ptr;
01756         ind_ = ind;
01757         val_ = val;
01758       }
01759     }
01760     opgraph->getMatDesc (tri_uplo_, unit_diag_);
01761     isInitialized_ = true;
01762   }
01763 
01764   template <class Scalar, class Ordinal, class Node, class Allocator>
01765   template <class DomainScalar, class RangeScalar>
01766   void
01767   AltSparseOps<Scalar, Ordinal, Node, Allocator>::
01768   solve (Teuchos::ETransp trans,
01769          const MultiVector<DomainScalar, Node>& Y,
01770          MultiVector<RangeScalar, Node>& X) const
01771   {
01772     std::string tfecfFuncName("solve(trans,Y,X)");
01773     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01774       ! isInitialized_,
01775       std::runtime_error,
01776       ": The solve was not fully initialized."
01777     );
01778     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01779       X.getNumCols() != Y.getNumCols(),
01780       std::runtime_error,
01781       ": Input and output multivectors have different numbers of vectors (columns)."
01782     );
01783     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01784       static_cast<size_t> (X.getNumRows()) != static_cast<size_t> (numRows_),
01785       std::runtime_error,
01786       ": Dimensions of the matrix and the output multivector X do not match.  "
01787       "X.getNumRows() == " << X.getNumRows() << ", but the matrix has "
01788       << numRows_ << " rows."
01789     );
01790     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01791       static_cast<size_t> (Y.getNumRows()) != static_cast<size_t> (numCols_),
01792       std::runtime_error,
01793       ": Dimensions of the matrix and the input multivector Y do not match.  "
01794       "Y.getNumRows() == " << Y.getNumRows() << ", but the matrix has "
01795       << numCols_ << " columns."
01796     );
01797 
01798     if (numRows_ == Teuchos::OrdinalTraits<Ordinal>::zero () ||
01799         numCols_ == Teuchos::OrdinalTraits<Ordinal>::zero ()) {
01800       return; // Nothing to do
01801     }
01802     else if (isEmpty_) {
01803       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01804         unit_diag_ != Teuchos::UNIT_DIAG,
01805         std::runtime_error,
01806         ": Solve with empty matrix is only valid for an implicit unit diagonal.");
01807       // solve I * X = Y for X = Y
01808       DefaultArithmetic<MultiVector<RangeScalar,Node> >::Assign (X, Y);
01809     }
01810     else {
01811       typedef ordinal_type OT;
01812       typedef scalar_type MST; // matrix scalar type
01813       typedef DomainScalar DST;
01814       typedef RangeScalar RST;
01815 
01816       RST* const X_raw = X.getValuesNonConst ().getRawPtr ();
01817       const Ordinal X_stride = (Ordinal) X.getStride ();
01818       const DST* const Y_raw = Y.getValues ().getRawPtr ();
01819       const Ordinal Y_stride = (Ordinal) Y.getStride ();
01820 
01821       const  size_t* const ptr = ptr_.getRawPtr ();
01822       const Ordinal* const ind = ind_.getRawPtr ();
01823       const Scalar*  const val = val_.getRawPtr ();
01824       const Ordinal numRows = X.getNumRows ();
01825       const Ordinal numCols = Y.getNumRows ();
01826       const Ordinal numVecs = X.getNumCols ();
01827 
01828       if (trans == Teuchos::NO_TRANS) {
01829         if (tri_uplo_ == Teuchos::LOWER_TRI) {
01830           if (unit_diag_ == Teuchos::UNIT_DIAG) {
01831             using Kokkos::Raw::lowerTriSolveCsrColMajorUnitDiag;
01832             lowerTriSolveCsrColMajorUnitDiag<OT, MST, DST, RST> (numRows, numCols,
01833                                                                  numVecs,
01834                                                                  X_raw, X_stride,
01835                                                                  ptr, ind, val,
01836                                                                  Y_raw, Y_stride);
01837           }
01838           else { // non unit diagonal
01839             using Kokkos::Raw::lowerTriSolveCsrColMajor;
01840             lowerTriSolveCsrColMajor<OT, MST, DST, RST> (numRows, numCols,
01841                                                          numVecs,
01842                                                          X_raw, X_stride,
01843                                                          ptr, ind, val,
01844                                                          Y_raw, Y_stride);
01845           }
01846         }
01847         else { // upper triangular
01848           if (unit_diag_ == Teuchos::UNIT_DIAG) {
01849             using Kokkos::Raw::upperTriSolveCsrColMajorUnitDiag;
01850             upperTriSolveCsrColMajorUnitDiag<OT, MST, DST, RST> (numRows, numCols,
01851                                                                  numVecs,
01852                                                                  X_raw, X_stride,
01853                                                                  ptr, ind, val,
01854                                                                  Y_raw, Y_stride);
01855           }
01856           else { // non unit diagonal
01857             using Kokkos::Raw::upperTriSolveCsrColMajor;
01858             upperTriSolveCsrColMajor<OT, MST, DST, RST> (numRows, numCols,
01859                                                          numVecs,
01860                                                          X_raw, X_stride,
01861                                                          ptr, ind, val,
01862                                                          Y_raw, Y_stride);
01863           }
01864         }
01865       }
01866       else if (trans == Teuchos::TRANS) {
01867         if (tri_uplo_ == Teuchos::LOWER_TRI) {
01868           if (unit_diag_ == Teuchos::UNIT_DIAG) {
01869             using Kokkos::Raw::lowerTriSolveCscColMajorUnitDiag;
01870             // numRows resp. numCols come from the number of rows in Y
01871             // resp. X, so they still appear in the same order as
01872             // in the not transposed cases above.
01873             lowerTriSolveCscColMajorUnitDiag<OT, MST, DST, RST> (numRows, numCols,
01874                                                                  numVecs,
01875                                                                  X_raw, X_stride,
01876                                                                  ptr, ind, val,
01877                                                                  Y_raw, Y_stride);
01878           }
01879           else {
01880             using Kokkos::Raw::lowerTriSolveCscColMajor;
01881             lowerTriSolveCscColMajor<OT, MST, DST, RST> (numRows, numCols,
01882                                                          numVecs,
01883                                                          X_raw, X_stride,
01884                                                          ptr, ind, val,
01885                                                          Y_raw, Y_stride);
01886           }
01887         }
01888         else { // upper triangular
01889           if (unit_diag_ == Teuchos::UNIT_DIAG) {
01890             using Kokkos::Raw::upperTriSolveCscColMajorUnitDiag;
01891             upperTriSolveCscColMajorUnitDiag<OT, MST, DST, RST> (numRows, numCols,
01892                                                                  numVecs,
01893                                                                  X_raw, X_stride,
01894                                                                  ptr, ind, val,
01895                                                                  Y_raw, Y_stride);
01896           }
01897           else {
01898             using Kokkos::Raw::upperTriSolveCscColMajor;
01899             upperTriSolveCscColMajor<OT, MST, DST, RST> (numRows, numCols,
01900                                                          numVecs,
01901                                                          X_raw, X_stride,
01902                                                          ptr, ind, val,
01903                                                          Y_raw, Y_stride);
01904           }
01905         }
01906       }
01907       else if (trans == Teuchos::CONJ_TRANS) {
01908         if (tri_uplo_ == Teuchos::LOWER_TRI) {
01909           if (unit_diag_ == Teuchos::UNIT_DIAG) {
01910             using Kokkos::Raw::lowerTriSolveCscColMajorUnitDiagConj;
01911             lowerTriSolveCscColMajorUnitDiagConj<OT, MST, DST, RST> (numRows, numCols,
01912                                                                      numVecs,
01913                                                                      X_raw, X_stride,
01914                                                                      ptr, ind, val,
01915                                                                      Y_raw, Y_stride);
01916           }
01917           else {
01918             using Kokkos::Raw::lowerTriSolveCscColMajorConj;
01919             lowerTriSolveCscColMajorConj<OT, MST, DST, RST> (numRows, numCols,
01920                                                              numVecs,
01921                                                              X_raw, X_stride,
01922                                                              ptr, ind, val,
01923                                                              Y_raw, Y_stride);
01924           }
01925         }
01926         else { // upper triangular
01927           if (unit_diag_ == Teuchos::UNIT_DIAG) {
01928             using Kokkos::Raw::upperTriSolveCscColMajorUnitDiagConj;
01929             upperTriSolveCscColMajorUnitDiagConj<OT, MST, DST, RST> (numRows, numCols,
01930                                                                      numVecs,
01931                                                                      X_raw, X_stride,
01932                                                                      ptr, ind, val,
01933                                                                      Y_raw, Y_stride);
01934           }
01935           else {
01936             using Kokkos::Raw::upperTriSolveCscColMajorConj;
01937             upperTriSolveCscColMajorConj<OT, MST, DST, RST> (numRows, numCols,
01938                                                              numVecs,
01939                                                              X_raw, X_stride,
01940                                                              ptr, ind, val,
01941                                                              Y_raw, Y_stride);
01942           }
01943         }
01944       }
01945     }
01946   }
01947 
01948   template <class Scalar, class Ordinal, class Node, class Allocator>
01949   template <class DomainScalar, class RangeScalar>
01950   void
01951   AltSparseOps<Scalar, Ordinal, Node, Allocator>::
01952   multiply (Teuchos::ETransp trans,
01953             RangeScalar alpha,
01954             const MultiVector<DomainScalar, Node>& X,
01955             MultiVector<RangeScalar, Node>& Y) const
01956   {
01957     typedef DomainScalar DST;
01958     typedef RangeScalar RST;
01959     const RST beta = Teuchos::ScalarTraits<RST>::zero ();
01960     this->template multiply<DST, RST> (trans, alpha, X, beta, Y);
01961   }
01962 
01963   template <class Scalar, class Ordinal, class Node, class Allocator>
01964   template <class DomainScalar, class RangeScalar>
01965   void
01966   AltSparseOps<Scalar, Ordinal, Node, Allocator>::
01967   multiply (Teuchos::ETransp trans,
01968             RangeScalar alpha,
01969             const MultiVector<DomainScalar, Node> &X,
01970             RangeScalar beta,
01971             MultiVector<RangeScalar, Node> &Y) const
01972   {
01973     using std::cerr;
01974     using std::endl;
01975 
01976     std::string tfecfFuncName("multiply(trans,alpha,X,beta,Y)");
01977     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01978       ! isInitialized_,
01979       std::runtime_error,
01980       ": Sparse ops not initialized.");
01981     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01982       X.getNumCols() != Y.getNumCols(),
01983       std::runtime_error,
01984       ": X and Y do not have the same number of columns.");
01985     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01986       X.getNumRows() != Teuchos::as<size_t> (numCols_),
01987       std::runtime_error,
01988       ": Dimensions of the matrix and the input multivector X do not match.  "
01989       "X has " << X.getNumRows () << " rows, but the matrix has " << numCols_
01990       << " columns.");
01991     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01992       Y.getNumRows() != Teuchos::as<size_t> (numRows_),
01993       std::runtime_error,
01994       ": Dimensions of the matrix and the output multivector Y do not match.  "
01995       "Y has " << Y.getNumRows () << " rows, but the matrix has " << numRows_
01996       << " rows.");
01997 
01998     typedef ordinal_type OT;
01999     typedef scalar_type MST; // matrix scalar type
02000     typedef DomainScalar DST;
02001     typedef RangeScalar RST;
02002 
02003     // These dimensions come from the input and output multivectors,
02004     // so they apply for the transpose case as well.
02005     const Ordinal numRows = Y.getNumRows ();
02006     const Ordinal numCols = X.getNumRows ();
02007     const Ordinal numVecs = X.getNumCols ();
02008     RST* const Y_raw = Y.getValuesNonConst ().getRawPtr ();
02009     const Ordinal Y_stride = Y.getStride ();
02010     const DST* const X_raw = X.getValues ().getRawPtr ();
02011     const Ordinal X_stride = X.getStride ();
02012 
02013     const  size_t* const ptr = ptr_.getRawPtr ();
02014     const Ordinal* const ind = ind_.getRawPtr ();
02015     const Scalar*  const val = val_.getRawPtr ();
02016 
02017     // Pointer to the sparse matrix-vector multiply routine to use.
02018     void (*matVec) (const OT, const OT, const OT,
02019                     const RST&, RST[], const OT,
02020                     const RST&, const size_t[], const OT[], const MST[],
02021                     const DST[], const OT);
02022 
02023     // The following very long switch statement selects one of the
02024     // hard-coded-numVecs routines for certain values of numVecs.
02025     // (Hard-coding the number of columns in the multivectors avoids
02026     // two branches and an integer addition.)  Otherwise, it picks a
02027     // general routine.  Here is also where we use the parameters
02028     // given to the constructor to pick the algorithm variant.
02029     //
02030     // Note that we're taking numRows and numCols from Y resp. X.
02031     // Assuming that the dimensions of X and Y are correct, then
02032     // whether or not we're applying the transpose, the (transposed,
02033     // if applicable) matrix has dimensions numRows by numCols.
02034     // That's why we don't switch the order of numRows, numCols in the
02035     // invocations below.
02036     switch (numVecs) {
02037     case 1:
02038       if (trans == Teuchos::NO_TRANS) {
02039         if (matVecVariant_ == FOR_FOR) {
02040           matVec = &Kokkos::Raw::matVecCsrColMajorForfor1Vec<OT, MST, DST, RST>;
02041         }
02042         else if (matVecVariant_ == FOR_IF && ! hasEmptyRows_) {
02043           matVec = &Kokkos::Raw::matVecCsrColMajorForif1Vec<OT, MST, DST, RST>;
02044         }
02045         else { // matVecVariant_ == FOR_WHILE || (matVecVariant_ == FOR_IF && hasEmptyRows_)
02046           matVec = &Kokkos::Raw::matVecCsrColMajorForwhile1Vec<OT, MST, DST, RST>;
02047         }
02048       }
02049       else if (trans == Teuchos::TRANS) {
02050         if (matVecVariant_ == FOR_FOR) {
02051           matVec = &Kokkos::Raw::matVecCscColMajorForfor1Vec<OT, MST, DST, RST>;
02052         }
02053         else if (matVecVariant_ == FOR_IF && ! hasEmptyRows_) {
02054           matVec = &Kokkos::Raw::matVecCscColMajorForif1Vec<OT, MST, DST, RST>;
02055         }
02056         else { // matVecVariant_ == FOR_WHILE || (matVecVariant_ == FOR_IF && hasEmptyRows_)
02057           matVec = &Kokkos::Raw::matVecCscColMajorForwhile1Vec<OT, MST, DST, RST>;
02058         }
02059       }
02060       else { // if (trans == Teuchos::CONJ_TRANS) {
02061         if (matVecVariant_ == FOR_FOR) {
02062           matVec = &Kokkos::Raw::matVecCscColMajorForforConj1Vec<OT, MST, DST, RST>;
02063         }
02064         else if (matVecVariant_ == FOR_IF && ! hasEmptyRows_) {
02065           matVec = &Kokkos::Raw::matVecCscColMajorForifConj1Vec<OT, MST, DST, RST>;
02066         }
02067         else { // matVecVariant_ == FOR_WHILE || (matVecVariant_ == FOR_IF && hasEmptyRows_)
02068           matVec = &Kokkos::Raw::matVecCscColMajorForwhileConj1Vec<OT, MST, DST, RST>;
02069         }
02070       }
02071       break;
02072     case 2:
02073       if (trans == Teuchos::NO_TRANS) {
02074         if (matVecVariant_ == FOR_FOR) {
02075           matVec = &Kokkos::Raw::matVecCsrColMajorForfor2Vec<OT, MST, DST, RST>;
02076         }
02077         else if (matVecVariant_ == FOR_IF && ! hasEmptyRows_) {
02078           matVec = &Kokkos::Raw::matVecCsrColMajorForif2Vec<OT, MST, DST, RST>;
02079         }
02080         else { // matVecVariant_ == FOR_WHILE || (matVecVariant_ == FOR_IF && hasEmptyRows_)
02081           matVec = &Kokkos::Raw::matVecCsrColMajorForwhile2Vec<OT, MST, DST, RST>;
02082         }
02083       }
02084       else if (trans == Teuchos::TRANS) {
02085         if (matVecVariant_ == FOR_FOR) {
02086           matVec = &Kokkos::Raw::matVecCscColMajorForfor2Vec<OT, MST, DST, RST>;
02087         }
02088         else if (matVecVariant_ == FOR_IF && ! hasEmptyRows_) {
02089           matVec = &Kokkos::Raw::matVecCscColMajorForif2Vec<OT, MST, DST, RST>;
02090         }
02091         else { // matVecVariant_ == FOR_WHILE || (matVecVariant_ == FOR_IF && hasEmptyRows_)
02092           matVec = &Kokkos::Raw::matVecCscColMajorForwhile2Vec<OT, MST, DST, RST>;
02093         }
02094       }
02095       else { // if (trans == Teuchos::CONJ_TRANS) {
02096         if (matVecVariant_ == FOR_FOR) {
02097           matVec = &Kokkos::Raw::matVecCscColMajorForforConj2Vec<OT, MST, DST, RST>;
02098         }
02099         else if (matVecVariant_ == FOR_IF && ! hasEmptyRows_) {
02100           matVec = &Kokkos::Raw::matVecCscColMajorForifConj2Vec<OT, MST, DST, RST>;
02101         }
02102         else { // matVecVariant_ == FOR_WHILE || (matVecVariant_ == FOR_IF && hasEmptyRows_)
02103           matVec = &Kokkos::Raw::matVecCscColMajorForwhileConj2Vec<OT, MST, DST, RST>;
02104         }
02105       }
02106       break;
02107     case 3:
02108       if (trans == Teuchos::NO_TRANS) {
02109         if (matVecVariant_ == FOR_FOR) {
02110           matVec = &Kokkos::Raw::matVecCsrColMajorForfor3Vec<OT, MST, DST, RST>;
02111         }
02112         else if (matVecVariant_ == FOR_IF && ! hasEmptyRows_) {
02113           matVec = &Kokkos::Raw::matVecCsrColMajorForif3Vec<OT, MST, DST, RST>;
02114         }
02115         else { // matVecVariant_ == FOR_WHILE || (matVecVariant_ == FOR_IF && hasEmptyRows_)
02116           matVec = &Kokkos::Raw::matVecCsrColMajorForwhile3Vec<OT, MST, DST, RST>;
02117         }
02118       }
02119       else if (trans == Teuchos::TRANS) {
02120         if (matVecVariant_ == FOR_FOR) {
02121           matVec = &Kokkos::Raw::matVecCscColMajorForfor3Vec<OT, MST, DST, RST>;
02122         }
02123         else if (matVecVariant_ == FOR_IF && ! hasEmptyRows_) {
02124           matVec = &Kokkos::Raw::matVecCscColMajorForif3Vec<OT, MST, DST, RST>;
02125         }
02126         else { // matVecVariant_ == FOR_WHILE || (matVecVariant_ == FOR_IF && hasEmptyRows_)
02127           matVec = &Kokkos::Raw::matVecCscColMajorForwhile3Vec<OT, MST, DST, RST>;
02128         }
02129       }
02130       else { // if (trans == Teuchos::CONJ_TRANS) {
02131         if (matVecVariant_ == FOR_FOR) {
02132           matVec = &Kokkos::Raw::matVecCscColMajorForforConj3Vec<OT, MST, DST, RST>;
02133         }
02134         else if (matVecVariant_ == FOR_IF && ! hasEmptyRows_) {
02135           matVec = &Kokkos::Raw::matVecCscColMajorForifConj3Vec<OT, MST, DST, RST>;
02136         }
02137         else { // matVecVariant_ == FOR_WHILE || (matVecVariant_ == FOR_IF && hasEmptyRows_)
02138           matVec = &Kokkos::Raw::matVecCscColMajorForwhileConj3Vec<OT, MST, DST, RST>;
02139         }
02140       }
02141       break;
02142     case 4:
02143       if (trans == Teuchos::NO_TRANS) {
02144         if (matVecVariant_ == FOR_FOR) {
02145           matVec = &Kokkos::Raw::matVecCsrColMajorForfor4Vec<OT, MST, DST, RST>;
02146         }
02147         else if (matVecVariant_ == FOR_IF && ! hasEmptyRows_) {
02148           matVec = &Kokkos::Raw::matVecCsrColMajorForif4Vec<OT, MST, DST, RST>;
02149         }
02150         else { // matVecVariant_ == FOR_WHILE || (matVecVariant_ == FOR_IF && hasEmptyRows_)
02151           matVec = &Kokkos::Raw::matVecCsrColMajorForwhile4Vec<OT, MST, DST, RST>;
02152         }
02153       }
02154       else if (trans == Teuchos::TRANS) {
02155         if (matVecVariant_ == FOR_FOR) {
02156           matVec = &Kokkos::Raw::matVecCscColMajorForfor4Vec<OT, MST, DST, RST>;
02157         }
02158         else if (matVecVariant_ == FOR_IF && ! hasEmptyRows_) {
02159           matVec = &Kokkos::Raw::matVecCscColMajorForif4Vec<OT, MST, DST, RST>;
02160         }
02161         else { // matVecVariant_ == FOR_WHILE || (matVecVariant_ == FOR_IF && hasEmptyRows_)
02162           matVec = &Kokkos::Raw::matVecCscColMajorForwhile4Vec<OT, MST, DST, RST>;
02163         }
02164       }
02165       else { // if (trans == Teuchos::CONJ_TRANS) {
02166         if (matVecVariant_ == FOR_FOR) {
02167           matVec = &Kokkos::Raw::matVecCscColMajorForforConj4Vec<OT, MST, DST, RST>;
02168         }
02169         else if (matVecVariant_ == FOR_IF && ! hasEmptyRows_) {
02170           matVec = &Kokkos::Raw::matVecCscColMajorForifConj4Vec<OT, MST, DST, RST>;
02171         }
02172         else { // matVecVariant_ == FOR_WHILE || (matVecVariant_ == FOR_IF && hasEmptyRows_)
02173           matVec = &Kokkos::Raw::matVecCscColMajorForwhileConj4Vec<OT, MST, DST, RST>;
02174         }
02175       }
02176       break;
02177     default: // The "general case"
02178       if (unroll_) {
02179         if (trans == Teuchos::NO_TRANS) {
02180           if (matVecVariant_ == FOR_FOR) {
02181             matVec = &Kokkos::Raw::matVecCsrColMajorForfor4Unrolled<OT, MST, DST, RST>;
02182           }
02183           else if (matVecVariant_ == FOR_IF && ! hasEmptyRows_) {
02184             // FIXME (mfh 26 Jul 2012) Currently, the code generator
02185             // is broken for the 'for-if' and 'for-while' variants,
02186             // when the number of multivector columns is not fixed (it
02187             // wasn't broken before -- it had to do with the
02188             // introduction of temporaries).  We could either revert
02189             // to the for-for variant, or throw an exception.  We do
02190             // the latter, since for-if is not the default, and we
02191             // want to make sure that people don't get confusing
02192             // performance results.
02193 
02194             //matVec = &Kokkos::Raw::matVecCsrColMajorForif4Unrolled<OT, MST, DST, RST>;
02195             TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "The 'for-if' "
02196               "variant of sparse matrix-vector multiply is not currently "
02197               "implemented for a non-fixed number of columns in the multi"
02198               "vectors.  Please use the 'for-for' variant for now.");
02199           }
02200           else { // matVecVariant_ == FOR_WHILE ||
02201                  // (matVecVariant_ == FOR_IF && hasEmptyRows_)
02202             //matVec = &Kokkos::Raw::matVecCsrColMajorForwhile4Unrolled<OT, MST, DST, RST>;
02203 
02204             TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "The 'for-while' "
02205               "variant of sparse matrix-vector multiply is not currently "
02206               "implemented for a non-fixed number of columns in the multi"
02207               "vectors.  Please use the 'for-for' variant for now.");
02208           }
02209         }
02210         else if (trans == Teuchos::TRANS) {
02211           if (matVecVariant_ == FOR_FOR) {
02212             matVec = &Kokkos::Raw::matVecCscColMajorForfor4Unrolled<OT, MST, DST, RST>;
02213           }
02214           else if (matVecVariant_ == FOR_IF && ! hasEmptyRows_) {
02215             //matVec = &Kokkos::Raw::matVecCscColMajorForif4Unrolled<OT, MST, DST, RST>;
02216 
02217             TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "The 'for-if' "
02218               "variant of sparse matrix-vector multiply is not currently "
02219               "implemented for a non-fixed number of columns in the multi"
02220               "vectors.  Please use the 'for-for' variant for now.");
02221           }
02222           else { // matVecVariant_ == FOR_WHILE || (matVecVariant_ == FOR_IF && hasEmptyRows_)
02223             //matVec = &Kokkos::Raw::matVecCscColMajorForwhile4Unrolled<OT, MST, DST, RST>;
02224 
02225             TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "The 'for-while' "
02226               "variant of sparse matrix-vector multiply is not currently "
02227               "implemented for a non-fixed number of columns in the multi"
02228               "vectors.  Please use the 'for-for' variant for now.");
02229           }
02230         }
02231         else { // if (trans == Teuchos::CONJ_TRANS) {
02232           if (matVecVariant_ == FOR_FOR) {
02233             matVec = &Kokkos::Raw::matVecCscColMajorForforConj4Unrolled<OT, MST, DST, RST>;
02234           }
02235           else if (matVecVariant_ == FOR_IF && ! hasEmptyRows_) {
02236             //matVec = &Kokkos::Raw::matVecCscColMajorForifConj4Unrolled<OT, MST, DST, RST>;
02237 
02238             TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "The 'for-if' "
02239               "variant of sparse matrix-vector multiply is not currently "
02240               "implemented for a non-fixed number of columns in the multi"
02241               "vectors.  Please use the 'for-for' variant for now.");
02242           }
02243           else { // matVecVariant_ == FOR_WHILE ||
02244                  // (matVecVariant_ == FOR_IF && hasEmptyRows_)
02245             //matVec = &Kokkos::Raw::matVecCscColMajorForwhileConj4Unrolled<OT, MST, DST, RST>;
02246 
02247             TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "The 'for-while' "
02248               "variant of sparse matrix-vector multiply is not currently "
02249               "implemented for a non-fixed number of columns in the multi"
02250               "vectors.  Please use the 'for-for' variant for now.");
02251           }
02252         }
02253       }
02254       else { // Don't unroll across multivector columns
02255         if (trans == Teuchos::NO_TRANS) {
02256           if (matVecVariant_ == FOR_FOR) {
02257             matVec = &Kokkos::Raw::matVecCsrColMajorForfor<OT, MST, DST, RST>;
02258           }
02259           else if (matVecVariant_ == FOR_IF && ! hasEmptyRows_) {
02260             //matVec = &Kokkos::Raw::matVecCsrColMajorForif<OT, MST, DST, RST>;
02261 
02262             TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "The 'for-if' "
02263               "variant of sparse matrix-vector multiply is not currently "
02264               "implemented for a non-fixed number of columns in the multi"
02265               "vectors.  Please use the 'for-for' variant for now.");
02266           }
02267           else { // matVecVariant_ == FOR_WHILE ||
02268                  // (matVecVariant_ == FOR_IF && hasEmptyRows_)
02269             //matVec = &Kokkos::Raw::matVecCsrColMajorForwhile<OT, MST, DST, RST>;
02270 
02271             TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "The 'for-if' "
02272               "variant of sparse matrix-vector multiply is not currently "
02273               "implemented for a non-fixed number of columns in the multi"
02274               "vectors.  Please use the 'for-for' variant for now.");
02275           }
02276         }
02277         else if (trans == Teuchos::TRANS) {
02278           if (matVecVariant_ == FOR_FOR) {
02279             matVec = &Kokkos::Raw::matVecCscColMajorForfor<OT, MST, DST, RST>;
02280           }
02281           else if (matVecVariant_ == FOR_IF && ! hasEmptyRows_) {
02282             //matVec = &Kokkos::Raw::matVecCscColMajorForif<OT, MST, DST, RST>;
02283 
02284             TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "The 'for-if' "
02285               "variant of sparse matrix-vector multiply is not currently "
02286               "implemented for a non-fixed number of columns in the multi"
02287               "vectors.  Please use the 'for-for' variant for now.");
02288           }
02289           else { // matVecVariant_ == FOR_WHILE ||
02290                  // (matVecVariant_ == FOR_IF && hasEmptyRows_)
02291             //matVec = &Kokkos::Raw::matVecCscColMajorForwhile<OT, MST, DST, RST>;
02292 
02293             TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "The 'for-while' "
02294               "variant of sparse matrix-vector multiply is not currently "
02295               "implemented for a non-fixed number of columns in the multi"
02296               "vectors.  Please use the 'for-for' variant for now.");
02297           }
02298         }
02299         else { // if (trans == Teuchos::CONJ_TRANS) {
02300           if (matVecVariant_ == FOR_FOR) {
02301             matVec = &Kokkos::Raw::matVecCscColMajorForforConj<OT, MST, DST, RST>;
02302           }
02303           else if (matVecVariant_ == FOR_IF && ! hasEmptyRows_) {
02304             //matVec = &Kokkos::Raw::matVecCscColMajorForifConj<OT, MST, DST, RST>;
02305 
02306             TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "The 'for-if' "
02307               "variant of sparse matrix-vector multiply is not currently "
02308               "implemented for a non-fixed number of columns in the multi"
02309               "vectors.  Please use the 'for-for' variant for now.");
02310           }
02311           else { // matVecVariant_ == FOR_WHILE ||
02312                  // (matVecVariant_ == FOR_IF && hasEmptyRows_)
02313             //matVec = &Kokkos::Raw::matVecCscColMajorForwhileConj<OT, MST, DST, RST>;
02314 
02315             TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "The 'for-while' "
02316               "variant of sparse matrix-vector multiply is not currently "
02317               "implemented for a non-fixed number of columns in the multi"
02318               "vectors.  Please use the 'for-for' variant for now.");
02319           }
02320         }
02321       }
02322     }
02323 
02324     // Now we know what mat-vec routine to call, so call it.
02325     matVec (numRows, numCols, numVecs, beta, Y_raw, Y_stride,
02326             alpha, ptr, ind, val, X_raw, X_stride);
02327   }
02328 
02329   template <class Scalar, class Ordinal, class Node, class Allocator>
02330   Teuchos::ArrayRCP<size_t>
02331   AltSparseOps<Scalar, Ordinal, Node, Allocator>::
02332   allocRowPtrs (const Teuchos::RCP<Node>& node,
02333                 const Teuchos::ArrayView<const size_t>& numEntriesPerRow)
02334   {
02335     return allocator_type::allocRowPtrs (node, numEntriesPerRow);
02336   }
02337 
02338   template <class Scalar, class Ordinal, class Node, class Allocator>
02339   Teuchos::ArrayRCP<size_t>
02340   AltSparseOps<Scalar, Ordinal, Node, Allocator>::
02341   copyRowPtrs (const Teuchos::RCP<Node>& node,
02342                const Teuchos::ArrayView<const size_t>& rowPtrs)
02343   {
02344     return allocator_type::copyRowPtrs (node, rowPtrs);
02345   }
02346 
02347   template <class Scalar, class Ordinal, class Node, class Allocator>
02348   template <class T>
02349   Teuchos::ArrayRCP<T>
02350   AltSparseOps<Scalar, Ordinal, Node, Allocator>::
02351   allocStorage (const Teuchos::RCP<Node>& node,
02352                 const Teuchos::ArrayView<const size_t>& rowPtrs)
02353   {
02354     return allocator_type::template allocStorage<T> (node, rowPtrs);
02355   }
02356 
02357   template <class Scalar, class Ordinal, class Node, class Allocator>
02358   template <class T>
02359   Teuchos::ArrayRCP<T>
02360   AltSparseOps<Scalar, Ordinal, Node, Allocator>::
02361   copyStorage (const Teuchos::RCP<Node>& node,
02362                const Teuchos::ArrayView<const size_t>& rowPtrs,
02363                const Teuchos::ArrayView<const T>& inputVals)
02364   {
02365     return allocator_type::template allocStorage<T> (node, rowPtrs, inputVals);
02366   }
02367 
02368 } // namespace Kokkos
02369 
02370 #endif // #ifndef __Kokkos_AltSparseOps_hpp
02371 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends