Tsqr_CacheBlocker.hpp
//@HEADER
// ************************************************************************
//
//          Kokkos: Node API and Parallel Node Kernels
//              Copyright (2008) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
//
// ************************************************************************
//@HEADER

#ifndef __TSQR_CacheBlocker_hpp
#define __TSQR_CacheBlocker_hpp

#include <Tsqr_CacheBlockingStrategy.hpp>
#include <Tsqr_MatView.hpp>
#include <Tsqr_Util.hpp>

#include <iterator>
#include <sstream>
#include <stdexcept>
#include <vector> // std::vector is used in get_cache_block() below

namespace TSQR {

  /// \class CacheBlocker
  /// \brief Breaks up a matrix (view) into cache blocks, using a
  ///   CacheBlockingStrategy to choose the block boundaries.
  template<class Ordinal, class Scalar>
  class CacheBlocker {
  private:
    typedef MatView<Ordinal, Scalar> mat_view;
    typedef ConstMatView<Ordinal, Scalar> const_mat_view;

    void
    validate ()
    {
      if (nrows_cache_block_ < ncols_)
        {
          std::ostringstream os;
          os << "The typical cache block size is too small.  Only "
             << nrows_cache_block_ << " rows fit, but every cache block needs "
            "at least as many rows as the number of columns " << ncols_
             << " in the matrix.";
          throw std::logic_error (os.str());
        }
    }

  public:
    /// Constructor.
    ///
    /// \param num_rows [in] Number of rows in the matrix to block.
    /// \param num_cols [in] Number of columns in the matrix to block.
    /// \param strategy [in] Cache blocking strategy to use.
    CacheBlocker (const Ordinal num_rows,
                  const Ordinal num_cols,
                  const CacheBlockingStrategy<Ordinal, Scalar>& strategy) :
      nrows_ (num_rows),
      ncols_ (num_cols),
      strategy_ (strategy),
      nrows_cache_block_ (strategy_.cache_block_num_rows (ncols()))
    {
      validate ();
    }

    /// Default constructor: a blocker for a zero by zero matrix.
    CacheBlocker () :
      nrows_ (0),
      ncols_ (0),
      nrows_cache_block_ (strategy_.cache_block_num_rows (ncols()))
    {}

    /// Copy constructor.
    CacheBlocker (const CacheBlocker& rhs) :
      nrows_ (rhs.nrows()),
      ncols_ (rhs.ncols()),
      strategy_ (rhs.strategy_),
      nrows_cache_block_ (rhs.nrows_cache_block_)
    {}

    /// Assignment operator.
    CacheBlocker& operator= (const CacheBlocker& rhs) {
      nrows_ = rhs.nrows();
      ncols_ = rhs.ncols();
      strategy_ = rhs.strategy_;
      nrows_cache_block_ = rhs.nrows_cache_block_;
      return *this;
    }

    /// Cache size hint of the blocking strategy (deprecated; call
    /// cache_size_hint() instead).
    size_t TEUCHOS_DEPRECATED cache_block_size () const {
      return strategy_.cache_size_hint();
    }

    /// Cache size hint of the blocking strategy.
    size_t cache_size_hint () const { return strategy_.cache_size_hint(); }

    /// Number of rows in the matrix to block.
    Ordinal nrows () const { return nrows_; }

    /// Number of columns in the matrix to block.
    Ordinal ncols () const { return ncols_; }

    /// Split A into its top cache block (returned) and the rest:
    /// A itself is modified in place to view the remainder.
    template<class MatrixViewType>
    MatrixViewType
    split_top_block (MatrixViewType& A, const bool contiguous_cache_blocks) const
    {
      typedef typename MatrixViewType::ordinal_type ordinal_type;
      const ordinal_type nrows_top =
        strategy_.top_block_split_nrows (A.nrows(), ncols(),
                                         nrows_cache_block());
      // split_top() sets A to A_rest, and returns A_top.
      return A.split_top (nrows_top, contiguous_cache_blocks);
    }

    /// Return a view of the top cache block of A, without modifying A.
    template<class MatrixViewType>
    MatrixViewType
    top_block (const MatrixViewType& A, const bool contiguous_cache_blocks) const
    {
      typedef typename MatrixViewType::ordinal_type ordinal_type;
      // Ignore the number of columns in A, since we want to block all
      // matrices using the same cache blocking strategy.
      const ordinal_type nrows_top =
        strategy_.top_block_split_nrows (A.nrows(), ncols(),
                                         nrows_cache_block());
      MatrixViewType A_copy (A);
      return A_copy.split_top (nrows_top, contiguous_cache_blocks);
    }

    /// Split A into its bottom cache block (returned) and the rest:
    /// A itself is modified in place to view the remainder.
    template<class MatrixViewType>
    MatrixViewType
    split_bottom_block (MatrixViewType& A, const bool contiguous_cache_blocks) const
    {
      typedef typename MatrixViewType::ordinal_type ordinal_type;
      // Ignore the number of columns in A, since we want to block all
      // matrices using the same cache blocking strategy.
      const ordinal_type nrows_bottom =
        strategy_.bottom_block_split_nrows (A.nrows(), ncols(),
                                            nrows_cache_block());
      // split_bottom() sets A to A_rest, and returns A_bot.
      return A.split_bottom (nrows_bottom, contiguous_cache_blocks);
    }

    /// Fill the matrix view A with zeros, respecting the cache block
    /// layout indicated by contiguous_cache_blocks.
    template<class MatrixViewType>
    void
    fill_with_zeros (MatrixViewType A,
                     const bool contiguous_cache_blocks) const
    {
      // Note: if the cache blocks are stored contiguously, A.lda()
      // won't be the correct leading dimension of A, but it won't
      // matter: we only ever operate on A_cur here, and A_cur's
      // leading dimension is set correctly by split_top_block().
      while (! A.empty())
        {
          // This call modifies the matrix view A, but that's OK since
          // we passed the input view by copy, not by reference.
          MatrixViewType A_cur = split_top_block (A, contiguous_cache_blocks);
          A_cur.fill (Scalar(0));
        }
    }

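    // Example (editor's sketch, not part of the original header):
    // zeroing a 100 x 5 column-major matrix through a MatView.  The
    // MatView constructor arguments (nrows, ncols, pointer, leading
    // dimension) follow MatView's usage elsewhere in this file; a
    // default-constructed CacheBlockingStrategy is assumed, as in
    // CacheBlocker's own default constructor.
    //
    //   std::vector<double> buf (100 * 5);
    //   MatView<int, double> A_view (100, 5, &buf[0], 100);
    //   CacheBlockingStrategy<int, double> strategy;
    //   CacheBlocker<int, double> blocker (100, 5, strategy);
    //   blocker.fill_with_zeros (A_view, false); // not contiguously blocked
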
    /// Fill the num_rows by num_cols matrix A, with leading dimension
    /// lda, with zeros, respecting the cache block layout.
    void
    fill_with_zeros (const Ordinal num_rows,
                     const Ordinal num_cols,
                     Scalar A[],
                     const Ordinal lda,
                     const bool contiguous_cache_blocks) const
    {
      // We say "A_rest" because it points to the remaining part of
      // the matrix left to process; at the beginning, the "remaining"
      // part is the whole matrix, but that will change as the
      // algorithm progresses.
      //
      // Note: if the cache blocks are stored contiguously, lda won't
      // be the correct leading dimension of A, but it won't matter:
      // we only ever operate on A_cur here, and A_cur's leading
      // dimension is set correctly by A_rest.split_top().
      mat_view A_rest (num_rows, num_cols, A, lda);

      while (! A_rest.empty())
        {
          // This call modifies A_rest.
          mat_view A_cur = split_top_block (A_rest, contiguous_cache_blocks);
          A_cur.fill (Scalar(0));
        }
    }

    /// Copy the matrix A_in, stored in standard column-major order
    /// with leading dimension lda_in, into A_out, which stores its
    /// cache blocks contiguously.
    void
    cache_block (const Ordinal num_rows,
                 const Ordinal num_cols,
                 Scalar A_out[],
                 const Scalar A_in[],
                 const Ordinal lda_in) const
    {
      // We say "*_rest" because it points to the remaining part of
      // the matrix left to cache block; at the beginning, the
      // "remaining" part is the whole matrix, but that will change as
      // the algorithm progresses.
      const_mat_view A_in_rest (num_rows, num_cols, A_in, lda_in);
      // Leading dimension doesn't matter since A_out will be cache blocked.
      mat_view A_out_rest (num_rows, num_cols, A_out, lda_in);

      while (! A_in_rest.empty())
        {
          if (A_out_rest.empty())
            throw std::logic_error("A_out_rest is empty, but A_in_rest is not");

          // This call modifies A_in_rest.
          const_mat_view A_in_cur = split_top_block (A_in_rest, false);

          // This call modifies A_out_rest.
          mat_view A_out_cur = split_top_block (A_out_rest, true);

          copy_matrix (A_in_cur.nrows(), num_cols, A_out_cur.get(),
                       A_out_cur.lda(), A_in_cur.get(), A_in_cur.lda());
        }
    }

    /// Copy the cache-blocked matrix A_in back into A_out, which is
    /// stored in standard column-major order with leading dimension
    /// lda_out.  This is the inverse of cache_block().
    void
    un_cache_block (const Ordinal num_rows,
                    const Ordinal num_cols,
                    Scalar A_out[],
                    const Ordinal lda_out,
                    const Scalar A_in[]) const
    {
      // We say "*_rest" because it points to the remaining part of
      // the matrix left to un-cache-block; at the beginning, the
      // "remaining" part is the whole matrix, but that will change as
      // the algorithm progresses.
      //
      // Leading dimension doesn't matter since A_in is cache blocked.
      const_mat_view A_in_rest (num_rows, num_cols, A_in, lda_out);
      mat_view A_out_rest (num_rows, num_cols, A_out, lda_out);

      while (! A_in_rest.empty())
        {
          if (A_out_rest.empty())
            throw std::logic_error("A_out_rest is empty, but A_in_rest is not");

          // This call modifies A_in_rest.
          const_mat_view A_in_cur = split_top_block (A_in_rest, true);

          // This call modifies A_out_rest.
          mat_view A_out_cur = split_top_block (A_out_rest, false);

          copy_matrix (A_in_cur.nrows(), num_cols, A_out_cur.get(),
                       A_out_cur.lda(), A_in_cur.get(), A_in_cur.lda());
        }
    }

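    // Example (editor's sketch, not part of the original header): a
    // round trip through cache_block() and un_cache_block() for a
    // 1000 x 10 column-major matrix, again assuming a
    // default-constructed CacheBlockingStrategy.
    //
    //   CacheBlockingStrategy<int, double> strategy;
    //   CacheBlocker<int, double> blocker (1000, 10, strategy);
    //   std::vector<double> A (1000 * 10), A_blocked (1000 * 10);
    //   // Rearrange A into contiguously stored cache blocks ...
    //   blocker.cache_block (1000, 10, &A_blocked[0], &A[0], 1000);
    //   // ... then copy the blocks back into column-major storage.
    //   blocker.un_cache_block (1000, 10, &A[0], 1000, &A_blocked[0]);
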
    /// Return a view of cache block number cache_block_index of A, or
    /// an empty view if that index is out of range.
    template<class MatrixViewType>
    MatrixViewType
    get_cache_block (MatrixViewType A,
                     const typename MatrixViewType::ordinal_type cache_block_index,
                     const bool contiguous_cache_blocks) const
    {
      typedef typename MatrixViewType::ordinal_type ordinal_type;
      typedef typename MatrixViewType::scalar_type scalar_type;

      // Total number of cache blocks.
      const ordinal_type num_cache_blocks =
        strategy_.num_cache_blocks (A.nrows(), A.ncols(), nrows_cache_block());

      if (cache_block_index >= num_cache_blocks)
        return MatrixViewType (0, 0, NULL, 0); // empty

      // result[0] = starting row index of the cache block
      // result[1] = number of rows in the cache block
      // result[2] = pointer offset (A.get() + result[2])
      // result[3] = leading dimension (a.k.a. stride) of the cache block
      std::vector<Ordinal> result =
        strategy_.cache_block_details (cache_block_index, A.nrows(), A.ncols(),
                                       A.lda(), nrows_cache_block(),
                                       contiguous_cache_blocks);
      if (result[1] == 0)
        // For some reason, the cache block is empty.
        return MatrixViewType (0, 0, NULL, 0);

      // We expect that ordinal_type is signed, so adding a signed
      // ordinal_type offset to the pointer may raise compiler
      // warnings; hence the cast of the offset to size_t.
      return MatrixViewType (result[1], A.ncols(),
                             A.get() + static_cast<size_t>(result[2]),
                             result[3]);
    }

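    // Example (editor's sketch, not part of the original header):
    // visiting the cache blocks of a view by index.  get_cache_block()
    // returns an empty view once the index passes the last block, so
    // the loop below stops at the first empty result.  A_view and
    // blocker are as in the sketches above.
    //
    //   typedef MatView<int, double> view_type;
    //   for (int k = 0; ; ++k) {
    //     view_type blk = blocker.get_cache_block (A_view, k, false);
    //     if (blk.empty())
    //       break; // past the last cache block
    //     blk.fill (0.0); // operate on the current block
    //   }
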
    /// True if and only if the two CacheBlockers have the same
    /// dimensions and the same cache blocking strategy.
    bool
    operator== (const CacheBlocker<Ordinal, Scalar>& rhs) const
    {
      return nrows() == rhs.nrows() &&
        ncols() == rhs.ncols() &&
        strategy_ == rhs.strategy_;
    }

  private:
    /// Number of rows in the matrix to block.
    Ordinal nrows_;

    /// Number of columns in the matrix to block.
    Ordinal ncols_;

    /// The cache blocking strategy.
    CacheBlockingStrategy<Ordinal, Scalar> strategy_;

    /// Number of rows in a "typical" cache block.
    Ordinal nrows_cache_block_;

    /// Number of rows in a "typical" cache block.
    size_t nrows_cache_block () const { return nrows_cache_block_; }
  };

  /// \class CacheBlockRangeIterator
  /// \brief Forward iterator over a range of cache blocks of a matrix
  ///   view; dereferencing returns a view of the current cache block.
  template<class MatrixViewType>
  class CacheBlockRangeIterator :
    public std::iterator<std::forward_iterator_tag, MatrixViewType>
  {
  public:
    typedef MatrixViewType view_type;
    typedef typename MatrixViewType::ordinal_type ordinal_type;
    typedef typename MatrixViewType::scalar_type scalar_type;

    /// Default constructor: an iterator over an empty matrix.
    CacheBlockRangeIterator () :
      A_ (0, 0, NULL, 0),
      curInd_ (0),
      reverse_ (false),
      contiguousCacheBlocks_ (false)
    {}

    CacheBlockRangeIterator (const MatrixViewType& A,
                             const CacheBlockingStrategy<ordinal_type, scalar_type>& strategy,
                             const ordinal_type currentIndex,
                             const bool reverse,
                             const bool contiguousCacheBlocks) :
      A_ (A),
      blocker_ (A_.nrows(), A_.ncols(), strategy),
      curInd_ (currentIndex),
      reverse_ (reverse),
      contiguousCacheBlocks_ (contiguousCacheBlocks)
    {}

    /// Copy constructor.
    CacheBlockRangeIterator (const CacheBlockRangeIterator& rhs) :
      A_ (rhs.A_),
      blocker_ (rhs.blocker_),
      curInd_ (rhs.curInd_),
      reverse_ (rhs.reverse_),
      contiguousCacheBlocks_ (rhs.contiguousCacheBlocks_)
    {}

    /// Assignment operator.
    CacheBlockRangeIterator& operator= (const CacheBlockRangeIterator& rhs)
    {
      A_ = rhs.A_;
      blocker_ = rhs.blocker_;
      curInd_ = rhs.curInd_;
      reverse_ = rhs.reverse_;
      contiguousCacheBlocks_ = rhs.contiguousCacheBlocks_;
      return *this;
    }

    /// Prefix increment: advance to the next cache block in the range.
    CacheBlockRangeIterator& operator++() {
      if (reverse_)
        --curInd_;
      else
        ++curInd_;
      return *this;
    }

    /// Postfix increment: advance, but return the iterator's old value.
    CacheBlockRangeIterator operator++(int) {
      CacheBlockRangeIterator retval (*this);
      operator++();
      return retval;
    }

    bool operator== (const CacheBlockRangeIterator& rhs) const {
      // Not entirely correct, but fast: this ignores differences in
      // A_ and blocker_, and compares only the current block index.
      return curInd_ == rhs.curInd_;
    }

    bool operator!= (const CacheBlockRangeIterator& rhs) const {
      // Not entirely correct, but fast: like operator==, this ignores
      // differences in A_ and blocker_.
      return curInd_ != rhs.curInd_;
    }

    /// A view of the current cache block.
    MatrixViewType operator*() const {
      return blocker_.get_cache_block (A_, curInd_, contiguousCacheBlocks_);
    }

  private:
    MatrixViewType A_;
    CacheBlocker<ordinal_type, scalar_type> blocker_;
    ordinal_type curInd_;
    bool reverse_;
    bool contiguousCacheBlocks_;
  };

  /// \class CacheBlockRange
  /// \brief A half-open range [startIndex, endIndex) of cache blocks
  ///   of a matrix view, traversable forward via begin()/end() or
  ///   backward via rbegin()/rend().
  template<class MatrixViewType>
  class CacheBlockRange {
  public:
    typedef MatrixViewType view_type;
    typedef typename MatrixViewType::ordinal_type ordinal_type;
    typedef typename MatrixViewType::scalar_type scalar_type;

    /// Iterator over the cache blocks in this range.
    typedef CacheBlockRangeIterator<MatrixViewType> iterator;

    CacheBlockRange (MatrixViewType A,
                     const CacheBlockingStrategy<ordinal_type, scalar_type>& strategy,
                     const ordinal_type startIndex,
                     const ordinal_type endIndex,
                     const bool contiguousCacheBlocks) :
      A_ (A),
      startIndex_ (startIndex),
      endIndex_ (endIndex),
      strategy_ (strategy),
      contiguousCacheBlocks_ (contiguousCacheBlocks)
    {}

    bool empty() const {
      return startIndex_ >= endIndex_;
    }

    iterator begin() const {
      return iterator (A_, strategy_, startIndex_, false, contiguousCacheBlocks_);
    }

    iterator end() const {
      return iterator (A_, strategy_, endIndex_, false, contiguousCacheBlocks_);
    }

    iterator rbegin() const {
      return iterator (A_, strategy_, endIndex_-1, true, contiguousCacheBlocks_);
    }

    iterator rend() const {
      // Think about it: rbegin() == rend() means that rbegin() is invalid
      // and shouldn't be dereferenced.  rend() should never be dereferenced.
      return iterator (A_, strategy_, startIndex_-1, true, contiguousCacheBlocks_);
    }

  private:
    /// View of the matrix whose cache blocks this range covers.
    MatrixViewType A_;

    /// Index of the first cache block in the range.
    ordinal_type startIndex_;

    /// Index one past the last cache block in the range.
    ordinal_type endIndex_;

    /// The cache blocking strategy.
    CacheBlockingStrategy<ordinal_type, scalar_type> strategy_;

    /// Whether the cache blocks of A_ are stored contiguously.
    bool contiguousCacheBlocks_;
  };

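  // Example (editor's sketch, not part of the original header):
  // traversing every cache block of a view with CacheBlockRange.
  // numBlocks is hypothetical; a caller would obtain it from the
  // cache blocking strategy.  A_view and strategy are as in the
  // sketches above.
  //
  //   typedef MatView<int, double> view_type;
  //   CacheBlockRange<view_type> range (A_view, strategy, 0, numBlocks, false);
  //   for (CacheBlockRange<view_type>::iterator it = range.begin();
  //        it != range.end(); ++it) {
  //     view_type blk = *it; // view of the current cache block
  //     blk.fill (0.0);
  //   }
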
} // namespace TSQR

#endif // __TSQR_CacheBlocker_hpp