Tsqr_CacheBlocker.hpp

//@HEADER
// ************************************************************************
//
//          Kokkos: Node API and Parallel Node Kernels
//              Copyright (2009) Sandia Corporation
//
// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
// license for use of this work by or on behalf of the U.S. Government.
//
// This library is free software; you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as
// published by the Free Software Foundation; either version 2.1 of the
// License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
// USA
// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
//
// ************************************************************************
//@HEADER

#ifndef __TSQR_CacheBlocker_hpp
#define __TSQR_CacheBlocker_hpp

#include <Tsqr_CacheBlockingStrategy.hpp>
#include <Tsqr_MatView.hpp>
#include <Tsqr_Util.hpp>

#include <iterator>
#include <sstream>
#include <stdexcept>
#include <vector> // std::vector is used by get_cache_block()

namespace TSQR {

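  /// \class CacheBlocker
  /// \brief Break a tall skinny matrix into cache blocks by rows.
  ///
  /// The usage sketch below is illustrative only: it assumes a
  /// default-constructed CacheBlockingStrategy, and the names nrows,
  /// ncols, lda, A_in, A_blocked, and A_restored stand for
  /// caller-provided dimensions and column-major arrays.
  ///
  /// \code
  /// CacheBlockingStrategy<int, double> strategy;
  /// CacheBlocker<int, double> blocker (nrows, ncols, strategy);
  ///
  /// // Rearrange the column-major matrix A_in (leading dimension lda)
  /// // so that its cache blocks are stored contiguously in A_blocked.
  /// blocker.cache_block (nrows, ncols, A_blocked, A_in, lda);
  ///
  /// // Copy the contiguously stored cache blocks back into ordinary
  /// // column-major storage (leading dimension lda) in A_restored.
  /// blocker.un_cache_block (nrows, ncols, A_restored, lda, A_blocked);
  /// \endcode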
  template<class Ordinal, class Scalar>
  class CacheBlocker {
  private:
    typedef MatView<Ordinal, Scalar> mat_view;
    typedef ConstMatView<Ordinal, Scalar> const_mat_view;

    // Verify the cache blocking invariant: every "typical" cache block
    // must have at least as many rows as the matrix has columns.
    // Otherwise, throw std::logic_error.
    void
    validate ()
    {
      if (nrows_cache_block_ < ncols_)
        {
          std::ostringstream os;
          os << "The typical cache block size is too small.  Only "
             << nrows_cache_block_ << " rows fit, but every cache block needs "
             << "at least as many rows as the number of columns ("
             << ncols_ << ") in the matrix.";
          throw std::logic_error (os.str());
        }
    }

  public:
    /// Constructor.
    ///
    /// \param num_rows [in] Number of rows in the matrix.
    /// \param num_cols [in] Number of columns in the matrix.
    /// \param strategy [in] Cache blocking strategy to use.
    CacheBlocker (const Ordinal num_rows,
                  const Ordinal num_cols,
                  const CacheBlockingStrategy<Ordinal, Scalar>& strategy) :
      nrows_ (num_rows),
      ncols_ (num_cols),
      strategy_ (strategy),
      nrows_cache_block_ (strategy_.cache_block_num_rows (ncols()))
    {
      validate ();
    }

    /// Default constructor, for an empty (0 by 0) matrix.
    CacheBlocker () :
      nrows_ (0),
      ncols_ (0),
      nrows_cache_block_ (strategy_.cache_block_num_rows (ncols()))
    {}

    /// Copy constructor.
    CacheBlocker (const CacheBlocker& rhs) :
      nrows_ (rhs.nrows()),
      ncols_ (rhs.ncols()),
      strategy_ (rhs.strategy_),
      nrows_cache_block_ (rhs.nrows_cache_block_)
    {}

    /// Assignment operator.
    CacheBlocker& operator= (const CacheBlocker& rhs) {
      nrows_ = rhs.nrows();
      ncols_ = rhs.ncols();
      strategy_ = rhs.strategy_;
      nrows_cache_block_ = rhs.nrows_cache_block_;
      return *this;
    }

    /// Deprecated synonym for cache_size_hint().
    size_t TEUCHOS_DEPRECATED cache_block_size () const {
      return strategy_.cache_size_hint();
    }

    /// Cache size hint used by the cache blocking strategy.
    size_t cache_size_hint () const { return strategy_.cache_size_hint(); }

    /// Number of rows in the matrix.
    Ordinal nrows () const { return nrows_; }

    /// Number of columns in the matrix.
    Ordinal ncols () const { return ncols_; }

    /// Split off the topmost cache block of A and return a view of it.
    /// On return, A is modified in place to view only the remaining
    /// rows below that block.
    template< class MatrixViewType >
    MatrixViewType
    split_top_block (MatrixViewType& A, const bool contiguous_cache_blocks) const
    {
      typedef typename MatrixViewType::ordinal_type ordinal_type;
      const ordinal_type nrows_top =
        strategy_.top_block_split_nrows (A.nrows(), ncols(),
                                         nrows_cache_block());
      // split_top() sets A to A_rest, and returns A_top.
      return A.split_top (nrows_top, contiguous_cache_blocks);
    }

    /// Return a view of the topmost cache block of A, without
    /// modifying A itself.
    template< class MatrixViewType >
    MatrixViewType
    top_block (const MatrixViewType& A, const bool contiguous_cache_blocks) const
    {
      typedef typename MatrixViewType::ordinal_type ordinal_type;
      // Ignore the number of columns in A, since we want to block all
      // matrices using the same cache blocking strategy.
      const ordinal_type nrows_top =
        strategy_.top_block_split_nrows (A.nrows(), ncols(),
                                         nrows_cache_block());
      MatrixViewType A_copy (A);
      return A_copy.split_top (nrows_top, contiguous_cache_blocks);
    }

    /// Split off the bottommost cache block of A and return a view of
    /// it.  On return, A is modified in place to view only the
    /// remaining rows above that block.
    template< class MatrixViewType >
    MatrixViewType
    split_bottom_block (MatrixViewType& A, const bool contiguous_cache_blocks) const
    {
      typedef typename MatrixViewType::ordinal_type ordinal_type;
      // Ignore the number of columns in A, since we want to block all
      // matrices using the same cache blocking strategy.
      const ordinal_type nrows_bottom =
        strategy_.bottom_block_split_nrows (A.nrows(), ncols(),
                                            nrows_cache_block());
      // split_bottom() sets A to A_rest, and returns A_bot.
      return A.split_bottom (nrows_bottom, contiguous_cache_blocks);
    }

    /// Fill the matrix view A with zeros, cache block by cache block.
    template<class MatrixViewType>
    void
    fill_with_zeros (MatrixViewType A,
                     const bool contiguous_cache_blocks) const
    {
      // Note: if the cache blocks are stored contiguously, A.lda()
      // won't be the correct leading dimension of A, but it won't
      // matter: we only ever operate on A_cur here, and A_cur's
      // leading dimension is set correctly by split_top_block().
      while (! A.empty())
        {
          // This call modifies the matrix view A, but that's OK since
          // we passed the input view by copy, not by reference.
          MatrixViewType A_cur = split_top_block (A, contiguous_cache_blocks);
          A_cur.fill (Scalar(0));
        }
    }

    /// Fill the num_rows by num_cols matrix A (with leading dimension
    /// lda) with zeros, cache block by cache block.
    void
    fill_with_zeros (const Ordinal num_rows,
                     const Ordinal num_cols,
                     Scalar A[],
                     const Ordinal lda,
                     const bool contiguous_cache_blocks) const
    {
      // We say "A_rest" because it points to the remaining part of
      // the matrix left to process; at the beginning, the "remaining"
      // part is the whole matrix, but that will change as the
      // algorithm progresses.
      //
      // Note: if the cache blocks are stored contiguously, lda won't
      // be the correct leading dimension of A, but it won't matter:
      // we only ever operate on A_cur here, and A_cur's leading
      // dimension is set correctly by A_rest.split_top().
      mat_view A_rest (num_rows, num_cols, A, lda);

      while (! A_rest.empty())
        {
          // This call modifies A_rest.
          mat_view A_cur = split_top_block (A_rest, contiguous_cache_blocks);
          A_cur.fill (Scalar(0));
        }
    }

    /// Copy the column-major matrix A_in (with leading dimension
    /// lda_in) into A_out, rearranging it so that its cache blocks are
    /// stored contiguously in A_out.
    void
    cache_block (const Ordinal num_rows,
                 const Ordinal num_cols,
                 Scalar A_out[],
                 const Scalar A_in[],
                 const Ordinal lda_in) const
    {
      // We say "*_rest" because it points to the remaining part of
      // the matrix left to cache block; at the beginning, the
      // "remaining" part is the whole matrix, but that will change as
      // the algorithm progresses.
      const_mat_view A_in_rest (num_rows, num_cols, A_in, lda_in);
      // Leading dimension doesn't matter since A_out will be cache blocked.
      mat_view A_out_rest (num_rows, num_cols, A_out, lda_in);

      while (! A_in_rest.empty())
        {
          if (A_out_rest.empty())
            throw std::logic_error("A_out_rest is empty, but A_in_rest is not");

          // This call modifies A_in_rest.
          const_mat_view A_in_cur = split_top_block (A_in_rest, false);

          // This call modifies A_out_rest.
          mat_view A_out_cur = split_top_block (A_out_rest, true);

          copy_matrix (A_in_cur.nrows(), num_cols, A_out_cur.get(),
                       A_out_cur.lda(), A_in_cur.get(), A_in_cur.lda());
        }
    }

    /// "Un"-cache-block the matrix: copy the contiguously stored cache
    /// blocks in A_in back into ordinary column-major storage in A_out
    /// (with leading dimension lda_out).
    void
    un_cache_block (const Ordinal num_rows,
                    const Ordinal num_cols,
                    Scalar A_out[],
                    const Ordinal lda_out,
                    const Scalar A_in[]) const
    {
      // We say "*_rest" because it points to the remaining part of
      // the matrix left to un-cache-block; at the beginning, the
      // "remaining" part is the whole matrix, but that will change as
      // the algorithm progresses.
      //
      // Leading dimension doesn't matter since A_in is cache blocked.
      const_mat_view A_in_rest (num_rows, num_cols, A_in, lda_out);
      mat_view A_out_rest (num_rows, num_cols, A_out, lda_out);

      while (! A_in_rest.empty())
        {
          if (A_out_rest.empty())
            throw std::logic_error("A_out_rest is empty, but A_in_rest is not");

          // This call modifies A_in_rest.
          const_mat_view A_in_cur = split_top_block (A_in_rest, true);

          // This call modifies A_out_rest.
          mat_view A_out_cur = split_top_block (A_out_rest, false);

          copy_matrix (A_in_cur.nrows(), num_cols, A_out_cur.get(),
                       A_out_cur.lda(), A_in_cur.get(), A_in_cur.lda());
        }
    }

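    /// Return a view of cache block number cache_block_index of the
    /// matrix view A, or an empty view if that block does not exist.
    ///
    /// The sketch below is illustrative only: blocker, A_ptr, nrows,
    /// ncols, lda, k, and contiguous_cache_blocks are assumed to be
    /// set up by the caller, and zeroing the block merely stands in
    /// for whatever work the caller wants to do on it.
    ///
    /// \code
    /// MatView<int, double> A (nrows, ncols, A_ptr, lda);
    /// // View of cache block k of A, or an empty view if k is out of range.
    /// MatView<int, double> A_k =
    ///   blocker.get_cache_block (A, k, contiguous_cache_blocks);
    /// if (! A_k.empty())
    ///   A_k.fill (0.0);
    /// \endcode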
    template<class MatrixViewType>
    MatrixViewType
    get_cache_block (MatrixViewType A,
                     const typename MatrixViewType::ordinal_type cache_block_index,
                     const bool contiguous_cache_blocks) const
    {
      typedef typename MatrixViewType::ordinal_type ordinal_type;
      typedef typename MatrixViewType::scalar_type scalar_type;

      // Total number of cache blocks.
      const ordinal_type num_cache_blocks =
        strategy_.num_cache_blocks (A.nrows(), A.ncols(), nrows_cache_block());

      if (cache_block_index >= num_cache_blocks)
        return MatrixViewType (0, 0, NULL, 0); // empty

      // result[0] = starting row index of the cache block
      // result[1] = number of rows in the cache block
      // result[2] = pointer offset (A.get() + result[2])
      // result[3] = leading dimension (a.k.a. stride) of the cache block
      std::vector<Ordinal> result =
        strategy_.cache_block_details (cache_block_index, A.nrows(), A.ncols(),
                                       A.lda(), nrows_cache_block(),
                                       contiguous_cache_blocks);
      if (result[1] == 0)
        // For some reason, the cache block is empty.
        return MatrixViewType (0, 0, NULL, 0);

      // We expect that ordinal_type is signed, so adding signed
      // (ordinal_type) to unsigned (pointer) may raise compiler
      // warnings.
      return MatrixViewType (result[1], A.ncols(),
                             A.get() + static_cast<size_t>(result[2]),
                             result[3]);
    }

    /// Equality test: true if and only if rhs has the same dimensions
    /// and the same cache blocking strategy as *this.
    bool
    operator== (const CacheBlocker<Ordinal, Scalar>& rhs) const
    {
      return nrows() == rhs.nrows() &&
        ncols() == rhs.ncols() &&
        strategy_ == rhs.strategy_;
    }

  private:
    //! Number of rows in the matrix.
    Ordinal nrows_;

    //! Number of columns in the matrix.
    Ordinal ncols_;

    //! Strategy used to break the matrix into cache blocks.
    CacheBlockingStrategy<Ordinal, Scalar> strategy_;

    /// Number of rows in a "typical" cache block, as computed by the
    /// strategy from the number of columns of the matrix.
    Ordinal nrows_cache_block_;

    //! Accessor for the number of rows in a "typical" cache block.
    size_t nrows_cache_block () const { return nrows_cache_block_; }
  };


  /// \brief Iterator over a contiguous range of cache blocks of a
  ///   matrix view, in either forward or reverse order.
  template<class MatrixViewType>
  class CacheBlockRangeIterator :
    public std::iterator<std::forward_iterator_tag, MatrixViewType>
  {
  public:
    typedef MatrixViewType view_type;
    typedef typename MatrixViewType::ordinal_type ordinal_type;
    typedef typename MatrixViewType::scalar_type scalar_type;

    /// Default constructor: an iterator over an empty matrix view.
    CacheBlockRangeIterator () :
      A_ (0, 0, NULL, 0),
      curInd_ (0),
      reverse_ (false),
      contiguousCacheBlocks_ (false)
    {}

    /// Constructor.
    ///
    /// \param A [in] View of the whole matrix.
    /// \param strategy [in] Cache blocking strategy.
    /// \param currentIndex [in] Index of the current cache block.
    /// \param reverse [in] Whether operator++ moves backward through
    ///   the cache blocks instead of forward.
    /// \param contiguousCacheBlocks [in] Whether the cache blocks of A
    ///   are stored contiguously.
    CacheBlockRangeIterator (const MatrixViewType& A,
                             const CacheBlockingStrategy<ordinal_type, scalar_type>& strategy,
                             const ordinal_type currentIndex,
                             const bool reverse,
                             const bool contiguousCacheBlocks) :
      A_ (A),
      blocker_ (A_.nrows(), A_.ncols(), strategy),
      curInd_ (currentIndex),
      reverse_ (reverse),
      contiguousCacheBlocks_ (contiguousCacheBlocks)
    {}

    /// Copy constructor.
    CacheBlockRangeIterator (const CacheBlockRangeIterator& rhs) :
      A_ (rhs.A_),
      blocker_ (rhs.blocker_),
      curInd_ (rhs.curInd_),
      reverse_ (rhs.reverse_),
      contiguousCacheBlocks_ (rhs.contiguousCacheBlocks_)
    {}

    /// Assignment operator.
    CacheBlockRangeIterator& operator= (const CacheBlockRangeIterator& rhs)
    {
      A_ = rhs.A_;
      blocker_ = rhs.blocker_;
      curInd_ = rhs.curInd_;
      reverse_ = rhs.reverse_;
      contiguousCacheBlocks_ = rhs.contiguousCacheBlocks_;
      return *this;
    }

    /// Prefix increment: advance to the next cache block in the range
    /// (the previous one, if this is a reverse-order iterator).
    CacheBlockRangeIterator& operator++() {
      if (reverse_)
        --curInd_;
      else
        ++curInd_;
      return *this;
    }

    /// Postfix increment: advance the iterator, but return a copy of
    /// its previous state.
    CacheBlockRangeIterator operator++(int) {
      CacheBlockRangeIterator retval (*this);
      operator++();
      return retval;
    }

    /// Equality test.
    bool operator== (const CacheBlockRangeIterator& rhs) {
      // Only the current cache block index is compared, for speed.
      // Strictly this is not correct: it should also return false for
      // iterators with a different A_ or a different blocker_.
      return curInd_ == rhs.curInd_;
    }

    /// Inequality test.
    bool operator!= (const CacheBlockRangeIterator& rhs) {
      // See the comment in operator==.
      return curInd_ != rhs.curInd_;
    }

    /// Dereference: return a view of the current cache block.
    MatrixViewType operator*() const {
      return blocker_.get_cache_block (A_, curInd_, contiguousCacheBlocks_);
    }

  private:
    MatrixViewType A_;
    CacheBlocker<ordinal_type, scalar_type> blocker_;
    ordinal_type curInd_;
    bool reverse_;
    bool contiguousCacheBlocks_;
  };

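  /// \class CacheBlockRange
  /// \brief Range of cache blocks of a matrix view, with forward and
  ///   reverse iterators over the blocks.
  ///
  /// The usage sketch below is illustrative only: the view A, the
  /// strategy object (a CacheBlockingStrategy<int, double>), and the
  /// flag contiguousCacheBlocks are assumed to be set up by the
  /// caller, and zeroing each block merely stands in for whatever work
  /// the caller wants to do per cache block.
  ///
  /// \code
  /// typedef MatView<int, double> view_type;
  /// typedef CacheBlockRange<view_type> range_type;
  ///
  /// // Total number of cache blocks, as the strategy computes it.
  /// const int numBlocks =
  ///   strategy.num_cache_blocks (A.nrows(), A.ncols(),
  ///                              strategy.cache_block_num_rows (A.ncols()));
  ///
  /// // Iterate over cache blocks [0, numBlocks) in forward order.
  /// range_type range (A, strategy, 0, numBlocks, contiguousCacheBlocks);
  /// for (range_type::iterator it = range.begin(); it != range.end(); ++it)
  ///   (*it).fill (0.0);
  /// \endcode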
  template<class MatrixViewType>
  class CacheBlockRange {
  public:
    typedef MatrixViewType view_type;
    typedef typename MatrixViewType::ordinal_type ordinal_type;
    typedef typename MatrixViewType::scalar_type scalar_type;

    //! Type of the iterator over this range of cache blocks.
    typedef CacheBlockRangeIterator<MatrixViewType> iterator;

    /// Constructor.
    ///
    /// \param A [in] View of the whole matrix.
    /// \param strategy [in] Cache blocking strategy.
    /// \param startIndex [in] Index of the first cache block in the range.
    /// \param endIndex [in] One past the index of the last cache block
    ///   in the range.
    /// \param contiguousCacheBlocks [in] Whether the cache blocks of A
    ///   are stored contiguously.
    CacheBlockRange (MatrixViewType A,
                     const CacheBlockingStrategy<ordinal_type, scalar_type>& strategy,
                     const ordinal_type startIndex,
                     const ordinal_type endIndex,
                     const bool contiguousCacheBlocks) :
      A_ (A),
      startIndex_ (startIndex),
      endIndex_ (endIndex),
      strategy_ (strategy),
      contiguousCacheBlocks_ (contiguousCacheBlocks)
    {}

    //! Whether the range contains no cache blocks.
    bool empty() const {
      return startIndex_ >= endIndex_;
    }

    //! Iterator to the first cache block in the range.
    iterator begin() const {
      return iterator (A_, strategy_, startIndex_, false, contiguousCacheBlocks_);
    }

    //! Iterator one past the last cache block in the range.
    iterator end() const {
      return iterator (A_, strategy_, endIndex_, false, contiguousCacheBlocks_);
    }

    //! Reverse iterator to the last cache block in the range.
    iterator rbegin() const {
      return iterator (A_, strategy_, endIndex_-1, true, contiguousCacheBlocks_);
    }

    //! Reverse iterator one before the first cache block in the range.
    iterator rend() const {
      // Think about it: rbegin() == rend() means that rbegin() is invalid
      // and shouldn't be dereferenced.  rend() should never be dereferenced.
      return iterator (A_, strategy_, startIndex_-1, true, contiguousCacheBlocks_);
    }

  private:
    //! View of the whole matrix.
    MatrixViewType A_;

    //! Index of the first cache block in the range.
    ordinal_type startIndex_;

    //! One past the index of the last cache block in the range.
    ordinal_type endIndex_;

    //! Cache blocking strategy used to identify the cache blocks.
    CacheBlockingStrategy<ordinal_type, scalar_type> strategy_;

    //! Whether the cache blocks of A_ are stored contiguously.
    bool contiguousCacheBlocks_;
  };


} // namespace TSQR

#endif // __TSQR_CacheBlocker_hpp