Kokkos Node API and Local Linear Algebra Kernels Version of the Day
Tsqr_CacheBlockingStrategy.hpp
00001 //@HEADER
00002 // ************************************************************************
00003 // 
00004 //          Kokkos: Node API and Parallel Node Kernels
00005 //              Copyright (2009) Sandia Corporation
00006 // 
00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
00008 // license for use of this work by or on behalf of the U.S. Government.
00009 // 
00010 // This library is free software; you can redistribute it and/or modify
00011 // it under the terms of the GNU Lesser General Public License as
00012 // published by the Free Software Foundation; either version 2.1 of the
00013 // License, or (at your option) any later version.
00014 //  
00015 // This library is distributed in the hope that it will be useful, but
00016 // WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 // Lesser General Public License for more details.
00019 //  
00020 // You should have received a copy of the GNU Lesser General Public
00021 // License along with this library; if not, write to the Free Software
00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00023 // USA
00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
00025 // 
00026 // ************************************************************************
00027 //@HEADER
00028 
00029 #ifndef __TSQR_CacheBlockingStrategy_hpp
00030 #define __TSQR_CacheBlockingStrategy_hpp
00031 
00032 #include <algorithm>
00033 #include <limits>
00034 #include <sstream>
00035 #include <stdexcept>
00036 #include <utility> // std::pair
00037 
00040 
00041 namespace TSQR {
00042 
00061   template<class LocalOrdinal, class Scalar>
00062   class CacheBlockingStrategy {
00063   public:
00107     CacheBlockingStrategy (const size_t cacheSizeHint = 0,
00108          const size_t sizeOfScalar = sizeof(Scalar)) :
00109       size_of_scalar_ (sizeOfScalar),
00110       cache_size_hint_ (default_cache_size_hint (cacheSizeHint, sizeOfScalar))
00111     {}
00112 
00114     CacheBlockingStrategy (const CacheBlockingStrategy& rhs) :
00115       size_of_scalar_ (rhs.size_of_scalar_),
00116       cache_size_hint_ (rhs.cache_size_hint())
00117     {}
00118 
00120     CacheBlockingStrategy& operator= (const CacheBlockingStrategy& rhs) {
00121       size_of_scalar_ = rhs.size_of_scalar_;
00122       cache_size_hint_ = rhs.cache_size_hint();
00123       return *this;
00124     }
00125 
00130     size_t TEUCHOS_DEPRECATED cache_block_size () const { 
00131       return cache_size_hint_; 
00132     }
00133 
00150     size_t cache_size_hint () const { return cache_size_hint_; }
00151 
00157     size_t size_of_scalar () const { return size_of_scalar_; }
00158 
00160     bool operator== (const CacheBlockingStrategy& rhs) const {
00161       return cache_size_hint() == rhs.cache_size_hint() && 
00162   size_of_scalar() == rhs.size_of_scalar();
00163     }
00164 
00166     bool operator!= (const CacheBlockingStrategy& rhs) const {
00167       return cache_size_hint() != rhs.cache_size_hint() ||
00168   size_of_scalar() != rhs.size_of_scalar();
00169     }
00170 
00184     LocalOrdinal
00185     cache_block_offset (const LocalOrdinal index,
00186       const LocalOrdinal nrows,
00187       const LocalOrdinal ncols,
00188       const LocalOrdinal nrows_cache_block,
00189       const bool contiguous_cache_blocks) const
00190     {
00191       // Suppress compiler warning for the unused argument.
00192       (void) nrows;
00193 
00194       const LocalOrdinal my_row_start = index * nrows_cache_block;
00195       if (contiguous_cache_blocks) 
00196   return my_row_start * ncols;
00197       else // the common case
00198   return my_row_start;
00199     }
00200 
00216     LocalOrdinal
00217     cache_block_stride (const LocalOrdinal index,
00218       const LocalOrdinal nrows,
00219       const LocalOrdinal ncols,
00220       const LocalOrdinal lda,
00221       const LocalOrdinal nrows_cache_block,
00222       const bool contiguous_cache_blocks) const
00223     {
00224       if (contiguous_cache_blocks)
00225   {
00226     std::pair<LocalOrdinal, LocalOrdinal> result = 
00227       cache_block (index, nrows, ncols, nrows_cache_block);
00228     return result.second; // Number of rows in the cache block
00229   }
00230       else
00231   return lda;
00232     }
00233 
00246     std::pair<LocalOrdinal, LocalOrdinal>
00247     cache_block (const LocalOrdinal index,
00248      const LocalOrdinal nrows,
00249      const LocalOrdinal ncols,
00250      const LocalOrdinal nrows_cache_block) const
00251     {
00252       // See the comments in num_cache_blocks() for an explanation how
00253       // the number of cache blocks is computed, so that no cache
00254       // block has fewer than ncols rows.
00255       const LocalOrdinal quotient = nrows / nrows_cache_block;
00256       const LocalOrdinal remainder = nrows - nrows_cache_block * quotient;
00257       LocalOrdinal my_row_start, my_nrows;
00258 
00259       my_row_start = index * nrows_cache_block;
00260       if (quotient == 0)
00261   { // There is only one cache block.
00262     if (index == 0)
00263       my_nrows = remainder;
00264     else
00265       my_nrows = 0; // Out-of-range block, therefore empty
00266   }
00267       else if (remainder < ncols)
00268   { // There are quotient cache blocks.
00269     if (index < 0)
00270       my_nrows = 0; // Out-of-range block, therefore empty
00271     else if (index < quotient - 1)
00272       my_nrows = nrows_cache_block;
00273     else if (index == quotient - 1)
00274       // The last cache block gets the leftover rows, so that no
00275       // cache block has fewer than ncols rows.
00276       my_nrows = nrows_cache_block + remainder;
00277     else
00278       my_nrows = 0; // Out-of-range block, therefore empty
00279   }
00280       else 
00281   { // There are quotient+1 cache blocks.
00282     if (index < 0)
00283       my_nrows = 0; // Out-of-range block, therefore empty
00284     else if (index < quotient)
00285       my_nrows = nrows_cache_block;
00286     else if (index == quotient)
00287       // The last cache block has the leftover rows, which are
00288       // >= ncols and < nrows_cache_block.
00289       my_nrows = remainder;
00290     else
00291       my_nrows = 0; // Out-of-range block, therefore empty
00292   }
00293       return std::make_pair (my_row_start, my_nrows);
00294     }
00295 
00321     std::vector<LocalOrdinal>
00322     cache_block_details (const LocalOrdinal index,
00323        const LocalOrdinal nrows,
00324        const LocalOrdinal ncols,
00325        const LocalOrdinal lda,
00326        const LocalOrdinal nrows_cache_block,
00327        const bool contiguous_cache_blocks) const
00328     {
00329       const std::pair<LocalOrdinal, LocalOrdinal> result = 
00330   cache_block (index, nrows, ncols, nrows_cache_block);
00331       const LocalOrdinal my_row_start = result.first;
00332       const LocalOrdinal my_nrows = result.second;
00333 
00334       const LocalOrdinal offset = 
00335   contiguous_cache_blocks ? my_row_start * ncols : my_row_start;
00336       const LocalOrdinal stride = 
00337   contiguous_cache_blocks ? my_nrows : lda;
00338 
00339       std::vector<LocalOrdinal> retval (4);
00340       retval[0] = my_row_start;
00341       retval[1] = my_nrows;
00342       retval[2] = offset;
00343       retval[3] = stride;
00344       return retval;
00345     }
00346 
00361     LocalOrdinal 
00362     num_cache_blocks (const LocalOrdinal nrows,
00363           const LocalOrdinal ncols,
00364           const LocalOrdinal nrows_cache_block) const
00365     {
00366       const LocalOrdinal quotient = nrows / nrows_cache_block; 
00367       const LocalOrdinal remainder = nrows - nrows_cache_block * quotient;
00368 
00369       if (quotient == 0)
00370   // If nrows < nrows_cache_block, then there is only one cache
00371   // block, which gets all the rows.
00372   return static_cast<LocalOrdinal>(1);
00373       else if (remainder < ncols)
00374   // Don't let the last cache block have fewer than ncols rows.
00375   // If it would, merge it with the cache block above it.
00376   return quotient;
00377       else
00378   // The last cache block has the leftover rows, which are >=
00379   // ncols and < nrows_cache_block.
00380   return quotient + 1;
00381     }
00382 
00401     LocalOrdinal
00402     top_block_split_nrows (const LocalOrdinal nrows,
00403          const LocalOrdinal ncols,
00404          const LocalOrdinal nrows_cache_block) const
00405     {
00406       // We want to partition the nrows by ncols matrix A into [A_top;
00407       // A_bot], where A_top has nrows_cache_block rows.  However, we
00408       // don't want A_bot to have less than ncols rows.  If it would,
00409       // then we partition A so that A_top has nrows rows and A_bot is
00410       // empty.
00411       if (nrows < nrows_cache_block + ncols)
00412   return nrows;
00413       else
00414   // Don't ask for a bigger cache block than there are rows in
00415   // the matrix left to process.
00416   return std::min (nrows_cache_block, nrows);
00417     }
00418 
00435     LocalOrdinal
00436     bottom_block_split_nrows (const LocalOrdinal nrows,
00437             const LocalOrdinal ncols,
00438             const LocalOrdinal nrows_cache_block) const
00439     {
00440       // We split off the bottom block using the same splitting as if
00441       // we had split off as many top blocks of nrows_cache_block rows
00442       // as permissible.  The last block may have fewer than
00443       // nrows_cache_block rows, but it may not have fewer than ncols
00444       // rows (since we don't want any cache block to have fewer rows
00445       // than columns).
00446       const LocalOrdinal quotient = nrows / nrows_cache_block;
00447       const LocalOrdinal remainder = nrows - quotient * nrows_cache_block;
00448 
00449       LocalOrdinal nrows_bottom;
00450       if (quotient == 0)
00451   nrows_bottom = remainder;
00452       else if (remainder < ncols)
00453   nrows_bottom = nrows_cache_block + remainder;
00454       else if (remainder >= ncols)
00455   nrows_bottom = remainder;
00456       else
00457   throw std::logic_error("Should never get here!");
00458       return nrows_bottom;
00459     }
00460 
00476     size_t 
00477     default_cache_size_hint (const size_t suggested_cache_size,
00478            const size_t sizeOfScalar) const
00479     {
00480       // This is a somewhat arbitrary minimum.  However, our TSQR
00481       // implementation was optimized for matrices with 20 or fewer
00482       // columns, and we expect matrices with 10 columns.  Thus, it's
00483       // reasonable to base the minimum on requiring that the cache
00484       // blocks for a matrix with 10 columns have no fewer rows than
00485       // columns.  We base the minimum on explicit_Q() with such a
00486       // matrix: a cache block of Q (10 x 10), a cache block of C (10
00487       // x 10), a TAU array (length 10), and the top block of C
00488       // (C_top) (10 x 10 in this case, and n x n in general for a
00489       // matrix with n columns).  In this bound, the cache blocks are
00490       // only square because of the requirement that they have no
00491       // fewer rows than columns; normally, cache blocks have many
00492       // more rows than columns.
00493       const size_t min_cache_size = sizeOfScalar * (3*10*10 + 10);
00494 
00495       // 64 KB is a reasonable guess for the L2 cache size.  If Scalar
00496       // is huge, min_cache_size above might be bigger, so we account
00497       // for that with a max.  The type cast is necessary so that the
00498       // compiler can decide which specialization of std::max to use
00499       // (the one for size_t, in this case, rather than the one for
00500       // int, which is the implicit type of the integer constant).
00501       const size_t default_cache_size = 
00502   std::max (min_cache_size, static_cast<size_t> (65536));
00503 
00504       // If the suggested cache size is less than the minimum, ignore
00505       // the suggestion and pick the minimum.
00506       return (suggested_cache_size == 0) ? 
00507   default_cache_size : std::max (suggested_cache_size, min_cache_size);
00508     }
00509 
00528     LocalOrdinal 
00529     cache_block_num_rows (const LocalOrdinal ncols) const
00530     {
00531       // Suppose the cache can hold W words (of size size_of_scalar_
00532       // bytes each).  We have to use the same number of rows per
00533       // cache block for both the factorization and applying the Q
00534       // factor.
00535       //
00536       // The factorization requires a working set of
00537       // ncols*(nrows_cache_block + ncols) + 2*ncols words:
00538       //
00539       // 1. One ncols by ncols R factor (not packed)
00540       // 2. One nrows_cache_block by ncols cache block
00541       // 3. tau array of length ncols
00542       // 4. workspace array of length ncols
00543       //
00544       // That means nrows_cache_block should be <= W/ncols - ncols - 2.
00545       //
00546       // Applying the Q factor to a matrix C with the same number of
00547       // columns as Q requires a working set of
00548       // 2*nrows_cache_block*ncols + ncols*ncols + 2*ncols
00549       //
00550       // 1. Cache block of Q: nrows_cache_block by ncols
00551       // 2. C_top block: ncols by ncols
00552       // 3. C_cur block: nrows_cache_block by ncols
00553       // 4. tau array of length ncols
00554       // 5. workspace array of length ncols
00555       // 
00556       // That means nrows_cache_block should be <= (W/(2*N) - N/2 -
00557       // 1).  Obviously this is smaller than for the factorization, so
00558       // we use this formula to pick nrows_cache_block.  It should also 
00559       // be at least ncols.
00560 
00561       const size_t W = cache_size_hint() / size_of_scalar_;
00562 
00563       // Compute everything in size_t first, and cast to LocalOrdinal
00564       // at the end.  This may avoid overflow if the cache is very
00565       // large and/or LocalOrdinal is very small (say a short int).
00566       //
00567       // Also, since size_t is unsigned, make sure that the
00568       // subtractions don't make it negative.  If it does, then either
00569       // ncols is too big or the cache is too small.
00570 
00571       const size_t term1 = W / (2*ncols);
00572       const size_t term2 = ncols / 2 + 1;
00573 
00574       if (term1 <= term2)
00575   {
00576     // The cache must be very small.  Just make the cache blocks
00577     // square.  That will be inefficient, but wil result in
00578     // correct behavior.
00579     return ncols;
00580   }
00581       else 
00582   {
00583     // The compiler can easily prove that term1 - term2 > 0,
00584     // since we've gotten to this point.  Of course that's
00585     // assuming that C++ compilers are smart...
00586     const size_t nrows_cache_block = 
00587       std::max (term1 - term2, static_cast<size_t>(ncols));
00588 
00589     // Make sure that nrows_cache_block fits in a LocalOrdinal
00590     // type.  We do so by casting the size_t to a LocalOrdinal
00591     // and then back into a size_t.  This should work in the
00592     // typical case of LocalOrdinal=int, and also whenever
00593     // LocalOrdinal's binary representation has no more bits
00594     // than that of size_t.
00595     const LocalOrdinal nrows_cache_block_as_lo = 
00596       static_cast<LocalOrdinal> (nrows_cache_block);
00597     if (static_cast<size_t>(nrows_cache_block_as_lo) != nrows_cache_block)
00598       {
00599         std::ostringstream os;
00600         os << "Error:  While deciding on the number of rows in a cache "
00601     "block for sequential TSQR, the decided-upon number of rows "
00602      << nrows_cache_block << " does not fit in a LocalOrdinal "
00603     "type, whose max value is " 
00604      << std::numeric_limits<LocalOrdinal>::max() << ".";
00605         throw std::range_error (os.str());
00606       }
00607     else
00608       return static_cast<LocalOrdinal> (nrows_cache_block);
00609   }
00610     }
00611 
00612   private:
00618     size_t size_of_scalar_;
00619 
00626     size_t cache_size_hint_;
00627 
00628   };
00629 } // namespace TSQR
00630 
00631 #endif // __TSQR_CacheBlockingStrategy_hpp
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends