Kokkos Node API and Local Linear Algebra Kernels Version of the Day
Tsqr_CacheBlockingStrategy.hpp
00001 //@HEADER
00002 // ************************************************************************
00003 // 
00004 //          Kokkos: Node API and Parallel Node Kernels
00005 //              Copyright (2008) Sandia Corporation
00006 // 
00007 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
00008 // the U.S. Government retains certain rights in this software.
00009 // 
00010 // Redistribution and use in source and binary forms, with or without
00011 // modification, are permitted provided that the following conditions are
00012 // met:
00013 //
00014 // 1. Redistributions of source code must retain the above copyright
00015 // notice, this list of conditions and the following disclaimer.
00016 //
00017 // 2. Redistributions in binary form must reproduce the above copyright
00018 // notice, this list of conditions and the following disclaimer in the
00019 // documentation and/or other materials provided with the distribution.
00020 //
00021 // 3. Neither the name of the Corporation nor the names of the
00022 // contributors may be used to endorse or promote products derived from
00023 // this software without specific prior written permission.
00024 //
00025 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
00026 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00027 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00028 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
00029 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00030 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00031 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00032 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00033 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00034 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00035 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00036 //
00037 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
00038 // 
00039 // ************************************************************************
00040 //@HEADER
00041 
00042 #ifndef __TSQR_CacheBlockingStrategy_hpp
00043 #define __TSQR_CacheBlockingStrategy_hpp
00044 
00045 #include <algorithm>
00046 #include <limits>
00047 #include <sstream>
00048 #include <stdexcept>
00049 #include <utility> // std::pair
00050 
00051 
00052 namespace TSQR {
00053 
00072   template<class LocalOrdinal, class Scalar>
00073   class CacheBlockingStrategy {
00074   public:
00118     CacheBlockingStrategy (const size_t cacheSizeHint = 0,
00119          const size_t sizeOfScalar = sizeof(Scalar)) :
00120       size_of_scalar_ (sizeOfScalar),
00121       cache_size_hint_ (default_cache_size_hint (cacheSizeHint, sizeOfScalar))
00122     {}
00123 
00125     CacheBlockingStrategy (const CacheBlockingStrategy& rhs) :
00126       size_of_scalar_ (rhs.size_of_scalar_),
00127       cache_size_hint_ (rhs.cache_size_hint())
00128     {}
00129 
00131     CacheBlockingStrategy& operator= (const CacheBlockingStrategy& rhs) {
00132       size_of_scalar_ = rhs.size_of_scalar_;
00133       cache_size_hint_ = rhs.cache_size_hint();
00134       return *this;
00135     }
00136 
00141     size_t TEUCHOS_DEPRECATED cache_block_size () const { 
00142       return cache_size_hint_; 
00143     }
00144 
00161     size_t cache_size_hint () const { return cache_size_hint_; }
00162 
00168     size_t size_of_scalar () const { return size_of_scalar_; }
00169 
00171     bool operator== (const CacheBlockingStrategy& rhs) const {
00172       return cache_size_hint() == rhs.cache_size_hint() && 
00173   size_of_scalar() == rhs.size_of_scalar();
00174     }
00175 
00177     bool operator!= (const CacheBlockingStrategy& rhs) const {
00178       return cache_size_hint() != rhs.cache_size_hint() ||
00179   size_of_scalar() != rhs.size_of_scalar();
00180     }
00181 
00195     LocalOrdinal
00196     cache_block_offset (const LocalOrdinal index,
00197       const LocalOrdinal nrows,
00198       const LocalOrdinal ncols,
00199       const LocalOrdinal nrows_cache_block,
00200       const bool contiguous_cache_blocks) const
00201     {
00202       // Suppress compiler warning for the unused argument.
00203       (void) nrows;
00204 
00205       const LocalOrdinal my_row_start = index * nrows_cache_block;
00206       if (contiguous_cache_blocks) 
00207   return my_row_start * ncols;
00208       else // the common case
00209   return my_row_start;
00210     }
00211 
00227     LocalOrdinal
00228     cache_block_stride (const LocalOrdinal index,
00229       const LocalOrdinal nrows,
00230       const LocalOrdinal ncols,
00231       const LocalOrdinal lda,
00232       const LocalOrdinal nrows_cache_block,
00233       const bool contiguous_cache_blocks) const
00234     {
00235       if (contiguous_cache_blocks)
00236   {
00237     std::pair<LocalOrdinal, LocalOrdinal> result = 
00238       cache_block (index, nrows, ncols, nrows_cache_block);
00239     return result.second; // Number of rows in the cache block
00240   }
00241       else
00242   return lda;
00243     }
00244 
00257     std::pair<LocalOrdinal, LocalOrdinal>
00258     cache_block (const LocalOrdinal index,
00259      const LocalOrdinal nrows,
00260      const LocalOrdinal ncols,
00261      const LocalOrdinal nrows_cache_block) const
00262     {
00263       // See the comments in num_cache_blocks() for an explanation how
00264       // the number of cache blocks is computed, so that no cache
00265       // block has fewer than ncols rows.
00266       const LocalOrdinal quotient = nrows / nrows_cache_block;
00267       const LocalOrdinal remainder = nrows - nrows_cache_block * quotient;
00268       LocalOrdinal my_row_start, my_nrows;
00269 
00270       my_row_start = index * nrows_cache_block;
00271       if (quotient == 0)
00272   { // There is only one cache block.
00273     if (index == 0)
00274       my_nrows = remainder;
00275     else
00276       my_nrows = 0; // Out-of-range block, therefore empty
00277   }
00278       else if (remainder < ncols)
00279   { // There are quotient cache blocks.
00280     if (index < 0)
00281       my_nrows = 0; // Out-of-range block, therefore empty
00282     else if (index < quotient - 1)
00283       my_nrows = nrows_cache_block;
00284     else if (index == quotient - 1)
00285       // The last cache block gets the leftover rows, so that no
00286       // cache block has fewer than ncols rows.
00287       my_nrows = nrows_cache_block + remainder;
00288     else
00289       my_nrows = 0; // Out-of-range block, therefore empty
00290   }
00291       else 
00292   { // There are quotient+1 cache blocks.
00293     if (index < 0)
00294       my_nrows = 0; // Out-of-range block, therefore empty
00295     else if (index < quotient)
00296       my_nrows = nrows_cache_block;
00297     else if (index == quotient)
00298       // The last cache block has the leftover rows, which are
00299       // >= ncols and < nrows_cache_block.
00300       my_nrows = remainder;
00301     else
00302       my_nrows = 0; // Out-of-range block, therefore empty
00303   }
00304       return std::make_pair (my_row_start, my_nrows);
00305     }
00306 
00332     std::vector<LocalOrdinal>
00333     cache_block_details (const LocalOrdinal index,
00334        const LocalOrdinal nrows,
00335        const LocalOrdinal ncols,
00336        const LocalOrdinal lda,
00337        const LocalOrdinal nrows_cache_block,
00338        const bool contiguous_cache_blocks) const
00339     {
00340       const std::pair<LocalOrdinal, LocalOrdinal> result = 
00341   cache_block (index, nrows, ncols, nrows_cache_block);
00342       const LocalOrdinal my_row_start = result.first;
00343       const LocalOrdinal my_nrows = result.second;
00344 
00345       const LocalOrdinal offset = 
00346   contiguous_cache_blocks ? my_row_start * ncols : my_row_start;
00347       const LocalOrdinal stride = 
00348   contiguous_cache_blocks ? my_nrows : lda;
00349 
00350       std::vector<LocalOrdinal> retval (4);
00351       retval[0] = my_row_start;
00352       retval[1] = my_nrows;
00353       retval[2] = offset;
00354       retval[3] = stride;
00355       return retval;
00356     }
00357 
00372     LocalOrdinal 
00373     num_cache_blocks (const LocalOrdinal nrows,
00374           const LocalOrdinal ncols,
00375           const LocalOrdinal nrows_cache_block) const
00376     {
00377       const LocalOrdinal quotient = nrows / nrows_cache_block; 
00378       const LocalOrdinal remainder = nrows - nrows_cache_block * quotient;
00379 
00380       if (quotient == 0)
00381   // If nrows < nrows_cache_block, then there is only one cache
00382   // block, which gets all the rows.
00383   return static_cast<LocalOrdinal>(1);
00384       else if (remainder < ncols)
00385   // Don't let the last cache block have fewer than ncols rows.
00386   // If it would, merge it with the cache block above it.
00387   return quotient;
00388       else
00389   // The last cache block has the leftover rows, which are >=
00390   // ncols and < nrows_cache_block.
00391   return quotient + 1;
00392     }
00393 
00412     LocalOrdinal
00413     top_block_split_nrows (const LocalOrdinal nrows,
00414          const LocalOrdinal ncols,
00415          const LocalOrdinal nrows_cache_block) const
00416     {
00417       // We want to partition the nrows by ncols matrix A into [A_top;
00418       // A_bot], where A_top has nrows_cache_block rows.  However, we
00419       // don't want A_bot to have less than ncols rows.  If it would,
00420       // then we partition A so that A_top has nrows rows and A_bot is
00421       // empty.
00422       if (nrows < nrows_cache_block + ncols)
00423   return nrows;
00424       else
00425   // Don't ask for a bigger cache block than there are rows in
00426   // the matrix left to process.
00427   return std::min (nrows_cache_block, nrows);
00428     }
00429 
00446     LocalOrdinal
00447     bottom_block_split_nrows (const LocalOrdinal nrows,
00448             const LocalOrdinal ncols,
00449             const LocalOrdinal nrows_cache_block) const
00450     {
00451       // We split off the bottom block using the same splitting as if
00452       // we had split off as many top blocks of nrows_cache_block rows
00453       // as permissible.  The last block may have fewer than
00454       // nrows_cache_block rows, but it may not have fewer than ncols
00455       // rows (since we don't want any cache block to have fewer rows
00456       // than columns).
00457       const LocalOrdinal quotient = nrows / nrows_cache_block;
00458       const LocalOrdinal remainder = nrows - quotient * nrows_cache_block;
00459 
00460       LocalOrdinal nrows_bottom;
00461       if (quotient == 0)
00462   nrows_bottom = remainder;
00463       else if (remainder < ncols)
00464   nrows_bottom = nrows_cache_block + remainder;
00465       else if (remainder >= ncols)
00466   nrows_bottom = remainder;
00467       else
00468   throw std::logic_error("Should never get here!");
00469       return nrows_bottom;
00470     }
00471 
00487     size_t 
00488     default_cache_size_hint (const size_t suggested_cache_size,
00489            const size_t sizeOfScalar) const
00490     {
00491       // This is a somewhat arbitrary minimum.  However, our TSQR
00492       // implementation was optimized for matrices with 20 or fewer
00493       // columns, and we expect matrices with 10 columns.  Thus, it's
00494       // reasonable to base the minimum on requiring that the cache
00495       // blocks for a matrix with 10 columns have no fewer rows than
00496       // columns.  We base the minimum on explicit_Q() with such a
00497       // matrix: a cache block of Q (10 x 10), a cache block of C (10
00498       // x 10), a TAU array (length 10), and the top block of C
00499       // (C_top) (10 x 10 in this case, and n x n in general for a
00500       // matrix with n columns).  In this bound, the cache blocks are
00501       // only square because of the requirement that they have no
00502       // fewer rows than columns; normally, cache blocks have many
00503       // more rows than columns.
00504       const size_t min_cache_size = sizeOfScalar * (3*10*10 + 10);
00505 
00506       // 64 KB is a reasonable guess for the L2 cache size.  If Scalar
00507       // is huge, min_cache_size above might be bigger, so we account
00508       // for that with a max.  The type cast is necessary so that the
00509       // compiler can decide which specialization of std::max to use
00510       // (the one for size_t, in this case, rather than the one for
00511       // int, which is the implicit type of the integer constant).
00512       const size_t default_cache_size = 
00513   std::max (min_cache_size, static_cast<size_t> (65536));
00514 
00515       // If the suggested cache size is less than the minimum, ignore
00516       // the suggestion and pick the minimum.
00517       return (suggested_cache_size == 0) ? 
00518   default_cache_size : std::max (suggested_cache_size, min_cache_size);
00519     }
00520 
00539     LocalOrdinal 
00540     cache_block_num_rows (const LocalOrdinal ncols) const
00541     {
00542       // Suppose the cache can hold W words (of size size_of_scalar_
00543       // bytes each).  We have to use the same number of rows per
00544       // cache block for both the factorization and applying the Q
00545       // factor.
00546       //
00547       // The factorization requires a working set of
00548       // ncols*(nrows_cache_block + ncols) + 2*ncols words:
00549       //
00550       // 1. One ncols by ncols R factor (not packed)
00551       // 2. One nrows_cache_block by ncols cache block
00552       // 3. tau array of length ncols
00553       // 4. workspace array of length ncols
00554       //
00555       // That means nrows_cache_block should be <= W/ncols - ncols - 2.
00556       //
00557       // Applying the Q factor to a matrix C with the same number of
00558       // columns as Q requires a working set of
00559       // 2*nrows_cache_block*ncols + ncols*ncols + 2*ncols
00560       //
00561       // 1. Cache block of Q: nrows_cache_block by ncols
00562       // 2. C_top block: ncols by ncols
00563       // 3. C_cur block: nrows_cache_block by ncols
00564       // 4. tau array of length ncols
00565       // 5. workspace array of length ncols
00566       // 
00567       // That means nrows_cache_block should be <= (W/(2*N) - N/2 -
00568       // 1).  Obviously this is smaller than for the factorization, so
00569       // we use this formula to pick nrows_cache_block.  It should also 
00570       // be at least ncols.
00571 
00572       const size_t W = cache_size_hint() / size_of_scalar_;
00573 
00574       // Compute everything in size_t first, and cast to LocalOrdinal
00575       // at the end.  This may avoid overflow if the cache is very
00576       // large and/or LocalOrdinal is very small (say a short int).
00577       //
00578       // Also, since size_t is unsigned, make sure that the
00579       // subtractions don't make it negative.  If it does, then either
00580       // ncols is too big or the cache is too small.
00581 
00582       const size_t term1 = W / (2*ncols);
00583       const size_t term2 = ncols / 2 + 1;
00584 
00585       if (term1 <= term2)
00586   {
00587     // The cache must be very small.  Just make the cache blocks
00588     // square.  That will be inefficient, but wil result in
00589     // correct behavior.
00590     return ncols;
00591   }
00592       else 
00593   {
00594     // The compiler can easily prove that term1 - term2 > 0,
00595     // since we've gotten to this point.  Of course that's
00596     // assuming that C++ compilers are smart...
00597     const size_t nrows_cache_block = 
00598       std::max (term1 - term2, static_cast<size_t>(ncols));
00599 
00600     // Make sure that nrows_cache_block fits in a LocalOrdinal
00601     // type.  We do so by casting the size_t to a LocalOrdinal
00602     // and then back into a size_t.  This should work in the
00603     // typical case of LocalOrdinal=int, and also whenever
00604     // LocalOrdinal's binary representation has no more bits
00605     // than that of size_t.
00606     const LocalOrdinal nrows_cache_block_as_lo = 
00607       static_cast<LocalOrdinal> (nrows_cache_block);
00608     if (static_cast<size_t>(nrows_cache_block_as_lo) != nrows_cache_block)
00609       {
00610         std::ostringstream os;
00611         os << "Error:  While deciding on the number of rows in a cache "
00612     "block for sequential TSQR, the decided-upon number of rows "
00613      << nrows_cache_block << " does not fit in a LocalOrdinal "
00614     "type, whose max value is " 
00615      << std::numeric_limits<LocalOrdinal>::max() << ".";
00616         throw std::range_error (os.str());
00617       }
00618     else
00619       return static_cast<LocalOrdinal> (nrows_cache_block);
00620   }
00621     }
00622 
00623   private:
00629     size_t size_of_scalar_;
00630 
00637     size_t cache_size_hint_;
00638 
00639   };
00640 } // namespace TSQR
00641 
00642 #endif // __TSQR_CacheBlockingStrategy_hpp
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends