Anasazi Version of the Day
Tsqr_CacheBlockingStrategy.hpp
00001 // @HEADER
00002 // ***********************************************************************
00003 //
00004 //                 Anasazi: Block Eigensolvers Package
00005 //                 Copyright (2010) Sandia Corporation
00006 //
00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
00008 // license for use of this work by or on behalf of the U.S. Government.
00009 //
00010 // This library is free software; you can redistribute it and/or modify
00011 // it under the terms of the GNU Lesser General Public License as
00012 // published by the Free Software Foundation; either version 2.1 of the
00013 // License, or (at your option) any later version.
00014 //
00015 // This library is distributed in the hope that it will be useful, but
00016 // WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 // Lesser General Public License for more details.
00019 //
00020 // You should have received a copy of the GNU Lesser General Public
00021 // License along with this library; if not, write to the Free Software
00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00023 // USA
00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
00025 //
00026 // ***********************************************************************
00027 // @HEADER
00028 
00029 #ifndef __TSQR_CacheBlockingStrategy_hpp
00030 #define __TSQR_CacheBlockingStrategy_hpp
00031 
00032 #include <algorithm>
00033 #include <limits>
00034 #include <sstream>
00035 #include <stdexcept>
00036 #include <utility> // std::pair
00037 
00040 
00041 namespace TSQR {
00042 
00047   template< class LocalOrdinal, class Scalar >
00048   class CacheBlockingStrategy {
00049   public:
00054     CacheBlockingStrategy (const size_t cacheBlockSize = 0) :
00055       cache_block_size_ (default_cache_block_size (cacheBlockSize))
00056     {}
00057 
00061     CacheBlockingStrategy (const CacheBlockingStrategy& rhs) :
00062       cache_block_size_ (rhs.cache_block_size()) 
00063     {}
00064 
00068     CacheBlockingStrategy& operator= (const CacheBlockingStrategy& rhs) {
00069       cache_block_size_ = rhs.cache_block_size();
00070       return *this;
00071     }
00072 
00076     size_t cache_block_size () const { return cache_block_size_; }
00077 
00082     bool operator== (const CacheBlockingStrategy& rhs) const {
00083       return (cache_block_size() == rhs.cache_block_size());
00084     }
00085 
00090     bool operator!= (const CacheBlockingStrategy& rhs) const {
00091       return (cache_block_size() != rhs.cache_block_size());
00092     }
00093 
00098     std::pair< LocalOrdinal, LocalOrdinal >
00099     cache_block (const LocalOrdinal index,
00100      const LocalOrdinal nrows,
00101      const LocalOrdinal ncols,
00102      const LocalOrdinal nrows_cache_block) const
00103     {
00104       const LocalOrdinal nblocks = nrows / nrows_cache_block;
00105       const LocalOrdinal remainder = nrows - nblocks * nrows_cache_block;
00106       LocalOrdinal my_row_start, my_nrows;
00107 
00108       my_row_start = index * nrows_cache_block;
00109       if (index < nblocks - 1)
00110   my_nrows = nrows_cache_block;
00111       else if (index == nblocks - 1)
00112   {
00113     if (remainder > 0 && remainder < ncols)
00114       my_nrows = nrows_cache_block + remainder;
00115     else
00116       my_nrows = nrows_cache_block;
00117   }
00118       else if (index == nblocks)
00119   {
00120     if (remainder >= ncols)
00121       my_nrows = remainder;
00122     else 
00123       my_nrows = 0;
00124   }
00125       else 
00126   my_nrows = 0;
00127       
00128       return std::make_pair (my_row_start, my_nrows);
00129     }
00130 
00135     LocalOrdinal 
00136     num_cache_blocks (const LocalOrdinal nrows,
00137           const LocalOrdinal ncols,
00138           const LocalOrdinal nrows_cache_block) const
00139     {
00140       const LocalOrdinal quotient = nrows / nrows_cache_block; 
00141       const LocalOrdinal remainder = nrows - nrows_cache_block * quotient;
00142       const LocalOrdinal nblocks = (0 < remainder && remainder < ncols) ? (quotient+1) : quotient;
00143 
00144       return nblocks;
00145     }
00146 
00153     LocalOrdinal
00154     top_block_split_nrows (const LocalOrdinal nrows,
00155          const LocalOrdinal ncols,
00156          const LocalOrdinal nrows_cache_block) const
00157     {
00158       // We want to partition the nrows by ncols matrix A into [A_top;
00159       // A_bot], where A_top has nrows_cache_block rows.  However, we
00160       // don't want A_bot to have less than ncols rows.  If it would,
00161       // then we partition A so that A_top has nrows rows and A_bot is
00162       // empty.
00163       if (nrows < nrows_cache_block + ncols)
00164   return nrows;
00165       else
00166   // Don't ask for a bigger cache block than there are rows in
00167   // the matrix left to process.
00168   return std::min (nrows_cache_block, nrows);
00169     }
00170 
00171 
00178     LocalOrdinal
00179     bottom_block_split_nrows (const LocalOrdinal nrows,
00180             const LocalOrdinal ncols,
00181             const LocalOrdinal nrows_cache_block) const
00182     {
00183       // We want to split off the bottom block using the same
00184       // splitting as if we had split off as many top blocks of
00185       // nrows_cache_block rows as permissible.  The last block may
00186       // have fewer than nrows_cache_block rows, but it may not have
00187       // fewer than ncols rows (since we don't want any cache block to
00188       // have fewer rows than columns).
00189 
00190       if (nrows < nrows_cache_block + ncols)
00191   {
00192     // The nrows by ncols matrix A is just big enough for one
00193     // cache block.  Partition A == [A_top; A_bot] with A_top
00194     // empty and A_bot == A.
00195     return nrows;
00196   }
00197       else
00198   {
00199     const LocalOrdinal quotient = nrows / nrows_cache_block;
00200     const LocalOrdinal remainder = nrows - quotient * nrows_cache_block;
00201 
00202     LocalOrdinal nrows_bottom;
00203     if (remainder == 0 || remainder < ncols)
00204       nrows_bottom = nrows_cache_block + remainder;
00205     else if (remainder >= ncols)
00206       nrows_bottom = remainder;
00207     else
00208       throw std::logic_error("Should never get here!");
00209     return nrows_bottom;
00210   }
00211     }
00212 
00213 
00221     size_t 
00222     default_cache_block_size (const size_t suggested_cache_size) const
00223     {
00224       if (suggested_cache_size == 0)
00225   {
00226     // 64 KB: reasonable default upper bound for L2 cache
00227     // capacity.  
00228     const size_t default_cache_size = 65536;
00229     const size_t default_nwords = default_cache_size / sizeof(Scalar);
00230 
00231     // We want to be able to work on two blocks of size at least
00232     // 4x4 in cache.  Otherwise TSQR is more or less pointless.
00233     // If we can't do that, bail out; it's likely that Scalar is
00234     // the wrong data type for TSQR.
00235     if (default_nwords < 32)
00236       {
00237         std::ostringstream os;
00238         os << "Error: sizeof(Scalar) == " << sizeof(Scalar) 
00239      << ", which is too large for us to pick a reasonable "
00240      << "default cache size.";
00241         throw std::range_error (os.str());
00242       }
00243     else
00244       return default_cache_size;
00245   }
00246       else
00247   {
00248     const size_t nwords = suggested_cache_size / sizeof(Scalar);
00249 
00250     // We want to be able to work on two blocks of size at least
00251     // 4x4 in cache.  Otherwise TSQR is more or less pointless.
00252     // If we can't do that, bail out; it's likely that Scalar is
00253     // the wrong data type for TSQR.
00254     if (nwords < 32)
00255       {
00256         std::ostringstream os;
00257         os << "Error: suggested cache size " << suggested_cache_size
00258      << " bytes is too small for sizeof(Scalar) == " 
00259      << sizeof(Scalar) << " bytes";
00260         throw std::invalid_argument (os.str());
00261       }
00262     else
00263       return suggested_cache_size;
00264   }
00265     }
00266 
00267 
00278     LocalOrdinal 
00279     cache_block_num_rows (const LocalOrdinal ncols) const
00280     {
00281       // Suppose the cache can hold W words (of size sizeof(Scalar)
00282       // bytes each).  We have to use the same number of rows per
00283       // cache block for both the factorization and applying the Q
00284       // factor.
00285       //
00286       // The factorization requires a working set of
00287       // ncols*(nrows_cache_block + ncols) + 2*ncols words:
00288       //
00289       // 1. One ncols by ncols R factor (not packed)
00290       // 2. One nrows_cache_block by ncols cache block
00291       // 3. tau array of length ncols
00292       // 4. workspace array of length ncols
00293       //
00294       // That means nrows_cache_block should be <= W/ncols - ncols - 2.
00295       //
00296       // Applying the Q factor to a matrix C with the same number of
00297       // columns as Q requires a working set of
00298       // 2*nrows_cache_block*ncols + ncols*ncols + 2*ncols
00299       //
00300       // 1. Cache block of Q: nrows_cache_block by ncols
00301       // 2. C_top block: ncols by ncols
00302       // 3. C_cur block: nrows_cache_block by ncols
00303       // 4. tau array of length ncols
00304       // 5. workspace array of length ncols
00305       // 
00306       // That means nrows_cache_block should be <= (W/(2*N) - N/2 -
00307       // 1).  Obviously this is smaller than for the factorization, so
00308       // we use this formula to pick nrows_cache_block.
00309 
00310       const size_t cache_block_nwords = cache_block_size() / sizeof(Scalar);
00311 
00312       // Compute everything in size_t first, and cast to LocalOrdinal
00313       // at the end.  This may avoid overflow if the cache is very
00314       // large and/or LocalOrdinal is very small (say a short int).
00315       //
00316       // Also, since size_t is unsigned, make sure that the
00317       // subtractions don't make it negative.  If it does, then either
00318       // ncols is too big or the cache is too small.
00319 
00320       const size_t term1 = cache_block_nwords / (2*ncols);
00321       const size_t term2 = ncols / 2 + 1;
00322       if (term1 < term2)
00323   {
00324     std::ostringstream os;
00325     os << "Error:  While deciding on the number of rows in a cache "
00326       "block for sequential TSQR, the specified number of columns " 
00327        << ncols << " in the matrix to factor is too big for the "
00328       "specified cache size, which can hold " << cache_block_nwords 
00329        << " words of size sizeof(Scalar) == " << sizeof(Scalar) 
00330        << " each.";
00331     throw std::invalid_argument (os.str());
00332   }
00333       else 
00334   {
00335     // The compiler can easily prove that term1 - term2 >= 0,
00336     // since we've gotten to this point.  Of course that's
00337     // assuming that C++ compilers are smart...
00338     const size_t nrows_cache_block = term1 - term2;
00339 
00340     // Make sure that nrows_cache_block fits in a LocalOrdinal type.
00341     if (static_cast<size_t>(static_cast< LocalOrdinal >(nrows_cache_block)) != nrows_cache_block)
00342       {
00343         std::ostringstream os;
00344         os << "Error:  While deciding on the number of rows in a cache "
00345     "block for sequential TSQR, the decided-upon number of rows "
00346      << nrows_cache_block << " does not fit in a LocalOrdinal "
00347     "type, whose max value is " 
00348      << std::numeric_limits<LocalOrdinal>::max() << ".";
00349         throw std::range_error (os.str());
00350       }
00351     else
00352       return static_cast< LocalOrdinal > (nrows_cache_block);
00353   }
00354     }
00355 
00356   private:
00358     size_t cache_block_size_;
00359   };
00360 } // namespace TSQR
00361 
00362 #endif // __TSQR_CacheBlockingStrategy_hpp
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends