Kokkos Node API and Local Linear Algebra Kernels Version of the Day
Tsqr_CombineBenchmark.hpp
00001 //@HEADER
00002 // ************************************************************************
00003 // 
00004 //          Kokkos: Node API and Parallel Node Kernels
00005 //              Copyright (2009) Sandia Corporation
00006 // 
00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
00008 // license for use of this work by or on behalf of the U.S. Government.
00009 // 
00010 // This library is free software; you can redistribute it and/or modify
00011 // it under the terms of the GNU Lesser General Public License as
00012 // published by the Free Software Foundation; either version 2.1 of the
00013 // License, or (at your option) any later version.
00014 //  
00015 // This library is distributed in the hope that it will be useful, but
00016 // WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 // Lesser General Public License for more details.
00019 //  
00020 // You should have received a copy of the GNU Lesser General Public
00021 // License along with this library; if not, write to the Free Software
00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00023 // USA
00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
00025 // 
00026 // ************************************************************************
00027 //@HEADER
00028 
00029 #ifndef __TSQR_Test_CombineBenchmark_hpp
00030 #define __TSQR_Test_CombineBenchmark_hpp
00031 
00032 #include <Tsqr_ConfigDefs.hpp>
00033 #include <Tsqr_CombineBenchmarker.hpp>
00034 #include <Tsqr_CombineDefault.hpp>
00035 #include <Tsqr_CombineNative.hpp>
00036 #ifdef HAVE_TSQR_FORTRAN
00037 #  include <Tsqr_CombineFortran.hpp>
00038 #endif // HAVE_TSQR_FORTRAN
00039 
00040 #include <algorithm>
00041 #include <iostream>
00042 #include <limits>
00043 #include <sstream>
00044 #include <stdexcept>
00045 #include <utility>
00046 #include <vector>
00047 
00048 
00049 namespace TSQR {
00050   namespace Test {
00051 
/// \struct CombineBenchmarkParameters
/// \brief Options for one run of the TSQR::Combine benchmark.
///
/// This is a plain aggregate: it has no constructor, so callers must
/// set every field before handing the struct to benchmarkCombine().
struct CombineBenchmarkParameters {
  // ---- Problem dimensions ----

  /// Number of rows of the cache block in the [R; A] benchmark.
  /// benchmarkCombine() requires numRows >= 1.
  int numRows;
  /// Number of columns, used by both the [R1; R2] and the [R; A]
  /// benchmarks.  benchmarkCombine() requires numCols >= 1.
  int numCols;

  // ---- Which scalar types to benchmark ----

  /// If true, benchmark Scalar = float and Scalar = double.
  bool testReal;
  /// If true, benchmark Scalar = complex<float> and complex<double>.
  /// This throws std::logic_error if HAVE_TSQR_COMPLEX is not defined.
  bool testComplex;

  // ---- Trial counts and timing ----

  /// Number of trials for both benchmarks when calibrate is false;
  /// in that case it must be >= 1.
  int numTrials;
  /// If true, the number of trials of each benchmark is chosen by a
  /// calibration run using CombineDefault, instead of numTrials.
  bool calibrate;
  /// If true, each reported timing is divided by its number of trials.
  bool averageTimings;

  // ---- Strict performance checking ----

  /// If true, throw std::runtime_error when a non-default Combine
  /// implementation is slower than CombineDefault by more than the
  /// factor given in allowance (cache block benchmark only).
  bool strictPerfTests;
  /// Largest permitted ratio (other implementation's cache block
  /// time) / (CombineDefault's cache block time).
  double allowance;

  // ---- Pseudorandom number generator seed ----

  /// Seed handed to the benchmarker.  When useSeedValues is false,
  /// benchmarkCombine() overwrites entries 0..3 with {0, 0, 0, 1}.
  /// Calibration stores the updated seed back here.
  std::vector<int> seed;
  /// If true, use the values already in seed; if false, use defaults.
  bool useSeedValues;

  // ---- Output formatting ----

  /// Extra field name(s), appended (after a comma) to the row of field
  /// names when printFieldNames is true and this string is nonempty.
  std::string additionalFieldNames;
  /// Extra data, appended (after a comma) to every row of benchmark
  /// results when this string is nonempty.
  std::string additionalData;
  /// If true, print a row of field names (starting with '%') before
  /// the benchmark results.
  bool printFieldNames;
  /// If true, print calibration and timer diagnostics to std::cerr.
  bool debug;
};
00122 
00123     template<class CombineType, class TimerType>
00124     static std::vector<double>
00125     benchmarkCombineType (std::ostream& out,
00126         std::vector<int>& iseed,
00127         const std::string& dataTypeName,
00128         const std::string& combineTypeName,
00129         const typename CombineType::ordinal_type numRows,
00130         const typename CombineType::ordinal_type numCols,
00131         const int cacheBlockNumTrials,
00132         const int pairNumTrials,
00133         const bool averageTimings,
00134         const std::string& additionalData)
00135     {
00136       using std::endl;
00137 
00138       typedef typename CombineType::ordinal_type ordinal_type;
00139       typedef typename CombineType::scalar_type scalar_type;
00140       typedef typename CombineType::magnitude_type magnitude_type;
00141       typedef CombineBenchmarker<ordinal_type, scalar_type, CombineType, TimerType> 
00142   benchmarker_type;
00143 
00144       TEST_FOR_EXCEPTION(cacheBlockNumTrials < 1, std::invalid_argument,
00145        "The number of trials for the cache block benchmark "
00146        "must be positive, but you specified cacheBlockNum"
00147        "Trials = " << cacheBlockNumTrials << ".");
00148       TEST_FOR_EXCEPTION(pairNumTrials < 1, std::invalid_argument,
00149        "The number of trials for the pair benchmark must be "
00150        "positive, but you specified pairNumTrials = "
00151        << pairNumTrials << ".");
00152 
00153       benchmarker_type b (iseed);
00154       std::pair<double, double> results;
00155       results.first = 
00156   b.benchmarkPair (numCols, pairNumTrials);
00157       results.second = 
00158   b.benchmarkCacheBlock (numRows, numCols, cacheBlockNumTrials);
00159 
00160       // Whether or not we should print the "additional data"
00161       // (originally supplied at command-line invocation of this
00162       // benchmark) after the benchmark results.  The additional data
00163       // option makes it easier to write parsers for benchmark
00164       // results, since we can include data that are known outside the
00165       // benchmark (when invoking the benchmark as an executable), but
00166       // not known (easily or at all) inside the benchmark.  A good
00167       // example would be environment variables, like OMP_NUM_THREADS,
00168       // or (for a benchmark that uses MPI, which this is not) the
00169       // number of MPI processes per node ("ppn").
00170       const bool printAdditionalData = (! additionalData.empty());
00171 
00172       const double pairTime = averageTimings ? 
00173   results.first / static_cast<double>(pairNumTrials) : 
00174   results.first;
00175       const double cacheBlockTime = averageTimings ? 
00176   results.second / static_cast<double>(cacheBlockNumTrials) : 
00177   results.second;
00178 
00179       out << combineTypeName 
00180     << "," << "R1R2"
00181     << "," << dataTypeName
00182     << "," << (2*numCols)
00183     << "," << numCols
00184     << "," << pairNumTrials
00185     << "," << pairTime;
00186       if (printAdditionalData)
00187   out << "," << additionalData;
00188       out << endl;
00189       out << combineTypeName 
00190     << "," << "RA"
00191     << "," << dataTypeName
00192     << "," << numRows
00193     << "," << numCols
00194     << "," << cacheBlockNumTrials
00195     << "," << cacheBlockTime;
00196       if (printAdditionalData)
00197   out << "," << additionalData;
00198       out << endl;
00199 
00200       std::vector<double> timings (2);
00201       timings[0] = pairTime;
00202       timings[1] = cacheBlockTime;
00203       return timings;
00204     }
00205 
00206     template<class Scalar, class TimerType>
00207     static void
00208     benchmarkAllCombineTypes (std::ostream& out,
00209             const std::string& dataTypeName,
00210             CombineBenchmarkParameters& params,
00211             const double timerResolution)
00212     {
00213       using std::cerr;
00214       using std::endl;
00215       const bool debug = params.debug;
00216       const int numRows = params.numRows;
00217       const int numCols = params.numCols;
00218 
00219       TEST_FOR_EXCEPTION(timerResolution <= static_cast<double>(0), 
00220        std::invalid_argument,
00221        "The timer resolution must be a positive number, "
00222        "but you specified timerResolution = " 
00223        << timerResolution << ".");
00224 
00225       // If no calibration is performed, then the number of trials is
00226       // the same for both the cache block [R; A] benchmark and the
00227       // pair [R1; R2] benchmark.  Otherwise, we calibrate the number
00228       // of trials for each separately.  This is because we expect the
00229       // [R1; R2] benchmark to take much less time than the [R; A]
00230       // benchmark, so [R1; R2] should have more trials, in order to
00231       // get comparable timing accuracy without requiring too many [R;
00232       // A] trials.
00233       int pairNumTrials = params.numTrials;
00234       int cacheBlockNumTrials = params.numTrials;
00235       if (params.calibrate)
00236   { // We calibrate the number of trials using the default
00237     // Combine implementation.  We don't expect CombineNative or
00238     // CombineFortran to be much faster than that.  
00239     if (debug)
00240       cerr << "Calibrating..." << endl;
00241 
00242     // Calibrater gets the timer resolution.
00243     typedef CombineDefault<int, Scalar> combine_type;
00244     typedef CombineBenchmarker<int, Scalar, combine_type, TimerType> 
00245       benchmarker_type;
00246     benchmarker_type c (timerResolution, params.seed);
00247 
00248     // Accuracy factor of 1000 gives us 3 digits of timer accuracy.
00249     const double accuracyFactor = static_cast<double> (1000);
00250 
00251     // Number of trials for factor_pair() and apply_pair().
00252     std::pair<int, double> result;
00253     result = c.calibratePair (numCols, accuracyFactor);
00254     if (debug)
00255       {
00256         cerr << "- Pair number of trials: " << result.first << endl;
00257         cerr << "- Pair calibration time: " << result.second << endl;
00258       }
00259     pairNumTrials = result.first;
00260 
00261     // Number of trials for factor_inner() and apply_inner().
00262     result = c.calibrateCacheBlock (numRows, numCols, accuracyFactor);
00263     if (debug)
00264       {
00265         cerr << "- Cache block number of trials: " << result.first << endl;
00266         cerr << "- Cache block calibration time: " << result.second << endl;
00267       }
00268     cacheBlockNumTrials = result.first;
00269 
00270     // Store the updated PRNG seed in the benchmark parameters.
00271     c.getSeed (params.seed);
00272   }
00273 
00274       // Always benchmark CombineDefault.  We use its timings as the
00275       // standard by which the other Combine implementations' timings
00276       // are compared.  The returned vector contains two timings: for
00277       // [R1; R2], and for [R; A], in that order.
00278       std::vector<double> defaultTimings;
00279       {
00280   typedef CombineDefault< int, Scalar > combine_type;
00281   std::string combineTypeName ("Default");
00282   defaultTimings = 
00283     benchmarkCombineType<combine_type, TimerType> (out, params.seed,
00284                dataTypeName, 
00285                combineTypeName, 
00286                numRows, 
00287                numCols, 
00288                cacheBlockNumTrials,
00289                pairNumTrials,
00290                params.averageTimings,
00291                params.additionalData);
00292       }
00293 
00294       // If we're doing strict performance tests, then CombineNative
00295       // (and CombineFortran, if applicable) may be no slower than the
00296       // given allowance factor times CombineDefault's time.  For now,
00297       // we only look at cache block performance, since that is where
00298       // most of the time should be going.
00299       std::vector<double> nativeTimings;
00300       {
00301   typedef CombineNative<int, Scalar> combine_type;
00302   std::string combineTypeName ("Native");
00303   nativeTimings = 
00304     benchmarkCombineType<combine_type, TimerType> (out, params.seed, 
00305                dataTypeName, 
00306                combineTypeName, 
00307                numRows, 
00308                numCols, 
00309                cacheBlockNumTrials,
00310                pairNumTrials,
00311                params.averageTimings,
00312                params.additionalData);
00313   const double slowdown = nativeTimings[1] / defaultTimings[1];
00314   const bool tooSlow = slowdown > params.allowance;
00315   // FIXME (mfh 24 May 2011) Replace std::runtime_error with a
00316   // more appropriately named exception.
00317   TEST_FOR_EXCEPTION(params.strictPerfTests && tooSlow, 
00318          std::runtime_error,
00319          "CombineNative is too slow!  For cache block "
00320          "benchmark with numRows=" << numRows << " and numCols="
00321          << numCols << ", CombineNative time (= " 
00322          << nativeTimings[1] << ") / CombineDefault time (= "
00323          << defaultTimings[1] << ") = " << slowdown 
00324          << " > the allowed fraction " << params.allowance 
00325          << ".");
00326       }
00327 
00328 #ifdef HAVE_TSQR_FORTRAN
00329       std::vector<double> fortranTimings;
00330       {
00331   typedef CombineFortran<Scalar> combine_type;
00332   std::string combineTypeName ("Fortran");
00333   fortranTimings = 
00334     benchmarkCombineType<combine_type, TimerType> (out, params.seed, 
00335                dataTypeName, 
00336                combineTypeName, 
00337                numRows, 
00338                numCols, 
00339                cacheBlockNumTrials,
00340                pairNumTrials,
00341                params.averageTimings,
00342                params.additionalData);
00343   const double slowdown = fortranTimings[1] / defaultTimings[1];
00344   const bool tooSlow = slowdown > params.allowance;
00345   // FIXME (mfh 24 May 2011) Replace std::runtime_error with a
00346   // more appropriately named exception.
00347   TEST_FOR_EXCEPTION(params.strictPerfTests && tooSlow, 
00348          std::runtime_error,
00349          "CombineFortran is too slow!  For cache block "
00350          "benchmark with numRows=" << numRows << " and numCols="
00351          << numCols << ", CombineFortran time (= " 
00352          << fortranTimings[1] << ") / CombineDefault time (= "
00353          << defaultTimings[1] << ") = " << slowdown 
00354          << " > the allowed fraction " << params.allowance 
00355          << ".");
00356       }
00357 #endif // HAVE_TSQR_FORTRAN
00358     }
00359 
00360 
00361     template<class TimerType>
00362     static void
00363     benchmarkAllCombineTypesAndScalars (std::ostream& out,
00364           CombineBenchmarkParameters& params)
00365     {
00366       using std::cerr;
00367       using std::endl;
00368       using std::string;
00369       const bool debug = params.debug;
00370 
00371       // Compute timer resolution.
00372       const double timerResolution = computeTimerResolution<TimerType> ();
00373       if (debug)
00374   cerr << "Timer resolution: " << timerResolution << " seconds" << endl;
00375 
00376       string dataTypeName;
00377       if (params.testReal)
00378   {
00379     dataTypeName = "float";
00380     benchmarkAllCombineTypes<float, TimerType> (out, dataTypeName, 
00381                   params, timerResolution);
00382     dataTypeName = "double";
00383     benchmarkAllCombineTypes<double, TimerType> (out, dataTypeName,
00384                    params, timerResolution);
00385   }
00386       if (params.testComplex)
00387   {
00388 #ifdef HAVE_TSQR_COMPLEX
00389     using std::complex;
00390 
00391     dataTypeName = "complex<float>";
00392     benchmarkAllCombineTypes<complex<float>, TimerType> (out, dataTypeName,
00393                      params, timerResolution);
00394     dataTypeName = "complex<double>";
00395     benchmarkAllCombineTypes<complex<double>, TimerType> (out, dataTypeName,
00396                 params, timerResolution);
00397 
00398 #else // Don't HAVE_TSQR_COMPLEX
00399     throw std::logic_error("TSQR not built with complex arithmetic support");
00400 #endif // HAVE_TSQR_COMPLEX
00401   }
00402     }         
00403 
00414     template<class TimerType>
00415     void
00416     benchmarkCombine (std::ostream& out,
00417           CombineBenchmarkParameters& params)
00418     {
00419       TEST_FOR_EXCEPTION(params.numRows < 1 || params.numCols < 1, 
00420        std::invalid_argument,
00421        "The test matrix must have a positive number of rows "
00422        "and columns, but you specified numRows = " 
00423        << params.numRows << " and numCols = "
00424        << params.numCols << ".");
00425       TEST_FOR_EXCEPTION(! params.calibrate && params.numTrials < 1, 
00426        std::invalid_argument,
00427        "Since you specified no calibration is to be performed, "
00428        "the number of trials must be positive, but you specified "
00429        "numTrials = " << params.numTrials << ".");
00430 
00431       if (! params.useSeedValues)
00432   { // Fill in default seed values.
00433     if (params.seed.size() < 4)
00434       params.seed.resize (4);
00435     params.seed[0] = 0;
00436     params.seed[1] = 0;
00437     params.seed[2] = 0;
00438     params.seed[3] = 1;
00439   }
00440 
00441       if (params.printFieldNames)
00442   {
00443     // The row of field names begins with a '%' character, in
00444     // order to help out the benchmark results parser.
00445     out << "%" << "method"
00446         << "," << "kernel"
00447         << "," << "scalarType"
00448         << "," << "numRows"
00449         << "," << "numCols"
00450         << "," << "numTrials"
00451         << "," << "timing";
00452     if (params.printFieldNames && ! params.additionalFieldNames.empty())
00453       // The additionalFieldNames string should be a
00454       // comma-delimited list of additional field name(s).
00455       out << "," << params.additionalFieldNames;
00456     out << std::endl;
00457   }
00458       benchmarkAllCombineTypesAndScalars<TimerType> (out, params);
00459     }
00460 
00461   } // namespace Test
00462 } // namespace TSQR
00463 
00464 #endif // __TSQR_Test_CombineBenchmark_hpp
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends