Kokkos Node API and Local Linear Algebra Kernels Version of the Day
Tsqr_CombineBenchmark.hpp
00001 //@HEADER
00002 // ************************************************************************
00003 // 
00004 //          Kokkos: Node API and Parallel Node Kernels
00005 //              Copyright (2008) Sandia Corporation
00006 // 
00007 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
00008 // the U.S. Government retains certain rights in this software.
00009 // 
00010 // Redistribution and use in source and binary forms, with or without
00011 // modification, are permitted provided that the following conditions are
00012 // met:
00013 //
00014 // 1. Redistributions of source code must retain the above copyright
00015 // notice, this list of conditions and the following disclaimer.
00016 //
00017 // 2. Redistributions in binary form must reproduce the above copyright
00018 // notice, this list of conditions and the following disclaimer in the
00019 // documentation and/or other materials provided with the distribution.
00020 //
00021 // 3. Neither the name of the Corporation nor the names of the
00022 // contributors may be used to endorse or promote products derived from
00023 // this software without specific prior written permission.
00024 //
00025 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
00026 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00027 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00028 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
00029 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00030 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00031 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00032 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00033 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00034 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00035 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00036 //
00037 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
00038 // 
00039 // ************************************************************************
00040 //@HEADER
00041 
00042 #ifndef __TSQR_Test_CombineBenchmark_hpp
00043 #define __TSQR_Test_CombineBenchmark_hpp
00044 
00045 #include <Tsqr_ConfigDefs.hpp>
00046 #include <Tsqr_CombineBenchmarker.hpp>
00047 #include <Tsqr_CombineDefault.hpp>
00048 #include <Tsqr_CombineNative.hpp>
00049 #ifdef HAVE_TSQR_FORTRAN
00050 #  include <Tsqr_CombineFortran.hpp>
00051 #endif // HAVE_TSQR_FORTRAN
00052 
00053 #include <algorithm>
00054 #include <iostream>
00055 #include <limits>
00056 #include <sstream>
00057 #include <stdexcept>
00058 #include <utility>
00059 #include <vector>
00060 
00061 
00062 namespace TSQR {
00063   namespace Test {
00064 
00118     struct CombineBenchmarkParameters {
00119       int numRows;
00120       int numCols;
00121       bool testReal;
00122       bool testComplex;
00123       int numTrials;
00124       bool calibrate;
00125       bool averageTimings;
00126       bool strictPerfTests;
00127       double allowance;
00128       std::vector<int> seed;
00129       bool useSeedValues;
00130       std::string additionalFieldNames;
00131       std::string additionalData;
00132       bool printFieldNames;
00133       bool debug;
00134     };
00135 
00136     template<class CombineType, class TimerType>
00137     static std::vector<double>
00138     benchmarkCombineType (std::ostream& out,
00139         std::vector<int>& iseed,
00140         const std::string& dataTypeName,
00141         const std::string& combineTypeName,
00142         const typename CombineType::ordinal_type numRows,
00143         const typename CombineType::ordinal_type numCols,
00144         const int cacheBlockNumTrials,
00145         const int pairNumTrials,
00146         const bool averageTimings,
00147         const std::string& additionalData)
00148     {
00149       using std::endl;
00150 
00151       typedef typename CombineType::ordinal_type ordinal_type;
00152       typedef typename CombineType::scalar_type scalar_type;
00153       typedef typename CombineType::magnitude_type magnitude_type;
00154       typedef CombineBenchmarker<ordinal_type, scalar_type, CombineType, TimerType> 
00155   benchmarker_type;
00156 
00157       TEUCHOS_TEST_FOR_EXCEPTION(cacheBlockNumTrials < 1, std::invalid_argument,
00158        "The number of trials for the cache block benchmark "
00159        "must be positive, but you specified cacheBlockNum"
00160        "Trials = " << cacheBlockNumTrials << ".");
00161       TEUCHOS_TEST_FOR_EXCEPTION(pairNumTrials < 1, std::invalid_argument,
00162        "The number of trials for the pair benchmark must be "
00163        "positive, but you specified pairNumTrials = "
00164        << pairNumTrials << ".");
00165 
00166       benchmarker_type b (iseed);
00167       std::pair<double, double> results;
00168       results.first = 
00169   b.benchmarkPair (numCols, pairNumTrials);
00170       results.second = 
00171   b.benchmarkCacheBlock (numRows, numCols, cacheBlockNumTrials);
00172 
00173       // Whether or not we should print the "additional data"
00174       // (originally supplied at command-line invocation of this
00175       // benchmark) after the benchmark results.  The additional data
00176       // option makes it easier to write parsers for benchmark
00177       // results, since we can include data that are known outside the
00178       // benchmark (when invoking the benchmark as an executable), but
00179       // not known (easily or at all) inside the benchmark.  A good
00180       // example would be environment variables, like OMP_NUM_THREADS,
00181       // or (for a benchmark that uses MPI, which this is not) the
00182       // number of MPI processes per node ("ppn").
00183       const bool printAdditionalData = (! additionalData.empty());
00184 
00185       const double pairTime = averageTimings ? 
00186   results.first / static_cast<double>(pairNumTrials) : 
00187   results.first;
00188       const double cacheBlockTime = averageTimings ? 
00189   results.second / static_cast<double>(cacheBlockNumTrials) : 
00190   results.second;
00191 
00192       out << combineTypeName 
00193     << "," << "R1R2"
00194     << "," << dataTypeName
00195     << "," << (2*numCols)
00196     << "," << numCols
00197     << "," << pairNumTrials
00198     << "," << pairTime;
00199       if (printAdditionalData)
00200   out << "," << additionalData;
00201       out << endl;
00202       out << combineTypeName 
00203     << "," << "RA"
00204     << "," << dataTypeName
00205     << "," << numRows
00206     << "," << numCols
00207     << "," << cacheBlockNumTrials
00208     << "," << cacheBlockTime;
00209       if (printAdditionalData)
00210   out << "," << additionalData;
00211       out << endl;
00212 
00213       std::vector<double> timings (2);
00214       timings[0] = pairTime;
00215       timings[1] = cacheBlockTime;
00216       return timings;
00217     }
00218 
00219     template<class Scalar, class TimerType>
00220     static void
00221     benchmarkAllCombineTypes (std::ostream& out,
00222             const std::string& dataTypeName,
00223             CombineBenchmarkParameters& params,
00224             const double timerResolution)
00225     {
00226       using std::cerr;
00227       using std::endl;
00228       const bool debug = params.debug;
00229       const int numRows = params.numRows;
00230       const int numCols = params.numCols;
00231 
00232       TEUCHOS_TEST_FOR_EXCEPTION(timerResolution <= static_cast<double>(0), 
00233        std::invalid_argument,
00234        "The timer resolution must be a positive number, "
00235        "but you specified timerResolution = " 
00236        << timerResolution << ".");
00237 
00238       // If no calibration is performed, then the number of trials is
00239       // the same for both the cache block [R; A] benchmark and the
00240       // pair [R1; R2] benchmark.  Otherwise, we calibrate the number
00241       // of trials for each separately.  This is because we expect the
00242       // [R1; R2] benchmark to take much less time than the [R; A]
00243       // benchmark, so [R1; R2] should have more trials, in order to
00244       // get comparable timing accuracy without requiring too many [R;
00245       // A] trials.
00246       int pairNumTrials = params.numTrials;
00247       int cacheBlockNumTrials = params.numTrials;
00248       if (params.calibrate)
00249   { // We calibrate the number of trials using the default
00250     // Combine implementation.  We don't expect CombineNative or
00251     // CombineFortran to be much faster than that.  
00252     if (debug)
00253       cerr << "Calibrating..." << endl;
00254 
00255     // Calibrater gets the timer resolution.
00256     typedef CombineDefault<int, Scalar> combine_type;
00257     typedef CombineBenchmarker<int, Scalar, combine_type, TimerType> 
00258       benchmarker_type;
00259     benchmarker_type c (timerResolution, params.seed);
00260 
00261     // Accuracy factor of 1000 gives us 3 digits of timer accuracy.
00262     const double accuracyFactor = static_cast<double> (1000);
00263 
00264     // Number of trials for factor_pair() and apply_pair().
00265     std::pair<int, double> result;
00266     result = c.calibratePair (numCols, accuracyFactor);
00267     if (debug)
00268       {
00269         cerr << "- Pair number of trials: " << result.first << endl;
00270         cerr << "- Pair calibration time: " << result.second << endl;
00271       }
00272     pairNumTrials = result.first;
00273 
00274     // Number of trials for factor_inner() and apply_inner().
00275     result = c.calibrateCacheBlock (numRows, numCols, accuracyFactor);
00276     if (debug)
00277       {
00278         cerr << "- Cache block number of trials: " << result.first << endl;
00279         cerr << "- Cache block calibration time: " << result.second << endl;
00280       }
00281     cacheBlockNumTrials = result.first;
00282 
00283     // Store the updated PRNG seed in the benchmark parameters.
00284     c.getSeed (params.seed);
00285   }
00286 
00287       // Always benchmark CombineDefault.  We use its timings as the
00288       // standard by which the other Combine implementations' timings
00289       // are compared.  The returned vector contains two timings: for
00290       // [R1; R2], and for [R; A], in that order.
00291       std::vector<double> defaultTimings;
00292       {
00293   typedef CombineDefault< int, Scalar > combine_type;
00294   std::string combineTypeName ("Default");
00295   defaultTimings = 
00296     benchmarkCombineType<combine_type, TimerType> (out, params.seed,
00297                dataTypeName, 
00298                combineTypeName, 
00299                numRows, 
00300                numCols, 
00301                cacheBlockNumTrials,
00302                pairNumTrials,
00303                params.averageTimings,
00304                params.additionalData);
00305       }
00306 
00307       // If we're doing strict performance tests, then CombineNative
00308       // (and CombineFortran, if applicable) may be no slower than the
00309       // given allowance factor times CombineDefault's time.  For now,
00310       // we only look at cache block performance, since that is where
00311       // most of the time should be going.
00312       std::vector<double> nativeTimings;
00313       {
00314   typedef CombineNative<int, Scalar> combine_type;
00315   std::string combineTypeName ("Native");
00316   nativeTimings = 
00317     benchmarkCombineType<combine_type, TimerType> (out, params.seed, 
00318                dataTypeName, 
00319                combineTypeName, 
00320                numRows, 
00321                numCols, 
00322                cacheBlockNumTrials,
00323                pairNumTrials,
00324                params.averageTimings,
00325                params.additionalData);
00326   const double slowdown = nativeTimings[1] / defaultTimings[1];
00327   const bool tooSlow = slowdown > params.allowance;
00328   // FIXME (mfh 24 May 2011) Replace std::runtime_error with a
00329   // more appropriately named exception.
00330   TEUCHOS_TEST_FOR_EXCEPTION(params.strictPerfTests && tooSlow, 
00331          std::runtime_error,
00332          "CombineNative is too slow!  For cache block "
00333          "benchmark with numRows=" << numRows << " and numCols="
00334          << numCols << ", CombineNative time (= " 
00335          << nativeTimings[1] << ") / CombineDefault time (= "
00336          << defaultTimings[1] << ") = " << slowdown 
00337          << " > the allowed fraction " << params.allowance 
00338          << ".");
00339       }
00340 
00341 #ifdef HAVE_TSQR_FORTRAN
00342       std::vector<double> fortranTimings;
00343       {
00344   typedef CombineFortran<Scalar> combine_type;
00345   std::string combineTypeName ("Fortran");
00346   fortranTimings = 
00347     benchmarkCombineType<combine_type, TimerType> (out, params.seed, 
00348                dataTypeName, 
00349                combineTypeName, 
00350                numRows, 
00351                numCols, 
00352                cacheBlockNumTrials,
00353                pairNumTrials,
00354                params.averageTimings,
00355                params.additionalData);
00356   const double slowdown = fortranTimings[1] / defaultTimings[1];
00357   const bool tooSlow = slowdown > params.allowance;
00358   // FIXME (mfh 24 May 2011) Replace std::runtime_error with a
00359   // more appropriately named exception.
00360   TEUCHOS_TEST_FOR_EXCEPTION(params.strictPerfTests && tooSlow, 
00361          std::runtime_error,
00362          "CombineFortran is too slow!  For cache block "
00363          "benchmark with numRows=" << numRows << " and numCols="
00364          << numCols << ", CombineFortran time (= " 
00365          << fortranTimings[1] << ") / CombineDefault time (= "
00366          << defaultTimings[1] << ") = " << slowdown 
00367          << " > the allowed fraction " << params.allowance 
00368          << ".");
00369       }
00370 #endif // HAVE_TSQR_FORTRAN
00371     }
00372 
00373 
00374     template<class TimerType>
00375     static void
00376     benchmarkAllCombineTypesAndScalars (std::ostream& out,
00377           CombineBenchmarkParameters& params)
00378     {
00379       using std::cerr;
00380       using std::endl;
00381       using std::string;
00382       const bool debug = params.debug;
00383 
00384       // Compute timer resolution.
00385       const double timerResolution = computeTimerResolution<TimerType> ();
00386       if (debug)
00387   cerr << "Timer resolution: " << timerResolution << " seconds" << endl;
00388 
00389       string dataTypeName;
00390       if (params.testReal)
00391   {
00392     dataTypeName = "float";
00393     benchmarkAllCombineTypes<float, TimerType> (out, dataTypeName, 
00394                   params, timerResolution);
00395     dataTypeName = "double";
00396     benchmarkAllCombineTypes<double, TimerType> (out, dataTypeName,
00397                    params, timerResolution);
00398   }
00399       if (params.testComplex)
00400   {
00401 #ifdef HAVE_TSQR_COMPLEX
00402     using std::complex;
00403 
00404     dataTypeName = "complex<float>";
00405     benchmarkAllCombineTypes<complex<float>, TimerType> (out, dataTypeName,
00406                      params, timerResolution);
00407     dataTypeName = "complex<double>";
00408     benchmarkAllCombineTypes<complex<double>, TimerType> (out, dataTypeName,
00409                 params, timerResolution);
00410 
00411 #else // Don't HAVE_TSQR_COMPLEX
00412     throw std::logic_error("TSQR not built with complex arithmetic support");
00413 #endif // HAVE_TSQR_COMPLEX
00414   }
00415     }         
00416 
00427     template<class TimerType>
00428     void
00429     benchmarkCombine (std::ostream& out,
00430           CombineBenchmarkParameters& params)
00431     {
00432       TEUCHOS_TEST_FOR_EXCEPTION(params.numRows < 1 || params.numCols < 1, 
00433        std::invalid_argument,
00434        "The test matrix must have a positive number of rows "
00435        "and columns, but you specified numRows = " 
00436        << params.numRows << " and numCols = "
00437        << params.numCols << ".");
00438       TEUCHOS_TEST_FOR_EXCEPTION(! params.calibrate && params.numTrials < 1, 
00439        std::invalid_argument,
00440        "Since you specified no calibration is to be performed, "
00441        "the number of trials must be positive, but you specified "
00442        "numTrials = " << params.numTrials << ".");
00443 
00444       if (! params.useSeedValues)
00445   { // Fill in default seed values.
00446     if (params.seed.size() < 4)
00447       params.seed.resize (4);
00448     params.seed[0] = 0;
00449     params.seed[1] = 0;
00450     params.seed[2] = 0;
00451     params.seed[3] = 1;
00452   }
00453 
00454       if (params.printFieldNames)
00455   {
00456     // The row of field names begins with a '%' character, in
00457     // order to help out the benchmark results parser.
00458     out << "%" << "method"
00459         << "," << "kernel"
00460         << "," << "scalarType"
00461         << "," << "numRows"
00462         << "," << "numCols"
00463         << "," << "numTrials"
00464         << "," << "timing";
00465     if (params.printFieldNames && ! params.additionalFieldNames.empty())
00466       // The additionalFieldNames string should be a
00467       // comma-delimited list of additional field name(s).
00468       out << "," << params.additionalFieldNames;
00469     out << std::endl;
00470   }
00471       benchmarkAllCombineTypesAndScalars<TimerType> (out, params);
00472     }
00473 
00474   } // namespace Test
00475 } // namespace TSQR
00476 
00477 #endif // __TSQR_Test_CombineBenchmark_hpp
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends