Kokkos Node API and Local Linear Algebra Kernels Version of the Day
Tsqr_CombineBenchmarker.hpp
00001 //@HEADER
00002 // ************************************************************************
00003 // 
00004 //          Kokkos: Node API and Parallel Node Kernels
00005 //              Copyright (2009) Sandia Corporation
00006 // 
00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
00008 // license for use of this work by or on behalf of the U.S. Government.
00009 // 
00010 // This library is free software; you can redistribute it and/or modify
00011 // it under the terms of the GNU Lesser General Public License as
00012 // published by the Free Software Foundation; either version 2.1 of the
00013 // License, or (at your option) any later version.
00014 //  
00015 // This library is distributed in the hope that it will be useful, but
00016 // WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 // Lesser General Public License for more details.
00019 //  
00020 // You should have received a copy of the GNU Lesser General Public
00021 // License along with this library; if not, write to the Free Software
00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00023 // USA
00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
00025 // 
00026 // ************************************************************************
00027 //@HEADER
00028 
00029 #ifndef __Tsqr_CombineBenchmarker_hpp
00030 #define __Tsqr_CombineBenchmarker_hpp
00031 
00032 #include <Tsqr_ConfigDefs.hpp>
00033 #include <Tsqr_Random_NormalGenerator.hpp>
00034 #include <Tsqr_Random_MatrixGenerator.hpp>
00035 #include <Tsqr_verifyTimerConcept.hpp>
00036 
00037 #include <Tsqr_ApplyType.hpp>
00038 #include <Tsqr_Matrix.hpp>
00039 #include <Tsqr_ScalarTraits.hpp>
00040 #include <Tsqr_Util.hpp>
00041 
00042 #include <algorithm>
00043 #include <iostream>
00044 #include <limits>
00045 #include <sstream>
00046 #include <stdexcept>
00047 #include <vector>
00048 
00049 
00050 namespace TSQR {
00051   namespace Test {
00052 
00063     template<class TimerType>
00064     double
00065     computeTimerResolution () 
00066     {
00067       typedef TimerType timer_type;
00068       timer_type timer ("Timer resolution");
00069 
00070       // Warmup run for the timer.
00071       for (int warmup = 0; warmup < 5; ++warmup)
00072   {
00073     timer.start();
00074     (void) timer.stop();
00075   }
00076 
00077       // Keep a count of the total number of times timer.stop() is
00078       // called (once per outer loop iteration).  If bigger than
00079       // maxCount, assume that the timer is broken and bail out.
00080       // This prevents an infinite loop.  Since numTrials is
00081       // multiplied by 10 each time around, maxCount should be more
00082       // than the base-10 log of the maximum acceptable timer
00083       // resolution, relative to the time taken by a single
00084       // floating-point increment operation.
00085       //
00086       // For example, suppose that the time per flop is 10^{-9} s,
00087       // and the timer resolution is 1 s (which is particularly
00088       // coarse).  We start with numTrials=10 and multiply by 10
00089       // each time.  Then, it may take 8 or 9 iterations (may have
00090       // to exceed the timer resolution, or just meet it) so the
00091       // timing loop exceeds the timer resolution.  Thus, maxCount=9
00092       // = log10(timer resolution / time per flop) is just enough.
00093       // Adding a bit more for wiggle room is a good idea.
00094       //
00095       // If your machine is exceptionally fast or your compiler
00096       // autoparallelizes the timing loop, you may have to increase
00097       // maxCount.  We set it to 20 as a generous upper bound.
00098       const size_t maxCount = 20;
00099       size_t count = 0;
00100 
00101       // Don't let numTrials loop around.  (We're multiplying it by
00102       // 10 each time, so it won't take long for this to happen,
00103       // especially if the timer granularity is too coarse.)
00104       const size_t maxNumTrials = std::numeric_limits<size_t>::max() / 10;
00105 
00106       // Do some fake work.  Multiply the number of trials by 10
00107       // each time, so that resolution is expressed as an order of
00108       // decimal magnitude.
00109       size_t numTrials = 1;
00110       double theTime;
00111       do {
00112   const double eps = Teuchos::ScalarTraits<double>::eps();
00113   double fake = Teuchos::ScalarTraits<double>::one();
00114   numTrials *= 10;
00115   ++count;
00116 
00117   // The timing loop.
00118   timer.start();
00119   for (size_t trial = 0; trial < numTrials; ++trial)
00120     fake += eps;
00121   theTime = timer.stop();
00122 
00123       } while (theTime == 0 && count < maxCount && numTrials < maxNumTrials);
00124 
00125       if (theTime == 0)
00126   {
00127     std::ostringstream os;
00128     os << "Maximum number of loops " << maxCount << " exceeded when "
00129       "computing timer resolution.  Largest timing loop length tried: " 
00130        << numTrials << ".";
00131     throw std::logic_error (os.str());
00132   }
00133       else if (numTrials >= maxNumTrials)
00134   {
00135     std::ostringstream os;
00136     os << "Maximum number of timing loop iterations " << maxNumTrials 
00137        << " exceeded when computing timer resolution.  Largest timing "
00138       "loop length tried: " << numTrials << ".";
00139     throw std::logic_error (os.str());
00140   }
00141       else
00142   return theTime;
00143     }
00144 
00168     template<class Ordinal, class Scalar, class CombineType, class TimerType>
00169     class CombineBenchmarker {
00170     public:
00171       typedef Ordinal ordinal_type;
00172       typedef Scalar scalar_type;
00173       typedef CombineType combine_type;
00174       typedef TimerType timer_type;
00175 
00176     private:
00177       typedef Teuchos::ScalarTraits<scalar_type> STS;
00178       typedef typename STS::magnitudeType magnitude_type;
00179       typedef Teuchos::ScalarTraits<magnitude_type> STM;
00180       typedef TSQR::Random::NormalGenerator<ordinal_type, scalar_type> normgen_type;
00181       typedef TSQR::Random::MatrixGenerator<ordinal_type, scalar_type, normgen_type> matgen_type;
00182       typedef Matrix<ordinal_type, scalar_type> matrix_type;
00183 
00184     public:
00185 
00195       CombineBenchmarker (const double timerRes,
00196         const std::vector<int>& iseed) : 
00197   normGenS_ (iseed), 
00198   normGenM_ (iseed),
00199   timerResolution_ (timerRes)
00200       {
00201   TSQR::Test::verifyTimerConcept<timer_type> ();
00202       }
00203 
00210       CombineBenchmarker (const std::vector<int>& iseed) : 
00211   normGenS_ (iseed), 
00212   normGenM_ (iseed),
00213   timerResolution_ (computeTimerResolution<timer_type> ())
00214       {}
00215 
00220       CombineBenchmarker (const double timerRes) :
00221   timerResolution_ (timerRes)
00222       {
00223   TSQR::Test::verifyTimerConcept<timer_type> ();
00224       }
00225 
00226 
00228       CombineBenchmarker () :
00229   timerResolution_ (computeTimerResolution<timer_type> ())
00230       {}
00231 
00233       void 
00234       getSeed (std::vector<int>& iseed) const
00235       {
00236   normGenS_.getSeed (iseed);
00237       }
00238 
00240       double 
00241       timerResolution() const {
00242   return timerResolution_;
00243       }
00244 
00268       std::pair<int, double>
00269       calibrateFirst (const Ordinal numRows,
00270           const Ordinal numCols,
00271           const double accuracyFactor)
00272       {
00273   if (numRows == 0 || numCols == 0)
00274     throw std::invalid_argument("Calibrating timings is impossible for "
00275               "a matrix with either zero rows or zero "
00276               "columns.");
00277   else if (accuracyFactor < 0)
00278     throw std::invalid_argument("Accuracy factor for Combine numTrials "
00279               "calibration must be nonnegative.");
00280   // Random matrix generator.
00281   matgen_type matGen (normGenS_);
00282 
00283   // Generate a random cache block A.
00284   matrix_type A (numRows, numCols);
00285   std::vector<magnitude_type> sigmas (numCols);
00286   randomSingularValues (sigmas, numCols);
00287   matGen.fill_random_svd (numRows, numCols, A.get(), A.lda(), &sigmas[0]);
00288 
00289   // A place to put the Q factor.
00290   matrix_type Q (numRows, numCols);
00291   Q.fill (STS::zero());
00292   for (Ordinal j = 0; j < numCols; ++j)
00293     Q(j,j) = STS::one();
00294 
00295   // TAU array (Householder reflector scaling factors).
00296   std::vector<Scalar> tau (numCols);
00297   // Work space array for factorization and applying the Q factor.
00298   std::vector<Scalar> work (numCols);
00299 
00300   // The Combine instance to benchmark.
00301   combine_type combiner; 
00302 
00303   // A few warmup runs just to avoid timing anomalies.
00304   const int numWarmupRuns = 3;
00305   for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun)
00306     {
00307       combiner.factor_first (numRows, numCols, A.get(), A.lda(),
00308            &tau[0], &work[0]);
00309       combiner.apply_first (ApplyType("N"), numRows, numCols, numCols,
00310           A.get(), A.lda(), &tau[0],
00311           Q.get(), Q.lda(), &work[0]);
00312     }
00313 
00314   // How much time numTrials runs must take in order for
00315   // numTrials to be considered sufficiently large.
00316   const double minAcceptableTime = accuracyFactor * timerResolution();
00317 
00318   timer_type timer ("Combine first");
00319 
00320   // The actual timing runs.  Repeat, doubling numTrials each
00321   // time, until the resulting timing is at least timerRes *
00322   // accuracyFactor.  Also, we mandate somewhat arbitrarily that
00323   // numTrials >= 4; this gives us some buffer against timer
00324   // variability.  Finally, don't let numTrials loop around.
00325   // (We're doubling it each time, so it won't take long for
00326   // this to happen, especially if something is wrong with the
00327   // benchmark and it's taking zero time, or if accuracyFactor
00328   // is too large, or if the timer resolution is too large.)
00329   const int maxNumTrials = std::numeric_limits<int>::max() / 2;
00330   double theTime;
00331   int numTrials = 2;
00332   do {
00333     numTrials *= 2; // First value of numTrials is 4.
00334     timer.start();
00335     for (int trial = 0; trial < numTrials; ++trial)
00336       {
00337         combiner.factor_first (numRows, numCols, A.get(), A.lda(),
00338              &tau[0], &work[0]);
00339         combiner.apply_first (ApplyType("N"), numRows, numCols, numCols,
00340             A.get(), A.lda(), &tau[0],
00341             Q.get(), Q.lda(), &work[0]);
00342       }
00343     theTime = timer.stop();
00344   } while (theTime < minAcceptableTime && numTrials < maxNumTrials);
00345 
00346   return std::make_pair (numTrials, theTime);
00347       }
00348 
00365       double
00366       benchmarkFirst (const Ordinal numRows,
00367           const Ordinal numCols,
00368           const int numTrials)
00369       {
00370   if (numRows == 0 || numCols == 0)
00371     throw std::invalid_argument("Benchmarking does not make sense for "
00372               "a matrix with either zero rows or zero "
00373               "columns.");
00374   TEST_FOR_EXCEPTION(numTrials < 1, std::invalid_argument,
00375          "The number of trials must be positive, but "
00376          "numTrials = " << numTrials << ".");
00377 
00378   // Random matrix generator.
00379   matgen_type matGen (normGenS_);
00380 
00381   // Generate a random cache block A.
00382   matrix_type A (numRows, numCols);
00383   std::vector<magnitude_type> sigmas (numCols);
00384   randomSingularValues (sigmas, numCols);
00385   matGen.fill_random_svd (numRows, numCols, A.get(), A.lda(), &sigmas[0]);
00386 
00387   // A place to put the Q factor.
00388   matrix_type Q (numRows, numCols);
00389   Q.fill (STS::zero());
00390   for (Ordinal j = 0; j < numCols; ++j)
00391     Q(j,j) = STS::one();
00392 
00393   // TAU array (Householder reflector scaling factors).
00394   std::vector<Scalar> tau (numCols);
00395   // Work space array for factorization and applying the Q factor.
00396   std::vector<Scalar> work (numCols);
00397 
00398   // The Combine instance to benchmark.
00399   combine_type combiner; 
00400 
00401   // A few warmup runs just to avoid timing anomalies.
00402   const int numWarmupRuns = 3;
00403   for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun)
00404     {
00405       combiner.factor_first (numRows, numCols, A.get(), A.lda(),
00406            &tau[0], &work[0]);
00407       combiner.apply_first (ApplyType("N"), numRows, numCols, numCols,
00408           A.get(), A.lda(), &tau[0],
00409           Q.get(), Q.lda(), &work[0]);
00410     }
00411   //
00412   // The actual timing runs.
00413   //
00414   timer_type timer ("Combine first");
00415   timer.start();
00416   for (int trial = 0; trial < numTrials; ++trial)
00417     {
00418       combiner.factor_first (numRows, numCols, A.get(), A.lda(),
00419            &tau[0], &work[0]);
00420       combiner.apply_first (ApplyType("N"), numRows, numCols, numCols,
00421           A.get(), A.lda(), &tau[0],
00422           Q.get(), Q.lda(), &work[0]);
00423     }
00424   return timer.stop();
00425       }
00426 
00452       std::pair<int, double>
00453       calibrateCacheBlock (const Ordinal numRows,
00454          const Ordinal numCols,
00455          const double accuracyFactor)
00456       {
00457   if (numRows == 0 || numCols == 0)
00458     throw std::invalid_argument("Calibrating timings is impossible for "
00459               "a matrix with either zero rows or zero "
00460               "columns.");
00461   else if (accuracyFactor < 0)
00462     throw std::invalid_argument("Accuracy factor for Combine numTrials "
00463               "calibration must be nonnegative.");
00464   // Random matrix generator.
00465   matgen_type matGen (normGenS_);
00466 
00467   // Generate a random R factor first.
00468   matrix_type R (numCols, numCols);
00469   std::vector<magnitude_type> sigmas (numCols);
00470   randomSingularValues (sigmas, numCols);
00471   matGen.fill_random_R (numCols, R.get(), R.lda(), &sigmas[0]);
00472 
00473   // Now generate a random cache block.
00474   matrix_type A (numRows, numCols);
00475   randomSingularValues (sigmas, numCols);
00476   matGen.fill_random_svd (numRows, numCols, A.get(), A.lda(), &sigmas[0]);
00477 
00478   // A place to put the Q factor.
00479   matrix_type Q (numRows + numCols, numCols);
00480   Q.fill (STS::zero());
00481   for (Ordinal j = 0; j < numCols; ++j)
00482     Q(j,j) = STS::one();
00483 
00484   // TAU array (Householder reflector scaling factors).
00485   std::vector<Scalar> tau (numCols);
00486   // Work space array for factorization and applying the Q factor.
00487   std::vector<Scalar> work (numCols);
00488 
00489   // The Combine instance to benchmark.
00490   combine_type combiner; 
00491 
00492   // A few warmup runs just to avoid timing anomalies.
00493   const int numWarmupRuns = 3;
00494   for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun)
00495     {
00496       combiner.factor_inner (numRows, numCols, R.get(), R.lda(),
00497            A.get(), A.lda(), &tau[0], &work[0]);
00498       combiner.apply_inner (ApplyType("N"), numRows, numCols, numCols,
00499           A.get(), A.lda(), &tau[0], 
00500           &Q(0, 0), Q.lda(),
00501           &Q(numCols, 0), Q.lda(), 
00502           &work[0]);
00503     }
00504 
00505   // How much time numTrials runs must take in order for
00506   // numTrials to be considered sufficiently large.
00507   const double minAcceptableTime = accuracyFactor * timerResolution();
00508 
00509   timer_type timer ("Combine cache block");
00510 
00511   // The actual timing runs.  Repeat, doubling numTrials each
00512   // time, until the resulting timing is at least timerRes *
00513   // accuracyFactor.  Also, we mandate somewhat arbitrarily that
00514   // numTrials >= 4; this gives us some buffer against timer
00515   // variability.  Finally, don't let numTrials loop around.
00516   // (We're doubling it each time, so it won't take long for
00517   // this to happen, especially if something is wrong with the
00518   // benchmark and it's taking zero time, or if accuracyFactor
00519   // is too large, or if the timer resolution is too large.)
00520   const int maxNumTrials = std::numeric_limits<int>::max() / 2;
00521   double theTime;
00522   int numTrials = 2;
00523   do {
00524     numTrials *= 2; // First value of numTrials is 4.
00525     timer.start();
00526     for (int trial = 0; trial < numTrials; ++trial)
00527       {
00528         combiner.factor_inner (numRows, numCols, R.get(), R.lda(),
00529              A.get(), A.lda(), &tau[0], &work[0]);
00530         combiner.apply_inner (ApplyType("N"), numRows, numCols, numCols,
00531             A.get(), A.lda(), &tau[0], 
00532             &Q(0, 0), Q.lda(),
00533             &Q(numCols, 0), Q.lda(), 
00534             &work[0]);
00535       }
00536     theTime = timer.stop();
00537   } while (theTime < minAcceptableTime && numTrials < maxNumTrials);
00538 
00539   return std::make_pair (numTrials, theTime);
00540       }
00541 
00542 
00561       double
00562       benchmarkCacheBlock (const Ordinal numRows,
00563          const Ordinal numCols,
00564          const int numTrials)
00565       {
00566   if (numRows == 0 || numCols == 0)
00567     throw std::invalid_argument("Benchmarking does not make sense for "
00568               "a matrix with either zero rows or zero "
00569               "columns.");
00570   TEST_FOR_EXCEPTION(numTrials < 1, std::invalid_argument,
00571          "The number of trials must be positive, but "
00572          "numTrials = " << numTrials << ".");
00573 
00574   // Random matrix generator.
00575   matgen_type matGen (normGenS_);
00576 
00577   // Generate a random R factor first.
00578   matrix_type R (numCols, numCols);
00579   std::vector<magnitude_type> sigmas (numCols);
00580   randomSingularValues (sigmas, numCols);
00581   matGen.fill_random_R (numCols, R.get(), R.lda(), &sigmas[0]);
00582 
00583   // Now generate a random cache block.
00584   matrix_type A (numRows, numCols);
00585   randomSingularValues (sigmas, numCols);
00586   matGen.fill_random_svd (numRows, numCols, A.get(), A.lda(), &sigmas[0]);
00587 
00588   // A place to put the Q factor.
00589   matrix_type Q (numRows + numCols, numCols);
00590   Q.fill (STS::zero());
00591   for (Ordinal j = 0; j < numCols; ++j)
00592     Q(j,j) = STS::one();
00593 
00594   // TAU array (Householder reflector scaling factors).
00595   std::vector<Scalar> tau (numCols);
00596   // Work space array for factorization and applying the Q factor.
00597   std::vector<Scalar> work (numCols);
00598 
00599   // The Combine instance to benchmark.
00600   combine_type combiner; 
00601 
00602   // A few warmup runs just to avoid timing anomalies.
00603   const int numWarmupRuns = 3;
00604   for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun)
00605     {
00606       combiner.factor_inner (numRows, numCols, R.get(), R.lda(),
00607            A.get(), A.lda(), &tau[0], &work[0]);
00608       combiner.apply_inner (ApplyType("N"), numRows, numCols, numCols,
00609           A.get(), A.lda(), &tau[0], 
00610           &Q(0, 0), Q.lda(),
00611           &Q(numCols, 0), Q.lda(), 
00612           &work[0]);
00613     }
00614   //
00615   // The actual timing runs.
00616   //
00617   timer_type timer ("Combine cache block");
00618   timer.start();
00619   for (int trial = 0; trial < numTrials; ++trial)
00620     {
00621       combiner.factor_inner (numRows, numCols, R.get(), R.lda(),
00622            A.get(), A.lda(), &tau[0], &work[0]);
00623       combiner.apply_inner (ApplyType("N"), numRows, numCols, numCols,
00624           A.get(), A.lda(), &tau[0], 
00625           &Q(0, 0), Q.lda(),
00626           &Q(numCols, 0), Q.lda(), 
00627           &work[0]);
00628     }
00629   return timer.stop();
00630       }
00631 
00655       std::pair<int, double>
00656       calibratePair (const Ordinal numCols,
00657          const double accuracyFactor)
00658       {
00659   if (numCols == 0)
00660     throw std::invalid_argument("Calibrating timings is impossible for "
00661               "a matrix with zero columns.");
00662   else if (accuracyFactor < 0)
00663     throw std::invalid_argument("Accuracy factor for Combine numTrials "
00664               "calibration must be nonnegative.");
00665   // Random matrix generator.
00666   matgen_type matGen (normGenS_);
00667 
00668   // Generate R1 first.
00669   matrix_type R1 (numCols, numCols);
00670   std::vector<magnitude_type> sigmas (numCols);
00671   randomSingularValues (sigmas, numCols);
00672   matGen.fill_random_R (numCols, R1.get(), R1.lda(), &sigmas[0]);
00673 
00674   // Now generate R2.
00675   matrix_type R2 (numCols, numCols);
00676   randomSingularValues (sigmas, numCols);
00677   matGen.fill_random_R (numCols, R2.get(), R2.lda(), &sigmas[0]);
00678 
00679   // A place to put the Q factor of [R1; R2].
00680   matrix_type Q (2*numCols, numCols);
00681   Q.fill (STS::zero());
00682   for (Ordinal j = 0; j < numCols; ++j)
00683     Q(j,j) = STS::one();
00684 
00685   // TAU array (Householder reflector scaling factors).
00686   std::vector<Scalar> tau (numCols);
00687   // Work space array for factorization and applying the Q factor.
00688   std::vector<Scalar> work (numCols);
00689 
00690   // The Combine instance to benchmark.
00691   combine_type combiner; 
00692 
00693   // A few warmup runs just to avoid timing anomalies.
00694   const int numWarmupRuns = 3;
00695   for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun)
00696     {
00697       combiner.factor_pair (numCols, R1.get(), R1.lda(), 
00698           R2.get(), R2.lda(),
00699           &tau[0], &work[0]);
00700       combiner.apply_pair (ApplyType("N"), numCols, numCols, 
00701          R2.get(), R2.lda(), &tau[0], 
00702          &Q(0, 0), Q.lda(),
00703          &Q(numCols, 0), Q.lda(),
00704          &work[0]);
00705     }
00706 
00707   // How much time numTrials runs must take in order for
00708   // numTrials to be considered sufficiently large.
00709   const double minAcceptableTime = accuracyFactor * timerResolution();
00710 
00711   timer_type timer ("Combine pair");
00712 
00713   // The actual timing runs.  Repeat, doubling numTrials each
00714   // time, until the resulting timing is at least timerRes *
00715   // accuracyFactor.  Also, we mandate somewhat arbitrarily that
00716   // numTrials >= 4; this gives us some buffer against timer
00717   // variability.  Finally, don't let numTrials loop around.
00718   // (We're doubling it each time, so it won't take long for
00719   // this to happen, especially if something is wrong with the
00720   // benchmark and it's taking zero time, or if accuracyFactor
00721   // is too large, or if the timer resolution is too large.)
00722   const int maxNumTrials = std::numeric_limits<int>::max() / 2;
00723   double theTime;
00724   int numTrials = 2;
00725   do {
00726     numTrials *= 2; // First value of numTrials is 4.
00727     timer.start();
00728     for (int trial = 0; trial < numTrials; ++trial)
00729       {
00730         combiner.factor_pair (numCols, R1.get(), R1.lda(), 
00731             R2.get(), R2.lda(),
00732             &tau[0], &work[0]);
00733         combiner.apply_pair (ApplyType("N"), numCols, numCols, 
00734            R2.get(), R2.lda(), &tau[0], 
00735            &Q(0, 0), Q.lda(),
00736            &Q(numCols, 0), Q.lda(),
00737            &work[0]);
00738       }
00739     theTime = timer.stop();
00740   } while (theTime < minAcceptableTime && numTrials < maxNumTrials);
00741 
00742   return std::make_pair (numTrials, theTime);
00743       }
00744 
00745 
00762       double
00763       benchmarkPair (const Ordinal numCols,
00764          const int numTrials)
00765       {
00766   if (numCols == 0)
00767     throw std::invalid_argument("Benchmarking does not make sense for "
00768               "a matrix with zero columns.");
00769   TEST_FOR_EXCEPTION(numTrials < 1, std::invalid_argument,
00770          "The number of trials must be positive, but "
00771          "numTrials = " << numTrials << ".");
00772 
00773   // Random matrix generator.
00774   matgen_type matGen (normGenS_);
00775 
00776   // Generate R1 first.
00777   matrix_type R1 (numCols, numCols);
00778   std::vector<magnitude_type> sigmas (numCols);
00779   randomSingularValues (sigmas, numCols);
00780   matGen.fill_random_R (numCols, R1.get(), R1.lda(), &sigmas[0]);
00781 
00782   // Now generate R2.
00783   matrix_type R2 (numCols, numCols);
00784   randomSingularValues (sigmas, numCols);
00785   matGen.fill_random_R (numCols, R2.get(), R2.lda(), &sigmas[0]);
00786 
00787   // A place to put the Q factor of [R1; R2].
00788   matrix_type Q (2*numCols, numCols);
00789   Q.fill (STS::zero());
00790   for (Ordinal j = 0; j < numCols; ++j)
00791     Q(j,j) = STS::one();
00792 
00793   // TAU array (Householder reflector scaling factors).
00794   std::vector<Scalar> tau (numCols);
00795   // Work space array for factorization and applying the Q factor.
00796   std::vector<Scalar> work (numCols);
00797 
00798   // The Combine instance to benchmark.
00799   combine_type combiner; 
00800 
00801   // A few warmup runs just to avoid timing anomalies.
00802   const int numWarmupRuns = 3;
00803   for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun)
00804     {
00805       combiner.factor_pair (numCols, R1.get(), R1.lda(), 
00806           R2.get(), R2.lda(),
00807           &tau[0], &work[0]);
00808       combiner.apply_pair (ApplyType("N"), numCols, numCols, 
00809          R2.get(), R2.lda(), &tau[0], 
00810          &Q(0, 0), Q.lda(),
00811          &Q(numCols, 0), Q.lda(),
00812          &work[0]);
00813     }
00814   //
00815   // The actual timing runs.
00816   //
00817   timer_type timer ("Combine pair");
00818   timer.start();
00819   for (int trial = 0; trial < numTrials; ++trial)
00820     {
00821       combiner.factor_pair (numCols, R1.get(), R1.lda(), 
00822           R2.get(), R2.lda(),
00823           &tau[0], &work[0]);
00824       combiner.apply_pair (ApplyType("N"), numCols, numCols, 
00825          R2.get(), R2.lda(), &tau[0], 
00826          &Q(0, 0), Q.lda(),
00827          &Q(numCols, 0), Q.lda(),
00828          &work[0]);
00829     }
00830   return timer.stop();
00831       }
00832 
00833     private:
00834 
00836       TSQR::Random::NormalGenerator<ordinal_type, scalar_type> normGenS_;
00837 
00839       TSQR::Random::NormalGenerator<ordinal_type, magnitude_type> normGenM_;
00840 
00842       double timerResolution_;
00843 
00850       void
00851       randomSingularValues (std::vector<magnitude_type>& sigmas,
00852           const Ordinal numValues)
00853       {
00854   // Cast to avoid compiler warnings for signed / unsigned
00855   // comparisons.
00856   typedef typename std::vector<magnitude_type>::size_type size_type;
00857   if (sigmas.size() < static_cast<size_type> (numValues))
00858     sigmas.resize (numValues);
00859       
00860   // Relative amount by which to perturb each singular value.  The
00861   // perturbation will be multiplied by a normal(0,1) pseudorandom
00862   // number drawn from magGen.
00863   const magnitude_type perturbationFactor = magnitude_type(10) * STM::eps();
00864   const magnitude_type one = STM::one();
00865   for (Ordinal k = 0; k < numValues; ++k)
00866     {
00867       magnitude_type perturbation = perturbationFactor * normGenM_();
00868       // If (1 - perturbation) is a small or nonpositive number,
00869       // subtract instead.
00870       if (one - perturbation <= perturbationFactor)
00871         perturbation = -perturbation;
00872       sigmas[k] = one - perturbation;
00873     }
00874       }
00875     };
00876 
00877   } // namespace Test
00878 } // namespace TSQR
00879 
00880 #endif // __Tsqr_CombineBenchmarker_hpp
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends