Kokkos Node API and Local Linear Algebra Kernels Version of the Day
Tsqr_CombineBenchmarker.hpp
00001 //@HEADER
00002 // ************************************************************************
00003 // 
00004 //          Kokkos: Node API and Parallel Node Kernels
00005 //              Copyright (2008) Sandia Corporation
00006 // 
00007 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
00008 // the U.S. Government retains certain rights in this software.
00009 // 
00010 // Redistribution and use in source and binary forms, with or without
00011 // modification, are permitted provided that the following conditions are
00012 // met:
00013 //
00014 // 1. Redistributions of source code must retain the above copyright
00015 // notice, this list of conditions and the following disclaimer.
00016 //
00017 // 2. Redistributions in binary form must reproduce the above copyright
00018 // notice, this list of conditions and the following disclaimer in the
00019 // documentation and/or other materials provided with the distribution.
00020 //
00021 // 3. Neither the name of the Corporation nor the names of the
00022 // contributors may be used to endorse or promote products derived from
00023 // this software without specific prior written permission.
00024 //
00025 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
00026 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00027 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00028 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
00029 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00030 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00031 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00032 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00033 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00034 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00035 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00036 //
00037 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
00038 // 
00039 // ************************************************************************
00040 //@HEADER
00041 
00042 #ifndef __Tsqr_CombineBenchmarker_hpp
00043 #define __Tsqr_CombineBenchmarker_hpp
00044 
00045 #include <Tsqr_ConfigDefs.hpp>
00046 #include <Tsqr_Random_NormalGenerator.hpp>
00047 #include <Tsqr_Random_MatrixGenerator.hpp>
00048 #include <Tsqr_verifyTimerConcept.hpp>
00049 
00050 #include <Tsqr_ApplyType.hpp>
00051 #include <Tsqr_Matrix.hpp>
00052 #include <Tsqr_ScalarTraits.hpp>
00053 #include <Tsqr_Util.hpp>
00054 
00055 #include <algorithm>
00056 #include <iostream>
00057 #include <limits>
00058 #include <sstream>
00059 #include <stdexcept>
00060 #include <vector>
00061 
00062 
00063 namespace TSQR {
00064   namespace Test {
00065 
00076     template<class TimerType>
00077     double
00078     computeTimerResolution () 
00079     {
00080       typedef TimerType timer_type;
00081       timer_type timer ("Timer resolution");
00082 
00083       // Warmup run for the timer.
00084       for (int warmup = 0; warmup < 5; ++warmup)
00085   {
00086     timer.start();
00087     (void) timer.stop();
00088   }
00089 
00090       // Keep a count of the total number of times timer.stop() is
00091       // called (once per outer loop iteration).  If bigger than
00092       // maxCount, assume that the timer is broken and bail out.
00093       // This prevents an infinite loop.  Since numTrials is
00094       // multiplied by 10 each time around, maxCount should be more
00095       // than the base-10 log of the maximum acceptable timer
00096       // resolution, relative to the time taken by a single
00097       // floating-point increment operation.
00098       //
00099       // For example, suppose that the time per flop is 10^{-9} s,
00100       // and the timer resolution is 1 s (which is particularly
00101       // coarse).  We start with numTrials=10 and multiply by 10
00102       // each time.  Then, it may take 8 or 9 iterations (may have
00103       // to exceed the timer resolution, or just meet it) so the
00104       // timing loop exceeds the timer resolution.  Thus, maxCount=9
00105       // = log10(timer resolution / time per flop) is just enough.
00106       // Adding a bit more for wiggle room is a good idea.
00107       //
00108       // If your machine is exceptionally fast or your compiler
00109       // autoparallelizes the timing loop, you may have to increase
00110       // maxCount.  We set it to 20 as a generous upper bound.
00111       const size_t maxCount = 20;
00112       size_t count = 0;
00113 
00114       // Don't let numTrials loop around.  (We're multiplying it by
00115       // 10 each time, so it won't take long for this to happen,
00116       // especially if the timer granularity is too coarse.)
00117       const size_t maxNumTrials = std::numeric_limits<size_t>::max() / 10;
00118 
00119       // Do some fake work.  Multiply the number of trials by 10
00120       // each time, so that resolution is expressed as an order of
00121       // decimal magnitude.
00122       size_t numTrials = 1;
00123       double theTime;
00124       do {
00125   const double eps = Teuchos::ScalarTraits<double>::eps();
00126   double fake = Teuchos::ScalarTraits<double>::one();
00127   numTrials *= 10;
00128   ++count;
00129 
00130   // The timing loop.
00131   timer.start();
00132   for (size_t trial = 0; trial < numTrials; ++trial)
00133     fake += eps;
00134   theTime = timer.stop();
00135 
00136       } while (theTime == 0 && count < maxCount && numTrials < maxNumTrials);
00137 
00138       if (theTime == 0)
00139   {
00140     std::ostringstream os;
00141     os << "Maximum number of loops " << maxCount << " exceeded when "
00142       "computing timer resolution.  Largest timing loop length tried: " 
00143        << numTrials << ".";
00144     throw std::logic_error (os.str());
00145   }
00146       else if (numTrials >= maxNumTrials)
00147   {
00148     std::ostringstream os;
00149     os << "Maximum number of timing loop iterations " << maxNumTrials 
00150        << " exceeded when computing timer resolution.  Largest timing "
00151       "loop length tried: " << numTrials << ".";
00152     throw std::logic_error (os.str());
00153   }
00154       else
00155   return theTime;
00156     }
00157 
00181     template<class Ordinal, class Scalar, class CombineType, class TimerType>
00182     class CombineBenchmarker {
00183     public:
00184       typedef Ordinal ordinal_type;
00185       typedef Scalar scalar_type;
00186       typedef CombineType combine_type;
00187       typedef TimerType timer_type;
00188 
00189     private:
00190       typedef Teuchos::ScalarTraits<scalar_type> STS;
00191       typedef typename STS::magnitudeType magnitude_type;
00192       typedef Teuchos::ScalarTraits<magnitude_type> STM;
00193       typedef TSQR::Random::NormalGenerator<ordinal_type, scalar_type> normgen_type;
00194       typedef TSQR::Random::MatrixGenerator<ordinal_type, scalar_type, normgen_type> matgen_type;
00195       typedef Matrix<ordinal_type, scalar_type> matrix_type;
00196 
00197     public:
00198 
00208       CombineBenchmarker (const double timerRes,
00209         const std::vector<int>& iseed) : 
00210   normGenS_ (iseed), 
00211   normGenM_ (iseed),
00212   timerResolution_ (timerRes)
00213       {
00214   TSQR::Test::verifyTimerConcept<timer_type> ();
00215       }
00216 
00223       CombineBenchmarker (const std::vector<int>& iseed) : 
00224   normGenS_ (iseed), 
00225   normGenM_ (iseed),
00226   timerResolution_ (computeTimerResolution<timer_type> ())
00227       {}
00228 
00233       CombineBenchmarker (const double timerRes) :
00234   timerResolution_ (timerRes)
00235       {
00236   TSQR::Test::verifyTimerConcept<timer_type> ();
00237       }
00238 
00239 
00241       CombineBenchmarker () :
00242   timerResolution_ (computeTimerResolution<timer_type> ())
00243       {}
00244 
00246       void 
00247       getSeed (std::vector<int>& iseed) const
00248       {
00249   normGenS_.getSeed (iseed);
00250       }
00251 
00253       double 
00254       timerResolution() const {
00255   return timerResolution_;
00256       }
00257 
00281       std::pair<int, double>
00282       calibrateFirst (const Ordinal numRows,
00283           const Ordinal numCols,
00284           const double accuracyFactor)
00285       {
00286   if (numRows == 0 || numCols == 0)
00287     throw std::invalid_argument("Calibrating timings is impossible for "
00288               "a matrix with either zero rows or zero "
00289               "columns.");
00290   else if (accuracyFactor < 0)
00291     throw std::invalid_argument("Accuracy factor for Combine numTrials "
00292               "calibration must be nonnegative.");
00293   // Random matrix generator.
00294   matgen_type matGen (normGenS_);
00295 
00296   // Generate a random cache block A.
00297   matrix_type A (numRows, numCols);
00298   std::vector<magnitude_type> sigmas (numCols);
00299   randomSingularValues (sigmas, numCols);
00300   matGen.fill_random_svd (numRows, numCols, A.get(), A.lda(), &sigmas[0]);
00301 
00302   // A place to put the Q factor.
00303   matrix_type Q (numRows, numCols);
00304   Q.fill (STS::zero());
00305   for (Ordinal j = 0; j < numCols; ++j)
00306     Q(j,j) = STS::one();
00307 
00308   // TAU array (Householder reflector scaling factors).
00309   std::vector<Scalar> tau (numCols);
00310   // Work space array for factorization and applying the Q factor.
00311   std::vector<Scalar> work (numCols);
00312 
00313   // The Combine instance to benchmark.
00314   combine_type combiner; 
00315 
00316   // A few warmup runs just to avoid timing anomalies.
00317   const int numWarmupRuns = 3;
00318   for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun)
00319     {
00320       combiner.factor_first (numRows, numCols, A.get(), A.lda(),
00321            &tau[0], &work[0]);
00322       combiner.apply_first (ApplyType("N"), numRows, numCols, numCols,
00323           A.get(), A.lda(), &tau[0],
00324           Q.get(), Q.lda(), &work[0]);
00325     }
00326 
00327   // How much time numTrials runs must take in order for
00328   // numTrials to be considered sufficiently large.
00329   const double minAcceptableTime = accuracyFactor * timerResolution();
00330 
00331   timer_type timer ("Combine first");
00332 
00333   // The actual timing runs.  Repeat, doubling numTrials each
00334   // time, until the resulting timing is at least timerRes *
00335   // accuracyFactor.  Also, we mandate somewhat arbitrarily that
00336   // numTrials >= 4; this gives us some buffer against timer
00337   // variability.  Finally, don't let numTrials loop around.
00338   // (We're doubling it each time, so it won't take long for
00339   // this to happen, especially if something is wrong with the
00340   // benchmark and it's taking zero time, or if accuracyFactor
00341   // is too large, or if the timer resolution is too large.)
00342   const int maxNumTrials = std::numeric_limits<int>::max() / 2;
00343   double theTime;
00344   int numTrials = 2;
00345   do {
00346     numTrials *= 2; // First value of numTrials is 4.
00347     timer.start();
00348     for (int trial = 0; trial < numTrials; ++trial)
00349       {
00350         combiner.factor_first (numRows, numCols, A.get(), A.lda(),
00351              &tau[0], &work[0]);
00352         combiner.apply_first (ApplyType("N"), numRows, numCols, numCols,
00353             A.get(), A.lda(), &tau[0],
00354             Q.get(), Q.lda(), &work[0]);
00355       }
00356     theTime = timer.stop();
00357   } while (theTime < minAcceptableTime && numTrials < maxNumTrials);
00358 
00359   return std::make_pair (numTrials, theTime);
00360       }
00361 
00378       double
00379       benchmarkFirst (const Ordinal numRows,
00380           const Ordinal numCols,
00381           const int numTrials)
00382       {
00383   if (numRows == 0 || numCols == 0)
00384     throw std::invalid_argument("Benchmarking does not make sense for "
00385               "a matrix with either zero rows or zero "
00386               "columns.");
00387   TEUCHOS_TEST_FOR_EXCEPTION(numTrials < 1, std::invalid_argument,
00388          "The number of trials must be positive, but "
00389          "numTrials = " << numTrials << ".");
00390 
00391   // Random matrix generator.
00392   matgen_type matGen (normGenS_);
00393 
00394   // Generate a random cache block A.
00395   matrix_type A (numRows, numCols);
00396   std::vector<magnitude_type> sigmas (numCols);
00397   randomSingularValues (sigmas, numCols);
00398   matGen.fill_random_svd (numRows, numCols, A.get(), A.lda(), &sigmas[0]);
00399 
00400   // A place to put the Q factor.
00401   matrix_type Q (numRows, numCols);
00402   Q.fill (STS::zero());
00403   for (Ordinal j = 0; j < numCols; ++j)
00404     Q(j,j) = STS::one();
00405 
00406   // TAU array (Householder reflector scaling factors).
00407   std::vector<Scalar> tau (numCols);
00408   // Work space array for factorization and applying the Q factor.
00409   std::vector<Scalar> work (numCols);
00410 
00411   // The Combine instance to benchmark.
00412   combine_type combiner; 
00413 
00414   // A few warmup runs just to avoid timing anomalies.
00415   const int numWarmupRuns = 3;
00416   for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun)
00417     {
00418       combiner.factor_first (numRows, numCols, A.get(), A.lda(),
00419            &tau[0], &work[0]);
00420       combiner.apply_first (ApplyType("N"), numRows, numCols, numCols,
00421           A.get(), A.lda(), &tau[0],
00422           Q.get(), Q.lda(), &work[0]);
00423     }
00424   //
00425   // The actual timing runs.
00426   //
00427   timer_type timer ("Combine first");
00428   timer.start();
00429   for (int trial = 0; trial < numTrials; ++trial)
00430     {
00431       combiner.factor_first (numRows, numCols, A.get(), A.lda(),
00432            &tau[0], &work[0]);
00433       combiner.apply_first (ApplyType("N"), numRows, numCols, numCols,
00434           A.get(), A.lda(), &tau[0],
00435           Q.get(), Q.lda(), &work[0]);
00436     }
00437   return timer.stop();
00438       }
00439 
00465       std::pair<int, double>
00466       calibrateCacheBlock (const Ordinal numRows,
00467          const Ordinal numCols,
00468          const double accuracyFactor)
00469       {
00470   if (numRows == 0 || numCols == 0)
00471     throw std::invalid_argument("Calibrating timings is impossible for "
00472               "a matrix with either zero rows or zero "
00473               "columns.");
00474   else if (accuracyFactor < 0)
00475     throw std::invalid_argument("Accuracy factor for Combine numTrials "
00476               "calibration must be nonnegative.");
00477   // Random matrix generator.
00478   matgen_type matGen (normGenS_);
00479 
00480   // Generate a random R factor first.
00481   matrix_type R (numCols, numCols);
00482   std::vector<magnitude_type> sigmas (numCols);
00483   randomSingularValues (sigmas, numCols);
00484   matGen.fill_random_R (numCols, R.get(), R.lda(), &sigmas[0]);
00485 
00486   // Now generate a random cache block.
00487   matrix_type A (numRows, numCols);
00488   randomSingularValues (sigmas, numCols);
00489   matGen.fill_random_svd (numRows, numCols, A.get(), A.lda(), &sigmas[0]);
00490 
00491   // A place to put the Q factor.
00492   matrix_type Q (numRows + numCols, numCols);
00493   Q.fill (STS::zero());
00494   for (Ordinal j = 0; j < numCols; ++j)
00495     Q(j,j) = STS::one();
00496 
00497   // TAU array (Householder reflector scaling factors).
00498   std::vector<Scalar> tau (numCols);
00499   // Work space array for factorization and applying the Q factor.
00500   std::vector<Scalar> work (numCols);
00501 
00502   // The Combine instance to benchmark.
00503   combine_type combiner; 
00504 
00505   // A few warmup runs just to avoid timing anomalies.
00506   const int numWarmupRuns = 3;
00507   for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun)
00508     {
00509       combiner.factor_inner (numRows, numCols, R.get(), R.lda(),
00510            A.get(), A.lda(), &tau[0], &work[0]);
00511       combiner.apply_inner (ApplyType("N"), numRows, numCols, numCols,
00512           A.get(), A.lda(), &tau[0], 
00513           &Q(0, 0), Q.lda(),
00514           &Q(numCols, 0), Q.lda(), 
00515           &work[0]);
00516     }
00517 
00518   // How much time numTrials runs must take in order for
00519   // numTrials to be considered sufficiently large.
00520   const double minAcceptableTime = accuracyFactor * timerResolution();
00521 
00522   timer_type timer ("Combine cache block");
00523 
00524   // The actual timing runs.  Repeat, doubling numTrials each
00525   // time, until the resulting timing is at least timerRes *
00526   // accuracyFactor.  Also, we mandate somewhat arbitrarily that
00527   // numTrials >= 4; this gives us some buffer against timer
00528   // variability.  Finally, don't let numTrials loop around.
00529   // (We're doubling it each time, so it won't take long for
00530   // this to happen, especially if something is wrong with the
00531   // benchmark and it's taking zero time, or if accuracyFactor
00532   // is too large, or if the timer resolution is too large.)
00533   const int maxNumTrials = std::numeric_limits<int>::max() / 2;
00534   double theTime;
00535   int numTrials = 2;
00536   do {
00537     numTrials *= 2; // First value of numTrials is 4.
00538     timer.start();
00539     for (int trial = 0; trial < numTrials; ++trial)
00540       {
00541         combiner.factor_inner (numRows, numCols, R.get(), R.lda(),
00542              A.get(), A.lda(), &tau[0], &work[0]);
00543         combiner.apply_inner (ApplyType("N"), numRows, numCols, numCols,
00544             A.get(), A.lda(), &tau[0], 
00545             &Q(0, 0), Q.lda(),
00546             &Q(numCols, 0), Q.lda(), 
00547             &work[0]);
00548       }
00549     theTime = timer.stop();
00550   } while (theTime < minAcceptableTime && numTrials < maxNumTrials);
00551 
00552   return std::make_pair (numTrials, theTime);
00553       }
00554 
00555 
00574       double
00575       benchmarkCacheBlock (const Ordinal numRows,
00576          const Ordinal numCols,
00577          const int numTrials)
00578       {
00579   if (numRows == 0 || numCols == 0)
00580     throw std::invalid_argument("Benchmarking does not make sense for "
00581               "a matrix with either zero rows or zero "
00582               "columns.");
00583   TEUCHOS_TEST_FOR_EXCEPTION(numTrials < 1, std::invalid_argument,
00584          "The number of trials must be positive, but "
00585          "numTrials = " << numTrials << ".");
00586 
00587   // Random matrix generator.
00588   matgen_type matGen (normGenS_);
00589 
00590   // Generate a random R factor first.
00591   matrix_type R (numCols, numCols);
00592   std::vector<magnitude_type> sigmas (numCols);
00593   randomSingularValues (sigmas, numCols);
00594   matGen.fill_random_R (numCols, R.get(), R.lda(), &sigmas[0]);
00595 
00596   // Now generate a random cache block.
00597   matrix_type A (numRows, numCols);
00598   randomSingularValues (sigmas, numCols);
00599   matGen.fill_random_svd (numRows, numCols, A.get(), A.lda(), &sigmas[0]);
00600 
00601   // A place to put the Q factor.
00602   matrix_type Q (numRows + numCols, numCols);
00603   Q.fill (STS::zero());
00604   for (Ordinal j = 0; j < numCols; ++j)
00605     Q(j,j) = STS::one();
00606 
00607   // TAU array (Householder reflector scaling factors).
00608   std::vector<Scalar> tau (numCols);
00609   // Work space array for factorization and applying the Q factor.
00610   std::vector<Scalar> work (numCols);
00611 
00612   // The Combine instance to benchmark.
00613   combine_type combiner; 
00614 
00615   // A few warmup runs just to avoid timing anomalies.
00616   const int numWarmupRuns = 3;
00617   for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun)
00618     {
00619       combiner.factor_inner (numRows, numCols, R.get(), R.lda(),
00620            A.get(), A.lda(), &tau[0], &work[0]);
00621       combiner.apply_inner (ApplyType("N"), numRows, numCols, numCols,
00622           A.get(), A.lda(), &tau[0], 
00623           &Q(0, 0), Q.lda(),
00624           &Q(numCols, 0), Q.lda(), 
00625           &work[0]);
00626     }
00627   //
00628   // The actual timing runs.
00629   //
00630   timer_type timer ("Combine cache block");
00631   timer.start();
00632   for (int trial = 0; trial < numTrials; ++trial)
00633     {
00634       combiner.factor_inner (numRows, numCols, R.get(), R.lda(),
00635            A.get(), A.lda(), &tau[0], &work[0]);
00636       combiner.apply_inner (ApplyType("N"), numRows, numCols, numCols,
00637           A.get(), A.lda(), &tau[0], 
00638           &Q(0, 0), Q.lda(),
00639           &Q(numCols, 0), Q.lda(), 
00640           &work[0]);
00641     }
00642   return timer.stop();
00643       }
00644 
00668       std::pair<int, double>
00669       calibratePair (const Ordinal numCols,
00670          const double accuracyFactor)
00671       {
00672   if (numCols == 0)
00673     throw std::invalid_argument("Calibrating timings is impossible for "
00674               "a matrix with zero columns.");
00675   else if (accuracyFactor < 0)
00676     throw std::invalid_argument("Accuracy factor for Combine numTrials "
00677               "calibration must be nonnegative.");
00678   // Random matrix generator.
00679   matgen_type matGen (normGenS_);
00680 
00681   // Generate R1 first.
00682   matrix_type R1 (numCols, numCols);
00683   std::vector<magnitude_type> sigmas (numCols);
00684   randomSingularValues (sigmas, numCols);
00685   matGen.fill_random_R (numCols, R1.get(), R1.lda(), &sigmas[0]);
00686 
00687   // Now generate R2.
00688   matrix_type R2 (numCols, numCols);
00689   randomSingularValues (sigmas, numCols);
00690   matGen.fill_random_R (numCols, R2.get(), R2.lda(), &sigmas[0]);
00691 
00692   // A place to put the Q factor of [R1; R2].
00693   matrix_type Q (2*numCols, numCols);
00694   Q.fill (STS::zero());
00695   for (Ordinal j = 0; j < numCols; ++j)
00696     Q(j,j) = STS::one();
00697 
00698   // TAU array (Householder reflector scaling factors).
00699   std::vector<Scalar> tau (numCols);
00700   // Work space array for factorization and applying the Q factor.
00701   std::vector<Scalar> work (numCols);
00702 
00703   // The Combine instance to benchmark.
00704   combine_type combiner; 
00705 
00706   // A few warmup runs just to avoid timing anomalies.
00707   const int numWarmupRuns = 3;
00708   for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun)
00709     {
00710       combiner.factor_pair (numCols, R1.get(), R1.lda(), 
00711           R2.get(), R2.lda(),
00712           &tau[0], &work[0]);
00713       combiner.apply_pair (ApplyType("N"), numCols, numCols, 
00714          R2.get(), R2.lda(), &tau[0], 
00715          &Q(0, 0), Q.lda(),
00716          &Q(numCols, 0), Q.lda(),
00717          &work[0]);
00718     }
00719 
00720   // How much time numTrials runs must take in order for
00721   // numTrials to be considered sufficiently large.
00722   const double minAcceptableTime = accuracyFactor * timerResolution();
00723 
00724   timer_type timer ("Combine pair");
00725 
00726   // The actual timing runs.  Repeat, doubling numTrials each
00727   // time, until the resulting timing is at least timerRes *
00728   // accuracyFactor.  Also, we mandate somewhat arbitrarily that
00729   // numTrials >= 4; this gives us some buffer against timer
00730   // variability.  Finally, don't let numTrials loop around.
00731   // (We're doubling it each time, so it won't take long for
00732   // this to happen, especially if something is wrong with the
00733   // benchmark and it's taking zero time, or if accuracyFactor
00734   // is too large, or if the timer resolution is too large.)
00735   const int maxNumTrials = std::numeric_limits<int>::max() / 2;
00736   double theTime;
00737   int numTrials = 2;
00738   do {
00739     numTrials *= 2; // First value of numTrials is 4.
00740     timer.start();
00741     for (int trial = 0; trial < numTrials; ++trial)
00742       {
00743         combiner.factor_pair (numCols, R1.get(), R1.lda(), 
00744             R2.get(), R2.lda(),
00745             &tau[0], &work[0]);
00746         combiner.apply_pair (ApplyType("N"), numCols, numCols, 
00747            R2.get(), R2.lda(), &tau[0], 
00748            &Q(0, 0), Q.lda(),
00749            &Q(numCols, 0), Q.lda(),
00750            &work[0]);
00751       }
00752     theTime = timer.stop();
00753   } while (theTime < minAcceptableTime && numTrials < maxNumTrials);
00754 
00755   return std::make_pair (numTrials, theTime);
00756       }
00757 
00758 
00775       double
00776       benchmarkPair (const Ordinal numCols,
00777          const int numTrials)
00778       {
00779   if (numCols == 0)
00780     throw std::invalid_argument("Benchmarking does not make sense for "
00781               "a matrix with zero columns.");
00782   TEUCHOS_TEST_FOR_EXCEPTION(numTrials < 1, std::invalid_argument,
00783          "The number of trials must be positive, but "
00784          "numTrials = " << numTrials << ".");
00785 
00786   // Random matrix generator.
00787   matgen_type matGen (normGenS_);
00788 
00789   // Generate R1 first.
00790   matrix_type R1 (numCols, numCols);
00791   std::vector<magnitude_type> sigmas (numCols);
00792   randomSingularValues (sigmas, numCols);
00793   matGen.fill_random_R (numCols, R1.get(), R1.lda(), &sigmas[0]);
00794 
00795   // Now generate R2.
00796   matrix_type R2 (numCols, numCols);
00797   randomSingularValues (sigmas, numCols);
00798   matGen.fill_random_R (numCols, R2.get(), R2.lda(), &sigmas[0]);
00799 
00800   // A place to put the Q factor of [R1; R2].
00801   matrix_type Q (2*numCols, numCols);
00802   Q.fill (STS::zero());
00803   for (Ordinal j = 0; j < numCols; ++j)
00804     Q(j,j) = STS::one();
00805 
00806   // TAU array (Householder reflector scaling factors).
00807   std::vector<Scalar> tau (numCols);
00808   // Work space array for factorization and applying the Q factor.
00809   std::vector<Scalar> work (numCols);
00810 
00811   // The Combine instance to benchmark.
00812   combine_type combiner; 
00813 
00814   // A few warmup runs just to avoid timing anomalies.
00815   const int numWarmupRuns = 3;
00816   for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun)
00817     {
00818       combiner.factor_pair (numCols, R1.get(), R1.lda(), 
00819           R2.get(), R2.lda(),
00820           &tau[0], &work[0]);
00821       combiner.apply_pair (ApplyType("N"), numCols, numCols, 
00822          R2.get(), R2.lda(), &tau[0], 
00823          &Q(0, 0), Q.lda(),
00824          &Q(numCols, 0), Q.lda(),
00825          &work[0]);
00826     }
00827   //
00828   // The actual timing runs.
00829   //
00830   timer_type timer ("Combine pair");
00831   timer.start();
00832   for (int trial = 0; trial < numTrials; ++trial)
00833     {
00834       combiner.factor_pair (numCols, R1.get(), R1.lda(), 
00835           R2.get(), R2.lda(),
00836           &tau[0], &work[0]);
00837       combiner.apply_pair (ApplyType("N"), numCols, numCols, 
00838          R2.get(), R2.lda(), &tau[0], 
00839          &Q(0, 0), Q.lda(),
00840          &Q(numCols, 0), Q.lda(),
00841          &work[0]);
00842     }
00843   return timer.stop();
00844       }
00845 
00846     private:
00847 
00849       TSQR::Random::NormalGenerator<ordinal_type, scalar_type> normGenS_;
00850 
00852       TSQR::Random::NormalGenerator<ordinal_type, magnitude_type> normGenM_;
00853 
00855       double timerResolution_;
00856 
00863       void
00864       randomSingularValues (std::vector<magnitude_type>& sigmas,
00865           const Ordinal numValues)
00866       {
00867   // Cast to avoid compiler warnings for signed / unsigned
00868   // comparisons.
00869   typedef typename std::vector<magnitude_type>::size_type size_type;
00870   if (sigmas.size() < static_cast<size_type> (numValues))
00871     sigmas.resize (numValues);
00872       
00873   // Relative amount by which to perturb each singular value.  The
00874   // perturbation will be multiplied by a normal(0,1) pseudorandom
00875   // number drawn from magGen.
00876   const magnitude_type perturbationFactor = magnitude_type(10) * STM::eps();
00877   const magnitude_type one = STM::one();
00878   for (Ordinal k = 0; k < numValues; ++k)
00879     {
00880       magnitude_type perturbation = perturbationFactor * normGenM_();
00881       // If (1 - perturbation) is a small or nonpositive number,
00882       // subtract instead.
00883       if (one - perturbation <= perturbationFactor)
00884         perturbation = -perturbation;
00885       sigmas[k] = one - perturbation;
00886     }
00887       }
00888     };
00889 
00890   } // namespace Test
00891 } // namespace TSQR
00892 
00893 #endif // __Tsqr_CombineBenchmarker_hpp
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends