Kokkos Node API and Local Linear Algebra Kernels Version of the Day
Tsqr_ParTest.hpp
00001 //@HEADER
00002 // ************************************************************************
00003 // 
00004 //          Kokkos: Node API and Parallel Node Kernels
00005 //              Copyright (2009) Sandia Corporation
00006 // 
00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
00008 // license for use of this work by or on behalf of the U.S. Government.
00009 // 
00010 // This library is free software; you can redistribute it and/or modify
00011 // it under the terms of the GNU Lesser General Public License as
00012 // published by the Free Software Foundation; either version 2.1 of the
00013 // License, or (at your option) any later version.
00014 //  
00015 // This library is distributed in the hope that it will be useful, but
00016 // WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 // Lesser General Public License for more details.
00019 //  
00020 // You should have received a copy of the GNU Lesser General Public
00021 // License along with this library; if not, write to the Free Software
00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00023 // USA
00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
00025 // 
00026 // ************************************************************************
00027 //@HEADER
00028 
00029 #ifndef __TSQR_Test_DistTest_hpp
00030 #define __TSQR_Test_DistTest_hpp
00031 
00032 #include <Tsqr_ConfigDefs.hpp>
00033 #include <Tsqr_Random_NormalGenerator.hpp>
00034 #include <Tsqr_verifyTimerConcept.hpp>
00035 
00036 #include <Tsqr_generateStack.hpp>
00037 #include <Tsqr_DistTsqr.hpp>
00038 #include <Tsqr_GlobalTimeStats.hpp>
00039 #include <Tsqr_GlobalVerify.hpp>
00040 #include <Tsqr_printGlobalMatrix.hpp>
00041 
00042 #include <algorithm>
00043 #include <iomanip>
00044 #include <iostream>
00045 #include <vector>
00046 
00047 
00048 namespace TSQR {
00049   namespace Test {
00050 
00054     template<class Ordinal, class Scalar>
00055     class DistTsqrVerifier {
00056       TSQR::Random::NormalGenerator<Ordinal, Scalar> gen_;
00057       Teuchos::RCP<MessengerBase<Ordinal> > const ordinalComm_;
00058       Teuchos::RCP<MessengerBase<Scalar> > const scalarComm_;
00059       std::string scalarTypeName_;
00060       std::ostream& out_;
00061       std::ostream& err_;
00062       const bool testFactorExplicit_, testFactorImplicit_;
00063       const bool humanReadable_, printMatrices_, debug_;
00064 
00065     public:
00066       typedef Ordinal ordinal_type;
00067       typedef Scalar scalar_type;
00068       typedef typename Teuchos::ScalarTraits<scalar_type>::magnitudeType magnitude_type;
00069       typedef typename std::vector<magnitude_type> result_type;
00070       typedef Matrix<ordinal_type, scalar_type> matrix_type;
00071       
00092       DistTsqrVerifier (const Teuchos::RCP< MessengerBase< Ordinal > >& ordinalComm,
00093       const Teuchos::RCP< MessengerBase< Scalar > >& scalarComm,
00094       const std::vector<int>& seed,
00095       const std::string& scalarTypeName,
00096       std::ostream& out,
00097       std::ostream& err,
00098       const bool testFactorExplicit,
00099       const bool testFactorImplicit,
00100       const bool humanReadable,
00101       const bool printMatrices,
00102       const bool debug) :
00103   gen_ (seed), 
00104   ordinalComm_ (ordinalComm),
00105   scalarComm_ (scalarComm),
00106   scalarTypeName_ (scalarTypeName), 
00107   out_ (out), 
00108   err_ (err),
00109   testFactorExplicit_ (testFactorExplicit),
00110   testFactorImplicit_ (testFactorImplicit),
00111   humanReadable_ (humanReadable), 
00112   printMatrices_ (printMatrices),
00113   debug_ (debug)
00114       {}
00115 
00137       DistTsqrVerifier (const Teuchos::RCP< MessengerBase< Ordinal > >& ordinalComm,
00138       const Teuchos::RCP< MessengerBase< Scalar > >& scalarComm,
00139       const std::string& scalarTypeName,
00140       std::ostream& out,
00141       std::ostream& err,
00142       const bool testFactorExplicit,
00143       const bool testFactorImplicit,
00144       const bool humanReadable,
00145       const bool printMatrices,
00146       const bool debug) :
00147   ordinalComm_ (ordinalComm),
00148   scalarComm_ (scalarComm),
00149   scalarTypeName_ (scalarTypeName), 
00150   out_ (out), 
00151   err_ (err),
00152   testFactorExplicit_ (testFactorExplicit),
00153   testFactorImplicit_ (testFactorImplicit),
00154   humanReadable_ (humanReadable), 
00155   printMatrices_ (printMatrices),
00156   debug_ (debug)
00157       {}
00158 
00165       void 
00166       getSeed (std::vector<int>& seed) const
00167       {
00168   gen_.getSeed (seed);
00169       }
00170 
00175       void 
00176       verify (const Ordinal numCols, 
00177         const std::string& additionalFieldNames,
00178         const std::string& additionalData,
00179         const bool printFieldNames)
00180       {
00181   using std::endl;
00182 
00183   const int myRank = scalarComm_->rank();
00184   if (debug_)
00185     {
00186       scalarComm_->barrier();
00187       if (myRank == 0)
00188         err_ << "Verifying DistTsqr:" << endl;
00189       scalarComm_->barrier();
00190     }
00191 
00192   // Generate test problem.
00193   Matrix< Ordinal, Scalar > A_local, Q_local, R;
00194   testProblem (A_local, Q_local, R, numCols);
00195   if (debug_)
00196     {
00197       scalarComm_->barrier();
00198       if (myRank == 0)
00199         err_ << "-- Generated test problem." << endl;
00200       scalarComm_->barrier();
00201     }
00202 
00203   // Set up TSQR implementation.
00204   DistTsqr< Ordinal, Scalar > par (scalarComm_);
00205   if (debug_)
00206     {
00207       scalarComm_->barrier();
00208       if (myRank == 0)
00209         err_ << "-- DistTsqr object initialized" << endl << endl;
00210     }
00211 
00212   // Whether we've printed field names (i.e., column headers)
00213   // yet.  Only matters for non-humanReadable output.
00214   bool printedFieldNames = false;
00215 
00216   // Test DistTsqr::factor() and DistTsqr::explicit_Q().
00217   if (testFactorImplicit_)
00218     {
00219       // Factor the matrix A (copied into R, which will be
00220       // overwritten on output)
00221       typedef typename DistTsqr< Ordinal, Scalar >::FactorOutput 
00222         factor_output_type;
00223       factor_output_type factorOutput = par.factor (R.view());
00224       if (debug_)
00225         {
00226     scalarComm_->barrier();
00227     if (myRank == 0)
00228       err_ << "-- Finished DistTsqr::factor" << endl;
00229         }
00230       // Compute the explicit Q factor
00231       par.explicit_Q (numCols, Q_local.get(), Q_local.lda(), factorOutput);
00232       if (debug_)
00233         {
00234     scalarComm_->barrier();
00235     if (myRank == 0)
00236       err_ << "-- Finished DistTsqr::explicit_Q" << endl;
00237         }
00238       // Verify the factorization
00239       result_type result = 
00240         global_verify (numCols, numCols, A_local.get(), A_local.lda(),
00241            Q_local.get(), Q_local.lda(), R.get(), R.lda(), 
00242            scalarComm_.get());
00243       if (debug_)
00244         {
00245     scalarComm_->barrier();
00246     if (myRank == 0)
00247       err_ << "-- Finished global_verify" << endl;
00248         }
00249       reportResults ("DistTsqr", numCols, result, 
00250          additionalFieldNames, additionalData,
00251          printFieldNames && (! printedFieldNames));
00252       if (printFieldNames && (! printedFieldNames))
00253         printedFieldNames = true;
00254     }
00255 
00256   // Test DistTsqr::factorExplicit()
00257   if (testFactorExplicit_)
00258     {
00259       // Factor the matrix and compute the explicit Q factor, both
00260       // in a single operation.
00261       par.factorExplicit (R.view(), Q_local.view());
00262       if (debug_)
00263         {
00264     scalarComm_->barrier();
00265     if (myRank == 0)
00266       err_ << "-- Finished DistTsqr::factorExplicit" << endl;
00267         }
00268 
00269       if (printMatrices_)
00270         {
00271     if (myRank == 0)
00272       err_ << std::endl << "Computed Q factor:" << std::endl;
00273     printGlobalMatrix (err_, Q_local, scalarComm_.get(), ordinalComm_.get());
00274     if (myRank == 0)
00275       {
00276         err_ << std::endl << "Computed R factor:" << std::endl;
00277         print_local_matrix (err_, R.nrows(), R.ncols(), R.get(), R.lda());
00278         err_ << std::endl;
00279       }
00280         }
00281 
00282       // Verify the factorization
00283       result_type result = 
00284         global_verify (numCols, numCols, A_local.get(), A_local.lda(),
00285            Q_local.get(), Q_local.lda(), R.get(), R.lda(), 
00286            scalarComm_.get());
00287       if (debug_)
00288         {
00289     scalarComm_->barrier();
00290     if (myRank == 0)
00291       err_ << "-- Finished global_verify" << endl;
00292         }
00293       reportResults ("DistTsqrRB", numCols, result, 
00294          additionalFieldNames, additionalData,
00295          printFieldNames && (! printedFieldNames));
00296       if (printFieldNames && (! printedFieldNames))
00297         printedFieldNames = true;
00298     }
00299       }
00300 
00301     private:
00308       void 
00309       reportResults (const std::string& method, 
00310          const Ordinal numCols, 
00311          const result_type& result,
00312          const std::string& additionalFieldNames,
00313          const std::string& additionalData,
00314          const bool printFieldNames)
00315       {
00316   using std::endl;
00317 
00318   const int numProcs = scalarComm_->size();
00319   const int myRank = scalarComm_->rank();
00320 
00321   if (myRank == 0)
00322     {
00323       if (humanReadable_)
00324         {
00325     out_ << method << " accuracy results:" << endl
00326          << "Scalar type = " << scalarTypeName_ << endl
00327          << "Number of columns = " << numCols << endl
00328          << "Number of (MPI) processes = " << numProcs << endl
00329          << "Absolute residual $\\| A - Q R \\|_2: "
00330          << result[0] << endl
00331          << "Absolute orthogonality $\\| I - Q^* Q \\|_2$: " 
00332          << result[1] << endl
00333          << "Test matrix norm $\\| A \\|_F$: "
00334          << result[2] << endl;
00335         }
00336       else
00337         {
00338     // Use scientific notation for floating-point numbers
00339     out_ << std::scientific;
00340 
00341     if (printFieldNames)
00342       {
00343         out_ << "%method,scalarType,numCols,numProcs"
00344           ",absFrobResid,absFrobOrthog,frobA";
00345         if (! additionalFieldNames.empty())
00346           out_ << "," << additionalFieldNames;
00347         out_ << endl;
00348       }
00349 
00350     out_ << method
00351          << "," << scalarTypeName_
00352          << "," << numCols
00353          << "," << numProcs
00354          << "," << result[0]
00355          << "," << result[1]
00356          << "," << result[2];
00357     if (! additionalData.empty())
00358       out_ << "," << additionalData;
00359     out_ << endl;
00360         }
00361     }
00362       }
00363 
00364       void 
00365       testProblem (Matrix< Ordinal, Scalar >& A_local,
00366        Matrix< Ordinal, Scalar >& Q_local,
00367        Matrix< Ordinal, Scalar >& R,
00368        const Ordinal numCols)
00369       {
00370   const Ordinal numRowsLocal = numCols;
00371 
00372   // A_local: Space for the matrix A to factor -- local to each
00373   //   processor.
00374   //
00375   // A_global: Global matrix (only nonempty on Proc 0); only
00376   //   used temporarily.
00377   Matrix< Ordinal, Scalar > A_global;
00378 
00379   // This modifies A_local on all procs, and A_global on Proc 0.
00380   par_tsqr_test_problem (gen_, A_local, A_global, numCols, scalarComm_);
00381   
00382   if (printMatrices_)
00383     {
00384       const int myRank = scalarComm_->rank();
00385 
00386       if (myRank == 0)
00387         err_ << "Input matrix A:" << std::endl;
00388       printGlobalMatrix (err_, A_local, scalarComm_.get(), ordinalComm_.get());
00389       if (myRank == 0)
00390         err_ << std::endl;
00391     }
00392 
00393   // Copy the test problem input into R, since the factorization
00394   // will overwrite it in place with the final R factor.
00395   R.reshape (numCols, numCols);
00396   R.fill (Scalar(0));
00397   R.copy (A_local);
00398 
00399   // Prepare space in which to construct the explicit Q factor
00400   // (local component on this processor)
00401   Q_local.reshape (numRowsLocal, numCols);
00402   Q_local.fill (Scalar(0));
00403       }
00404     };
00405 
00406 
00410     template< class Ordinal, class Scalar, class TimerType >
00411     class DistTsqrBenchmarker {
00412       TSQR::Random::NormalGenerator< Ordinal, Scalar > gen_;
00413       Teuchos::RCP< MessengerBase< Scalar > > scalarComm_;
00414       Teuchos::RCP< MessengerBase< double > > doubleComm_; 
00415       std::string scalarTypeName_;
00416 
00417       std::ostream& out_;
00418       std::ostream& err_;
00419       const bool testFactorExplicit_, testFactorImplicit_;
00420       const bool humanReadable_, debug_;
00421 
00422     public:
00423       typedef Ordinal ordinal_type;
00424       typedef Scalar scalar_type;
00425       typedef typename Teuchos::ScalarTraits< scalar_type >::magnitudeType magnitude_type;
00426       typedef TimerType timer_type;
00427 
00451       DistTsqrBenchmarker (const Teuchos::RCP< MessengerBase< Scalar > >& scalarComm,
00452          const Teuchos::RCP< MessengerBase< double > >& doubleComm,
00453          const std::vector<int>& seed,
00454          const std::string& scalarTypeName,
00455          std::ostream& out,
00456          std::ostream& err,
00457          const bool testFactorExplicit,
00458          const bool testFactorImplicit,
00459          const bool humanReadable,
00460          const bool debug) :
00461   gen_ (seed), 
00462   scalarComm_ (scalarComm),
00463   doubleComm_ (doubleComm),
00464   scalarTypeName_ (scalarTypeName), 
00465   out_ (out), 
00466   err_ (err),
00467   testFactorExplicit_ (testFactorExplicit),
00468   testFactorImplicit_ (testFactorImplicit),
00469   humanReadable_ (humanReadable), 
00470   debug_ (debug)
00471       {}
00472 
00497       DistTsqrBenchmarker (const Teuchos::RCP< MessengerBase< Scalar > >& scalarComm,
00498          const Teuchos::RCP< MessengerBase< double > >& doubleComm,
00499          const std::string& scalarTypeName,
00500          std::ostream& out,
00501          std::ostream& err,
00502          const bool testFactorExplicit,
00503          const bool testFactorImplicit,
00504          const bool humanReadable,
00505          const bool debug) :
00506   scalarComm_ (scalarComm),
00507   doubleComm_ (doubleComm),
00508   scalarTypeName_ (scalarTypeName), 
00509   out_ (out), 
00510   err_ (err),
00511   testFactorExplicit_ (testFactorExplicit),
00512   testFactorImplicit_ (testFactorImplicit),
00513   humanReadable_ (humanReadable), 
00514   debug_ (debug)
00515       {}
00516 
00523       void 
00524       getSeed (std::vector<int>& seed) const
00525       {
00526   gen_.getSeed (seed);
00527       }
00528 
00535       void 
00536       benchmark (const int numTrials, 
00537      const Ordinal numCols,
00538      const std::string& additionalFieldNames,
00539      const std::string& additionalData,
00540      const bool printFieldNames)
00541       {
00542   using std::endl;
00543 
00544   // Set up test problem.
00545   Matrix< Ordinal, Scalar > A_local, Q_local, R;
00546   testProblem (A_local, Q_local, R, numCols);
00547 
00548   // Set up TSQR implementation.
00549   DistTsqr< Ordinal, Scalar > par (scalarComm_);
00550 
00551   // Whether we've printed field names (i.e., column headers)
00552   // yet.  Only matters for non-humanReadable output.
00553   bool printedFieldNames = false;
00554 
00555   if (testFactorImplicit_)
00556     {
00557       std::string timerName ("DistTsqr");
00558       typedef typename DistTsqr< Ordinal, Scalar >::FactorOutput 
00559         factor_output_type;
00560 
00561       // Throw away some number of runs, because some MPI libraries
00562       // (recent versions of OpenMPI at least) do autotuning for the
00563       // first few collectives calls.
00564       const int numThrowAwayRuns = 5;
00565       for (int runNum = 0; runNum < numThrowAwayRuns; ++runNum)
00566         {
00567     // Factor the matrix A (copied into R, which will be
00568     // overwritten on output)
00569     factor_output_type factorOutput = par.factor (R.view());
00570     // Compute the explicit Q factor
00571     par.explicit_Q (numCols, Q_local.get(), Q_local.lda(), factorOutput);
00572         }
00573 
00574       // Now do the actual timing runs.  Benchmark DistTsqr
00575       // (factor() and explicit_Q()) for numTrials trials.
00576       timer_type timer (timerName);
00577       timer.start();
00578       for (int trialNum = 0; trialNum < numTrials; ++trialNum)
00579         {
00580     // Factor the matrix A (copied into R, which will be
00581     // overwritten on output)
00582     factor_output_type factorOutput = par.factor (R.view());
00583     // Compute the explicit Q factor
00584     par.explicit_Q (numCols, Q_local.get(), Q_local.lda(), factorOutput);
00585         }
00586       // Cumulative timing on this MPI process.
00587       // "Cumulative" means the elapsed time of numTrials executions.
00588       const double localCumulativeTiming = timer.stop();
00589 
00590       // reportResults() must be called on all processes, since this
00591       // figures out the min and max timings over all processes.
00592       reportResults (timerName, numTrials, numCols, localCumulativeTiming, 
00593          additionalFieldNames, additionalData,
00594          printFieldNames && (! printedFieldNames));
00595       if (printFieldNames && (! printedFieldNames))
00596         printedFieldNames = true;
00597     }
00598 
00599   if (testFactorExplicit_)
00600     {
00601       std::string timerName ("DistTsqrRB");
00602 
00603       // Throw away some number of runs, because some MPI libraries
00604       // (recent versions of OpenMPI at least) do autotuning for the
00605       // first few collectives calls.
00606       const int numThrowAwayRuns = 5;
00607       for (int runNum = 0; runNum < numThrowAwayRuns; ++runNum)
00608         {
00609     par.factorExplicit (R.view(), Q_local.view());
00610         }
00611 
00612       // Benchmark DistTsqr::factorExplicit() for numTrials trials.
00613       timer_type timer (timerName);
00614       timer.start();
00615       for (int trialNum = 0; trialNum < numTrials; ++trialNum)
00616         {
00617     par.factorExplicit (R.view(), Q_local.view());
00618         }
00619       // Cumulative timing on this MPI process.
00620       // "Cumulative" means the elapsed time of numTrials executions.
00621       const double localCumulativeTiming = timer.stop();
00622 
00623       // Report cumulative (not per-invocation) timing results
00624       reportResults (timerName, numTrials, numCols, localCumulativeTiming, 
00625          additionalFieldNames, additionalData,
00626          printFieldNames && (! printedFieldNames));
00627       if (printFieldNames && (! printedFieldNames))
00628         printedFieldNames = true;
00629 
00630       // Per-invocation timings (for factorExplicit() benchmark
00631       // only).  localTimings were computed on this MPI process;
00632       // globalTimings are statistical summaries of those over
00633       // all MPI processes.  We only collect that data for
00634       // factorExplicit().
00635       std::vector< TimeStats > localTimings;
00636       std::vector< TimeStats > globalTimings;
00637       par.getFactorExplicitTimings (localTimings);
00638       for (std::vector< TimeStats >::size_type k = 0; k < localTimings.size(); ++k)
00639         globalTimings.push_back (globalTimeStats (doubleComm_, localTimings[k]));
00640       std::vector< std::string > timingLabels;
00641       par.getFactorExplicitTimingLabels (timingLabels);
00642 
00643       if (humanReadable_)
00644         out_ << timerName << " per-invocation benchmark results:" << endl;
00645 
00646       const std::string labelLabel ("label,scalarType");
00647       for (std::vector< std::string >::size_type k = 0; k < timingLabels.size(); ++k)
00648         {
00649     // Only print column headers (i.e., field names) once, if at all.
00650     const bool printHeaders = (k == 0) && printFieldNames;
00651     globalTimings[k].print (out_, humanReadable_, 
00652           timingLabels[k] + "," + scalarTypeName_, 
00653           labelLabel, printHeaders);
00654         }
00655     }
00656       }
00657 
00658     private:
00671       void 
00672       reportResults (const std::string& method,
00673          const int numTrials,
00674          const ordinal_type numCols,
00675          const double localTiming,
00676          const std::string& additionalFieldNames,
00677          const std::string& additionalData,
00678          const bool printFieldNames)
00679       {
00680   using std::endl;
00681 
00682   // Find min and max timing over all MPI processes
00683   TimeStats localStats;
00684   localStats.update (localTiming);
00685   TimeStats globalStats = globalTimeStats (doubleComm_, localStats);
00686 
00687   // Only Rank 0 prints the final results.
00688   const bool printResults = (doubleComm_->rank() == 0);
00689   if (printResults)
00690     {
00691       const int numProcs = doubleComm_->size();
00692       if (humanReadable_)
00693         {
00694     out_ << method << " cumulative benchmark results (total time over all trials):" << endl
00695          << "Scalar type = " << scalarTypeName_ << endl
00696          << "Number of columns = " << numCols << endl
00697          << "Number of (MPI) processes = " << numProcs << endl
00698          << "Number of trials = " << numTrials << endl
00699          << "Min timing (in seconds) = " << globalStats.min() << endl
00700          << "Mean timing (in seconds) = " << globalStats.mean() << endl
00701          << "Max timing (in seconds) = " << globalStats.max() << endl
00702          << endl;
00703         }
00704       else
00705         {
00706     // Use scientific notation for floating-point numbers
00707     out_ << std::scientific;
00708 
00709     if (printFieldNames)
00710       {
00711         out_ << "%method,scalarType,numCols,numProcs,numTrials"
00712        << ",minTiming,meanTiming,maxTiming";
00713         if (! additionalFieldNames.empty())
00714           out_ << "," << additionalFieldNames;
00715         out_ << endl;
00716       }
00717 
00718     out_ << method
00719          << "," << scalarTypeName_
00720          << "," << numCols
00721          << "," << numProcs
00722          << "," << numTrials
00723          << "," << globalStats.min()
00724          << "," << globalStats.mean()
00725          << "," << globalStats.max();
00726     if (! additionalData.empty())
00727       out_ << "," << additionalData;
00728     out_ << endl;
00729         }
00730     }
00731       }
00732 
00733       void 
00734       testProblem (Matrix< Ordinal, Scalar >& A_local,
00735        Matrix< Ordinal, Scalar >& Q_local,
00736        Matrix< Ordinal, Scalar >& R,
00737        const Ordinal numCols)
00738       {
00739   const Ordinal numRowsLocal = numCols;
00740 
00741   // A_local: Space for the matrix A to factor -- local to each
00742   //   processor.
00743   //
00744   // A_global: Global matrix (only nonempty on Proc 0); only
00745   //   used temporarily.
00746   Matrix< Ordinal, Scalar > A_global;
00747 
00748   // This modifies A_local on all procs, and A_global on Proc 0.
00749   par_tsqr_test_problem (gen_, A_local, A_global, numCols, scalarComm_);
00750 
00751   // Copy the test problem input into R, since the factorization
00752   // will overwrite it in place with the final R factor.
00753   R.reshape (numCols, numCols);
00754   R.copy (A_local);
00755 
00756   // Prepare space in which to construct the explicit Q factor
00757   // (local component on this processor)
00758   Q_local.reshape (numRowsLocal, numCols);
00759   Q_local.fill (Scalar(0));
00760       }
00761 
00764       static void
00765       conceptChecks () 
00766       {
00767   verifyTimerConcept< timer_type >();
00768       }
00769     };
00770 
00771 
00772   } // namespace Test
00773 } // namespace TSQR
00774 
00775 #endif // __TSQR_Test_DistTest_hpp
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends