Kokkos Node API and Local Linear Algebra Kernels Version of the Day
Tsqr_TsqrTest.hpp
00001 //@HEADER
00002 // ************************************************************************
00003 // 
00004 //          Kokkos: Node API and Parallel Node Kernels
00005 //              Copyright (2009) Sandia Corporation
00006 // 
00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
00008 // license for use of this work by or on behalf of the U.S. Government.
00009 // 
00010 // This library is free software; you can redistribute it and/or modify
00011 // it under the terms of the GNU Lesser General Public License as
00012 // published by the Free Software Foundation; either version 2.1 of the
00013 // License, or (at your option) any later version.
00014 //  
00015 // This library is distributed in the hope that it will be useful, but
00016 // WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 // Lesser General Public License for more details.
00019 //  
00020 // You should have received a copy of the GNU Lesser General Public
00021 // License along with this library; if not, write to the Free Software
00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00023 // USA
00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
00025 // 
00026 // ************************************************************************
00027 //@HEADER
00028 
00029 #ifndef __TSQR_Test_TsqrTest_hpp
00030 #define __TSQR_Test_TsqrTest_hpp
00031 
00032 #include <Tsqr.hpp>
00033 #ifdef HAVE_TSQR_INTEL_TBB
00034 #  include <TbbTsqr.hpp>
00035 #endif // HAVE_TSQR_INTEL_TBB
00036 #include <Tsqr_TestSetup.hpp>
00037 #include <Tsqr_GlobalVerify.hpp>
00038 #include <Tsqr_printGlobalMatrix.hpp>
00039 #include <Tsqr_verifyTimerConcept.hpp>
00040 #include <Teuchos_ScalarTraits.hpp>
00041 
00042 #include <cstring> // size_t
00043 #include <iostream>
00044 #include <stdexcept>
00045 #include <string>
00046 
00047 
00048 namespace TSQR {
00049   namespace Test {
00050 
00051     template<class TsqrType>
00052     class TsqrVerifier {
00053     public:
00054       typedef TsqrType tsqr_type;
00055       typedef typename tsqr_type::scalar_type scalar_type;
00056       typedef typename tsqr_type::ordinal_type ordinal_type;
00057       typedef Matrix<ordinal_type, scalar_type> matrix_type;
00058       typedef typename tsqr_type::FactorOutput factor_output_type;
00059       typedef MessengerBase<scalar_type> messenger_type;
00060       typedef Teuchos::RCP<messenger_type> messenger_ptr;
00061 
00062       static void
00063       verify (tsqr_type& tsqr,
00064         const messenger_ptr& scalarComm,
00065         const matrix_type& A_local,
00066         matrix_type& A_copy,
00067         matrix_type& Q_local,
00068         matrix_type& R,
00069         const bool contiguousCacheBlocks,
00070         const bool b_debug = false)
00071       {
00072   using std::cerr;
00073   using std::endl;
00074 
00075   const ordinal_type nrows_local = A_local.nrows();
00076   const ordinal_type ncols = A_local.ncols();
00077 
00078   // If specified, rearrange cache blocks in the copy.
00079   if (contiguousCacheBlocks)
00080     {
00081       tsqr.cache_block (nrows_local, ncols, A_copy.get(), 
00082             A_local.get(), A_local.lda());
00083       if (b_debug)
00084         {
00085     scalarComm->barrier();
00086     if (scalarComm->rank() == 0)
00087       cerr << "-- Cache-blocked input matrix to factor." << endl;
00088         }
00089     }
00090   else
00091     A_copy.copy (A_local);
00092 
00093   const bool testFactorExplicit = true;
00094   if (testFactorExplicit)
00095     {
00096       tsqr.factorExplicit (A_copy.view(), Q_local.view(), R.view(), 
00097          contiguousCacheBlocks);
00098       if (b_debug)
00099         {
00100     scalarComm->barrier();
00101     if (scalarComm->rank() == 0)
00102       cerr << "-- Finished Tsqr::factorExplicit" << endl;
00103         }
00104     }
00105   else
00106     {
00107       // Factor the (copy of the) matrix.
00108       factor_output_type factorOutput = 
00109         tsqr.factor (nrows_local, ncols, A_copy.get(), A_copy.lda(), 
00110          R.get(), R.lda(), contiguousCacheBlocks);
00111       if (b_debug)
00112         {
00113     scalarComm->barrier();
00114     if (scalarComm->rank() == 0)
00115       cerr << "-- Finished Tsqr::factor" << endl;
00116         }
00117 
00118       // Compute the explicit Q factor in Q_local
00119       tsqr.explicit_Q (nrows_local, 
00120            ncols, A_copy.get(), A_copy.lda(), factorOutput, 
00121            ncols, Q_local.get(), Q_local.lda(), 
00122            contiguousCacheBlocks);
00123       if (b_debug)
00124         {
00125     scalarComm->barrier();
00126     if (scalarComm->rank() == 0)
00127       cerr << "-- Finished Tsqr::explicit_Q" << endl;
00128         }
00129     }
00130 
00131   // "Un"-cache-block the output, if contiguous cache blocks were
00132   // used.  This is only necessary because global_verify() doesn't
00133   // currently support contiguous cache blocks.
00134   if (contiguousCacheBlocks)
00135     {
00136       // We can use A_copy as scratch space for un-cache-blocking
00137       // Q_local, since we're done using A_copy for other things.
00138       tsqr.un_cache_block (nrows_local, ncols, A_copy.get(), 
00139          A_copy.lda(), Q_local.get());
00140       // Overwrite Q_local with the un-cache-blocked Q factor.
00141       Q_local.copy (A_copy);
00142 
00143       if (b_debug)
00144         {
00145     scalarComm->barrier();
00146     if (scalarComm->rank() == 0)
00147       cerr << "-- Un-cache-blocked output Q factor" << endl;
00148         }
00149     }
00150       }
00151     };
00152 
00197     template<class Ordinal, class Scalar, class Generator>
00198     void
00199     verifyTsqr (const std::string& which,
00200     const std::string& scalarTypeName,
00201     Generator& generator,
00202     const Ordinal nrows_global,
00203     const Ordinal ncols,
00204     const Teuchos::RCP< MessengerBase< Ordinal > >& ordinalComm,
00205     const Teuchos::RCP< MessengerBase< Scalar > >& scalarComm,
00206     const int num_cores = 1,
00207     const size_t cache_size_hint = 0,
00208     const bool contiguousCacheBlocks,
00209     const bool printFieldNames,
00210     const bool human_readable = false,
00211     const bool b_debug = false)
00212     {
00213       typedef typename Teuchos::ScalarTraits< Scalar >::magnitudeType magnitude_type;
00214       using std::cerr;
00215       using std::cout;
00216       using std::endl;
00217 
00218       const bool b_extra_debug = false;
00219       const int nprocs = scalarComm->size();
00220       const int my_rank = scalarComm->rank();
00221       if (b_debug)
00222   {
00223     scalarComm->barrier();
00224     if (my_rank == 0)
00225       cerr << "tsqr_verify:" << endl;
00226     scalarComm->barrier();
00227   }
00228       const Ordinal nrows_local = numLocalRows (nrows_global, my_rank, nprocs);
00229 
00230       // Set up storage for the test problem.
00231       Matrix< Ordinal, Scalar > A_local (nrows_local, ncols);
00232       Matrix< Ordinal, Scalar > Q_local (nrows_local, ncols);
00233       if (std::numeric_limits< Scalar >::has_quiet_NaN)
00234   {
00235     A_local.fill (std::numeric_limits< Scalar >::quiet_NaN());
00236     Q_local.fill (std::numeric_limits< Scalar >::quiet_NaN());
00237   }
00238       Matrix< Ordinal, Scalar > R (ncols, ncols, Scalar(0));
00239 
00240       // Generate the test problem.
00241       distributedTestProblem (generator, A_local, ordinalComm.get(), scalarComm.get());
00242       if (b_debug)
00243   {
00244     scalarComm->barrier();
00245     if (my_rank == 0)
00246       cerr << "-- Generated test problem." << endl;
00247   }
00248 
00249       // Make sure that the test problem (the matrix to factor) was
00250       // distributed correctly.
00251       if (b_extra_debug && b_debug)
00252   {
00253     if (my_rank == 0)
00254       cerr << "Test matrix A:" << endl;
00255     scalarComm->barrier ();
00256     printGlobalMatrix (cerr, A_local, scalarComm.get(), ordinalComm.get());
00257     scalarComm->barrier ();
00258   }
00259 
00260       // Factoring the matrix stored in A_local overwrites it, so we
00261       // make a copy of A_local.  Initialize with NaNs to make sure
00262       // that cache blocking works correctly (if applicable).
00263       Matrix< Ordinal, Scalar > A_copy (nrows_local, ncols);
00264       if (std::numeric_limits< Scalar >::has_quiet_NaN)
00265   A_copy.fill (std::numeric_limits< Scalar >::quiet_NaN());
00266 
00267       // actual_cache_size_hint: "cache_size_hint" is just a
00268       // suggestion.  TSQR determines the cache size hint itself;
00269       // this remembers it so we can print it out later.
00270       size_t actual_cache_size_hint;
00271 
00272       if (which == "MpiTbbTSQR")
00273   {
00274 #ifdef HAVE_TSQR_INTEL_TBB
00275     using Teuchos::RCP;
00276     typedef TSQR::TBB::TbbTsqr< Ordinal, Scalar > node_tsqr_type;
00277     typedef TSQR::DistTsqr< Ordinal, Scalar > dist_tsqr_type;
00278     typedef Tsqr< Ordinal, Scalar, node_tsqr_type, dist_tsqr_type > tsqr_type;
00279 
00280     RCP< node_tsqr_type > node_tsqr (new node_tsqr_type (num_cores, cache_size_hint));
00281     RCP< dist_tsqr_type > dist_tsqr (new dist_tsqr_type (scalarComm));
00282     tsqr_type tsqr (node_tsqr, dist_tsqr);
00283     
00284     // Compute the factorization and explicit Q factor.
00285     TsqrVerifier< tsqr_type >::verify (tsqr, scalarComm, A_local, A_copy, 
00286                Q_local, R, contiguousCacheBlocks, 
00287                b_debug);
00288     // Save the "actual" cache block size
00289     actual_cache_size_hint = tsqr.cache_size_hint();
00290 #else
00291     throw std::logic_error("TSQR not built with Intel TBB support");
00292 #endif // HAVE_TSQR_INTEL_TBB
00293   }
00294       else if (which == "MpiSeqTSQR")
00295   {
00296     using Teuchos::RCP;
00297     typedef SequentialTsqr< Ordinal, Scalar > node_tsqr_type;
00298     typedef TSQR::DistTsqr< Ordinal, Scalar > dist_tsqr_type;
00299     typedef Tsqr< Ordinal, Scalar, node_tsqr_type, dist_tsqr_type > tsqr_type;
00300 
00301     RCP< node_tsqr_type > node_tsqr (new node_tsqr_type (cache_size_hint));
00302     RCP< dist_tsqr_type > dist_tsqr (new dist_tsqr_type (scalarComm));
00303     tsqr_type tsqr (node_tsqr, dist_tsqr);
00304     
00305     // Compute the factorization and explicit Q factor.
00306     TsqrVerifier< tsqr_type >::verify (tsqr, scalarComm, A_local, A_copy, 
00307                Q_local, R, contiguousCacheBlocks, 
00308                b_debug);
00309     // Save the "actual" cache block size
00310     actual_cache_size_hint = tsqr.cache_size_hint();
00311   }
00312       else 
00313   throw std::logic_error("Unknown TSQR implementation type \"" + which + "\"");
00314 
00315       // Print out the Q and R factors
00316       if (b_extra_debug && b_debug)
00317   {
00318     if (my_rank == 0)
00319       cerr << endl << "Q factor:" << endl;
00320     scalarComm->barrier();
00321     printGlobalMatrix (cerr, Q_local, scalarComm.get(), ordinalComm.get());
00322     scalarComm->barrier ();
00323     if (my_rank == 0)
00324       {
00325         cerr << endl << "R factor:" << endl;
00326         print_local_matrix (cerr, ncols, ncols, R.get(), R.lda());
00327         cerr << endl;
00328       }
00329     scalarComm->barrier ();
00330   }
00331 
00332       // Test accuracy of the resulting factorization
00333       std::vector< magnitude_type > results = 
00334   global_verify (nrows_local, ncols, A_local.get(), A_local.lda(),
00335            Q_local.get(), Q_local.lda(), R.get(), R.lda(), 
00336            scalarComm.get());
00337       if (b_debug)
00338   {
00339     scalarComm->barrier();
00340     if (my_rank == 0)
00341       cerr << "-- Finished global_verify" << endl;
00342   }
00343 
00344       // Print the results on Proc 0.
00345       if (my_rank == 0)
00346   {
00347     if (human_readable)
00348       {
00349         std::string human_readable_name;
00350 
00351         if (which == "MpiSeqTSQR")
00352     human_readable_name = "MPI parallel / cache-blocked TSQR";
00353         else if (which == "MpiTbbTSQR")
00354     {
00355 #ifdef HAVE_TSQR_INTEL_TBB
00356       human_readable_name = "MPI parallel / TBB parallel / cache-blocked TSQR";
00357 #else
00358       throw std::logic_error("TSQR not built with Intel TBB support");
00359 #endif // HAVE_TSQR_INTEL_TBB
00360     }
00361         else 
00362     throw std::logic_error("Unknown TSQR implementation type \"" + which + "\"");
00363 
00364         cout << human_readable_name << ":" << endl
00365        << "Scalar type: " << scalarTypeName << endl
00366        << "# rows: " << nrows_global << endl
00367        << "# columns: " << ncols << endl
00368        << "# MPI processes: " << nprocs << endl;
00369 #ifdef HAVE_TSQR_INTEL_TBB
00370         if (which == "MpiTbbTSQR")
00371     cout << "# cores per process = " << num_cores << endl;
00372 #endif // HAVE_TSQR_INTEL_TBB
00373         cout << "Cache size hint in bytes: " << actual_cache_size_hint << endl
00374        << "Contiguous cache blocks? " << contiguousCacheBlocks << endl
00375        << "Absolute residual $\\| A - Q R \\|_2: "
00376        << results[0] << endl
00377        << "Absolute orthogonality $\\| I - Q^* Q \\|_2$: " 
00378        << results[1] << endl
00379        << "Test matrix norm $\\| A \\|_F$: "
00380        << results[2] << endl
00381        << endl;
00382       }
00383     else
00384       {
00385         if (printFieldNames)
00386     {
00387       cout << "%"
00388            << "method"
00389            << ",scalarType"
00390            << ",globalNumRows"
00391            << ",numCols"
00392            << ",numProcs"
00393            << ",numCores"
00394            << ",cacheSizeHint"
00395            << ",contiguousCacheBlocks"
00396            << ",absFrobResid"
00397            << ",absFrobOrthog"
00398            << ",frobA" << endl;
00399     }
00400 
00401         cout << which
00402        << "," << scalarTypeName
00403        << "," << nrows_global
00404        << "," << ncols
00405        << "," << nprocs;
00406 #ifdef HAVE_TSQR_INTEL_TBB
00407         if (which == "MpiTbbTSQR")
00408     cout << "," << num_cores;
00409         else
00410     cout << ",1";
00411 #else
00412         cout << ",1" << endl;
00413 #endif // HAVE_TSQR_INTEL_TBB
00414         cout << "," << actual_cache_size_hint
00415        << "," << contiguousCacheBlocks 
00416        << "," << results[0] 
00417        << "," << results[1]
00418        << "," << results[2]
00419        << endl;
00420       }
00421   }
00422     }
00423 
00424 
00425     template<class TsqrBase, class TimerType>
00426     double
00427     do_tsqr_benchmark (const std::string& which,
00428            TsqrBase& tsqr, 
00429            const Teuchos::RCP< MessengerBase< typename TsqrBase::scalar_type > >& messenger,
00430            const Matrix< typename TsqrBase::ordinal_type, typename TsqrBase::scalar_type >& A_local,
00431            Matrix< typename TsqrBase::ordinal_type, typename TsqrBase::scalar_type >& A_copy,
00432            Matrix< typename TsqrBase::ordinal_type, typename TsqrBase::scalar_type >& Q_local,
00433            Matrix< typename TsqrBase::ordinal_type, typename TsqrBase::scalar_type >& R,
00434            const int ntrials,
00435            const bool contiguousCacheBlocks,
00436            const bool human_readable,
00437            const bool b_debug = false)
00438     {
00439       typedef typename TsqrBase::FactorOutput factor_output_type;
00440       typedef typename TsqrBase::ordinal_type ordinal_type;
00441       using std::cerr;
00442       using std::cout;
00443       using std::endl;
00444 
00445       const ordinal_type nrows_local = A_local.nrows();
00446       const ordinal_type ncols = A_local.ncols();
00447 
00448       if (contiguousCacheBlocks)
00449   {
00450     tsqr.cache_block (nrows_local, ncols, A_copy.get(), 
00451           A_local.get(), A_local.lda());
00452     if (b_debug)
00453       {
00454         messenger->barrier();
00455         if (messenger->rank() == 0)
00456     cerr << "-- Cache-blocked input matrix to factor." << endl;
00457       }
00458   }
00459       else
00460   A_copy.copy (A_local);
00461 
00462       if (b_debug)
00463   {
00464     messenger->barrier();
00465     if (messenger->rank() == 0)
00466       cerr << "-- Starting timing loop" << endl;
00467   }
00468 
00469       // Benchmark TSQR for ntrials trials.  The answer (the numerical
00470       // results of the factorization) is only valid if ntrials == 1,
00471       // but this is a benchmark and not a verification routine.  Call
00472       // tsqr_verify() if you want to determine whether TSQR computes
00473       // the right answer.
00474       //
00475       // Name of timer doesn't matter here; we only need the timing.
00476       TSQR::Test::verifyTimerConcept< TimerType >();
00477       TimerType timer (which);
00478 
00479 
00480       const bool testFactorExplicit = true;
00481       double tsqr_timing;
00482       if (testFactorExplicit)
00483   {
00484     timer.start();
00485     for (int trial_num = 0; trial_num < ntrials; ++trial_num)
00486       tsqr.factorExplicit (A_copy.view(), Q_local.view(), R.view(), 
00487          contiguousCacheBlocks);
00488     tsqr_timing = timer.stop();
00489   }
00490       else
00491   {
00492     timer.start();
00493     for (int trial_num = 0; trial_num < ntrials; ++trial_num)
00494       {
00495         // Factor the matrix and compute the explicit Q factor.
00496         // Don't worry about the fact that we're overwriting the
00497         // input; this is a benchmark, not a numerical verification
00498         // test.  (We have the latter implemented as tsqr_verify()
00499         // in this file.)  For the same reason, don't worry about
00500         // un-cache-blocking the output (when cache blocks are
00501         // stored contiguously).
00502         factor_output_type factor_output = 
00503     tsqr.factor (nrows_local, ncols, A_copy.get(), A_copy.lda(), 
00504            R.get(), R.lda(), contiguousCacheBlocks);
00505         tsqr.explicit_Q (nrows_local, 
00506              ncols, A_copy.get(), A_copy.lda(), factor_output, 
00507              ncols, Q_local.get(), Q_local.lda(), 
00508              contiguousCacheBlocks);
00509         // Timings in debug mode likely won't make sense, because
00510         // Proc 0 is outputting the debug messages to cerr.
00511         // Nevertheless, we don't put any "if(b_debug)" calls in the
00512         // timing loop.
00513       }
00514     // Compute the resulting total time (in seconds) to execute
00515     // ntrials runs of Tsqr::factor() and Tsqr::explicit_Q().  The
00516     // time may differ on different MPI processes.
00517     tsqr_timing = timer.stop();
00518   }
00519 
00520       if (b_debug)
00521   {
00522     messenger->barrier();
00523     if (messenger->rank() == 0)
00524       cerr << "-- Finished timing loop" << endl;
00525   }
00526       return tsqr_timing;
00527     }
00528 
00582     template<class Ordinal, class Scalar, class Generator, class TimerType>
00583     void
00584     benchmarkTsqr (const std::string& which,
00585        const std::string& scalarTypeName,
00586        Generator& generator,
00587        const int ntrials,
00588        const Ordinal nrows_global,
00589        const Ordinal ncols,
00590        const Teuchos::RCP< MessengerBase< Ordinal > >& ordinalComm,
00591        const Teuchos::RCP< MessengerBase< Scalar > >& scalarComm,
00592        const Ordinal num_cores,
00593        const size_t cache_size_hint,
00594        const bool contiguousCacheBlocks,
00595        const bool printFieldNames,
00596        const bool human_readable,
00597        const bool b_debug)
00598     {
00599       using std::cerr;
00600       using std::cout;
00601       using std::endl;
00602 
00603       TSQR::Test::verifyTimerConcept< TimerType >();
00604       const bool b_extra_debug = false;
00605       const int nprocs = scalarComm->size();
00606       const int my_rank = scalarComm->rank();
00607       if (b_debug)
00608   {
00609     scalarComm->barrier();
00610     if (my_rank == 0)
00611       cerr << "tsqr_benchmark:" << endl;
00612     scalarComm->barrier();
00613   }
00614       const Ordinal nrows_local = numLocalRows (nrows_global, my_rank, nprocs);
00615 
00616       // Set up storage for the test problem.
00617       Matrix< Ordinal, Scalar > A_local (nrows_local, ncols);
00618       Matrix< Ordinal, Scalar > Q_local (nrows_local, ncols);
00619       if (std::numeric_limits< Scalar >::has_quiet_NaN)
00620   {
00621     A_local.fill (std::numeric_limits< Scalar >::quiet_NaN());
00622     Q_local.fill (std::numeric_limits< Scalar >::quiet_NaN());
00623   }
00624       Matrix< Ordinal, Scalar > R (ncols, ncols, Scalar(0));
00625 
00626       // Generate the test problem.
00627       distributedTestProblem (generator, A_local, ordinalComm.get(), scalarComm.get());
00628       if (b_debug)
00629   {
00630     scalarComm->barrier();
00631     if (my_rank == 0)
00632       cerr << "-- Generated test problem." << endl;
00633   }
00634 
00635       // Make sure that the test problem (the matrix to factor) was
00636       // distributed correctly.
00637       if (b_extra_debug && b_debug)
00638   {
00639     if (my_rank == 0)
00640       cerr << "Test matrix A:" << endl;
00641     scalarComm->barrier ();
00642     printGlobalMatrix (cerr, A_local, scalarComm.get(), ordinalComm.get());
00643     scalarComm->barrier ();
00644   }
00645 
00646       // Factoring the matrix stored in A_local overwrites it, so we
00647       // make a copy of A_local.  If specified, rearrange cache blocks
00648       // in the copy.  Initialize with NaNs to make sure that cache
00649       // blocking worked correctly.
00650       Matrix< Ordinal, Scalar > A_copy (nrows_local, ncols);
00651       if (std::numeric_limits< Scalar >::has_quiet_NaN)
00652   A_copy.fill (std::numeric_limits< Scalar >::quiet_NaN());
00653 
00654       // actual_cache_size_hint: "cache_size_hint" is just a
00655       // suggestion.  TSQR determines the cache block size itself;
00656       // this remembers it so we can print it out later.
00657       size_t actual_cache_size_hint;
00658       // Run time (in seconds, as a double-precision floating-point
00659       // value) for TSQR on this MPI node.
00660       double tsqr_timing;
00661 
00662       if (which == "MpiTbbTSQR")
00663   {
00664 #ifdef HAVE_TSQR_INTEL_TBB
00665     using Teuchos::RCP;
00666     typedef TSQR::TBB::TbbTsqr< Ordinal, Scalar > node_tsqr_type;
00667     typedef TSQR::DistTsqr< Ordinal, Scalar > dist_tsqr_type;
00668     typedef Tsqr< Ordinal, Scalar, node_tsqr_type, dist_tsqr_type > tsqr_type;
00669 
00670     RCP< node_tsqr_type > nodeTsqr (new node_tsqr_type (num_cores, cache_size_hint));
00671     RCP< dist_tsqr_type > distTsqr (new dist_tsqr_type (scalarComm));
00672     tsqr_type tsqr (nodeTsqr, distTsqr);
00673 
00674     // Run the benchmark.
00675     tsqr_timing = 
00676       do_tsqr_benchmark< tsqr_type, TimerType > (which, tsqr, scalarComm, A_local,
00677                    A_copy, Q_local, R, ntrials, 
00678                    contiguousCacheBlocks, 
00679                    human_readable, b_debug);
00680 
00681     // Save the "actual" cache block size
00682     actual_cache_size_hint = tsqr.cache_size_hint();
00683 #else
00684     throw std::logic_error("TSQR not built with Intel TBB support");
00685 #endif // HAVE_TSQR_INTEL_TBB
00686   }
00687       else if (which == "MpiSeqTSQR")
00688   {
00689     using Teuchos::RCP;
00690     typedef SequentialTsqr< Ordinal, Scalar > node_tsqr_type;
00691     typedef TSQR::DistTsqr< Ordinal, Scalar > dist_tsqr_type;
00692     typedef Tsqr< Ordinal, Scalar, node_tsqr_type, dist_tsqr_type > tsqr_type;
00693 
00694     // Set up TSQR.
00695     RCP< node_tsqr_type > nodeTsqr (new node_tsqr_type (cache_size_hint));
00696     RCP< dist_tsqr_type > distTsqr (new dist_tsqr_type (scalarComm));
00697     tsqr_type tsqr (nodeTsqr, distTsqr);
00698     
00699     // Run the benchmark.
00700     tsqr_timing = 
00701       do_tsqr_benchmark< tsqr_type, TimerType > (which, tsqr, scalarComm, A_local,
00702                    A_copy, Q_local, R, ntrials, 
00703                    contiguousCacheBlocks, 
00704                    human_readable, b_debug);
00705     // Save the "actual" cache block size
00706     actual_cache_size_hint = tsqr.cache_size_hint();
00707   }
00708       else
00709   throw std::logic_error("Unknown TSQR implementation type \"" + which + "\"");
00710 
00711       // Find the min and max TSQR timing on all processors.
00712       const double min_tsqr_timing = scalarComm->globalMin (tsqr_timing);
00713       const double max_tsqr_timing = scalarComm->globalMax (tsqr_timing);
00714 
00715       // Print the results on Proc 0.
00716       if (my_rank == 0)
00717   {
00718     if (human_readable)
00719       {
00720         std::string human_readable_name;
00721 
00722         if (which == "MpiSeqTSQR")
00723     human_readable_name = "MPI parallel / cache-blocked TSQR";
00724         else if (which == "MpiTbbTSQR")
00725     {
00726 #ifdef HAVE_TSQR_INTEL_TBB
00727       human_readable_name = "MPI parallel / TBB parallel / cache-blocked TSQR";
00728 #else
00729       throw std::logic_error("TSQR not built with Intel TBB support");
00730 #endif // HAVE_TSQR_INTEL_TBB
00731     }
00732         else 
00733     throw std::logic_error("Unknown TSQR implementation type \"" + which + "\"");
00734 
00735         cout << human_readable_name << ":" << endl
00736        << "Scalar type: " << scalarTypeName << endl
00737        << "# rows: " << nrows_global << endl
00738        << "# columns: " << ncols << endl
00739        << "# MPI processes: " << nprocs << endl;
00740 
00741 #ifdef HAVE_TSQR_INTEL_TBB
00742         if (which == "MpiTbbTSQR")
00743     cout << "# cores per process: " << num_cores << endl;
00744 #endif // HAVE_TSQR_INTEL_TBB
00745 
00746         cout << "Cache size hint in bytes: " << actual_cache_size_hint << endl
00747        << "contiguous cache blocks? " << contiguousCacheBlocks << endl
00748        << "# trials: " << ntrials << endl
00749        << "Min total time (s) over all MPI processes: " 
00750        << min_tsqr_timing << endl
00751        << "Max total time (s) over all MPI processes: " 
00752        << max_tsqr_timing << endl
00753        << endl;
00754       }
00755     else
00756       {
00757         if (printFieldNames)
00758     {
00759       cout << "%"
00760            << "method"
00761            << ",scalarType"
00762            << ",globalNumRows"
00763            << ",numCols"
00764            << ",numProcs"
00765            << ",numCores"
00766            << ",cacheSizeHint"
00767            << ",contiguousCacheBlocks"
00768            << ",numTrials"
00769            << ",minTiming"
00770            << ",maxTiming" 
00771            << endl;
00772     }
00773         cout << which
00774        << "," << scalarTypeName
00775        << "," << nrows_global
00776        << "," << ncols 
00777        << "," << nprocs;
00778 #ifdef HAVE_TSQR_INTEL_TBB
00779         if (which == "MpiTbbTSQR")
00780     cout << "," << num_cores;
00781         else 
00782     cout << ",1";
00783 #else
00784         cout << ",1";
00785 #endif // HAVE_TSQR_INTEL_TBB
00786         cout << "," << actual_cache_size_hint
00787        << "," << contiguousCacheBlocks
00788        << "," << ntrials 
00789        << "," << min_tsqr_timing 
00790        << "," << max_tsqr_timing 
00791        << endl;
00792       }
00793   }
00794     }
00795 
00796 
00797   } // namespace Test
00798 } // namespace TSQR
00799 
00800 #endif // __TSQR_Test_TsqrTest_hpp
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends