Anasazi Version of the Day
Tsqr_TsqrTest.hpp
00001 // @HEADER
00002 // ***********************************************************************
00003 //
00004 //                 Anasazi: Block Eigensolvers Package
00005 //                 Copyright (2010) Sandia Corporation
00006 //
00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
00008 // license for use of this work by or on behalf of the U.S. Government.
00009 //
00010 // This library is free software; you can redistribute it and/or modify
00011 // it under the terms of the GNU Lesser General Public License as
00012 // published by the Free Software Foundation; either version 2.1 of the
00013 // License, or (at your option) any later version.
00014 //
00015 // This library is distributed in the hope that it will be useful, but
00016 // WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 // Lesser General Public License for more details.
00019 //
00020 // You should have received a copy of the GNU Lesser General Public
00021 // License along with this library; if not, write to the Free Software
00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00023 // USA
00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
00025 //
00026 // ***********************************************************************
00027 // @HEADER
00028 
00029 #ifndef __TSQR_Test_TsqrTest_hpp
00030 #define __TSQR_Test_TsqrTest_hpp
00031 
00032 #include <Tsqr_Config.hpp>
00033 
00034 #include <Tsqr.hpp>
00035 #ifdef HAVE_TSQR_INTEL_TBB
00036 #  include <TbbTsqr.hpp>
00037 #endif // HAVE_TSQR_INTEL_TBB
00038 #include <Tsqr_TestSetup.hpp>
00039 #include <Tsqr_GlobalVerify.hpp>
00040 #include <Tsqr_printGlobalMatrix.hpp>
00041 #include <Tsqr_verifyTimerConcept.hpp>
00042 
00043 #include <cstring> // size_t
00044 #include <iostream>
00045 #include <stdexcept>
00046 #include <string>
00047 
00050 
00051 namespace TSQR {
00052   namespace Test {
00053 
00054     template< class TsqrType >
00055     class TsqrVerifier {
00056     public:
00057       typedef TsqrType tsqr_type;
00058       typedef typename tsqr_type::scalar_type scalar_type;
00059       typedef typename tsqr_type::ordinal_type ordinal_type;
00060       typedef Matrix< ordinal_type, scalar_type > matrix_type;
00061       typedef typename tsqr_type::FactorOutput factor_output_type;
00062       typedef MessengerBase< scalar_type > messenger_type;
00063       typedef Teuchos::RCP< messenger_type > messenger_ptr;
00064 
00065       static void
00066       verify (tsqr_type& tsqr,
00067         const messenger_ptr& scalarComm,
00068         const matrix_type& A_local,
00069         matrix_type& A_copy,
00070         matrix_type& Q_local,
00071         matrix_type& R,
00072         const bool contiguousCacheBlocks,
00073         const bool b_debug = false)
00074       {
00075   using std::cerr;
00076   using std::endl;
00077 
00078   const ordinal_type nrows_local = A_local.nrows();
00079   const ordinal_type ncols = A_local.ncols();
00080 
00081   // If specified, rearrange cache blocks in the copy.
00082   if (contiguousCacheBlocks)
00083     {
00084       tsqr.cache_block (nrows_local, ncols, A_copy.get(), 
00085             A_local.get(), A_local.lda());
00086       if (b_debug)
00087         {
00088     scalarComm->barrier();
00089     if (scalarComm->rank() == 0)
00090       cerr << "-- Cache-blocked input matrix to factor." << endl;
00091         }
00092     }
00093   else
00094     A_copy.copy (A_local);
00095 
00096   const bool testFactorExplicit = true;
00097   if (testFactorExplicit)
00098     {
00099       tsqr.factorExplicit (A_copy.view(), Q_local.view(), R.view(), 
00100          contiguousCacheBlocks);
00101       if (b_debug)
00102         {
00103     scalarComm->barrier();
00104     if (scalarComm->rank() == 0)
00105       cerr << "-- Finished Tsqr::factorExplicit" << endl;
00106         }
00107     }
00108   else
00109     {
00110       // Factor the (copy of the) matrix.
00111       factor_output_type factorOutput = 
00112         tsqr.factor (nrows_local, ncols, A_copy.get(), A_copy.lda(), 
00113          R.get(), R.lda(), contiguousCacheBlocks);
00114       if (b_debug)
00115         {
00116     scalarComm->barrier();
00117     if (scalarComm->rank() == 0)
00118       cerr << "-- Finished Tsqr::factor" << endl;
00119         }
00120 
00121       // Compute the explicit Q factor in Q_local
00122       tsqr.explicit_Q (nrows_local, 
00123            ncols, A_copy.get(), A_copy.lda(), factorOutput, 
00124            ncols, Q_local.get(), Q_local.lda(), 
00125            contiguousCacheBlocks);
00126       if (b_debug)
00127         {
00128     scalarComm->barrier();
00129     if (scalarComm->rank() == 0)
00130       cerr << "-- Finished Tsqr::explicit_Q" << endl;
00131         }
00132     }
00133 
00134   // "Un"-cache-block the output, if contiguous cache blocks were
00135   // used.  This is only necessary because global_verify() doesn't
00136   // currently support contiguous cache blocks.
00137   if (contiguousCacheBlocks)
00138     {
00139       // We can use A_copy as scratch space for un-cache-blocking
00140       // Q_local, since we're done using A_copy for other things.
00141       tsqr.un_cache_block (nrows_local, ncols, A_copy.get(), 
00142          A_copy.lda(), Q_local.get());
00143       // Overwrite Q_local with the un-cache-blocked Q factor.
00144       Q_local.copy (A_copy);
00145 
00146       if (b_debug)
00147         {
00148     scalarComm->barrier();
00149     if (scalarComm->rank() == 0)
00150       cerr << "-- Un-cache-blocked output Q factor" << endl;
00151         }
00152     }
00153       }
00154     };
00155 
00195     template< class Ordinal, class Scalar, class Generator >
00196     void
00197     verifyTsqr (const std::string& which,
00198     Generator& generator,
00199     const Ordinal nrows_global,
00200     const Ordinal ncols,
00201     const Teuchos::RCP< MessengerBase< Ordinal > >& ordinalComm,
00202     const Teuchos::RCP< MessengerBase< Scalar > >& scalarComm,
00203     const int num_cores = 1,
00204     const size_t cache_block_size = 0,
00205     const bool contiguousCacheBlocks = false,
00206     const bool human_readable = false,
00207     const bool b_debug = false)
00208     {
00209       typedef typename ScalarTraits< Scalar >::magnitude_type magnitude_type;
00210       using std::cerr;
00211       using std::cout;
00212       using std::endl;
00213 
00214       const bool b_extra_debug = false;
00215       const int nprocs = scalarComm->size();
00216       const int my_rank = scalarComm->rank();
00217       if (b_debug)
00218   {
00219     scalarComm->barrier();
00220     if (my_rank == 0)
00221       cerr << "tsqr_verify:" << endl;
00222     scalarComm->barrier();
00223   }
00224       const Ordinal nrows_local = numLocalRows (nrows_global, my_rank, nprocs);
00225 
00226       // Set up storage for the test problem.
00227       Matrix< Ordinal, Scalar > A_local (nrows_local, ncols);
00228       Matrix< Ordinal, Scalar > Q_local (nrows_local, ncols);
00229       if (std::numeric_limits< Scalar >::has_quiet_NaN)
00230   {
00231     A_local.fill (std::numeric_limits< Scalar >::quiet_NaN());
00232     Q_local.fill (std::numeric_limits< Scalar >::quiet_NaN());
00233   }
00234       Matrix< Ordinal, Scalar > R (ncols, ncols, Scalar(0));
00235 
00236       // Generate the test problem.
00237       distributedTestProblem (generator, A_local, ordinalComm.get(), scalarComm.get());
00238       if (b_debug)
00239   {
00240     scalarComm->barrier();
00241     if (my_rank == 0)
00242       cerr << "-- Generated test problem." << endl;
00243   }
00244 
00245       // Make sure that the test problem (the matrix to factor) was
00246       // distributed correctly.
00247       if (b_extra_debug && b_debug)
00248   {
00249     if (my_rank == 0)
00250       cerr << "Test matrix A:" << endl;
00251     scalarComm->barrier ();
00252     printGlobalMatrix (cerr, A_local, scalarComm.get(), ordinalComm.get());
00253     scalarComm->barrier ();
00254   }
00255 
00256       // Factoring the matrix stored in A_local overwrites it, so we
00257       // make a copy of A_local.  Initialize with NaNs to make sure
00258       // that cache blocking works correctly (if applicable).
00259       Matrix< Ordinal, Scalar > A_copy (nrows_local, ncols);
00260       if (std::numeric_limits< Scalar >::has_quiet_NaN)
00261   A_copy.fill (std::numeric_limits< Scalar >::quiet_NaN());
00262 
00263       // actual_cache_block_size: "cache_block_size" is just a
00264       // suggestion.  TSQR determines the cache block size itself;
00265       // this remembers it so we can print it out later.
00266       size_t actual_cache_block_size;
00267 
00268       if (which == "MpiTbbTSQR")
00269   {
00270 #ifdef HAVE_TSQR_INTEL_TBB
00271     using Teuchos::RCP;
00272     typedef TSQR::TBB::TbbTsqr< Ordinal, Scalar > node_tsqr_type;
00273     typedef TSQR::DistTsqr< Ordinal, Scalar > dist_tsqr_type;
00274     typedef Tsqr< Ordinal, Scalar, node_tsqr_type, dist_tsqr_type > tsqr_type;
00275 
00276     RCP< node_tsqr_type > node_tsqr (new node_tsqr_type (num_cores, cache_block_size));
00277     RCP< dist_tsqr_type > dist_tsqr (new dist_tsqr_type (scalarComm));
00278     tsqr_type tsqr (node_tsqr, dist_tsqr);
00279     
00280     // Compute the factorization and explicit Q factor.
00281     TsqrVerifier< tsqr_type >::verify (tsqr, scalarComm, A_local, A_copy, 
00282                Q_local, R, contiguousCacheBlocks, 
00283                b_debug);
00284     // Save the "actual" cache block size
00285     actual_cache_block_size = tsqr.cache_block_size();
00286 #else
00287     throw std::logic_error("TSQR not built with Intel TBB support");
00288 #endif // HAVE_TSQR_INTEL_TBB
00289   }
00290       else if (which == "MpiSeqTSQR")
00291   {
00292     using Teuchos::RCP;
00293     typedef SequentialTsqr< Ordinal, Scalar > node_tsqr_type;
00294     typedef TSQR::DistTsqr< Ordinal, Scalar > dist_tsqr_type;
00295     typedef Tsqr< Ordinal, Scalar, node_tsqr_type, dist_tsqr_type > tsqr_type;
00296 
00297     RCP< node_tsqr_type > node_tsqr (new node_tsqr_type (cache_block_size));
00298     RCP< dist_tsqr_type > dist_tsqr (new dist_tsqr_type (scalarComm));
00299     tsqr_type tsqr (node_tsqr, dist_tsqr);
00300     
00301     // Compute the factorization and explicit Q factor.
00302     TsqrVerifier< tsqr_type >::verify (tsqr, scalarComm, A_local, A_copy, 
00303                Q_local, R, contiguousCacheBlocks, 
00304                b_debug);
00305     // Save the "actual" cache block size
00306     actual_cache_block_size = tsqr.cache_block_size();
00307   }
00308       else 
00309   throw std::logic_error("Unknown TSQR implementation type \"" + which + "\"");
00310 
00311       // Print out the Q and R factors
00312       if (b_extra_debug && b_debug)
00313   {
00314     if (my_rank == 0)
00315       cerr << endl << "Q factor:" << endl;
00316     scalarComm->barrier();
00317     printGlobalMatrix (cerr, Q_local, scalarComm.get(), ordinalComm.get());
00318     scalarComm->barrier ();
00319     if (my_rank == 0)
00320       {
00321         cerr << endl << "R factor:" << endl;
00322         print_local_matrix (cerr, ncols, ncols, R.get(), R.lda());
00323         cerr << endl;
00324       }
00325     scalarComm->barrier ();
00326   }
00327 
00328       // Test accuracy of the resulting factorization
00329       std::pair< magnitude_type, magnitude_type > result = 
00330   global_verify (nrows_local, ncols, A_local.get(), A_local.lda(),
00331            Q_local.get(), Q_local.lda(), R.get(), R.lda(), 
00332            scalarComm.get());
00333       if (b_debug)
00334   {
00335     scalarComm->barrier();
00336     if (my_rank == 0)
00337       cerr << "-- Finished global_verify" << endl;
00338   }
00339 
00340       // Print the results on Proc 0.
00341       if (my_rank == 0)
00342   {
00343     if (human_readable)
00344       {
00345         std::string human_readable_name;
00346 
00347         if (which == "MpiSeqTSQR")
00348     human_readable_name = "MPI parallel / cache-blocked TSQR";
00349         else if (which == "MpiTbbTSQR")
00350     {
00351 #ifdef HAVE_TSQR_INTEL_TBB
00352       human_readable_name = "MPI parallel / TBB parallel / cache-blocked TSQR";
00353 #else
00354       throw std::logic_error("TSQR not built with Intel TBB support");
00355 #endif // HAVE_TSQR_INTEL_TBB
00356     }
00357         else 
00358     throw std::logic_error("Unknown TSQR implementation type \"" + which + "\"");
00359 
00360         cout << human_readable_name << ":" << endl
00361        << "# rows = " << nrows_global << endl
00362        << "# columns = " << ncols << endl
00363        << "# MPI processes = " << nprocs << endl;
00364 #ifdef HAVE_TSQR_INTEL_TBB
00365         if (which == "MpiTbbTSQR")
00366     cout << "# cores per process = " << num_cores << endl;
00367 #endif // HAVE_TSQR_INTEL_TBB
00368         cout << "cache block # bytes = " << actual_cache_block_size << endl
00369        << "contiguous cache blocks? " << contiguousCacheBlocks << endl
00370        << "Relative residual $\\|A - Q*R\\|_2 / \\|A\\|_2$ = " 
00371        << result.first << endl
00372        << "Relative orthogonality $\\|I - Q^T*Q\\|_2$ = " 
00373        << result.second << endl
00374        << endl;
00375       }
00376     else
00377       {
00378         cout << which
00379        << "," << nrows_global
00380        << "," << ncols
00381        << "," << nprocs;
00382 #ifdef HAVE_TSQR_INTEL_TBB
00383         if (which == "MpiTbbTSQR")
00384     cout << "," << num_cores << endl;
00385 #endif // HAVE_TSQR_INTEL_TBB
00386         cout << "," << actual_cache_block_size
00387        << "," << contiguousCacheBlocks 
00388        << "," << result.first 
00389        << "," << result.second
00390        << endl;
00391       }
00392   }
00393     }
00394 
00395 
00396     template< class TsqrBase, class TimerType >
00397     double
00398     do_tsqr_benchmark (const std::string& which,
00399            TsqrBase& tsqr, 
00400            const Teuchos::RCP< MessengerBase< typename TsqrBase::scalar_type > >& messenger,
00401            const Matrix< typename TsqrBase::ordinal_type, typename TsqrBase::scalar_type >& A_local,
00402            Matrix< typename TsqrBase::ordinal_type, typename TsqrBase::scalar_type >& A_copy,
00403            Matrix< typename TsqrBase::ordinal_type, typename TsqrBase::scalar_type >& Q_local,
00404            Matrix< typename TsqrBase::ordinal_type, typename TsqrBase::scalar_type >& R,
00405            const int ntrials,
00406            const bool contiguousCacheBlocks,
00407            const bool human_readable,
00408            const bool b_debug = false)
00409     {
00410       typedef typename TsqrBase::FactorOutput factor_output_type;
00411       typedef typename TsqrBase::ordinal_type ordinal_type;
00412       using std::cerr;
00413       using std::cout;
00414       using std::endl;
00415 
00416       const ordinal_type nrows_local = A_local.nrows();
00417       const ordinal_type ncols = A_local.ncols();
00418 
00419       if (contiguousCacheBlocks)
00420   {
00421     tsqr.cache_block (nrows_local, ncols, A_copy.get(), 
00422           A_local.get(), A_local.lda());
00423     if (b_debug)
00424       {
00425         messenger->barrier();
00426         if (messenger->rank() == 0)
00427     cerr << "-- Cache-blocked input matrix to factor." << endl;
00428       }
00429   }
00430       else
00431   A_copy.copy (A_local);
00432 
00433       if (b_debug)
00434   {
00435     messenger->barrier();
00436     if (messenger->rank() == 0)
00437       cerr << "-- Starting timing loop" << endl;
00438   }
00439 
00440       // Benchmark TSQR for ntrials trials.  The answer (the numerical
00441       // results of the factorization) is only valid if ntrials == 1,
00442       // but this is a benchmark and not a verification routine.  Call
00443       // tsqr_verify() if you want to determine whether TSQR computes
00444       // the right answer.
00445       //
00446       // Name of timer doesn't matter here; we only need the timing.
00447       TSQR::Test::verifyTimerConcept< TimerType >();
00448       TimerType timer (which);
00449 
00450 
00451       const bool testFactorExplicit = true;
00452       double tsqr_timing;
00453       if (testFactorExplicit)
00454   {
00455     timer.start();
00456     for (int trial_num = 0; trial_num < ntrials; ++trial_num)
00457       tsqr.factorExplicit (A_copy.view(), Q_local.view(), R.view(), 
00458          contiguousCacheBlocks);
00459     tsqr_timing = timer.stop();
00460   }
00461       else
00462   {
00463     timer.start();
00464     for (int trial_num = 0; trial_num < ntrials; ++trial_num)
00465       {
00466         // Factor the matrix and compute the explicit Q factor.
00467         // Don't worry about the fact that we're overwriting the
00468         // input; this is a benchmark, not a numerical verification
00469         // test.  (We have the latter implemented as tsqr_verify()
00470         // in this file.)  For the same reason, don't worry about
00471         // un-cache-blocking the output (when cache blocks are
00472         // stored contiguously).
00473         factor_output_type factor_output = 
00474     tsqr.factor (nrows_local, ncols, A_copy.get(), A_copy.lda(), 
00475            R.get(), R.lda(), contiguousCacheBlocks);
00476         tsqr.explicit_Q (nrows_local, 
00477              ncols, A_copy.get(), A_copy.lda(), factor_output, 
00478              ncols, Q_local.get(), Q_local.lda(), 
00479              contiguousCacheBlocks);
00480         // Timings in debug mode likely won't make sense, because
00481         // Proc 0 is outputting the debug messages to cerr.
00482         // Nevertheless, we don't put any "if(b_debug)" calls in the
00483         // timing loop.
00484       }
00485     // Compute the resulting total time (in seconds) to execute
00486     // ntrials runs of Tsqr::factor() and Tsqr::explicit_Q().  The
00487     // time may differ on different MPI processes.
00488     tsqr_timing = timer.stop();
00489   }
00490 
00491       if (b_debug)
00492   {
00493     messenger->barrier();
00494     if (messenger->rank() == 0)
00495       cerr << "-- Finished timing loop" << endl;
00496   }
00497       return tsqr_timing;
00498     }
00499 
00548     template< class Ordinal, class Scalar, class Generator, class TimerType >
00549     void
00550     benchmarkTsqr (const std::string& which,
00551        Generator& generator,
00552        const int ntrials,
00553        const Ordinal nrows_global,
00554        const Ordinal ncols,
00555        const Teuchos::RCP< MessengerBase< Ordinal > >& ordinalComm,
00556        const Teuchos::RCP< MessengerBase< Scalar > >& scalarComm,
00557        const Ordinal num_cores,
00558        const size_t cache_block_size,
00559        const bool contiguousCacheBlocks,
00560        const bool human_readable,
00561        const bool b_debug)
00562     {
00563       using std::cerr;
00564       using std::cout;
00565       using std::endl;
00566 
00567       TSQR::Test::verifyTimerConcept< TimerType >();
00568       const bool b_extra_debug = false;
00569       const int nprocs = scalarComm->size();
00570       const int my_rank = scalarComm->rank();
00571       if (b_debug)
00572   {
00573     scalarComm->barrier();
00574     if (my_rank == 0)
00575       cerr << "tsqr_benchmark:" << endl;
00576     scalarComm->barrier();
00577   }
00578       const Ordinal nrows_local = numLocalRows (nrows_global, my_rank, nprocs);
00579 
00580       // Set up storage for the test problem.
00581       Matrix< Ordinal, Scalar > A_local (nrows_local, ncols);
00582       Matrix< Ordinal, Scalar > Q_local (nrows_local, ncols);
00583       if (std::numeric_limits< Scalar >::has_quiet_NaN)
00584   {
00585     A_local.fill (std::numeric_limits< Scalar >::quiet_NaN());
00586     Q_local.fill (std::numeric_limits< Scalar >::quiet_NaN());
00587   }
00588       Matrix< Ordinal, Scalar > R (ncols, ncols, Scalar(0));
00589 
00590       // Generate the test problem.
00591       distributedTestProblem (generator, A_local, ordinalComm.get(), scalarComm.get());
00592       if (b_debug)
00593   {
00594     scalarComm->barrier();
00595     if (my_rank == 0)
00596       cerr << "-- Generated test problem." << endl;
00597   }
00598 
00599       // Make sure that the test problem (the matrix to factor) was
00600       // distributed correctly.
00601       if (b_extra_debug && b_debug)
00602   {
00603     if (my_rank == 0)
00604       cerr << "Test matrix A:" << endl;
00605     scalarComm->barrier ();
00606     printGlobalMatrix (cerr, A_local, scalarComm.get(), ordinalComm.get());
00607     scalarComm->barrier ();
00608   }
00609 
00610       // Factoring the matrix stored in A_local overwrites it, so we
00611       // make a copy of A_local.  If specified, rearrange cache blocks
00612       // in the copy.  Initialize with NaNs to make sure that cache
00613       // blocking worked correctly.
00614       Matrix< Ordinal, Scalar > A_copy (nrows_local, ncols);
00615       if (std::numeric_limits< Scalar >::has_quiet_NaN)
00616   A_copy.fill (std::numeric_limits< Scalar >::quiet_NaN());
00617 
00618       // actual_cache_block_size: "cache_block_size" is just a
00619       // suggestion.  TSQR determines the cache block size itself;
00620       // this remembers it so we can print it out later.
00621       size_t actual_cache_block_size;
00622       // Run time (in seconds, as a double-precision floating-point
00623       // value) for TSQR on this MPI node.
00624       double tsqr_timing;
00625 
00626       if (which == "MpiTbbTSQR")
00627   {
00628 #ifdef HAVE_TSQR_INTEL_TBB
00629     using Teuchos::RCP;
00630     typedef TSQR::TBB::TbbTsqr< Ordinal, Scalar > node_tsqr_type;
00631     typedef TSQR::DistTsqr< Ordinal, Scalar > dist_tsqr_type;
00632     typedef Tsqr< Ordinal, Scalar, node_tsqr_type, dist_tsqr_type > tsqr_type;
00633 
00634     RCP< node_tsqr_type > nodeTsqr (new node_tsqr_type (num_cores, cache_block_size));
00635     RCP< dist_tsqr_type > distTsqr (new dist_tsqr_type (scalarComm));
00636     tsqr_type tsqr (nodeTsqr, distTsqr);
00637 
00638     // Run the benchmark.
00639     tsqr_timing = 
00640       do_tsqr_benchmark< tsqr_type, TimerType > (which, tsqr, scalarComm, A_local,
00641                    A_copy, Q_local, R, ntrials, 
00642                    contiguousCacheBlocks, 
00643                    human_readable, b_debug);
00644 
00645     // Save the "actual" cache block size
00646     actual_cache_block_size = tsqr.cache_block_size();
00647 #else
00648     throw std::logic_error("TSQR not built with Intel TBB support");
00649 #endif // HAVE_TSQR_INTEL_TBB
00650   }
00651       else if (which == "MpiSeqTSQR")
00652   {
00653     using Teuchos::RCP;
00654     typedef SequentialTsqr< Ordinal, Scalar > node_tsqr_type;
00655     typedef TSQR::DistTsqr< Ordinal, Scalar > dist_tsqr_type;
00656     typedef Tsqr< Ordinal, Scalar, node_tsqr_type, dist_tsqr_type > tsqr_type;
00657 
00658     // Set up TSQR.
00659     RCP< node_tsqr_type > nodeTsqr (new node_tsqr_type (cache_block_size));
00660     RCP< dist_tsqr_type > distTsqr (new dist_tsqr_type (scalarComm));
00661     tsqr_type tsqr (nodeTsqr, distTsqr);
00662     
00663     // Run the benchmark.
00664     tsqr_timing = 
00665       do_tsqr_benchmark< tsqr_type, TimerType > (which, tsqr, scalarComm, A_local,
00666                    A_copy, Q_local, R, ntrials, 
00667                    contiguousCacheBlocks, 
00668                    human_readable, b_debug);
00669     // Save the "actual" cache block size
00670     actual_cache_block_size = tsqr.cache_block_size();
00671   }
00672       else
00673   throw std::logic_error("Unknown TSQR implementation type \"" + which + "\"");
00674 
00675       // Find the min and max TSQR timing on all processors.
00676       const double min_tsqr_timing = scalarComm->globalMin (tsqr_timing);
00677       const double max_tsqr_timing = scalarComm->globalMax (tsqr_timing);
00678 
00679       // Print the results on Proc 0.
00680       if (my_rank == 0)
00681   {
00682     if (human_readable)
00683       {
00684         std::string human_readable_name;
00685 
00686         if (which == "MpiSeqTSQR")
00687     human_readable_name = "MPI parallel / cache-blocked TSQR";
00688         else if (which == "MpiTbbTSQR")
00689     {
00690 #ifdef HAVE_TSQR_INTEL_TBB
00691       human_readable_name = "MPI parallel / TBB parallel / cache-blocked TSQR";
00692 #else
00693       throw std::logic_error("TSQR not built with Intel TBB support");
00694 #endif // HAVE_TSQR_INTEL_TBB
00695     }
00696         else 
00697     throw std::logic_error("Unknown TSQR implementation type \"" + which + "\"");
00698 
00699         cout << human_readable_name << ":" << endl
00700        << "# rows = " << nrows_global << endl
00701        << "# columns = " << ncols << endl
00702        << "# MPI processes = " << nprocs << endl;
00703 
00704 #ifdef HAVE_TSQR_INTEL_TBB
00705         if (which == "MpiTbbTSQR")
00706     cout << "# cores per process = " << num_cores << endl;
00707 #endif // HAVE_TSQR_INTEL_TBB
00708 
00709         cout << "cache block # bytes = " << actual_cache_block_size << endl
00710        << "contiguous cache blocks? " << contiguousCacheBlocks << endl
00711        << "# trials = " << ntrials << endl
00712        << "Min total time (s) over all MPI processes = " 
00713        << min_tsqr_timing << endl
00714        << "Max total time (s) over all MPI processes = " 
00715        << max_tsqr_timing << endl
00716        << endl;
00717       }
00718     else
00719       {
00720         cout << which
00721        << "," << nrows_global
00722        << "," << ncols 
00723        << "," << nprocs ;
00724 #ifdef HAVE_TSQR_INTEL_TBB
00725         if (which == "MpiTbbTSQR")
00726     cout << "," << num_cores << endl;
00727 #endif // HAVE_TSQR_INTEL_TBB
00728         cout << "," << actual_cache_block_size
00729        << "," << contiguousCacheBlocks
00730        << "," << ntrials 
00731        << "," << min_tsqr_timing 
00732        << "," << max_tsqr_timing 
00733        << endl;
00734       }
00735   }
00736     }
00737 
00738 
00739   } // namespace Test
00740 } // namespace TSQR
00741 
00742 #endif // __TSQR_Test_TsqrTest_hpp
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends