Kokkos Node API and Local Linear Algebra Kernels Version of the Day
Tsqr_TsqrTest.hpp
00001 //@HEADER
00002 // ************************************************************************
00003 // 
00004 //          Kokkos: Node API and Parallel Node Kernels
00005 //              Copyright (2008) Sandia Corporation
00006 // 
00007 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
00008 // the U.S. Government retains certain rights in this software.
00009 // 
00010 // Redistribution and use in source and binary forms, with or without
00011 // modification, are permitted provided that the following conditions are
00012 // met:
00013 //
00014 // 1. Redistributions of source code must retain the above copyright
00015 // notice, this list of conditions and the following disclaimer.
00016 //
00017 // 2. Redistributions in binary form must reproduce the above copyright
00018 // notice, this list of conditions and the following disclaimer in the
00019 // documentation and/or other materials provided with the distribution.
00020 //
00021 // 3. Neither the name of the Corporation nor the names of the
00022 // contributors may be used to endorse or promote products derived from
00023 // this software without specific prior written permission.
00024 //
00025 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
00026 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00027 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00028 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
00029 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00030 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00031 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00032 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00033 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00034 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00035 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00036 //
00037 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
00038 // 
00039 // ************************************************************************
00040 //@HEADER
00041 
00042 #ifndef __TSQR_Test_TsqrTest_hpp
00043 #define __TSQR_Test_TsqrTest_hpp
00044 
00045 #include <Tsqr.hpp>
00046 #ifdef HAVE_TSQR_INTEL_TBB
00047 #  include <TbbTsqr.hpp>
00048 #endif // HAVE_TSQR_INTEL_TBB
00049 #include <Tsqr_TestSetup.hpp>
00050 #include <Tsqr_GlobalVerify.hpp>
00051 #include <Tsqr_printGlobalMatrix.hpp>
00052 #include <Tsqr_verifyTimerConcept.hpp>
00053 #include <Teuchos_ScalarTraits.hpp>
00054 
00055 #include <cstring> // size_t
00056 #include <iostream>
00057 #include <stdexcept>
00058 #include <string>
00059 
00060 
00061 namespace TSQR {
00062   namespace Test {
00063 
00064     template<class TsqrType>
00065     class TsqrVerifier {
00066     public:
00067       typedef TsqrType tsqr_type;
00068       typedef typename tsqr_type::scalar_type scalar_type;
00069       typedef typename tsqr_type::ordinal_type ordinal_type;
00070       typedef Matrix<ordinal_type, scalar_type> matrix_type;
00071       typedef typename tsqr_type::FactorOutput factor_output_type;
00072       typedef MessengerBase<scalar_type> messenger_type;
00073       typedef Teuchos::RCP<messenger_type> messenger_ptr;
00074 
00075       static void
00076       verify (tsqr_type& tsqr,
00077         const messenger_ptr& scalarComm,
00078         const matrix_type& A_local,
00079         matrix_type& A_copy,
00080         matrix_type& Q_local,
00081         matrix_type& R,
00082         const bool contiguousCacheBlocks,
00083         const bool b_debug = false)
00084       {
00085   using std::cerr;
00086   using std::endl;
00087 
00088   const ordinal_type nrows_local = A_local.nrows();
00089   const ordinal_type ncols = A_local.ncols();
00090 
00091   // If specified, rearrange cache blocks in the copy.
00092   if (contiguousCacheBlocks)
00093     {
00094       tsqr.cache_block (nrows_local, ncols, A_copy.get(), 
00095             A_local.get(), A_local.lda());
00096       if (b_debug)
00097         {
00098     scalarComm->barrier();
00099     if (scalarComm->rank() == 0)
00100       cerr << "-- Cache-blocked input matrix to factor." << endl;
00101         }
00102     }
00103   else
00104     A_copy.copy (A_local);
00105 
00106   const bool testFactorExplicit = true;
00107   if (testFactorExplicit)
00108     {
00109       tsqr.factorExplicit (A_copy.view(), Q_local.view(), R.view(), 
00110          contiguousCacheBlocks);
00111       if (b_debug)
00112         {
00113     scalarComm->barrier();
00114     if (scalarComm->rank() == 0)
00115       cerr << "-- Finished Tsqr::factorExplicit" << endl;
00116         }
00117     }
00118   else
00119     {
00120       // Factor the (copy of the) matrix.
00121       factor_output_type factorOutput = 
00122         tsqr.factor (nrows_local, ncols, A_copy.get(), A_copy.lda(), 
00123          R.get(), R.lda(), contiguousCacheBlocks);
00124       if (b_debug)
00125         {
00126     scalarComm->barrier();
00127     if (scalarComm->rank() == 0)
00128       cerr << "-- Finished Tsqr::factor" << endl;
00129         }
00130 
00131       // Compute the explicit Q factor in Q_local
00132       tsqr.explicit_Q (nrows_local, 
00133            ncols, A_copy.get(), A_copy.lda(), factorOutput, 
00134            ncols, Q_local.get(), Q_local.lda(), 
00135            contiguousCacheBlocks);
00136       if (b_debug)
00137         {
00138     scalarComm->barrier();
00139     if (scalarComm->rank() == 0)
00140       cerr << "-- Finished Tsqr::explicit_Q" << endl;
00141         }
00142     }
00143 
00144   // "Un"-cache-block the output, if contiguous cache blocks were
00145   // used.  This is only necessary because global_verify() doesn't
00146   // currently support contiguous cache blocks.
00147   if (contiguousCacheBlocks)
00148     {
00149       // We can use A_copy as scratch space for un-cache-blocking
00150       // Q_local, since we're done using A_copy for other things.
00151       tsqr.un_cache_block (nrows_local, ncols, A_copy.get(), 
00152          A_copy.lda(), Q_local.get());
00153       // Overwrite Q_local with the un-cache-blocked Q factor.
00154       Q_local.copy (A_copy);
00155 
00156       if (b_debug)
00157         {
00158     scalarComm->barrier();
00159     if (scalarComm->rank() == 0)
00160       cerr << "-- Un-cache-blocked output Q factor" << endl;
00161         }
00162     }
00163       }
00164     };
00165 
00210     template<class Ordinal, class Scalar, class Generator>
00211     void
00212     verifyTsqr (const std::string& which,
00213     const std::string& scalarTypeName,
00214     Generator& generator,
00215     const Ordinal nrows_global,
00216     const Ordinal ncols,
00217     const Teuchos::RCP< MessengerBase< Ordinal > >& ordinalComm,
00218     const Teuchos::RCP< MessengerBase< Scalar > >& scalarComm,
00219     const int num_cores = 1,
00220     const size_t cache_size_hint = 0,
00221     const bool contiguousCacheBlocks,
00222     const bool printFieldNames,
00223     const bool human_readable = false,
00224     const bool b_debug = false)
00225     {
00226       typedef typename Teuchos::ScalarTraits< Scalar >::magnitudeType magnitude_type;
00227       using std::cerr;
00228       using std::cout;
00229       using std::endl;
00230 
00231       const bool b_extra_debug = false;
00232       const int nprocs = scalarComm->size();
00233       const int my_rank = scalarComm->rank();
00234       if (b_debug)
00235   {
00236     scalarComm->barrier();
00237     if (my_rank == 0)
00238       cerr << "tsqr_verify:" << endl;
00239     scalarComm->barrier();
00240   }
00241       const Ordinal nrows_local = numLocalRows (nrows_global, my_rank, nprocs);
00242 
00243       // Set up storage for the test problem.
00244       Matrix< Ordinal, Scalar > A_local (nrows_local, ncols);
00245       Matrix< Ordinal, Scalar > Q_local (nrows_local, ncols);
00246       if (std::numeric_limits< Scalar >::has_quiet_NaN)
00247   {
00248     A_local.fill (std::numeric_limits< Scalar >::quiet_NaN());
00249     Q_local.fill (std::numeric_limits< Scalar >::quiet_NaN());
00250   }
00251       Matrix< Ordinal, Scalar > R (ncols, ncols, Scalar(0));
00252 
00253       // Generate the test problem.
00254       distributedTestProblem (generator, A_local, ordinalComm.get(), scalarComm.get());
00255       if (b_debug)
00256   {
00257     scalarComm->barrier();
00258     if (my_rank == 0)
00259       cerr << "-- Generated test problem." << endl;
00260   }
00261 
00262       // Make sure that the test problem (the matrix to factor) was
00263       // distributed correctly.
00264       if (b_extra_debug && b_debug)
00265   {
00266     if (my_rank == 0)
00267       cerr << "Test matrix A:" << endl;
00268     scalarComm->barrier ();
00269     printGlobalMatrix (cerr, A_local, scalarComm.get(), ordinalComm.get());
00270     scalarComm->barrier ();
00271   }
00272 
00273       // Factoring the matrix stored in A_local overwrites it, so we
00274       // make a copy of A_local.  Initialize with NaNs to make sure
00275       // that cache blocking works correctly (if applicable).
00276       Matrix< Ordinal, Scalar > A_copy (nrows_local, ncols);
00277       if (std::numeric_limits< Scalar >::has_quiet_NaN)
00278   A_copy.fill (std::numeric_limits< Scalar >::quiet_NaN());
00279 
00280       // actual_cache_size_hint: "cache_size_hint" is just a
00281       // suggestion.  TSQR determines the cache size hint itself;
00282       // this remembers it so we can print it out later.
00283       size_t actual_cache_size_hint;
00284 
00285       if (which == "MpiTbbTSQR")
00286   {
00287 #ifdef HAVE_TSQR_INTEL_TBB
00288     using Teuchos::RCP;
00289     typedef TSQR::TBB::TbbTsqr< Ordinal, Scalar > node_tsqr_type;
00290     typedef TSQR::DistTsqr< Ordinal, Scalar > dist_tsqr_type;
00291     typedef Tsqr< Ordinal, Scalar, node_tsqr_type, dist_tsqr_type > tsqr_type;
00292 
00293     RCP< node_tsqr_type > node_tsqr (new node_tsqr_type (num_cores, cache_size_hint));
00294     RCP< dist_tsqr_type > dist_tsqr (new dist_tsqr_type (scalarComm));
00295     tsqr_type tsqr (node_tsqr, dist_tsqr);
00296     
00297     // Compute the factorization and explicit Q factor.
00298     TsqrVerifier< tsqr_type >::verify (tsqr, scalarComm, A_local, A_copy, 
00299                Q_local, R, contiguousCacheBlocks, 
00300                b_debug);
00301     // Save the "actual" cache block size
00302     actual_cache_size_hint = tsqr.cache_size_hint();
00303 #else
00304     throw std::logic_error("TSQR not built with Intel TBB support");
00305 #endif // HAVE_TSQR_INTEL_TBB
00306   }
00307       else if (which == "MpiSeqTSQR")
00308   {
00309     using Teuchos::RCP;
00310     typedef SequentialTsqr< Ordinal, Scalar > node_tsqr_type;
00311     typedef TSQR::DistTsqr< Ordinal, Scalar > dist_tsqr_type;
00312     typedef Tsqr< Ordinal, Scalar, node_tsqr_type, dist_tsqr_type > tsqr_type;
00313 
00314     RCP< node_tsqr_type > node_tsqr (new node_tsqr_type (cache_size_hint));
00315     RCP< dist_tsqr_type > dist_tsqr (new dist_tsqr_type (scalarComm));
00316     tsqr_type tsqr (node_tsqr, dist_tsqr);
00317     
00318     // Compute the factorization and explicit Q factor.
00319     TsqrVerifier< tsqr_type >::verify (tsqr, scalarComm, A_local, A_copy, 
00320                Q_local, R, contiguousCacheBlocks, 
00321                b_debug);
00322     // Save the "actual" cache block size
00323     actual_cache_size_hint = tsqr.cache_size_hint();
00324   }
00325       else 
00326   throw std::logic_error("Unknown TSQR implementation type \"" + which + "\"");
00327 
00328       // Print out the Q and R factors
00329       if (b_extra_debug && b_debug)
00330   {
00331     if (my_rank == 0)
00332       cerr << endl << "Q factor:" << endl;
00333     scalarComm->barrier();
00334     printGlobalMatrix (cerr, Q_local, scalarComm.get(), ordinalComm.get());
00335     scalarComm->barrier ();
00336     if (my_rank == 0)
00337       {
00338         cerr << endl << "R factor:" << endl;
00339         print_local_matrix (cerr, ncols, ncols, R.get(), R.lda());
00340         cerr << endl;
00341       }
00342     scalarComm->barrier ();
00343   }
00344 
00345       // Test accuracy of the resulting factorization
00346       std::vector< magnitude_type > results = 
00347   global_verify (nrows_local, ncols, A_local.get(), A_local.lda(),
00348            Q_local.get(), Q_local.lda(), R.get(), R.lda(), 
00349            scalarComm.get());
00350       if (b_debug)
00351   {
00352     scalarComm->barrier();
00353     if (my_rank == 0)
00354       cerr << "-- Finished global_verify" << endl;
00355   }
00356 
00357       // Print the results on Proc 0.
00358       if (my_rank == 0)
00359   {
00360     if (human_readable)
00361       {
00362         std::string human_readable_name;
00363 
00364         if (which == "MpiSeqTSQR")
00365     human_readable_name = "MPI parallel / cache-blocked TSQR";
00366         else if (which == "MpiTbbTSQR")
00367     {
00368 #ifdef HAVE_TSQR_INTEL_TBB
00369       human_readable_name = "MPI parallel / TBB parallel / cache-blocked TSQR";
00370 #else
00371       throw std::logic_error("TSQR not built with Intel TBB support");
00372 #endif // HAVE_TSQR_INTEL_TBB
00373     }
00374         else 
00375     throw std::logic_error("Unknown TSQR implementation type \"" + which + "\"");
00376 
00377         cout << human_readable_name << ":" << endl
00378        << "Scalar type: " << scalarTypeName << endl
00379        << "# rows: " << nrows_global << endl
00380        << "# columns: " << ncols << endl
00381        << "# MPI processes: " << nprocs << endl;
00382 #ifdef HAVE_TSQR_INTEL_TBB
00383         if (which == "MpiTbbTSQR")
00384     cout << "# cores per process = " << num_cores << endl;
00385 #endif // HAVE_TSQR_INTEL_TBB
00386         cout << "Cache size hint in bytes: " << actual_cache_size_hint << endl
00387        << "Contiguous cache blocks? " << contiguousCacheBlocks << endl
00388        << "Absolute residual $\\| A - Q R \\|_2: "
00389        << results[0] << endl
00390        << "Absolute orthogonality $\\| I - Q^* Q \\|_2$: " 
00391        << results[1] << endl
00392        << "Test matrix norm $\\| A \\|_F$: "
00393        << results[2] << endl
00394        << endl;
00395       }
00396     else
00397       {
00398         if (printFieldNames)
00399     {
00400       cout << "%"
00401            << "method"
00402            << ",scalarType"
00403            << ",globalNumRows"
00404            << ",numCols"
00405            << ",numProcs"
00406            << ",numCores"
00407            << ",cacheSizeHint"
00408            << ",contiguousCacheBlocks"
00409            << ",absFrobResid"
00410            << ",absFrobOrthog"
00411            << ",frobA" << endl;
00412     }
00413 
00414         cout << which
00415        << "," << scalarTypeName
00416        << "," << nrows_global
00417        << "," << ncols
00418        << "," << nprocs;
00419 #ifdef HAVE_TSQR_INTEL_TBB
00420         if (which == "MpiTbbTSQR")
00421     cout << "," << num_cores;
00422         else
00423     cout << ",1";
00424 #else
00425         cout << ",1" << endl;
00426 #endif // HAVE_TSQR_INTEL_TBB
00427         cout << "," << actual_cache_size_hint
00428        << "," << contiguousCacheBlocks 
00429        << "," << results[0] 
00430        << "," << results[1]
00431        << "," << results[2]
00432        << endl;
00433       }
00434   }
00435     }
00436 
00437 
00438     template<class TsqrBase, class TimerType>
00439     double
00440     do_tsqr_benchmark (const std::string& which,
00441            TsqrBase& tsqr, 
00442            const Teuchos::RCP< MessengerBase< typename TsqrBase::scalar_type > >& messenger,
00443            const Matrix< typename TsqrBase::ordinal_type, typename TsqrBase::scalar_type >& A_local,
00444            Matrix< typename TsqrBase::ordinal_type, typename TsqrBase::scalar_type >& A_copy,
00445            Matrix< typename TsqrBase::ordinal_type, typename TsqrBase::scalar_type >& Q_local,
00446            Matrix< typename TsqrBase::ordinal_type, typename TsqrBase::scalar_type >& R,
00447            const int ntrials,
00448            const bool contiguousCacheBlocks,
00449            const bool human_readable,
00450            const bool b_debug = false)
00451     {
00452       typedef typename TsqrBase::FactorOutput factor_output_type;
00453       typedef typename TsqrBase::ordinal_type ordinal_type;
00454       using std::cerr;
00455       using std::cout;
00456       using std::endl;
00457 
00458       const ordinal_type nrows_local = A_local.nrows();
00459       const ordinal_type ncols = A_local.ncols();
00460 
00461       if (contiguousCacheBlocks)
00462   {
00463     tsqr.cache_block (nrows_local, ncols, A_copy.get(), 
00464           A_local.get(), A_local.lda());
00465     if (b_debug)
00466       {
00467         messenger->barrier();
00468         if (messenger->rank() == 0)
00469     cerr << "-- Cache-blocked input matrix to factor." << endl;
00470       }
00471   }
00472       else
00473   A_copy.copy (A_local);
00474 
00475       if (b_debug)
00476   {
00477     messenger->barrier();
00478     if (messenger->rank() == 0)
00479       cerr << "-- Starting timing loop" << endl;
00480   }
00481 
00482       // Benchmark TSQR for ntrials trials.  The answer (the numerical
00483       // results of the factorization) is only valid if ntrials == 1,
00484       // but this is a benchmark and not a verification routine.  Call
00485       // tsqr_verify() if you want to determine whether TSQR computes
00486       // the right answer.
00487       //
00488       // Name of timer doesn't matter here; we only need the timing.
00489       TSQR::Test::verifyTimerConcept< TimerType >();
00490       TimerType timer (which);
00491 
00492 
00493       const bool testFactorExplicit = true;
00494       double tsqr_timing;
00495       if (testFactorExplicit)
00496   {
00497     timer.start();
00498     for (int trial_num = 0; trial_num < ntrials; ++trial_num)
00499       tsqr.factorExplicit (A_copy.view(), Q_local.view(), R.view(), 
00500          contiguousCacheBlocks);
00501     tsqr_timing = timer.stop();
00502   }
00503       else
00504   {
00505     timer.start();
00506     for (int trial_num = 0; trial_num < ntrials; ++trial_num)
00507       {
00508         // Factor the matrix and compute the explicit Q factor.
00509         // Don't worry about the fact that we're overwriting the
00510         // input; this is a benchmark, not a numerical verification
00511         // test.  (We have the latter implemented as tsqr_verify()
00512         // in this file.)  For the same reason, don't worry about
00513         // un-cache-blocking the output (when cache blocks are
00514         // stored contiguously).
00515         factor_output_type factor_output = 
00516     tsqr.factor (nrows_local, ncols, A_copy.get(), A_copy.lda(), 
00517            R.get(), R.lda(), contiguousCacheBlocks);
00518         tsqr.explicit_Q (nrows_local, 
00519              ncols, A_copy.get(), A_copy.lda(), factor_output, 
00520              ncols, Q_local.get(), Q_local.lda(), 
00521              contiguousCacheBlocks);
00522         // Timings in debug mode likely won't make sense, because
00523         // Proc 0 is outputting the debug messages to cerr.
00524         // Nevertheless, we don't put any "if(b_debug)" calls in the
00525         // timing loop.
00526       }
00527     // Compute the resulting total time (in seconds) to execute
00528     // ntrials runs of Tsqr::factor() and Tsqr::explicit_Q().  The
00529     // time may differ on different MPI processes.
00530     tsqr_timing = timer.stop();
00531   }
00532 
00533       if (b_debug)
00534   {
00535     messenger->barrier();
00536     if (messenger->rank() == 0)
00537       cerr << "-- Finished timing loop" << endl;
00538   }
00539       return tsqr_timing;
00540     }
00541 
00595     template<class Ordinal, class Scalar, class Generator, class TimerType>
00596     void
00597     benchmarkTsqr (const std::string& which,
00598        const std::string& scalarTypeName,
00599        Generator& generator,
00600        const int ntrials,
00601        const Ordinal nrows_global,
00602        const Ordinal ncols,
00603        const Teuchos::RCP< MessengerBase< Ordinal > >& ordinalComm,
00604        const Teuchos::RCP< MessengerBase< Scalar > >& scalarComm,
00605        const Ordinal num_cores,
00606        const size_t cache_size_hint,
00607        const bool contiguousCacheBlocks,
00608        const bool printFieldNames,
00609        const bool human_readable,
00610        const bool b_debug)
00611     {
00612       using std::cerr;
00613       using std::cout;
00614       using std::endl;
00615 
00616       TSQR::Test::verifyTimerConcept< TimerType >();
00617       const bool b_extra_debug = false;
00618       const int nprocs = scalarComm->size();
00619       const int my_rank = scalarComm->rank();
00620       if (b_debug)
00621   {
00622     scalarComm->barrier();
00623     if (my_rank == 0)
00624       cerr << "tsqr_benchmark:" << endl;
00625     scalarComm->barrier();
00626   }
00627       const Ordinal nrows_local = numLocalRows (nrows_global, my_rank, nprocs);
00628 
00629       // Set up storage for the test problem.
00630       Matrix< Ordinal, Scalar > A_local (nrows_local, ncols);
00631       Matrix< Ordinal, Scalar > Q_local (nrows_local, ncols);
00632       if (std::numeric_limits< Scalar >::has_quiet_NaN)
00633   {
00634     A_local.fill (std::numeric_limits< Scalar >::quiet_NaN());
00635     Q_local.fill (std::numeric_limits< Scalar >::quiet_NaN());
00636   }
00637       Matrix< Ordinal, Scalar > R (ncols, ncols, Scalar(0));
00638 
00639       // Generate the test problem.
00640       distributedTestProblem (generator, A_local, ordinalComm.get(), scalarComm.get());
00641       if (b_debug)
00642   {
00643     scalarComm->barrier();
00644     if (my_rank == 0)
00645       cerr << "-- Generated test problem." << endl;
00646   }
00647 
00648       // Make sure that the test problem (the matrix to factor) was
00649       // distributed correctly.
00650       if (b_extra_debug && b_debug)
00651   {
00652     if (my_rank == 0)
00653       cerr << "Test matrix A:" << endl;
00654     scalarComm->barrier ();
00655     printGlobalMatrix (cerr, A_local, scalarComm.get(), ordinalComm.get());
00656     scalarComm->barrier ();
00657   }
00658 
00659       // Factoring the matrix stored in A_local overwrites it, so we
00660       // make a copy of A_local.  If specified, rearrange cache blocks
00661       // in the copy.  Initialize with NaNs to make sure that cache
00662       // blocking worked correctly.
00663       Matrix< Ordinal, Scalar > A_copy (nrows_local, ncols);
00664       if (std::numeric_limits< Scalar >::has_quiet_NaN)
00665   A_copy.fill (std::numeric_limits< Scalar >::quiet_NaN());
00666 
00667       // actual_cache_size_hint: "cache_size_hint" is just a
00668       // suggestion.  TSQR determines the cache block size itself;
00669       // this remembers it so we can print it out later.
00670       size_t actual_cache_size_hint;
00671       // Run time (in seconds, as a double-precision floating-point
00672       // value) for TSQR on this MPI node.
00673       double tsqr_timing;
00674 
00675       if (which == "MpiTbbTSQR")
00676   {
00677 #ifdef HAVE_TSQR_INTEL_TBB
00678     using Teuchos::RCP;
00679     typedef TSQR::TBB::TbbTsqr< Ordinal, Scalar > node_tsqr_type;
00680     typedef TSQR::DistTsqr< Ordinal, Scalar > dist_tsqr_type;
00681     typedef Tsqr< Ordinal, Scalar, node_tsqr_type, dist_tsqr_type > tsqr_type;
00682 
00683     RCP< node_tsqr_type > nodeTsqr (new node_tsqr_type (num_cores, cache_size_hint));
00684     RCP< dist_tsqr_type > distTsqr (new dist_tsqr_type (scalarComm));
00685     tsqr_type tsqr (nodeTsqr, distTsqr);
00686 
00687     // Run the benchmark.
00688     tsqr_timing = 
00689       do_tsqr_benchmark< tsqr_type, TimerType > (which, tsqr, scalarComm, A_local,
00690                    A_copy, Q_local, R, ntrials, 
00691                    contiguousCacheBlocks, 
00692                    human_readable, b_debug);
00693 
00694     // Save the "actual" cache block size
00695     actual_cache_size_hint = tsqr.cache_size_hint();
00696 #else
00697     throw std::logic_error("TSQR not built with Intel TBB support");
00698 #endif // HAVE_TSQR_INTEL_TBB
00699   }
00700       else if (which == "MpiSeqTSQR")
00701   {
00702     using Teuchos::RCP;
00703     typedef SequentialTsqr< Ordinal, Scalar > node_tsqr_type;
00704     typedef TSQR::DistTsqr< Ordinal, Scalar > dist_tsqr_type;
00705     typedef Tsqr< Ordinal, Scalar, node_tsqr_type, dist_tsqr_type > tsqr_type;
00706 
00707     // Set up TSQR.
00708     RCP< node_tsqr_type > nodeTsqr (new node_tsqr_type (cache_size_hint));
00709     RCP< dist_tsqr_type > distTsqr (new dist_tsqr_type (scalarComm));
00710     tsqr_type tsqr (nodeTsqr, distTsqr);
00711     
00712     // Run the benchmark.
00713     tsqr_timing = 
00714       do_tsqr_benchmark< tsqr_type, TimerType > (which, tsqr, scalarComm, A_local,
00715                    A_copy, Q_local, R, ntrials, 
00716                    contiguousCacheBlocks, 
00717                    human_readable, b_debug);
00718     // Save the "actual" cache block size
00719     actual_cache_size_hint = tsqr.cache_size_hint();
00720   }
00721       else
00722   throw std::logic_error("Unknown TSQR implementation type \"" + which + "\"");
00723 
00724       // Find the min and max TSQR timing on all processors.
00725       const double min_tsqr_timing = scalarComm->globalMin (tsqr_timing);
00726       const double max_tsqr_timing = scalarComm->globalMax (tsqr_timing);
00727 
00728       // Print the results on Proc 0.
00729       if (my_rank == 0)
00730   {
00731     if (human_readable)
00732       {
00733         std::string human_readable_name;
00734 
00735         if (which == "MpiSeqTSQR")
00736     human_readable_name = "MPI parallel / cache-blocked TSQR";
00737         else if (which == "MpiTbbTSQR")
00738     {
00739 #ifdef HAVE_TSQR_INTEL_TBB
00740       human_readable_name = "MPI parallel / TBB parallel / cache-blocked TSQR";
00741 #else
00742       throw std::logic_error("TSQR not built with Intel TBB support");
00743 #endif // HAVE_TSQR_INTEL_TBB
00744     }
00745         else 
00746     throw std::logic_error("Unknown TSQR implementation type \"" + which + "\"");
00747 
00748         cout << human_readable_name << ":" << endl
00749        << "Scalar type: " << scalarTypeName << endl
00750        << "# rows: " << nrows_global << endl
00751        << "# columns: " << ncols << endl
00752        << "# MPI processes: " << nprocs << endl;
00753 
00754 #ifdef HAVE_TSQR_INTEL_TBB
00755         if (which == "MpiTbbTSQR")
00756     cout << "# cores per process: " << num_cores << endl;
00757 #endif // HAVE_TSQR_INTEL_TBB
00758 
00759         cout << "Cache size hint in bytes: " << actual_cache_size_hint << endl
00760        << "contiguous cache blocks? " << contiguousCacheBlocks << endl
00761        << "# trials: " << ntrials << endl
00762        << "Min total time (s) over all MPI processes: " 
00763        << min_tsqr_timing << endl
00764        << "Max total time (s) over all MPI processes: " 
00765        << max_tsqr_timing << endl
00766        << endl;
00767       }
00768     else
00769       {
00770         if (printFieldNames)
00771     {
00772       cout << "%"
00773            << "method"
00774            << ",scalarType"
00775            << ",globalNumRows"
00776            << ",numCols"
00777            << ",numProcs"
00778            << ",numCores"
00779            << ",cacheSizeHint"
00780            << ",contiguousCacheBlocks"
00781            << ",numTrials"
00782            << ",minTiming"
00783            << ",maxTiming" 
00784            << endl;
00785     }
00786         cout << which
00787        << "," << scalarTypeName
00788        << "," << nrows_global
00789        << "," << ncols 
00790        << "," << nprocs;
00791 #ifdef HAVE_TSQR_INTEL_TBB
00792         if (which == "MpiTbbTSQR")
00793     cout << "," << num_cores;
00794         else 
00795     cout << ",1";
00796 #else
00797         cout << ",1";
00798 #endif // HAVE_TSQR_INTEL_TBB
00799         cout << "," << actual_cache_size_hint
00800        << "," << contiguousCacheBlocks
00801        << "," << ntrials 
00802        << "," << min_tsqr_timing 
00803        << "," << max_tsqr_timing 
00804        << endl;
00805       }
00806   }
00807     }
00808 
00809 
00810   } // namespace Test
00811 } // namespace TSQR
00812 
00813 #endif // __TSQR_Test_TsqrTest_hpp
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends