Anasazi Version of the Day
Tsqr_MgsTest.hpp
00001 // @HEADER
00002 // ***********************************************************************
00003 //
00004 //                 Anasazi: Block Eigensolvers Package
00005 //                 Copyright (2010) Sandia Corporation
00006 //
00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
00008 // license for use of this work by or on behalf of the U.S. Government.
00009 //
00010 // This library is free software; you can redistribute it and/or modify
00011 // it under the terms of the GNU Lesser General Public License as
00012 // published by the Free Software Foundation; either version 2.1 of the
00013 // License, or (at your option) any later version.
00014 //
00015 // This library is distributed in the hope that it will be useful, but
00016 // WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 // Lesser General Public License for more details.
00019 //
00020 // You should have received a copy of the GNU Lesser General Public
00021 // License along with this library; if not, write to the Free Software
00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00023 // USA
00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
00025 //
00026 // ***********************************************************************
00027 // @HEADER
00028 
00029 #ifndef __TSQR_Test_MgsTest_hpp
00030 #define __TSQR_Test_MgsTest_hpp
00031 
00032 #include <Tsqr_Config.hpp>
00033 
00034 #include <Tsqr_Mgs.hpp>
00035 #ifdef HAVE_TSQR_INTEL_TBB
00036 #  include <TbbTsqr_TbbMgs.hpp>
00037 #endif // HAVE_TSQR_INTEL_TBB
00038 #include <Tsqr_TestSetup.hpp>
00039 #include <Tsqr_GlobalVerify.hpp>
00040 #include <Tsqr_printGlobalMatrix.hpp>
00041 #include <Tsqr_verifyTimerConcept.hpp>
00042 
00043 #include <Teuchos_RCP.hpp>
00044 
00045 #include <algorithm>
00046 #include <sstream>
00047 #include <limits>
00048 #include <iostream>
00049 #include <stdexcept>
00050 #include <utility>
00051 
00054 
00055 namespace TSQR {
00056   namespace Test {
00057 
00058     static std::string
00059     mgs_human_readable_name (const std::string& which)
00060     {
00061       if (which == "MpiSeqMGS")
00062   return std::string ("MPI parallel / sequential MGS");
00063       else if (which == "MpiTbbMGS")
00064   {
00065 #ifdef HAVE_TSQR_INTEL_TBB
00066     return std::string ("MPI parallel / TBB parallel MGS");
00067 #else
00068     throw std::logic_error("MGS not built with Intel TBB support");
00069 #endif // HAVE_TSQR_INTEL_TBB
00070   }
00071       else 
00072   throw std::logic_error("Unknown MGS implementation type \"" + which + "\"");
00073     }
00074 
00075     template< class MgsType >
00076     class MgsVerifier {
00077     public:
00078       typedef MgsType mgs_type;
00079       typedef typename MgsType::ordinal_type ordinal_type;
00080       typedef typename MgsType::scalar_type scalar_type;
00081       typedef Matrix< ordinal_type, scalar_type > matrix_type;
00082       typedef MessengerBase< scalar_type > messenger_type;
00083       typedef Teuchos::RCP< messenger_type > messenger_ptr;
00084 
00085       static void
00086       verify (mgs_type& orthogonalizer,
00087         const messenger_ptr& messenger,
00088         matrix_type& Q_local,
00089         matrix_type& R,
00090         const bool b_debug = false)
00091       {
00092   using std::cerr;
00093   using std::endl;
00094 
00095   // Factor the (copy of the) matrix.  On output, the explicit Q
00096   // factor (of A_local) is in Q_local and the R factor is in R.
00097   orthogonalizer.mgs (Q_local.nrows(), Q_local.ncols(), 
00098           Q_local.get(), Q_local.lda(),
00099           R.get(), R.lda());
00100   if (b_debug)
00101     {
00102       messenger->barrier();
00103       if (messenger->rank() == 0)
00104         cerr << "-- Finished MGS::mgs" << endl;
00105     }
00106       }
00107     };
00108 
00109     template< class Ordinal, class Scalar, class Generator >
00110     void
00111     verifyMgs (const std::string& which,
00112          Generator& generator,
00113          const Ordinal nrows_global,
00114          const Ordinal ncols,
00115          const Teuchos::RCP< MessengerBase< Ordinal > >& ordinalComm,
00116          const Teuchos::RCP< MessengerBase< Scalar > >& scalarComm,
00117          const int num_cores,
00118          const bool human_readable,
00119          const bool b_debug)
00120     {
00121       typedef typename ScalarTraits< Scalar >::magnitude_type magnitude_type;
00122       using std::cerr;
00123       using std::cout;
00124       using std::endl;
00125 
00126       const bool b_extra_debug = false;
00127       const int nprocs = scalarComm->size();
00128       const int my_rank = scalarComm->rank();
00129       if (b_debug)
00130   {
00131     scalarComm->barrier();
00132     if (my_rank == 0)
00133       cerr << "mgs_verify:" << endl;
00134     scalarComm->barrier();
00135   }
00136       const Ordinal nrows_local = numLocalRows (nrows_global, my_rank, nprocs);
00137 
00138       // Set up storage for the test problem
00139       Matrix< Ordinal, Scalar > A_local (nrows_local, ncols);
00140       if (std::numeric_limits< Scalar >::has_quiet_NaN)
00141   A_local.fill (std::numeric_limits< Scalar >::quiet_NaN());
00142       Matrix< Ordinal, Scalar > R (ncols, ncols, Scalar(0));
00143 
00144       // Generate the test problem.
00145       distributedTestProblem (generator, A_local, ordinalComm.get(), scalarComm.get());
00146       if (b_debug)
00147   {
00148     scalarComm->barrier();
00149     if (my_rank == 0)
00150       cerr << "-- Generated test problem." << endl;
00151   }
00152 
00153       // Make sure that the test problem (the matrix to factor) was
00154       // distributed correctly.
00155       if (b_extra_debug && b_debug)
00156   {
00157     if (my_rank == 0)
00158       cerr << "Test matrix A:" << endl;
00159     scalarComm->barrier();
00160     printGlobalMatrix (cerr, A_local, scalarComm.get(), ordinalComm.get());
00161     scalarComm->barrier();
00162   }
00163 
00164       // Factoring the matrix stored in A_local overwrites it, so we
00165       // copy A_local into Q_local.  MGS orthogonalization does not
00166       // support contiguously stored cache blocks, unlike TSQR, so we
00167       // don't have to consider whether or not to rearrange cache
00168       // blocks here (unlike with TSQR).
00169       Matrix< Ordinal, Scalar > Q_local (A_local);
00170 
00171       if (b_debug)
00172   {
00173     scalarComm->barrier();
00174     if (my_rank == 0)
00175       cerr << "-- Starting verification" << endl;
00176   }
00177 
00178       if (which == "MpiTbbMGS")
00179   {
00180 #ifdef HAVE_TSQR_INTEL_TBB
00181     typedef TSQR::TBB::TbbMgs< Ordinal, Scalar > mgs_type;
00182     mgs_type mgser (scalarComm);
00183     MgsVerifier< mgs_type >::verify (mgser, scalarComm, Q_local, R, b_debug);
00184 #else
00185     throw std::logic_error("MGS not built with Intel TBB support");
00186 #endif // HAVE_TSQR_INTEL_TBB
00187   }
00188       else if (which == "MpiSeqMGS")
00189   {
00190     typedef MGS< Ordinal, Scalar > mgs_type;
00191     mgs_type mgser (scalarComm);
00192     MgsVerifier< mgs_type >::verify (mgser, scalarComm, Q_local, R, b_debug);
00193   }
00194       else
00195   throw std::logic_error ("Invalid MGS implementation type \"" + which + "\"");
00196 
00197       // Print out the Q and R factors
00198       if (b_extra_debug && b_debug)
00199   {
00200     if (my_rank == 0)
00201       cerr << endl << "Q factor:" << endl;
00202     scalarComm->barrier ();
00203     printGlobalMatrix (cerr, A_local, scalarComm.get(), ordinalComm.get());
00204     scalarComm->barrier ();
00205     if (my_rank == 0)
00206       {
00207         cerr << endl << "R factor:" << endl;
00208         print_local_matrix (cerr, ncols, ncols, R.get(), R.lda());
00209         cerr << endl;
00210       }
00211     scalarComm->barrier ();
00212   }
00213 
00214       // Test accuracy of the resulting factorization
00215       std::pair< magnitude_type, magnitude_type > result = 
00216   global_verify (nrows_local, ncols, A_local.get(), A_local.lda(),
00217            Q_local.get(), Q_local.lda(), R.get(), R.lda(), 
00218            scalarComm.get());
00219       if (b_debug)
00220   {
00221     scalarComm->barrier();
00222     if (my_rank == 0)
00223       cerr << "-- Finished global_verify" << endl;
00224     scalarComm->barrier();
00225   }
00226 
00227       // Print the results on Proc 0.
00228       if (my_rank == 0)
00229   {
00230     if (human_readable)
00231       {
00232         cout << mgs_human_readable_name(which) << endl
00233        << "# rows = " << nrows_global << endl
00234        << "# columns = " << ncols << endl
00235        << "# MPI processes = " << nprocs << endl;
00236         if (which == "MpiTbbTSQR")
00237     cout << "# cores per process = " << num_cores << endl;
00238         cout << "Relative residual $\\|A - Q*R\\|_2 / \\|A\\|_2$ = " 
00239        << result.first << endl
00240        << "Relative orthogonality $\\|I - Q^T*Q\\|_2$ = " 
00241        << result.second << endl
00242        << endl;
00243       }
00244     else
00245       {
00246         cout << which
00247        << "," << nrows_global
00248        << "," << ncols
00249        << "," << nprocs;
00250         if (which == "MpiTbbTSQR")
00251     cout << "," << num_cores << endl;
00252         cout << "," << result.first 
00253        << "," << result.second
00254        << endl;
00255       }
00256   }
00257     }
00258 
00259 
00260     template< class MgsBase, class TimerType >
00261     static double // returns timing in s
00262     do_mgs_benchmark (MgsBase& orthogonalizer,
00263           Matrix< typename MgsBase::ordinal_type, typename MgsBase::scalar_type >& Q_local,
00264           Matrix< typename MgsBase::ordinal_type, typename MgsBase::scalar_type >& R,
00265           const int num_trials,
00266           const bool human_readable)
00267     {
00268       typedef typename MgsBase::ordinal_type ordinal_type;
00269       using std::cout;
00270 
00271       TSQR::Test::verifyTimerConcept< TimerType >();
00272 
00273       const ordinal_type nrows_local = Q_local.nrows();
00274       const ordinal_type ncols = Q_local.ncols();
00275 
00276       // Benchmark MGS for ntrials trials.  The answer (the numerical
00277       // results of the factorization) is only valid if ntrials == 1,
00278       // but this is a benchmark and not a verification routine.  Call
00279       // mgs_verify() if you want to determine whether MGS computes
00280       // the right answer.
00281       //
00282       // Name of timer doesn't matter here; we only need the timing.
00283       TimerType timer("MGS"); 
00284       timer.start();
00285       for (int trial_num = 0; trial_num < num_trials; ++trial_num)
00286   {
00287     // Orthogonalize the columns of A using MGS.  Don't worry about
00288     // the fact that we're overwriting the input; this is a
00289     // benchmark, not a numerical verification test.  (We have the
00290     // latter implemented as mgs_verify() in this file.)
00291     orthogonalizer.mgs (nrows_local, ncols, Q_local.get(),
00292             Q_local.lda(), R.get(), R.lda());
00293     // Timings in debug mode likely won't make sense, because
00294     // Proc 0 is outputting the debug messages to cerr.
00295     // Nevertheless, we don't put any "if(b_debug)" calls in the
00296     // timing loop.
00297   }
00298       // Compute the resulting total time (in seconds) to execute
00299       // num_trials runs of :mgs().  The time may differ on different
00300       // MPI processes.
00301       const double mgs_timing = timer.stop();
00302       return mgs_timing;
00303     }
00304 
00305     template< class Ordinal, class Scalar, class Generator, class TimerType >
00306     void
00307     benchmarkMgs (const std::string& which,
00308       Generator& generator,
00309       const int ntrials,
00310       const Ordinal nrows_global,
00311       const Ordinal ncols,
00312       const Teuchos::RCP< MessengerBase< Ordinal > >& ordinalComm,
00313       const Teuchos::RCP< MessengerBase< Scalar > >& scalarComm,
00314       const int num_cores,
00315       const bool human_readable,
00316       const bool b_debug)
00317     {
00318       typedef typename ScalarTraits< Scalar >::magnitude_type magnitude_type;
00319       using std::cerr;
00320       using std::cout;
00321       using std::endl;
00322 
00323       TSQR::Test::verifyTimerConcept< TimerType >();
00324 
00325       const bool b_extra_debug = false;
00326       const int nprocs = scalarComm->size();
00327       const int my_rank = scalarComm->rank();
00328       if (b_debug)
00329   {
00330     scalarComm->barrier();
00331     if (my_rank == 0)
00332       cerr << "mgs_benchmark:" << endl;
00333     scalarComm->barrier();
00334   }
00335       const Ordinal nrows_local = numLocalRows (nrows_global, my_rank, nprocs);
00336    
00337       // Set up storage for the test problem.
00338       Matrix<Ordinal, Scalar> A_local (nrows_local, ncols);
00339       if (std::numeric_limits< Scalar >::has_quiet_NaN)
00340   A_local.fill (std::numeric_limits< Scalar >::quiet_NaN());
00341       Matrix<Ordinal, Scalar> R (ncols, ncols, Scalar(0));
00342 
00343       // Generate the test problem.
00344       distributedTestProblem (generator, A_local, ordinalComm.get(), scalarComm.get());
00345       if (b_debug)
00346   {
00347     scalarComm->barrier();
00348     if (my_rank == 0)
00349       cerr << "-- Generated test problem." << endl;
00350   }
00351 
00352       // Make sure that the test problem (the matrix to factor) was
00353       // distributed correctly.
00354       if (b_extra_debug && b_debug)
00355   {
00356     if (my_rank == 0)
00357       cerr << "Test matrix A:" << endl;
00358     scalarComm->barrier ();
00359     printGlobalMatrix (cerr, A_local, scalarComm.get(), ordinalComm.get());
00360     scalarComm->barrier ();
00361   }
00362 
00363       // Factoring the matrix stored in A_local overwrites it, so we
00364       // make a copy of A_local.  MGS orthogonalization does not
00365       // support contiguously stored cache blocks, unlike TSQR, so we
00366       // don't have to consider whether or not to rearrange cache
00367       // blocks here (unlike with TSQR).
00368       Matrix< Ordinal, Scalar > Q_local (A_local);
00369 
00370       if (b_debug)
00371   {
00372     scalarComm->barrier();
00373     if (my_rank == 0)
00374       cerr << "-- Starting timing loop" << endl;
00375   }
00376 
00377       // Set up MGS and run the benchmark.
00378       double mgs_timing; // Total run time in seconds of all ntrials trials
00379       if (which == "MpiTbbMGS")
00380   {
00381 #ifdef HAVE_TSQR_INTEL_TBB
00382     typedef TSQR::TBB::TbbMgs< Ordinal, Scalar > mgs_type;
00383     mgs_type mgser (scalarComm);
00384     mgs_timing = do_mgs_benchmark< mgs_type, TimerType > (mgser, Q_local, R, 
00385                 ntrials, human_readable);
00386 #else
00387     throw std::logic_error("MGS not built with Intel TBB support");
00388 #endif // HAVE_TSQR_INTEL_TBB
00389   }
00390       else if (which == "MpiSeqMGS")
00391   {
00392     typedef MGS< Ordinal, Scalar > mgs_type;
00393     mgs_type mgser (scalarComm);
00394     mgs_timing = do_mgs_benchmark< mgs_type, TimerType > (mgser, Q_local, R, 
00395                 ntrials, human_readable);
00396   }
00397       else
00398   throw std::logic_error ("Invalid MGS implementation type \"" + which + "\"");
00399 
00400       if (b_debug)
00401   {
00402     scalarComm->barrier();
00403     if (my_rank == 0)
00404       cerr << "-- Finished timing loop" << endl;
00405   }
00406 
00407       // Find the min and max MGS timing on all processors.
00408       const double min_mgs_timing = scalarComm->globalMin (mgs_timing);
00409       const double max_mgs_timing = scalarComm->globalMax (mgs_timing);
00410 
00411       if (b_debug)
00412   {
00413     scalarComm->barrier();
00414     if (my_rank == 0)
00415       cerr << "-- Computed min and max timings" << endl;
00416   }
00417 
00418       // Print the results on Proc 0.
00419       if (my_rank == 0)
00420   {
00421     if (human_readable)
00422       {
00423         cout << mgs_human_readable_name(which) << ":" << endl
00424        << "# rows = " << nrows_global << endl
00425        << "# columns = " << ncols << endl
00426        << "# MPI processes = " << nprocs << endl;
00427         if (which == "MpiTbbTSQR")
00428     cout << "# cores per process = " << num_cores << endl;
00429         cout << "# trials = " << ntrials << endl
00430        << "Min total time (s) over all MPI processes = " 
00431        << min_mgs_timing << endl
00432        << "Max total time (s) over all MPI processes = " 
00433        << max_mgs_timing << endl
00434        << endl;
00435       }
00436     else
00437       {
00438         cout << which
00439        << "," << nrows_global
00440        << "," << ncols 
00441        << "," << nprocs;
00442         if (which == "MpiTbbTSQR")
00443     cout << "," << num_cores << endl;
00444         cout << "," << ntrials 
00445        << "," << min_mgs_timing 
00446        << "," << max_mgs_timing 
00447        << endl;
00448       }
00449   }
00450     }
00451     
00452 
00453   } // namespace Test
00454 } // namespace TSQR
00455 
00456 #endif // __TSQR_Test_MgsTest_hpp
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends