Tsqr_MgsTest.hpp
//@HEADER
// ************************************************************************
//
//          Kokkos: Node API and Parallel Node Kernels
//              Copyright (2009) Sandia Corporation
//
// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
// license for use of this work by or on behalf of the U.S. Government.
//
// This library is free software; you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as
// published by the Free Software Foundation; either version 2.1 of the
// License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
// USA
// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
//
// ************************************************************************
//@HEADER

#ifndef __TSQR_Test_MgsTest_hpp
#define __TSQR_Test_MgsTest_hpp

#include <Tsqr_ConfigDefs.hpp>

#include <Tsqr_Mgs.hpp>
#ifdef HAVE_TSQR_INTEL_TBB
#  include <TbbTsqr_TbbMgs.hpp>
#endif // HAVE_TSQR_INTEL_TBB
#include <Tsqr_TestSetup.hpp>
#include <Tsqr_GlobalVerify.hpp>
#include <Tsqr_printGlobalMatrix.hpp>
#include <Tsqr_verifyTimerConcept.hpp>

#include <Teuchos_RCP.hpp>

#include <algorithm>
#include <sstream>
#include <limits>
#include <iostream>
#include <stdexcept>
#include <utility>

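// This header provides two test drivers for Modified Gram-Schmidt
// (MGS) orthogonalization over MPI: verifyMgs() checks the accuracy
// of the factorization A = Q*R (Q has orthonormal columns, R is
// upper triangular), and benchmarkMgs() times repeated
// factorizations.  Both dispatch on a name string to either a
// sequential MGS ("MpiSeqMGS") or, if built with Intel TBB support,
// a TBB-parallel MGS ("MpiTbbMGS").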
namespace TSQR {
  namespace Test {

    static std::string
    mgs_human_readable_name (const std::string& which)
    {
      if (which == "MpiSeqMGS")
        return std::string ("MPI parallel / sequential MGS");
      else if (which == "MpiTbbMGS")
        {
#ifdef HAVE_TSQR_INTEL_TBB
          return std::string ("MPI parallel / TBB parallel MGS");
#else
          throw std::logic_error ("MGS not built with Intel TBB support");
#endif // HAVE_TSQR_INTEL_TBB
        }
      else
        throw std::logic_error ("Unknown MGS implementation type \"" + which + "\"");
    }

    template< class MgsType >
    class MgsVerifier {
    public:
      typedef MgsType mgs_type;
      typedef typename MgsType::ordinal_type ordinal_type;
      typedef typename MgsType::scalar_type scalar_type;
      typedef Matrix< ordinal_type, scalar_type > matrix_type;
      typedef MessengerBase< scalar_type > messenger_type;
      typedef Teuchos::RCP< messenger_type > messenger_ptr;

      static void
      verify (mgs_type& orthogonalizer,
              const messenger_ptr& messenger,
              matrix_type& Q_local,
              matrix_type& R,
              const bool b_debug = false)
      {
        using std::cerr;
        using std::endl;

        // Factor the (copy of the) matrix.  On output, the explicit Q
        // factor (of A_local) is in Q_local and the R factor is in R.
        orthogonalizer.mgs (Q_local.nrows(), Q_local.ncols(),
                            Q_local.get(), Q_local.lda(),
                            R.get(), R.lda());
        if (b_debug)
          {
            messenger->barrier();
            if (messenger->rank() == 0)
              cerr << "-- Finished MGS::mgs" << endl;
          }
      }
    };

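    // Test the accuracy of an MGS implementation (chosen by the name
    // string "which") on a randomly generated test problem with
    // nrows_global rows and ncols columns, distributed over the MPI
    // processes in scalarComm, and print the results to stdout on
    // Proc 0.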
    template< class Ordinal, class Scalar, class Generator >
    void
    verifyMgs (const std::string& which,
               Generator& generator,
               const Ordinal nrows_global,
               const Ordinal ncols,
               const Teuchos::RCP< MessengerBase< Ordinal > >& ordinalComm,
               const Teuchos::RCP< MessengerBase< Scalar > >& scalarComm,
               const int num_cores,
               const bool human_readable,
               const bool b_debug)
    {
      typedef typename Teuchos::ScalarTraits< Scalar >::magnitudeType magnitude_type;
      using std::cerr;
      using std::cout;
      using std::endl;

      const bool b_extra_debug = false;
      const int nprocs = scalarComm->size();
      const int my_rank = scalarComm->rank();
      if (b_debug)
        {
          scalarComm->barrier();
          if (my_rank == 0)
            cerr << "mgs_verify:" << endl;
          scalarComm->barrier();
        }
      const Ordinal nrows_local = numLocalRows (nrows_global, my_rank, nprocs);
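      // numLocalRows computes this process' share of the nrows_global
      // matrix rows; the test problem is distributed by rows across
      // the nprocs MPI processes.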
00135 
00136       // Set up storage for the test problem
00137       Matrix< Ordinal, Scalar > A_local (nrows_local, ncols);
00138       if (std::numeric_limits< Scalar >::has_quiet_NaN)
00139   A_local.fill (std::numeric_limits< Scalar >::quiet_NaN());
00140       Matrix< Ordinal, Scalar > R (ncols, ncols, Scalar(0));
00141 
00142       // Generate the test problem.
00143       distributedTestProblem (generator, A_local, ordinalComm.get(), scalarComm.get());
00144       if (b_debug)
00145   {
00146     scalarComm->barrier();
00147     if (my_rank == 0)
00148       cerr << "-- Generated test problem." << endl;
00149   }
00150 
00151       // Make sure that the test problem (the matrix to factor) was
00152       // distributed correctly.
00153       if (b_extra_debug && b_debug)
00154   {
00155     if (my_rank == 0)
00156       cerr << "Test matrix A:" << endl;
00157     scalarComm->barrier();
00158     printGlobalMatrix (cerr, A_local, scalarComm.get(), ordinalComm.get());
00159     scalarComm->barrier();
00160   }
00161 
00162       // Factoring the matrix stored in A_local overwrites it, so we
00163       // copy A_local into Q_local.  MGS orthogonalization does not
00164       // support contiguously stored cache blocks, unlike TSQR, so we
00165       // don't have to consider whether or not to rearrange cache
00166       // blocks here (unlike with TSQR).
00167       Matrix< Ordinal, Scalar > Q_local (A_local);
00168 
00169       if (b_debug)
00170   {
00171     scalarComm->barrier();
00172     if (my_rank == 0)
00173       cerr << "-- Starting verification" << endl;
00174   }
00175 
00176       if (which == "MpiTbbMGS")
00177   {
00178 #ifdef HAVE_TSQR_INTEL_TBB
00179     typedef TSQR::TBB::TbbMgs< Ordinal, Scalar > mgs_type;
00180     mgs_type mgser (scalarComm);
00181     MgsVerifier< mgs_type >::verify (mgser, scalarComm, Q_local, R, b_debug);
00182 #else
00183     throw std::logic_error("MGS not built with Intel TBB support");
00184 #endif // HAVE_TSQR_INTEL_TBB
00185   }
00186       else if (which == "MpiSeqMGS")
00187   {
00188     typedef MGS< Ordinal, Scalar > mgs_type;
00189     mgs_type mgser (scalarComm);
00190     MgsVerifier< mgs_type >::verify (mgser, scalarComm, Q_local, R, b_debug);
00191   }
00192       else
00193   throw std::logic_error ("Invalid MGS implementation type \"" + which + "\"");
00194 
      // Print out the Q and R factors.
      if (b_extra_debug && b_debug)
        {
          if (my_rank == 0)
            cerr << endl << "Q factor:" << endl;
          scalarComm->barrier ();
          printGlobalMatrix (cerr, Q_local, scalarComm.get(), ordinalComm.get());
          scalarComm->barrier ();
          if (my_rank == 0)
            {
              cerr << endl << "R factor:" << endl;
              print_local_matrix (cerr, ncols, ncols, R.get(), R.lda());
              cerr << endl;
            }
          scalarComm->barrier ();
        }

      // Test accuracy of the resulting factorization.
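      // global_verify returns three magnitudes: results[0] is the
      // absolute residual ||A - Q*R||, results[1] the absolute
      // orthogonality ||I - Q^T*Q||, and results[2] the norm of the
      // original test matrix (see the output statements below for
      // the norms used).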
      std::vector< magnitude_type > results =
        global_verify (nrows_local, ncols, A_local.get(), A_local.lda(),
                       Q_local.get(), Q_local.lda(), R.get(), R.lda(),
                       scalarComm.get());
      if (b_debug)
        {
          scalarComm->barrier();
          if (my_rank == 0)
            cerr << "-- Finished global_verify" << endl;
          scalarComm->barrier();
        }

      // Print the results on Proc 0.
      if (my_rank == 0)
        {
          if (human_readable)
            {
              cout << mgs_human_readable_name(which) << endl
                   << "# rows = " << nrows_global << endl
                   << "# columns = " << ncols << endl
                   << "# MPI processes = " << nprocs << endl;
              if (which == "MpiTbbMGS")
                cout << "# cores per process = " << num_cores << endl;
              cout << "Absolute residual $\\|A - Q*R\\|_2$: "
                   << results[0] << endl
                   << "Absolute orthogonality $\\|I - Q^T*Q\\|_2$: "
                   << results[1] << endl
                   << "Test matrix norm $\\| A \\|_F$: "
                   << results[2] << endl
                   << endl;
            }
          else
            {
              cout << which
                   << "," << nrows_global
                   << "," << ncols
                   << "," << nprocs;
              if (which == "MpiTbbMGS")
                cout << "," << num_cores;
              cout << "," << results[0]
                   << "," << results[1]
                   << "," << results[2]
                   << endl;
            }
        }
    }


    template< class MgsBase, class TimerType >
    static double // returns total timing in seconds
    do_mgs_benchmark (MgsBase& orthogonalizer,
                      Matrix< typename MgsBase::ordinal_type, typename MgsBase::scalar_type >& Q_local,
                      Matrix< typename MgsBase::ordinal_type, typename MgsBase::scalar_type >& R,
                      const int num_trials,
                      const bool human_readable)
    {
      typedef typename MgsBase::ordinal_type ordinal_type;
      using std::cout;

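      // Check that TimerType has the interface the timing loop below
      // relies on: construction from a name string, start(), and a
      // stop() that returns the elapsed time in seconds.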
      TSQR::Test::verifyTimerConcept< TimerType >();

      const ordinal_type nrows_local = Q_local.nrows();
      const ordinal_type ncols = Q_local.ncols();

      // Benchmark MGS for num_trials trials.  The answer (the
      // numerical results of the factorization) is only valid if
      // num_trials == 1, but this is a benchmark and not a
      // verification routine.  Call verifyMgs() if you want to
      // determine whether MGS computes the right answer.
      //
      // The name of the timer doesn't matter here; we only need the timing.
      TimerType timer("MGS");
      timer.start();
      for (int trial_num = 0; trial_num < num_trials; ++trial_num)
        {
          // Orthogonalize the columns of A using MGS.  Don't worry
          // about the fact that we're overwriting the input; this is
          // a benchmark, not a numerical verification test.  (We have
          // the latter implemented as verifyMgs() in this file.)
          orthogonalizer.mgs (nrows_local, ncols, Q_local.get(),
                              Q_local.lda(), R.get(), R.lda());
          // Timings in debug mode likely won't make sense, because
          // Proc 0 is outputting the debug messages to cerr.
          // Nevertheless, we don't put any "if(b_debug)" calls in the
          // timing loop.
        }
      // Compute the resulting total time (in seconds) to execute
      // num_trials runs of mgs().  The time may differ on different
      // MPI processes.
      const double mgs_timing = timer.stop();
      return mgs_timing;
    }

    template< class Ordinal, class Scalar, class Generator, class TimerType >
    void
    benchmarkMgs (const std::string& which,
                  Generator& generator,
                  const int ntrials,
                  const Ordinal nrows_global,
                  const Ordinal ncols,
                  const Teuchos::RCP< MessengerBase< Ordinal > >& ordinalComm,
                  const Teuchos::RCP< MessengerBase< Scalar > >& scalarComm,
                  const int num_cores,
                  const bool human_readable,
                  const bool b_debug)
    {
      typedef typename Teuchos::ScalarTraits< Scalar >::magnitudeType magnitude_type;
      using std::cerr;
      using std::cout;
      using std::endl;

      TSQR::Test::verifyTimerConcept< TimerType >();

      const bool b_extra_debug = false;
      const int nprocs = scalarComm->size();
      const int my_rank = scalarComm->rank();
      if (b_debug)
        {
          scalarComm->barrier();
          if (my_rank == 0)
            cerr << "mgs_benchmark:" << endl;
          scalarComm->barrier();
        }
      const Ordinal nrows_local = numLocalRows (nrows_global, my_rank, nprocs);

      // Set up storage for the test problem.
      Matrix< Ordinal, Scalar > A_local (nrows_local, ncols);
      if (std::numeric_limits< Scalar >::has_quiet_NaN)
        A_local.fill (std::numeric_limits< Scalar >::quiet_NaN());
      Matrix< Ordinal, Scalar > R (ncols, ncols, Scalar(0));

      // Generate the test problem.
      distributedTestProblem (generator, A_local, ordinalComm.get(), scalarComm.get());
      if (b_debug)
        {
          scalarComm->barrier();
          if (my_rank == 0)
            cerr << "-- Generated test problem." << endl;
        }

      // Make sure that the test problem (the matrix to factor) was
      // distributed correctly.
      if (b_extra_debug && b_debug)
        {
          if (my_rank == 0)
            cerr << "Test matrix A:" << endl;
          scalarComm->barrier ();
          printGlobalMatrix (cerr, A_local, scalarComm.get(), ordinalComm.get());
          scalarComm->barrier ();
        }

      // Factoring the matrix stored in A_local overwrites it, so we
      // make a copy of A_local.  MGS orthogonalization does not
      // support contiguously stored cache blocks, unlike TSQR, so we
      // don't have to consider whether or not to rearrange cache
      // blocks here (unlike with TSQR).
      Matrix< Ordinal, Scalar > Q_local (A_local);

      if (b_debug)
        {
          scalarComm->barrier();
          if (my_rank == 0)
            cerr << "-- Starting timing loop" << endl;
        }

      // Set up MGS and run the benchmark.
      double mgs_timing; // Total run time in seconds of all ntrials trials
      if (which == "MpiTbbMGS")
        {
#ifdef HAVE_TSQR_INTEL_TBB
          typedef TSQR::TBB::TbbMgs< Ordinal, Scalar > mgs_type;
          mgs_type mgser (scalarComm);
          mgs_timing = do_mgs_benchmark< mgs_type, TimerType > (mgser, Q_local, R,
                                                                ntrials, human_readable);
#else
          throw std::logic_error ("MGS not built with Intel TBB support");
#endif // HAVE_TSQR_INTEL_TBB
        }
      else if (which == "MpiSeqMGS")
        {
          typedef MGS< Ordinal, Scalar > mgs_type;
          mgs_type mgser (scalarComm);
          mgs_timing = do_mgs_benchmark< mgs_type, TimerType > (mgser, Q_local, R,
                                                                ntrials, human_readable);
        }
      else
        throw std::logic_error ("Invalid MGS implementation type \"" + which + "\"");

      if (b_debug)
        {
          scalarComm->barrier();
          if (my_rank == 0)
            cerr << "-- Finished timing loop" << endl;
        }

      // Find the min and max MGS timing on all processors.
      const double min_mgs_timing = scalarComm->globalMin (mgs_timing);
      const double max_mgs_timing = scalarComm->globalMax (mgs_timing);

      if (b_debug)
        {
          scalarComm->barrier();
          if (my_rank == 0)
            cerr << "-- Computed min and max timings" << endl;
        }

      // Print the results on Proc 0.
      if (my_rank == 0)
        {
          if (human_readable)
            {
              cout << mgs_human_readable_name(which) << ":" << endl
                   << "# rows = " << nrows_global << endl
                   << "# columns = " << ncols << endl
                   << "# MPI processes = " << nprocs << endl;
              if (which == "MpiTbbMGS")
                cout << "# cores per process = " << num_cores << endl;
              cout << "# trials = " << ntrials << endl
                   << "Min total time (s) over all MPI processes = "
                   << min_mgs_timing << endl
                   << "Max total time (s) over all MPI processes = "
                   << max_mgs_timing << endl
                   << endl;
            }
          else
            {
              cout << which
                   << "," << nrows_global
                   << "," << ncols
                   << "," << nprocs;
              if (which == "MpiTbbMGS")
                cout << "," << num_cores;
              cout << "," << ntrials
                   << "," << min_mgs_timing
                   << "," << max_mgs_timing
                   << endl;
            }
        }
    }
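
    // Example call (a sketch only: "gen" stands for a test-problem
    // generator usable with distributedTestProblem, and "WallTimer"
    // for a hypothetical timer type satisfying the concept checked
    // by verifyTimerConcept):
    //
    //   benchmarkMgs< int, double, GenType, WallTimer >
    //     ("MpiSeqMGS", gen, 100, nrows_global, ncols,
    //      ordinalComm, scalarComm, 1, true, false);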

  } // namespace Test
} // namespace TSQR

#endif // __TSQR_Test_MgsTest_hpp