Kokkos Node API and Local Linear Algebra Kernels Version of the Day
Tsqr_MgsTest.hpp
00001 //@HEADER
00002 // ************************************************************************
00003 // 
00004 //          Kokkos: Node API and Parallel Node Kernels
00005 //              Copyright (2008) Sandia Corporation
00006 // 
00007 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
00008 // the U.S. Government retains certain rights in this software.
00009 // 
00010 // Redistribution and use in source and binary forms, with or without
00011 // modification, are permitted provided that the following conditions are
00012 // met:
00013 //
00014 // 1. Redistributions of source code must retain the above copyright
00015 // notice, this list of conditions and the following disclaimer.
00016 //
00017 // 2. Redistributions in binary form must reproduce the above copyright
00018 // notice, this list of conditions and the following disclaimer in the
00019 // documentation and/or other materials provided with the distribution.
00020 //
00021 // 3. Neither the name of the Corporation nor the names of the
00022 // contributors may be used to endorse or promote products derived from
00023 // this software without specific prior written permission.
00024 //
00025 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
00026 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00027 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00028 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
00029 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00030 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00031 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00032 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00033 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00034 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00035 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00036 //
00037 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
00038 // 
00039 // ************************************************************************
00040 //@HEADER
00041 
00042 #ifndef __TSQR_Test_MgsTest_hpp
00043 #define __TSQR_Test_MgsTest_hpp
00044 
00045 #include <Tsqr_ConfigDefs.hpp>
00046 
00047 #include <Tsqr_Mgs.hpp>
00048 #ifdef HAVE_TSQR_INTEL_TBB
00049 #  include <TbbTsqr_TbbMgs.hpp>
00050 #endif // HAVE_TSQR_INTEL_TBB
00051 #include <Tsqr_TestSetup.hpp>
00052 #include <Tsqr_GlobalVerify.hpp>
00053 #include <Tsqr_printGlobalMatrix.hpp>
00054 #include <Tsqr_verifyTimerConcept.hpp>
00055 
00056 #include <Teuchos_RCP.hpp>
00057 
00058 #include <algorithm>
00059 #include <sstream>
00060 #include <limits>
00061 #include <iostream>
00062 #include <stdexcept>
00063 #include <utility>
00064 
00065 
00066 namespace TSQR {
00067   namespace Test {
00068 
00069     static std::string
00070     mgs_human_readable_name (const std::string& which)
00071     {
00072       if (which == "MpiSeqMGS")
00073   return std::string ("MPI parallel / sequential MGS");
00074       else if (which == "MpiTbbMGS")
00075   {
00076 #ifdef HAVE_TSQR_INTEL_TBB
00077     return std::string ("MPI parallel / TBB parallel MGS");
00078 #else
00079     throw std::logic_error("MGS not built with Intel TBB support");
00080 #endif // HAVE_TSQR_INTEL_TBB
00081   }
00082       else 
00083   throw std::logic_error("Unknown MGS implementation type \"" + which + "\"");
00084     }
00085 
00086     template< class MgsType >
00087     class MgsVerifier {
00088     public:
00089       typedef MgsType mgs_type;
00090       typedef typename MgsType::ordinal_type ordinal_type;
00091       typedef typename MgsType::scalar_type scalar_type;
00092       typedef Matrix< ordinal_type, scalar_type > matrix_type;
00093       typedef MessengerBase< scalar_type > messenger_type;
00094       typedef Teuchos::RCP< messenger_type > messenger_ptr;
00095 
00096       static void
00097       verify (mgs_type& orthogonalizer,
00098         const messenger_ptr& messenger,
00099         matrix_type& Q_local,
00100         matrix_type& R,
00101         const bool b_debug = false)
00102       {
00103   using std::cerr;
00104   using std::endl;
00105 
00106   // Factor the (copy of the) matrix.  On output, the explicit Q
00107   // factor (of A_local) is in Q_local and the R factor is in R.
00108   orthogonalizer.mgs (Q_local.nrows(), Q_local.ncols(), 
00109           Q_local.get(), Q_local.lda(),
00110           R.get(), R.lda());
00111   if (b_debug)
00112     {
00113       messenger->barrier();
00114       if (messenger->rank() == 0)
00115         cerr << "-- Finished MGS::mgs" << endl;
00116     }
00117       }
00118     };
00119 
00120     template< class Ordinal, class Scalar, class Generator >
00121     void
00122     verifyMgs (const std::string& which,
00123          Generator& generator,
00124          const Ordinal nrows_global,
00125          const Ordinal ncols,
00126          const Teuchos::RCP< MessengerBase< Ordinal > >& ordinalComm,
00127          const Teuchos::RCP< MessengerBase< Scalar > >& scalarComm,
00128          const int num_cores,
00129          const bool human_readable,
00130          const bool b_debug)
00131     {
00132       typedef typename Teuchos::ScalarTraits< Scalar >::magnitudeType magnitude_type;
00133       using std::cerr;
00134       using std::cout;
00135       using std::endl;
00136 
00137       const bool b_extra_debug = false;
00138       const int nprocs = scalarComm->size();
00139       const int my_rank = scalarComm->rank();
00140       if (b_debug)
00141   {
00142     scalarComm->barrier();
00143     if (my_rank == 0)
00144       cerr << "mgs_verify:" << endl;
00145     scalarComm->barrier();
00146   }
00147       const Ordinal nrows_local = numLocalRows (nrows_global, my_rank, nprocs);
00148 
00149       // Set up storage for the test problem
00150       Matrix< Ordinal, Scalar > A_local (nrows_local, ncols);
00151       if (std::numeric_limits< Scalar >::has_quiet_NaN)
00152   A_local.fill (std::numeric_limits< Scalar >::quiet_NaN());
00153       Matrix< Ordinal, Scalar > R (ncols, ncols, Scalar(0));
00154 
00155       // Generate the test problem.
00156       distributedTestProblem (generator, A_local, ordinalComm.get(), scalarComm.get());
00157       if (b_debug)
00158   {
00159     scalarComm->barrier();
00160     if (my_rank == 0)
00161       cerr << "-- Generated test problem." << endl;
00162   }
00163 
00164       // Make sure that the test problem (the matrix to factor) was
00165       // distributed correctly.
00166       if (b_extra_debug && b_debug)
00167   {
00168     if (my_rank == 0)
00169       cerr << "Test matrix A:" << endl;
00170     scalarComm->barrier();
00171     printGlobalMatrix (cerr, A_local, scalarComm.get(), ordinalComm.get());
00172     scalarComm->barrier();
00173   }
00174 
00175       // Factoring the matrix stored in A_local overwrites it, so we
00176       // copy A_local into Q_local.  MGS orthogonalization does not
00177       // support contiguously stored cache blocks, unlike TSQR, so we
00178       // don't have to consider whether or not to rearrange cache
00179       // blocks here (unlike with TSQR).
00180       Matrix< Ordinal, Scalar > Q_local (A_local);
00181 
00182       if (b_debug)
00183   {
00184     scalarComm->barrier();
00185     if (my_rank == 0)
00186       cerr << "-- Starting verification" << endl;
00187   }
00188 
00189       if (which == "MpiTbbMGS")
00190   {
00191 #ifdef HAVE_TSQR_INTEL_TBB
00192     typedef TSQR::TBB::TbbMgs< Ordinal, Scalar > mgs_type;
00193     mgs_type mgser (scalarComm);
00194     MgsVerifier< mgs_type >::verify (mgser, scalarComm, Q_local, R, b_debug);
00195 #else
00196     throw std::logic_error("MGS not built with Intel TBB support");
00197 #endif // HAVE_TSQR_INTEL_TBB
00198   }
00199       else if (which == "MpiSeqMGS")
00200   {
00201     typedef MGS< Ordinal, Scalar > mgs_type;
00202     mgs_type mgser (scalarComm);
00203     MgsVerifier< mgs_type >::verify (mgser, scalarComm, Q_local, R, b_debug);
00204   }
00205       else
00206   throw std::logic_error ("Invalid MGS implementation type \"" + which + "\"");
00207 
00208       // Print out the Q and R factors
00209       if (b_extra_debug && b_debug)
00210   {
00211     if (my_rank == 0)
00212       cerr << endl << "Q factor:" << endl;
00213     scalarComm->barrier ();
00214     printGlobalMatrix (cerr, A_local, scalarComm.get(), ordinalComm.get());
00215     scalarComm->barrier ();
00216     if (my_rank == 0)
00217       {
00218         cerr << endl << "R factor:" << endl;
00219         print_local_matrix (cerr, ncols, ncols, R.get(), R.lda());
00220         cerr << endl;
00221       }
00222     scalarComm->barrier ();
00223   }
00224 
00225       // Test accuracy of the resulting factorization
00226       std::vector< magnitude_type > results = 
00227   global_verify (nrows_local, ncols, A_local.get(), A_local.lda(),
00228            Q_local.get(), Q_local.lda(), R.get(), R.lda(), 
00229            scalarComm.get());
00230       if (b_debug)
00231   {
00232     scalarComm->barrier();
00233     if (my_rank == 0)
00234       cerr << "-- Finished global_verify" << endl;
00235     scalarComm->barrier();
00236   }
00237 
00238       // Print the results on Proc 0.
00239       if (my_rank == 0)
00240   {
00241     if (human_readable)
00242       {
00243         cout << mgs_human_readable_name(which) << endl
00244        << "# rows = " << nrows_global << endl
00245        << "# columns = " << ncols << endl
00246        << "# MPI processes = " << nprocs << endl;
00247         if (which == "MpiTbbTSQR")
00248     cout << "# cores per process = " << num_cores << endl;
00249         cout << "Absolute residual $\\|A - Q*R\\|_2: "
00250        << results[0] << endl
00251        << "Absolute orthogonality $\\|I - Q^T*Q\\|_2$: " 
00252        << results[1] << endl
00253        << "Test matrix norm $\\| A \\|_F$: "
00254        << results[2] << endl
00255        << endl;
00256       }
00257     else
00258       {
00259         cout << which
00260        << "," << nrows_global
00261        << "," << ncols
00262        << "," << nprocs;
00263         if (which == "MpiTbbTSQR")
00264     cout << "," << num_cores << endl;
00265         cout << "," << results[0] 
00266        << "," << results[1]
00267        << "," << results[2]
00268        << endl;
00269       }
00270   }
00271     }
00272 
00273 
00274     template< class MgsBase, class TimerType >
00275     static double // returns timing in s
00276     do_mgs_benchmark (MgsBase& orthogonalizer,
00277           Matrix< typename MgsBase::ordinal_type, typename MgsBase::scalar_type >& Q_local,
00278           Matrix< typename MgsBase::ordinal_type, typename MgsBase::scalar_type >& R,
00279           const int num_trials,
00280           const bool human_readable)
00281     {
00282       typedef typename MgsBase::ordinal_type ordinal_type;
00283       using std::cout;
00284 
00285       TSQR::Test::verifyTimerConcept< TimerType >();
00286 
00287       const ordinal_type nrows_local = Q_local.nrows();
00288       const ordinal_type ncols = Q_local.ncols();
00289 
00290       // Benchmark MGS for ntrials trials.  The answer (the numerical
00291       // results of the factorization) is only valid if ntrials == 1,
00292       // but this is a benchmark and not a verification routine.  Call
00293       // mgs_verify() if you want to determine whether MGS computes
00294       // the right answer.
00295       //
00296       // Name of timer doesn't matter here; we only need the timing.
00297       TimerType timer("MGS"); 
00298       timer.start();
00299       for (int trial_num = 0; trial_num < num_trials; ++trial_num)
00300   {
00301     // Orthogonalize the columns of A using MGS.  Don't worry about
00302     // the fact that we're overwriting the input; this is a
00303     // benchmark, not a numerical verification test.  (We have the
00304     // latter implemented as mgs_verify() in this file.)
00305     orthogonalizer.mgs (nrows_local, ncols, Q_local.get(),
00306             Q_local.lda(), R.get(), R.lda());
00307     // Timings in debug mode likely won't make sense, because
00308     // Proc 0 is outputting the debug messages to cerr.
00309     // Nevertheless, we don't put any "if(b_debug)" calls in the
00310     // timing loop.
00311   }
00312       // Compute the resulting total time (in seconds) to execute
00313       // num_trials runs of :mgs().  The time may differ on different
00314       // MPI processes.
00315       const double mgs_timing = timer.stop();
00316       return mgs_timing;
00317     }
00318 
00319     template< class Ordinal, class Scalar, class Generator, class TimerType >
00320     void
00321     benchmarkMgs (const std::string& which,
00322       Generator& generator,
00323       const int ntrials,
00324       const Ordinal nrows_global,
00325       const Ordinal ncols,
00326       const Teuchos::RCP< MessengerBase< Ordinal > >& ordinalComm,
00327       const Teuchos::RCP< MessengerBase< Scalar > >& scalarComm,
00328       const int num_cores,
00329       const bool human_readable,
00330       const bool b_debug)
00331     {
00332       typedef typename Teuchos::ScalarTraits< Scalar >::magnitudeType magnitude_type;
00333       using std::cerr;
00334       using std::cout;
00335       using std::endl;
00336 
00337       TSQR::Test::verifyTimerConcept< TimerType >();
00338 
00339       const bool b_extra_debug = false;
00340       const int nprocs = scalarComm->size();
00341       const int my_rank = scalarComm->rank();
00342       if (b_debug)
00343   {
00344     scalarComm->barrier();
00345     if (my_rank == 0)
00346       cerr << "mgs_benchmark:" << endl;
00347     scalarComm->barrier();
00348   }
00349       const Ordinal nrows_local = numLocalRows (nrows_global, my_rank, nprocs);
00350    
00351       // Set up storage for the test problem.
00352       Matrix<Ordinal, Scalar> A_local (nrows_local, ncols);
00353       if (std::numeric_limits< Scalar >::has_quiet_NaN)
00354   A_local.fill (std::numeric_limits< Scalar >::quiet_NaN());
00355       Matrix<Ordinal, Scalar> R (ncols, ncols, Scalar(0));
00356 
00357       // Generate the test problem.
00358       distributedTestProblem (generator, A_local, ordinalComm.get(), scalarComm.get());
00359       if (b_debug)
00360   {
00361     scalarComm->barrier();
00362     if (my_rank == 0)
00363       cerr << "-- Generated test problem." << endl;
00364   }
00365 
00366       // Make sure that the test problem (the matrix to factor) was
00367       // distributed correctly.
00368       if (b_extra_debug && b_debug)
00369   {
00370     if (my_rank == 0)
00371       cerr << "Test matrix A:" << endl;
00372     scalarComm->barrier ();
00373     printGlobalMatrix (cerr, A_local, scalarComm.get(), ordinalComm.get());
00374     scalarComm->barrier ();
00375   }
00376 
00377       // Factoring the matrix stored in A_local overwrites it, so we
00378       // make a copy of A_local.  MGS orthogonalization does not
00379       // support contiguously stored cache blocks, unlike TSQR, so we
00380       // don't have to consider whether or not to rearrange cache
00381       // blocks here (unlike with TSQR).
00382       Matrix< Ordinal, Scalar > Q_local (A_local);
00383 
00384       if (b_debug)
00385   {
00386     scalarComm->barrier();
00387     if (my_rank == 0)
00388       cerr << "-- Starting timing loop" << endl;
00389   }
00390 
00391       // Set up MGS and run the benchmark.
00392       double mgs_timing; // Total run time in seconds of all ntrials trials
00393       if (which == "MpiTbbMGS")
00394   {
00395 #ifdef HAVE_TSQR_INTEL_TBB
00396     typedef TSQR::TBB::TbbMgs< Ordinal, Scalar > mgs_type;
00397     mgs_type mgser (scalarComm);
00398     mgs_timing = do_mgs_benchmark< mgs_type, TimerType > (mgser, Q_local, R, 
00399                 ntrials, human_readable);
00400 #else
00401     throw std::logic_error("MGS not built with Intel TBB support");
00402 #endif // HAVE_TSQR_INTEL_TBB
00403   }
00404       else if (which == "MpiSeqMGS")
00405   {
00406     typedef MGS< Ordinal, Scalar > mgs_type;
00407     mgs_type mgser (scalarComm);
00408     mgs_timing = do_mgs_benchmark< mgs_type, TimerType > (mgser, Q_local, R, 
00409                 ntrials, human_readable);
00410   }
00411       else
00412   throw std::logic_error ("Invalid MGS implementation type \"" + which + "\"");
00413 
00414       if (b_debug)
00415   {
00416     scalarComm->barrier();
00417     if (my_rank == 0)
00418       cerr << "-- Finished timing loop" << endl;
00419   }
00420 
00421       // Find the min and max MGS timing on all processors.
00422       const double min_mgs_timing = scalarComm->globalMin (mgs_timing);
00423       const double max_mgs_timing = scalarComm->globalMax (mgs_timing);
00424 
00425       if (b_debug)
00426   {
00427     scalarComm->barrier();
00428     if (my_rank == 0)
00429       cerr << "-- Computed min and max timings" << endl;
00430   }
00431 
00432       // Print the results on Proc 0.
00433       if (my_rank == 0)
00434   {
00435     if (human_readable)
00436       {
00437         cout << mgs_human_readable_name(which) << ":" << endl
00438        << "# rows = " << nrows_global << endl
00439        << "# columns = " << ncols << endl
00440        << "# MPI processes = " << nprocs << endl;
00441         if (which == "MpiTbbTSQR")
00442     cout << "# cores per process = " << num_cores << endl;
00443         cout << "# trials = " << ntrials << endl
00444        << "Min total time (s) over all MPI processes = " 
00445        << min_mgs_timing << endl
00446        << "Max total time (s) over all MPI processes = " 
00447        << max_mgs_timing << endl
00448        << endl;
00449       }
00450     else
00451       {
00452         cout << which
00453        << "," << nrows_global
00454        << "," << ncols 
00455        << "," << nprocs;
00456         if (which == "MpiTbbTSQR")
00457     cout << "," << num_cores << endl;
00458         cout << "," << ntrials 
00459        << "," << min_mgs_timing 
00460        << "," << max_mgs_timing 
00461        << endl;
00462       }
00463   }
00464     }
00465     
00466 
00467   } // namespace Test
00468 } // namespace TSQR
00469 
00470 #endif // __TSQR_Test_MgsTest_hpp
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends