Anasazi Version of the Day
Tsqr_TbbTest.hpp
00001 // @HEADER
00002 // ***********************************************************************
00003 //
00004 //                 Anasazi: Block Eigensolvers Package
00005 //                 Copyright (2010) Sandia Corporation
00006 //
00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
00008 // license for use of this work by or on behalf of the U.S. Government.
00009 //
00010 // This library is free software; you can redistribute it and/or modify
00011 // it under the terms of the GNU Lesser General Public License as
00012 // published by the Free Software Foundation; either version 2.1 of the
00013 // License, or (at your option) any later version.
00014 //
00015 // This library is distributed in the hope that it will be useful, but
00016 // WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 // Lesser General Public License for more details.
00019 //
00020 // You should have received a copy of the GNU Lesser General Public
00021 // License along with this library; if not, write to the Free Software
00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00023 // USA
00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
00025 //
00026 // ***********************************************************************
00027 // @HEADER
00028 
00029 #ifndef __TSQR_Test_TbbTest_hpp
00030 #define __TSQR_Test_TbbTest_hpp
00031 
00032 #include <Tsqr_nodeTestProblem.hpp>
00033 #include <Tsqr_verifyTimerConcept.hpp>
00034 #include <Tsqr_Random_NormalGenerator.hpp>
00035 
00036 #include <Tsqr_Blas.hpp>
00037 #include <Tsqr_Lapack.hpp>
00038 #include <Tsqr_LocalVerify.hpp>
00039 #include <Tsqr_Matrix.hpp>
00040 #include <Tsqr_Util.hpp>
00041 #include <Tsqr_ScalarTraits.hpp>
00042 #include <TbbTsqr.hpp>
00043 
00044 #include <Teuchos_Time.hpp>
00045 
00046 #include <algorithm>
00047 #include <cstring> // size_t definition
00048 //#include <iomanip>
00049 #include <iostream>
00050 #include <limits>
00051 #include <stdexcept>
00052 #include <vector>
00053 
00054 using std::make_pair;
00055 using std::pair;
00056 using std::vector;
00057 
00058 using std::cerr;
00059 using std::cout;
00060 using std::endl;
00061 
00064 
00065 namespace TSQR {
00066   namespace Test {
00067 
00071     template< class Ordinal, class Scalar >
00072     void
00073     verifyTbbTsqr (TSQR::Random::NormalGenerator< Ordinal, Scalar >& generator,
00074        const Ordinal nrows, 
00075        const Ordinal ncols, 
00076        const int num_cores,
00077        const size_t cache_block_size,
00078        const bool contiguous_cache_blocks,
00079        const bool human_readable,
00080        const bool b_debug = false)
00081     {
00082       typedef Teuchos::Time timer_type;
00083       typedef TSQR::TBB::TbbTsqr< Ordinal, Scalar, timer_type > node_tsqr_type;
00084       typedef typename node_tsqr_type::FactorOutput factor_output_type;
00085       typedef typename ScalarTraits< Scalar >::magnitude_type magnitude_type;
00086       using std::cerr;
00087       using std::cout;
00088       using std::endl;
00089 
00090       node_tsqr_type actor (num_cores, cache_block_size);
00091 
00092       if (b_debug)
00093   {
00094     cerr << "Intel TBB TSQR test problem:" << endl
00095          << "* " << nrows << " x " << ncols << endl
00096          << "* # cores: " << num_cores << endl
00097          << "* Cache block of " << actor.cache_block_size() << " bytes" << endl;
00098     if (contiguous_cache_blocks)
00099       cerr << "* Contiguous cache blocks" << endl;
00100   }
00101 
00102       Matrix< Ordinal, Scalar > A (nrows, ncols);
00103       Matrix< Ordinal, Scalar > A_copy (nrows, ncols);
00104       Matrix< Ordinal, Scalar > Q (nrows, ncols);
00105       Matrix< Ordinal, Scalar > R (ncols, ncols);
00106       if (std::numeric_limits< Scalar >::has_quiet_NaN)
00107   {
00108     A.fill (std::numeric_limits< Scalar>::quiet_NaN());
00109     A_copy.fill (std::numeric_limits< Scalar >::quiet_NaN());
00110     Q.fill (std::numeric_limits< Scalar >::quiet_NaN());
00111     R.fill (std::numeric_limits< Scalar >::quiet_NaN());
00112   }
00113       const Ordinal lda = nrows;
00114       const Ordinal ldq = nrows;
00115       const Ordinal ldr = ncols;
00116 
00117       // Create a test problem
00118       nodeTestProblem (generator, nrows, ncols, A.get(), A.lda(), true);
00119 
00120       if (b_debug)
00121   cerr << "-- Generated test problem" << endl;
00122 
00123       // Copy A into A_copy, since TSQR overwrites the input.  If
00124       // specified, rearrange the data in A_copy so that the data in
00125       // each cache block is contiguously stored.   
00126       if (! contiguous_cache_blocks)
00127   {
00128     A_copy.copy (A);
00129     if (b_debug)
00130       cerr << "-- Copied test problem from A into A_copy" << endl;
00131   }
00132       else
00133   {
00134     actor.cache_block (nrows, ncols, A_copy.get(), A.get(), A.lda());
00135     if (b_debug)
00136       cerr << "-- Reorganized test matrix to have contiguous "
00137         "cache blocks" << endl;
00138 
00139     // Verify cache blocking, when in debug mode.
00140     if (b_debug)
00141       {
00142         Matrix< Ordinal, Scalar > A2 (nrows, ncols);
00143         if (std::numeric_limits< Scalar >::has_quiet_NaN)
00144     A2.fill (std::numeric_limits< Scalar >::quiet_NaN());
00145 
00146         actor.un_cache_block (nrows, ncols, A2.get(), A2.lda(), A_copy.get());
00147         if (A == A2)
00148     {
00149       if (b_debug)
00150         cerr << "-- Cache blocking test succeeded!" << endl;
00151     }
00152         else
00153     throw std::logic_error ("Cache blocking failed");
00154       }
00155   }
00156 
00157       // Fill R with zeros, since the factorization may not overwrite
00158       // the strict lower triangle of R.
00159       R.fill (Scalar(0));
00160 
00161       // Factor the matrix and compute the explicit Q factor
00162       factor_output_type factor_output = 
00163   actor.factor (nrows, ncols, A_copy.get(), A_copy.lda(), R.get(), 
00164           R.lda(), contiguous_cache_blocks);
00165       if (b_debug)
00166   cerr << "-- Finished TbbTsqr::factor" << endl;
00167       actor.explicit_Q (nrows, ncols, A_copy.get(), A_copy.lda(), factor_output,
00168       ncols, Q.get(), Q.lda(), contiguous_cache_blocks);
00169       if (b_debug)
00170   cerr << "-- Finished TbbTsqr::explicit_Q" << endl;
00171 
00172       // "Un"-cache-block the output Q (the explicit Q factor), if
00173       // contiguous cache blocks were used.  This is only necessary
00174       // because local_verify() doesn't currently support contiguous
00175       // cache blocks.
00176       if (contiguous_cache_blocks)
00177   {
00178     // Use A_copy as temporary storage for un-cache-blocking Q.
00179     actor.un_cache_block (nrows, ncols, A_copy.get(), A_copy.lda(), Q.get());
00180     Q.copy (A_copy);
00181     if (b_debug)
00182       cerr << "-- Un-cache-blocked output Q factor" << endl;
00183   }
00184 
00185       // Print out the R factor
00186       if (b_debug)
00187   {
00188     cerr << endl << "-- R factor:" << endl;
00189     print_local_matrix (cerr, ncols, ncols, R.get(), R.lda());
00190     cerr << endl;
00191   }
00192 
00193       // Validate the factorization
00194       std::pair< magnitude_type, magnitude_type > results =
00195   local_verify (nrows, ncols, A.get(), lda, Q.get(), ldq, R.get(), ldr);
00196       if (b_debug)
00197   cerr << "-- Finished local_verify" << endl;
00198 
00199       // Print the results
00200       if (human_readable)
00201   cout << "Parallel (via Intel\'s Threading Building Blocks) / cache-blocked) TSQR:" << endl
00202        << "# rows = " << nrows << endl
00203        << "# columns = " << ncols << endl
00204        << "# cores: " << num_cores << endl
00205        << "cache block # bytes = " << actor.cache_block_size() << endl
00206        << "contiguous cache blocks? " << contiguous_cache_blocks << endl
00207        << "Relative residual $\\|A - Q*R\\|_2 / \\|A\\|_2$ = " 
00208        << results.first << endl
00209        << "Relative orthogonality $\\|I - Q^T*Q\\|_2$ = " 
00210        << results.second << endl
00211        << endl;
00212       else
00213   cout << "TbbTsqr"
00214        << "," << nrows
00215        << "," << ncols
00216        << "," << num_cores
00217        << "," << actor.cache_block_size()
00218        << "," << contiguous_cache_blocks 
00219        << "," << results.first 
00220        << "," << results.second
00221        << endl;
00222     }
00223 
00231     template< class Ordinal, class Scalar >
00232     void
00233     benchmarkTbbTsqr (const int ntrials,
00234           const Ordinal nrows, 
00235           const Ordinal ncols, 
00236           const int num_cores,
00237           const size_t cache_block_size,
00238           const bool contiguous_cache_blocks,
00239           const bool human_readable)
00240     {
00241       using TSQR::TBB::TbbTsqr;
00242       using std::cerr;
00243       using std::cout;
00244       using std::endl;
00245 
00246       typedef typename ScalarTraits< Scalar >::magnitude_type magnitude_type;
00247       typedef Teuchos::Time timer_type;
00248       typedef Ordinal ordinal_type;
00249       typedef Scalar scalar_type;
00250       typedef Matrix< ordinal_type, scalar_type > matrix_type;
00251       typedef TbbTsqr< ordinal_type, scalar_type, timer_type > node_tsqr_type;
00252 
00253       // Pseudorandom normal(0,1) generator.  Default seed is OK,
00254       // because this is a benchmark, not an accuracy test.
00255       TSQR::Random::NormalGenerator< ordinal_type, scalar_type > generator;
00256 
00257       // Set up TSQR implementation.
00258       node_tsqr_type actor (num_cores, cache_block_size);
00259 
00260       matrix_type A (nrows, ncols);
00261       matrix_type A_copy (nrows, ncols);
00262       matrix_type Q (nrows, ncols);
00263       matrix_type R (ncols, ncols, scalar_type(0));
00264 
00265       // Fill R with zeros, since the factorization may not overwrite
00266       // the strict lower triangle of R.
00267       R.fill (scalar_type(0));
00268 
00269       // Create a test problem
00270       nodeTestProblem (generator, nrows, ncols, A.get(), A.lda(), false);
00271 
00272       // Copy A into A_copy, since TSQR overwrites the input.  If
00273       // specified, rearrange the data in A_copy so that the data in
00274       // each cache block is contiguously stored.   
00275       if (contiguous_cache_blocks)
00276   actor.cache_block (nrows, ncols, A_copy.get(), A.get(), A.lda());
00277       else
00278   A_copy.copy (A);
00279 
00280       // Benchmark TBB-based TSQR for ntrials trials.
00281       //
00282       // Name of timer doesn't matter here; we only need the timing.
00283       timer_type timer("TbbTsqr");
00284       timer.start();
00285       for (int trial_num = 0; trial_num < ntrials; ++trial_num)
00286   {
00287     // Factor the matrix in-place in A_copy, and extract the
00288     // resulting R factor into R.
00289     typedef typename node_tsqr_type::FactorOutput factor_output_type;
00290     factor_output_type factor_output = 
00291       actor.factor (nrows, ncols, A_copy.get(), A_copy.lda(), 
00292         R.get(), R.lda(), contiguous_cache_blocks);
00293     // Compute the explicit Q factor (which was stored
00294     // implicitly in A_copy and factor_output) and store in Q.
00295     // We don't need to un-cache-block the output, because we
00296     // aren't verifying it here.
00297     actor.explicit_Q (nrows, ncols, A_copy.get(), A_copy.lda(), 
00298           factor_output, ncols, Q.get(), Q.lda(), 
00299           contiguous_cache_blocks);
00300   }
00301       const double tbb_tsqr_timing = timer.stop();
00302 
00303       // Print the results
00304       if (human_readable)
00305   {
00306     cout << "(Intel TBB / cache-blocked) TSQR cumulative timings:" << endl
00307          << "# rows = " << nrows << endl
00308          << "# columns = " << ncols << endl
00309          << "# cores: " << num_cores << endl
00310          << "cache block # bytes = " << actor.cache_block_size() << endl
00311          << "contiguous cache blocks? " << contiguous_cache_blocks << endl
00312          << "# trials = " << ntrials << endl
00313          << "Total time (s) = " << tbb_tsqr_timing << endl
00314          << "Total time (s) in factor() (min over all tasks): " 
00315          << (ntrials * actor.min_seq_factor_timing()) << endl
00316          << "Total time (s) in factor() (max over all tasks): " 
00317          << (ntrials * actor.max_seq_factor_timing()) << endl
00318          << "Total time (s) in apply() (min over all tasks): " 
00319          << (ntrials * actor.min_seq_apply_timing()) << endl
00320          << "Total time (s) in apply() (max over all tasks): " 
00321          << (ntrials * actor.max_seq_apply_timing()) << endl
00322          << endl << endl;
00323     cout << "(Intel TBB / cache-blocked) TSQR per-invocation timings:" << endl;
00324     
00325     std::vector< TimeStats > stats;
00326     actor.getStats (stats);
00327     std::vector< std::string> labels;
00328     actor.getStatsLabels (labels);
00329 
00330     const std::string labelLabel ("label");
00331     for (std::vector< std::string >::size_type k = 0; k < labels.size(); ++k)
00332       {
00333         const bool printHeaders = (k == 0);
00334         if (stats[k].count() > 0)
00335     stats[k].print (cout, human_readable, labels[k], labelLabel, printHeaders);
00336       }
00337   }
00338       else
00339   {
00340     // We don't include {min,max}_seq_apply_timing() here, because
00341     // those times don't benefit from the accuracy of benchmarking
00342     // for ntrials > 1.  Thus, it's misleading to include them
00343     // with tbb_tsqr_timing, the total time over ntrials trials.
00344     cout << "TbbTsqr"
00345          << "," << nrows
00346          << "," << ncols
00347          << "," << num_cores
00348          << "," << actor.cache_block_size()
00349          << "," << contiguous_cache_blocks 
00350          << "," << ntrials
00351          << "," << tbb_tsqr_timing 
00352          << endl;
00353   }
00354     }
00355   } // namespace Test
00356 } // namespace TSQR
00357 
00358 #endif // __TSQR_Test_TbbTest_hpp
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends