Kokkos Node API and Local Linear Algebra Kernels Version of the Day
Tsqr_TbbTest.hpp
00001 //@HEADER
00002 // ************************************************************************
00003 // 
00004 //          Kokkos: Node API and Parallel Node Kernels
00005 //              Copyright (2009) Sandia Corporation
00006 // 
00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
00008 // license for use of this work by or on behalf of the U.S. Government.
00009 // 
00010 // This library is free software; you can redistribute it and/or modify
00011 // it under the terms of the GNU Lesser General Public License as
00012 // published by the Free Software Foundation; either version 2.1 of the
00013 // License, or (at your option) any later version.
00014 //  
00015 // This library is distributed in the hope that it will be useful, but
00016 // WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 // Lesser General Public License for more details.
00019 //  
00020 // You should have received a copy of the GNU Lesser General Public
00021 // License along with this library; if not, write to the Free Software
00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00023 // USA
00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
00025 // 
00026 // ************************************************************************
00027 //@HEADER
00028 
00029 #ifndef __TSQR_Test_TbbTest_hpp
00030 #define __TSQR_Test_TbbTest_hpp
00031 
00032 #include <Tsqr_nodeTestProblem.hpp>
00033 #include <Tsqr_verifyTimerConcept.hpp>
00034 #include <Tsqr_Random_NormalGenerator.hpp>
00035 
00036 #include <Tsqr_Blas.hpp>
00037 #include <Tsqr_Lapack.hpp>
00038 #include <Tsqr_LocalVerify.hpp>
00039 #include <Tsqr_Matrix.hpp>
00040 #include <Tsqr_Util.hpp>
00041 #include <Tsqr_ScalarTraits.hpp>
00042 #include <TbbTsqr.hpp>
00043 
00044 #include <Teuchos_Time.hpp>
00045 
00046 #include <algorithm>
00047 #include <cstring> // size_t definition
00048 //#include <iomanip>
00049 #include <iostream>
00050 #include <limits>
00051 #include <stdexcept>
00052 #include <vector>
00053 
00054 using std::make_pair;
00055 using std::pair;
00056 using std::vector;
00057 
00058 using std::cerr;
00059 using std::cout;
00060 using std::endl;
00061 
00064 
00065 namespace TSQR {
00066   namespace Test {
00067 
00071     template< class Ordinal, class Scalar >
00072     void
00073     verifyTbbTsqr (const std::string& scalarTypeName,
00074        TSQR::Random::NormalGenerator< Ordinal, Scalar >& generator,
00075        const Ordinal nrows, 
00076        const Ordinal ncols, 
00077        const int num_cores,
00078        const size_t cache_size_hint,
00079        const bool contiguous_cache_blocks,
00080        const bool printFieldNames,
00081        const bool human_readable,
00082        const bool b_debug = false)
00083     {
00084       typedef Teuchos::Time timer_type;
00085       typedef TSQR::TBB::TbbTsqr< Ordinal, Scalar, timer_type > node_tsqr_type;
00086       typedef typename node_tsqr_type::FactorOutput factor_output_type;
00087       typedef typename ScalarTraits< Scalar >::magnitude_type magnitude_type;
00088       using std::cerr;
00089       using std::cout;
00090       using std::endl;
00091 
00092       node_tsqr_type actor (num_cores, cache_size_hint);
00093 
00094       if (b_debug)
00095   {
00096     cerr << "Intel TBB TSQR test problem:" << endl
00097          << "* " << nrows << " x " << ncols << endl
00098          << "* # cores: " << num_cores << endl
00099          << "* Cache size hint in bytes: " << actor.cache_size_hint() << endl;
00100     if (contiguous_cache_blocks)
00101       cerr << "* Contiguous cache blocks" << endl;
00102   }
00103 
00104       Matrix< Ordinal, Scalar > A (nrows, ncols);
00105       Matrix< Ordinal, Scalar > A_copy (nrows, ncols);
00106       Matrix< Ordinal, Scalar > Q (nrows, ncols);
00107       Matrix< Ordinal, Scalar > R (ncols, ncols);
00108       if (std::numeric_limits< Scalar >::has_quiet_NaN)
00109   {
00110     A.fill (std::numeric_limits< Scalar>::quiet_NaN());
00111     A_copy.fill (std::numeric_limits< Scalar >::quiet_NaN());
00112     Q.fill (std::numeric_limits< Scalar >::quiet_NaN());
00113     R.fill (std::numeric_limits< Scalar >::quiet_NaN());
00114   }
00115       const Ordinal lda = nrows;
00116       const Ordinal ldq = nrows;
00117       const Ordinal ldr = ncols;
00118 
00119       // Create a test problem
00120       nodeTestProblem (generator, nrows, ncols, A.get(), A.lda(), true);
00121 
00122       if (b_debug)
00123   cerr << "-- Generated test problem" << endl;
00124 
00125       // Copy A into A_copy, since TSQR overwrites the input.  If
00126       // specified, rearrange the data in A_copy so that the data in
00127       // each cache block is contiguously stored.   
00128       if (! contiguous_cache_blocks)
00129   {
00130     A_copy.copy (A);
00131     if (b_debug)
00132       cerr << "-- Copied test problem from A into A_copy" << endl;
00133   }
00134       else
00135   {
00136     actor.cache_block (nrows, ncols, A_copy.get(), A.get(), A.lda());
00137     if (b_debug)
00138       cerr << "-- Reorganized test matrix to have contiguous "
00139         "cache blocks" << endl;
00140 
00141     // Verify cache blocking, when in debug mode.
00142     if (b_debug)
00143       {
00144         Matrix< Ordinal, Scalar > A2 (nrows, ncols);
00145         if (std::numeric_limits< Scalar >::has_quiet_NaN)
00146     A2.fill (std::numeric_limits< Scalar >::quiet_NaN());
00147 
00148         actor.un_cache_block (nrows, ncols, A2.get(), A2.lda(), A_copy.get());
00149         if (A == A2)
00150     {
00151       if (b_debug)
00152         cerr << "-- Cache blocking test succeeded!" << endl;
00153     }
00154         else
00155     throw std::logic_error ("Cache blocking failed");
00156       }
00157   }
00158 
00159       // Fill R with zeros, since the factorization may not overwrite
00160       // the strict lower triangle of R.
00161       R.fill (Scalar(0));
00162 
00163       // Factor the matrix and compute the explicit Q factor
00164       factor_output_type factor_output = 
00165   actor.factor (nrows, ncols, A_copy.get(), A_copy.lda(), R.get(), 
00166           R.lda(), contiguous_cache_blocks);
00167       if (b_debug)
00168   cerr << "-- Finished TbbTsqr::factor" << endl;
00169       actor.explicit_Q (nrows, ncols, A_copy.get(), A_copy.lda(), factor_output,
00170       ncols, Q.get(), Q.lda(), contiguous_cache_blocks);
00171       if (b_debug)
00172   cerr << "-- Finished TbbTsqr::explicit_Q" << endl;
00173 
00174       // "Un"-cache-block the output Q (the explicit Q factor), if
00175       // contiguous cache blocks were used.  This is only necessary
00176       // because local_verify() doesn't currently support contiguous
00177       // cache blocks.
00178       if (contiguous_cache_blocks)
00179   {
00180     // Use A_copy as temporary storage for un-cache-blocking Q.
00181     actor.un_cache_block (nrows, ncols, A_copy.get(), A_copy.lda(), Q.get());
00182     Q.copy (A_copy);
00183     if (b_debug)
00184       cerr << "-- Un-cache-blocked output Q factor" << endl;
00185   }
00186 
00187       // Print out the R factor
00188       if (b_debug)
00189   {
00190     cerr << endl << "-- R factor:" << endl;
00191     print_local_matrix (cerr, ncols, ncols, R.get(), R.lda());
00192     cerr << endl;
00193   }
00194 
00195       // Validate the factorization
00196       std::vector< magnitude_type > results =
00197   local_verify (nrows, ncols, A.get(), lda, Q.get(), ldq, R.get(), ldr);
00198       if (b_debug)
00199   cerr << "-- Finished local_verify" << endl;
00200 
00201       // Print the results
00202       if (human_readable)
00203   cout << "Parallel (via Intel\'s Threading Building Blocks) / cache-blocked) TSQR:" << endl
00204        << "Scalar type: " << scalarTypeName << endl
00205        << "# rows: " << nrows << endl
00206        << "# columns: " << ncols << endl
00207        << "# cores: " << num_cores << endl
00208        << "Cache size hint in bytes: " << actor.cache_size_hint() << endl
00209        << "Contiguous cache blocks? " << contiguous_cache_blocks << endl
00210        << "Absolute residual $\\|A - Q*R\\|_2$: "
00211        << results[0] << endl
00212        << "Absolute orthogonality $\\|I - Q^T*Q\\|_2$: " 
00213        << results[1] << endl
00214        << "Test matrix norm $\\| A \\|_F$: "
00215        << results[2] << endl
00216        << endl;
00217       else
00218   {
00219     if (printFieldNames)
00220       {
00221         const char prefix[] = "%";
00222         cout << prefix
00223        << "method"
00224        << ",scalarType"
00225        << ",numRows"
00226        << ",numCols"
00227        << ",numThreads"
00228        << ",cacheSizeHint"
00229        << ",contiguousCacheBlocks"
00230        << ",absFrobResid"
00231        << ",absFrobOrthog"
00232        << ",frobA"
00233        << endl;
00234       }
00235     cout << "TbbTsqr"
00236          << "," << scalarTypeName
00237          << "," << nrows
00238          << "," << ncols
00239          << "," << num_cores
00240          << "," << actor.cache_size_hint()
00241          << "," << contiguous_cache_blocks 
00242          << "," << results[0]
00243          << "," << results[1]
00244          << "," << results[2]
00245          << endl;
00246   }
00247     }
00248 
00256     template< class Ordinal, class Scalar >
00257     void
00258     benchmarkTbbTsqr (const std::string& scalarTypeName,
00259           const int ntrials,
00260           const Ordinal nrows, 
00261           const Ordinal ncols, 
00262           const int num_cores,
00263           const size_t cache_size_hint,
00264           const bool contiguous_cache_blocks,
00265           const bool printFieldNames,
00266           const bool human_readable)
00267     {
00268       using TSQR::TBB::TbbTsqr;
00269       using std::cerr;
00270       using std::cout;
00271       using std::endl;
00272 
00273       typedef Teuchos::Time timer_type;
00274       typedef typename ScalarTraits< Scalar >::magnitude_type magnitude_type;
00275       typedef Ordinal ordinal_type;
00276       typedef Scalar scalar_type;
00277       typedef Matrix< ordinal_type, scalar_type > matrix_type;
00278       typedef TbbTsqr< ordinal_type, scalar_type, timer_type > node_tsqr_type;
00279 
00280       // Pseudorandom normal(0,1) generator.  Default seed is OK,
00281       // because this is a benchmark, not an accuracy test.
00282       TSQR::Random::NormalGenerator< ordinal_type, scalar_type > generator;
00283 
00284       // Set up TSQR implementation.
00285       node_tsqr_type actor (num_cores, cache_size_hint);
00286 
00287       matrix_type A (nrows, ncols);
00288       matrix_type A_copy (nrows, ncols);
00289       matrix_type Q (nrows, ncols);
00290       matrix_type R (ncols, ncols, scalar_type(0));
00291 
00292       // Fill R with zeros, since the factorization may not overwrite
00293       // the strict lower triangle of R.
00294       R.fill (scalar_type(0));
00295 
00296       // Create a test problem
00297       nodeTestProblem (generator, nrows, ncols, A.get(), A.lda(), false);
00298 
00299       // Copy A into A_copy, since TSQR overwrites the input.  If
00300       // specified, rearrange the data in A_copy so that the data in
00301       // each cache block is contiguously stored.   
00302       if (contiguous_cache_blocks)
00303   actor.cache_block (nrows, ncols, A_copy.get(), A.get(), A.lda());
00304       else
00305   A_copy.copy (A);
00306 
00307       // Do a few timing runs and throw away the results, just to warm
00308       // up any libraries that do autotuning.
00309       const int numWarmupRuns = 5;
00310       for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun)
00311   {
00312     // Factor the matrix in-place in A_copy, and extract the
00313     // resulting R factor into R.
00314     typedef typename node_tsqr_type::FactorOutput factor_output_type;
00315     factor_output_type factor_output = 
00316       actor.factor (nrows, ncols, A_copy.get(), A_copy.lda(), 
00317         R.get(), R.lda(), contiguous_cache_blocks);
00318     // Compute the explicit Q factor (which was stored
00319     // implicitly in A_copy and factor_output) and store in Q.
00320     // We don't need to un-cache-block the output, because we
00321     // aren't verifying it here.
00322     actor.explicit_Q (nrows, ncols, A_copy.get(), A_copy.lda(), 
00323           factor_output, ncols, Q.get(), Q.lda(), 
00324           contiguous_cache_blocks);
00325   }
00326 
00327       // Benchmark TBB-based TSQR for ntrials trials.
00328       //
00329       // Name of timer doesn't matter here; we only need the timing.
00330       timer_type timer("TbbTsqr");
00331       timer.start();
00332       for (int trial_num = 0; trial_num < ntrials; ++trial_num)
00333   {
00334     // Factor the matrix in-place in A_copy, and extract the
00335     // resulting R factor into R.
00336     typedef typename node_tsqr_type::FactorOutput factor_output_type;
00337     factor_output_type factor_output = 
00338       actor.factor (nrows, ncols, A_copy.get(), A_copy.lda(), 
00339         R.get(), R.lda(), contiguous_cache_blocks);
00340     // Compute the explicit Q factor (which was stored
00341     // implicitly in A_copy and factor_output) and store in Q.
00342     // We don't need to un-cache-block the output, because we
00343     // aren't verifying it here.
00344     actor.explicit_Q (nrows, ncols, A_copy.get(), A_copy.lda(), 
00345           factor_output, ncols, Q.get(), Q.lda(), 
00346           contiguous_cache_blocks);
00347   }
00348       const double tbb_tsqr_timing = timer.stop();
00349 
00350       // Print the results
00351       if (human_readable)
00352   {
00353     cout << "(Intel TBB / cache-blocked) TSQR cumulative timings:" << endl
00354          << "Scalar type: " << scalarTypeName << endl
00355          << "# rows: " << nrows << endl
00356          << "# columns: " << ncols << endl
00357          << "# cores: " << num_cores << endl
00358          << "Cache size hint in bytes: " << actor.cache_size_hint() << endl
00359          << "Contiguous cache blocks? " << contiguous_cache_blocks << endl
00360          << "# trials: " << ntrials << endl
00361          << "Total time (s) = " << tbb_tsqr_timing << endl
00362          << "Total time (s) in factor() (min over all tasks): " 
00363          << (ntrials * actor.min_seq_factor_timing()) << endl
00364          << "Total time (s) in factor() (max over all tasks): " 
00365          << (ntrials * actor.max_seq_factor_timing()) << endl
00366          << "Total time (s) in apply() (min over all tasks): " 
00367          << (ntrials * actor.min_seq_apply_timing()) << endl
00368          << "Total time (s) in apply() (max over all tasks): " 
00369          << (ntrials * actor.max_seq_apply_timing()) << endl
00370          << endl << endl;
00371     cout << "(Intel TBB / cache-blocked) TSQR per-invocation timings:" << endl;
00372     
00373     std::vector<TimeStats> stats;
00374     actor.getStats (stats);
00375     std::vector<std::string> labels;
00376     actor.getStatsLabels (labels);
00377 
00378     const std::string labelLabel ("label");
00379     for (std::vector<std::string>::size_type k = 0; k < labels.size(); ++k)
00380       {
00381         const bool printHeaders = (k == 0);
00382         if (stats[k].count() > 0)
00383     stats[k].print (cout, human_readable, labels[k], labelLabel, printHeaders);
00384       }
00385   }
00386       else
00387   {
00388     if (printFieldNames)
00389       {
00390         const char prefix[] = "%";
00391         cout << prefix 
00392        << "method"
00393        << ",scalarType"
00394        << ",numRows"
00395        << ",numCols"
00396        << ",numThreads"
00397        << ",cacheSizeHint"
00398        << ",contiguousCacheBlocks"
00399        << ",numTrials"
00400        << ",timing"
00401        << endl;
00402       }
00403 
00404     // We don't include {min,max}_seq_apply_timing() here, because
00405     // those times don't benefit from the accuracy of benchmarking
00406     // for ntrials > 1.  Thus, it's misleading to include them
00407     // with tbb_tsqr_timing, the total time over ntrials trials.
00408     cout << "TbbTsqr"
00409          << "," << scalarTypeName
00410          << "," << nrows
00411          << "," << ncols
00412          << "," << num_cores
00413          << "," << actor.cache_size_hint()
00414          << "," << contiguous_cache_blocks 
00415          << "," << ntrials
00416          << "," << tbb_tsqr_timing 
00417          << endl;
00418   }
00419     }
00420   } // namespace Test
00421 } // namespace TSQR
00422 
00423 #endif // __TSQR_Test_TbbTest_hpp
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends