Kokkos Node API and Local Linear Algebra Kernels Version of the Day
Tsqr_TbbTest.hpp
00001 //@HEADER
00002 // ************************************************************************
00003 // 
00004 //          Kokkos: Node API and Parallel Node Kernels
00005 //              Copyright (2008) Sandia Corporation
00006 // 
00007 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
00008 // the U.S. Government retains certain rights in this software.
00009 // 
00010 // Redistribution and use in source and binary forms, with or without
00011 // modification, are permitted provided that the following conditions are
00012 // met:
00013 //
00014 // 1. Redistributions of source code must retain the above copyright
00015 // notice, this list of conditions and the following disclaimer.
00016 //
00017 // 2. Redistributions in binary form must reproduce the above copyright
00018 // notice, this list of conditions and the following disclaimer in the
00019 // documentation and/or other materials provided with the distribution.
00020 //
00021 // 3. Neither the name of the Corporation nor the names of the
00022 // contributors may be used to endorse or promote products derived from
00023 // this software without specific prior written permission.
00024 //
00025 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
00026 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00027 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00028 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
00029 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00030 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00031 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00032 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00033 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00034 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00035 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00036 //
00037 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
00038 // 
00039 // ************************************************************************
00040 //@HEADER
00041 
00042 #ifndef __TSQR_Test_TbbTest_hpp
00043 #define __TSQR_Test_TbbTest_hpp
00044 
00045 #include <Tsqr_nodeTestProblem.hpp>
00046 #include <Tsqr_verifyTimerConcept.hpp>
00047 #include <Tsqr_Random_NormalGenerator.hpp>
00048 
00049 #include <Tsqr_Blas.hpp>
00050 #include <Tsqr_Lapack.hpp>
00051 #include <Tsqr_LocalVerify.hpp>
00052 #include <Tsqr_Matrix.hpp>
00053 #include <Tsqr_Util.hpp>
00054 #include <Tsqr_ScalarTraits.hpp>
00055 #include <TbbTsqr.hpp>
00056 
00057 #include <Teuchos_Time.hpp>
00058 
00059 #include <algorithm>
00060 #include <cstring> // size_t definition
00061 //#include <iomanip>
00062 #include <iostream>
00063 #include <limits>
00064 #include <stdexcept>
00065 #include <vector>
00066 
00067 using std::make_pair;
00068 using std::pair;
00069 using std::vector;
00070 
00071 using std::cerr;
00072 using std::cout;
00073 using std::endl;
00074 
00077 
00078 namespace TSQR {
00079   namespace Test {
00080 
00084     template< class Ordinal, class Scalar >
00085     void
00086     verifyTbbTsqr (const std::string& scalarTypeName,
00087        TSQR::Random::NormalGenerator< Ordinal, Scalar >& generator,
00088        const Ordinal nrows, 
00089        const Ordinal ncols, 
00090        const int num_cores,
00091        const size_t cache_size_hint,
00092        const bool contiguous_cache_blocks,
00093        const bool printFieldNames,
00094        const bool human_readable,
00095        const bool b_debug = false)
00096     {
00097       typedef Teuchos::Time timer_type;
00098       typedef TSQR::TBB::TbbTsqr< Ordinal, Scalar, timer_type > node_tsqr_type;
00099       typedef typename node_tsqr_type::FactorOutput factor_output_type;
00100       typedef typename ScalarTraits< Scalar >::magnitude_type magnitude_type;
00101       using std::cerr;
00102       using std::cout;
00103       using std::endl;
00104 
00105       node_tsqr_type actor (num_cores, cache_size_hint);
00106 
00107       if (b_debug)
00108   {
00109     cerr << "Intel TBB TSQR test problem:" << endl
00110          << "* " << nrows << " x " << ncols << endl
00111          << "* # cores: " << num_cores << endl
00112          << "* Cache size hint in bytes: " << actor.cache_size_hint() << endl;
00113     if (contiguous_cache_blocks)
00114       cerr << "* Contiguous cache blocks" << endl;
00115   }
00116 
00117       Matrix< Ordinal, Scalar > A (nrows, ncols);
00118       Matrix< Ordinal, Scalar > A_copy (nrows, ncols);
00119       Matrix< Ordinal, Scalar > Q (nrows, ncols);
00120       Matrix< Ordinal, Scalar > R (ncols, ncols);
00121       if (std::numeric_limits< Scalar >::has_quiet_NaN)
00122   {
00123     A.fill (std::numeric_limits< Scalar>::quiet_NaN());
00124     A_copy.fill (std::numeric_limits< Scalar >::quiet_NaN());
00125     Q.fill (std::numeric_limits< Scalar >::quiet_NaN());
00126     R.fill (std::numeric_limits< Scalar >::quiet_NaN());
00127   }
00128       const Ordinal lda = nrows;
00129       const Ordinal ldq = nrows;
00130       const Ordinal ldr = ncols;
00131 
00132       // Create a test problem
00133       nodeTestProblem (generator, nrows, ncols, A.get(), A.lda(), true);
00134 
00135       if (b_debug)
00136   cerr << "-- Generated test problem" << endl;
00137 
00138       // Copy A into A_copy, since TSQR overwrites the input.  If
00139       // specified, rearrange the data in A_copy so that the data in
00140       // each cache block is contiguously stored.   
00141       if (! contiguous_cache_blocks)
00142   {
00143     A_copy.copy (A);
00144     if (b_debug)
00145       cerr << "-- Copied test problem from A into A_copy" << endl;
00146   }
00147       else
00148   {
00149     actor.cache_block (nrows, ncols, A_copy.get(), A.get(), A.lda());
00150     if (b_debug)
00151       cerr << "-- Reorganized test matrix to have contiguous "
00152         "cache blocks" << endl;
00153 
00154     // Verify cache blocking, when in debug mode.
00155     if (b_debug)
00156       {
00157         Matrix< Ordinal, Scalar > A2 (nrows, ncols);
00158         if (std::numeric_limits< Scalar >::has_quiet_NaN)
00159     A2.fill (std::numeric_limits< Scalar >::quiet_NaN());
00160 
00161         actor.un_cache_block (nrows, ncols, A2.get(), A2.lda(), A_copy.get());
00162         if (A == A2)
00163     {
00164       if (b_debug)
00165         cerr << "-- Cache blocking test succeeded!" << endl;
00166     }
00167         else
00168     throw std::logic_error ("Cache blocking failed");
00169       }
00170   }
00171 
00172       // Fill R with zeros, since the factorization may not overwrite
00173       // the strict lower triangle of R.
00174       R.fill (Scalar(0));
00175 
00176       // Factor the matrix and compute the explicit Q factor
00177       factor_output_type factor_output = 
00178   actor.factor (nrows, ncols, A_copy.get(), A_copy.lda(), R.get(), 
00179           R.lda(), contiguous_cache_blocks);
00180       if (b_debug)
00181   cerr << "-- Finished TbbTsqr::factor" << endl;
00182       actor.explicit_Q (nrows, ncols, A_copy.get(), A_copy.lda(), factor_output,
00183       ncols, Q.get(), Q.lda(), contiguous_cache_blocks);
00184       if (b_debug)
00185   cerr << "-- Finished TbbTsqr::explicit_Q" << endl;
00186 
00187       // "Un"-cache-block the output Q (the explicit Q factor), if
00188       // contiguous cache blocks were used.  This is only necessary
00189       // because local_verify() doesn't currently support contiguous
00190       // cache blocks.
00191       if (contiguous_cache_blocks)
00192   {
00193     // Use A_copy as temporary storage for un-cache-blocking Q.
00194     actor.un_cache_block (nrows, ncols, A_copy.get(), A_copy.lda(), Q.get());
00195     Q.copy (A_copy);
00196     if (b_debug)
00197       cerr << "-- Un-cache-blocked output Q factor" << endl;
00198   }
00199 
00200       // Print out the R factor
00201       if (b_debug)
00202   {
00203     cerr << endl << "-- R factor:" << endl;
00204     print_local_matrix (cerr, ncols, ncols, R.get(), R.lda());
00205     cerr << endl;
00206   }
00207 
00208       // Validate the factorization
00209       std::vector< magnitude_type > results =
00210   local_verify (nrows, ncols, A.get(), lda, Q.get(), ldq, R.get(), ldr);
00211       if (b_debug)
00212   cerr << "-- Finished local_verify" << endl;
00213 
00214       // Print the results
00215       if (human_readable)
00216   cout << "Parallel (via Intel\'s Threading Building Blocks) / cache-blocked) TSQR:" << endl
00217        << "Scalar type: " << scalarTypeName << endl
00218        << "# rows: " << nrows << endl
00219        << "# columns: " << ncols << endl
00220        << "# cores: " << num_cores << endl
00221        << "Cache size hint in bytes: " << actor.cache_size_hint() << endl
00222        << "Contiguous cache blocks? " << contiguous_cache_blocks << endl
00223        << "Absolute residual $\\|A - Q*R\\|_2$: "
00224        << results[0] << endl
00225        << "Absolute orthogonality $\\|I - Q^T*Q\\|_2$: " 
00226        << results[1] << endl
00227        << "Test matrix norm $\\| A \\|_F$: "
00228        << results[2] << endl
00229        << endl;
00230       else
00231   {
00232     if (printFieldNames)
00233       {
00234         const char prefix[] = "%";
00235         cout << prefix
00236        << "method"
00237        << ",scalarType"
00238        << ",numRows"
00239        << ",numCols"
00240        << ",numThreads"
00241        << ",cacheSizeHint"
00242        << ",contiguousCacheBlocks"
00243        << ",absFrobResid"
00244        << ",absFrobOrthog"
00245        << ",frobA"
00246        << endl;
00247       }
00248     cout << "TbbTsqr"
00249          << "," << scalarTypeName
00250          << "," << nrows
00251          << "," << ncols
00252          << "," << num_cores
00253          << "," << actor.cache_size_hint()
00254          << "," << contiguous_cache_blocks 
00255          << "," << results[0]
00256          << "," << results[1]
00257          << "," << results[2]
00258          << endl;
00259   }
00260     }
00261 
00269     template< class Ordinal, class Scalar >
00270     void
00271     benchmarkTbbTsqr (const std::string& scalarTypeName,
00272           const int ntrials,
00273           const Ordinal nrows, 
00274           const Ordinal ncols, 
00275           const int num_cores,
00276           const size_t cache_size_hint,
00277           const bool contiguous_cache_blocks,
00278           const bool printFieldNames,
00279           const bool human_readable)
00280     {
00281       using TSQR::TBB::TbbTsqr;
00282       using std::cerr;
00283       using std::cout;
00284       using std::endl;
00285 
00286       typedef Teuchos::Time timer_type;
00287       typedef typename ScalarTraits< Scalar >::magnitude_type magnitude_type;
00288       typedef Ordinal ordinal_type;
00289       typedef Scalar scalar_type;
00290       typedef Matrix< ordinal_type, scalar_type > matrix_type;
00291       typedef TbbTsqr< ordinal_type, scalar_type, timer_type > node_tsqr_type;
00292 
00293       // Pseudorandom normal(0,1) generator.  Default seed is OK,
00294       // because this is a benchmark, not an accuracy test.
00295       TSQR::Random::NormalGenerator< ordinal_type, scalar_type > generator;
00296 
00297       // Set up TSQR implementation.
00298       node_tsqr_type actor (num_cores, cache_size_hint);
00299 
00300       matrix_type A (nrows, ncols);
00301       matrix_type A_copy (nrows, ncols);
00302       matrix_type Q (nrows, ncols);
00303       matrix_type R (ncols, ncols, scalar_type(0));
00304 
00305       // Fill R with zeros, since the factorization may not overwrite
00306       // the strict lower triangle of R.
00307       R.fill (scalar_type(0));
00308 
00309       // Create a test problem
00310       nodeTestProblem (generator, nrows, ncols, A.get(), A.lda(), false);
00311 
00312       // Copy A into A_copy, since TSQR overwrites the input.  If
00313       // specified, rearrange the data in A_copy so that the data in
00314       // each cache block is contiguously stored.   
00315       if (contiguous_cache_blocks)
00316   actor.cache_block (nrows, ncols, A_copy.get(), A.get(), A.lda());
00317       else
00318   A_copy.copy (A);
00319 
00320       // Do a few timing runs and throw away the results, just to warm
00321       // up any libraries that do autotuning.
00322       const int numWarmupRuns = 5;
00323       for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun)
00324   {
00325     // Factor the matrix in-place in A_copy, and extract the
00326     // resulting R factor into R.
00327     typedef typename node_tsqr_type::FactorOutput factor_output_type;
00328     factor_output_type factor_output = 
00329       actor.factor (nrows, ncols, A_copy.get(), A_copy.lda(), 
00330         R.get(), R.lda(), contiguous_cache_blocks);
00331     // Compute the explicit Q factor (which was stored
00332     // implicitly in A_copy and factor_output) and store in Q.
00333     // We don't need to un-cache-block the output, because we
00334     // aren't verifying it here.
00335     actor.explicit_Q (nrows, ncols, A_copy.get(), A_copy.lda(), 
00336           factor_output, ncols, Q.get(), Q.lda(), 
00337           contiguous_cache_blocks);
00338   }
00339 
00340       // Benchmark TBB-based TSQR for ntrials trials.
00341       //
00342       // Name of timer doesn't matter here; we only need the timing.
00343       timer_type timer("TbbTsqr");
00344       timer.start();
00345       for (int trial_num = 0; trial_num < ntrials; ++trial_num)
00346   {
00347     // Factor the matrix in-place in A_copy, and extract the
00348     // resulting R factor into R.
00349     typedef typename node_tsqr_type::FactorOutput factor_output_type;
00350     factor_output_type factor_output = 
00351       actor.factor (nrows, ncols, A_copy.get(), A_copy.lda(), 
00352         R.get(), R.lda(), contiguous_cache_blocks);
00353     // Compute the explicit Q factor (which was stored
00354     // implicitly in A_copy and factor_output) and store in Q.
00355     // We don't need to un-cache-block the output, because we
00356     // aren't verifying it here.
00357     actor.explicit_Q (nrows, ncols, A_copy.get(), A_copy.lda(), 
00358           factor_output, ncols, Q.get(), Q.lda(), 
00359           contiguous_cache_blocks);
00360   }
00361       const double tbb_tsqr_timing = timer.stop();
00362 
00363       // Print the results
00364       if (human_readable)
00365   {
00366     cout << "(Intel TBB / cache-blocked) TSQR cumulative timings:" << endl
00367          << "Scalar type: " << scalarTypeName << endl
00368          << "# rows: " << nrows << endl
00369          << "# columns: " << ncols << endl
00370          << "# cores: " << num_cores << endl
00371          << "Cache size hint in bytes: " << actor.cache_size_hint() << endl
00372          << "Contiguous cache blocks? " << contiguous_cache_blocks << endl
00373          << "# trials: " << ntrials << endl
00374          << "Total time (s) = " << tbb_tsqr_timing << endl
00375          << "Total time (s) in factor() (min over all tasks): " 
00376          << (ntrials * actor.min_seq_factor_timing()) << endl
00377          << "Total time (s) in factor() (max over all tasks): " 
00378          << (ntrials * actor.max_seq_factor_timing()) << endl
00379          << "Total time (s) in apply() (min over all tasks): " 
00380          << (ntrials * actor.min_seq_apply_timing()) << endl
00381          << "Total time (s) in apply() (max over all tasks): " 
00382          << (ntrials * actor.max_seq_apply_timing()) << endl
00383          << endl << endl;
00384     cout << "(Intel TBB / cache-blocked) TSQR per-invocation timings:" << endl;
00385     
00386     std::vector<TimeStats> stats;
00387     actor.getStats (stats);
00388     std::vector<std::string> labels;
00389     actor.getStatsLabels (labels);
00390 
00391     const std::string labelLabel ("label");
00392     for (std::vector<std::string>::size_type k = 0; k < labels.size(); ++k)
00393       {
00394         const bool printHeaders = (k == 0);
00395         if (stats[k].count() > 0)
00396     stats[k].print (cout, human_readable, labels[k], labelLabel, printHeaders);
00397       }
00398   }
00399       else
00400   {
00401     if (printFieldNames)
00402       {
00403         const char prefix[] = "%";
00404         cout << prefix 
00405        << "method"
00406        << ",scalarType"
00407        << ",numRows"
00408        << ",numCols"
00409        << ",numThreads"
00410        << ",cacheSizeHint"
00411        << ",contiguousCacheBlocks"
00412        << ",numTrials"
00413        << ",timing"
00414        << endl;
00415       }
00416 
00417     // We don't include {min,max}_seq_apply_timing() here, because
00418     // those times don't benefit from the accuracy of benchmarking
00419     // for ntrials > 1.  Thus, it's misleading to include them
00420     // with tbb_tsqr_timing, the total time over ntrials trials.
00421     cout << "TbbTsqr"
00422          << "," << scalarTypeName
00423          << "," << nrows
00424          << "," << ncols
00425          << "," << num_cores
00426          << "," << actor.cache_size_hint()
00427          << "," << contiguous_cache_blocks 
00428          << "," << ntrials
00429          << "," << tbb_tsqr_timing 
00430          << endl;
00431   }
00432     }
00433   } // namespace Test
00434 } // namespace TSQR
00435 
00436 #endif // __TSQR_Test_TbbTest_hpp
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends