Kokkos Node API and Local Linear Algebra Kernels Version of the Day
Tsqr_KokkosNodeTsqrTest.hpp
00001 //@HEADER
00002 // ************************************************************************
00003 // 
00004 //          Kokkos: Node API and Parallel Node Kernels
00005 //              Copyright (2009) Sandia Corporation
00006 // 
00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
00008 // license for use of this work by or on behalf of the U.S. Government.
00009 // 
00010 // This library is free software; you can redistribute it and/or modify
00011 // it under the terms of the GNU Lesser General Public License as
00012 // published by the Free Software Foundation; either version 2.1 of the
00013 // License, or (at your option) any later version.
00014 //  
00015 // This library is distributed in the hope that it will be useful, but
00016 // WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 // Lesser General Public License for more details.
00019 //  
00020 // You should have received a copy of the GNU Lesser General Public
00021 // License along with this library; if not, write to the Free Software
00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00023 // USA
00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
00025 // 
00026 // ************************************************************************
00027 //@HEADER
00028 
00029 #ifndef __TSQR_Test_KokkosNodeTsqrTest_hpp
00030 #define __TSQR_Test_KokkosNodeTsqrTest_hpp
00031 
00032 #include <Tsqr_nodeTestProblem.hpp>
00033 #include <Tsqr_verifyTimerConcept.hpp>
00034 #include <Tsqr_Random_NormalGenerator.hpp>
00035 #include <Tsqr_LocalVerify.hpp>
00036 #include <Tsqr_Matrix.hpp>
00037 #include <Tsqr_ScalarTraits.hpp>
00038 #include <Tsqr_KokkosNodeTsqr.hpp>
00039 
00040 #include <Teuchos_Time.hpp>
00041 #include <Teuchos_TypeNameTraits.hpp>
00042 
00043 #include <algorithm>
00044 #include <iostream>
00045 #include <limits>
00046 #include <stdexcept>
00047 
00048 namespace TSQR {
00049   namespace Test {
00050 
00078     template<class Ordinal, class Scalar, class NodeType>
00079     void
00080     verifyKokkosNodeTsqr (const Teuchos::RCP<const NodeType>& node,
00081         TSQR::Random::NormalGenerator<Ordinal, Scalar>& gen,
00082         const Ordinal numRows,
00083         const Ordinal numCols,
00084         const int numPartitions,
00085         const size_t cacheSizeHint,
00086         const bool contiguousCacheBlocks,
00087         const bool printFieldNames,
00088         const bool humanReadable,
00089         const bool debug)
00090     {
00091       using Teuchos::ParameterList;
00092       using Teuchos::parameterList;
00093       using Teuchos::RCP;
00094       using Teuchos::TypeNameTraits;
00095       using std::cerr;
00096       using std::cout;
00097       using std::endl;
00098       typedef TSQR::KokkosNodeTsqr<Ordinal, Scalar, NodeType> node_tsqr_type;
00099       typedef typename node_tsqr_type::FactorOutput factor_output_type;
00100       typedef Teuchos::ScalarTraits<Scalar> STS;
00101       typedef typename STS::magnitudeType magnitude_type;
00102       typedef Teuchos::Time timer_type;
00103       typedef Matrix<Ordinal, Scalar> matrix_type;
00104 
00105       const std::string scalarTypeName = TypeNameTraits<Scalar>::name();
00106 
00107       // Set up TSQR implementation.
00108       RCP<ParameterList> params = parameterList ("Intranode TSQR");
00109       params->set ("Cache Size Hint", cacheSizeHint);
00110       params->set ("Num Partitions", numPartitions);
00111       node_tsqr_type actor (node, params);
00112       if (debug)
00113   {
00114     cerr << actor.description() << endl;
00115     if (contiguousCacheBlocks)
00116       cerr << "-- Test with contiguous cache blocks" << endl;
00117   }
00118 
00119       // Allocate space for test problem.
00120       matrix_type A (numRows, numCols);
00121       matrix_type A_copy (numRows, numCols);
00122       matrix_type Q (numRows, numCols);
00123       matrix_type R (numCols, numCols);
00124       if (std::numeric_limits<Scalar>::has_quiet_NaN)
00125   {
00126     A.fill (std::numeric_limits<Scalar>::quiet_NaN());
00127     A_copy.fill (std::numeric_limits<Scalar>::quiet_NaN());
00128     Q.fill (std::numeric_limits<Scalar>::quiet_NaN());
00129     R.fill (std::numeric_limits<Scalar>::quiet_NaN());
00130   }
00131       else
00132   {
00133     A.fill (STS::zero());
00134     A_copy.fill (STS::zero());
00135     Q.fill (STS::zero());
00136     R.fill (STS::zero());
00137   }
00138       const Ordinal lda = numRows;
00139       const Ordinal ldq = numRows;
00140       const Ordinal ldr = numCols;
00141 
00142       // Create a test problem
00143       nodeTestProblem (gen, numRows, numCols, A.get(), A.lda(), true);
00144 
00145       if (debug)
00146   {
00147     cerr << "-- Generated test problem" << endl;
00148     // Don't print the matrix if it's too big.
00149     if (A.nrows() <= 30)
00150       {
00151         cerr << "A = " << endl;
00152         print_local_matrix (cerr, A.nrows(), A.ncols(), 
00153           A.get(), A.lda());
00154         cerr << endl << endl;
00155       }
00156   }
00157 
00158       // Copy A into A_copy, since TSQR overwrites the input.  If
00159       // specified, rearrange the data in A_copy so that the data in
00160       // each cache block is contiguously stored.   
00161       if (! contiguousCacheBlocks)
00162   {
00163     A_copy.copy (A);
00164     if (debug)
00165       {
00166         cerr << "-- Copied test problem from A into A_copy" << endl;
00167         // Don't print the matrix if it's too big.
00168         if (A_copy.nrows() <= 30)
00169     {
00170       cerr << "A_copy = " << endl;
00171       print_local_matrix (cerr, A_copy.nrows(), A_copy.ncols(), 
00172               A_copy.get(), A_copy.lda());
00173       cerr << endl << endl;
00174     }
00175       }
00176   }
00177       else
00178   {
00179     actor.cache_block (numRows, numCols, A_copy.get(), A.get(), A.lda());
00180     if (debug)
00181       {
00182         cerr << "-- Reorganized test matrix to have contiguous "
00183     "cache blocks" << endl;
00184         // Don't print the matrix if it's too big.
00185         if (A_copy.nrows() <= 30)
00186     {
00187       cerr << "A_copy = " << endl;
00188       print_local_matrix (cerr, A_copy.nrows(), A_copy.ncols(), 
00189               A_copy.get(), A_copy.lda());
00190       cerr << endl << endl;
00191     }
00192       }
00193 
00194     // Verify cache blocking, when in debug mode.
00195     if (debug)
00196       {
00197         matrix_type A2 (numRows, numCols);
00198         if (std::numeric_limits<Scalar>::has_quiet_NaN)
00199     A2.fill (std::numeric_limits<Scalar>::quiet_NaN());
00200 
00201         actor.un_cache_block (numRows, numCols, A2.get(), A2.lda(), A_copy.get());
00202         if (A == A2)
00203     {
00204       if (debug)
00205         cerr << "-- Cache blocking test succeeded!" << endl;
00206     }
00207         else
00208     {
00209       if (debug)
00210         {
00211           cerr << "*** Cache blocking test failed! A != A2 ***" 
00212          << endl << endl;
00213           // Don't print the matrices if they are too big.
00214           if (A.nrows() <= 30 && A2.nrows() <= 30) 
00215       {
00216         cerr << "A = " << endl;
00217         print_local_matrix (cerr, A.nrows(), A.ncols(), 
00218                 A.get(), A.lda());
00219         cerr << endl << "A2 = " << endl;
00220         print_local_matrix (cerr, A2.nrows(), A2.ncols(), 
00221                 A2.get(), A2.lda());
00222         cerr << endl;
00223       }
00224         }
00225       throw std::logic_error ("Cache blocking failed");
00226     }
00227       }
00228   }
00229 
00230       // Fill R with zeros, since the factorization may not
00231       // necessarily overwrite the strict lower triangle of R.
00232       if (debug)
00233   cerr << "-- Filling R with zeros" << endl;
00234       R.fill (STS::zero());
00235 
00236       if (debug)
00237   cerr << "-- Calling factor()" << endl;
00238 
00239       // Factor the matrix and compute the explicit Q factor
00240       factor_output_type factor_output = 
00241   actor.factor (numRows, numCols, A_copy.get(), A_copy.lda(), 
00242           R.get(), R.lda(), contiguousCacheBlocks);
00243       if (debug)
00244   {
00245     cerr << "-- Finished factor()" << endl;
00246     cerr << "-- Calling explicit_Q()" << endl;
00247   }
00248 
00249       // KokkosNodeTsqr isn't designed to be used by itself, so we
00250       // have to help it along by filling the top ncols x ncols
00251       // entries with the first ncols columns of the identity matrix.
00252       {
00253   typedef MatView<Ordinal, Scalar> view_type;
00254   view_type Q_top = actor.top_block (Q.view(), contiguousCacheBlocks);
00255   view_type Q_top_square (Q_top.ncols(), Q_top.ncols(), 
00256         Q_top.get(), Q_top.lda());
00257   Q_top_square.fill (STS::zero());
00258   for (Ordinal j = 0; j < Q_top_square.ncols(); ++j)
00259     Q_top_square(j,j) = STS::one();
00260       }
00261       actor.explicit_Q (numRows, numCols, A_copy.get(), A_copy.lda(), 
00262       factor_output, numCols, Q.get(), Q.lda(), 
00263       contiguousCacheBlocks);
00264       if (debug)
00265   cerr << "-- Finished explicit_Q()" << endl;
00266 
00267       // "Un"-cache-block the output Q (the explicit Q factor), if
00268       // contiguous cache blocks were used.  This is only necessary
00269       // because local_verify() doesn't currently support contiguous
00270       // cache blocks.
00271       if (contiguousCacheBlocks)
00272   {
00273     // Use A_copy as temporary storage for un-cache-blocking Q.
00274     actor.un_cache_block (numRows, numCols, A_copy.get(), 
00275         A_copy.lda(), Q.get());
00276     Q.copy (A_copy);
00277     if (debug)
00278       cerr << "-- Un-cache-blocked output Q factor" << endl;
00279   }
00280 
00281       // Print out the Q and R factors in debug mode.
00282       if (debug)
00283   {
00284     // Don't print the matrix if it's too big.
00285     if (Q.nrows() <= 30)
00286       {
00287         cerr << endl << "-- Q factor:" << endl;
00288         print_local_matrix (cerr, Q.nrows(), Q.ncols(), 
00289           Q.get(), Q.lda());
00290         cerr << endl << endl;
00291       }
00292     cerr << endl << "-- R factor:" << endl;
00293     print_local_matrix (cerr, numCols, numCols, R.get(), R.lda());
00294     cerr << endl;
00295   }
00296 
00297       // Validate the factorization
00298       std::vector<magnitude_type> results =
00299   local_verify (numRows, numCols, A.get(), lda, 
00300           Q.get(), ldq, R.get(), ldr);
00301       if (debug)
00302   cerr << "-- Finished local_verify" << endl;
00303 
00304       // Print the results
00305       if (humanReadable)
00306   cout << "KokkosNodeTsqr:" << endl
00307        << "Scalar type: " << scalarTypeName << endl
00308        << "# rows: " << numRows << endl
00309        << "# columns: " << numCols << endl
00310        << "# partitions: " << numPartitions << endl
00311        << "cache size hint (revised) in bytes: " << actor.cache_size_hint() << endl
00312        << "contiguous cache blocks? " << contiguousCacheBlocks << endl
00313        << "Absolute residual $\\|A - Q*R\\|_2$: "
00314        << results[0] << endl
00315        << "Absolute orthogonality $\\|I - Q^T*Q\\|_2$: " 
00316        << results[1] << endl
00317        << "Test matrix norm $\\| A \\|_F$: "
00318        << results[2] << endl
00319        << endl;
00320       else
00321   {
00322     if (printFieldNames)
00323       {
00324         const char prefix[] = "%";
00325         cout << prefix
00326        << "method"
00327        << ",scalarType"
00328        << ",numRows"
00329        << ",numCols"
00330        << ",numPartitions"
00331        << ",cacheSizeHint"
00332        << ",contiguousCacheBlocks"
00333        << ",absFrobResid"
00334        << ",absFrobOrthog"
00335        << ",frobA"
00336        << endl;
00337       }
00338     cout << "KokkosNodeTsqr"
00339          << "," << scalarTypeName
00340          << "," << numRows
00341          << "," << numCols
00342          << "," << numPartitions
00343          << "," << actor.cache_size_hint()
00344          << "," << contiguousCacheBlocks 
00345          << "," << results[0]
00346          << "," << results[1]
00347          << "," << results[2]
00348          << endl;
00349   }
00350     }
00351 
00352 
00381     template<class Ordinal, class Scalar, class NodeType>
00382     void
00383     benchmarkKokkosNodeTsqr (const Teuchos::RCP<const NodeType>& node,
00384            const int numTrials,
00385            const Ordinal numRows, 
00386            const Ordinal numCols, 
00387            const int numPartitions,
00388            const size_t cacheSizeHint,
00389            const bool contiguousCacheBlocks,
00390            const bool printFieldNames,
00391            const bool humanReadable)
00392     {
00393       using Teuchos::ParameterList;
00394       using Teuchos::parameterList;
00395       using Teuchos::RCP;
00396       using Teuchos::TypeNameTraits;
00397       using std::cerr;
00398       using std::cout;
00399       using std::endl;
00400       typedef TSQR::KokkosNodeTsqr<Ordinal, Scalar, NodeType> node_tsqr_type;
00401       typedef typename node_tsqr_type::FactorOutput factor_output_type;
00402       typedef Teuchos::ScalarTraits<Scalar> STS;
00403       typedef typename STS::magnitudeType magnitude_type;
00404       typedef Teuchos::Time timer_type;
00405       typedef Matrix<Ordinal, Scalar> matrix_type;
00406 
00407       const std::string scalarTypeName = TypeNameTraits<Scalar>::name();
00408 
00409       // Pseudorandom normal(0,1) generator.  Default seed is OK,
00410       // because this is a benchmark, not an accuracy test.
00411       TSQR::Random::NormalGenerator<Ordinal, Scalar> gen;
00412 
00413       // Set up TSQR implementation.
00414       RCP<ParameterList> params = parameterList ("Intranode TSQR");
00415       params->set ("Cache Size Hint", cacheSizeHint);
00416       params->set ("Num Partitions", numPartitions);
00417       node_tsqr_type actor (node, params);
00418 
00419       // Allocate space for test problem.
00420       matrix_type A (numRows, numCols);
00421       matrix_type A_copy (numRows, numCols);
00422       matrix_type Q (numRows, numCols);
00423       matrix_type R (numCols, numCols);
00424 
00425       // Fill R with zeros, since the factorization may not overwrite
00426       // the strict lower triangle of R.
00427       R.fill (STS::zero());
00428 
00429       // Create a test problem
00430       nodeTestProblem (gen, numRows, numCols, A.get(), A.lda(), false);
00431 
00432       // Copy A into A_copy, since TSQR overwrites the input.  If
00433       // specified, rearrange the data in A_copy so that the data in
00434       // each cache block is contiguously stored.   
00435       if (contiguousCacheBlocks)
00436   actor.cache_block (numRows, numCols, A_copy.get(), A.get(), A.lda());
00437       else
00438   A_copy.copy (A);
00439 
00440       // Do a few timing runs and throw away the results, just to warm
00441       // up any libraries that do autotuning.
00442       const int numWarmupRuns = 5;
00443       for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun)
00444   {
00445     // Factor the matrix in-place in A_copy, and extract the
00446     // resulting R factor into R.
00447     factor_output_type factor_output = 
00448       actor.factor (numRows, numCols, A_copy.get(), A_copy.lda(), 
00449         R.get(), R.lda(), contiguousCacheBlocks);
00450     // Compute the explicit Q factor (which was stored
00451     // implicitly in A_copy and factor_output) and store in Q.
00452     // We don't need to un-cache-block the output, because we
00453     // aren't verifying it here.
00454     actor.explicit_Q (numRows, numCols, A_copy.get(), A_copy.lda(), 
00455           factor_output, numCols, Q.get(), Q.lda(), 
00456           contiguousCacheBlocks);
00457   }
00458 
00459       // Benchmark intranode TSQR for numTrials trials.
00460       //
00461       // Name of timer doesn't matter here; we only need the timing.
00462       timer_type timer("KokkosNodeTsqr");
00463       timer.start();
00464       for (int trialNum = 0; trialNum < numTrials; ++trialNum)
00465   {
00466     // Factor the matrix in-place in A_copy, and extract the
00467     // resulting R factor into R.
00468     factor_output_type factor_output = 
00469       actor.factor (numRows, numCols, A_copy.get(), A_copy.lda(), 
00470         R.get(), R.lda(), contiguousCacheBlocks);
00471     // Compute the explicit Q factor (which was stored
00472     // implicitly in A_copy and factor_output) and store in Q.
00473     // We don't need to un-cache-block the output, because we
00474     // aren't verifying it here.
00475     actor.explicit_Q (numRows, numCols, A_copy.get(), A_copy.lda(), 
00476           factor_output, numCols, Q.get(), Q.lda(), 
00477           contiguousCacheBlocks);
00478   }
00479       const double timing = timer.stop();
00480 
00481       // Print the results
00482       if (humanReadable)
00483   {
00484     cout << "KokkosNodeTsqr cumulative timings:" << endl
00485          << "Scalar type: " << scalarTypeName << endl
00486          << "# rows = " << numRows << endl
00487          << "# columns = " << numCols << endl
00488          << "# partitions: " << numPartitions << endl
00489          << "Cache size hint (in bytes) = " << actor.cache_size_hint() << endl
00490          << "Contiguous cache blocks? " << contiguousCacheBlocks << endl
00491          << "# trials = " << numTrials << endl
00492          << "Total time (s) = " << timing << endl;
00493   }
00494       else
00495   {
00496     if (printFieldNames)
00497       {
00498         const char prefix[] = "%";
00499         cout << prefix 
00500        << "method"
00501        << ",scalarType"
00502        << ",numRows"
00503        << ",numCols"
00504        << ",numPartitions"
00505        << ",cacheSizeHint"
00506        << ",contiguousCacheBlocks"
00507        << ",numTrials"
00508        << ",timing"
00509        << endl;
00510       }
00511 
00512     // We don't include {min,max}_seq_apply_timing() here, because
00513     // those times don't benefit from the accuracy of benchmarking
00514     // for numTrials > 1.  Thus, it's misleading to include them
00515     // with tbb_tsqr_timing, the total time over numTrials trials.
00516     cout << "KokkosNodeTsqr"
00517          << "," << scalarTypeName
00518          << "," << numRows
00519          << "," << numCols
00520          << "," << numPartitions
00521          << "," << actor.cache_size_hint()
00522          << "," << contiguousCacheBlocks 
00523          << "," << numTrials
00524          << "," << timing
00525          << endl;
00526   }
00527     }
00528   } // namespace Test
00529 } // namespace TSQR
00530 
00531 #endif // __TSQR_Test_KokkosNodeTsqrTest_hpp
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends