Kokkos Node API and Local Linear Algebra Kernels Version of the Day
Tsqr_KokkosNodeTsqrTest.hpp
00001 //@HEADER
00002 // ************************************************************************
00003 // 
00004 //          Kokkos: Node API and Parallel Node Kernels
00005 //              Copyright (2008) Sandia Corporation
00006 // 
00007 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
00008 // the U.S. Government retains certain rights in this software.
00009 // 
00010 // Redistribution and use in source and binary forms, with or without
00011 // modification, are permitted provided that the following conditions are
00012 // met:
00013 //
00014 // 1. Redistributions of source code must retain the above copyright
00015 // notice, this list of conditions and the following disclaimer.
00016 //
00017 // 2. Redistributions in binary form must reproduce the above copyright
00018 // notice, this list of conditions and the following disclaimer in the
00019 // documentation and/or other materials provided with the distribution.
00020 //
00021 // 3. Neither the name of the Corporation nor the names of the
00022 // contributors may be used to endorse or promote products derived from
00023 // this software without specific prior written permission.
00024 //
00025 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
00026 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00027 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00028 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
00029 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00030 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00031 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00032 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00033 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00034 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00035 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00036 //
00037 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
00038 // 
00039 // ************************************************************************
00040 //@HEADER
00041 
00042 #ifndef __TSQR_Test_KokkosNodeTsqrTest_hpp
00043 #define __TSQR_Test_KokkosNodeTsqrTest_hpp
00044 
00045 #include <Tsqr_nodeTestProblem.hpp>
00046 #include <Tsqr_verifyTimerConcept.hpp>
00047 #include <Tsqr_Random_NormalGenerator.hpp>
00048 #include <Tsqr_LocalVerify.hpp>
00049 #include <Tsqr_Matrix.hpp>
00050 #include <Tsqr_ScalarTraits.hpp>
00051 #include <Tsqr_KokkosNodeTsqr.hpp>
00052 
00053 #include <Teuchos_Time.hpp>
00054 #include <Teuchos_TypeNameTraits.hpp>
00055 
00056 #include <algorithm>
00057 #include <iostream>
00058 #include <limits>
00059 #include <stdexcept>
00060 
00061 namespace TSQR {
00062   namespace Test {
00063 
00091     template<class Ordinal, class Scalar, class NodeType>
00092     void
00093     verifyKokkosNodeTsqr (const Teuchos::RCP<NodeType>& node,
00094         TSQR::Random::NormalGenerator<Ordinal, Scalar>& gen,
00095         const Ordinal numRows,
00096         const Ordinal numCols,
00097         const int numPartitions,
00098         const size_t cacheSizeHint,
00099         const bool contiguousCacheBlocks,
00100         const bool printFieldNames,
00101         const bool humanReadable,
00102         const bool debug)
00103     {
00104       using Teuchos::ParameterList;
00105       using Teuchos::parameterList;
00106       using Teuchos::RCP;
00107       using Teuchos::TypeNameTraits;
00108       using std::cerr;
00109       using std::cout;
00110       using std::endl;
00111       typedef TSQR::KokkosNodeTsqr<Ordinal, Scalar, NodeType> node_tsqr_type;
00112       typedef typename node_tsqr_type::FactorOutput factor_output_type;
00113       typedef Teuchos::ScalarTraits<Scalar> STS;
00114       typedef typename STS::magnitudeType magnitude_type;
00115       typedef Teuchos::Time timer_type;
00116       typedef Matrix<Ordinal, Scalar> matrix_type;
00117 
00118       const std::string scalarTypeName = TypeNameTraits<Scalar>::name();
00119 
00120       // Set up TSQR implementation.
00121       RCP<ParameterList> params = parameterList ("Intranode TSQR");
00122       params->set ("Cache Size Hint", cacheSizeHint);
00123       params->set ("Num Tasks", numPartitions);
00124       node_tsqr_type actor (params);
00125       actor.setNode (node);
00126       if (debug)
00127   {
00128     cerr << actor.description() << endl;
00129     if (contiguousCacheBlocks)
00130       cerr << "-- Test with contiguous cache blocks" << endl;
00131   }
00132 
00133       // Allocate space for test problem.
00134       matrix_type A (numRows, numCols);
00135       matrix_type A_copy (numRows, numCols);
00136       matrix_type Q (numRows, numCols);
00137       matrix_type R (numCols, numCols);
00138       if (std::numeric_limits<Scalar>::has_quiet_NaN)
00139   {
00140     A.fill (std::numeric_limits<Scalar>::quiet_NaN());
00141     A_copy.fill (std::numeric_limits<Scalar>::quiet_NaN());
00142     Q.fill (std::numeric_limits<Scalar>::quiet_NaN());
00143     R.fill (std::numeric_limits<Scalar>::quiet_NaN());
00144   }
00145       else
00146   {
00147     A.fill (STS::zero());
00148     A_copy.fill (STS::zero());
00149     Q.fill (STS::zero());
00150     R.fill (STS::zero());
00151   }
00152       const Ordinal lda = numRows;
00153       const Ordinal ldq = numRows;
00154       const Ordinal ldr = numCols;
00155 
00156       // Create a test problem
00157       nodeTestProblem (gen, numRows, numCols, A.get(), A.lda(), true);
00158 
00159       if (debug)
00160   {
00161     cerr << "-- Generated test problem" << endl;
00162     // Don't print the matrix if it's too big.
00163     if (A.nrows() <= 30)
00164       {
00165         cerr << "A = " << endl;
00166         print_local_matrix (cerr, A.nrows(), A.ncols(), 
00167           A.get(), A.lda());
00168         cerr << endl << endl;
00169       }
00170   }
00171 
00172       // Copy A into A_copy, since TSQR overwrites the input.  If
00173       // specified, rearrange the data in A_copy so that the data in
00174       // each cache block is contiguously stored.   
00175       if (! contiguousCacheBlocks)
00176   {
00177     A_copy.copy (A);
00178     if (debug)
00179       {
00180         cerr << "-- Copied test problem from A into A_copy" << endl;
00181         // Don't print the matrix if it's too big.
00182         if (A_copy.nrows() <= 30)
00183     {
00184       cerr << "A_copy = " << endl;
00185       print_local_matrix (cerr, A_copy.nrows(), A_copy.ncols(), 
00186               A_copy.get(), A_copy.lda());
00187       cerr << endl << endl;
00188     }
00189       }
00190   }
00191       else
00192   {
00193     actor.cache_block (numRows, numCols, A_copy.get(), A.get(), A.lda());
00194     if (debug)
00195       {
00196         cerr << "-- Reorganized test matrix to have contiguous "
00197     "cache blocks" << endl;
00198         // Don't print the matrix if it's too big.
00199         if (A_copy.nrows() <= 30)
00200     {
00201       cerr << "A_copy = " << endl;
00202       print_local_matrix (cerr, A_copy.nrows(), A_copy.ncols(), 
00203               A_copy.get(), A_copy.lda());
00204       cerr << endl << endl;
00205     }
00206       }
00207 
00208     // Verify cache blocking, when in debug mode.
00209     if (debug)
00210       {
00211         matrix_type A2 (numRows, numCols);
00212         if (std::numeric_limits<Scalar>::has_quiet_NaN)
00213     A2.fill (std::numeric_limits<Scalar>::quiet_NaN());
00214 
00215         actor.un_cache_block (numRows, numCols, A2.get(), A2.lda(), A_copy.get());
00216         if (A == A2)
00217     {
00218       if (debug)
00219         cerr << "-- Cache blocking test succeeded!" << endl;
00220     }
00221         else
00222     {
00223       if (debug)
00224         {
00225           cerr << "*** Cache blocking test failed! A != A2 ***" 
00226          << endl << endl;
00227           // Don't print the matrices if they are too big.
00228           if (A.nrows() <= 30 && A2.nrows() <= 30) 
00229       {
00230         cerr << "A = " << endl;
00231         print_local_matrix (cerr, A.nrows(), A.ncols(), 
00232                 A.get(), A.lda());
00233         cerr << endl << "A2 = " << endl;
00234         print_local_matrix (cerr, A2.nrows(), A2.ncols(), 
00235                 A2.get(), A2.lda());
00236         cerr << endl;
00237       }
00238         }
00239       throw std::logic_error ("Cache blocking failed");
00240     }
00241       }
00242   }
00243 
00244       // Fill R with zeros, since the factorization may not
00245       // necessarily overwrite the strict lower triangle of R.
00246       if (debug)
00247   cerr << "-- Filling R with zeros" << endl;
00248       R.fill (STS::zero());
00249 
00250       if (debug)
00251   cerr << "-- Calling factor()" << endl;
00252 
00253       // Factor the matrix and compute the explicit Q factor
00254       factor_output_type factor_output = 
00255   actor.factor (numRows, numCols, A_copy.get(), A_copy.lda(), 
00256           R.get(), R.lda(), contiguousCacheBlocks);
00257       if (debug)
00258   {
00259     cerr << "-- Finished factor()" << endl;
00260     cerr << "-- Calling explicit_Q()" << endl;
00261   }
00262 
00263       // KokkosNodeTsqr isn't designed to be used by itself, so we
00264       // have to help it along by filling the top ncols x ncols
00265       // entries with the first ncols columns of the identity matrix.
00266       {
00267   typedef MatView<Ordinal, Scalar> view_type;
00268   view_type Q_top = actor.top_block (Q.view(), contiguousCacheBlocks);
00269   view_type Q_top_square (Q_top.ncols(), Q_top.ncols(), 
00270         Q_top.get(), Q_top.lda());
00271   Q_top_square.fill (STS::zero());
00272   for (Ordinal j = 0; j < Q_top_square.ncols(); ++j)
00273     Q_top_square(j,j) = STS::one();
00274       }
00275       actor.explicit_Q (numRows, numCols, A_copy.get(), A_copy.lda(), 
00276       factor_output, numCols, Q.get(), Q.lda(), 
00277       contiguousCacheBlocks);
00278       if (debug)
00279   cerr << "-- Finished explicit_Q()" << endl;
00280 
00281       // "Un"-cache-block the output Q (the explicit Q factor), if
00282       // contiguous cache blocks were used.  This is only necessary
00283       // because local_verify() doesn't currently support contiguous
00284       // cache blocks.
00285       if (contiguousCacheBlocks)
00286   {
00287     // Use A_copy as temporary storage for un-cache-blocking Q.
00288     actor.un_cache_block (numRows, numCols, A_copy.get(), 
00289         A_copy.lda(), Q.get());
00290     Q.copy (A_copy);
00291     if (debug)
00292       cerr << "-- Un-cache-blocked output Q factor" << endl;
00293   }
00294 
00295       // Print out the Q and R factors in debug mode.
00296       if (debug)
00297   {
00298     // Don't print the matrix if it's too big.
00299     if (Q.nrows() <= 30)
00300       {
00301         cerr << endl << "-- Q factor:" << endl;
00302         print_local_matrix (cerr, Q.nrows(), Q.ncols(), 
00303           Q.get(), Q.lda());
00304         cerr << endl << endl;
00305       }
00306     cerr << endl << "-- R factor:" << endl;
00307     print_local_matrix (cerr, numCols, numCols, R.get(), R.lda());
00308     cerr << endl;
00309   }
00310 
00311       // Validate the factorization
00312       std::vector<magnitude_type> results =
00313   local_verify (numRows, numCols, A.get(), lda, 
00314           Q.get(), ldq, R.get(), ldr);
00315       if (debug)
00316   cerr << "-- Finished local_verify" << endl;
00317 
00318       // Print the results
00319       if (humanReadable)
00320   cout << "KokkosNodeTsqr:" << endl
00321        << "Scalar type: " << scalarTypeName << endl
00322        << "# rows: " << numRows << endl
00323        << "# columns: " << numCols << endl
00324        << "# partitions: " << numPartitions << endl
00325        << "cache size hint (revised) in bytes: " << actor.cache_size_hint() << endl
00326        << "contiguous cache blocks? " << contiguousCacheBlocks << endl
00327        << "Absolute residual $\\|A - Q*R\\|_2$: "
00328        << results[0] << endl
00329        << "Absolute orthogonality $\\|I - Q^T*Q\\|_2$: " 
00330        << results[1] << endl
00331        << "Test matrix norm $\\| A \\|_F$: "
00332        << results[2] << endl
00333        << endl;
00334       else
00335   {
00336     if (printFieldNames)
00337       {
00338         const char prefix[] = "%";
00339         cout << prefix
00340        << "method"
00341        << ",scalarType"
00342        << ",numRows"
00343        << ",numCols"
00344        << ",numPartitions"
00345        << ",cacheSizeHint"
00346        << ",contiguousCacheBlocks"
00347        << ",absFrobResid"
00348        << ",absFrobOrthog"
00349        << ",frobA"
00350        << endl;
00351       }
00352     cout << "KokkosNodeTsqr"
00353          << "," << scalarTypeName
00354          << "," << numRows
00355          << "," << numCols
00356          << "," << numPartitions
00357          << "," << actor.cache_size_hint()
00358          << "," << contiguousCacheBlocks 
00359          << "," << results[0]
00360          << "," << results[1]
00361          << "," << results[2]
00362          << endl;
00363   }
00364     }
00365 
00366 
00395     template<class Ordinal, class Scalar, class NodeType>
00396     void
00397     benchmarkKokkosNodeTsqr (const Teuchos::RCP<NodeType>& node,
00398            const int numTrials,
00399            const Ordinal numRows, 
00400            const Ordinal numCols, 
00401            const int numPartitions,
00402            const size_t cacheSizeHint,
00403            const bool contiguousCacheBlocks,
00404            const bool printFieldNames,
00405            const bool humanReadable)
00406     {
00407       using Teuchos::ParameterList;
00408       using Teuchos::parameterList;
00409       using Teuchos::RCP;
00410       using Teuchos::TypeNameTraits;
00411       using std::cerr;
00412       using std::cout;
00413       using std::endl;
00414       typedef TSQR::KokkosNodeTsqr<Ordinal, Scalar, NodeType> node_tsqr_type;
00415       typedef typename node_tsqr_type::FactorOutput factor_output_type;
00416       typedef Teuchos::ScalarTraits<Scalar> STS;
00417       typedef typename STS::magnitudeType magnitude_type;
00418       typedef Teuchos::Time timer_type;
00419       typedef Matrix<Ordinal, Scalar> matrix_type;
00420 
00421       const std::string scalarTypeName = TypeNameTraits<Scalar>::name();
00422 
00423       // Pseudorandom normal(0,1) generator.  Default seed is OK,
00424       // because this is a benchmark, not an accuracy test.
00425       TSQR::Random::NormalGenerator<Ordinal, Scalar> gen;
00426 
00427       // Set up TSQR implementation.
00428       RCP<ParameterList> params = parameterList ("Intranode TSQR");
00429       params->set ("Cache Size Hint", cacheSizeHint);
00430       params->set ("Num Tasks", numPartitions);
00431       node_tsqr_type actor (params);
00432       actor.setNode (node);
00433 
00434       // Allocate space for test problem.
00435       matrix_type A (numRows, numCols);
00436       matrix_type A_copy (numRows, numCols);
00437       matrix_type Q (numRows, numCols);
00438       matrix_type R (numCols, numCols);
00439 
00440       // Fill R with zeros, since the factorization may not overwrite
00441       // the strict lower triangle of R.
00442       R.fill (STS::zero());
00443 
00444       // Create a test problem
00445       nodeTestProblem (gen, numRows, numCols, A.get(), A.lda(), false);
00446 
00447       // Copy A into A_copy, since TSQR overwrites the input.  If
00448       // specified, rearrange the data in A_copy so that the data in
00449       // each cache block is contiguously stored.   
00450       if (contiguousCacheBlocks)
00451   actor.cache_block (numRows, numCols, A_copy.get(), A.get(), A.lda());
00452       else
00453   A_copy.copy (A);
00454 
00455       // Do a few timing runs and throw away the results, just to warm
00456       // up any libraries that do autotuning.
00457       const int numWarmupRuns = 5;
00458       for (int warmupRun = 0; warmupRun < numWarmupRuns; ++warmupRun)
00459   {
00460     // Factor the matrix in-place in A_copy, and extract the
00461     // resulting R factor into R.
00462     factor_output_type factor_output = 
00463       actor.factor (numRows, numCols, A_copy.get(), A_copy.lda(), 
00464         R.get(), R.lda(), contiguousCacheBlocks);
00465     // Compute the explicit Q factor (which was stored
00466     // implicitly in A_copy and factor_output) and store in Q.
00467     // We don't need to un-cache-block the output, because we
00468     // aren't verifying it here.
00469     actor.explicit_Q (numRows, numCols, A_copy.get(), A_copy.lda(), 
00470           factor_output, numCols, Q.get(), Q.lda(), 
00471           contiguousCacheBlocks);
00472   }
00473 
00474       // Benchmark intranode TSQR for numTrials trials.
00475       //
00476       // Name of timer doesn't matter here; we only need the timing.
00477       timer_type timer("KokkosNodeTsqr");
00478       timer.start();
00479       for (int trialNum = 0; trialNum < numTrials; ++trialNum)
00480   {
00481     // Factor the matrix in-place in A_copy, and extract the
00482     // resulting R factor into R.
00483     factor_output_type factor_output = 
00484       actor.factor (numRows, numCols, A_copy.get(), A_copy.lda(), 
00485         R.get(), R.lda(), contiguousCacheBlocks);
00486     // Compute the explicit Q factor (which was stored
00487     // implicitly in A_copy and factor_output) and store in Q.
00488     // We don't need to un-cache-block the output, because we
00489     // aren't verifying it here.
00490     actor.explicit_Q (numRows, numCols, A_copy.get(), A_copy.lda(), 
00491           factor_output, numCols, Q.get(), Q.lda(), 
00492           contiguousCacheBlocks);
00493   }
00494       const double timing = timer.stop();
00495 
00496       // Print the results
00497       if (humanReadable)
00498   {
00499     cout << "KokkosNodeTsqr cumulative timings:" << endl
00500          << "Scalar type: " << scalarTypeName << endl
00501          << "# rows = " << numRows << endl
00502          << "# columns = " << numCols << endl
00503          << "# partitions: " << numPartitions << endl
00504          << "Cache size hint (in bytes) = " << actor.cache_size_hint() << endl
00505          << "Contiguous cache blocks? " << contiguousCacheBlocks << endl
00506          << "# trials = " << numTrials << endl
00507          << "Total time (s) = " << timing << endl;
00508   }
00509       else
00510   {
00511     if (printFieldNames)
00512       {
00513         const char prefix[] = "%";
00514         cout << prefix 
00515        << "method"
00516        << ",scalarType"
00517        << ",numRows"
00518        << ",numCols"
00519        << ",numPartitions"
00520        << ",cacheSizeHint"
00521        << ",contiguousCacheBlocks"
00522        << ",numTrials"
00523        << ",timing"
00524        << endl;
00525       }
00526 
00527     // We don't include {min,max}_seq_apply_timing() here, because
00528     // those times don't benefit from the accuracy of benchmarking
00529     // for numTrials > 1.  Thus, it's misleading to include them
00530     // with tbb_tsqr_timing, the total time over numTrials trials.
00531     cout << "KokkosNodeTsqr"
00532          << "," << scalarTypeName
00533          << "," << numRows
00534          << "," << numCols
00535          << "," << numPartitions
00536          << "," << actor.cache_size_hint()
00537          << "," << contiguousCacheBlocks 
00538          << "," << numTrials
00539          << "," << timing
00540          << endl;
00541   }
00542     }
00543   } // namespace Test
00544 } // namespace TSQR
00545 
00546 #endif // __TSQR_Test_KokkosNodeTsqrTest_hpp
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends