Kokkos Node API and Local Linear Algebra Kernels Version of the Day
Tsqr_SeqTest.cpp
00001 //@HEADER
00002 // ************************************************************************
00003 // 
00004 //          Kokkos: Node API and Parallel Node Kernels
00005 //              Copyright (2009) Sandia Corporation
00006 // 
00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
00008 // license for use of this work by or on behalf of the U.S. Government.
00009 // 
00010 // This library is free software; you can redistribute it and/or modify
00011 // it under the terms of the GNU Lesser General Public License as
00012 // published by the Free Software Foundation; either version 2.1 of the
00013 // License, or (at your option) any later version.
00014 //  
00015 // This library is distributed in the hope that it will be useful, but
00016 // WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 // Lesser General Public License for more details.
00019 //  
00020 // You should have received a copy of the GNU Lesser General Public
00021 // License along with this library; if not, write to the Free Software
00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00023 // USA
00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
00025 // 
00026 // ************************************************************************
00027 //@HEADER
00028 
00029 #include <Tsqr_SeqTest.hpp>
00030 
00031 #include <Tsqr_Random_NormalGenerator.hpp>
00032 #include <Tsqr_nodeTestProblem.hpp>
00033 #include <Tsqr_verifyTimerConcept.hpp>
00034 #include <Teuchos_Time.hpp>
00035 
00036 #include <Tsqr_Blas.hpp>
00037 #include <Tsqr_Lapack.hpp>
00038 #include <Tsqr_LocalVerify.hpp>
00039 #include <Tsqr_Matrix.hpp>
00040 #include <Tsqr_ScalarTraits.hpp>
00041 #include <Tsqr_SequentialTsqr.hpp>
00042 #include <Tsqr_Util.hpp>
00043 
00044 #include <algorithm>
00045 #include <cstring> // size_t definition
00046 #include <fstream>
00047 #include <iomanip>
00048 #include <iostream>
00049 #include <limits>
00050 #include <sstream>
00051 #include <stdexcept>
00052 #include <vector>
00053 
00054 
00055 namespace TSQR {
00056   namespace Test {
00057 
00058     template<class Ordinal, class Scalar>
00059     static Ordinal
00060     lworkQueryLapackQr (LAPACK<Ordinal, Scalar>& lapack,
00061       const Ordinal nrows,
00062       const Ordinal ncols,
00063       const Ordinal lda)
00064     {
00065       typedef typename ScalarTraits<Scalar>::magnitude_type magnitude_type;
00066       using std::ostringstream;
00067       using std::endl;
00068 
00069       Scalar d_lwork_geqrf = Scalar (0);
00070       int INFO = 0;
00071       lapack.GEQRF (nrows, ncols, NULL, lda, NULL, &d_lwork_geqrf, -1, &INFO);
00072       if (INFO != 0)
00073   {
00074     ostringstream os;
00075     os << "LAPACK _GEQRF workspace size query failed: INFO = " << INFO;
00076     // It's a logic error and not a runtime error, because the
00077     // LWORK query should only fail if the input parameters have
00078     // invalid (e.g., out of range) values.
00079     throw std::logic_error (os.str());
00080   }
00081 
00082       Scalar d_lwork_orgqr = Scalar (0);
00083       // A workspace query appropriate for computing the explicit Q
00084       // factor (nrows x ncols) in place, from the QR factorization of
00085       // an nrows x ncols matrix with leading dimension lda.
00086       lapack.ORGQR (nrows, ncols, ncols, NULL, lda, NULL, &d_lwork_orgqr, -1, &INFO);
00087       if (INFO != 0)
00088   {
00089     ostringstream os;
00090     os << "LAPACK _ORGQR workspace size query failed: INFO = " << INFO;
00091     // It's a logic error and not a runtime error, because the
00092     // LWORK query should only fail if the input parameters have
00093     // invalid (e.g., out of range) values.
00094     throw std::logic_error (os.str());
00095   }
00096 
00097       // LAPACK workspace queries do return their results as a
00098       // double-precision floating-point value, but LAPACK promises
00099       // that that value will fit in an int.  Thus, we don't need to
00100       // check for valid casts to int below.  I include the checks
00101       // just to be "bulletproof" and also to show how to do the
00102       // checks for later reference.
00103       const magnitude_type lwork_geqrf_test = 
00104   static_cast< magnitude_type > (static_cast<Ordinal> (ScalarTraits<Scalar>::abs (d_lwork_geqrf)));
00105       if (lwork_geqrf_test != ScalarTraits<Scalar>::abs (d_lwork_geqrf))
00106   {
00107     ostringstream os;
00108     os << "LAPACK _GEQRF workspace query returned a result, " 
00109        << d_lwork_geqrf << ", bigger than the max Ordinal value, " 
00110        << std::numeric_limits<Ordinal>::max();
00111     throw std::range_error (os.str());
00112   }
00113       const Scalar lwork_orgqr_test = 
00114   static_cast< magnitude_type > (static_cast<Ordinal> (ScalarTraits<Scalar>::abs ((d_lwork_orgqr))));
00115       if (lwork_orgqr_test != ScalarTraits<Scalar>::abs (d_lwork_orgqr))
00116   {
00117     ostringstream os;
00118     os << "LAPACK _ORGQR workspace query returned a result, " 
00119        << d_lwork_orgqr << ", bigger than the max Ordinal value, " 
00120        << std::numeric_limits<Ordinal>::max();
00121     throw std::range_error (os.str());
00122   }
00123       return std::max (static_cast<Ordinal> (ScalarTraits<Scalar>::abs (d_lwork_geqrf)),
00124            static_cast<Ordinal> (ScalarTraits<Scalar>::abs (d_lwork_orgqr)));
00125     }
00126 
00130     template< class Ordinal, class Scalar >
00131     static void
00132     verifySeqTsqrTemplate (std::ostream& out,
00133          TSQR::Random::NormalGenerator< Ordinal, Scalar >& generator,
00134          const std::string& datatype,
00135          const std::string& shortDatatype,
00136          const Ordinal nrows, 
00137          const Ordinal ncols, 
00138          const size_t cache_size_hint,
00139          const bool contiguous_cache_blocks,
00140          const bool save_matrices,
00141          const std::string& additionalFieldNames,
00142          const std::string& additionalData,
00143          const bool printFieldNames,
00144          const bool human_readable,
00145          const bool b_debug)
00146     {
00147       typedef typename ScalarTraits<Scalar>::magnitude_type magnitude_type;
00148       using std::cerr;
00149       using std::endl;
00150       using std::pair;
00151       using std::string;
00152       using std::vector;
00153 
00154       SequentialTsqr< Ordinal, Scalar > actor (cache_size_hint);
00155       Ordinal numCacheBlocks;
00156 
00157       if (b_debug)
00158   {
00159     cerr << "Sequential TSQR test problem:" << endl
00160          << "* " << nrows << " x " << ncols << endl
00161          << "* Cache size hint of " << actor.cache_size_hint() << " bytes" << endl;
00162     if (contiguous_cache_blocks)
00163       cerr << "* Contiguous cache blocks" << endl;
00164   }
00165 
00166       Matrix< Ordinal, Scalar > A (nrows, ncols);
00167       Matrix< Ordinal, Scalar > A_copy (nrows, ncols);
00168       Matrix< Ordinal, Scalar > Q (nrows, ncols);
00169       Matrix< Ordinal, Scalar > R (ncols, ncols);
00170       if (std::numeric_limits<Scalar>::has_quiet_NaN)
00171   {
00172     A.fill (std::numeric_limits< Scalar>::quiet_NaN());
00173     A_copy.fill (std::numeric_limits<Scalar>::quiet_NaN());
00174     Q.fill (std::numeric_limits<Scalar>::quiet_NaN());
00175     R.fill (std::numeric_limits<Scalar>::quiet_NaN());
00176   }
00177       const Ordinal lda = nrows;
00178       const Ordinal ldq = nrows;
00179       const Ordinal ldr = ncols;
00180 
00181       // Create a test problem
00182       nodeTestProblem (generator, nrows, ncols, A.get(), A.lda(), true);
00183 
00184       if (save_matrices)
00185   {
00186     string filename = "A_" + shortDatatype + ".txt";
00187     if (b_debug)
00188       cerr << "-- Saving test problem to \"" << filename << "\"" << endl;
00189     std::ofstream fileOut (filename.c_str());
00190     print_local_matrix (fileOut, nrows, ncols, A.get(), A.lda());
00191     fileOut.close();
00192   }
00193 
00194       if (b_debug)
00195   cerr << "-- Generated test problem" << endl;
00196 
00197       // Copy A into A_copy, since TSQR overwrites the input.  If
00198       // specified, rearrange the data in A_copy so that the data in
00199       // each cache block is contiguously stored.   
00200       if (! contiguous_cache_blocks)
00201   {
00202     A_copy.copy (A);
00203     if (b_debug)
00204       cerr << "-- Copied test problem from A into A_copy" << endl;
00205   }
00206       else
00207   {
00208     actor.cache_block (nrows, ncols, A_copy.get(), A.get(), A.lda());
00209     if (b_debug)
00210       cerr << "-- Reorganized test matrix to have contiguous "
00211         "cache blocks" << endl;
00212 
00213     // Verify cache blocking, when in debug mode.
00214     if (b_debug)
00215       {
00216         Matrix< Ordinal, Scalar > A2 (nrows, ncols);
00217         if (std::numeric_limits<Scalar>::has_quiet_NaN)
00218     A2.fill (std::numeric_limits<Scalar>::quiet_NaN());
00219 
00220         actor.un_cache_block (nrows, ncols, A2.get(), A2.lda(), A_copy.get());
00221         if (A == A2)
00222     {
00223       if (b_debug)
00224         cerr << "-- Cache blocking test succeeded!" << endl;
00225     }
00226         else
00227     throw std::logic_error ("Cache blocking failed");
00228       }
00229   }
00230 
00231       // Fill R with zeros, since the factorization may not overwrite
00232       // the strict lower triangle of R.
00233       R.fill (Scalar(0));
00234 
00235       // Count the number of cache blocks that factor() will use. 
00236       // This is only for diagnostic purposes.
00237       numCacheBlocks = 
00238   actor.factor_num_cache_blocks (nrows, ncols, A_copy.get(), 
00239                A_copy.lda(), contiguous_cache_blocks);
00240       // In debug mode, report how many cache blocks factor() will use.
00241       if (b_debug)
00242   cerr << "-- Number of cache blocks factor() will use: " 
00243        << numCacheBlocks << endl << endl;
00244 
00245       // Factor the matrix and compute the explicit Q factor
00246       typedef typename SequentialTsqr< Ordinal, Scalar >::FactorOutput 
00247   factor_output_type;
00248       factor_output_type factorOutput = 
00249   actor.factor (nrows, ncols, A_copy.get(), A_copy.lda(), 
00250           R.get(), R.lda(), contiguous_cache_blocks);
00251       if (b_debug)
00252   cerr << "-- Finished SequentialTsqr::factor" << endl;
00253 
00254       if (save_matrices)
00255   {
00256     string filename = "R_" + shortDatatype + ".txt";
00257     if (b_debug)
00258       cerr << "-- Saving R factor to \"" << filename << "\"" << endl;
00259     std::ofstream fileOut (filename.c_str());
00260     print_local_matrix (fileOut, ncols, ncols, R.get(), R.lda());
00261     fileOut.close();
00262   }
00263 
00264       actor.explicit_Q (nrows, ncols, A_copy.get(), lda, factorOutput,
00265       ncols, Q.get(), Q.lda(), contiguous_cache_blocks);
00266       if (b_debug)
00267   cerr << "-- Finished SequentialTsqr::explicit_Q" << endl;
00268 
00269       // "Un"-cache-block the output, if contiguous cache blocks were
00270       // used.  This is only necessary because local_verify() doesn't
00271       // currently support contiguous cache blocks.
00272       if (contiguous_cache_blocks)
00273   {
00274     // Use A_copy as temporary storage for un-cache-blocking Q.
00275     actor.un_cache_block (nrows, ncols, A_copy.get(), A_copy.lda(), Q.get());
00276     Q.copy (A_copy);
00277     if (b_debug)
00278       cerr << "-- Un-cache-blocked output Q factor" << endl;
00279   }
00280 
00281       if (save_matrices)
00282   {
00283     string filename = "Q_" + shortDatatype + ".txt";
00284     if (b_debug)
00285       cerr << "-- Saving Q factor to \"" << filename << "\"" << endl;
00286     std::ofstream fileOut (filename.c_str());
00287     print_local_matrix (fileOut, nrows, ncols, Q.get(), Q.lda());
00288     fileOut.close();
00289   }
00290 
00291       // Print out the R factor
00292       if (false && b_debug)
00293   {
00294     cerr << endl << "-- R factor:" << endl;
00295     print_local_matrix (cerr, ncols, ncols, R.get(), R.lda());
00296     cerr << endl;
00297   }
00298 
00299       // Validate the factorization
00300       vector< magnitude_type > results =
00301   local_verify (nrows, ncols, A.get(), lda, Q.get(), ldq, R.get(), ldr);
00302       if (b_debug)
00303   cerr << "-- Finished local_verify" << endl;
00304 
00305       // Print the results
00306       if (human_readable)
00307   out << "Sequential cache-blocked TSQR:" << endl
00308       << "Scalar type: " << datatype << endl
00309       << "Matrix dimensions: " << nrows << " by " << ncols << endl
00310       << "Cache size hint in bytes: " << actor.cache_size_hint() << endl
00311       << "Number of cache blocks: " << numCacheBlocks << endl
00312       << "Contiguous cache blocks? " << contiguous_cache_blocks << endl
00313       << "Absolute residual $\\| A - QR \\|_F$: " << results[0] << endl
00314       << "Absolute orthogonality $\\| I - Q^* Q \\|_F$: " << results[1] << endl
00315       << "Test matrix norm $\\| A \\|_F$: " << results[2] << endl
00316       << endl << endl;
00317       else
00318   {
00319     if (printFieldNames)
00320       {
00321         const char prefix[] = "%";
00322         out << prefix
00323       << "method"
00324       << ",scalarType"
00325       << ",numRows"
00326       << ",numCols"
00327       << ",cacheSizeHint"
00328       << ",contiguousCacheBlocks"
00329       << ",absFrobResid"
00330       << ",absFrobOrthog"
00331       << ",frobA";
00332         if (! additionalFieldNames.empty())
00333     out << "," << additionalFieldNames;
00334         out << endl;
00335       }
00336     out << "SeqTSQR"
00337         << "," << datatype
00338         << "," << nrows
00339         << "," << ncols
00340         << "," << actor.cache_size_hint()
00341         << "," << contiguous_cache_blocks 
00342         << "," << results[0]
00343         << "," << results[1]
00344         << "," << results[2];
00345     if (! additionalData.empty())
00346       out << "," << additionalData;
00347     out << endl;
00348   }
00349     }
00350 
00351 
00352     void
00353     verifySeqTsqr (std::ostream& out,
00354        const int nrows, 
00355        const int ncols, 
00356        const size_t cache_size_hint,
00357        const bool test_complex_arithmetic,
00358        const bool save_matrices,
00359        const bool contiguous_cache_blocks,
00360        const std::string& additionalFieldNames,
00361        const std::string& additionalData,
00362        const bool printFieldNames,
00363        const bool human_readable,
00364        const bool b_debug)
00365     {
00366       using TSQR::Random::NormalGenerator;
00367 #ifdef HAVE_TSQR_COMPLEX
00368       using std::complex;
00369 #endif // HAVE_TSQR_COMPLEX
00370       using std::string;
00371       using std::vector;
00372 
00373       //
00374       // We do tests one after another, using the seed from the
00375       // previous test in the current test, so that the pseudorandom
00376       // streams used by the tests are independent.
00377       //
00378 
00379       // On output: Seed for the next pseudorandom number generator.
00380       vector< int > iseed(4);
00381       string datatype; // name of the current datatype being tested
00382       string shortDatatype; // one-letter version of datatype
00383 
00384       // First test.  The PRNG seeds itself with a default value.
00385       // This will be the same each time, so if you want
00386       // nondeterministic behavior, you should pick the seed values
00387       // yourself.  Only print field names (if at all) for the first
00388       // data type tested; field names are only printed if output is
00389       // not human_readable.
00390       NormalGenerator< int, float > normgenS;
00391       datatype = "float";
00392       shortDatatype = "S";
00393       verifySeqTsqrTemplate (out, normgenS, datatype, shortDatatype, nrows, ncols, 
00394            cache_size_hint, contiguous_cache_blocks, 
00395            save_matrices, additionalFieldNames, additionalData,
00396            printFieldNames, human_readable, b_debug);
00397       // Fetch the pseudorandom seed from the previous test.
00398       normgenS.getSeed (iseed);
00399       NormalGenerator< int, double > normgenD (iseed);
00400       // Next test.
00401       datatype = "double";
00402       shortDatatype = "D";
00403       verifySeqTsqrTemplate (out, normgenD, datatype, shortDatatype, nrows, ncols, 
00404            cache_size_hint, contiguous_cache_blocks, 
00405            save_matrices, additionalFieldNames, additionalData,
00406            printFieldNames, human_readable, b_debug);
00407 #ifdef HAVE_TSQR_COMPLEX
00408       if (test_complex_arithmetic)
00409   {
00410     normgenD.getSeed (iseed);
00411     NormalGenerator< int, complex<float> > normgenC (iseed);
00412     datatype = "complex<float>";
00413     shortDatatype = "C";
00414     verifySeqTsqrTemplate (out, normgenC, datatype, shortDatatype, nrows, ncols, 
00415          cache_size_hint, contiguous_cache_blocks, 
00416          save_matrices, additionalFieldNames, additionalData,
00417          printFieldNames, human_readable, b_debug);
00418     normgenC.getSeed (iseed);
00419     NormalGenerator< int, complex<double> > normgenZ (iseed);
00420     datatype = "complex<double>";
00421     shortDatatype = "Z";
00422     verifySeqTsqrTemplate (out, normgenZ, datatype, shortDatatype, nrows, ncols, 
00423          cache_size_hint, contiguous_cache_blocks, 
00424          save_matrices, additionalFieldNames, additionalData,
00425          printFieldNames, human_readable, b_debug);
00426   }
00427 #else // HAVE_TSQR_COMPLEX
00428       if (test_complex_arithmetic)
00429   throw std::logic_error ("Trilinos was not built with "
00430         "complex arithmetic support");
00431 #endif // HAVE_TSQR_COMPLEX
00432     }
00433 
00434 
00435 
00436     template< class Ordinal, class Scalar >
00437     static void
00438     verifyLapackTemplate (std::ostream& out,
00439         TSQR::Random::NormalGenerator< Ordinal, Scalar >& generator,
00440         const std::string& datatype,
00441         const Ordinal nrows, 
00442         const Ordinal ncols, 
00443         const std::string& additionalFieldNames,
00444         const std::string& additionalData,
00445         const bool printFieldNames,
00446         const bool human_readable,
00447         const bool b_debug)
00448     {
00449       typedef typename ScalarTraits<Scalar>::magnitude_type magnitude_type;
00450       using std::ostringstream;
00451       using std::cerr;
00452       using std::endl;
00453 
00454       // Initialize LAPACK.
00455       LAPACK< Ordinal, Scalar > lapack;
00456 
00457       if (b_debug)
00458   cerr << "LAPACK test problem:" << endl
00459        << "* " << nrows << " x " << ncols << endl;
00460 
00461       Matrix< Ordinal, Scalar > A (nrows, ncols);
00462       Matrix< Ordinal, Scalar > A_copy (nrows, ncols);
00463       Matrix< Ordinal, Scalar > Q (nrows, ncols);
00464       Matrix< Ordinal, Scalar > R (ncols, ncols);
00465       if (std::numeric_limits<Scalar>::has_quiet_NaN)
00466   {
00467     A.fill (std::numeric_limits< Scalar>::quiet_NaN());
00468     A_copy.fill (std::numeric_limits<Scalar>::quiet_NaN());
00469     Q.fill (std::numeric_limits<Scalar>::quiet_NaN());
00470     R.fill (std::numeric_limits<Scalar>::quiet_NaN());
00471   }
00472       const Ordinal lda = nrows;
00473       const Ordinal ldq = nrows;
00474       const Ordinal ldr = ncols;
00475 
00476       // Create a test problem
00477       nodeTestProblem (generator, nrows, ncols, A.get(), A.lda(), true);
00478 
00479       if (b_debug)
00480   cerr << "-- Generated test problem" << endl;
00481 
00482       // Copy A into A_copy, since LAPACK QR overwrites the input.
00483       A_copy.copy (A);
00484       if (b_debug)
00485   cerr << "-- Copied test problem from A into A_copy" << endl;
00486 
00487       // Now determine the required workspace for the factorization.
00488       const Ordinal lwork = lworkQueryLapackQr (lapack, nrows, ncols, A_copy.lda());
00489       std::vector<Scalar> work (lwork);
00490       std::vector<Scalar> tau (ncols);
00491 
00492       // Fill R with zeros, since the factorization may not overwrite
00493       // the strict lower triangle of R.
00494       R.fill (Scalar(0));
00495 
00496       // Compute the QR factorization
00497       int info = 0; // INFO is always an int
00498       lapack.GEQRF (nrows, ncols, A_copy.get(), A_copy.lda(), 
00499         &tau[0], &work[0], lwork, &info);
00500       if (info != 0)
00501   {
00502     ostringstream os;
00503     os << "LAPACK QR factorization (_GEQRF) failed: INFO = " << info;
00504     throw std::runtime_error (os.str());
00505   }
00506 
00507       // Copy out the R factor from A_copy (where we computed the QR
00508       // factorization in place) into R.
00509       copy_upper_triangle (ncols, ncols, R.get(), ldr, A_copy.get(), lda);
00510 
00511       if (b_debug)
00512   {
00513     cerr << endl << "-- R factor:" << endl;
00514     print_local_matrix (cerr, ncols, ncols, R.get(), R.lda());
00515     cerr << endl;
00516   }
00517 
00518       // The explicit Q factor will be computed in place, so copy the
00519       // result of the factorization into Q.
00520       Q.copy (A_copy);
00521 
00522       // Compute the explicit Q factor
00523       lapack.ORGQR (nrows, ncols, ncols, Q.get(), ldq, &tau[0], &work[0], lwork, &info);
00524       if (info != 0)
00525   {
00526     ostringstream os;
00527     os << "LAPACK explicit Q computation (_ORGQR) failed: INFO = " << info;
00528     throw std::runtime_error (os.str());
00529   }
00530   
00531       // Validate the factorization
00532       std::vector< magnitude_type > results = 
00533   local_verify (nrows, ncols, A.get(), lda, Q.get(), ldq, R.get(), ldr);
00534 
00535       // Print the results
00536       if (human_readable)
00537   out << "LAPACK QR (DGEQRF and DORGQR):" << endl
00538       << "Scalar type: " << datatype << endl
00539       << "Absolute residual $\\| A - QR \\|_F$: " << results[0] << endl
00540       << "Absolute orthogonality $\\| I - Q^* Q \\|_F$: " << results[1] << endl
00541       << "Test matrix norm $\\| A \\|_F$: " << results[2] << endl
00542       << endl << endl;
00543       else
00544   {
00545     if (printFieldNames)
00546       {
00547         const char prefix[] = "%";
00548         out << prefix
00549       << "method"
00550       << ",scalarType"
00551       << ",numRows"
00552       << ",numCols"
00553       << ",cacheSizeHint"
00554       << ",contiguousCacheBlocks"
00555       << ",absFrobResid"
00556       << ",absFrobOrthog"
00557       << ",frobA";
00558         if (! additionalFieldNames.empty())
00559     out << "," << additionalFieldNames;
00560         out << endl;
00561       }
00562     out << "LAPACK"
00563         << "," << datatype
00564         << "," << nrows
00565         << "," << ncols
00566         << "," << size_t(0) // cache_size_hint
00567         << "," << false     // contiguous_cache_blocks 
00568         << "," << results[0]
00569         << "," << results[1]
00570         << "," << results[2];
00571     if (! additionalData.empty())
00572       out << "," << additionalData;
00573     out << endl;
00574   }
00575     }
00576 
00577 
00578     void
00579     verifyLapack (std::ostream& out,
00580       const int nrows, 
00581       const int ncols, 
00582       const bool test_complex_arithmetic,
00583       const std::string& additionalFieldNames,
00584       const std::string& additionalData,
00585       const bool printFieldNames,
00586       const bool human_readable,
00587       const bool b_debug)
00588     {
00589       using TSQR::Random::NormalGenerator;
00590 #ifdef HAVE_TSQR_COMPLEX
00591       using std::complex;
00592 #endif // HAVE_TSQR_COMPLEX
00593       using std::string;
00594       using std::vector;
00595 
00596       //
00597       // We do tests one after another, using the seed from the
00598       // previous test in the current test, so that the pseudorandom
00599       // streams used by the tests are independent.
00600       //
00601 
00602       // On output: Seed for the next pseudorandom number generator.
00603       vector< int > iseed(4);
00604       string datatype; // name of the current datatype being tested
00605 
00606       // First test.  The PRNG seeds itself with a default value.
00607       // This will be the same each time, so if you want
00608       // nondeterministic behavior, you should pick the seed values
00609       // yourself.
00610       NormalGenerator< int, float > normgenS;
00611       datatype = "float";
00612       verifyLapackTemplate (out, normgenS, datatype, nrows, ncols, 
00613           additionalFieldNames, additionalData,
00614           printFieldNames, human_readable, b_debug);
00615       // Fetch the pseudorandom seed from the previous test.
00616       normgenS.getSeed (iseed);
00617       NormalGenerator< int, double > normgenD (iseed);
00618       // Next test.
00619       datatype = "double";
00620       verifyLapackTemplate (out, normgenD, datatype, nrows, ncols, 
00621           additionalFieldNames, additionalData,
00622           false, human_readable, b_debug);
00623 #ifdef HAVE_TSQR_COMPLEX
00624       if (test_complex_arithmetic)
00625   {
00626     normgenD.getSeed (iseed);
00627     NormalGenerator< int, complex<float> > normgenC (iseed);
00628     datatype = "complex<float>";
00629     verifyLapackTemplate (out, normgenC, datatype, nrows, ncols,
00630         additionalFieldNames, additionalData, 
00631         false, human_readable, b_debug);
00632     normgenC.getSeed (iseed);
00633     NormalGenerator< int, complex<double> > normgenZ (iseed);
00634     datatype = "complex<double>";
00635     verifyLapackTemplate (out, normgenZ, datatype, nrows, ncols, 
00636         additionalFieldNames, additionalData,
00637         false, human_readable, b_debug);
00638   }
00639 #else // HAVE_TSQR_COMPLEX
00640       if (test_complex_arithmetic)
00641   throw std::logic_error ("Trilinos was not built with "
00642         "complex arithmetic support");
00643 #endif // HAVE_TSQR_COMPLEX
00644     }
00645 
00651     template< class Ordinal, class Scalar, class TimerType >
00652     class LapackBenchmarker {
00653     public:
00654       typedef Ordinal ordinal_type;
00655       typedef Scalar scalar_type;
00656 
00667       LapackBenchmarker (const std::string& scalarTypeName,
00668        std::ostream& out = std::cout,
00669        const bool humanReadable = false) :
00670   scalarTypeName_ (scalarTypeName),
00671   out_ (out), 
00672   humanReadable_ (humanReadable)
00673       {
00674   TSQR::Test::verifyTimerConcept< TimerType >();
00675       }
00676 
00677       void 
00678       benchmark (const int numTrials,
00679      const Ordinal numRows,
00680      const Ordinal numCols,
00681      const std::string& additionalFieldNames, 
00682      const std::string& additionalData,
00683      const bool printFieldNames)
00684       {
00685   Matrix< Ordinal, Scalar > A (numRows, numCols);
00686   Matrix< Ordinal, Scalar > Q (numRows, numCols);
00687   Matrix< Ordinal, Scalar > R (numCols, numCols);
00688   const Ordinal lda = numRows;
00689   const Ordinal ldq = numRows;
00690   const Ordinal ldr = numCols;
00691 
00692   // Create a test problem
00693   nodeTestProblem (gen_, numRows, numCols, A.get(), lda, false);
00694 
00695   // Copy A into Q, since LAPACK QR overwrites the input.  We only
00696   // need Q because LAPACK's computation of the explicit Q factor
00697   // occurs in place.  This doesn't work with TSQR.  To give
00698   // LAPACK QR the fullest possible advantage over TSQR, we don't
00699   // allocate an A_copy here (as we would when benchmarking TSQR).
00700   Q.copy (A);
00701 
00702   // Determine the required workspace for the factorization
00703   const Ordinal lwork = lworkQueryLapackQr (lapack_, numRows, numCols, lda);
00704   std::vector<Scalar> work (lwork);
00705   std::vector<Scalar> tau (numCols);
00706 
00707   // Benchmark LAPACK's QR factorization for numTrials trials.
00708   //
00709   // Name of timer doesn't matter here; we only need the timing.
00710   TimerType timer("LAPACK");
00711   timer.start();
00712   for (int trialNum = 0; trialNum < numTrials; ++trialNum)
00713     {
00714       // Compute the QR factorization
00715       int info = 0; // INFO is always an int
00716       lapack_.GEQRF (numRows, numCols, Q.get(), ldq, &tau[0], &work[0], lwork, &info);
00717       if (info != 0)
00718         {
00719     std::ostringstream os;
00720     os << "LAPACK QR factorization (_GEQRF) failed: INFO = " << info;
00721     throw std::runtime_error (os.str());
00722         }
00723 
00724       // Extract the upper triangular factor R from Q (where it
00725       // was computed in place by GEQRF), since ORGQR will
00726       // overwrite all of Q with the explicit Q factor.
00727       copy_upper_triangle (numRows, numCols, R.get(), ldr, Q.get(), ldq);
00728 
00729       // Compute the explicit Q factor
00730       lapack_.ORGQR (numRows, numCols, numCols, Q.get(), ldq,
00731         &tau[0], &work[0], lwork, &info);
00732       if (info != 0)
00733         {
00734     std::ostringstream os;
00735     os << "LAPACK explicit Q computation (_ORGQR) failed: INFO = " << info;
00736     throw std::runtime_error (os.str());
00737         }
00738     }
00739   const double lapackTiming = timer.stop();
00740   reportResults (numTrials, numRows, numCols, lapackTiming, 
00741            additionalFieldNames, additionalData, printFieldNames);
00742       }
00743 
00744 
00745     private:
00747       TSQR::LAPACK< Ordinal, Scalar > lapack_;
00748       
00753       TSQR::Random::NormalGenerator< ordinal_type, scalar_type > gen_;
00754       
00756       std::string scalarTypeName_;
00757 
00759       std::ostream& out_;
00760 
00764       bool humanReadable_;
00765 
00767       void 
00768       reportResults (const int numTrials,
00769          const Ordinal numRows,
00770          const Ordinal numCols,
00771          const double lapackTiming,
00772          const std::string& additionalFieldNames, 
00773          const std::string& additionalData,
00774          const bool printFieldNames)
00775       {
00776   using std::endl;
00777   if (humanReadable_)
00778     out_ << "LAPACK\'s QR factorization (_GEQRF + _ORGQR):" << endl
00779          << "Scalar type = " << scalarTypeName_ << endl
00780          << "# rows = " << numRows << endl
00781          << "# columns = " << numCols << endl
00782          << "# trials = " << numTrials << endl
00783          << "Total time (s) = " << lapackTiming << endl 
00784          << endl;
00785   else
00786     {
00787       if (printFieldNames)
00788         {
00789     const char prefix[] = "%";
00790     out_ << prefix 
00791          << "method" 
00792          << ",scalarType"
00793          << ",numRows"
00794          << ",numCols"
00795          << ",cacheSizeHint"
00796          << ",contiguousCacheBlocks"
00797          << ",numTrials"
00798          << ",timing";
00799     if (! additionalFieldNames.empty())
00800       out_ << "," << additionalFieldNames;
00801     out_ << endl;
00802         }
00803       // "0" refers to the cache size hint, which is not
00804       // applicable in this case; we retain it for easy
00805       // comparison of results with SequentialTsqr (so that the
00806       // number of fields is the same in both cases).  "false"
00807       // (that follows 0) refers to whether or not contiguous
00808       // cache blocks were used (see TSQR::SequentialTsqr); this
00809       // is also not applicable in this case.
00810       out_ << "LAPACK" 
00811      << "," << scalarTypeName_
00812      << "," << numRows
00813      << "," << numCols
00814      << "," << 0
00815      << "," << false
00816      << "," << numTrials 
00817      << "," << lapackTiming;
00818       if (! additionalData.empty())
00819         out_ << "," << additionalData;
00820       out_ << endl;
00821     }
00822       }
00823     };
00824 
00825 
00826     void
00827     benchmarkLapack (std::ostream& out,
00828          const int numRows,
00829          const int numCols,
00830          const int numTrials,
00831          const bool testComplex,
00832          const std::string& additionalFieldNames,
00833          const std::string& additionalData,
00834          const bool printFieldNames,
00835          const bool humanReadable)
00836     {
00837       typedef Teuchos::Time timer_type;
00838       const bool testReal = true;
00839       using std::string;
00840 
00841       // Only print field names (if at all) for the first data type tested.
00842       bool printedFieldNames = false;
00843 
00844       if (testReal)
00845   {
00846     { // Scalar=float
00847       typedef LapackBenchmarker< int, float, timer_type > benchmark_type;
00848       string scalarTypeName ("float");
00849       benchmark_type widget (scalarTypeName, out, humanReadable);
00850       widget.benchmark (numTrials, numRows, numCols, 
00851             additionalFieldNames, additionalData,
00852             printFieldNames && ! printedFieldNames);
00853       if (printFieldNames && ! printedFieldNames)
00854         printedFieldNames = true;
00855     }
00856     { // Scalar=double
00857       typedef LapackBenchmarker< int, double, timer_type > benchmark_type;
00858       string scalarTypeName ("double");
00859       benchmark_type widget (scalarTypeName, out, humanReadable);
00860       widget.benchmark (numTrials, numRows, numCols,
00861             additionalFieldNames, additionalData,
00862             printFieldNames && ! printedFieldNames);
00863       if (printFieldNames && ! printedFieldNames)
00864         printedFieldNames = true;
00865     }
00866   }
00867 
00868       if (testComplex)
00869   {
00870 #ifdef HAVE_TSQR_COMPLEX
00871     using std::complex;
00872     { // Scalar=complex<float>
00873       typedef LapackBenchmarker< int, complex<float>, timer_type > benchmark_type;
00874       string scalarTypeName ("complex<float>");
00875       benchmark_type widget (scalarTypeName, out, humanReadable);
00876       widget.benchmark (numTrials, numRows, numCols,
00877             additionalFieldNames, additionalData,
00878             printFieldNames && ! printedFieldNames);
00879       if (printFieldNames && ! printedFieldNames)
00880         printedFieldNames = true;
00881     }
00882     { // Scalar=complex<double>
00883       typedef LapackBenchmarker<int, complex<double>, timer_type> benchmark_type;
00884       string scalarTypeName ("complex<double>");
00885       benchmark_type widget (scalarTypeName, out, humanReadable);
00886       widget.benchmark (numTrials, numRows, numCols,
00887             additionalFieldNames, additionalData,
00888             printFieldNames && ! printedFieldNames);
00889       if (printFieldNames && ! printedFieldNames)
00890         printedFieldNames = true;
00891     }
00892 #else // Don't HAVE_TSQR_COMPLEX
00893     throw std::logic_error ("Trilinos was not built with "
00894           "complex arithmetic support");
00895 #endif // HAVE_TSQR_COMPLEX
00896   }
00897     }
00898 
00899 
00900 
00906     template<class Ordinal, class Scalar, class TimerType>
00907     class SeqTsqrBenchmarker {
00908     public:
00909       typedef Ordinal ordinal_type;
00910       typedef Scalar scalar_type;
00911 
00922       SeqTsqrBenchmarker (const std::string& scalarTypeName,
00923         std::ostream& out = std::cout,
00924         const bool humanReadable = false) : 
00925   scalarTypeName_ (scalarTypeName),
00926   out_ (out), 
00927   humanReadable_ (humanReadable)
00928       {
00929   // Make sure that TimerType satisfies the required interface.
00930   TSQR::Test::verifyTimerConcept<TimerType>();
00931       }
00932 
00933       void 
00934       benchmark (const int numTrials,
00935      const Ordinal numRows,
00936      const Ordinal numCols,
00937      const size_t cacheSizeHint,
00938      const bool contiguousCacheBlocks,
00939      const std::string& additionalFieldNames,
00940      const std::string& additionalData,
00941      const bool printFieldNames)
00942       {
00943   SequentialTsqr<Ordinal, Scalar> actor (cacheSizeHint);
00944 
00945   Matrix<Ordinal, Scalar> A (numRows, numCols);
00946   Matrix<Ordinal, Scalar> A_copy (numRows, numCols);
00947   Matrix<Ordinal, Scalar> Q (numRows, numCols);
00948   Matrix<Ordinal, Scalar> R (numCols, numCols);
00949   const Ordinal lda = numRows;
00950   const Ordinal ldq = numRows;
00951 
00952   // Create a test problem
00953   nodeTestProblem (gen_, numRows, numCols, A.get(), lda, false);
00954 
00955   // Copy A into A_copy, since TSQR overwrites the input
00956   A_copy.copy (A);
00957 
00958   // Benchmark sequential TSQR for numTrials trials.
00959   //
00960   // Name of timer doesn't matter here; we only need the timing.
00961   TimerType timer("SeqTSQR");
00962   timer.start();
00963   for (int trialNum = 0; trialNum < numTrials; ++trialNum)
00964     {
00965       // Factor the matrix and extract the resulting R factor
00966       typedef typename SequentialTsqr<Ordinal, Scalar>::FactorOutput 
00967         factor_output_type;
00968       factor_output_type factorOutput = 
00969         actor.factor (numRows, numCols, A_copy.get(), lda, 
00970           R.get(), R.lda(), contiguousCacheBlocks);
00971       // Compute the explicit Q factor.  Unlike with LAPACK QR,
00972       // this doesn't happen in place: the implicit Q factor is
00973       // stored in A_copy, and the explicit Q factor is written to
00974       // Q.
00975       actor.explicit_Q (numRows, numCols, A_copy.get(), lda, factorOutput, 
00976             numCols, Q.get(), ldq, contiguousCacheBlocks);
00977     }
00978   const double seqTsqrTiming = timer.stop();
00979   reportResults (numTrials, numRows, numCols, actor.cache_size_hint(),
00980            contiguousCacheBlocks, seqTsqrTiming, 
00981            additionalFieldNames, additionalData, printFieldNames);
00982       }
00983 
00984 
00985     private:
00990       TSQR::Random::NormalGenerator<ordinal_type, scalar_type> gen_;
00991       
00993       std::string scalarTypeName_;
00994 
00996       std::ostream& out_;
00997 
01001       bool humanReadable_;
01002 
01004       void 
01005       reportResults (const int numTrials,
01006          const Ordinal numRows,
01007          const Ordinal numCols,
01008          const size_t actualCacheSizeHint,
01009          const bool contiguousCacheBlocks,
01010          const double seqTsqrTiming,
01011          const std::string& additionalFieldNames,
01012          const std::string& additionalData,
01013          const bool printFieldNames)
01014       {
01015   using std::endl;
01016   if (humanReadable_)
01017     out_ << "Sequential (cache-blocked) TSQR:" << endl
01018          << "Scalar type = " << scalarTypeName_ << endl
01019          << "# rows = " << numRows << endl
01020          << "# columns = " << numCols << endl
01021          << "cache size hint in bytes = " << actualCacheSizeHint << endl
01022          << "contiguous cache blocks? " << contiguousCacheBlocks << endl
01023          << "# trials = " << numTrials << endl
01024          << "Total time (s) = " << seqTsqrTiming << endl 
01025          << endl;
01026   else
01027     {
01028       if (printFieldNames)
01029         {
01030     const char prefix[] = "%";
01031     out_ << prefix 
01032          << "method" 
01033          << ",scalarType"
01034          << ",numRows"
01035          << ",numCols"
01036          << ",cacheSizeHint"
01037          << ",contiguousCacheBlocks"
01038          << ",numTrials"
01039          << ",timing";
01040     if (! additionalFieldNames.empty())
01041       out_ << "," << additionalFieldNames;
01042     out_ << endl;
01043         }
01044       out_ << "SeqTSQR" 
01045      << "," << scalarTypeName_
01046      << "," << numRows
01047      << "," << numCols
01048      << "," << actualCacheSizeHint
01049      << "," << contiguousCacheBlocks
01050      << "," << numTrials 
01051      << "," << seqTsqrTiming;
01052       if (! additionalData.empty())
01053         out_ << "," << additionalData;
01054       out_ << endl;
01055     }
01056       }
01057     };
01058 
01059 
01060     void
01061     benchmarkSeqTsqr (std::ostream& out,
01062           const int numRows,
01063           const int numCols,
01064           const int numTrials,
01065           const size_t cacheSizeHint,
01066           const bool contiguousCacheBlocks,
01067           const bool testComplex,
01068           const std::string& additionalFieldNames,
01069           const std::string& additionalData,
01070           const bool printFieldNames,
01071           const bool humanReadable)
01072     {
01073       typedef Teuchos::Time timer_type;
01074       const bool testReal = true;
01075       using std::string;
01076 
01077       // Only print field names (if at all) for the first data type tested.
01078       bool printedFieldNames = false;
01079 
01080       if (testReal)
01081   {
01082     { // Scalar=float
01083       typedef SeqTsqrBenchmarker<int, float, timer_type> benchmark_type;
01084       string scalarTypeName ("float");
01085       benchmark_type widget (scalarTypeName, out, humanReadable);
01086       widget.benchmark (numTrials, numRows, numCols, cacheSizeHint, 
01087             contiguousCacheBlocks, 
01088             additionalFieldNames, additionalData,
01089             printFieldNames && ! printedFieldNames);
01090       if (printFieldNames && ! printedFieldNames)
01091         printedFieldNames = true;
01092     }
01093     { // Scalar=double
01094       typedef SeqTsqrBenchmarker< int, double, timer_type > benchmark_type;
01095       string scalarTypeName ("double");
01096       benchmark_type widget (scalarTypeName, out, humanReadable);
01097       widget.benchmark (numTrials, numRows, numCols, cacheSizeHint, 
01098             contiguousCacheBlocks, 
01099             additionalFieldNames, additionalData,
01100             printFieldNames && ! printedFieldNames);
01101       if (printFieldNames && ! printedFieldNames)
01102         printedFieldNames = true;
01103     }
01104   }
01105 
01106       if (testComplex)
01107   {
01108 #ifdef HAVE_TSQR_COMPLEX
01109     using std::complex;
01110     { // Scalar=complex<float>
01111       typedef SeqTsqrBenchmarker< int, complex<float>, timer_type > benchmark_type;
01112       string scalarTypeName ("complex<float>");
01113       benchmark_type widget (scalarTypeName, out, humanReadable);
01114       widget.benchmark (numTrials, numRows, numCols, cacheSizeHint, 
01115             contiguousCacheBlocks, 
01116             additionalFieldNames, additionalData,
01117             printFieldNames && ! printedFieldNames);
01118       if (printFieldNames && ! printedFieldNames)
01119         printedFieldNames = true;
01120     }
01121     { // Scalar=complex<double>
01122       typedef SeqTsqrBenchmarker< int, complex<double>, timer_type > benchmark_type;
01123       string scalarTypeName ("complex<double>");
01124       benchmark_type widget (scalarTypeName, out, humanReadable);
01125       widget.benchmark (numTrials, numRows, numCols, cacheSizeHint, 
01126             contiguousCacheBlocks, 
01127             additionalFieldNames, additionalData,
01128             printFieldNames && ! printedFieldNames);
01129       if (printFieldNames && ! printedFieldNames)
01130         printedFieldNames = true;
01131     }
01132 #else // Don't HAVE_TSQR_COMPLEX
01133     throw std::logic_error ("Trilinos was not built with "
01134           "complex arithmetic support");
01135 #endif // HAVE_TSQR_COMPLEX
01136   }
01137     }
01138 
01139 
01140 
01141   } // namespace Test
01142 } // namespace TSQR
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends