Kokkos Node API and Local Linear Algebra Kernels Version of the Day
Tsqr_SeqTest.cpp
00001 //@HEADER
00002 // ************************************************************************
00003 // 
00004 //          Kokkos: Node API and Parallel Node Kernels
00005 //              Copyright (2008) Sandia Corporation
00006 // 
00007 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
00008 // the U.S. Government retains certain rights in this software.
00009 // 
00010 // Redistribution and use in source and binary forms, with or without
00011 // modification, are permitted provided that the following conditions are
00012 // met:
00013 //
00014 // 1. Redistributions of source code must retain the above copyright
00015 // notice, this list of conditions and the following disclaimer.
00016 //
00017 // 2. Redistributions in binary form must reproduce the above copyright
00018 // notice, this list of conditions and the following disclaimer in the
00019 // documentation and/or other materials provided with the distribution.
00020 //
00021 // 3. Neither the name of the Corporation nor the names of the
00022 // contributors may be used to endorse or promote products derived from
00023 // this software without specific prior written permission.
00024 //
00025 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
00026 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00027 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00028 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
00029 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00030 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00031 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00032 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00033 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00034 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00035 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00036 //
00037 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
00038 // 
00039 // ************************************************************************
00040 //@HEADER
00041 
00042 #include <Tsqr_SeqTest.hpp>
00043 
00044 #include <Tsqr_Random_NormalGenerator.hpp>
00045 #include <Tsqr_nodeTestProblem.hpp>
00046 #include <Tsqr_verifyTimerConcept.hpp>
00047 #include <Teuchos_Time.hpp>
00048 
00049 #include <Tsqr_Blas.hpp>
00050 #include <Tsqr_Lapack.hpp>
00051 #include <Tsqr_LocalVerify.hpp>
00052 #include <Tsqr_Matrix.hpp>
00053 #include <Tsqr_ScalarTraits.hpp>
00054 #include <Tsqr_SequentialTsqr.hpp>
00055 #include <Tsqr_Util.hpp>
00056 
00057 #include <algorithm>
00058 #include <cstring> // size_t definition
00059 #include <fstream>
00060 #include <iomanip>
00061 #include <iostream>
00062 #include <limits>
00063 #include <sstream>
00064 #include <stdexcept>
00065 #include <vector>
00066 
00067 
00068 namespace TSQR {
00069   namespace Test {
00070 
00071     template<class Ordinal, class Scalar>
00072     static Ordinal
00073     lworkQueryLapackQr (LAPACK<Ordinal, Scalar>& lapack,
00074       const Ordinal nrows,
00075       const Ordinal ncols,
00076       const Ordinal lda)
00077     {
00078       typedef typename ScalarTraits<Scalar>::magnitude_type magnitude_type;
00079       using std::ostringstream;
00080       using std::endl;
00081 
00082       Scalar d_lwork_geqrf = Scalar (0);
00083       int INFO = 0;
00084       lapack.GEQRF (nrows, ncols, NULL, lda, NULL, &d_lwork_geqrf, -1, &INFO);
00085       if (INFO != 0)
00086   {
00087     ostringstream os;
00088     os << "LAPACK _GEQRF workspace size query failed: INFO = " << INFO;
00089     // It's a logic error and not a runtime error, because the
00090     // LWORK query should only fail if the input parameters have
00091     // invalid (e.g., out of range) values.
00092     throw std::logic_error (os.str());
00093   }
00094 
00095       Scalar d_lwork_orgqr = Scalar (0);
00096       // A workspace query appropriate for computing the explicit Q
00097       // factor (nrows x ncols) in place, from the QR factorization of
00098       // an nrows x ncols matrix with leading dimension lda.
00099       lapack.ORGQR (nrows, ncols, ncols, NULL, lda, NULL, &d_lwork_orgqr, -1, &INFO);
00100       if (INFO != 0)
00101   {
00102     ostringstream os;
00103     os << "LAPACK _ORGQR workspace size query failed: INFO = " << INFO;
00104     // It's a logic error and not a runtime error, because the
00105     // LWORK query should only fail if the input parameters have
00106     // invalid (e.g., out of range) values.
00107     throw std::logic_error (os.str());
00108   }
00109 
00110       // LAPACK workspace queries do return their results as a
00111       // double-precision floating-point value, but LAPACK promises
00112       // that that value will fit in an int.  Thus, we don't need to
00113       // check for valid casts to int below.  I include the checks
00114       // just to be "bulletproof" and also to show how to do the
00115       // checks for later reference.
00116       const magnitude_type lwork_geqrf_test = 
00117   static_cast< magnitude_type > (static_cast<Ordinal> (ScalarTraits<Scalar>::abs (d_lwork_geqrf)));
00118       if (lwork_geqrf_test != ScalarTraits<Scalar>::abs (d_lwork_geqrf))
00119   {
00120     ostringstream os;
00121     os << "LAPACK _GEQRF workspace query returned a result, " 
00122        << d_lwork_geqrf << ", bigger than the max Ordinal value, " 
00123        << std::numeric_limits<Ordinal>::max();
00124     throw std::range_error (os.str());
00125   }
00126       const Scalar lwork_orgqr_test = 
00127   static_cast< magnitude_type > (static_cast<Ordinal> (ScalarTraits<Scalar>::abs ((d_lwork_orgqr))));
00128       if (lwork_orgqr_test != ScalarTraits<Scalar>::abs (d_lwork_orgqr))
00129   {
00130     ostringstream os;
00131     os << "LAPACK _ORGQR workspace query returned a result, " 
00132        << d_lwork_orgqr << ", bigger than the max Ordinal value, " 
00133        << std::numeric_limits<Ordinal>::max();
00134     throw std::range_error (os.str());
00135   }
00136       return std::max (static_cast<Ordinal> (ScalarTraits<Scalar>::abs (d_lwork_geqrf)),
00137            static_cast<Ordinal> (ScalarTraits<Scalar>::abs (d_lwork_orgqr)));
00138     }
00139 
00143     template< class Ordinal, class Scalar >
00144     static void
00145     verifySeqTsqrTemplate (std::ostream& out,
00146          TSQR::Random::NormalGenerator< Ordinal, Scalar >& generator,
00147          const std::string& datatype,
00148          const std::string& shortDatatype,
00149          const Ordinal nrows, 
00150          const Ordinal ncols, 
00151          const size_t cache_size_hint,
00152          const bool contiguous_cache_blocks,
00153          const bool save_matrices,
00154          const std::string& additionalFieldNames,
00155          const std::string& additionalData,
00156          const bool printFieldNames,
00157          const bool human_readable,
00158          const bool b_debug)
00159     {
00160       typedef typename ScalarTraits<Scalar>::magnitude_type magnitude_type;
00161       using std::cerr;
00162       using std::endl;
00163       using std::pair;
00164       using std::string;
00165       using std::vector;
00166 
00167       SequentialTsqr< Ordinal, Scalar > actor (cache_size_hint);
00168       Ordinal numCacheBlocks;
00169 
00170       if (b_debug)
00171   {
00172     cerr << "Sequential TSQR test problem:" << endl
00173          << "* " << nrows << " x " << ncols << endl
00174          << "* Cache size hint of " << actor.cache_size_hint() << " bytes" << endl;
00175     if (contiguous_cache_blocks)
00176       cerr << "* Contiguous cache blocks" << endl;
00177   }
00178 
00179       Matrix< Ordinal, Scalar > A (nrows, ncols);
00180       Matrix< Ordinal, Scalar > A_copy (nrows, ncols);
00181       Matrix< Ordinal, Scalar > Q (nrows, ncols);
00182       Matrix< Ordinal, Scalar > R (ncols, ncols);
00183       if (std::numeric_limits<Scalar>::has_quiet_NaN)
00184   {
00185     A.fill (std::numeric_limits< Scalar>::quiet_NaN());
00186     A_copy.fill (std::numeric_limits<Scalar>::quiet_NaN());
00187     Q.fill (std::numeric_limits<Scalar>::quiet_NaN());
00188     R.fill (std::numeric_limits<Scalar>::quiet_NaN());
00189   }
00190       const Ordinal lda = nrows;
00191       const Ordinal ldq = nrows;
00192       const Ordinal ldr = ncols;
00193 
00194       // Create a test problem
00195       nodeTestProblem (generator, nrows, ncols, A.get(), A.lda(), true);
00196 
00197       if (save_matrices)
00198   {
00199     string filename = "A_" + shortDatatype + ".txt";
00200     if (b_debug)
00201       cerr << "-- Saving test problem to \"" << filename << "\"" << endl;
00202     std::ofstream fileOut (filename.c_str());
00203     print_local_matrix (fileOut, nrows, ncols, A.get(), A.lda());
00204     fileOut.close();
00205   }
00206 
00207       if (b_debug)
00208   cerr << "-- Generated test problem" << endl;
00209 
00210       // Copy A into A_copy, since TSQR overwrites the input.  If
00211       // specified, rearrange the data in A_copy so that the data in
00212       // each cache block is contiguously stored.   
00213       if (! contiguous_cache_blocks)
00214   {
00215     A_copy.copy (A);
00216     if (b_debug)
00217       cerr << "-- Copied test problem from A into A_copy" << endl;
00218   }
00219       else
00220   {
00221     actor.cache_block (nrows, ncols, A_copy.get(), A.get(), A.lda());
00222     if (b_debug)
00223       cerr << "-- Reorganized test matrix to have contiguous "
00224         "cache blocks" << endl;
00225 
00226     // Verify cache blocking, when in debug mode.
00227     if (b_debug)
00228       {
00229         Matrix< Ordinal, Scalar > A2 (nrows, ncols);
00230         if (std::numeric_limits<Scalar>::has_quiet_NaN)
00231     A2.fill (std::numeric_limits<Scalar>::quiet_NaN());
00232 
00233         actor.un_cache_block (nrows, ncols, A2.get(), A2.lda(), A_copy.get());
00234         if (A == A2)
00235     {
00236       if (b_debug)
00237         cerr << "-- Cache blocking test succeeded!" << endl;
00238     }
00239         else
00240     throw std::logic_error ("Cache blocking failed");
00241       }
00242   }
00243 
00244       // Fill R with zeros, since the factorization may not overwrite
00245       // the strict lower triangle of R.
00246       R.fill (Scalar(0));
00247 
00248       // Count the number of cache blocks that factor() will use. 
00249       // This is only for diagnostic purposes.
00250       numCacheBlocks = 
00251   actor.factor_num_cache_blocks (nrows, ncols, A_copy.get(), 
00252                A_copy.lda(), contiguous_cache_blocks);
00253       // In debug mode, report how many cache blocks factor() will use.
00254       if (b_debug)
00255   cerr << "-- Number of cache blocks factor() will use: " 
00256        << numCacheBlocks << endl << endl;
00257 
00258       // Factor the matrix and compute the explicit Q factor
00259       typedef typename SequentialTsqr< Ordinal, Scalar >::FactorOutput 
00260   factor_output_type;
00261       factor_output_type factorOutput = 
00262   actor.factor (nrows, ncols, A_copy.get(), A_copy.lda(), 
00263           R.get(), R.lda(), contiguous_cache_blocks);
00264       if (b_debug)
00265   cerr << "-- Finished SequentialTsqr::factor" << endl;
00266 
00267       if (save_matrices)
00268   {
00269     string filename = "R_" + shortDatatype + ".txt";
00270     if (b_debug)
00271       cerr << "-- Saving R factor to \"" << filename << "\"" << endl;
00272     std::ofstream fileOut (filename.c_str());
00273     print_local_matrix (fileOut, ncols, ncols, R.get(), R.lda());
00274     fileOut.close();
00275   }
00276 
00277       actor.explicit_Q (nrows, ncols, A_copy.get(), lda, factorOutput,
00278       ncols, Q.get(), Q.lda(), contiguous_cache_blocks);
00279       if (b_debug)
00280   cerr << "-- Finished SequentialTsqr::explicit_Q" << endl;
00281 
00282       // "Un"-cache-block the output, if contiguous cache blocks were
00283       // used.  This is only necessary because local_verify() doesn't
00284       // currently support contiguous cache blocks.
00285       if (contiguous_cache_blocks)
00286   {
00287     // Use A_copy as temporary storage for un-cache-blocking Q.
00288     actor.un_cache_block (nrows, ncols, A_copy.get(), A_copy.lda(), Q.get());
00289     Q.copy (A_copy);
00290     if (b_debug)
00291       cerr << "-- Un-cache-blocked output Q factor" << endl;
00292   }
00293 
00294       if (save_matrices)
00295   {
00296     string filename = "Q_" + shortDatatype + ".txt";
00297     if (b_debug)
00298       cerr << "-- Saving Q factor to \"" << filename << "\"" << endl;
00299     std::ofstream fileOut (filename.c_str());
00300     print_local_matrix (fileOut, nrows, ncols, Q.get(), Q.lda());
00301     fileOut.close();
00302   }
00303 
00304       // Print out the R factor
00305       if (false && b_debug)
00306   {
00307     cerr << endl << "-- R factor:" << endl;
00308     print_local_matrix (cerr, ncols, ncols, R.get(), R.lda());
00309     cerr << endl;
00310   }
00311 
00312       // Validate the factorization
00313       vector< magnitude_type > results =
00314   local_verify (nrows, ncols, A.get(), lda, Q.get(), ldq, R.get(), ldr);
00315       if (b_debug)
00316   cerr << "-- Finished local_verify" << endl;
00317 
00318       // Print the results
00319       if (human_readable)
00320   out << "Sequential cache-blocked TSQR:" << endl
00321       << "Scalar type: " << datatype << endl
00322       << "Matrix dimensions: " << nrows << " by " << ncols << endl
00323       << "Cache size hint in bytes: " << actor.cache_size_hint() << endl
00324       << "Number of cache blocks: " << numCacheBlocks << endl
00325       << "Contiguous cache blocks? " << contiguous_cache_blocks << endl
00326       << "Absolute residual $\\| A - QR \\|_F$: " << results[0] << endl
00327       << "Absolute orthogonality $\\| I - Q^* Q \\|_F$: " << results[1] << endl
00328       << "Test matrix norm $\\| A \\|_F$: " << results[2] << endl
00329       << endl << endl;
00330       else
00331   {
00332     if (printFieldNames)
00333       {
00334         const char prefix[] = "%";
00335         out << prefix
00336       << "method"
00337       << ",scalarType"
00338       << ",numRows"
00339       << ",numCols"
00340       << ",cacheSizeHint"
00341       << ",contiguousCacheBlocks"
00342       << ",absFrobResid"
00343       << ",absFrobOrthog"
00344       << ",frobA";
00345         if (! additionalFieldNames.empty())
00346     out << "," << additionalFieldNames;
00347         out << endl;
00348       }
00349     out << "SeqTSQR"
00350         << "," << datatype
00351         << "," << nrows
00352         << "," << ncols
00353         << "," << actor.cache_size_hint()
00354         << "," << contiguous_cache_blocks 
00355         << "," << results[0]
00356         << "," << results[1]
00357         << "," << results[2];
00358     if (! additionalData.empty())
00359       out << "," << additionalData;
00360     out << endl;
00361   }
00362     }
00363 
00364 
00365     void
00366     verifySeqTsqr (std::ostream& out,
00367        const int nrows, 
00368        const int ncols, 
00369        const size_t cache_size_hint,
00370        const bool test_complex_arithmetic,
00371        const bool save_matrices,
00372        const bool contiguous_cache_blocks,
00373        const std::string& additionalFieldNames,
00374        const std::string& additionalData,
00375        const bool printFieldNames,
00376        const bool human_readable,
00377        const bool b_debug)
00378     {
00379       using TSQR::Random::NormalGenerator;
00380 #ifdef HAVE_TSQR_COMPLEX
00381       using std::complex;
00382 #endif // HAVE_TSQR_COMPLEX
00383       using std::string;
00384       using std::vector;
00385 
00386       //
00387       // We do tests one after another, using the seed from the
00388       // previous test in the current test, so that the pseudorandom
00389       // streams used by the tests are independent.
00390       //
00391 
00392       // On output: Seed for the next pseudorandom number generator.
00393       vector< int > iseed(4);
00394       string datatype; // name of the current datatype being tested
00395       string shortDatatype; // one-letter version of datatype
00396 
00397       // First test.  The PRNG seeds itself with a default value.
00398       // This will be the same each time, so if you want
00399       // nondeterministic behavior, you should pick the seed values
00400       // yourself.  Only print field names (if at all) for the first
00401       // data type tested; field names are only printed if output is
00402       // not human_readable.
00403       NormalGenerator< int, float > normgenS;
00404       datatype = "float";
00405       shortDatatype = "S";
00406       verifySeqTsqrTemplate (out, normgenS, datatype, shortDatatype, nrows, ncols, 
00407            cache_size_hint, contiguous_cache_blocks, 
00408            save_matrices, additionalFieldNames, additionalData,
00409            printFieldNames, human_readable, b_debug);
00410       // Fetch the pseudorandom seed from the previous test.
00411       normgenS.getSeed (iseed);
00412       NormalGenerator< int, double > normgenD (iseed);
00413       // Next test.
00414       datatype = "double";
00415       shortDatatype = "D";
00416       verifySeqTsqrTemplate (out, normgenD, datatype, shortDatatype, nrows, ncols, 
00417            cache_size_hint, contiguous_cache_blocks, 
00418            save_matrices, additionalFieldNames, additionalData,
00419            printFieldNames, human_readable, b_debug);
00420 #ifdef HAVE_TSQR_COMPLEX
00421       if (test_complex_arithmetic)
00422   {
00423     normgenD.getSeed (iseed);
00424     NormalGenerator< int, complex<float> > normgenC (iseed);
00425     datatype = "complex<float>";
00426     shortDatatype = "C";
00427     verifySeqTsqrTemplate (out, normgenC, datatype, shortDatatype, nrows, ncols, 
00428          cache_size_hint, contiguous_cache_blocks, 
00429          save_matrices, additionalFieldNames, additionalData,
00430          printFieldNames, human_readable, b_debug);
00431     normgenC.getSeed (iseed);
00432     NormalGenerator< int, complex<double> > normgenZ (iseed);
00433     datatype = "complex<double>";
00434     shortDatatype = "Z";
00435     verifySeqTsqrTemplate (out, normgenZ, datatype, shortDatatype, nrows, ncols, 
00436          cache_size_hint, contiguous_cache_blocks, 
00437          save_matrices, additionalFieldNames, additionalData,
00438          printFieldNames, human_readable, b_debug);
00439   }
00440 #else // HAVE_TSQR_COMPLEX
00441       if (test_complex_arithmetic)
00442   throw std::logic_error ("Trilinos was not built with "
00443         "complex arithmetic support");
00444 #endif // HAVE_TSQR_COMPLEX
00445     }
00446 
00447 
00448 
00449     template< class Ordinal, class Scalar >
00450     static void
00451     verifyLapackTemplate (std::ostream& out,
00452         TSQR::Random::NormalGenerator< Ordinal, Scalar >& generator,
00453         const std::string& datatype,
00454         const Ordinal nrows, 
00455         const Ordinal ncols, 
00456         const std::string& additionalFieldNames,
00457         const std::string& additionalData,
00458         const bool printFieldNames,
00459         const bool human_readable,
00460         const bool b_debug)
00461     {
00462       typedef typename ScalarTraits<Scalar>::magnitude_type magnitude_type;
00463       using std::ostringstream;
00464       using std::cerr;
00465       using std::endl;
00466 
00467       // Initialize LAPACK.
00468       LAPACK< Ordinal, Scalar > lapack;
00469 
00470       if (b_debug)
00471   cerr << "LAPACK test problem:" << endl
00472        << "* " << nrows << " x " << ncols << endl;
00473 
00474       Matrix< Ordinal, Scalar > A (nrows, ncols);
00475       Matrix< Ordinal, Scalar > A_copy (nrows, ncols);
00476       Matrix< Ordinal, Scalar > Q (nrows, ncols);
00477       Matrix< Ordinal, Scalar > R (ncols, ncols);
00478       if (std::numeric_limits<Scalar>::has_quiet_NaN)
00479   {
00480     A.fill (std::numeric_limits< Scalar>::quiet_NaN());
00481     A_copy.fill (std::numeric_limits<Scalar>::quiet_NaN());
00482     Q.fill (std::numeric_limits<Scalar>::quiet_NaN());
00483     R.fill (std::numeric_limits<Scalar>::quiet_NaN());
00484   }
00485       const Ordinal lda = nrows;
00486       const Ordinal ldq = nrows;
00487       const Ordinal ldr = ncols;
00488 
00489       // Create a test problem
00490       nodeTestProblem (generator, nrows, ncols, A.get(), A.lda(), true);
00491 
00492       if (b_debug)
00493   cerr << "-- Generated test problem" << endl;
00494 
00495       // Copy A into A_copy, since LAPACK QR overwrites the input.
00496       A_copy.copy (A);
00497       if (b_debug)
00498   cerr << "-- Copied test problem from A into A_copy" << endl;
00499 
00500       // Now determine the required workspace for the factorization.
00501       const Ordinal lwork = lworkQueryLapackQr (lapack, nrows, ncols, A_copy.lda());
00502       std::vector<Scalar> work (lwork);
00503       std::vector<Scalar> tau (ncols);
00504 
00505       // Fill R with zeros, since the factorization may not overwrite
00506       // the strict lower triangle of R.
00507       R.fill (Scalar(0));
00508 
00509       // Compute the QR factorization
00510       int info = 0; // INFO is always an int
00511       lapack.GEQRF (nrows, ncols, A_copy.get(), A_copy.lda(), 
00512         &tau[0], &work[0], lwork, &info);
00513       if (info != 0)
00514   {
00515     ostringstream os;
00516     os << "LAPACK QR factorization (_GEQRF) failed: INFO = " << info;
00517     throw std::runtime_error (os.str());
00518   }
00519 
00520       // Copy out the R factor from A_copy (where we computed the QR
00521       // factorization in place) into R.
00522       copy_upper_triangle (ncols, ncols, R.get(), ldr, A_copy.get(), lda);
00523 
00524       if (b_debug)
00525   {
00526     cerr << endl << "-- R factor:" << endl;
00527     print_local_matrix (cerr, ncols, ncols, R.get(), R.lda());
00528     cerr << endl;
00529   }
00530 
00531       // The explicit Q factor will be computed in place, so copy the
00532       // result of the factorization into Q.
00533       Q.copy (A_copy);
00534 
00535       // Compute the explicit Q factor
00536       lapack.ORGQR (nrows, ncols, ncols, Q.get(), ldq, &tau[0], &work[0], lwork, &info);
00537       if (info != 0)
00538   {
00539     ostringstream os;
00540     os << "LAPACK explicit Q computation (_ORGQR) failed: INFO = " << info;
00541     throw std::runtime_error (os.str());
00542   }
00543   
00544       // Validate the factorization
00545       std::vector< magnitude_type > results = 
00546   local_verify (nrows, ncols, A.get(), lda, Q.get(), ldq, R.get(), ldr);
00547 
00548       // Print the results
00549       if (human_readable)
00550   out << "LAPACK QR (DGEQRF and DORGQR):" << endl
00551       << "Scalar type: " << datatype << endl
00552       << "Absolute residual $\\| A - QR \\|_F$: " << results[0] << endl
00553       << "Absolute orthogonality $\\| I - Q^* Q \\|_F$: " << results[1] << endl
00554       << "Test matrix norm $\\| A \\|_F$: " << results[2] << endl
00555       << endl << endl;
00556       else
00557   {
00558     if (printFieldNames)
00559       {
00560         const char prefix[] = "%";
00561         out << prefix
00562       << "method"
00563       << ",scalarType"
00564       << ",numRows"
00565       << ",numCols"
00566       << ",cacheSizeHint"
00567       << ",contiguousCacheBlocks"
00568       << ",absFrobResid"
00569       << ",absFrobOrthog"
00570       << ",frobA";
00571         if (! additionalFieldNames.empty())
00572     out << "," << additionalFieldNames;
00573         out << endl;
00574       }
00575     out << "LAPACK"
00576         << "," << datatype
00577         << "," << nrows
00578         << "," << ncols
00579         << "," << size_t(0) // cache_size_hint
00580         << "," << false     // contiguous_cache_blocks 
00581         << "," << results[0]
00582         << "," << results[1]
00583         << "," << results[2];
00584     if (! additionalData.empty())
00585       out << "," << additionalData;
00586     out << endl;
00587   }
00588     }
00589 
00590 
00591     void
00592     verifyLapack (std::ostream& out,
00593       const int nrows, 
00594       const int ncols, 
00595       const bool test_complex_arithmetic,
00596       const std::string& additionalFieldNames,
00597       const std::string& additionalData,
00598       const bool printFieldNames,
00599       const bool human_readable,
00600       const bool b_debug)
00601     {
00602       using TSQR::Random::NormalGenerator;
00603 #ifdef HAVE_TSQR_COMPLEX
00604       using std::complex;
00605 #endif // HAVE_TSQR_COMPLEX
00606       using std::string;
00607       using std::vector;
00608 
00609       //
00610       // We do tests one after another, using the seed from the
00611       // previous test in the current test, so that the pseudorandom
00612       // streams used by the tests are independent.
00613       //
00614 
00615       // On output: Seed for the next pseudorandom number generator.
00616       vector< int > iseed(4);
00617       string datatype; // name of the current datatype being tested
00618 
00619       // First test.  The PRNG seeds itself with a default value.
00620       // This will be the same each time, so if you want
00621       // nondeterministic behavior, you should pick the seed values
00622       // yourself.
00623       NormalGenerator< int, float > normgenS;
00624       datatype = "float";
00625       verifyLapackTemplate (out, normgenS, datatype, nrows, ncols, 
00626           additionalFieldNames, additionalData,
00627           printFieldNames, human_readable, b_debug);
00628       // Fetch the pseudorandom seed from the previous test.
00629       normgenS.getSeed (iseed);
00630       NormalGenerator< int, double > normgenD (iseed);
00631       // Next test.
00632       datatype = "double";
00633       verifyLapackTemplate (out, normgenD, datatype, nrows, ncols, 
00634           additionalFieldNames, additionalData,
00635           false, human_readable, b_debug);
00636 #ifdef HAVE_TSQR_COMPLEX
00637       if (test_complex_arithmetic)
00638   {
00639     normgenD.getSeed (iseed);
00640     NormalGenerator< int, complex<float> > normgenC (iseed);
00641     datatype = "complex<float>";
00642     verifyLapackTemplate (out, normgenC, datatype, nrows, ncols,
00643         additionalFieldNames, additionalData, 
00644         false, human_readable, b_debug);
00645     normgenC.getSeed (iseed);
00646     NormalGenerator< int, complex<double> > normgenZ (iseed);
00647     datatype = "complex<double>";
00648     verifyLapackTemplate (out, normgenZ, datatype, nrows, ncols, 
00649         additionalFieldNames, additionalData,
00650         false, human_readable, b_debug);
00651   }
00652 #else // HAVE_TSQR_COMPLEX
00653       if (test_complex_arithmetic)
00654   throw std::logic_error ("Trilinos was not built with "
00655         "complex arithmetic support");
00656 #endif // HAVE_TSQR_COMPLEX
00657     }
00658 
00664     template< class Ordinal, class Scalar, class TimerType >
00665     class LapackBenchmarker {
00666     public:
00667       typedef Ordinal ordinal_type;
00668       typedef Scalar scalar_type;
00669 
00680       LapackBenchmarker (const std::string& scalarTypeName,
00681        std::ostream& out = std::cout,
00682        const bool humanReadable = false) :
00683   scalarTypeName_ (scalarTypeName),
00684   out_ (out), 
00685   humanReadable_ (humanReadable)
00686       {
00687   TSQR::Test::verifyTimerConcept< TimerType >();
00688       }
00689 
00690       void 
00691       benchmark (const int numTrials,
00692      const Ordinal numRows,
00693      const Ordinal numCols,
00694      const std::string& additionalFieldNames, 
00695      const std::string& additionalData,
00696      const bool printFieldNames)
00697       {
00698   Matrix< Ordinal, Scalar > A (numRows, numCols);
00699   Matrix< Ordinal, Scalar > Q (numRows, numCols);
00700   Matrix< Ordinal, Scalar > R (numCols, numCols);
00701   const Ordinal lda = numRows;
00702   const Ordinal ldq = numRows;
00703   const Ordinal ldr = numCols;
00704 
00705   // Create a test problem
00706   nodeTestProblem (gen_, numRows, numCols, A.get(), lda, false);
00707 
00708   // Copy A into Q, since LAPACK QR overwrites the input.  We only
00709   // need Q because LAPACK's computation of the explicit Q factor
00710   // occurs in place.  This doesn't work with TSQR.  To give
00711   // LAPACK QR the fullest possible advantage over TSQR, we don't
00712   // allocate an A_copy here (as we would when benchmarking TSQR).
00713   Q.copy (A);
00714 
00715   // Determine the required workspace for the factorization
00716   const Ordinal lwork = lworkQueryLapackQr (lapack_, numRows, numCols, lda);
00717   std::vector<Scalar> work (lwork);
00718   std::vector<Scalar> tau (numCols);
00719 
00720   // Benchmark LAPACK's QR factorization for numTrials trials.
00721   //
00722   // Name of timer doesn't matter here; we only need the timing.
00723   TimerType timer("LAPACK");
00724   timer.start();
00725   for (int trialNum = 0; trialNum < numTrials; ++trialNum)
00726     {
00727       // Compute the QR factorization
00728       int info = 0; // INFO is always an int
00729       lapack_.GEQRF (numRows, numCols, Q.get(), ldq, &tau[0], &work[0], lwork, &info);
00730       if (info != 0)
00731         {
00732     std::ostringstream os;
00733     os << "LAPACK QR factorization (_GEQRF) failed: INFO = " << info;
00734     throw std::runtime_error (os.str());
00735         }
00736 
00737       // Extract the upper triangular factor R from Q (where it
00738       // was computed in place by GEQRF), since ORGQR will
00739       // overwrite all of Q with the explicit Q factor.
00740       copy_upper_triangle (numRows, numCols, R.get(), ldr, Q.get(), ldq);
00741 
00742       // Compute the explicit Q factor
00743       lapack_.ORGQR (numRows, numCols, numCols, Q.get(), ldq,
00744         &tau[0], &work[0], lwork, &info);
00745       if (info != 0)
00746         {
00747     std::ostringstream os;
00748     os << "LAPACK explicit Q computation (_ORGQR) failed: INFO = " << info;
00749     throw std::runtime_error (os.str());
00750         }
00751     }
00752   const double lapackTiming = timer.stop();
00753   reportResults (numTrials, numRows, numCols, lapackTiming, 
00754            additionalFieldNames, additionalData, printFieldNames);
00755       }
00756 
00757 
00758     private:
00760       TSQR::LAPACK< Ordinal, Scalar > lapack_;
00761       
00766       TSQR::Random::NormalGenerator< ordinal_type, scalar_type > gen_;
00767       
00769       std::string scalarTypeName_;
00770 
00772       std::ostream& out_;
00773 
00777       bool humanReadable_;
00778 
00780       void 
00781       reportResults (const int numTrials,
00782          const Ordinal numRows,
00783          const Ordinal numCols,
00784          const double lapackTiming,
00785          const std::string& additionalFieldNames, 
00786          const std::string& additionalData,
00787          const bool printFieldNames)
00788       {
00789   using std::endl;
00790   if (humanReadable_)
00791     out_ << "LAPACK\'s QR factorization (_GEQRF + _ORGQR):" << endl
00792          << "Scalar type = " << scalarTypeName_ << endl
00793          << "# rows = " << numRows << endl
00794          << "# columns = " << numCols << endl
00795          << "# trials = " << numTrials << endl
00796          << "Total time (s) = " << lapackTiming << endl 
00797          << endl;
00798   else
00799     {
00800       if (printFieldNames)
00801         {
00802     const char prefix[] = "%";
00803     out_ << prefix 
00804          << "method" 
00805          << ",scalarType"
00806          << ",numRows"
00807          << ",numCols"
00808          << ",cacheSizeHint"
00809          << ",contiguousCacheBlocks"
00810          << ",numTrials"
00811          << ",timing";
00812     if (! additionalFieldNames.empty())
00813       out_ << "," << additionalFieldNames;
00814     out_ << endl;
00815         }
00816       // "0" refers to the cache size hint, which is not
00817       // applicable in this case; we retain it for easy
00818       // comparison of results with SequentialTsqr (so that the
00819       // number of fields is the same in both cases).  "false"
00820       // (that follows 0) refers to whether or not contiguous
00821       // cache blocks were used (see TSQR::SequentialTsqr); this
00822       // is also not applicable in this case.
00823       out_ << "LAPACK" 
00824      << "," << scalarTypeName_
00825      << "," << numRows
00826      << "," << numCols
00827      << "," << 0
00828      << "," << false
00829      << "," << numTrials 
00830      << "," << lapackTiming;
00831       if (! additionalData.empty())
00832         out_ << "," << additionalData;
00833       out_ << endl;
00834     }
00835       }
00836     };
00837 
00838 
00839     void
00840     benchmarkLapack (std::ostream& out,
00841          const int numRows,
00842          const int numCols,
00843          const int numTrials,
00844          const bool testComplex,
00845          const std::string& additionalFieldNames,
00846          const std::string& additionalData,
00847          const bool printFieldNames,
00848          const bool humanReadable)
00849     {
00850       typedef Teuchos::Time timer_type;
00851       const bool testReal = true;
00852       using std::string;
00853 
00854       // Only print field names (if at all) for the first data type tested.
00855       bool printedFieldNames = false;
00856 
00857       if (testReal)
00858   {
00859     { // Scalar=float
00860       typedef LapackBenchmarker< int, float, timer_type > benchmark_type;
00861       string scalarTypeName ("float");
00862       benchmark_type widget (scalarTypeName, out, humanReadable);
00863       widget.benchmark (numTrials, numRows, numCols, 
00864             additionalFieldNames, additionalData,
00865             printFieldNames && ! printedFieldNames);
00866       if (printFieldNames && ! printedFieldNames)
00867         printedFieldNames = true;
00868     }
00869     { // Scalar=double
00870       typedef LapackBenchmarker< int, double, timer_type > benchmark_type;
00871       string scalarTypeName ("double");
00872       benchmark_type widget (scalarTypeName, out, humanReadable);
00873       widget.benchmark (numTrials, numRows, numCols,
00874             additionalFieldNames, additionalData,
00875             printFieldNames && ! printedFieldNames);
00876       if (printFieldNames && ! printedFieldNames)
00877         printedFieldNames = true;
00878     }
00879   }
00880 
00881       if (testComplex)
00882   {
00883 #ifdef HAVE_TSQR_COMPLEX
00884     using std::complex;
00885     { // Scalar=complex<float>
00886       typedef LapackBenchmarker< int, complex<float>, timer_type > benchmark_type;
00887       string scalarTypeName ("complex<float>");
00888       benchmark_type widget (scalarTypeName, out, humanReadable);
00889       widget.benchmark (numTrials, numRows, numCols,
00890             additionalFieldNames, additionalData,
00891             printFieldNames && ! printedFieldNames);
00892       if (printFieldNames && ! printedFieldNames)
00893         printedFieldNames = true;
00894     }
00895     { // Scalar=complex<double>
00896       typedef LapackBenchmarker<int, complex<double>, timer_type> benchmark_type;
00897       string scalarTypeName ("complex<double>");
00898       benchmark_type widget (scalarTypeName, out, humanReadable);
00899       widget.benchmark (numTrials, numRows, numCols,
00900             additionalFieldNames, additionalData,
00901             printFieldNames && ! printedFieldNames);
00902       if (printFieldNames && ! printedFieldNames)
00903         printedFieldNames = true;
00904     }
00905 #else // Don't HAVE_TSQR_COMPLEX
00906     throw std::logic_error ("Trilinos was not built with "
00907           "complex arithmetic support");
00908 #endif // HAVE_TSQR_COMPLEX
00909   }
00910     }
00911 
00912 
00913 
00919     template<class Ordinal, class Scalar, class TimerType>
00920     class SeqTsqrBenchmarker {
00921     public:
00922       typedef Ordinal ordinal_type;
00923       typedef Scalar scalar_type;
00924 
00935       SeqTsqrBenchmarker (const std::string& scalarTypeName,
00936         std::ostream& out = std::cout,
00937         const bool humanReadable = false) : 
00938   scalarTypeName_ (scalarTypeName),
00939   out_ (out), 
00940   humanReadable_ (humanReadable)
00941       {
00942   // Make sure that TimerType satisfies the required interface.
00943   TSQR::Test::verifyTimerConcept<TimerType>();
00944       }
00945 
00946       void 
00947       benchmark (const int numTrials,
00948      const Ordinal numRows,
00949      const Ordinal numCols,
00950      const size_t cacheSizeHint,
00951      const bool contiguousCacheBlocks,
00952      const std::string& additionalFieldNames,
00953      const std::string& additionalData,
00954      const bool printFieldNames)
00955       {
00956   SequentialTsqr<Ordinal, Scalar> actor (cacheSizeHint);
00957 
00958   Matrix<Ordinal, Scalar> A (numRows, numCols);
00959   Matrix<Ordinal, Scalar> A_copy (numRows, numCols);
00960   Matrix<Ordinal, Scalar> Q (numRows, numCols);
00961   Matrix<Ordinal, Scalar> R (numCols, numCols);
00962   const Ordinal lda = numRows;
00963   const Ordinal ldq = numRows;
00964 
00965   // Create a test problem
00966   nodeTestProblem (gen_, numRows, numCols, A.get(), lda, false);
00967 
00968   // Copy A into A_copy, since TSQR overwrites the input
00969   A_copy.copy (A);
00970 
00971   // Benchmark sequential TSQR for numTrials trials.
00972   //
00973   // Name of timer doesn't matter here; we only need the timing.
00974   TimerType timer("SeqTSQR");
00975   timer.start();
00976   for (int trialNum = 0; trialNum < numTrials; ++trialNum)
00977     {
00978       // Factor the matrix and extract the resulting R factor
00979       typedef typename SequentialTsqr<Ordinal, Scalar>::FactorOutput 
00980         factor_output_type;
00981       factor_output_type factorOutput = 
00982         actor.factor (numRows, numCols, A_copy.get(), lda, 
00983           R.get(), R.lda(), contiguousCacheBlocks);
00984       // Compute the explicit Q factor.  Unlike with LAPACK QR,
00985       // this doesn't happen in place: the implicit Q factor is
00986       // stored in A_copy, and the explicit Q factor is written to
00987       // Q.
00988       actor.explicit_Q (numRows, numCols, A_copy.get(), lda, factorOutput, 
00989             numCols, Q.get(), ldq, contiguousCacheBlocks);
00990     }
00991   const double seqTsqrTiming = timer.stop();
00992   reportResults (numTrials, numRows, numCols, actor.cache_size_hint(),
00993            contiguousCacheBlocks, seqTsqrTiming, 
00994            additionalFieldNames, additionalData, printFieldNames);
00995       }
00996 
00997 
00998     private:
01003       TSQR::Random::NormalGenerator<ordinal_type, scalar_type> gen_;
01004       
01006       std::string scalarTypeName_;
01007 
01009       std::ostream& out_;
01010 
01014       bool humanReadable_;
01015 
01017       void 
01018       reportResults (const int numTrials,
01019          const Ordinal numRows,
01020          const Ordinal numCols,
01021          const size_t actualCacheSizeHint,
01022          const bool contiguousCacheBlocks,
01023          const double seqTsqrTiming,
01024          const std::string& additionalFieldNames,
01025          const std::string& additionalData,
01026          const bool printFieldNames)
01027       {
01028   using std::endl;
01029   if (humanReadable_)
01030     out_ << "Sequential (cache-blocked) TSQR:" << endl
01031          << "Scalar type = " << scalarTypeName_ << endl
01032          << "# rows = " << numRows << endl
01033          << "# columns = " << numCols << endl
01034          << "cache size hint in bytes = " << actualCacheSizeHint << endl
01035          << "contiguous cache blocks? " << contiguousCacheBlocks << endl
01036          << "# trials = " << numTrials << endl
01037          << "Total time (s) = " << seqTsqrTiming << endl 
01038          << endl;
01039   else
01040     {
01041       if (printFieldNames)
01042         {
01043     const char prefix[] = "%";
01044     out_ << prefix 
01045          << "method" 
01046          << ",scalarType"
01047          << ",numRows"
01048          << ",numCols"
01049          << ",cacheSizeHint"
01050          << ",contiguousCacheBlocks"
01051          << ",numTrials"
01052          << ",timing";
01053     if (! additionalFieldNames.empty())
01054       out_ << "," << additionalFieldNames;
01055     out_ << endl;
01056         }
01057       out_ << "SeqTSQR" 
01058      << "," << scalarTypeName_
01059      << "," << numRows
01060      << "," << numCols
01061      << "," << actualCacheSizeHint
01062      << "," << contiguousCacheBlocks
01063      << "," << numTrials 
01064      << "," << seqTsqrTiming;
01065       if (! additionalData.empty())
01066         out_ << "," << additionalData;
01067       out_ << endl;
01068     }
01069       }
01070     };
01071 
01072 
01073     void
01074     benchmarkSeqTsqr (std::ostream& out,
01075           const int numRows,
01076           const int numCols,
01077           const int numTrials,
01078           const size_t cacheSizeHint,
01079           const bool contiguousCacheBlocks,
01080           const bool testComplex,
01081           const std::string& additionalFieldNames,
01082           const std::string& additionalData,
01083           const bool printFieldNames,
01084           const bool humanReadable)
01085     {
01086       typedef Teuchos::Time timer_type;
01087       const bool testReal = true;
01088       using std::string;
01089 
01090       // Only print field names (if at all) for the first data type tested.
01091       bool printedFieldNames = false;
01092 
01093       if (testReal)
01094   {
01095     { // Scalar=float
01096       typedef SeqTsqrBenchmarker<int, float, timer_type> benchmark_type;
01097       string scalarTypeName ("float");
01098       benchmark_type widget (scalarTypeName, out, humanReadable);
01099       widget.benchmark (numTrials, numRows, numCols, cacheSizeHint, 
01100             contiguousCacheBlocks, 
01101             additionalFieldNames, additionalData,
01102             printFieldNames && ! printedFieldNames);
01103       if (printFieldNames && ! printedFieldNames)
01104         printedFieldNames = true;
01105     }
01106     { // Scalar=double
01107       typedef SeqTsqrBenchmarker< int, double, timer_type > benchmark_type;
01108       string scalarTypeName ("double");
01109       benchmark_type widget (scalarTypeName, out, humanReadable);
01110       widget.benchmark (numTrials, numRows, numCols, cacheSizeHint, 
01111             contiguousCacheBlocks, 
01112             additionalFieldNames, additionalData,
01113             printFieldNames && ! printedFieldNames);
01114       if (printFieldNames && ! printedFieldNames)
01115         printedFieldNames = true;
01116     }
01117   }
01118 
01119       if (testComplex)
01120   {
01121 #ifdef HAVE_TSQR_COMPLEX
01122     using std::complex;
01123     { // Scalar=complex<float>
01124       typedef SeqTsqrBenchmarker< int, complex<float>, timer_type > benchmark_type;
01125       string scalarTypeName ("complex<float>");
01126       benchmark_type widget (scalarTypeName, out, humanReadable);
01127       widget.benchmark (numTrials, numRows, numCols, cacheSizeHint, 
01128             contiguousCacheBlocks, 
01129             additionalFieldNames, additionalData,
01130             printFieldNames && ! printedFieldNames);
01131       if (printFieldNames && ! printedFieldNames)
01132         printedFieldNames = true;
01133     }
01134     { // Scalar=complex<double>
01135       typedef SeqTsqrBenchmarker< int, complex<double>, timer_type > benchmark_type;
01136       string scalarTypeName ("complex<double>");
01137       benchmark_type widget (scalarTypeName, out, humanReadable);
01138       widget.benchmark (numTrials, numRows, numCols, cacheSizeHint, 
01139             contiguousCacheBlocks, 
01140             additionalFieldNames, additionalData,
01141             printFieldNames && ! printedFieldNames);
01142       if (printFieldNames && ! printedFieldNames)
01143         printedFieldNames = true;
01144     }
01145 #else // Don't HAVE_TSQR_COMPLEX
01146     throw std::logic_error ("Trilinos was not built with "
01147           "complex arithmetic support");
01148 #endif // HAVE_TSQR_COMPLEX
01149   }
01150     }
01151 
01152 
01153 
01154   } // namespace Test
01155 } // namespace TSQR
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends