Anasazi Version of the Day
Tsqr_SequentialTsqr.hpp
00001 // @HEADER
00002 // ***********************************************************************
00003 //
00004 //                 Anasazi: Block Eigensolvers Package
00005 //                 Copyright (2010) Sandia Corporation
00006 //
00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
00008 // license for use of this work by or on behalf of the U.S. Government.
00009 //
00010 // This library is free software; you can redistribute it and/or modify
00011 // it under the terms of the GNU Lesser General Public License as
00012 // published by the Free Software Foundation; either version 2.1 of the
00013 // License, or (at your option) any later version.
00014 //
00015 // This library is distributed in the hope that it will be useful, but
00016 // WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 // Lesser General Public License for more details.
00019 //
00020 // You should have received a copy of the GNU Lesser General Public
00021 // License along with this library; if not, write to the Free Software
00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00023 // USA
00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
00025 //
00026 // ***********************************************************************
00027 // @HEADER
00028 
00029 #ifndef __TSQR_Tsqr_SequentialTsqr_hpp
00030 #define __TSQR_Tsqr_SequentialTsqr_hpp
00031 
00032 #include <Tsqr_ApplyType.hpp>
00033 #include <Tsqr_Matrix.hpp>
00034 #include <Tsqr_CacheBlockingStrategy.hpp>
00035 #include <Tsqr_CacheBlocker.hpp>
00036 #include <Tsqr_LocalVerify.hpp>
00037 #include <Tsqr_ScalarTraits.hpp>
00038 #include <Tsqr_Combine.hpp>
00039 #include <Tsqr_Util.hpp>
00040 
00041 #include <algorithm>
00042 #include <limits>
00043 #include <sstream>
00044 #include <string>
00045 #include <utility> // std::pair
00046 #include <vector>
00047 
00048 // #define TSQR_SEQ_TSQR_EXTRA_DEBUG 1
00049 
00050 #ifdef TSQR_SEQ_TSQR_EXTRA_DEBUG
00051 #  include <iostream>
00052 using std::cerr;
00053 using std::endl;
00054 
template< class MatrixView >
void view_print (const MatrixView& view) {
  // Debug helper: dump a view's dimensions and stride as "nrows, ncols, lda".
  std::cerr << view.nrows() << ", " << view.ncols() << ", " << view.lda();
}
00059 #endif // TSQR_SEQ_TSQR_EXTRA_DEBUG
00060 
00063 
00064 namespace TSQR {
00065 
00066   template< class LocalOrdinal, class Scalar >
00067   class SequentialTsqr {
00068   private:
00069     typedef typename std::vector< std::vector< Scalar > >::const_iterator FactorOutputIter;
00070     typedef typename std::vector< std::vector< Scalar > >::const_reverse_iterator FactorOutputReverseIter;
00071     typedef MatView< LocalOrdinal, Scalar > mat_view;
00072     typedef ConstMatView< LocalOrdinal, Scalar > const_mat_view;
00073     typedef std::pair< mat_view, mat_view > block_pair_type;
00074     typedef std::pair< const_mat_view, const_mat_view > const_block_pair_type;
00075 
00084     mat_view
00085     factor_first_block (Combine< LocalOrdinal, Scalar >& combine,
00086       mat_view& A_top,
00087       std::vector< Scalar >& tau,
00088       std::vector< Scalar >& work)
00089     {
00090       const LocalOrdinal ncols = A_top.ncols();
00091       combine.factor_first (A_top.nrows(), ncols, A_top.get(), A_top.lda(), 
00092           &tau[0], &work[0]);
00093       return mat_view(ncols, ncols, A_top.get(), A_top.lda());
00094     }
00095 
00100     void 
00101     apply_first_block (Combine< LocalOrdinal, Scalar >& combine,
00102            const ApplyType& applyType,
00103            const const_mat_view& Q_first,
00104            const std::vector< Scalar >& tau,
00105            mat_view& C_first,
00106            std::vector< Scalar >& work)
00107     {
00108       const LocalOrdinal nrowsLocal = Q_first.nrows();
00109       combine.apply_first (applyType, nrowsLocal, C_first.ncols(), 
00110          Q_first.ncols(), Q_first.get(), Q_first.lda(),
00111          &tau[0], C_first.get(), C_first.lda(), &work[0]);
00112     }
00113 
00114     void
00115     combine_apply (Combine< LocalOrdinal, Scalar >& combine,
00116        const ApplyType& apply_type,
00117        const const_mat_view& Q_cur,
00118        const std::vector< Scalar >& tau,
00119        mat_view& C_top,
00120        mat_view& C_cur,
00121        std::vector< Scalar >& work)
00122     {
00123       const LocalOrdinal nrows_local = Q_cur.nrows();
00124       const LocalOrdinal ncols_Q = Q_cur.ncols();
00125       const LocalOrdinal ncols_C = C_cur.ncols();
00126 
00127       combine.apply_inner (apply_type, 
00128          nrows_local, ncols_C, ncols_Q, 
00129          Q_cur.get(), C_cur.lda(), &tau[0],
00130          C_top.get(), C_top.lda(),
00131          C_cur.get(), C_cur.lda(), &work[0]);
00132     }
00133 
00134     void
00135     combine_factor (Combine< LocalOrdinal, Scalar >& combine,
00136         mat_view& R,
00137         mat_view& A_cur,
00138         std::vector< Scalar >& tau,
00139         std::vector< Scalar >& work)
00140     {
00141       const LocalOrdinal nrows_local = A_cur.nrows();
00142       const LocalOrdinal ncols = A_cur.ncols();
00143 
00144       combine.factor_inner (nrows_local, ncols, R.get(), R.lda(), 
00145           A_cur.get(), A_cur.lda(), &tau[0], 
00146           &work[0]);
00147     }
00148 
00149   public:
00150     typedef Scalar scalar_type;
00151     typedef typename ScalarTraits< Scalar >::magnitude_type magnitude_type;
00152     typedef LocalOrdinal ordinal_type;
00153     typedef std::vector< std::vector< Scalar > > FactorOutput;
00154 
00160     SequentialTsqr (const size_t cacheBlockSize = 0) :
00161       strategy_ (cacheBlockSize) 
00162     {}
00163 
00166     bool QR_produces_R_factor_with_nonnegative_diagonal () const {
00167       typedef Combine< LocalOrdinal, Scalar > combine_type;
00168       return combine_type::QR_produces_R_factor_with_nonnegative_diagonal();
00169     }
00170 
00172     size_t
00173     cache_block_size () const { return strategy_.cache_block_size(); }
00174 
00184     FactorOutput
00185     factor (const LocalOrdinal nrows,
00186       const LocalOrdinal ncols,
00187       Scalar A[],
00188       const LocalOrdinal lda, 
00189       const bool contiguous_cache_blocks = false)
00190     {
00191       CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_);
00192       Combine< LocalOrdinal, Scalar > combine;
00193       std::vector< Scalar > work (ncols);
00194       FactorOutput tau_arrays;
00195 
00196       // We say "A_rest" because it points to the remaining part of
00197       // the matrix left to factor; at the beginning, the "remaining"
00198       // part is the whole matrix, but that will change as the
00199       // algorithm progresses.
00200       //
00201       // Note: if the cache blocks are stored contiguously, lda won't
00202       // be the correct leading dimension of A, but it won't matter:
00203       // we only ever operate on A_cur here, and A_cur's leading
00204       // dimension is set correctly by A_rest.split_top().
00205       mat_view A_rest (nrows, ncols, A, lda);
00206       // This call modifies A_rest.
00207       mat_view A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks);
00208 
00209       // Factor the topmost block of A.
00210       std::vector< Scalar > tau_first (ncols);
00211       mat_view R_view = factor_first_block (combine, A_cur, tau_first, work);
00212       tau_arrays.push_back (tau_first);
00213 
00214       while (! A_rest.empty())
00215   {
00216     A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks);
00217     std::vector< Scalar > tau (ncols);
00218     combine_factor (combine, R_view, A_cur, tau, work);
00219     tau_arrays.push_back (tau);
00220   }
00221       return tau_arrays;
00222     }
00223 
00225     void
00226     extract_R (const LocalOrdinal nrows,
00227          const LocalOrdinal ncols,
00228          const Scalar A[],
00229          const LocalOrdinal lda,
00230          Scalar R[],
00231          const LocalOrdinal ldr,
00232          const bool contiguous_cache_blocks = false)
00233     {
00234       const_mat_view A_view (nrows, ncols, A, lda);
00235 
00236       // Identify top cache block of A
00237       const_mat_view A_top = top_block (A_view, contiguous_cache_blocks);
00238 
00239       // Fill R (including lower triangle) with zeros.
00240       fill_matrix (ncols, ncols, R, ldr, Scalar(0));
00241 
00242       // Copy out the upper triangle of the R factor from A into R.
00243       copy_upper_triangle (ncols, ncols, R, ldr, A_top.get(), A_top.lda());
00244     }
00245 
00255     FactorOutput
00256     factor (const LocalOrdinal nrows,
00257       const LocalOrdinal ncols,
00258       Scalar A[],
00259       const LocalOrdinal lda, 
00260       Scalar R[],
00261       const LocalOrdinal ldr,
00262       const bool contiguous_cache_blocks = false)
00263     {
00264       CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_);
00265       Combine< LocalOrdinal, Scalar > combine;
00266       std::vector< Scalar > work (ncols);
00267       FactorOutput tau_arrays;
00268 
00269       // We say "A_rest" because it points to the remaining part of
00270       // the matrix left to factor; at the beginning, the "remaining"
00271       // part is the whole matrix, but that will change as the
00272       // algorithm progresses.
00273       //
00274       // Note: if the cache blocks are stored contiguously, lda won't
00275       // be the correct leading dimension of A, but it won't matter:
00276       // we only ever operate on A_cur here, and A_cur's leading
00277       // dimension is set correctly by A_rest.split_top().
00278       mat_view A_rest (nrows, ncols, A, lda);
00279       // This call modifies A_rest.
00280       mat_view A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks);
00281 
00282       // Factor the topmost block of A.
00283       std::vector< Scalar > tau_first (ncols);
00284       mat_view R_view = factor_first_block (combine, A_cur, tau_first, work);
00285       tau_arrays.push_back (tau_first);
00286 
00287       while (! A_rest.empty())
00288   {
00289     A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks);
00290     std::vector< Scalar > tau (ncols);
00291     combine_factor (combine, R_view, A_cur, tau, work);
00292     tau_arrays.push_back (tau);
00293   }
00294       
00295       // Copy the R factor resulting from the factorization out of
00296       // R_view (a view of the topmost cache block of A) into the R
00297       // output argument.
00298       fill_matrix (ncols, ncols, R, ldr, Scalar(0));
00299       copy_upper_triangle (ncols, ncols, R, ldr, R_view.get(), R_view.lda());
00300       return tau_arrays;
00301     }
00302 
00303 
00306     LocalOrdinal
00307     factor_num_cache_blocks (const LocalOrdinal nrows,
00308            const LocalOrdinal ncols,
00309            Scalar A[],
00310            const LocalOrdinal lda, 
00311            const bool contiguous_cache_blocks = false)
00312     {
00313       CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_);
00314       LocalOrdinal count = 0;
00315 
00316       mat_view A_rest (nrows, ncols, A, lda);
00317       if (A_rest.empty())
00318   return count;
00319 
00320       mat_view A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks);
00321       count++; // first factor step
00322 
00323       while (! A_rest.empty())
00324   {
00325     A_cur = blocker.split_top_block (A_rest, contiguous_cache_blocks);
00326     count++; // next factor step
00327   }
00328       return count;
00329     }
00330 
00331 
00332     void
00333     apply (const ApplyType& apply_type,
00334      const LocalOrdinal nrows,
00335      const LocalOrdinal ncols_Q,
00336      const Scalar Q[],
00337      const LocalOrdinal ldq,
00338      const FactorOutput& factor_output,
00339      const LocalOrdinal ncols_C,
00340      Scalar C[],
00341      const LocalOrdinal ldc,
00342      const bool contiguous_cache_blocks = false)
00343     {
00344       // Quick exit and error tests
00345       if (ncols_Q == 0 || ncols_C == 0 || nrows == 0)
00346   return;
00347       else if (ldc < nrows)
00348   {
00349     std::ostringstream os;
00350     os << "SequentialTsqr::apply: ldc (= " << ldc << ") < nrows (= " << nrows << ")";
00351     throw std::invalid_argument (os.str());
00352   }
00353       else if (ldq < nrows)
00354   {
00355     std::ostringstream os;
00356     os << "SequentialTsqr::apply: ldq (= " << ldq << ") < nrows (= " << nrows << ")";
00357     throw std::invalid_argument (os.str());
00358   }
00359 
00360       // If contiguous cache blocks are used, then we have to use the
00361       // same convention as we did for factor().  Otherwise, we are
00362       // free to choose the cache block dimensions as we wish in
00363       // apply(), independently of what we did in factor().
00364       CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols_Q, strategy_);
00365       LAPACK< LocalOrdinal, Scalar > lapack;
00366       Combine< LocalOrdinal, Scalar > combine;
00367 
00368       const bool transposed = apply_type.transposed();
00369       const FactorOutput& tau_arrays = factor_output; // rename for encapsulation
00370       std::vector< Scalar > work (ncols_C);
00371       
00372       // We say "*_rest" because it points to the remaining part of
00373       // the matrix left to factor; at the beginning, the "remaining"
00374       // part is the whole matrix, but that will change as the
00375       // algorithm progresses.
00376       //
00377       // Note: if the cache blocks are stored contiguously, ldq
00378       // resp. ldc won't be the correct leading dimension, but it
00379       // won't matter, since we only read the leading dimension of
00380       // return values of split_top_block() / split_bottom_block(),
00381       // which are set correctly (based e.g., on whether or not we are
00382       // using contiguous cache blocks).
00383       const_mat_view Q_rest (nrows, ncols_Q, Q, ldq);
00384       mat_view C_rest (nrows, ncols_C, C, ldc);
00385 
00386 #ifdef TSQR_SEQ_TSQR_EXTRA_DEBUG
00387       if (Q_rest.empty())
00388   throw std::logic_error ("Q_rest initially empty");
00389       else if (C_rest.empty())
00390   throw std::logic_error ("C_rest initially empty");
00391 #endif // TSQR_SEQ_TSQR_EXTRA_DEBUG
00392 
00393       // Identify the top ncols_C by ncols_C block of C.  C_rest is
00394       // not modified.
00395       mat_view C_top = blocker.top_block (C_rest, contiguous_cache_blocks);
00396 
00397 #ifdef TSQR_SEQ_TSQR_EXTRA_DEBUG
00398       if (C_top.empty())
00399   throw std::logic_error ("C_top initially empty");
00400 #endif // TSQR_SEQ_TSQR_EXTRA_DEBUG
00401 
00402       if (transposed)
00403   {
00404     const_mat_view Q_cur = blocker.split_top_block (Q_rest, contiguous_cache_blocks);
00405     mat_view C_cur = blocker.split_top_block (C_rest, contiguous_cache_blocks);
00406 
00407     // Apply the topmost block of Q.
00408     FactorOutputIter tau_iter = tau_arrays.begin();
00409     const std::vector< Scalar >& tau = *tau_iter++;
00410     apply_first_block (combine, apply_type, Q_cur, tau, C_cur, work);
00411 
00412     while (! Q_rest.empty())
00413       {
00414         Q_cur = blocker.split_top_block (Q_rest, contiguous_cache_blocks);
00415         C_cur = blocker.split_top_block (C_rest, contiguous_cache_blocks);
00416 #ifdef TSQR_SEQ_TSQR_EXTRA_DEBUG
00417         if (tau_iter == tau_arrays.end())
00418     throw std::logic_error("Not enough tau arrays!");
00419 #endif // TSQR_SEQ_TSQR_EXTRA_DEBUG
00420         combine_apply (combine, apply_type, Q_cur, *tau_iter++, C_top, C_cur, work);
00421       }
00422 #ifdef TSQR_SEQ_TSQR_EXTRA_DEBUG
00423     if (tau_iter != tau_arrays.end())
00424       throw std::logic_error ("Too many tau arrays!");
00425 #endif // TSQR_SEQ_TSQR_EXTRA_DEBUG
00426   }
00427       else
00428   {
00429     // Start with the last local Q factor and work backwards up the matrix.
00430     FactorOutputReverseIter tau_iter = tau_arrays.rbegin();
00431 
00432     const_mat_view Q_cur = blocker.split_bottom_block (Q_rest, contiguous_cache_blocks);
00433     mat_view C_cur = blocker.split_bottom_block (C_rest, contiguous_cache_blocks);
00434 
00435     while (! Q_rest.empty())
00436       {
00437 #ifdef TSQR_SEQ_TSQR_EXTRA_DEBUG
00438         if (Q_cur.empty())
00439           throw std::logic_error ("Q_cur empty at last stage of applying Q");
00440         else if (C_cur.empty())
00441           throw std::logic_error ("C_cur empty at last stage of applying Q");
00442         else if (tau_iter == tau_arrays.rend())
00443           throw std::logic_error ("Not enough tau arrays!");
00444 #endif // TSQR_SEQ_TSQR_EXTRA_DEBUG
00445         combine_apply (combine, apply_type, Q_cur, *tau_iter++, C_top, C_cur, work);
00446         Q_cur = blocker.split_bottom_block (Q_rest, contiguous_cache_blocks);
00447         C_cur = blocker.split_bottom_block (C_rest, contiguous_cache_blocks);
00448       }
00449     //
00450     // Apply to last (topmost) cache block.
00451     //
00452 #ifdef TSQR_SEQ_TSQR_EXTRA_DEBUG
00453     if (Q_cur.empty())
00454       throw std::logic_error ("Q_cur empty at last stage of applying Q");
00455     else if (C_cur.empty())
00456       throw std::logic_error ("C_cur empty at last stage of applying Q");
00457     else if (tau_iter == tau_arrays.rend())
00458       throw std::logic_error ("Not enough tau arrays!");
00459 #endif // TSQR_SEQ_TSQR_EXTRA_DEBUG
00460 
00461     apply_first_block (combine, apply_type, Q_cur, *tau_iter++, C_cur, work);
00462 
00463 #ifdef TSQR_SEQ_TSQR_EXTRA_DEBUG
00464     if (tau_iter != tau_arrays.rend())
00465       throw std::logic_error ("Too many tau arrays!");
00466 #endif // TSQR_SEQ_TSQR_EXTRA_DEBUG
00467   }
00468     }
00469 
00470     void
00471     explicit_Q (const LocalOrdinal nrows,
00472     const LocalOrdinal ncols_Q,
00473     const Scalar Q[],
00474     const LocalOrdinal ldq,
00475     const FactorOutput& factor_output,
00476     const LocalOrdinal ncols_C,
00477     Scalar C[],
00478     const LocalOrdinal ldc,
00479     const bool contiguous_cache_blocks = false) 
00480     {
00481       // Identify top ncols_C by ncols_C block of C.  C_view is not
00482       // modified.  top_block() will set C_top to have the correct
00483       // leading dimension, whether or not cache blocks are stored
00484       // contiguously.
00485       mat_view C_view (nrows, ncols_C, C, ldc);
00486       mat_view C_top = top_block (C_view, contiguous_cache_blocks);
00487 
00488       // Fill C with zeros, and then fill the topmost block of C with
00489       // the first ncols_C columns of the identity matrix, so that C
00490       // itself contains the first ncols_C columns of the identity
00491       // matrix.
00492       fill_with_zeros (nrows, ncols_C, C, ldc, contiguous_cache_blocks);
00493       for (LocalOrdinal j = 0; j < ncols_C; ++j)
00494         C_top(j, j) = Scalar(1);
00495 
00496       // Apply the Q factor to C, to extract the first ncols_C columns
00497       // of Q in explicit form.
00498       apply (ApplyType::NoTranspose, 
00499        nrows, ncols_Q, Q, ldq, factor_output, 
00500        ncols_C, C, ldc, contiguous_cache_blocks);
00501     }
00502 
00503 
00508     void
00509     Q_times_B (const LocalOrdinal nrows,
00510          const LocalOrdinal ncols,
00511          Scalar Q[],
00512          const LocalOrdinal ldq,
00513          const Scalar B[],
00514          const LocalOrdinal ldb,
00515          const bool contiguous_cache_blocks = false) const
00516     {
00517       // We don't do any other error checking here (e.g., matrix
00518       // dimensions), though it would be a good idea to do so.
00519 
00520       // Take the easy exit if available.
00521       if (ncols == 0 || nrows == 0)
00522   return;
00523 
00524       // Compute Q := Q*B by iterating through cache blocks of Q.
00525       // This iteration works much like iteration through cache blocks
00526       // of A in factor() (which see).  Here, though, each cache block
00527       // computation is completely independent of the others; a slight
00528       // restructuring of this code would parallelize nicely using
00529       // OpenMP.
00530       CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_);
00531       BLAS< LocalOrdinal, Scalar > blas;
00532       mat_view Q_rest (nrows, ncols, Q, ldq);
00533       Matrix< LocalOrdinal, Scalar > 
00534   Q_cur_copy (LocalOrdinal(0), LocalOrdinal(0)); // will be resized
00535       while (! Q_rest.empty())
00536   {
00537     mat_view Q_cur = 
00538       blocker.split_top_block (Q_rest, contiguous_cache_blocks);
00539 
00540     // GEMM doesn't like aliased arguments, so we use a copy.
00541     // We only copy the current cache block, rather than all of
00542     // Q; this saves memory.
00543     Q_cur_copy.reshape (Q_cur.nrows(), ncols);
00544     Q_cur_copy.copy (Q_cur);
00545     // Q_cur := Q_cur_copy * B.
00546     blas.GEMM ("N", "N", Q_cur.nrows(), ncols, ncols, Scalar(1),
00547          Q_cur_copy.get(), Q_cur_copy.lda(), B, ldb,
00548          Scalar(0), Q_cur.get(), Q_cur.lda());
00549   }
00550     }
00551 
    /// Compute the numerical rank of the ncols-by-ncols R factor,
    /// using the SVD \f$R = U \Sigma V^*\f$ and the relative
    /// tolerance tol.  If R is rank deficient, overwrite R with
    /// \f$\Sigma V^*\f$ and fill U with the left singular vectors
    /// (so the caller can update Q := Q*U); if R is full rank, R and
    /// U are left unmodified.
    ///
    /// \param ncols [in] Order of the square matrix R.
    /// \param R [in/out] ncols-by-ncols R factor; overwritten with
    ///   \f$\Sigma V^*\f$ only if rank deficient.
    /// \param ldr [in] Leading dimension of R.
    /// \param U [out] ncols-by-ncols array for the left singular
    ///   vectors; written only if R is rank deficient.
    /// \param ldu [in] Leading dimension of U.
    /// \param tol [in] Relative rank tolerance; must be nonnegative.
    ///
    /// \return Numerical rank of R (0 <= rank <= ncols).
    ///
    /// \throws std::logic_error if tol is negative or if the LAPACK
    ///   SVD workspace query fails.
    LocalOrdinal
    reveal_R_rank (const LocalOrdinal ncols,
                   Scalar R[],
                   const LocalOrdinal ldr,
                   Scalar U[],
                   const LocalOrdinal ldu,
                   const magnitude_type tol) const 
    {
      if (tol < 0)
        {
          std::ostringstream os;
          os << "reveal_R_rank: negative tolerance tol = "
             << tol << " is not allowed.";
          throw std::logic_error (os.str());
        }
      // We don't do any other error checking here (e.g., matrix
      // dimensions), though it would be a good idea to do so.

      // Take the easy exit if available.
      if (ncols == 0)
        return 0;

      LAPACK< LocalOrdinal, Scalar > lapack;
      MatView< LocalOrdinal, Scalar > R_view (ncols, ncols, R, ldr);
      // B is a deep copy of R, because GESVD overwrites its input;
      // this way R is untouched if it turns out to be full rank.
      Matrix< LocalOrdinal, Scalar > B (R_view); // B := R (deep copy)
      MatView< LocalOrdinal, Scalar > U_view (ncols, ncols, U, ldu);
      Matrix< LocalOrdinal, Scalar > VT (ncols, ncols, Scalar(0));

      // NOTE(review): 5*ncols matches the minimum RWORK LAPACK
      // requires for complex GESVD; RWORK is presumably unused in the
      // real case — confirm against the LAPACK wrapper.
      std::vector< magnitude_type > svd_rwork (5*ncols);
      std::vector< magnitude_type > singular_values (ncols);
      LocalOrdinal svd_lwork = -1; // -1 for LWORK query; will be changed
      int svd_info = 0;

      // LAPACK LWORK query for singular value decomposition.  WORK
      // array is always of ScalarType, even in the complex case.
      {
        Scalar svd_lwork_scalar = Scalar(0);
        lapack.GESVD ("A", "A", ncols, ncols, 
                      B.get(), B.lda(), &singular_values[0], 
                      U_view.get(), U_view.lda(), VT.get(), VT.lda(),
                      &svd_lwork_scalar, svd_lwork, &svd_rwork[0], &svd_info);
        if (svd_info != 0)
          {
            std::ostringstream os;
            os << "reveal_R_rank: GESVD LWORK query returned nonzero INFO = "
               << svd_info;
            throw std::logic_error (os.str());
          }
        // LAPACK returns the optimal LWORK in the first WORK entry.
        svd_lwork = static_cast< LocalOrdinal > (svd_lwork_scalar);
        // Check the LWORK cast.  LAPACK shouldn't ever return LWORK
        // that won't fit in an OrdinalType, but it's not bad to make
        // sure.
        if (static_cast< Scalar > (svd_lwork) != svd_lwork_scalar)
          {
            std::ostringstream os;
            os << "In SequentialTsqr::reveal_rank: GESVD LWORK query "
              "returned LWORK that doesn\'t fit in LocalOrdinal: returned "
              "LWORK (as Scalar) is " << svd_lwork_scalar << ", but cast to "
              "LocalOrdinal it becomes " << svd_lwork << ".";
            throw std::logic_error (os.str());
          }
        // Make sure svd_lwork >= 0.
        if (std::numeric_limits< LocalOrdinal >::is_signed && svd_lwork < 0)
          {
            std::ostringstream os;
            os << "In SequentialTsqr::reveal_rank: GESVD LWORK query "
              "returned negative LWORK = " << svd_lwork;
            throw std::logic_error (os.str());
          }
      }
      // Allocate workspace for SVD.
      std::vector< Scalar > svd_work (svd_lwork);

      // Compute SVD $B := U \Sigma V^*$.  B is overwritten, which is
      // why we copied R into B (so that we don't overwrite R if R is
      // full rank).
      lapack.GESVD ("A", "A", ncols, ncols, 
                    B.get(), B.lda(), &singular_values[0], 
                    U_view.get(), U_view.lda(), VT.get(), VT.lda(),
                    &svd_work[0], svd_lwork, &svd_rwork[0], &svd_info);

#ifdef TSQR_SEQ_TSQR_EXTRA_DEBUG
      {
        cerr << "-- GESVD computed singular values:" << endl;
        for (int k = 0; k < ncols; ++k)
          cerr << singular_values[k] << " ";
        cerr << endl;
      }
#endif // TSQR_SEQ_TSQR_EXTRA_DEBUG

      // GESVD computes singular values in decreasing order and they
      // are all nonnegative.  We know by now that ncols > 0.  "tol"
      // is a relative tolerance: relative to the largest singular
      // value, which is the 2-norm of the matrix.
      const magnitude_type absolute_tolerance = tol * singular_values[0];

      // Determine rank of B, using singular values.  The loop starts
      // at k = 1, so the smallest rank this can report (for ncols >
      // 0) is 1, even if all singular values are zero.
      LocalOrdinal rank = ncols;
      for (LocalOrdinal k = 1; k < ncols; ++k)
        // "<=" in case singular_values[0] == 0.
        if (singular_values[k] <= absolute_tolerance)
          {
            rank = k;
            break;
          }

      if (rank == ncols)
        return rank; // Don't modify Q or R, if R is full rank.

#ifdef TSQR_SEQ_TSQR_EXTRA_DEBUG
      {
        cerr << "Rank of B (i.e., R): " << rank << " < ncols=" << ncols << endl;
        cerr << "Original R = " << endl;
        print_local_matrix (cerr, ncols, ncols, R, ldr);
      }
#endif // TSQR_SEQ_TSQR_EXTRA_DEBUG

      //
      // R is not full rank.  
      //
      // 1. Compute \f$R := \Sigma V^*\f$.
      // 2. Return rank (0 <= rank < ncols).
      //

      // Compute R := \Sigma VT.  \Sigma is diagonal so we apply it
      // column by column (normally one would think of this as row by
      // row, but this "Hadamard product" formulation iterates more
      // efficiently over VT).  
      //
      // After this computation, R may no longer be upper triangular.
      // R may be zero if all the singular values are zero, but we
      // don't need to check for this case; it's rare in practice, and
      // the computations below will be correct regardless.
      for (LocalOrdinal j = 0; j < ncols; ++j)
        {
          const Scalar* const VT_j = &VT(0,j);
          Scalar* const R_j = &R_view(0,j);

          for (LocalOrdinal i = 0; i < ncols; ++i)
            R_j[i] = singular_values[i] * VT_j[i];
        }

#ifdef TSQR_SEQ_TSQR_EXTRA_DEBUG
      {
        cerr << "Resulting R = " << endl;
        print_local_matrix (cerr, ncols, ncols, R, ldr);
      }
#endif // TSQR_SEQ_TSQR_EXTRA_DEBUG

      return rank;
    }
00710 
00722     LocalOrdinal
00723     reveal_rank (const LocalOrdinal nrows,
00724      const LocalOrdinal ncols,
00725      Scalar Q[],
00726      const LocalOrdinal ldq,
00727      Scalar R[],
00728      const LocalOrdinal ldr,
00729      const magnitude_type tol,
00730      const bool contiguous_cache_blocks = false) const
00731     {
00732       // Take the easy exit if available.
00733       if (ncols == 0)
00734   return 0;
00735       Matrix< LocalOrdinal, Scalar > U (ncols, ncols, Scalar(0));
00736       const LocalOrdinal rank = 
00737   reveal_R_rank (ncols, R, ldr, U.get(), U.ldu(), tol);
00738       
00739       if (rank < ncols)
00740   {
00741     // If R is not full rank: reveal_R_rank() already computed
00742     // the SVD \f$R = U \Sigma V^*\f$ of (the input) R, and
00743     // overwrote R with \f$\Sigma V^*\f$.  Now, we compute \f$Q
00744     // := Q \cdot U\f$, respecting cache blocks of Q.
00745     Q_times_B (nrows, ncols, Q, ldq, U.get(), U.lda(), 
00746          contiguous_cache_blocks);
00747   }
00748       return rank;
00749     }
00750 
00752     void
00753     cache_block (const LocalOrdinal nrows,
00754      const LocalOrdinal ncols,
00755      Scalar A_out[],
00756      const Scalar A_in[],
00757      const LocalOrdinal lda_in) const
00758     {
00759       CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_);
00760       blocker.cache_block (nrows, ncols, A_out, A_in, lda_in);
00761     }
00762 
00764     void
00765     un_cache_block (const LocalOrdinal nrows,
00766         const LocalOrdinal ncols,
00767         Scalar A_out[],
00768         const LocalOrdinal lda_out,       
00769         const Scalar A_in[]) const
00770     {
00771       CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_);
00772       blocker.un_cache_block (nrows, ncols, A_out, lda_out, A_in);
00773     }
00774 
00787     void
00788     fill_with_zeros (const LocalOrdinal nrows,
00789          const LocalOrdinal ncols,
00790          Scalar A[],
00791          const LocalOrdinal lda, 
00792          const bool contiguous_cache_blocks = false) const
00793     {
00794       CacheBlocker< LocalOrdinal, Scalar > blocker (nrows, ncols, strategy_);
00795       blocker.fill_with_zeros (nrows, ncols, A, lda, contiguous_cache_blocks);
00796     }
00797 
00814     template< class MatrixViewType >
00815     MatrixViewType
00816     top_block (const MatrixViewType& C, 
00817          const bool contiguous_cache_blocks = false) const 
00818     {
00819       // The CacheBlocker object knows how to construct a view of the
00820       // top cache block of C.  This is complicated because cache
00821       // blocks (in C) may or may not be stored contiguously.  If they
00822       // are stored contiguously, the CacheBlocker knows the right
00823       // layout, based on the cache blocking strategy.
00824       CacheBlocker< LocalOrdinal, Scalar > blocker (C.nrows(), C.ncols(), strategy_);
00825 
00826       // C_top_block is a view of the topmost cache block of C.
00827       // C_top_block should have >= ncols rows, otherwise either cache
00828       // blocking is broken or the input matrix C itself had fewer
00829       // rows than columns.
00830       MatrixViewType C_top_block = blocker.top_block (C, contiguous_cache_blocks);
00831       if (C_top_block.nrows() < C_top_block.ncols())
00832   throw std::logic_error ("C\'s topmost cache block has fewer rows than "
00833         "columns");
00834       return C_top_block;
00835     }
00836 
00837   private:
00838     CacheBlockingStrategy< LocalOrdinal, Scalar > strategy_;
00839   };
00840   
00841 } // namespace TSQR
00842 
00843 #endif // __TSQR_Tsqr_SequentialTsqr_hpp
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends