Kokkos Node API and Local Linear Algebra Kernels Version of the Day
TbbTsqr.hpp
Go to the documentation of this file.
00001 //@HEADER
00002 // ************************************************************************
00003 // 
00004 //          Kokkos: Node API and Parallel Node Kernels
00005 //              Copyright (2009) Sandia Corporation
00006 // 
00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
00008 // license for use of this work by or on behalf of the U.S. Government.
00009 // 
00010 // This library is free software; you can redistribute it and/or modify
00011 // it under the terms of the GNU Lesser General Public License as
00012 // published by the Free Software Foundation; either version 2.1 of the
00013 // License, or (at your option) any later version.
00014 //  
00015 // This library is distributed in the hope that it will be useful, but
00016 // WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 // Lesser General Public License for more details.
00019 //  
00020 // You should have received a copy of the GNU Lesser General Public
00021 // License along with this library; if not, write to the Free Software
00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00023 // USA
00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
00025 // 
00026 // ************************************************************************
00027 //@HEADER
00028 
00032 #ifndef __TSQR_TbbTsqr_hpp
00033 #define __TSQR_TbbTsqr_hpp
00034 
00035 #include <TbbTsqr_TbbParallelTsqr.hpp>
00036 #include <Tsqr_TimeStats.hpp>
00037 #include <Teuchos_Time.hpp>
00038 // #include <TbbRecursiveTsqr.hpp>
00039 
00040 #include <stdexcept>
00041 #include <string>
00042 #include <utility> // std::pair
00043 #include <vector>
00044 
00047 
00048 namespace TSQR {
00049   namespace TBB {
00050 
00064     template< class LocalOrdinal, class Scalar, class TimerType = Teuchos::Time >
00065     class TbbTsqr : public Teuchos::Describable {
00066     private:
00076       TbbParallelTsqr< LocalOrdinal, Scalar, TimerType > impl_;
00077 
00078       // Collected running statistcs on various computations
00079       mutable TimeStats factorStats_, applyStats_, explicitQStats_, cacheBlockStats_, unCacheBlockStats_;
00080 
00081       // Timers for various computations
00082       mutable TimerType factorTimer_, applyTimer_, explicitQTimer_, cacheBlockTimer_, unCacheBlockTimer_;
00083 
00084     public:
00085       typedef Scalar scalar_type;
00086       typedef typename Teuchos::ScalarTraits< Scalar >::magnitudeType magnitude_type;
00087       typedef LocalOrdinal ordinal_type;
00088 
00096       typedef typename TbbParallelTsqr<LocalOrdinal, Scalar, TimerType>::FactorOutput FactorOutput;
00097 
00099       size_t ncores() const { return impl_.ncores(); }
00100 
00102       size_t cache_size_hint() const { return impl_.cache_size_hint(); }
00103 
00108       size_t TEUCHOS_DEPRECATED cache_block_size() const { 
00109   return impl_.cache_size_hint(); 
00110       }
00111 
00125       TbbTsqr (const size_t numCores,
00126          const size_t cacheSizeHint = 0) :
00127   impl_ (numCores, cacheSizeHint),
00128   factorTimer_ ("TbbTsqr::factor"),
00129   applyTimer_ ("TbbTsqr::apply"),
00130   explicitQTimer_ ("TbbTsqr::explicit_Q"),
00131   cacheBlockTimer_ ("TbbTsqr::cache_block"),
00132   unCacheBlockTimer_ ("TbbTsqr::un_cache_block")
00133       {}
00134 
00137       static bool QR_produces_R_factor_with_nonnegative_diagonal() {
00138   typedef TbbParallelTsqr< LocalOrdinal, Scalar, TimerType > impl_type;
00139   return impl_type::QR_produces_R_factor_with_nonnegative_diagonal();
00140       }
00141 
00147       std::string description () const {
00148   using std::endl;
00149 
00150   // SequentialTsqr also implements Describable, so if you
00151   // decide to implement describe(), you could call
00152   // SequentialTsqr's describe() and get a nice hierarchy of
00153   // descriptions.
00154   std::ostringstream os;
00155   os << "Intranode Tall Skinny QR (TSQR): "
00156      << "Intel Threading Building Blocks (TBB) implementation"
00157      << ", max " << ncores() << "-way parallelism"
00158      << ", cache size hint of " << cache_size_hint() << " bytes.";
00159   return os.str();
00160       }
00161 
00162       void
00163       cache_block (const LocalOrdinal nrows,
00164        const LocalOrdinal ncols, 
00165        Scalar A_out[],
00166        const Scalar A_in[],
00167        const LocalOrdinal lda_in) const
00168       {
00169   cacheBlockTimer_.start(true);
00170   impl_.cache_block (nrows, ncols, A_out, A_in, lda_in);
00171   cacheBlockStats_.update (cacheBlockTimer_.stop());
00172       }
00173 
00174       void
00175       un_cache_block (const LocalOrdinal nrows,
00176           const LocalOrdinal ncols,
00177           Scalar A_out[],
00178           const LocalOrdinal lda_out,       
00179           const Scalar A_in[]) const
00180       {
00181   unCacheBlockTimer_.start(true);
00182   impl_.un_cache_block (nrows, ncols, A_out, lda_out, A_in);
00183   unCacheBlockStats_.update (unCacheBlockTimer_.stop());
00184       }
00185 
00186       void
00187       fill_with_zeros (const LocalOrdinal nrows,
00188            const LocalOrdinal ncols,
00189            Scalar C[],
00190            const LocalOrdinal ldc, 
00191            const bool contiguous_cache_blocks) const
00192       {
00193   impl_.fill_with_zeros (nrows, ncols, C, ldc, contiguous_cache_blocks);
00194       }
00195 
00196       template< class MatrixViewType >
00197       MatrixViewType
00198       top_block (const MatrixViewType& C, 
00199      const bool contiguous_cache_blocks) const
00200       {
00201   return impl_.top_block (C, contiguous_cache_blocks);
00202       }
00203 
00239       FactorOutput
00240       factor (const LocalOrdinal nrows,
00241         const LocalOrdinal ncols, 
00242         Scalar A[],
00243         const LocalOrdinal lda,
00244         Scalar R[],
00245         const LocalOrdinal ldr,
00246         const bool contiguous_cache_blocks) const
00247       {
00248   factorTimer_.start(true);
00249   return impl_.factor (nrows, ncols, A, lda, R, ldr, contiguous_cache_blocks);
00250   factorStats_.update (factorTimer_.stop());
00251       }
00252 
00286       void
00287       apply (const ApplyType& apply_type,
00288        const LocalOrdinal nrows,
00289        const LocalOrdinal ncols_Q,
00290        const Scalar Q[],
00291        const LocalOrdinal ldq,
00292        const FactorOutput& factor_output,
00293        const LocalOrdinal ncols_C,
00294        Scalar C[],
00295        const LocalOrdinal ldc,
00296        const bool contiguous_cache_blocks) const
00297       {
00298   applyTimer_.start(true);
00299   impl_.apply (apply_type, nrows, ncols_Q, Q, ldq, factor_output, 
00300          ncols_C, C, ldc, contiguous_cache_blocks);
00301   applyStats_.update (applyTimer_.stop());
00302       }
00303 
00330       void
00331       explicit_Q (const LocalOrdinal nrows,
00332       const LocalOrdinal ncols_Q_in,
00333       const Scalar Q_in[],
00334       const LocalOrdinal ldq_in,
00335       const FactorOutput& factor_output,
00336       const LocalOrdinal ncols_Q_out,
00337       Scalar Q_out[],
00338       const LocalOrdinal ldq_out,
00339       const bool contiguous_cache_blocks) const
00340       {
00341   explicitQTimer_.start(true);
00342   impl_.explicit_Q (nrows, ncols_Q_in, Q_in, ldq_in, factor_output,
00343         ncols_Q_out, Q_out, ldq_out, contiguous_cache_blocks);
00344   explicitQStats_.update (explicitQTimer_.stop());
00345       }
00346 
00351       void
00352       Q_times_B (const LocalOrdinal nrows,
00353      const LocalOrdinal ncols,
00354      Scalar Q[],
00355      const LocalOrdinal ldq,
00356      const Scalar B[],
00357      const LocalOrdinal ldb,
00358      const bool contiguous_cache_blocks) const
00359       {
00360   impl_.Q_times_B (nrows, ncols, Q, ldq, B, ldb, contiguous_cache_blocks);
00361       }
00362 
00370       LocalOrdinal
00371       reveal_R_rank (const LocalOrdinal ncols,
00372          Scalar R[],
00373          const LocalOrdinal ldr,
00374          Scalar U[],
00375          const LocalOrdinal ldu,
00376          const magnitude_type tol) const 
00377       {
00378   return impl_.reveal_R_rank (ncols, R, ldr, U, ldu, tol);
00379       }
00380 
00392       LocalOrdinal
00393       reveal_rank (const LocalOrdinal nrows,
00394        const LocalOrdinal ncols,
00395        Scalar Q[],
00396        const LocalOrdinal ldq,
00397        Scalar R[],
00398        const LocalOrdinal ldr,
00399        const magnitude_type tol,
00400        const bool contiguous_cache_blocks) const
00401       {
00402   return impl_.reveal_rank (nrows, ncols, Q, ldq, R, ldr, tol, 
00403           contiguous_cache_blocks);
00404       }
00405 
00406       double
00407       min_seq_factor_timing () const { return impl_.min_seq_factor_timing(); }
00408       double
00409       max_seq_factor_timing () const { return impl_.max_seq_factor_timing(); }
00410       double
00411       min_seq_apply_timing () const { return impl_.min_seq_apply_timing(); }
00412       double
00413       max_seq_apply_timing () const { return impl_.max_seq_apply_timing(); }
00414 
00415       void getStats (std::vector< TimeStats >& stats) {
00416   const int numStats = 5;
00417   stats.resize (numStats);
00418   stats[0] = factorStats_;
00419   stats[1] = applyStats_;
00420   stats[2] = explicitQStats_;
00421   stats[3] = cacheBlockStats_;
00422   stats[4] = unCacheBlockStats_;
00423       }
00424 
00425       void getStatsLabels (std::vector< std::string >& labels) {
00426   const int numStats = 5;
00427   labels.resize (numStats);
00428   labels[0] = factorTimer_.name();
00429   labels[1] = applyTimer_.name();
00430   labels[2] = explicitQTimer_.name();
00431   labels[3] = cacheBlockTimer_.name();
00432   labels[4] = unCacheBlockTimer_.name();
00433       }
00434 
00435     }; // class TbbTsqr
00436 
00437   } // namespace TBB
00438 } // namespace TSQR
00439 
00440 #endif // __TSQR_TbbTsqr_hpp
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends