Kokkos Node API and Local Linear Algebra Kernels Version of the Day
TbbTsqr_TbbMgs.hpp
00001 //@HEADER
00002 // ************************************************************************
00003 // 
00004 //          Kokkos: Node API and Parallel Node Kernels
00005 //              Copyright (2009) Sandia Corporation
00006 // 
00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
00008 // license for use of this work by or on behalf of the U.S. Government.
00009 // 
00010 // This library is free software; you can redistribute it and/or modify
00011 // it under the terms of the GNU Lesser General Public License as
00012 // published by the Free Software Foundation; either version 2.1 of the
00013 // License, or (at your option) any later version.
00014 //  
00015 // This library is distributed in the hope that it will be useful, but
00016 // WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 // Lesser General Public License for more details.
00019 //  
00020 // You should have received a copy of the GNU Lesser General Public
00021 // License along with this library; if not, write to the Free Software
00022 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00023 // USA
00024 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 
00025 // 
00026 // ************************************************************************
00027 //@HEADER
00028 
00029 #ifndef __TSQR_TBB_TbbMgs_hpp
00030 #define __TSQR_TBB_TbbMgs_hpp
00031 
00032 #include <algorithm>
00033 #include <cassert>
00034 #include <cmath>
00035 #include <numeric>
00036 #include <utility> // std::pair
00037 
00038 #include <Tsqr_MessengerBase.hpp>
00039 #include <Teuchos_ScalarTraits.hpp>
00040 #include <Tsqr_Util.hpp>
00041 
00042 #include <Teuchos_RCP.hpp>
00043 
00044 #include <tbb/blocked_range.h>
00045 #include <tbb/parallel_for.h>
00046 #include <tbb/parallel_reduce.h>
00047 #include <tbb/partitioner.h>
00048 
00049 // #define TBB_MGS_DEBUG 1
00050 #ifdef TBB_MGS_DEBUG
00051 #  include <iostream>
00052 using std::cerr;
00053 using std::endl;
00054 #endif // TBB_MGS_DEBUG
00055 
00058 
00059 namespace TSQR {
00060   namespace TBB {
00061 
00062     // Forward declaration
00063     template< class LocalOrdinal, class Scalar >
00064     class TbbMgs {
00065     public:
00066       typedef Scalar scalar_type;
00067       typedef LocalOrdinal ordinal_type;
00068       typedef typename Teuchos::ScalarTraits<Scalar>::magnitudeType magnitude_type;
00069       typedef MessengerBase< Scalar > messenger_type;
00070       typedef Teuchos::RCP< messenger_type > messenger_ptr;
00071 
00072       TbbMgs (const messenger_ptr& messenger) :
00073   messenger_ (messenger) {}
00074     
00075       void 
00076       mgs (const LocalOrdinal nrows_local, 
00077      const LocalOrdinal ncols, 
00078      Scalar A_local[], 
00079      const LocalOrdinal lda_local,
00080      Scalar R[],
00081      const LocalOrdinal ldr);
00082 
00083     private:
00084       messenger_ptr messenger_;
00085     };
00086 
00089 
00090     namespace details {
00091 
00094       template< class LocalOrdinal, class Scalar >
00095       class TbbDot {
00096       public:
00097   void 
00098   operator() (const tbb::blocked_range< LocalOrdinal >& r) 
00099   {
00100     typedef Teuchos::ScalarTraits< Scalar > STS;
00101 
00102     // The TBB book likes this copying of pointers into the local routine.
00103     // It probably helps the compiler do optimizations.
00104     const Scalar* const x = x_;
00105     const Scalar* const y = y_;
00106     Scalar local_result = result_;
00107 
00108 #ifdef TBB_MGS_DEBUG
00109     // cerr << "Range: [" << r.begin() << ", " << r.end() << ")" << endl;
00110     // for (LocalOrdinal k = r.begin(); k != r.end(); ++k)
00111     //   cerr << "(x[" << k << "], y[" << k << "]) = (" << x[k] << "," << y[k] << ")" << " ";
00112     // cerr << endl;
00113 #endif // TBB_MGS_DEBUG
00114 
00115     for (LocalOrdinal i = r.begin(); i != r.end(); ++i)
00116       local_result += x[i] * STS::conjugate (y[i]);
00117 
00118 #ifdef TBB_MGS_DEBUG
00119     //    cerr << "-- Final value = " << local_result << endl;
00120 #endif // TBB_MGS_DEBUG
00121 
00122     result_ = local_result;
00123   }
00125   Scalar result() const { return result_; }
00126 
00128   TbbDot (const Scalar* const x, const Scalar* const y) :
00129     result_ (Scalar(0)), x_ (x), y_ (y) {}
00130 
00132   TbbDot (TbbDot& d, tbb::split) : 
00133     result_ (Scalar(0)), x_ (d.x_), y_ (d.y_)
00134   {}
00137   void join (const TbbDot& d) { result_ += d.result(); }
00138 
00139       private:
00140   // Default constructor doesn't make sense.
00141   TbbDot ();
00142 
00143   Scalar result_;
00144   const Scalar* const x_;
00145   const Scalar* const y_;
00146       };
00147 
00148       template< class LocalOrdinal, class Scalar >
00149       class TbbScale {
00150       public:
00151   TbbScale (Scalar* const x, const Scalar& denom) : 
00152     x_ (x), denom_ (denom) {}
00153 
00154   // TBB demands that this be a "const" operator, in order for
00155   // the parallel_for expression to compile.  Strictly speaking,
00156   // it is const, because it does not change the address of the
00157   // pointer x_ (only the values stored there).
00158   void 
00159   operator() (const tbb::blocked_range< LocalOrdinal >& r) const 
00160   {
00161     // TBB likes arrays to have their pointers copied like this in
00162     // the operator() method.  I suspect it has something to do
00163     // with compiler optimizations.  If C++ supported the
00164     // "restrict" keyword, here would be a good place to add it...
00165     Scalar* const x = x_;
00166     const Scalar denom = denom_;
00167     for (LocalOrdinal i = r.begin(); i != r.end(); ++i)
00168       x[i] = x[i] / denom;
00169   }
00170       private:
00171   Scalar* const x_;
00172   const Scalar denom_;
00173       };
00174 
00175       template< class LocalOrdinal, class Scalar >
00176       class TbbAxpy {
00177       public:
00178   TbbAxpy (const Scalar& alpha, const Scalar* const x, Scalar* const y) : 
00179     alpha_ (alpha), x_ (x), y_ (y) 
00180   {}
00181   // TBB demands that this be a "const" operator, in order for
00182   // the parallel_for expression to compile.  Strictly speaking,
00183   // it is const, because it does change the address of the
00184   // pointer y_ (only the values stored there).
00185   void 
00186   operator() (const tbb::blocked_range< LocalOrdinal >& r) const 
00187   {
00188     const Scalar alpha = alpha_;
00189     const Scalar* const x = x_;
00190     Scalar* const y = y_;
00191     for (LocalOrdinal i = r.begin(); i != r.end(); ++i)
00192       y[i] = y[i] + alpha * x[i];
00193   }
00194       private:
00195   const Scalar alpha_;
00196   const Scalar* const x_;
00197   Scalar* const y_;
00198       };
00199 
00200       template< class LocalOrdinal, class Scalar >
00201       class TbbNormSquared {
00202       public:
00203   typedef Teuchos::ScalarTraits< Scalar > STS;
00204   typedef typename STS::magnitudeType magnitude_type;
00205 
00206   void operator() (const tbb::blocked_range< LocalOrdinal >& r) {
00207     // Doing the right thing in the complex case requires taking
00208     // an absolute value.  We want to avoid this additional cost
00209     // in the real case, which is why we check is_complex.
00210     if (STS::isComplex) 
00211       {
00212         // The TBB book favors copying array pointers into the
00213         // local routine.  It probably helps the compiler do
00214         // optimizations.
00215         const Scalar* const x = x_;
00216         for (LocalOrdinal i = r.begin(); i != r.end(); ++i)
00217     {
00218       // One could implement this by computing
00219       //
00220       // result_ += STS::real (x[i] * STS::conjugate(x[i]));
00221       //
00222       // However, in terms of type theory, it's much more
00223       // natural to start with a magnitude_type before
00224       // doing the multiplication.
00225       const magnitude_type xi = STS::magnitude (x[i]);
00226       result_ += xi * xi;
00227     }
00228       }
00229     else
00230       {
00231         const Scalar* const x = x_;
00232         for (LocalOrdinal i = r.begin(); i != r.end(); ++i)
00233     {
00234       const Scalar xi = x[i];
00235       result_ += xi * xi;
00236     }
00237       }
00238   }
00239   magnitude_type result() const { return result_; }
00240 
00241   TbbNormSquared (const Scalar* const x) :
00242     result_ (magnitude_type(0)), x_ (x) {}
00243   TbbNormSquared (TbbNormSquared& d, tbb::split) : 
00244     result_ (magnitude_type(0)), x_ (d.x_) {}
00245   void join (const TbbNormSquared& d) { result_ += d.result(); }
00246 
00247       private:
00248   // Default constructor doesn't make sense
00249   TbbNormSquared ();
00250 
00251   magnitude_type result_;
00252   const Scalar* const x_;
00253       };
00254 
00257   
00258       template< class LocalOrdinal, class Scalar >
00259       class TbbMgsOps {
00260       private:
00261   typedef tbb::blocked_range< LocalOrdinal > range_type;
00262   typedef Teuchos::ScalarTraits< Scalar > STS;
00263 
00264       public:
00265   typedef MessengerBase< Scalar > messenger_type;
00266   typedef Teuchos::RCP< messenger_type > messenger_ptr;
00267   typedef typename Teuchos::ScalarTraits< Scalar >::magnitudeType magnitude_type;
00268 
00269   TbbMgsOps (const messenger_ptr& messenger) :
00270     messenger_ (messenger) {}
00271 
00272   void
00273   axpy (const LocalOrdinal nrows_local,
00274         const Scalar alpha,
00275         const Scalar x_local[],
00276         Scalar y_local[]) const
00277   {
00278     using tbb::auto_partitioner;
00279     using tbb::parallel_for;
00280 
00281     TbbAxpy< LocalOrdinal, Scalar > axpyer (alpha, x_local, y_local);
00282     parallel_for (range_type(0, nrows_local), axpyer, auto_partitioner());
00283   }
00284 
00285   void
00286   scale (const LocalOrdinal nrows_local, 
00287          Scalar x_local[], 
00288          const Scalar denom) const
00289   {
00290     using tbb::auto_partitioner;
00291     using tbb::parallel_for;
00292 
00293     // "scaler" is spelled that way (and not as "scalar") on
00294     // purpose.  Think about it.
00295     TbbScale< LocalOrdinal, Scalar > scaler (x_local, denom);
00296     parallel_for (range_type(0, nrows_local), scaler, auto_partitioner());
00297   }
00298 
00301   Scalar
00302   dot (const LocalOrdinal nrows_local, 
00303        const Scalar x_local[], 
00304        const Scalar y_local[])
00305   {
00306     Scalar localResult (0);
00307     if (true)
00308       {
00309         // FIXME (mfh 26 Aug 2010) I'm not sure why I did this
00310         // (i.e., why I wrote "if (true)" here).  Certainly the
00311         // branch that is currently enabled should produce
00312         // correct behavior.  I suspect the nonenabled branch
00313         // will not.
00314         if (true)
00315     {
00316       TbbDot< LocalOrdinal, Scalar > dotter (x_local, y_local);
00317       dotter(range_type(0, nrows_local));
00318       localResult = dotter.result();
00319     }
00320         else
00321     {
00322       using tbb::auto_partitioner;
00323       using tbb::parallel_reduce;
00324 
00325       TbbDot< LocalOrdinal, Scalar > dotter (x_local, y_local);
00326       parallel_reduce (range_type(0, nrows_local),
00327            dotter, auto_partitioner());
00328       localResult = dotter.result();
00329     }
00330       }
00331     else 
00332       {
00333         for (LocalOrdinal i = 0; i != nrows_local; ++i)
00334     localResult += x_local[i] * STS::conjugate (y_local[i]);
00335       }
00336     
00337     // FIXME (mfh 23 Apr 2010) Does MPI_SUM do the right thing for
00338     // complex or otherwise general MPI data types?  Perhaps an MPI_Op
00339     // should belong in the MessengerBase...
00340     return messenger_->globalSum (localResult);
00341   }
00342 
00343   magnitude_type
00344   norm2 (const LocalOrdinal nrows_local, 
00345          const Scalar x_local[])
00346   {
00347     using tbb::auto_partitioner;
00348     using tbb::parallel_reduce;
00349 
00350     TbbNormSquared< LocalOrdinal, Scalar > normer (x_local);
00351     parallel_reduce (range_type(0, nrows_local), normer, auto_partitioner());
00352     const magnitude_type localResult = normer.result();
00353     // FIXME (mfh 12 Oct 2010) This involves an implicit
00354     // typecast from Scalar to magnitude_type.
00355     const magnitude_type globalResult = messenger_->globalSum (localResult);
00356     // Make sure that sqrt's argument is a magnitude_type.  Of
00357     // course global_result should be nonnegative real, but we
00358     // want the compiler to pick up the correct sqrt function.
00359     return Teuchos::ScalarTraits< magnitude_type >::squareroot (globalResult);
00360   }
00361 
00362   Scalar
00363   project (const LocalOrdinal nrows_local, 
00364      const Scalar q_local[], 
00365      Scalar v_local[])
00366   {
00367     const Scalar coeff = this->dot (nrows_local, v_local, q_local);
00368     this->axpy (nrows_local, -coeff, q_local, v_local);
00369     return coeff;
00370   }
00371 
00372       private:
00373   messenger_ptr messenger_;
00374       };
00375     } // namespace details
00376 
00379 
00380     template< class LocalOrdinal, class Scalar >
00381     void
00382     TbbMgs< LocalOrdinal, Scalar >::mgs (const LocalOrdinal nrows_local, 
00383            const LocalOrdinal ncols, 
00384            Scalar A_local[], 
00385            const LocalOrdinal lda_local,
00386            Scalar R[],
00387            const LocalOrdinal ldr)
00388     {
00389       details::TbbMgsOps< LocalOrdinal, Scalar > ops (messenger_);
00390       
00391       for (LocalOrdinal j = 0; j < ncols; ++j)
00392   {
00393     Scalar* const v = &A_local[j*lda_local];
00394     for (LocalOrdinal i = 0; i < j; ++i)
00395       {
00396         const Scalar* const q = &A_local[i*lda_local];
00397         R[i + j*ldr] = ops.project (nrows_local, q, v);
00398 #ifdef TBB_MGS_DEBUG
00399         if (my_rank == 0)
00400     cerr << "(i,j) = (" << i << "," << j << "): coeff = " 
00401          << R[i + j*ldr] << endl;
00402 #endif // TBB_MGS_DEBUG
00403       }
00404     const magnitude_type denom = ops.norm2 (nrows_local, v);
00405 #ifdef TBB_MGS_DEBUG
00406     if (my_rank == 0)
00407       cerr << "j = " << j << ": denom = " << denom << endl;
00408 #endif // TBB_MGS_DEBUG
00409 
00410     // FIXME (mfh 29 Apr 2010)
00411     //
00412     // NOTE IMPLICIT CAST.  This should work for complex numbers.
00413     // If it doesn't work for your Scalar data type, it means that
00414     // you need a different data type for the diagonal elements of
00415     // the R factor, than you need for the other elements.  This
00416     // is unlikely if we're comparing MGS against a Householder QR
00417     // factorization; I don't really understand how the latter
00418     // would work (not that it couldn't be given a sensible
00419     // interpretation) in the case of Scalars that aren't plain
00420     // old real or complex numbers.
00421     R[j + j*ldr] = Scalar (denom);
00422     ops.scale (nrows_local, v, denom);
00423   }
00424     }
00425   } // namespace TBB
00426 } // namespace TSQR
00427 
00428 #endif // __TSQR_TBB_TbbMgs_hpp
00429 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends