Teuchos - Trilinos Tools Package Version of the Day
Teuchos_BLAS.hpp
Go to the documentation of this file.
00001 // @HEADER
00002 // ***********************************************************************
00003 //
00004 //                    Teuchos: Common Tools Package
00005 //                 Copyright (2004) Sandia Corporation
00006 //
00007 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
00008 // license for use of this work by or on behalf of the U.S. Government.
00009 //
00010 // Redistribution and use in source and binary forms, with or without
00011 // modification, are permitted provided that the following conditions are
00012 // met:
00013 //
00014 // 1. Redistributions of source code must retain the above copyright
00015 // notice, this list of conditions and the following disclaimer.
00016 //
00017 // 2. Redistributions in binary form must reproduce the above copyright
00018 // notice, this list of conditions and the following disclaimer in the
00019 // documentation and/or other materials provided with the distribution.
00020 //
00021 // 3. Neither the name of the Corporation nor the names of the
00022 // contributors may be used to endorse or promote products derived from
00023 // this software without specific prior written permission.
00024 //
00025 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
00026 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00027 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00028 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
00029 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00030 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00031 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00032 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00033 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00034 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00035 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00036 //
00037 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
00038 //
00039 // ***********************************************************************
00040 // @HEADER
00041 
00042 // Kris
00043 // 06.16.03 -- Start over from scratch
00044 // 06.16.03 -- Initial templatization (Tpetra_BLAS.cpp is no longer needed)
00045 // 06.18.03 -- Changed xxxxx_() function calls to XXXXX_F77()
00046 //          -- Added warning messages for generic calls
00047 // 07.08.03 -- Move into Teuchos package/namespace
00048 // 07.24.03 -- The first iteration of BLAS generics is nearing completion. Caveats:
00049 //             * TRSM isn't finished yet; it works for one case at the moment (left side, upper tri., no transpose, no unit diag.)
00050 //             * Many of the generic implementations are quite inefficient, ugly, or both. I wrote these to be easy to debug, not for efficiency or legibility. The next iteration will improve both of these aspects as much as possible.
00051 //             * Very little verification of input parameters is done, save for the character-type arguments (TRANS, etc.) which is quite robust.
00052 //             * All of the routines that make use of both an incx and incy parameter (which includes much of the L1 BLAS) are set up to work iff incx == incy && incx > 0. Allowing for differing/negative values of incx/incy should be relatively trivial.
00053 //             * All of the L2/L3 routines assume that the entire matrix is being used (that is, if A is mxn, lda = m); they don't work on submatrices yet. This *should* be a reasonably trivial thing to fix, as well.
00054 //          -- Removed warning messages for generic calls
00055 // 08.08.03 -- TRSM now works for all cases where SIDE == L and DIAG == N. DIAG == U is implemented but does not work correctly; SIDE == R is not yet implemented.
00056 // 08.14.03 -- TRSM now works for all cases and accepts (and uses) leading-dimension information.
00057 // 09.26.03 -- character input replaced with enumerated input to cause compiling errors and not run-time errors ( suggested by RAB ).
00058 
00059 #ifndef _TEUCHOS_BLAS_HPP_
00060 #define _TEUCHOS_BLAS_HPP_
00061 
00069 /* for INTEL_CXML, the second arg may need to be changed to 'one'.  If so
00070 the appropriate declaration of one will need to be added back into
00071 functions that include the macro:
00072 */
00073 #if defined (INTEL_CXML)
00074         unsigned int one=1;
00075 #endif
00076 
00077 #ifdef CHAR_MACRO
00078 #undef CHAR_MACRO
00079 #endif
00080 #if defined (INTEL_CXML)
00081 #define CHAR_MACRO(char_var) &char_var, one
00082 #else
00083 #define CHAR_MACRO(char_var) &char_var
00084 #endif
00085 
00086 #include "Teuchos_ConfigDefs.hpp"
00087 #include "Teuchos_ScalarTraits.hpp"
00088 #include "Teuchos_OrdinalTraits.hpp"
00089 #include "Teuchos_BLAS_types.hpp"
00090 #include "Teuchos_Assert.hpp"
00091 
00124 namespace Teuchos
00125 {
00126   extern TEUCHOS_LIB_DLL_EXPORT const char ESideChar[];
00127   extern TEUCHOS_LIB_DLL_EXPORT const char ETranspChar[];
00128   extern TEUCHOS_LIB_DLL_EXPORT const char EUploChar[];
00129   extern TEUCHOS_LIB_DLL_EXPORT const char EDiagChar[];
00130 
00132 
00137   template<typename OrdinalType, typename ScalarType>
00138   class DefaultBLASImpl
00139   {
00140 
00141     typedef typename Teuchos::ScalarTraits<ScalarType>::magnitudeType MagnitudeType;
00142 
00143   public:
00145 
00146 
00148     inline DefaultBLASImpl(void) {}
00149 
00151     inline DefaultBLASImpl(const DefaultBLASImpl<OrdinalType, ScalarType>& /*BLAS_source*/) {}
00152 
00154     inline virtual ~DefaultBLASImpl(void) {}
00156 
00158 
00159 
00161     void ROTG(ScalarType* da, ScalarType* db, MagnitudeType* c, ScalarType* s) const;
00162 
00164     void ROT(const OrdinalType n, ScalarType* dx, const OrdinalType incx, ScalarType* dy, const OrdinalType incy, MagnitudeType* c, ScalarType* s) const;
00165 
00167     void SCAL(const OrdinalType n, const ScalarType alpha, ScalarType* x, const OrdinalType incx) const;
00168 
00170     void COPY(const OrdinalType n, const ScalarType* x, const OrdinalType incx, ScalarType* y, const OrdinalType incy) const;
00171 
00173     template <typename alpha_type, typename x_type>
00174     void AXPY(const OrdinalType n, const alpha_type alpha, const x_type* x, const OrdinalType incx, ScalarType* y, const OrdinalType incy) const;
00175 
00177     typename ScalarTraits<ScalarType>::magnitudeType ASUM(const OrdinalType n, const ScalarType* x, const OrdinalType incx) const;
00178 
00180     template <typename x_type, typename y_type>
00181     ScalarType DOT(const OrdinalType n, const x_type* x, const OrdinalType incx, const y_type* y, const OrdinalType incy) const;
00182 
00184     typename ScalarTraits<ScalarType>::magnitudeType NRM2(const OrdinalType n, const ScalarType* x, const OrdinalType incx) const;
00185 
00187     OrdinalType IAMAX(const OrdinalType n, const ScalarType* x, const OrdinalType incx) const;
00189 
00191 
00192 
00194     template <typename alpha_type, typename A_type, typename x_type, typename beta_type>
00195     void GEMV(ETransp trans, const OrdinalType m, const OrdinalType n, const alpha_type alpha, const A_type* A,
00196               const OrdinalType lda, const x_type* x, const OrdinalType incx, const beta_type beta, ScalarType* y, const OrdinalType incy) const;
00197 
00199     template <typename A_type>
00200     void TRMV(EUplo uplo, ETransp trans, EDiag diag, const OrdinalType n, const A_type* A,
00201               const OrdinalType lda, ScalarType* x, const OrdinalType incx) const;
00202 
00204     template <typename alpha_type, typename x_type, typename y_type>
00205     void GER(const OrdinalType m, const OrdinalType n, const alpha_type alpha, const x_type* x, const OrdinalType incx,
00206              const y_type* y, const OrdinalType incy, ScalarType* A, const OrdinalType lda) const;
00208 
00210 
00211 
00218     template <typename alpha_type, typename A_type, typename B_type, typename beta_type>
00219     void GEMM(ETransp transa, ETransp transb, const OrdinalType m, const OrdinalType n, const OrdinalType k, const alpha_type alpha, const A_type* A, const OrdinalType lda, const B_type* B, const OrdinalType ldb, const beta_type beta, ScalarType* C, const OrdinalType ldc) const;
00220 
00222     template <typename alpha_type, typename A_type, typename B_type, typename beta_type>
00223     void SYMM(ESide side, EUplo uplo, const OrdinalType m, const OrdinalType n, const alpha_type alpha, const A_type* A, const OrdinalType lda, const B_type* B, const OrdinalType ldb, const beta_type beta, ScalarType* C, const OrdinalType ldc) const;
00224 
00226     template <typename alpha_type, typename A_type, typename beta_type>
00227     void SYRK(EUplo uplo, ETransp trans, const OrdinalType n, const OrdinalType k, const alpha_type alpha, const A_type* A, const OrdinalType lda, const beta_type beta, ScalarType* C, const OrdinalType ldc) const;
00228 
00230     template <typename alpha_type, typename A_type>
00231     void TRMM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const OrdinalType m, const OrdinalType n,
00232                 const alpha_type alpha, const A_type* A, const OrdinalType lda, ScalarType* B, const OrdinalType ldb) const;
00233 
00235     template <typename alpha_type, typename A_type>
00236     void TRSM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const OrdinalType m, const OrdinalType n,
00237                 const alpha_type alpha, const A_type* A, const OrdinalType lda, ScalarType* B, const OrdinalType ldb) const;
00239   };
00240 
00241   template<typename OrdinalType, typename ScalarType>
00242   class TEUCHOS_LIB_DLL_EXPORT BLAS : public DefaultBLASImpl<OrdinalType,ScalarType>
00243   {
00244 
00245     typedef typename Teuchos::ScalarTraits<ScalarType>::magnitudeType MagnitudeType;
00246 
00247   public:
00249 
00250 
00252     inline BLAS(void) {}
00253 
00255     inline BLAS(const BLAS<OrdinalType, ScalarType>& /*BLAS_source*/) {}
00256 
00258     inline virtual ~BLAS(void) {}
00260   };
00261 
00262 //------------------------------------------------------------------------------------------
00263 //      LEVEL 1 BLAS ROUTINES
00264 //------------------------------------------------------------------------------------------
00265 
00273   namespace details {
00274 
00275     template<typename ScalarType, bool isComplex>
00276     class GivensRotator {
00277     public:
00278       void
00279       ROTG (ScalarType* a,
00280             ScalarType* b,
00281             typename ScalarTraits<ScalarType>::magnitudeType* c,
00282             ScalarType* s) const;
00283     };
00284 
00285     // Complex-arithmetic specialization.
00286     template<typename ScalarType>
00287     class GivensRotator<ScalarType, true> {
00288     public:
00289       void
00290       ROTG (ScalarType* ca,
00291             ScalarType* cb,
00292             typename ScalarTraits<ScalarType>::magnitudeType* c,
00293             ScalarType* s) const;
00294     };
00295 
00296     // Real-arithmetic specialization.
00297     template<typename ScalarType>
00298     class GivensRotator<ScalarType, false> {
00299     public:
00300       void
00301       ROTG (ScalarType* da,
00302             ScalarType* db,
00303             ScalarType* c,
00304             ScalarType* s) const;
00305 
00306     private:
00319       ScalarType SIGN (ScalarType x, ScalarType y) const {
00320         typedef ScalarTraits<ScalarType> STS;
00321 
00322         if (y > STS::zero()) {
00323           return STS::magnitude (x);
00324         } else if (y < STS::zero()) {
00325           return -STS::magnitude (x);
00326         } else { // y == STS::zero()
00327           // Suppose that ScalarType implements signed zero, as IEEE
00328           // 754 - compliant floating-point numbers should.  You can't
00329           // use == to test for signed zero, since +0 == -0.  However,
00330           // 1/0 = Inf > 0 and 1/-0 = -Inf < 0.  Let's hope ScalarType
00331           // supports Inf... we don't need to test for Inf, just see
00332           // if it's greater than or less than zero.
00333           //
00334           // NOTE: This ONLY works if ScalarType is real.  Complex
00335           // infinity doesn't have a sign, so we can't compare it with
00336           // zero.  That's OK, because finite complex numbers don't
00337           // have a sign either; they have an angle.
00338           ScalarType signedInfinity = STS::one() / y;
00339           if (signedInfinity > STS::zero()) {
00340             return STS::magnitude (x);
00341           } else {
00342             // Even if ScalarType doesn't implement signed zero,
00343             // Fortran's SIGN intrinsic returns -ABS(X) if the second
00344             // argument Y is zero.  We imitate this behavior here.
00345             return -STS::magnitude (x);
00346           }
00347         }
00348       }
00349     };
00350 
00351     // Implementation of complex-arithmetic specialization.
00352     template<typename ScalarType>
00353     void
00354     GivensRotator<ScalarType, true>::
00355     ROTG (ScalarType* ca,
00356           ScalarType* cb,
00357           typename ScalarTraits<ScalarType>::magnitudeType* c,
00358           ScalarType* s) const
00359     {
00360       typedef ScalarTraits<ScalarType> STS;
00361       typedef typename STS::magnitudeType MagnitudeType;
00362       typedef ScalarTraits<MagnitudeType> STM;
00363 
00364       // This is a straightforward translation into C++ of the
00365       // reference BLAS' implementation of ZROTG.  You can get
00366       // the Fortran 77 source code of ZROTG here:
00367       //
00368       // http://www.netlib.org/blas/zrotg.f
00369       //
00370       // I used the following rules to translate Fortran types and
00371       // intrinsic functions into C++:
00372       //
00373       // DOUBLE PRECISION -> MagnitudeType
00374       // DOUBLE COMPLEX -> ScalarType
00375       // CDABS -> STS::magnitude
00376       // DCMPLX -> ScalarType constructor (assuming that ScalarType
00377       //   is std::complex<MagnitudeType>)
00378       // DCONJG -> STS::conjugate
00379       // DSQRT -> STM::squareroot
00380       ScalarType alpha;
00381       MagnitudeType norm, scale;
00382 
00383       if (STS::magnitude (*ca) == STM::zero()) {
00384         *c = STM::zero();
00385         *s = STS::one();
00386         *ca = *cb;
00387       } else {
00388         scale = STS::magnitude (*ca) + STS::magnitude (*cb);
00389         { // I introduced temporaries into the translated BLAS code in
00390           // order to make the expression easier to read and also save a
00391           // few floating-point operations.
00392           const MagnitudeType ca_scaled =
00393             STS::magnitude (*ca / ScalarType(scale, STM::zero()));
00394           const MagnitudeType cb_scaled =
00395             STS::magnitude (*cb / ScalarType(scale, STM::zero()));
00396           norm = scale *
00397             STM::squareroot (ca_scaled*ca_scaled + cb_scaled*cb_scaled);
00398         }
00399         alpha = *ca / STS::magnitude (*ca);
00400         *c = STS::magnitude (*ca) / norm;
00401         *s = alpha * STS::conjugate (*cb) / norm;
00402         *ca = alpha * norm;
00403       }
00404     }
00405 
00406     // Implementation of real-arithmetic specialization.
00407     template<typename ScalarType>
00408     void
00409     GivensRotator<ScalarType, false>::
00410     ROTG (ScalarType* da,
00411           ScalarType* db,
00412           ScalarType* c,
00413           ScalarType* s) const
00414     {
00415       typedef ScalarTraits<ScalarType> STS;
00416 
00417       // This is a straightforward translation into C++ of the
00418       // reference BLAS' implementation of DROTG.  You can get
00419       // the Fortran 77 source code of DROTG here:
00420       //
00421       // http://www.netlib.org/blas/drotg.f
00422       //
00423       // I used the following rules to translate Fortran types and
00424       // intrinsic functions into C++:
00425       //
00426       // DOUBLE PRECISION -> ScalarType
00427       // DABS -> STS::magnitude
00428       // DSQRT -> STM::squareroot
00429       // DSIGN -> SIGN (see below)
00430       //
00431       // DSIGN(x,y) (the old DOUBLE PRECISION type-specific form of
00432       // the Fortran type-generic SIGN intrinsic) required special
00433       // translation, which we did in a separate utility function in
00434       // the specializaton of GivensRotator for real arithmetic.
00435       // (ROTG for complex arithmetic doesn't require this function.)
00436       // C99 provides a copysign() math library function, but we are
00437       // not able to rely on the existence of C99 functions here.
00438       ScalarType r, roe, scale, z;
00439 
00440       roe = *db;
00441       if (STS::magnitude (*da) > STS::magnitude (*db)) {
00442         roe = *da;
00443       }
00444       scale = STS::magnitude (*da) + STS::magnitude (*db);
00445       if (scale == STS::zero()) {
00446         *c = STS::one();
00447         *s = STS::zero();
00448         r = STS::zero();
00449         z = STS::zero();
00450       } else {
00451         // I introduced temporaries into the translated BLAS code in
00452         // order to make the expression easier to read and also save
00453         // a few floating-point operations.
00454         const ScalarType da_scaled = *da / scale;
00455         const ScalarType db_scaled = *db / scale;
00456         r = scale * STS::squareroot (da_scaled*da_scaled + db_scaled*db_scaled);
00457         r = SIGN (STS::one(), roe) * r;
00458         *c = *da / r;
00459         *s = *db / r;
00460         z = STS::one();
00461         if (STS::magnitude (*da) > STS::magnitude (*db)) {
00462           z = *s;
00463         }
00464         if (STS::magnitude (*db) >= STS::magnitude (*da) && *c != STS::zero()) {
00465           z = STS::one() / *c;
00466         }
00467       }
00468 
00469       *da = r;
00470       *db = z;
00471     }
00472   } // namespace details
00473 
00474   template<typename OrdinalType, typename ScalarType>
00475   void
00476   DefaultBLASImpl<OrdinalType, ScalarType>::
00477   ROTG (ScalarType* da,
00478         ScalarType* db,
00479         MagnitudeType* c,
00480         ScalarType* s) const
00481   {
00482     typedef ScalarTraits<ScalarType> STS;
00483     details::GivensRotator<ScalarType, STS::isComplex> rotator;
00484     rotator.ROTG (da, db, c, s);
00485   }
00486 
00487   template<typename OrdinalType, typename ScalarType>
00488   void DefaultBLASImpl<OrdinalType,ScalarType>::ROT(const OrdinalType n, ScalarType* dx, const OrdinalType incx, ScalarType* dy, const OrdinalType incy, MagnitudeType* c, ScalarType* s) const
00489   {
00490     OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
00491     ScalarType sconj = Teuchos::ScalarTraits<ScalarType>::conjugate(*s);
00492     if (n <= 0) return;
00493     if (incx==1 && incy==1) {
00494       for(OrdinalType i=0; i<n; ++i) {
00495         ScalarType temp = *c*dx[i] + sconj*dy[i];
00496         dy[i] = *c*dy[i] - sconj*dx[i];
00497         dx[i] = temp;
00498       }
00499     }
00500     else {
00501       OrdinalType ix = 0, iy = 0;
00502       if (incx < izero) ix = (-n+1)*incx;
00503       if (incy < izero) iy = (-n+1)*incy;
00504       for(OrdinalType i=0; i<n; ++i) {
00505         ScalarType temp = *c*dx[ix] + sconj*dy[iy];
00506         dy[iy] = *c*dy[iy] - sconj*dx[ix];
00507         dx[ix] = temp;
00508         ix += incx;
00509         iy += incy;
00510       }
00511     }
00512   }
00513 
00514   template<typename OrdinalType, typename ScalarType>
00515   void DefaultBLASImpl<OrdinalType, ScalarType>::SCAL(const OrdinalType n, const ScalarType alpha, ScalarType* x, const OrdinalType incx) const
00516   {
00517     OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
00518     OrdinalType ione = OrdinalTraits<OrdinalType>::one();
00519     OrdinalType i, ix = izero;
00520     if ( n > izero ) {
00521         // Set the initial index (ix).
00522         if (incx < izero) { ix = (-n+ione)*incx; }
00523         // Scale the std::vector.
00524         for(i = izero; i < n; i++)
00525         {
00526             x[ix] *= alpha;
00527             ix += incx;
00528         }
00529     }
00530   } /* end SCAL */
00531 
00532   template<typename OrdinalType, typename ScalarType>
00533   void DefaultBLASImpl<OrdinalType, ScalarType>::COPY(const OrdinalType n, const ScalarType* x, const OrdinalType incx, ScalarType* y, const OrdinalType incy) const
00534   {
00535     OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
00536     OrdinalType ione = OrdinalTraits<OrdinalType>::one();
00537     OrdinalType i, ix = izero, iy = izero;
00538     if ( n > izero ) {
00539         // Set the initial indices (ix, iy).
00540         if (incx < izero) { ix = (-n+ione)*incx; }
00541         if (incy < izero) { iy = (-n+ione)*incy; }
00542 
00543         for(i = izero; i < n; i++)
00544           {
00545             y[iy] = x[ix];
00546             ix += incx;
00547             iy += incy;
00548           }
00549       }
00550   } /* end COPY */
00551 
00552   template<typename OrdinalType, typename ScalarType>
00553   template <typename alpha_type, typename x_type>
00554   void DefaultBLASImpl<OrdinalType, ScalarType>::AXPY(const OrdinalType n, const alpha_type alpha, const x_type* x, const OrdinalType incx, ScalarType* y, const OrdinalType incy) const
00555   {
00556     OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
00557     OrdinalType ione = OrdinalTraits<OrdinalType>::one();
00558     OrdinalType i, ix = izero, iy = izero;
00559     if( n > izero && alpha != ScalarTraits<alpha_type>::zero())
00560       {
00561         // Set the initial indices (ix, iy).
00562         if (incx < izero) { ix = (-n+ione)*incx; }
00563         if (incy < izero) { iy = (-n+ione)*incy; }
00564 
00565         for(i = izero; i < n; i++)
00566           {
00567             y[iy] += alpha * x[ix];
00568             ix += incx;
00569             iy += incy;
00570           }
00571       }
00572   } /* end AXPY */
00573 
00574   template<typename OrdinalType, typename ScalarType>
00575   typename ScalarTraits<ScalarType>::magnitudeType DefaultBLASImpl<OrdinalType, ScalarType>::ASUM(const OrdinalType n, const ScalarType* x, const OrdinalType incx) const
00576   {
00577     OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
00578     OrdinalType ione = OrdinalTraits<OrdinalType>::one();
00579     typename ScalarTraits<ScalarType>::magnitudeType result =
00580       ScalarTraits<typename ScalarTraits<ScalarType>::magnitudeType>::zero();
00581     OrdinalType i, ix = izero;
00582     if( n > izero ) {
00583         // Set the initial indices
00584         if (incx < izero) { ix = (-n+ione)*incx; }
00585 
00586         for(i = izero; i < n; i++)
00587           {
00588             result += ScalarTraits<ScalarType>::magnitude(x[ix]);
00589             ix += incx;
00590           }
00591     }
00592    return result;
00593   } /* end ASUM */
00594 
00595   template<typename OrdinalType, typename ScalarType>
00596   template <typename x_type, typename y_type>
00597   ScalarType DefaultBLASImpl<OrdinalType, ScalarType>::DOT(const OrdinalType n, const x_type* x, const OrdinalType incx, const y_type* y, const OrdinalType incy) const
00598   {
00599     OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
00600     OrdinalType ione = OrdinalTraits<OrdinalType>::one();
00601     ScalarType result = ScalarTraits<ScalarType>::zero();
00602     OrdinalType i, ix = izero, iy = izero;
00603     if( n > izero )
00604       {
00605         // Set the initial indices (ix,iy).
00606         if (incx < izero) { ix = (-n+ione)*incx; }
00607         if (incy < izero) { iy = (-n+ione)*incy; }
00608 
00609         for(i = izero; i < n; i++)
00610           {
00611             result += ScalarTraits<x_type>::conjugate(x[ix]) * y[iy];
00612             ix += incx;
00613             iy += incy;
00614           }
00615       }
00616     return result;
00617   } /* end DOT */
00618 
00619   template<typename OrdinalType, typename ScalarType>
00620   typename ScalarTraits<ScalarType>::magnitudeType DefaultBLASImpl<OrdinalType, ScalarType>::NRM2(const OrdinalType n, const ScalarType* x, const OrdinalType incx) const
00621   {
00622     OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
00623     OrdinalType ione = OrdinalTraits<OrdinalType>::one();
00624     typename ScalarTraits<ScalarType>::magnitudeType result =
00625       ScalarTraits<typename ScalarTraits<ScalarType>::magnitudeType>::zero();
00626     OrdinalType i, ix = izero;
00627     if ( n > izero )
00628       {
00629         // Set the initial index.
00630         if (incx < izero) { ix = (-n+ione)*incx; }
00631 
00632         for(i = izero; i < n; i++)
00633           {
00634             result += ScalarTraits<ScalarType>::magnitude(ScalarTraits<ScalarType>::conjugate(x[ix]) * x[ix]);
00635             ix += incx;
00636           }
00637         result = ScalarTraits<typename ScalarTraits<ScalarType>::magnitudeType>::squareroot(result);
00638       }
00639     return result;
00640   } /* end NRM2 */
00641 
00642   template<typename OrdinalType, typename ScalarType>
00643   OrdinalType DefaultBLASImpl<OrdinalType, ScalarType>::IAMAX(const OrdinalType n, const ScalarType* x, const OrdinalType incx) const
00644   {
00645     OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
00646     OrdinalType ione = OrdinalTraits<OrdinalType>::one();
00647     OrdinalType result = izero, ix = izero, i;
00648     typename ScalarTraits<ScalarType>::magnitudeType maxval =
00649       ScalarTraits<typename ScalarTraits<ScalarType>::magnitudeType>::zero();
00650 
00651     if ( n > izero )
00652       {
00653         if (incx < izero) { ix = (-n+ione)*incx; }
00654         maxval = ScalarTraits<ScalarType>::magnitude(x[ix]);
00655         ix += incx;
00656         for(i = ione; i < n; i++)
00657           {
00658             if(ScalarTraits<ScalarType>::magnitude(x[ix]) > maxval)
00659               {
00660                 result = i;
00661                 maxval = ScalarTraits<ScalarType>::magnitude(x[ix]);
00662               }
00663             ix += incx;
00664           }
00665       }
00666     return result + 1; // the BLAS I?AMAX functions return 1-indexed (Fortran-style) values
00667   } /* end IAMAX */
00668 
00669 //------------------------------------------------------------------------------------------
00670 //      LEVEL 2 BLAS ROUTINES
00671 //------------------------------------------------------------------------------------------
00672   template<typename OrdinalType, typename ScalarType>
00673   template <typename alpha_type, typename A_type, typename x_type, typename beta_type>
00674   void DefaultBLASImpl<OrdinalType, ScalarType>::GEMV(ETransp trans, const OrdinalType m, const OrdinalType n, const alpha_type alpha, const A_type* A, const OrdinalType lda, const x_type* x, const OrdinalType incx, const beta_type beta, ScalarType* y, const OrdinalType incy) const
00675   {
00676     OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
00677     OrdinalType ione = OrdinalTraits<OrdinalType>::one();
00678     alpha_type alpha_zero = ScalarTraits<alpha_type>::zero();
00679     beta_type beta_zero = ScalarTraits<beta_type>::zero();
00680     x_type x_zero = ScalarTraits<x_type>::zero();
00681     ScalarType y_zero = ScalarTraits<ScalarType>::zero();
00682     beta_type beta_one = ScalarTraits<beta_type>::one();
00683     bool noConj = true;
00684     bool BadArgument = false;
00685 
00686     // Quick return if there is nothing to do!
00687     if( m == izero || n == izero || ( alpha == alpha_zero && beta == beta_one ) ){ return; }
00688 
00689     // Otherwise, we need to check the argument list.
00690     if( m < izero ) {
00691         std::cout << "BLAS::GEMV Error: M == " << m << std::endl;
00692         BadArgument = true;
00693     }
00694     if( n < izero ) {
00695         std::cout << "BLAS::GEMV Error: N == " << n << std::endl;
00696         BadArgument = true;
00697     }
00698     if( lda < m ) {
00699         std::cout << "BLAS::GEMV Error: LDA < MAX(1,M)"<< std::endl;
00700         BadArgument = true;
00701     }
00702     if( incx == izero ) {
00703         std::cout << "BLAS::GEMV Error: INCX == 0"<< std::endl;
00704         BadArgument = true;
00705     }
00706     if( incy == izero ) {
00707         std::cout << "BLAS::GEMV Error: INCY == 0"<< std::endl;
00708         BadArgument = true;
00709     }
00710 
00711     if(!BadArgument) {
00712       OrdinalType i, j, lenx, leny, ix, iy, jx, jy;
00713       OrdinalType kx = izero, ky = izero;
00714       ScalarType temp;
00715 
00716       // Determine the lengths of the vectors x and y.
00717       if(ETranspChar[trans] == 'N') {
00718         lenx = n;
00719         leny = m;
00720       } else {
00721         lenx = m;
00722         leny = n;
00723       }
00724 
00725       // Determine if this is a conjugate tranpose
00726       noConj = (ETranspChar[trans] == 'T');
00727 
00728       // Set the starting pointers for the vectors x and y if incx/y < 0.
00729       if (incx < izero ) { kx =  (ione - lenx)*incx; }
00730       if (incy < izero ) { ky =  (ione - leny)*incy; }
00731 
00732       // Form y = beta*y
00733       ix = kx; iy = ky;
00734       if(beta != beta_one) {
00735         if (incy == ione) {
00736           if (beta == beta_zero) {
00737             for(i = izero; i < leny; i++) { y[i] = y_zero; }
00738           } else {
00739             for(i = izero; i < leny; i++) { y[i] *= beta; }
00740           }
00741         } else {
00742           if (beta == beta_zero) {
00743             for(i = izero; i < leny; i++) {
00744               y[iy] = y_zero;
00745               iy += incy;
00746             }
00747           } else {
00748             for(i = izero; i < leny; i++) {
00749               y[iy] *= beta;
00750               iy += incy;
00751             }
00752           }
00753         }
00754       }
00755 
00756       // Return if we don't have to do anything more.
00757       if(alpha == alpha_zero) { return; }
00758 
00759       if( ETranspChar[trans] == 'N' ) {
00760         // Form y = alpha*A*y
00761         jx = kx;
00762         if (incy == ione) {
00763           for(j = izero; j < n; j++) {
00764             if (x[jx] != x_zero) {
00765               temp = alpha*x[jx];
00766               for(i = izero; i < m; i++) {
00767                 y[i] += temp*A[j*lda + i];
00768               }
00769             }
00770             jx += incx;
00771           }
00772         } else {
00773           for(j = izero; j < n; j++) {
00774             if (x[jx] != x_zero) {
00775               temp = alpha*x[jx];
00776               iy = ky;
00777               for(i = izero; i < m; i++) {
00778                 y[iy] += temp*A[j*lda + i];
00779                 iy += incy;
00780               }
00781             }
00782             jx += incx;
00783           }
00784         }
00785       } else {
00786         jy = ky;
00787         if (incx == ione) {
00788           for(j = izero; j < n; j++) {
00789             temp = y_zero;
00790             if ( noConj ) {
00791               for(i = izero; i < m; i++) {
00792                 temp += A[j*lda + i]*x[i];
00793               }
00794             } else {
00795               for(i = izero; i < m; i++) {
00796                 temp += ScalarTraits<A_type>::conjugate(A[j*lda + i])*x[i];
00797               }
00798             }
00799             y[jy] += alpha*temp;
00800             jy += incy;
00801           }
00802         } else {
00803           for(j = izero; j < n; j++) {
00804             temp = y_zero;
00805             ix = kx;
00806             if ( noConj ) {
00807               for (i = izero; i < m; i++) {
00808                 temp += A[j*lda + i]*x[ix];
00809                 ix += incx;
00810               }
00811             } else {
00812               for (i = izero; i < m; i++) {
00813                 temp += ScalarTraits<A_type>::conjugate(A[j*lda + i])*x[ix];
00814                 ix += incx;
00815               }
00816             }
00817             y[jy] += alpha*temp;
00818             jy += incy;
00819           }
00820         }
00821       }
00822     } /* if (!BadArgument) */
00823   } /* end GEMV */
00824 
00825  template<typename OrdinalType, typename ScalarType>
00826  template <typename A_type>
00827  void DefaultBLASImpl<OrdinalType, ScalarType>::TRMV(EUplo uplo, ETransp trans, EDiag diag, const OrdinalType n, const A_type* A, const OrdinalType lda, ScalarType* x, const OrdinalType incx) const
00828   {
00829     OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
00830     OrdinalType ione = OrdinalTraits<OrdinalType>::one();
00831     ScalarType zero = ScalarTraits<ScalarType>::zero();
00832     bool BadArgument = false;
00833     bool noConj = true;
00834 
00835     // Quick return if there is nothing to do!
00836     if( n == izero ){ return; }
00837 
00838     // Otherwise, we need to check the argument list.
00839     if( n < izero ) {
00840       std::cout << "BLAS::TRMV Error: N == " << n << std::endl;
00841       BadArgument = true;
00842     }
00843     if( lda < n ) {
00844       std::cout << "BLAS::TRMV Error: LDA < MAX(1,N)"<< std::endl;
00845       BadArgument = true;
00846     }
00847     if( incx == izero ) {
00848       std::cout << "BLAS::TRMV Error: INCX == 0"<< std::endl;
00849       BadArgument = true;
00850     }
00851 
00852     if(!BadArgument) {
00853       OrdinalType i, j, ix, jx, kx = izero;
00854       ScalarType temp;
00855       bool noUnit = (EDiagChar[diag] == 'N');
00856 
00857       // Determine if this is a conjugate tranpose
00858       noConj = (ETranspChar[trans] == 'T');
00859 
00860       // Set the starting pointer for the std::vector x if incx < 0.
00861       if (incx < izero) { kx = (-n+ione)*incx; }
00862 
00863       // Start the operations for a nontransposed triangular matrix
00864       if (ETranspChar[trans] == 'N') {
00865         /* Compute x = A*x */
00866         if (EUploChar[uplo] == 'U') {
00867           /* A is an upper triangular matrix */
00868           if (incx == ione) {
00869             for (j=izero; j<n; j++) {
00870               if (x[j] != zero) {
00871                 temp = x[j];
00872                 for (i=izero; i<j; i++) {
00873                   x[i] += temp*A[j*lda + i];
00874                 }
00875                 if ( noUnit )
00876                   x[j] *= A[j*lda + j];
00877               }
00878             }
00879           } else {
00880             jx = kx;
00881             for (j=izero; j<n; j++) {
00882               if (x[jx] != zero) {
00883                 temp = x[jx];
00884                 ix = kx;
00885                 for (i=izero; i<j; i++) {
00886                   x[ix] += temp*A[j*lda + i];
00887                   ix += incx;
00888                 }
00889                 if ( noUnit )
00890                   x[jx] *= A[j*lda + j];
00891               }
00892               jx += incx;
00893             }
00894           } /* if (incx == ione) */
00895         } else { /* A is a lower triangular matrix */
00896           if (incx == ione) {
00897             for (j=n-ione; j>-ione; j--) {
00898               if (x[j] != zero) {
00899                 temp = x[j];
00900                 for (i=n-ione; i>j; i--) {
00901                   x[i] += temp*A[j*lda + i];
00902                 }
00903                 if ( noUnit )
00904                   x[j] *= A[j*lda + j];
00905               }
00906             }
00907           } else {
00908             kx += (n-ione)*incx;
00909             jx = kx;
00910             for (j=n-ione; j>-ione; j--) {
00911               if (x[jx] != zero) {
00912                 temp = x[jx];
00913                 ix = kx;
00914                 for (i=n-ione; i>j; i--) {
00915                   x[ix] += temp*A[j*lda + i];
00916                   ix -= incx;
00917                 }
00918                 if ( noUnit )
00919                   x[jx] *= A[j*lda + j];
00920               }
00921               jx -= incx;
00922             }
00923           }
00924         } /* if (EUploChar[uplo]=='U') */
00925       } else { /* A is transposed/conjugated */
00926         /* Compute x = A'*x */
00927         if (EUploChar[uplo]=='U') {
00928           /* A is an upper triangular matrix */
00929           if (incx == ione) {
00930             for (j=n-ione; j>-ione; j--) {
00931               temp = x[j];
00932               if ( noConj ) {
00933                 if ( noUnit )
00934                   temp *= A[j*lda + j];
00935                 for (i=j-ione; i>-ione; i--) {
00936                   temp += A[j*lda + i]*x[i];
00937                 }
00938               } else {
00939                 if ( noUnit )
00940                   temp *= ScalarTraits<A_type>::conjugate(A[j*lda + j]);
00941                 for (i=j-ione; i>-ione; i--) {
00942                   temp += ScalarTraits<A_type>::conjugate(A[j*lda + i])*x[i];
00943                 }
00944               }
00945               x[j] = temp;
00946             }
00947           } else {
00948             jx = kx + (n-ione)*incx;
00949             for (j=n-ione; j>-ione; j--) {
00950               temp = x[jx];
00951               ix = jx;
00952               if ( noConj ) {
00953                 if ( noUnit )
00954                   temp *= A[j*lda + j];
00955                 for (i=j-ione; i>-ione; i--) {
00956                   ix -= incx;
00957                   temp += A[j*lda + i]*x[ix];
00958                 }
00959               } else {
00960                 if ( noUnit )
00961                   temp *= ScalarTraits<A_type>::conjugate(A[j*lda + j]);
00962                 for (i=j-ione; i>-ione; i--) {
00963                   ix -= incx;
00964                   temp += ScalarTraits<A_type>::conjugate(A[j*lda + i])*x[ix];
00965                 }
00966               }
00967               x[jx] = temp;
00968               jx -= incx;
00969             }
00970           }
00971         } else {
00972           /* A is a lower triangular matrix */
00973           if (incx == ione) {
00974             for (j=izero; j<n; j++) {
00975               temp = x[j];
00976               if ( noConj ) {
00977                 if ( noUnit )
00978                   temp *= A[j*lda + j];
00979                 for (i=j+ione; i<n; i++) {
00980                   temp += A[j*lda + i]*x[i];
00981                 }
00982               } else {
00983                 if ( noUnit )
00984                   temp *= ScalarTraits<A_type>::conjugate(A[j*lda + j]);
00985                 for (i=j+ione; i<n; i++) {
00986                   temp += ScalarTraits<A_type>::conjugate(A[j*lda + i])*x[i];
00987                 }
00988               }
00989               x[j] = temp;
00990             }
00991           } else {
00992             jx = kx;
00993             for (j=izero; j<n; j++) {
00994               temp = x[jx];
00995               ix = jx;
00996               if ( noConj ) {
00997                 if ( noUnit )
00998                   temp *= A[j*lda + j];
00999                 for (i=j+ione; i<n; i++) {
01000                   ix += incx;
01001                   temp += A[j*lda + i]*x[ix];
01002                 }
01003               } else {
01004                 if ( noUnit )
01005                   temp *= ScalarTraits<A_type>::conjugate(A[j*lda + j]);
01006                 for (i=j+ione; i<n; i++) {
01007                   ix += incx;
01008                   temp += ScalarTraits<A_type>::conjugate(A[j*lda + i])*x[ix];
01009                 }
01010               }
01011               x[jx] = temp;
01012               jx += incx;
01013             }
01014           }
01015         } /* if (EUploChar[uplo]=='U') */
01016       } /* if (ETranspChar[trans]=='N') */
01017     } /* if (!BadArgument) */
01018   } /* end TRMV */
01019 
01020   template<typename OrdinalType, typename ScalarType>
01021   template <typename alpha_type, typename x_type, typename y_type>
01022   void DefaultBLASImpl<OrdinalType, ScalarType>::GER(const OrdinalType m, const OrdinalType n, const alpha_type alpha, const x_type* x, const OrdinalType incx, const y_type* y, const OrdinalType incy, ScalarType* A, const OrdinalType lda) const
01023   {
01024     OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
01025     OrdinalType ione = OrdinalTraits<OrdinalType>::one();
01026     alpha_type alpha_zero = ScalarTraits<alpha_type>::zero();
01027     y_type y_zero = ScalarTraits<y_type>::zero();
01028     bool BadArgument = false;
01029 
01030     TEUCHOS_TEST_FOR_EXCEPTION(Teuchos::ScalarTraits<ScalarType>::isComplex, std::logic_error,
01031             "Teuchos::BLAS::GER() does not currently support complex data types.");
01032 
01033     // Quick return if there is nothing to do!
01034     if( m == izero || n == izero || alpha == alpha_zero ){ return; }
01035 
01036     // Otherwise, we need to check the argument list.
01037     if( m < izero ) {
01038         std::cout << "BLAS::GER Error: M == " << m << std::endl;
01039         BadArgument = true;
01040     }
01041     if( n < izero ) {
01042         std::cout << "BLAS::GER Error: N == " << n << std::endl;
01043         BadArgument = true;
01044     }
01045     if( lda < m ) {
01046         std::cout << "BLAS::GER Error: LDA < MAX(1,M)"<< std::endl;
01047         BadArgument = true;
01048     }
01049     if( incx == 0 ) {
01050         std::cout << "BLAS::GER Error: INCX == 0"<< std::endl;
01051         BadArgument = true;
01052     }
01053     if( incy == 0 ) {
01054         std::cout << "BLAS::GER Error: INCY == 0"<< std::endl;
01055         BadArgument = true;
01056     }
01057 
01058     if(!BadArgument) {
01059       OrdinalType i, j, ix, jy = izero, kx = izero;
01060       ScalarType temp;
01061 
01062       // Set the starting pointers for the vectors x and y if incx/y < 0.
01063       if (incx < izero) { kx = (-m+ione)*incx; }
01064       if (incy < izero) { jy = (-n+ione)*incy; }
01065 
01066       // Start the operations for incx == 1
01067       if( incx == ione ) {
01068         for( j=izero; j<n; j++ ) {
01069           if ( y[jy] != y_zero ) {
01070             temp = alpha*y[jy];
01071             for ( i=izero; i<m; i++ ) {
01072               A[j*lda + i] += x[i]*temp;
01073             }
01074           }
01075           jy += incy;
01076         }
01077       }
01078       else { // Start the operations for incx != 1
01079         for( j=izero; j<n; j++ ) {
01080           if ( y[jy] != y_zero ) {
01081             temp = alpha*y[jy];
01082             ix = kx;
01083             for( i=izero; i<m; i++ ) {
01084               A[j*lda + i] += x[ix]*temp;
01085               ix += incx;
01086             }
01087           }
01088           jy += incy;
01089         }
01090       }
01091     } /* if(!BadArgument) */
01092   } /* end GER */
01093 
01094 //------------------------------------------------------------------------------------------
01095 //      LEVEL 3 BLAS ROUTINES
01096 //------------------------------------------------------------------------------------------
01097 
01098   template<typename OrdinalType, typename ScalarType>
01099   template <typename alpha_type, typename A_type, typename B_type, typename beta_type>
01100   void DefaultBLASImpl<OrdinalType, ScalarType>::GEMM(ETransp transa, ETransp transb, const OrdinalType m, const OrdinalType n, const OrdinalType k, const alpha_type alpha, const A_type* A, const OrdinalType lda, const B_type* B, const OrdinalType ldb, const beta_type beta, ScalarType* C, const OrdinalType ldc) const
01101   {
01102 
01103     typedef TypeNameTraits<OrdinalType> OTNT;
01104     typedef TypeNameTraits<ScalarType> STNT;
01105 
01106     OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
01107     alpha_type alpha_zero = ScalarTraits<alpha_type>::zero();
01108     beta_type beta_zero = ScalarTraits<beta_type>::zero();
01109     B_type B_zero = ScalarTraits<B_type>::zero();
01110     ScalarType C_zero = ScalarTraits<ScalarType>::zero();
01111     beta_type beta_one = ScalarTraits<beta_type>::one();
01112     OrdinalType i, j, p;
01113     OrdinalType NRowA = m, NRowB = k;
01114     ScalarType temp;
01115     bool BadArgument = false;
01116     bool conjA = false, conjB = false;
01117 
01118     // Change dimensions of matrix if either matrix is transposed
01119     if( !(ETranspChar[transa]=='N') ) {
01120       NRowA = k;
01121     }
01122     if( !(ETranspChar[transb]=='N') ) {
01123       NRowB = n;
01124     }
01125 
01126     // Quick return if there is nothing to do!
01127     if( (m==izero) || (n==izero) || (((alpha==alpha_zero)||(k==izero)) && (beta==beta_one)) ){ return; }
01128     if( m < izero ) {
01129       std::cout << "BLAS::GEMM Error: M == " << m << std::endl;
01130       BadArgument = true;
01131     }
01132     if( n < izero ) {
01133       std::cout << "BLAS::GEMM Error: N == " << n << std::endl;
01134       BadArgument = true;
01135     }
01136     if( k < izero ) {
01137       std::cout << "BLAS::GEMM Error: K == " << k << std::endl;
01138       BadArgument = true;
01139     }
01140     if( lda < NRowA ) {
01141       std::cout << "BLAS::GEMM Error: LDA < "<<NRowA<<std::endl;
01142       BadArgument = true;
01143     }
01144     if( ldb < NRowB ) {
01145       std::cout << "BLAS::GEMM Error: LDB < "<<NRowB<<std::endl;
01146       BadArgument = true;
01147     }
01148      if( ldc < m ) {
01149       std::cout << "BLAS::GEMM Error: LDC < MAX(1,M)"<< std::endl;
01150       BadArgument = true;
01151     }
01152 
01153     if(!BadArgument) {
01154 
01155       // Determine if this is a conjugate tranpose
01156       conjA = (ETranspChar[transa] == 'C');
01157       conjB = (ETranspChar[transb] == 'C');
01158 
01159       // Only need to scale the resulting matrix C.
01160       if( alpha == alpha_zero ) {
01161         if( beta == beta_zero ) {
01162           for (j=izero; j<n; j++) {
01163             for (i=izero; i<m; i++) {
01164               C[j*ldc + i] = C_zero;
01165             }
01166           }
01167         } else {
01168           for (j=izero; j<n; j++) {
01169             for (i=izero; i<m; i++) {
01170               C[j*ldc + i] *= beta;
01171             }
01172           }
01173         }
01174         return;
01175       }
01176       //
01177       // Now start the operations.
01178       //
01179       if ( ETranspChar[transb]=='N' ) {
01180         if ( ETranspChar[transa]=='N' ) {
01181           // Compute C = alpha*A*B + beta*C
01182           for (j=izero; j<n; j++) {
01183             if( beta == beta_zero ) {
01184               for (i=izero; i<m; i++){
01185                 C[j*ldc + i] = C_zero;
01186               }
01187             } else if( beta != beta_one ) {
01188               for (i=izero; i<m; i++){
01189                 C[j*ldc + i] *= beta;
01190               }
01191             }
01192             for (p=izero; p<k; p++){
01193               if (B[j*ldb + p] != B_zero ){
01194                 temp = alpha*B[j*ldb + p];
01195                 for (i=izero; i<m; i++) {
01196                   C[j*ldc + i] += temp*A[p*lda + i];
01197                 }
01198               }
01199             }
01200           }
01201         } else if ( conjA ) {
01202           // Compute C = alpha*conj(A')*B + beta*C
01203           for (j=izero; j<n; j++) {
01204             for (i=izero; i<m; i++) {
01205               temp = C_zero;
01206               for (p=izero; p<k; p++) {
01207                 temp += ScalarTraits<A_type>::conjugate(A[i*lda + p])*B[j*ldb + p];
01208               }
01209               if (beta == beta_zero) {
01210                 C[j*ldc + i] = alpha*temp;
01211               } else {
01212                 C[j*ldc + i] = alpha*temp + beta*C[j*ldc + i];
01213               }
01214             }
01215           }
01216         } else {
01217           // Compute C = alpha*A'*B + beta*C
01218           for (j=izero; j<n; j++) {
01219             for (i=izero; i<m; i++) {
01220               temp = C_zero;
01221               for (p=izero; p<k; p++) {
01222                 temp += A[i*lda + p]*B[j*ldb + p];
01223               }
01224               if (beta == beta_zero) {
01225                 C[j*ldc + i] = alpha*temp;
01226               } else {
01227                 C[j*ldc + i] = alpha*temp + beta*C[j*ldc + i];
01228               }
01229             }
01230           }
01231         }
01232       } else if ( ETranspChar[transa]=='N' ) {
01233         if ( conjB ) {
01234           // Compute C = alpha*A*conj(B') + beta*C
01235           for (j=izero; j<n; j++) {
01236             if (beta == beta_zero) {
01237               for (i=izero; i<m; i++) {
01238                 C[j*ldc + i] = C_zero;
01239               }
01240             } else if ( beta != beta_one ) {
01241               for (i=izero; i<m; i++) {
01242                 C[j*ldc + i] *= beta;
01243               }
01244             }
01245             for (p=izero; p<k; p++) {
01246               if (B[p*ldb + j] != B_zero) {
01247                 temp = alpha*ScalarTraits<B_type>::conjugate(B[p*ldb + j]);
01248                 for (i=izero; i<m; i++) {
01249                   C[j*ldc + i] += temp*A[p*lda + i];
01250                 }
01251               }
01252             }
01253           }
01254         } else {
01255           // Compute C = alpha*A*B' + beta*C
01256           for (j=izero; j<n; j++) {
01257             if (beta == beta_zero) {
01258               for (i=izero; i<m; i++) {
01259                 C[j*ldc + i] = C_zero;
01260               }
01261             } else if ( beta != beta_one ) {
01262               for (i=izero; i<m; i++) {
01263                 C[j*ldc + i] *= beta;
01264               }
01265             }
01266             for (p=izero; p<k; p++) {
01267               if (B[p*ldb + j] != B_zero) {
01268                 temp = alpha*B[p*ldb + j];
01269                 for (i=izero; i<m; i++) {
01270                   C[j*ldc + i] += temp*A[p*lda + i];
01271                 }
01272               }
01273             }
01274           }
01275         }
01276       } else if ( conjA ) {
01277         if ( conjB ) {
01278           // Compute C = alpha*conj(A')*conj(B') + beta*C
01279           for (j=izero; j<n; j++) {
01280             for (i=izero; i<m; i++) {
01281               temp = C_zero;
01282               for (p=izero; p<k; p++) {
01283                 temp += ScalarTraits<A_type>::conjugate(A[i*lda + p])
01284                       * ScalarTraits<B_type>::conjugate(B[p*ldb + j]);
01285               }
01286               if (beta == beta_zero) {
01287                 C[j*ldc + i] = alpha*temp;
01288               } else {
01289                 C[j*ldc + i] = alpha*temp + beta*C[j*ldc + i];
01290               }
01291             }
01292           }
01293         } else {
01294           // Compute C = alpha*conj(A')*B' + beta*C
01295           for (j=izero; j<n; j++) {
01296             for (i=izero; i<m; i++) {
01297               temp = C_zero;
01298               for (p=izero; p<k; p++) {
01299                 temp += ScalarTraits<A_type>::conjugate(A[i*lda + p])
01300                       * B[p*ldb + j];
01301               }
01302               if (beta == beta_zero) {
01303                 C[j*ldc + i] = alpha*temp;
01304               } else {
01305                 C[j*ldc + i] = alpha*temp + beta*C[j*ldc + i];
01306               }
01307             }
01308           }
01309         }
01310      } else {
01311        if ( conjB ) {
01312          // Compute C = alpha*A'*conj(B') + beta*C
01313          for (j=izero; j<n; j++) {
01314             for (i=izero; i<m; i++) {
01315               temp = C_zero;
01316               for (p=izero; p<k; p++) {
01317                 temp += A[i*lda + p]
01318                       * ScalarTraits<B_type>::conjugate(B[p*ldb + j]);
01319               }
01320               if (beta == beta_zero) {
01321                 C[j*ldc + i] = alpha*temp;
01322               } else {
01323                 C[j*ldc + i] = alpha*temp + beta*C[j*ldc + i];
01324               }
01325             }
01326           }
01327         } else {
01328           // Compute C = alpha*A'*B' + beta*C
01329           for (j=izero; j<n; j++) {
01330             for (i=izero; i<m; i++) {
01331               temp = C_zero;
01332               for (p=izero; p<k; p++) {
01333                 temp += A[i*lda + p]*B[p*ldb + j];
01334               }
01335               if (beta == beta_zero) {
01336                 C[j*ldc + i] = alpha*temp;
01337               } else {
01338                 C[j*ldc + i] = alpha*temp + beta*C[j*ldc + i];
01339               }
01340             }
01341           }
01342         } // end if (ETranspChar[transa]=='N') ...
01343       } // end if (ETranspChar[transb]=='N') ...
01344     } // end if (!BadArgument) ...
01345   } // end of GEMM
01346 
01347 
01348   template<typename OrdinalType, typename ScalarType>
01349   template <typename alpha_type, typename A_type, typename B_type, typename beta_type>
01350   void DefaultBLASImpl<OrdinalType, ScalarType>::SYMM(ESide side, EUplo uplo, const OrdinalType m, const OrdinalType n, const alpha_type alpha, const A_type* A, const OrdinalType lda, const B_type* B, const OrdinalType ldb, const beta_type beta, ScalarType* C, const OrdinalType ldc) const
01351   {
01352     OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
01353     OrdinalType ione = OrdinalTraits<OrdinalType>::one();
01354     alpha_type alpha_zero = ScalarTraits<alpha_type>::zero();
01355     beta_type beta_zero = ScalarTraits<beta_type>::zero();
01356     ScalarType C_zero = ScalarTraits<ScalarType>::zero();
01357     beta_type beta_one = ScalarTraits<beta_type>::one();
01358     OrdinalType i, j, k, NRowA = m;
01359     ScalarType temp1, temp2;
01360     bool BadArgument = false;
01361     bool Upper = (EUploChar[uplo] == 'U');
01362     if (ESideChar[side] == 'R') { NRowA = n; }
01363 
01364     // Quick return.
01365     if ( (m==izero) || (n==izero) || ( (alpha==alpha_zero)&&(beta==beta_one) ) ) { return; }
01366     if( m < izero ) {
01367       std::cout << "BLAS::SYMM Error: M == "<< m << std::endl;
01368       BadArgument = true; }
01369     if( n < izero ) {
01370       std::cout << "BLAS::SYMM Error: N == "<< n << std::endl;
01371       BadArgument = true; }
01372     if( lda < NRowA ) {
01373       std::cout << "BLAS::SYMM Error: LDA < "<<NRowA<<std::endl;
01374       BadArgument = true; }
01375     if( ldb < m ) {
01376       std::cout << "BLAS::SYMM Error: LDB < MAX(1,M)"<<std::endl;
01377       BadArgument = true; }
01378     if( ldc < m ) {
01379       std::cout << "BLAS::SYMM Error: LDC < MAX(1,M)"<<std::endl;
01380       BadArgument = true; }
01381 
01382     if(!BadArgument) {
01383 
01384       // Only need to scale C and return.
01385       if (alpha == alpha_zero) {
01386         if (beta == beta_zero ) {
01387           for (j=izero; j<n; j++) {
01388             for (i=izero; i<m; i++) {
01389               C[j*ldc + i] = C_zero;
01390             }
01391           }
01392         } else {
01393           for (j=izero; j<n; j++) {
01394             for (i=izero; i<m; i++) {
01395               C[j*ldc + i] *= beta;
01396             }
01397           }
01398         }
01399         return;
01400       }
01401 
01402       if ( ESideChar[side] == 'L') {
01403         // Compute C = alpha*A*B + beta*C
01404 
01405         if (Upper) {
01406           // The symmetric part of A is stored in the upper triangular part of the matrix.
01407           for (j=izero; j<n; j++) {
01408             for (i=izero; i<m; i++) {
01409               temp1 = alpha*B[j*ldb + i];
01410               temp2 = C_zero;
01411               for (k=izero; k<i; k++) {
01412                 C[j*ldc + k] += temp1*A[i*lda + k];
01413                 temp2 += B[j*ldb + k]*A[i*lda + k];
01414               }
01415               if (beta == beta_zero) {
01416                 C[j*ldc + i] = temp1*A[i*lda + i] + alpha*temp2;
01417               } else {
01418                 C[j*ldc + i] = beta*C[j*ldc + i] + temp1*A[i*lda + i] + alpha*temp2;
01419               }
01420             }
01421           }
01422         } else {
01423           // The symmetric part of A is stored in the lower triangular part of the matrix.
01424           for (j=izero; j<n; j++) {
01425             for (i=m-ione; i>-ione; i--) {
01426               temp1 = alpha*B[j*ldb + i];
01427               temp2 = C_zero;
01428               for (k=i+ione; k<m; k++) {
01429                 C[j*ldc + k] += temp1*A[i*lda + k];
01430                 temp2 += B[j*ldb + k]*A[i*lda + k];
01431               }
01432               if (beta == beta_zero) {
01433                 C[j*ldc + i] = temp1*A[i*lda + i] + alpha*temp2;
01434               } else {
01435                 C[j*ldc + i] = beta*C[j*ldc + i] + temp1*A[i*lda + i] + alpha*temp2;
01436               }
01437             }
01438           }
01439         }
01440       } else {
01441         // Compute C = alpha*B*A + beta*C.
01442         for (j=izero; j<n; j++) {
01443           temp1 = alpha*A[j*lda + j];
01444           if (beta == beta_zero) {
01445             for (i=izero; i<m; i++) {
01446               C[j*ldc + i] = temp1*B[j*ldb + i];
01447             }
01448           } else {
01449             for (i=izero; i<m; i++) {
01450               C[j*ldc + i] = beta*C[j*ldc + i] + temp1*B[j*ldb + i];
01451             }
01452           }
01453           for (k=izero; k<j; k++) {
01454             if (Upper) {
01455               temp1 = alpha*A[j*lda + k];
01456             } else {
01457               temp1 = alpha*A[k*lda + j];
01458             }
01459             for (i=izero; i<m; i++) {
01460               C[j*ldc + i] += temp1*B[k*ldb + i];
01461             }
01462           }
01463           for (k=j+ione; k<n; k++) {
01464             if (Upper) {
01465               temp1 = alpha*A[k*lda + j];
01466             } else {
01467               temp1 = alpha*A[j*lda + k];
01468             }
01469             for (i=izero; i<m; i++) {
01470               C[j*ldc + i] += temp1*B[k*ldb + i];
01471             }
01472           }
01473         }
01474       } // end if (ESideChar[side]=='L') ...
01475     } // end if(!BadArgument) ...
01476 } // end SYMM
01477 
01478   template<typename OrdinalType, typename ScalarType>
01479   template <typename alpha_type, typename A_type, typename beta_type>
01480   void DefaultBLASImpl<OrdinalType, ScalarType>::SYRK(EUplo uplo, ETransp trans, const OrdinalType n, const OrdinalType k, const alpha_type alpha, const A_type* A, const OrdinalType lda, const beta_type beta, ScalarType* C, const OrdinalType ldc) const
01481   {
01482     typedef TypeNameTraits<OrdinalType> OTNT;
01483     typedef TypeNameTraits<ScalarType> STNT;
01484 
01485     OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
01486     alpha_type alpha_zero = ScalarTraits<alpha_type>::zero();
01487     beta_type beta_zero = ScalarTraits<beta_type>::zero();
01488     beta_type beta_one = ScalarTraits<beta_type>::one();
01489     A_type temp, A_zero = ScalarTraits<A_type>::zero();
01490     ScalarType C_zero = ScalarTraits<ScalarType>::zero();
01491     OrdinalType i, j, l, NRowA = n;
01492     bool BadArgument = false;
01493     bool Upper = (EUploChar[uplo] == 'U');
01494 
01495     TEUCHOS_TEST_FOR_EXCEPTION(
01496       Teuchos::ScalarTraits<ScalarType>::isComplex
01497       && (trans == CONJ_TRANS),
01498       std::logic_error,
01499             "Teuchos::BLAS<"<<OTNT::name()<<","<<STNT::name()<<">::SYRK()"
01500             " does not support CONJ_TRANS for complex data types."
01501       );
01502 
01503     // Change dimensions of A matrix is transposed
01504     if( !(ETranspChar[trans]=='N') ) {
01505       NRowA = k;
01506     }
01507 
01508     // Quick return.
01509     if ( n==izero ) { return; }
01510     if ( ( (alpha==alpha_zero) || (k==izero) )  && (beta==beta_one) ) { return; }
01511     if( n < izero ) {
01512       std::cout << "BLAS::SYRK Error: N == "<< n <<std::endl;
01513       BadArgument = true; }
01514     if( k < izero ) {
01515       std::cout << "BLAS::SYRK Error: K == "<< k <<std::endl;
01516       BadArgument = true; }
01517     if( lda < NRowA ) {
01518       std::cout << "BLAS::SYRK Error: LDA < "<<NRowA<<std::endl;
01519       BadArgument = true; }
01520     if( ldc < n ) {
01521       std::cout << "BLAS::SYRK Error: LDC < MAX(1,N)"<<std::endl;
01522       BadArgument = true; }
01523 
01524     if(!BadArgument) {
01525 
01526       // Scale C when alpha is zero
01527       if (alpha == alpha_zero) {
01528         if (Upper) {
01529           if (beta==beta_zero) {
01530             for (j=izero; j<n; j++) {
01531               for (i=izero; i<=j; i++) {
01532                 C[j*ldc + i] = C_zero;
01533               }
01534             }
01535           }
01536           else {
01537             for (j=izero; j<n; j++) {
01538               for (i=izero; i<=j; i++) {
01539                 C[j*ldc + i] *= beta;
01540               }
01541             }
01542           }
01543         }
01544         else {
01545           if (beta==beta_zero) {
01546             for (j=izero; j<n; j++) {
01547               for (i=j; i<n; i++) {
01548                 C[j*ldc + i] = C_zero;
01549               }
01550             }
01551           }
01552           else {
01553             for (j=izero; j<n; j++) {
01554               for (i=j; i<n; i++) {
01555                 C[j*ldc + i] *= beta;
01556               }
01557             }
01558           }
01559         }
01560         return;
01561       }
01562 
01563       // Now we can start the computation
01564 
01565       if ( ETranspChar[trans]=='N' ) {
01566 
01567         // Form C <- alpha*A*A' + beta*C
01568         if (Upper) {
01569           for (j=izero; j<n; j++) {
01570             if (beta==beta_zero) {
01571               for (i=izero; i<=j; i++) {
01572                 C[j*ldc + i] = C_zero;
01573               }
01574             }
01575             else if (beta!=beta_one) {
01576               for (i=izero; i<=j; i++) {
01577                 C[j*ldc + i] *= beta;
01578               }
01579             }
01580             for (l=izero; l<k; l++) {
01581               if (A[l*lda + j] != A_zero) {
01582                 temp = alpha*A[l*lda + j];
01583                 for (i = izero; i <=j; i++) {
01584                   C[j*ldc + i] += temp*A[l*lda + i];
01585                 }
01586               }
01587             }
01588           }
01589         }
01590         else {
01591           for (j=izero; j<n; j++) {
01592             if (beta==beta_zero) {
01593               for (i=j; i<n; i++) {
01594                 C[j*ldc + i] = C_zero;
01595               }
01596             }
01597             else if (beta!=beta_one) {
01598               for (i=j; i<n; i++) {
01599                 C[j*ldc + i] *= beta;
01600               }
01601             }
01602             for (l=izero; l<k; l++) {
01603               if (A[l*lda + j] != A_zero) {
01604                 temp = alpha*A[l*lda + j];
01605                 for (i=j; i<n; i++) {
01606                   C[j*ldc + i] += temp*A[l*lda + i];
01607                 }
01608               }
01609             }
01610           }
01611         }
01612       }
01613       else {
01614 
01615         // Form C <- alpha*A'*A + beta*C
01616         if (Upper) {
01617           for (j=izero; j<n; j++) {
01618             for (i=izero; i<=j; i++) {
01619               temp = A_zero;
01620               for (l=izero; l<k; l++) {
01621                 temp += A[i*lda + l]*A[j*lda + l];
01622               }
01623               if (beta==beta_zero) {
01624                 C[j*ldc + i] = alpha*temp;
01625               }
01626               else {
01627                 C[j*ldc + i] = alpha*temp + beta*C[j*ldc + i];
01628               }
01629             }
01630           }
01631         }
01632         else {
01633           for (j=izero; j<n; j++) {
01634             for (i=j; i<n; i++) {
01635               temp = A_zero;
01636               for (l=izero; l<k; ++l) {
01637                 temp += A[i*lda + l]*A[j*lda + l];
01638               }
01639               if (beta==beta_zero) {
01640                 C[j*ldc + i] = alpha*temp;
01641               }
01642               else {
01643                 C[j*ldc + i] = alpha*temp + beta*C[j*ldc + i];
01644               }
01645             }
01646           }
01647         }
01648       }
01649     } /* if (!BadArgument) */
01650   } /* END SYRK */
01651 
01652   template<typename OrdinalType, typename ScalarType>
01653   template <typename alpha_type, typename A_type>
01654   void DefaultBLASImpl<OrdinalType, ScalarType>::TRMM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const OrdinalType m, const OrdinalType n, const alpha_type alpha, const A_type* A, const OrdinalType lda, ScalarType* B, const OrdinalType ldb) const
01655   {
01656     OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
01657     OrdinalType ione = OrdinalTraits<OrdinalType>::one();
01658     alpha_type alpha_zero = ScalarTraits<alpha_type>::zero();
01659     A_type A_zero = ScalarTraits<A_type>::zero();
01660     ScalarType B_zero = ScalarTraits<ScalarType>::zero();
01661     ScalarType one = ScalarTraits<ScalarType>::one();
01662     OrdinalType i, j, k, NRowA = m;
01663     ScalarType temp;
01664     bool BadArgument = false;
01665     bool LSide = (ESideChar[side] == 'L');
01666     bool noUnit = (EDiagChar[diag] == 'N');
01667     bool Upper = (EUploChar[uplo] == 'U');
01668     bool noConj = (EUploChar[transa] == 'T');
01669 
01670     if(!LSide) { NRowA = n; }
01671 
01672     // Quick return.
01673     if (n==izero || m==izero) { return; }
01674     if( m < izero ) {
01675       std::cout << "BLAS::TRMM Error: M == "<< m <<std::endl;
01676       BadArgument = true; }
01677     if( n < izero ) {
01678       std::cout << "BLAS::TRMM Error: N == "<< n <<std::endl;
01679       BadArgument = true; }
01680     if( lda < NRowA ) {
01681       std::cout << "BLAS::TRMM Error: LDA < "<<NRowA<<std::endl;
01682       BadArgument = true; }
01683     if( ldb < m ) {
01684       std::cout << "BLAS::TRMM Error: LDB < MAX(1,M)"<<std::endl;
01685       BadArgument = true; }
01686 
01687     if(!BadArgument) {
01688 
01689       // B only needs to be zeroed out.
01690       if( alpha == alpha_zero ) {
01691         for( j=izero; j<n; j++ ) {
01692           for( i=izero; i<m; i++ ) {
01693             B[j*ldb + i] = B_zero;
01694           }
01695         }
01696         return;
01697       }
01698 
01699       //  Start the computations.
01700       if ( LSide ) {
01701         // A is on the left side of B.
01702 
01703         if ( ETranspChar[transa]=='N' ) {
01704           // Compute B = alpha*A*B
01705 
01706           if ( Upper ) {
01707             // A is upper triangular
01708             for( j=izero; j<n; j++ ) {
01709               for( k=izero; k<m; k++) {
01710                 if ( B[j*ldb + k] != B_zero ) {
01711                   temp = alpha*B[j*ldb + k];
01712                   for( i=izero; i<k; i++ ) {
01713                     B[j*ldb + i] += temp*A[k*lda + i];
01714                   }
01715                   if ( noUnit )
01716                     temp *=A[k*lda + k];
01717                   B[j*ldb + k] = temp;
01718                 }
01719               }
01720             }
01721           } else {
01722             // A is lower triangular
01723             for( j=izero; j<n; j++ ) {
01724               for( k=m-ione; k>-ione; k-- ) {
01725                 if( B[j*ldb + k] != B_zero ) {
01726                   temp = alpha*B[j*ldb + k];
01727                   B[j*ldb + k] = temp;
01728                   if ( noUnit )
01729                     B[j*ldb + k] *= A[k*lda + k];
01730                   for( i=k+ione; i<m; i++ ) {
01731                     B[j*ldb + i] += temp*A[k*lda + i];
01732                   }
01733                 }
01734               }
01735             }
01736           }
01737         } else {
01738           // Compute B = alpha*A'*B or B = alpha*conj(A')*B
01739           if( Upper ) {
01740             for( j=izero; j<n; j++ ) {
01741               for( i=m-ione; i>-ione; i-- ) {
01742                 temp = B[j*ldb + i];
01743                 if ( noConj ) {
01744                   if( noUnit )
01745                     temp *= A[i*lda + i];
01746                   for( k=izero; k<i; k++ ) {
01747                     temp += A[i*lda + k]*B[j*ldb + k];
01748                   }
01749                 } else {
01750                   if( noUnit )
01751                     temp *= ScalarTraits<A_type>::conjugate(A[i*lda + i]);
01752                   for( k=izero; k<i; k++ ) {
01753                     temp += ScalarTraits<A_type>::conjugate(A[i*lda + k])*B[j*ldb + k];
01754                   }
01755                 }
01756                 B[j*ldb + i] = alpha*temp;
01757               }
01758             }
01759           } else {
01760             for( j=izero; j<n; j++ ) {
01761               for( i=izero; i<m; i++ ) {
01762                 temp = B[j*ldb + i];
01763                 if ( noConj ) {
01764                   if( noUnit )
01765                     temp *= A[i*lda + i];
01766                   for( k=i+ione; k<m; k++ ) {
01767                     temp += A[i*lda + k]*B[j*ldb + k];
01768                   }
01769                 } else {
01770                   if( noUnit )
01771                     temp *= ScalarTraits<A_type>::conjugate(A[i*lda + i]);
01772                   for( k=i+ione; k<m; k++ ) {
01773                     temp += ScalarTraits<A_type>::conjugate(A[i*lda + k])*B[j*ldb + k];
01774                   }
01775                 }
01776                 B[j*ldb + i] = alpha*temp;
01777               }
01778             }
01779           }
01780         }
01781       } else {
01782         // A is on the right hand side of B.
01783 
01784         if( ETranspChar[transa] == 'N' ) {
01785           // Compute B = alpha*B*A
01786 
01787           if( Upper ) {
01788             // A is upper triangular.
01789             for( j=n-ione; j>-ione; j-- ) {
01790               temp = alpha;
01791               if( noUnit )
01792                 temp *= A[j*lda + j];
01793               for( i=izero; i<m; i++ ) {
01794                 B[j*ldb + i] *= temp;
01795               }
01796               for( k=izero; k<j; k++ ) {
01797                 if( A[j*lda + k] != A_zero ) {
01798                   temp = alpha*A[j*lda + k];
01799                   for( i=izero; i<m; i++ ) {
01800                     B[j*ldb + i] += temp*B[k*ldb + i];
01801                   }
01802                 }
01803               }
01804             }
01805           } else {
01806             // A is lower triangular.
01807             for( j=izero; j<n; j++ ) {
01808               temp = alpha;
01809               if( noUnit )
01810                 temp *= A[j*lda + j];
01811               for( i=izero; i<m; i++ ) {
01812                 B[j*ldb + i] *= temp;
01813               }
01814               for( k=j+ione; k<n; k++ ) {
01815                 if( A[j*lda + k] != A_zero ) {
01816                   temp = alpha*A[j*lda + k];
01817                   for( i=izero; i<m; i++ ) {
01818                     B[j*ldb + i] += temp*B[k*ldb + i];
01819                   }
01820                 }
01821               }
01822             }
01823           }
01824         } else {
01825           // Compute B = alpha*B*A' or B = alpha*B*conj(A')
01826 
01827           if( Upper ) {
01828             for( k=izero; k<n; k++ ) {
01829               for( j=izero; j<k; j++ ) {
01830                 if( A[k*lda + j] != A_zero ) {
01831                   if ( noConj )
01832                     temp = alpha*A[k*lda + j];
01833                   else
01834                     temp = alpha*ScalarTraits<A_type>::conjugate(A[k*lda + j]);
01835                   for( i=izero; i<m; i++ ) {
01836                     B[j*ldb + i] += temp*B[k*ldb + i];
01837                   }
01838                 }
01839               }
01840               temp = alpha;
01841               if( noUnit ) {
01842                 if ( noConj )
01843                   temp *= A[k*lda + k];
01844                 else
01845                   temp *= ScalarTraits<A_type>::conjugate(A[k*lda + k]);
01846               }
01847               if( temp != one ) {
01848                 for( i=izero; i<m; i++ ) {
01849                   B[k*ldb + i] *= temp;
01850                 }
01851               }
01852             }
01853           } else {
01854             for( k=n-ione; k>-ione; k-- ) {
01855               for( j=k+ione; j<n; j++ ) {
01856                 if( A[k*lda + j] != A_zero ) {
01857                   if ( noConj )
01858                     temp = alpha*A[k*lda + j];
01859                   else
01860                     temp = alpha*ScalarTraits<A_type>::conjugate(A[k*lda + j]);
01861                   for( i=izero; i<m; i++ ) {
01862                     B[j*ldb + i] += temp*B[k*ldb + i];
01863                   }
01864                 }
01865               }
01866               temp = alpha;
01867               if( noUnit ) {
01868                 if ( noConj )
01869                   temp *= A[k*lda + k];
01870                 else
01871                   temp *= ScalarTraits<A_type>::conjugate(A[k*lda + k]);
01872               }
01873               if( temp != one ) {
01874                 for( i=izero; i<m; i++ ) {
01875                   B[k*ldb + i] *= temp;
01876                 }
01877               }
01878             }
01879           }
01880         } // end if( ETranspChar[transa] == 'N' ) ...
01881       } // end if ( LSide ) ...
01882     } // end if (!BadArgument)
01883   } // end TRMM
01884 
01885   template<typename OrdinalType, typename ScalarType>
01886   template <typename alpha_type, typename A_type>
01887   void DefaultBLASImpl<OrdinalType, ScalarType>::TRSM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const OrdinalType m, const OrdinalType n, const alpha_type alpha, const A_type* A, const OrdinalType lda, ScalarType* B, const OrdinalType ldb) const
01888   {
01889     OrdinalType izero = OrdinalTraits<OrdinalType>::zero();
01890     OrdinalType ione = OrdinalTraits<OrdinalType>::one();
01891     alpha_type alpha_zero = ScalarTraits<alpha_type>::zero();
01892     A_type A_zero = ScalarTraits<A_type>::zero();
01893     ScalarType B_zero = ScalarTraits<ScalarType>::zero();
01894     alpha_type alpha_one = ScalarTraits<alpha_type>::one();
01895     ScalarType B_one = ScalarTraits<ScalarType>::one();
01896     ScalarType temp;
01897     OrdinalType NRowA = m;
01898     bool BadArgument = false;
01899     bool noUnit = (EDiagChar[diag]=='N');
01900     bool noConj = (EUploChar[transa] == 'T');
01901 
01902     if (!(ESideChar[side] == 'L')) { NRowA = n; }
01903 
01904     // Quick return.
01905     if (n == izero || m == izero) { return; }
01906     if( m < izero ) {
01907       std::cout << "BLAS::TRSM Error: M == "<<m<<std::endl;
01908       BadArgument = true; }
01909     if( n < izero ) {
01910       std::cout << "BLAS::TRSM Error: N == "<<n<<std::endl;
01911       BadArgument = true; }
01912     if( lda < NRowA ) {
01913       std::cout << "BLAS::TRSM Error: LDA < "<<NRowA<<std::endl;
01914       BadArgument = true; }
01915     if( ldb < m ) {
01916       std::cout << "BLAS::TRSM Error: LDB < MAX(1,M)"<<std::endl;
01917       BadArgument = true; }
01918 
01919     if(!BadArgument)
01920       {
01921         int i, j, k;
01922         // Set the solution to the zero std::vector.
01923         if(alpha == alpha_zero) {
01924             for(j = izero; j < n; j++) {
01925                 for( i = izero; i < m; i++) {
01926                     B[j*ldb+i] = B_zero;
01927                 }
01928             }
01929         }
01930         else
01931         { // Start the operations.
01932             if(ESideChar[side] == 'L') {
01933                 //
01934                 // Perform computations for OP(A)*X = alpha*B
01935                 //
01936                 if(ETranspChar[transa] == 'N') {
01937                     //
01938                     //  Compute B = alpha*inv( A )*B
01939                     //
01940                     if(EUploChar[uplo] == 'U') {
01941                         // A is upper triangular.
01942                         for(j = izero; j < n; j++) {
01943                             // Perform alpha*B if alpha is not 1.
01944                           if(alpha != alpha_one) {
01945                                 for( i = izero; i < m; i++) {
01946                                     B[j*ldb+i] *= alpha;
01947                                 }
01948                             }
01949                             // Perform a backsolve for column j of B.
01950                             for(k = (m - ione); k > -ione; k--) {
01951                                 // If this entry is zero, we don't have to do anything.
01952                                 if (B[j*ldb + k] != B_zero) {
01953                                     if ( noUnit ) {
01954                                         B[j*ldb + k] /= A[k*lda + k];
01955                                     }
01956                                     for(i = izero; i < k; i++) {
01957                                         B[j*ldb + i] -= B[j*ldb + k] * A[k*lda + i];
01958                                     }
01959                                 }
01960                             }
01961                         }
01962                     }
01963                     else
01964                     { // A is lower triangular.
01965                         for(j = izero; j < n; j++) {
01966                             // Perform alpha*B if alpha is not 1.
01967                             if(alpha != alpha_one) {
01968                                 for( i = izero; i < m; i++) {
01969                                     B[j*ldb+i] *= alpha;
01970                                 }
01971                             }
01972                             // Perform a forward solve for column j of B.
01973                             for(k = izero; k < m; k++) {
01974                                 // If this entry is zero, we don't have to do anything.
01975                                 if (B[j*ldb + k] != B_zero) {
01976                                     if ( noUnit ) {
01977                                         B[j*ldb + k] /= A[k*lda + k];
01978                                     }
01979                                     for(i = k+ione; i < m; i++) {
01980                                         B[j*ldb + i] -= B[j*ldb + k] * A[k*lda + i];
01981                                     }
01982                                 }
01983                             }
01984                         }
01985                     } // end if (uplo == 'U')
01986                 }  // if (transa =='N')
01987                 else {
01988                     //
01989                     //  Compute B = alpha*inv( A' )*B
01990                     //  or      B = alpha*inv( conj(A') )*B
01991                     //
01992                     if(EUploChar[uplo] == 'U') {
01993                         // A is upper triangular.
01994                         for(j = izero; j < n; j++) {
01995                             for( i = izero; i < m; i++) {
01996                                 temp = alpha*B[j*ldb+i];
01997                                 if ( noConj ) {
01998                                   for(k = izero; k < i; k++) {
01999                                       temp -= A[i*lda + k] * B[j*ldb + k];
02000                                   }
02001                                   if ( noUnit ) {
02002                                       temp /= A[i*lda + i];
02003                                   }
02004                                 } else {
02005                                   for(k = izero; k < i; k++) {
02006                                       temp -= ScalarTraits<A_type>::conjugate(A[i*lda + k])
02007                                             * B[j*ldb + k];
02008                                   }
02009                                   if ( noUnit ) {
02010                                       temp /= ScalarTraits<A_type>::conjugate(A[i*lda + i]);
02011                                   }
02012                                 }
02013                                 B[j*ldb + i] = temp;
02014                             }
02015                         }
02016                     }
02017                     else
02018                     { // A is lower triangular.
02019                         for(j = izero; j < n; j++) {
02020                             for(i = (m - ione) ; i > -ione; i--) {
02021                                 temp = alpha*B[j*ldb+i];
02022                                 if ( noConj ) {
02023                                   for(k = i+ione; k < m; k++) {
02024                                     temp -= A[i*lda + k] * B[j*ldb + k];
02025                                   }
02026                                   if ( noUnit ) {
02027                                     temp /= A[i*lda + i];
02028                                   }
02029                                 } else {
02030                                   for(k = i+ione; k < m; k++) {
02031                                     temp -= ScalarTraits<A_type>::conjugate(A[i*lda + k])
02032                                           * B[j*ldb + k];
02033                                   }
02034                                   if ( noUnit ) {
02035                                     temp /= ScalarTraits<A_type>::conjugate(A[i*lda + i]);
02036                                   }
02037                                 }
02038                                 B[j*ldb + i] = temp;
02039                             }
02040                         }
02041                     }
02042                 }
02043             }  // if (side == 'L')
02044             else {
02045                // side == 'R'
02046                //
02047                // Perform computations for X*OP(A) = alpha*B
02048                //
02049               if (ETranspChar[transa] == 'N') {
02050                     //
02051                     //  Compute B = alpha*B*inv( A )
02052                     //
02053                     if(EUploChar[uplo] == 'U') {
02054                         // A is upper triangular.
02055                         // Perform a backsolve for column j of B.
02056                         for(j = izero; j < n; j++) {
02057                             // Perform alpha*B if alpha is not 1.
02058                             if(alpha != alpha_one) {
02059                                 for( i = izero; i < m; i++) {
02060                                     B[j*ldb+i] *= alpha;
02061                                 }
02062                             }
02063                             for(k = izero; k < j; k++) {
02064                                 // If this entry is zero, we don't have to do anything.
02065                                 if (A[j*lda + k] != A_zero) {
02066                                     for(i = izero; i < m; i++) {
02067                                         B[j*ldb + i] -= A[j*lda + k] * B[k*ldb + i];
02068                                     }
02069                                 }
02070                             }
02071                             if ( noUnit ) {
02072                                 temp = B_one/A[j*lda + j];
02073                                 for(i = izero; i < m; i++) {
02074                                     B[j*ldb + i] *= temp;
02075                                 }
02076                             }
02077                         }
02078                     }
02079                     else
02080                     { // A is lower triangular.
02081                         for(j = (n - ione); j > -ione; j--) {
02082                             // Perform alpha*B if alpha is not 1.
02083                             if(alpha != alpha_one) {
02084                                 for( i = izero; i < m; i++) {
02085                                     B[j*ldb+i] *= alpha;
02086                                 }
02087                             }
02088                             // Perform a forward solve for column j of B.
02089                             for(k = j+ione; k < n; k++) {
02090                                 // If this entry is zero, we don't have to do anything.
02091                                 if (A[j*lda + k] != A_zero) {
02092                                     for(i = izero; i < m; i++) {
02093                                         B[j*ldb + i] -= A[j*lda + k] * B[k*ldb + i];
02094                                     }
02095                                 }
02096                             }
02097                             if ( noUnit ) {
02098                                 temp = B_one/A[j*lda + j];
02099                                 for(i = izero; i < m; i++) {
02100                                     B[j*ldb + i] *= temp;
02101                                 }
02102                             }
02103                         }
02104                     } // end if (uplo == 'U')
02105                 }  // if (transa =='N')
02106                 else {
02107                     //
02108                     //  Compute B = alpha*B*inv( A' )
02109                     //  or      B = alpha*B*inv( conj(A') )
02110                     //
02111                     if(EUploChar[uplo] == 'U') {
02112                         // A is upper triangular.
02113                         for(k = (n - ione); k > -ione; k--) {
02114                             if ( noUnit ) {
02115                                 if ( noConj )
02116                                   temp = B_one/A[k*lda + k];
02117                                 else
02118                                   temp = B_one/ScalarTraits<A_type>::conjugate(A[k*lda + k]);
02119                                 for(i = izero; i < m; i++) {
02120                                     B[k*ldb + i] *= temp;
02121                                 }
02122                             }
02123                             for(j = izero; j < k; j++) {
02124                                 if (A[k*lda + j] != A_zero) {
02125                                     if ( noConj )
02126                                       temp = A[k*lda + j];
02127                                     else
02128                                       temp = ScalarTraits<A_type>::conjugate(A[k*lda + j]);
02129                                     for(i = izero; i < m; i++) {
02130                                         B[j*ldb + i] -= temp*B[k*ldb + i];
02131                                     }
02132                                 }
02133                             }
02134                             if (alpha != alpha_one) {
02135                                 for (i = izero; i < m; i++) {
02136                                     B[k*ldb + i] *= alpha;
02137                                 }
02138                             }
02139                         }
02140                     }
02141                     else
02142                     { // A is lower triangular.
02143                         for(k = izero; k < n; k++) {
02144                             if ( noUnit ) {
02145                                 if ( noConj )
02146                                   temp = B_one/A[k*lda + k];
02147                                 else
02148                                   temp = B_one/ScalarTraits<A_type>::conjugate(A[k*lda + k]);
02149                                 for (i = izero; i < m; i++) {
02150                                     B[k*ldb + i] *= temp;
02151                                 }
02152                             }
02153                             for(j = k+ione; j < n; j++) {
02154                                 if(A[k*lda + j] != A_zero) {
02155                                     if ( noConj )
02156                                       temp = A[k*lda + j];
02157                                     else
02158                                       temp = ScalarTraits<A_type>::conjugate(A[k*lda + j]);
02159                                     for(i = izero; i < m; i++) {
02160                                         B[j*ldb + i] -= temp*B[k*ldb + i];
02161                                     }
02162                                 }
02163                             }
02164                             if (alpha != alpha_one) {
02165                                 for (i = izero; i < m; i++) {
02166                                     B[k*ldb + i] *= alpha;
02167                                 }
02168                             }
02169                         }
02170                     }
02171                 }
02172             }
02173         }
02174     }
02175   }
02176 
02177   // Explicit instantiation for template<int,float>
02178 
02179   template <>
02180   class TEUCHOS_LIB_DLL_EXPORT BLAS<int, float>
02181   {
02182   public:
02183     inline BLAS(void) {}
02184     inline BLAS(const BLAS<int, float>& /*BLAS_source*/) {}
02185     inline virtual ~BLAS(void) {}
02186     void ROTG(float* da, float* db, float* c, float* s) const;
02187     void ROT(const int n, float* dx, const int incx, float* dy, const int incy, float* c, float* s) const;
02188     float ASUM(const int n, const float* x, const int incx) const;
02189     void AXPY(const int n, const float alpha, const float* x, const int incx, float* y, const int incy) const;
02190     void COPY(const int n, const float* x, const int incx, float* y, const int incy) const;
02191     float DOT(const int n, const float* x, const int incx, const float* y, const int incy) const;
02192     float NRM2(const int n, const float* x, const int incx) const;
02193     void SCAL(const int n, const float alpha, float* x, const int incx) const;
02194     int IAMAX(const int n, const float* x, const int incx) const;
02195     void GEMV(ETransp trans, const int m, const int n, const float alpha, const float* A, const int lda, const float* x, const int incx, const float beta, float* y, const int incy) const;
02196     void TRMV(EUplo uplo, ETransp trans, EDiag diag, const int n, const float* A, const int lda, float* x, const int incx) const;
02197     void GER(const int m, const int n, const float alpha, const float* x, const int incx, const float* y, const int incy, float* A, const int lda) const;
02198     void GEMM(ETransp transa, ETransp transb, const int m, const int n, const int k, const float alpha, const float* A, const int lda, const float* B, const int ldb, const float beta, float* C, const int ldc) const;
02199     void SYMM(ESide side, EUplo uplo, const int m, const int n, const float alpha, const float* A, const int lda, const float *B, const int ldb, const float beta, float *C, const int ldc) const;
02200     void SYRK(EUplo uplo, ETransp trans, const int n, const int k, const float alpha, const float* A, const int lda, const float beta, float* C, const int ldc) const;
02201     void TRMM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int m, const int n, const float alpha, const float* A, const int lda, float* B, const int ldb) const;
02202     void TRSM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int m, const int n, const float alpha, const float* A, const int lda, float* B, const int ldb) const;
02203   };
02204 
02205   // Explicit instantiation for template<int,double>
02206 
02207   template<>
02208   class TEUCHOS_LIB_DLL_EXPORT BLAS<int, double>
02209   {
02210   public:
02211     inline BLAS(void) {}
02212     inline BLAS(const BLAS<int, double>& /*BLAS_source*/) {}
02213     inline virtual ~BLAS(void) {}
02214     void ROTG(double* da, double* db, double* c, double* s) const;
02215     void ROT(const int n, double* dx, const int incx, double* dy, const int incy, double* c, double* s) const;
02216     double ASUM(const int n, const double* x, const int incx) const;
02217     void AXPY(const int n, const double alpha, const double* x, const int incx, double* y, const int incy) const;
02218     void COPY(const int n, const double* x, const int incx, double* y, const int incy) const;
02219     double DOT(const int n, const double* x, const int incx, const double* y, const int incy) const;
02220     double NRM2(const int n, const double* x, const int incx) const;
02221     void SCAL(const int n, const double alpha, double* x, const int incx) const;
02222     int IAMAX(const int n, const double* x, const int incx) const;
02223     void GEMV(ETransp trans, const int m, const int n, const double alpha, const double* A, const int lda, const double* x, const int incx, const double beta, double* y, const int incy) const;
02224     void TRMV(EUplo uplo, ETransp trans, EDiag diag, const int n, const double* A, const int lda, double* x, const int incx) const;
02225     void GER(const int m, const int n, const double alpha, const double* x, const int incx, const double* y, const int incy, double* A, const int lda) const;
02226     void GEMM(ETransp transa, ETransp transb, const int m, const int n, const int k, const double alpha, const double* A, const int lda, const double* B, const int ldb, const double beta, double* C, const int ldc) const;
02227     void SYMM(ESide side, EUplo uplo, const int m, const int n, const double alpha, const double* A, const int lda, const double *B, const int ldb, const double beta, double *C, const int ldc) const;
02228     void SYRK(EUplo uplo, ETransp trans, const int n, const int k, const double alpha, const double* A, const int lda, const double beta, double* C, const int ldc) const;
02229     void TRMM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int m, const int n, const double alpha, const double* A, const int lda, double* B, const int ldb) const;
02230     void TRSM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int m, const int n, const double alpha, const double* A, const int lda, double* B, const int ldb) const;
02231   };
02232 
02233   // Explicit instantiation for template<int,complex<float> >
02234 
02235   template<>
02236   class TEUCHOS_LIB_DLL_EXPORT BLAS<int, std::complex<float> >
02237   {
02238   public:
02239     inline BLAS(void) {}
02240     inline BLAS(const BLAS<int, std::complex<float> >& /*BLAS_source*/) {}
02241     inline virtual ~BLAS(void) {}
02242     void ROTG(std::complex<float>* da, std::complex<float>* db, float* c, std::complex<float>* s) const;
02243     void ROT(const int n, std::complex<float>* dx, const int incx, std::complex<float>* dy, const int incy, float* c, std::complex<float>* s) const;
02244     float ASUM(const int n, const std::complex<float>* x, const int incx) const;
02245     void AXPY(const int n, const std::complex<float> alpha, const std::complex<float>* x, const int incx, std::complex<float>* y, const int incy) const;
02246     void COPY(const int n, const std::complex<float>* x, const int incx, std::complex<float>* y, const int incy) const;
02247     std::complex<float> DOT(const int n, const std::complex<float>* x, const int incx, const std::complex<float>* y, const int incy) const;
02248     float NRM2(const int n, const std::complex<float>* x, const int incx) const;
02249     void SCAL(const int n, const std::complex<float> alpha, std::complex<float>* x, const int incx) const;
02250     int IAMAX(const int n, const std::complex<float>* x, const int incx) const;
02251     void GEMV(ETransp trans, const int m, const int n, const std::complex<float> alpha, const std::complex<float>* A, const int lda, const std::complex<float>* x, const int incx, const std::complex<float> beta, std::complex<float>* y, const int incy) const;
02252     void TRMV(EUplo uplo, ETransp trans, EDiag diag, const int n, const std::complex<float>* A, const int lda, std::complex<float>* x, const int incx) const;
02253     void GER(const int m, const int n, const std::complex<float> alpha, const std::complex<float>* x, const int incx, const std::complex<float>* y, const int incy, std::complex<float>* A, const int lda) const;
02254     void GEMM(ETransp transa, ETransp transb, const int m, const int n, const int k, const std::complex<float> alpha, const std::complex<float>* A, const int lda, const std::complex<float>* B, const int ldb, const std::complex<float> beta, std::complex<float>* C, const int ldc) const;
02255     void SYMM(ESide side, EUplo uplo, const int m, const int n, const std::complex<float> alpha, const std::complex<float>* A, const int lda, const std::complex<float> *B, const int ldb, const std::complex<float> beta, std::complex<float> *C, const int ldc) const;
02256     void SYRK(EUplo uplo, ETransp trans, const int n, const int k, const std::complex<float> alpha, const std::complex<float>* A, const int lda, const std::complex<float> beta, std::complex<float>* C, const int ldc) const;
02257     void TRMM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int m, const int n, const std::complex<float> alpha, const std::complex<float>* A, const int lda, std::complex<float>* B, const int ldb) const;
02258     void TRSM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int m, const int n, const std::complex<float> alpha, const std::complex<float>* A, const int lda, std::complex<float>* B, const int ldb) const;
02259   };
02260 
02261   // Explicit instantiation for template<int,complex<double> >
02262 
02263   template<>
02264   class TEUCHOS_LIB_DLL_EXPORT BLAS<int, std::complex<double> >
02265   {
02266   public:
02267     inline BLAS(void) {}
02268     inline BLAS(const BLAS<int, std::complex<double> >& /*BLAS_source*/) {}
02269     inline virtual ~BLAS(void) {}
02270     void ROTG(std::complex<double>* da, std::complex<double>* db, double* c, std::complex<double>* s) const;
02271     void ROT(const int n, std::complex<double>* dx, const int incx, std::complex<double>* dy, const int incy, double* c, std::complex<double>* s) const;
02272     double ASUM(const int n, const std::complex<double>* x, const int incx) const;
02273     void AXPY(const int n, const std::complex<double> alpha, const std::complex<double>* x, const int incx, std::complex<double>* y, const int incy) const;
02274     void COPY(const int n, const std::complex<double>* x, const int incx, std::complex<double>* y, const int incy) const;
02275     std::complex<double> DOT(const int n, const std::complex<double>* x, const int incx, const std::complex<double>* y, const int incy) const;
02276     double NRM2(const int n, const std::complex<double>* x, const int incx) const;
02277     void SCAL(const int n, const std::complex<double> alpha, std::complex<double>* x, const int incx) const;
02278     int IAMAX(const int n, const std::complex<double>* x, const int incx) const;
02279     void GEMV(ETransp trans, const int m, const int n, const std::complex<double> alpha, const std::complex<double>* A, const int lda, const std::complex<double>* x, const int incx, const std::complex<double> beta, std::complex<double>* y, const int incy) const;
02280     void TRMV(EUplo uplo, ETransp trans, EDiag diag, const int n, const std::complex<double>* A, const int lda, std::complex<double>* x, const int incx) const;
02281     void GER(const int m, const int n, const std::complex<double> alpha, const std::complex<double>* x, const int incx, const std::complex<double>* y, const int incy, std::complex<double>* A, const int lda) const;
02282     void GEMM(ETransp transa, ETransp transb, const int m, const int n, const int k, const std::complex<double> alpha, const std::complex<double>* A, const int lda, const std::complex<double>* B, const int ldb, const std::complex<double> beta, std::complex<double>* C, const int ldc) const;
02283     void SYMM(ESide side, EUplo uplo, const int m, const int n, const std::complex<double> alpha, const std::complex<double>* A, const int lda, const std::complex<double> *B, const int ldb, const std::complex<double> beta, std::complex<double> *C, const int ldc) const;
02284     void SYRK(EUplo uplo, ETransp trans, const int n, const int k, const std::complex<double> alpha, const std::complex<double>* A, const int lda, const std::complex<double> beta, std::complex<double>* C, const int ldc) const;
02285     void TRMM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int m, const int n, const std::complex<double> alpha, const std::complex<double>* A, const int lda, std::complex<double>* B, const int ldb) const;
02286     void TRSM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int m, const int n, const std::complex<double> alpha, const std::complex<double>* A, const int lda, std::complex<double>* B, const int ldb) const;
02287   };
02288 
02289 } // namespace Teuchos
02290 
02291 #endif // _TEUCHOS_BLAS_HPP_
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines