EpetraExt Development
EpetraExt_MatrixMatrix_mult_A_B.cpp
Go to the documentation of this file.
00001 //@HEADER
00002 // ***********************************************************************
00003 //
00004 //     EpetraExt: Epetra Extended - Linear Algebra Services Package
00005 //                 Copyright (2011) Sandia Corporation
00006 //
00007 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
00008 // the U.S. Government retains certain rights in this software.
00009 //
00010 // Redistribution and use in source and binary forms, with or without
00011 // modification, are permitted provided that the following conditions are
00012 // met:
00013 //
00014 // 1. Redistributions of source code must retain the above copyright
00015 // notice, this list of conditions and the following disclaimer.
00016 //
00017 // 2. Redistributions in binary form must reproduce the above copyright
00018 // notice, this list of conditions and the following disclaimer in the
00019 // documentation and/or other materials provided with the distribution.
00020 //
00021 // 3. Neither the name of the Corporation nor the names of the
00022 // contributors may be used to endorse or promote products derived from
00023 // this software without specific prior written permission.
00024 //
00025 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
00026 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00027 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00028 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
00029 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00030 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00031 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00032 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00033 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00034 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00035 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00036 //
00037 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
00038 //
00039 // ***********************************************************************
00040 //@HEADER
00041 
00042 #include <EpetraExt_ConfigDefs.h>
00043 #include <EpetraExt_MatrixMatrix.h>
00044 
00045 #include <EpetraExt_MMHelpers.h>
00046 
00047 #include <EpetraExt_Transpose_RowMatrix.h>
00048 
00049 #include <Epetra_Export.h>
00050 #include <Epetra_Import.h>
00051 #include <Epetra_Util.h>
00052 #include <Epetra_Map.h>
00053 #include <Epetra_Comm.h>
00054 #include <Epetra_CrsMatrix.h>
00055 #include <Epetra_Vector.h>
00056 #include <Epetra_Directory.h>
00057 #include <Epetra_HashTable.h>
00058 #include <Epetra_Distributor.h>
00059 #include <Epetra_IntSerialDenseVector.h>
00060 
00061 #ifdef HAVE_VECTOR
00062 #include <vector>
00063 #endif
00064 
00065 #ifdef HAVE_MPI
00066 #include <Epetra_MpiDistributor.h>
00067 #endif
00068 
00069 
00070 #include <Teuchos_TimeMonitor.hpp>
00071 
00072 namespace EpetraExt {
00073 
00074 /*****************************************************************************/
00075 /*****************************************************************************/
00076 /*****************************************************************************/
00077 static inline int C_estimate_nnz(const Epetra_CrsMatrix & A, const Epetra_CrsMatrix &B){
00078   // Follows the NZ estimate in ML's ml_matmatmult.c
00079   int Aest=(A.NumMyRows()>0)? A.NumMyNonzeros()/A.NumMyRows():100;
00080   int Best=(B.NumMyRows()>0)? B.NumMyNonzeros()/B.NumMyRows():100;
00081 
00082   int nnzperrow=(int)(sqrt((double)Aest) + sqrt((double)Best) - 1);
00083   nnzperrow*=nnzperrow;
00084  
00085   return (int)(A.NumMyRows()*nnzperrow*0.75 + 100);
00086 }
00087 
00088 /*****************************************************************************/
00089 /*****************************************************************************/
00090 /*****************************************************************************/
00091 static inline int auto_resize(std::vector<int> &x,int num_new){
00092   int newsize=x.size() + EPETRA_MAX((int)x.size(),num_new);
00093   x.resize(newsize);
00094   return newsize;
00095 }
00096 
00097 
00098 /*****************************************************************************/
00099 /*****************************************************************************/
00100 /*****************************************************************************/
00101 template<typename int_type>
00102 int aztecoo_and_ml_compatible_map_union(const Epetra_CrsMatrix &B, const LightweightCrsMatrix &Bimport, Epetra_Map*& unionmap, std::vector<int>& Cremotepids,
00103           std::vector<int> &Bcols2Ccols, std::vector<int> &Icols2Ccols)
00104 {
00105 #ifdef HAVE_MPI
00106  
00107 #ifdef ENABLE_MMM_TIMINGS
00108   using Teuchos::TimeMonitor;
00109   Teuchos::RCP<Teuchos::TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("EpetraExt: MMM M5 CMap 1")));
00110 #endif
00111 
00112  // So we need to merge the ColMap of B and the TargetMap of Bimport in an AztecOO/Ifpack/ML compliant order.
00113   Epetra_Util util;
00114   int i,j,MyPID = B.Comm().MyPID(), NumProc = B.Comm().NumProc();
00115   int Bstart=0, Istart=0, Cstart=0,Pstart=0;
00116 
00117   const Epetra_Map & BColMap       = B.ColMap();
00118   const Epetra_Map & DomainMap     = B.DomainMap();
00119   const LightweightMap & IColMap   = Bimport.ColMap_;
00120 
00121   int Nb         = BColMap.NumMyElements();
00122   int_type * Bgids = 0;
00123   BColMap.MyGlobalElementsPtr(Bgids);
00124   int Ni         = IColMap.NumMyElements();
00125   int_type * Igids    = 0;
00126   if(Ni>0)
00127     IColMap.MyGlobalElementsPtr(Igids);
00128 
00129   if((int)Bcols2Ccols.size() != Nb) Bcols2Ccols.resize(Nb);
00130   if((int)Icols2Ccols.size() != Ni) Icols2Ccols.resize(Ni);
00131 
00132   // Since we're getting called, we know we have to be using an MPI implementation of Epetra.  
00133   // Which means we should have an MpiDistributor for both B and Bimport.  
00134   // Unless all of B's columns are owned by the calling proc (e.g. MueLu for A*Ptent w/ uncoupled aggregation)
00135   Epetra_MpiDistributor *Distor=0;
00136   if(B.Importer()) { 
00137     Distor=dynamic_cast<Epetra_MpiDistributor*>(&B.Importer()->Distributor());
00138     if(!Distor) EPETRA_CHK_ERR(-2);
00139   }
00140 
00141   // **********************
00142   // Stage 1: Get the owning PIDs
00143   // **********************
00144   // Note: if B doesn't have an importer, the calling proc owns all its colids...
00145   std::vector<int> Bpids(Nb), Ipids(Ni);
00146   if(B.Importer()) {EPETRA_CHK_ERR(util.GetPids(*B.Importer(),Bpids,true));}
00147   else Bpids.assign(Nb,-1);
00148 
00149   if(Ni != (int) Bimport.ColMapOwningPIDs_.size()) {
00150     EPETRA_CHK_ERR(-21);
00151   }
00152   for(i=0;i<Ni;i++){
00153     Ipids[i] = (Bimport.ColMapOwningPIDs_[i]==MyPID)?(-1):(Bimport.ColMapOwningPIDs_[i]);    
00154   }
00155 
00156   // **********************
00157   // Stage 2: Allocate memory (make things too big)
00158   // **********************
00159   int Csize=Nb+Ni;
00160   int Psize=Nb+Ni;
00161   std::vector<int_type> Cgids(Csize);
00162   Cremotepids.resize(Psize);
00163 
00164 #ifdef ENABLE_MMM_TIMINGS
00165   MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("EpetraExt: MMM M5 CMap 2")));
00166 #endif
00167 
00168   // **********************
00169   // Stage 3: Local Unknowns
00170   // **********************
00171   if(!B.Importer() || (B.Importer()->NumSameIDs() == DomainMap.NumMyElements())) {
00172     // B's colmap has all of the domain elements.  We can just copy those into the start of the array.
00173     DomainMap.MyGlobalElements(&Cgids[0]);
00174     Cstart=DomainMap.NumMyElements();
00175     Bstart=DomainMap.NumMyElements();
00176 
00177     for(i=0; i<DomainMap.NumMyElements(); i++) Bcols2Ccols[i] = i;
00178   }
00179   else {
00180     // There are more entries in the DomainMap than B's ColMap.  So we stream through both B and Bimport for the copy.
00181     int NumDomainElements     = DomainMap.NumMyElements();
00182     for(i = 0; i < NumDomainElements; i++) {      
00183       int_type GID = (int_type) DomainMap.GID64(i);
00184       int LID = BColMap.LID(GID);
00185       // B has this guy
00186       if(LID!=-1) {
00187   Bcols2Ccols[LID]=Cstart;
00188   Cgids[Cstart] = GID;
00189   Cstart++;
00190   Bstart++;
00191       }
00192       else {
00193   // B import has this guy
00194   LID = IColMap.LID(GID);
00195   if(LID!=-1) {
00196     Icols2Ccols[LID]=Cstart;
00197     Cgids[Cstart] = GID;
00198     Cstart++;
00199   } 
00200       }
00201     }
00202   }
00203 
00204   // Now advance Ilast to the last owned unknown in Bimport
00205   for(i=0,j=0; i<Ni && Ipids[i]==-1; i++) {
00206     while(Cgids[j]!=Igids[i]) j++;
00207     Icols2Ccols[i]=j;
00208   }
00209   Istart=i;
00210 
00211 
00212 #ifdef ENABLE_MMM_TIMINGS
00213   MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("EpetraExt: MMM M5 CMap 3")));
00214 #endif
00215 
00216 
00217   // **********************
00218   // Stage 4: Processor-by-processor set_union
00219   // **********************
00220   // NOTE: Intial sizes for Btemp/Itemp from B's distributor.  This should be exact for Btemp and a decent guess for Itemp
00221   int initial_temp_length = 0;
00222   const int * lengths_from=0;
00223   if(Distor) {
00224     lengths_from= Distor->LengthsFrom();
00225     for(i=0; i < Distor->NumReceives(); i++) initial_temp_length += lengths_from[i];
00226   }
00227   else initial_temp_length=100;
00228 
00229   std::vector<int_type> Btemp(initial_temp_length),  Itemp(initial_temp_length);
00230   std::vector<int> Btemp2(initial_temp_length), Itemp2(initial_temp_length);
00231   
00232 
00233   while (Bstart < Nb || Istart < Ni) {
00234     int Bproc=NumProc+1, Iproc=NumProc+1, Cproc;
00235 
00236     // Find the next active processor ID
00237     if(Bstart < Nb) Bproc=Bpids[Bstart];
00238     if(Istart < Ni) Iproc=Ipids[Istart];
00239 
00240     Cproc = (Bproc < Iproc)?Bproc:Iproc;
00241     
00242     if(Bproc == Cproc && Iproc != Cproc) {
00243       // Only B has this processor.  Copy the data.
00244       // B: Find the beginning of the next processor
00245       for(i=Bstart; i<Nb && Bpids[i]==Bproc; i++) {}
00246       int Bnext=i;
00247       
00248       // Copy data to C
00249       int tCsize = Bnext-Bstart;
00250       if(Btemp.size() < (size_t)tCsize) {Btemp2.resize(tCsize);}
00251 
00252       for(i=Bstart; i<Bnext; i++) {
00253   Cremotepids[i-Bstart+Pstart] = Cproc;
00254   Cgids[i-Bstart+Cstart]       = Bgids[i]; 
00255   Btemp2[i-Bstart]             = i;
00256       }
00257 
00258       // Sort & record reindexing
00259       int *Bptr2 = &Btemp2[0]; 
00260       util.Sort(true, tCsize, &Cgids[Cstart], 0, 0, 1, &Bptr2, 0, 0);
00261 
00262       for(i=0, j=Cstart; i<tCsize; i++){
00263   while(Cgids[j] != Bgids[Btemp2[i]]) j++;
00264   Bcols2Ccols[Btemp2[i]] =  j;  
00265       }
00266       Cstart+=tCsize;
00267       Pstart+=tCsize;
00268       Bstart=Bnext;
00269     }
00270     else if(Bproc != Cproc && Iproc == Cproc) {
00271       // Only I has this processor.  Copy the data.
00272       // I: Find the beginning of the next processor
00273       for(i=Istart; i<Ni && Ipids[i]==Iproc; i++) {}
00274       int Inext=i;
00275 
00276       // Copy data to C
00277       int tCsize = Inext-Istart;
00278       if(Itemp.size() < (size_t)tCsize) {Itemp2.resize(tCsize);}
00279 
00280       for(i=Istart; i<Inext; i++) {
00281   Cremotepids[i-Istart+Pstart] = Cproc;
00282   Cgids[i-Istart+Cstart]       = Igids[i]; 
00283   Itemp2[i-Istart]             = i;
00284       }
00285 
00286       // Sort & record reindexing
00287       int *Iptr2 = &Itemp2[0]; 
00288       util.Sort(true, tCsize, &Cgids[Cstart], 0, 0, 1, &Iptr2, 0, 0);
00289 
00290       for(i=0, j=Cstart; i<tCsize; i++){
00291   while(Cgids[j] != Igids[Itemp2[i]]) j++;
00292   Icols2Ccols[Itemp2[i]] =  j;  
00293       }
00294       Cstart+=tCsize;
00295       Pstart+=tCsize;
00296       Istart=Inext;
00297     }
00298     else {
00299       // Both B and I have this processor, so we need to do a set_union.  So we need to sort.
00300       int Bnext, Inext;
00301       // B: Find the beginning of the next processor
00302       for(i=Bstart; i<Nb && Bpids[i]==Bproc; i++) {}
00303       Bnext=i;
00304 
00305       // I: Find the beginning of the next processor
00306       for(i=Istart; i<Ni && Ipids[i]==Iproc; i++) {}
00307       Inext=i;
00308 
00309       // Copy data to temp
00310       int tBsize = Bnext-Bstart;
00311       int tIsize = Inext-Istart;
00312       //      int tCsize = tBsize+tIsize;
00313       if(Btemp.size() < (size_t)tBsize) {Btemp.resize(tBsize); Btemp2.resize(tBsize);}
00314       if(Itemp.size() < (size_t)tIsize) {Itemp.resize(tIsize); Itemp2.resize(tIsize);}
00315 
00316       for(i=Bstart; i<Bnext; i++) {Btemp[i-Bstart]=Bgids[i]; Btemp2[i-Bstart]=i;}
00317       for(i=Istart; i<Inext; i++) {Itemp[i-Istart]=Igids[i]; Itemp2[i-Istart]=i;}
00318 
00319       // Sort & set_union
00320       int *Bptr2 = &Btemp2[0]; int *Iptr2 = &Itemp2[0];
00321       util.Sort(true, tBsize, &Btemp[0], 0, 0, 1, &Bptr2, 0, 0);
00322       util.Sort(true, tIsize, &Itemp[0], 0, 0, 1, &Iptr2, 0, 0);
00323       typename std::vector<int_type>::iterator mycstart = Cgids.begin()+Cstart;
00324       typename std::vector<int_type>::iterator last_el=std::set_union(Btemp.begin(),Btemp.begin()+tBsize,Itemp.begin(),Itemp.begin()+tIsize,mycstart);
00325 
00326       for(i=0, j=Cstart; i<tBsize; i++){
00327   while(Cgids[j] != Bgids[Btemp2[i]]) j++;
00328   Bcols2Ccols[Btemp2[i]] =  j;  
00329       }
00330  
00331       for(i=0, j=Cstart; i<tIsize; i++){
00332   while(Cgids[j] != Igids[Itemp2[i]]) j++;
00333   Icols2Ccols[Itemp2[i]] =  j;  
00334       }
00335      
00336       for(i=Pstart; i<(last_el - mycstart) + Pstart; i++) Cremotepids[i]=Cproc;
00337       Cstart = (last_el - mycstart) + Cstart;
00338       Pstart = (last_el - mycstart) + Pstart;
00339       Bstart=Bnext;
00340       Istart=Inext;
00341     }
00342   } // end while
00343 
00344 #ifdef ENABLE_MMM_TIMINGS
00345   MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("EpetraExt: MMM M5 CMap 4")));
00346 #endif
00347 
00348   // Resize the RemotePIDs down
00349   Cremotepids.resize(Pstart);
00350 
00351   // **********************
00352   // Stage 5: Call constructor
00353   // **********************
00354   // Make the map
00355   unionmap=new Epetra_Map((int_type) -1,Cstart,&Cgids[0], (int_type) B.ColMap().IndexBase64(),
00356     B.Comm(),B.ColMap().DistributedGlobal(),(int_type) B.ColMap().MinAllGID64(),(int_type) B.ColMap().MaxAllGID64());
00357   return 0;
00358 #else
00359   return -1;
00360 #endif
00361   }
00362 
00363 /*****************************************************************************/
00364 /*****************************************************************************/
00365 /*****************************************************************************/
00366 // Provide a "resize" operation for double*'s. 
00367 inline void resize_doubles(int nold,int nnew,double*& d){
00368   if(nnew > nold){
00369     double *tmp = new double[nnew];
00370     for(int i=0; i<nold; i++)
00371       tmp[i]=d[i];
00372     delete [] d;
00373     d=tmp;
00374   }
00375 }
00376 
00377 
00378 /*****************************************************************************/
00379 /*****************************************************************************/
00380 /*****************************************************************************/
00381 template<typename int_type>
00382 int  mult_A_B_newmatrix(const Epetra_CrsMatrix & A,
00383       const Epetra_CrsMatrix & B,
00384       CrsMatrixStruct& Bview,
00385       std::vector<int> & Bcol2Ccol,
00386       std::vector<int> & Bimportcol2Ccol,
00387       std::vector<int>& Cremotepids,
00388       Epetra_CrsMatrix& C
00389 ){
00390 #ifdef ENABLE_MMM_TIMINGS
00391   using Teuchos::TimeMonitor;
00392   Teuchos::RCP<Teuchos::TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("EpetraExt: MMM Newmatrix SerialCore")));
00393 #endif
00394 
00395   // *****************************
00396   // Improved Parallel Gustavson in Local IDs
00397   // *****************************
00398   const Epetra_Map * colmap_C = &(C.ColMap());
00399 
00400 
00401   int NumMyDiagonals=0; // Counter to speed up ESFC
00402 
00403   int m=A.NumMyRows();
00404   int n=colmap_C->NumMyElements();
00405   int i,j,k;
00406 
00407   // DataPointers for A
00408   int *Arowptr, *Acolind;
00409   double *Avals;
00410   EPETRA_CHK_ERR(A.ExtractCrsDataPointers(Arowptr,Acolind,Avals));
00411 
00412   // DataPointers for B, Bimport
00413   int *Browptr, *Bcolind;
00414   double *Bvals;
00415   EPETRA_CHK_ERR(B.ExtractCrsDataPointers(Browptr,Bcolind,Bvals));
00416 
00417   int *Irowptr=0, *Icolind=0;
00418   double *Ivals=0;
00419   if(Bview.importMatrix){
00420     Irowptr = &Bview.importMatrix->rowptr_[0];
00421     Icolind = (Bview.importMatrix->colind_.size()>0)?(&Bview.importMatrix->colind_[0]):0;
00422     Ivals   = (Bview.importMatrix->vals_.size()>0)?(&Bview.importMatrix->vals_[0]):0;
00423   }
00424 
00425   // MemorySetup: If somebody else is sharing this C's graphdata, make a new one.
00426   // This is needed because I'm about to walk all over the CrsGrapData...
00427   C.ExpertMakeUniqueCrsGraphData();
00428 
00429   // The status array will contain the index into colind where this entry was last deposited.
00430   // c_status[i] < CSR_ip - not in the row yet.
00431   // c_status[i] >= CSR_ip, this is the entry where you can find the data
00432   // We start with this filled with -1's indicating that there are no entries yet.
00433   std::vector<int> c_status(n, -1);
00434 
00435   // Classic csr assembly (low memory edition)
00436   int CSR_alloc=C_estimate_nnz(A,B);
00437   if(CSR_alloc < n) CSR_alloc = n;
00438   int CSR_ip=0,OLD_ip=0;
00439   Epetra_IntSerialDenseVector & CSR_rowptr = C.ExpertExtractIndexOffset();
00440   Epetra_IntSerialDenseVector & CSR_colind = C.ExpertExtractIndices();  
00441   double *&                     CSR_vals   = C.ExpertExtractValues();
00442 
00443   CSR_rowptr.Resize(m+1);
00444   CSR_colind.Resize(CSR_alloc);
00445   resize_doubles(0,CSR_alloc,CSR_vals);
00446 
00447   // Static Profile stuff
00448   std::vector<int> NumEntriesPerRow(m);
00449 
00450   // For each row of A/C
00451   for(i=0; i<m; i++){            
00452     bool found_diagonal=false;
00453     CSR_rowptr[i]=CSR_ip;
00454 
00455     for(k=Arowptr[i]; k<Arowptr[i+1]; k++){
00456       int Ak      = Acolind[k];
00457       double Aval = Avals[k];
00458       if(Aval==0) continue;
00459       
00460       if(Bview.targetMapToOrigRow[Ak] != -1){
00461   // Local matrix
00462   int Bk = Bview.targetMapToOrigRow[Ak];
00463   for(j=Browptr[Bk]; j<Browptr[Bk+1]; ++j) {
00464     int Cj=Bcol2Ccol[Bcolind[j]];
00465 
00466     if(Cj==i && !found_diagonal) {found_diagonal=true; NumMyDiagonals++;}
00467 
00468     if(c_status[Cj]<OLD_ip){
00469       // New entry
00470       c_status[Cj]=CSR_ip;
00471       CSR_colind[CSR_ip]=Cj;
00472       CSR_vals[CSR_ip]=Aval*Bvals[j];
00473       CSR_ip++;
00474     }
00475     else
00476       CSR_vals[c_status[Cj]]+=Aval*Bvals[j];
00477   }
00478       }
00479       else{
00480   // Remote matrix
00481   int Ik = Bview.targetMapToImportRow[Ak];
00482   for(j=Irowptr[Ik]; j<Irowptr[Ik+1]; ++j) {
00483     int Cj=Bimportcol2Ccol[Icolind[j]];
00484     
00485     if(Cj==i && !found_diagonal) {found_diagonal=true; NumMyDiagonals++;}
00486 
00487     if(c_status[Cj]<OLD_ip){
00488       // New entry
00489       c_status[Cj]=CSR_ip;
00490       CSR_colind[CSR_ip]=Cj;
00491       CSR_vals[CSR_ip]=Aval*Ivals[j];
00492       CSR_ip++;
00493     }
00494     else
00495       CSR_vals[c_status[Cj]]+=Aval*Ivals[j];
00496   }
00497       }
00498     }
00499     NumEntriesPerRow[i]=CSR_ip-CSR_rowptr[i];
00500 
00501     // Resize for next pass if needed
00502     if(CSR_ip + n > CSR_alloc){
00503       resize_doubles(CSR_alloc,2*CSR_alloc,CSR_vals);
00504       CSR_alloc*=2;
00505       CSR_colind.Resize(CSR_alloc);
00506     }
00507     OLD_ip=CSR_ip;
00508   }
00509 
00510   CSR_rowptr[m]=CSR_ip;
00511 
00512 #ifdef ENABLE_MMM_TIMINGS
00513   MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("EpetraExt: MMM Newmatrix Final Sort")));
00514 #endif
00515 
00516   // Sort the entries
00517   Epetra_Util::SortCrsEntries(m, &CSR_rowptr[0], &CSR_colind[0], &CSR_vals[0]);
00518 
00519 #ifdef ENABLE_MMM_TIMINGS
00520   MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("EpetraExt: MMM Newmatrix Fast IE")));
00521 #endif
00522 
00523   // Do a fast build of C's importer
00524   Epetra_Import * Cimport=0; 
00525   int *RemotePIDs = Cremotepids.size()?&Cremotepids[0]:0;
00526   int NumExports=0;
00527   int *ExportLIDs=0, *ExportPIDs=0;
00528   if(Bview.importMatrix) { 
00529     NumExports = Bview.importMatrix->ExportLIDs_.size();
00530     ExportLIDs = Bview.importMatrix->ExportLIDs_.size()?&Bview.importMatrix->ExportLIDs_[0]:0;
00531     ExportPIDs = Bview.importMatrix->ExportPIDs_.size()?&Bview.importMatrix->ExportPIDs_[0]:0;
00532   }
00533   else if(B.Importer()) {
00534     // Grab the exports from B proper
00535     NumExports = B.Importer()->NumExportIDs();
00536     ExportLIDs = B.Importer()->ExportLIDs();
00537     ExportPIDs = B.Importer()->ExportPIDs();    
00538   }
00539 
00540   if(!C.ColMap().SameAs(B.DomainMap()))
00541     Cimport = new Epetra_Import(C.ColMap(),B.DomainMap(),Cremotepids.size(),RemotePIDs,NumExports,ExportLIDs,ExportPIDs);
00542 
00543 #ifdef ENABLE_MMM_TIMINGS
00544   MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("EpetraExt: MMM Newmatrix ESFC")));
00545 #endif
00546 
00547   // Update the CrsGraphData
00548   C.ExpertStaticFillComplete(B.DomainMap(),A.RangeMap(),Cimport,0,NumMyDiagonals);
00549 
00550   return 0;
00551 }
00552 
00553 
00554 
00555 
00556 /*****************************************************************************/
00557 /*****************************************************************************/
00558 /*****************************************************************************/
00559 template<typename int_type>
00560 int mult_A_B_reuse(const Epetra_CrsMatrix & A,
00561        const Epetra_CrsMatrix & B,
00562        CrsMatrixStruct& Bview,
00563        std::vector<int> & Bcol2Ccol,
00564        std::vector<int> & Bimportcol2Ccol,
00565        Epetra_CrsMatrix& C){
00566 
00567 #ifdef ENABLE_MMM_TIMINGS
00568   using Teuchos::TimeMonitor;
00569   Teuchos::RCP<Teuchos::TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("EpetraExt: MMM Reuse SerialCore")));
00570 #endif
00571 
00572   // *****************************
00573   // Improved Parallel Gustavson in Local IDs
00574   // *****************************
00575   const Epetra_Map * colmap_C = &(C.ColMap());
00576 
00577   int m=A.NumMyRows();
00578   int n=colmap_C->NumMyElements();
00579   int i,j,k;
00580 
00581   // DataPointers for A
00582   int *Arowptr, *Acolind;
00583   double *Avals;
00584   EPETRA_CHK_ERR(A.ExtractCrsDataPointers(Arowptr,Acolind,Avals));
00585 
00586   // DataPointers for B, Bimport
00587   int *Browptr, *Bcolind;
00588   double *Bvals;
00589   EPETRA_CHK_ERR(B.ExtractCrsDataPointers(Browptr,Bcolind,Bvals));
00590 
00591   int *Irowptr=0, *Icolind=0;
00592   double *Ivals=0;
00593   if(Bview.importMatrix){
00594     Irowptr = &Bview.importMatrix->rowptr_[0];
00595     Icolind = &Bview.importMatrix->colind_[0];
00596     Ivals   = &Bview.importMatrix->vals_[0];
00597   }
00598 
00599   // DataPointers for C
00600   int *CSR_rowptr, *CSR_colind;
00601   double *CSR_vals;
00602   EPETRA_CHK_ERR(C.ExtractCrsDataPointers(CSR_rowptr,CSR_colind,CSR_vals));
00603 
00604 
00605   // The status array will contain the index into colind where this dude was last deposited.
00606   // c_status[i] < CSR_ip - not in the row yet.
00607   // c_status[i] >= CSR_ip, this is the entry where you can find the data
00608   // We start with this filled with -1's indicating that there are no entries yet.
00609   std::vector<int> c_status(n, -1);
00610 
00611   // Classic csr assembly
00612   int CSR_alloc=CSR_rowptr[m] - CSR_rowptr[0];
00613   int CSR_ip=0,OLD_ip=0;
00614 
00615  // For each row of A/C
00616   for(i=0; i<m; i++){            
00617     for(k=Arowptr[i]; k<Arowptr[i+1]; k++){
00618       int Ak=Acolind[k];
00619       double Aval = Avals[k];
00620       if(Aval==0) continue;
00621 
00622       if(Bview.targetMapToOrigRow[Ak] != -1){
00623   // Local matrix
00624   int Bk = Bview.targetMapToOrigRow[Ak];
00625   for(j=Browptr[Bk]; j<Browptr[Bk+1]; ++j) {
00626     int Cj=Bcol2Ccol[Bcolind[j]];
00627 
00628     if(c_status[Cj]<OLD_ip){
00629       // New entry
00630       if(CSR_ip >= CSR_alloc) EPETRA_CHK_ERR(-13);
00631       c_status[Cj]=CSR_ip;
00632       CSR_colind[CSR_ip]=Cj;
00633       CSR_vals[CSR_ip]=Aval*Bvals[j];
00634       CSR_ip++;
00635     }
00636     else
00637       CSR_vals[c_status[Cj]]+=Aval*Bvals[j];
00638   }
00639       }
00640       else{
00641   // Remote matrix
00642   int Ik = Bview.targetMapToImportRow[Ak];
00643   for(j=Irowptr[Ik]; j<Irowptr[Ik+1]; ++j) {
00644     int Cj=Bimportcol2Ccol[Icolind[j]];
00645 
00646     if(c_status[Cj]<OLD_ip){
00647       // New entry
00648       if(CSR_ip >= CSR_alloc) EPETRA_CHK_ERR(-14);
00649       c_status[Cj]=CSR_ip;
00650       CSR_colind[CSR_ip]=Cj;
00651       CSR_vals[CSR_ip]=Aval*Ivals[j];
00652       CSR_ip++;
00653     }
00654     else
00655       CSR_vals[c_status[Cj]]+=Aval*Ivals[j];
00656   }
00657       }
00658     }
00659     OLD_ip=CSR_ip;
00660   }
00661 
00662 #ifdef ENABLE_MMM_TIMINGS
00663   MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("EpetraExt: MMM Reuse Final Sort")));
00664 #endif
00665 
00666   // Sort the entries
00667   Epetra_Util::SortCrsEntries(m, &CSR_rowptr[0], &CSR_colind[0], &CSR_vals[0]);
00668 
00669   return 0;
00670 }
00671 
00672 
00673 
00674 /*****************************************************************************/
00675 /*****************************************************************************/
00676 /*****************************************************************************/
00677 //kernel method for computing the local portion of C = A*B
00678 template<typename int_type>
00679   int mult_A_B_general(const Epetra_CrsMatrix & A,
00680            CrsMatrixStruct & Aview,
00681            const Epetra_CrsMatrix & B,
00682            CrsMatrixStruct& Bview,
00683            Epetra_CrsMatrix& C,
00684            bool call_FillComplete_on_result)
00685 {
00686   int C_firstCol = Bview.colMap->MinLID();
00687   int C_lastCol = Bview.colMap->MaxLID();
00688 
00689   int C_firstCol_import = 0;
00690   int C_lastCol_import = -1;
00691 
00692   int_type* bcols = 0;
00693   Bview.colMap->MyGlobalElementsPtr(bcols);
00694   int_type* bcols_import = NULL;
00695   if (Bview.importMatrix != NULL) {
00696     C_firstCol_import = Bview.importMatrix->ColMap_.MinLID();
00697     C_lastCol_import = Bview.importMatrix->ColMap_.MaxLID();
00698     Bview.importMatrix->ColMap_.MyGlobalElementsPtr(bcols_import);
00699   }
00700 
00701   int C_numCols = C_lastCol - C_firstCol + 1;
00702   int C_numCols_import = C_lastCol_import - C_firstCol_import + 1;
00703 
00704   if (C_numCols_import > C_numCols) C_numCols = C_numCols_import;
00705 
00706   // Allocate workspace memory
00707   double* dwork = new double[C_numCols];
00708   int_type* iwork = new int_type[C_numCols];
00709   int_type *c_cols=iwork;
00710   double *c_vals=dwork;
00711   int *c_index=new int[C_numCols];
00712 
00713   int  i, j, k;
00714   bool C_filled=C.Filled();
00715 
00716   // DataPointers for A
00717   int *Arowptr, *Acolind;
00718   double *Avals;
00719   EPETRA_CHK_ERR(A.ExtractCrsDataPointers(Arowptr,Acolind,Avals));
00720 
00721   //To form C = A*B we're going to execute this expression:
00722   //
00723   // C(i,j) = sum_k( A(i,k)*B(k,j) )
00724   //
00725   //Our goal, of course, is to navigate the data in A and B once, without
00726   //performing searches for column-indices, etc.
00727   
00728   // Mark indices as empty w/ -1
00729   for(k=0;k<C_numCols;k++) c_index[k]=-1;
00730 
00731   //loop over the rows of A.
00732   for(i=0; i<A.NumMyRows(); ++i) {
00733 
00734     int_type global_row = (int_type) Aview.rowMap->GID64(i);
00735 
00736     //loop across the i-th row of A and for each corresponding row
00737     //in B, loop across colums and accumulate product
00738     //A(i,k)*B(k,j) into our partial sum quantities C_row_i. In other words,
00739     //as we stride across B(k,:) we're calculating updates for row i of the
00740     //result matrix C.
00741 
00742     /* Outline of the revised, ML-inspired algorithm
00743  
00744     C_{i,j} = \sum_k A_{i,k} B_{k,j}
00745 
00746     This algorithm uses a "middle product" formulation, with the loop ordering of 
00747     i, k, j.  This means we compute a row of C at a time, but compute partial sums of 
00748     each entry in row i until we finish the k loop.
00749 
00750     This algorithm also has a few twists worth documenting.  
00751 
00752     1) The first major twist involves the c_index, c_cols and c_vals arrays.  The arrays c_cols 
00753     and c_vals store the *local* column index and values accumulator respectively.  These 
00754     arrays are allocated to a size equal to the max number of local columns in C, namely C_numcols.
00755     The value c_current tells us how many non-zeros we currently have in this row.
00756     
00757     So how do we take a LCID and find the right accumulator?  This is where the c_index array
00758     comes in.  At the start (and stop) and the i loop, c_index is filled with -1's.  Now 
00759     whenever we find a LCID in the k loop, we first loop at c_index[lcid].  If this value is
00760     -1 we haven't seen this entry yet.  In which case we add the appropriate stuff to c_cols
00761     and c_vals and then set c_index[lcid] to the location of the accumulator (c_current before
00762     we increment it).  If the value is NOT -1, this tells us the location in the c_vals/c_cols
00763     arrays (namely c_index[lcid]) where our accumulator lives.
00764 
00765     This trick works because we're working with local ids.  We can then loop from 0 to c_current
00766     and reset c_index to -1's when we're done, only touching the arrays that have changed.
00767     While we're at it, we can switch to the global ids so we can call [Insert|SumInto]GlobalValues.
00768     Thus, the effect of this trick is to avoid passes over the index array.
00769 
00770     2) The second major twist involves handling the remote and local components of B separately.
00771     (ML doesn't need to do this, because its local ordering scheme is consistent between the local
00772     and imported components of B.)  Since they have different column maps, they have inconsistent 
00773     local column ids.  This means the "second twist" won't work as stated on both matrices at the 
00774     same time.  While this could be handled any number of ways, I have chosen to do the two parts 
00775     of B separately to make the code easier to read (and reduce the memory footprint of the MMM).
00776     */
00777 
00778     // Local matrix: Zero Current counts for matrix
00779     int c_current=0;    
00780 
00781     // Local matrix: Do the "middle product"
00782     for(k=Arowptr[i]; k<Arowptr[i+1]; ++k) {
00783       int Ak      = Acolind[k];
00784       double Aval = Avals[k];
00785       // We're skipping remote entries on this pass.
00786       if(Bview.remote[Ak] || Aval==0) continue;
00787       
00788       int* Bcol_inds = Bview.indices[Ak];
00789       double* Bvals_k = Bview.values[Ak];
00790 
00791       for(j=0; j<Bview.numEntriesPerRow[Ak]; ++j) {
00792   int col=Bcol_inds[j];
00793   if(c_index[col]<0){
00794     // We haven't seen this entry before; add it.  (In ML, on
00795     // the first pass, you haven't seen any of the entries
00796     // before, so they are added without the check.  Not sure
00797     // how much more efficient that would be; depends on branch
00798     // prediction.  We've favored code readability here.)
00799     c_cols[c_current]=col;        
00800     c_vals[c_current]=Aval*Bvals_k[j];
00801     c_index[col]=c_current;
00802     c_current++;
00803   }
00804   else{ 
00805     // We've already seen this entry; accumulate it.
00806     c_vals[c_index[col]]+=Aval*Bvals_k[j];      
00807   }
00808       }
00809     }    
00810     // Local matrix: Reset c_index and switch c_cols to GIDs
00811     for(k=0; k<c_current; k++){
00812       c_index[c_cols[k]]=-1;
00813       c_cols[k]=bcols[c_cols[k]]; // Switch from local to global IDs.     
00814     }
00815     // Local matrix: Insert.
00816     //
00817     // We should check global error results after the algorithm is
00818     // through.  It's probably safer just to let the algorithm run all
00819     // the way through before doing this, since otherwise we have to
00820     // remember to free all allocations carefully.
00821     int err = C_filled ?
00822       C.SumIntoGlobalValues(global_row,c_current,c_vals,c_cols)
00823       :
00824       C.InsertGlobalValues(global_row,c_current,c_vals,c_cols);   
00825 
00826     // Remote matrix: Zero current counts again for matrix
00827     c_current=0;    
00828 
00829     // Remote matrix: Do the "middle product"
00830     for(k=Arowptr[i]; k<Arowptr[i+1]; ++k) {
00831       int Ak      = Acolind[k];
00832       double Aval = Avals[k];
00833       // We're skipping local entries on this pass.
00834       if(!Bview.remote[Ak] || Aval==0) continue;
00835       
00836       int* Bcol_inds = Bview.indices[Ak];
00837       double* Bvals_k = Bview.values[Ak];
00838 
00839       for(j=0; j<Bview.numEntriesPerRow[Ak]; ++j) {
00840   int col=Bcol_inds[j];
00841   if(c_index[col]<0){
00842     c_cols[c_current]=col;        
00843     c_vals[c_current]=Aval*Bvals_k[j];
00844     c_index[col]=c_current;
00845     c_current++;
00846   }
00847   else{
00848     c_vals[c_index[col]]+=Aval*Bvals_k[j];      
00849   }
00850       }
00851     }    
00852     // Remote matrix: Reset c_index and switch c_cols to GIDs
00853     for(k=0; k<c_current; k++){
00854       c_index[c_cols[k]]=-1;
00855       c_cols[k]=bcols_import[c_cols[k]];      
00856     }
00857     // Remove matrix: Insert
00858     //
00859     // See above (on error handling).
00860     err = C_filled ?
00861       C.SumIntoGlobalValues(global_row,c_current,c_vals,c_cols)
00862       :
00863       C.InsertGlobalValues(global_row,c_current,c_vals,c_cols);    
00864   }
00865   
00866   // Since Multiply won't do this
00867   if(call_FillComplete_on_result)
00868     C.FillComplete(B.DomainMap(),A.RangeMap());
00869 
00870   delete [] dwork;
00871   delete [] iwork;
00872   delete [] c_index;
00873   return(0);
00874 }
00875 
00876 
00877 
00878 
00879 
00880 /*****************************************************************************/
00881 /*****************************************************************************/
00882 /*****************************************************************************/
00883 template<typename int_type>
00884 int MatrixMatrix::Tmult_A_B(const Epetra_CrsMatrix & A,
00885          CrsMatrixStruct & Aview,
00886          const Epetra_CrsMatrix & B,
00887          CrsMatrixStruct& Bview,
00888          Epetra_CrsMatrix& C,
00889          bool call_FillComplete_on_result){
00890 
00891   int i,rv;
00892   Epetra_Map* mapunion = 0;
00893   const Epetra_Map * colmap_B = &(B.ColMap());
00894   const Epetra_Map * colmap_C = &(C.ColMap());
00895 
00896   std::vector<int> Cremotepids;
00897   std::vector<int> Bcol2Ccol(B.ColMap().NumMyElements());
00898   std::vector<int> Bimportcol2Ccol;
00899   if(Bview.importMatrix) Bimportcol2Ccol.resize(Bview.importMatrix->ColMap_.NumMyElements());
00900 
00901 #ifdef ENABLE_MMM_TIMINGS
00902   using Teuchos::TimeMonitor;
00903   Teuchos::RCP<Teuchos::TimeMonitor> MM;
00904 #endif  
00905 
00906   // If the user doesn't want us to call FillComplete, use the general routine
00907   if(!call_FillComplete_on_result) {
00908 #ifdef ENABLE_MMM_TIMINGS
00909     MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("EpetraExt: MMM General Multiply")));
00910 #endif
00911     rv=mult_A_B_general<int_type>(A,Aview,B,Bview,C,false);
00912     return rv;
00913   }
00914 
00915   // Is this a "clean" matrix
00916   bool NewFlag=!C.IndicesAreLocal() && !C.IndicesAreGlobal();
00917 
00918   // Does ExtractCrsDataPointers work?
00919   int *C_rowptr, *C_colind;
00920   double * C_vals;
00921   C.ExtractCrsDataPointers(C_rowptr,C_colind,C_vals);
00922   bool ExtractFailFlag=!C_rowptr || !C_colind || !C_vals;
00923 
00924   // It's a new matrix that hasn't been fill-completed, use the general routine
00925   if(!NewFlag && ExtractFailFlag){
00926 #ifdef ENABLE_MMM_TIMINGS
00927     MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("EpetraExt: MMM General Multiply")));
00928 #endif
00929     rv=mult_A_B_general<int_type>(A,Aview,B,Bview,C,call_FillComplete_on_result);
00930     return rv;
00931   }
00932 
00933 
00934 #ifdef ENABLE_MMM_TIMINGS
00935   if(NewFlag) MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("EpetraExt: MMM Newmatrix CMap")));
00936   else MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("EpetraExt: MMM Reuse CMap")));
00937 #endif
00938 
00939   // If new, build & clobber a colmap for C
00940   if(NewFlag){
00941     if(Bview.importMatrix) {
00942       EPETRA_CHK_ERR( aztecoo_and_ml_compatible_map_union<int_type>(B,*Bview.importMatrix,mapunion,Cremotepids,Bcol2Ccol,Bimportcol2Ccol) );
00943       EPETRA_CHK_ERR( C.ReplaceColMap(*mapunion) );
00944     }
00945     else  {
00946       EPETRA_CHK_ERR( C.ReplaceColMap(B.ColMap()) );
00947       for(i=0;i<colmap_B->NumMyElements();i++) Bcol2Ccol[i]=i;
00948       
00949       // Copy B's remote list (if any)
00950       if(B.Importer()) 
00951   EPETRA_CHK_ERR( Epetra_Util::GetRemotePIDs(*B.Importer(),Cremotepids));
00952     }
00953   }
00954 
00955 #ifdef ENABLE_MMM_TIMINGS
00956   if(NewFlag)  MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("EpetraExt: MMM Newmatrix Lookups")));
00957   else MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("EpetraExt: MMM Reuse Lookups")));
00958 #endif
00959 
00960   // ********************************************
00961   // Setup Bcol2Ccol / Bimportcol2Ccol lookups
00962   // ********************************************
00963   // Note: If we ran the map_union, we have this information already
00964 
00965   if(!NewFlag) {
00966     if(colmap_B->SameAs(*colmap_C)){
00967       // Maps are the same: Use local IDs as the hash
00968       for(i=0;i<colmap_B->NumMyElements();i++)
00969   Bcol2Ccol[i]=i;       
00970     }
00971     else {
00972       // Maps are not the same:  Use the map's hash
00973       for(i=0;i<colmap_B->NumMyElements();i++){
00974   Bcol2Ccol[i]=colmap_C->LID((int_type) colmap_B->GID64(i));
00975   if(Bcol2Ccol[i]==-1) EPETRA_CHK_ERR(-11);
00976       }
00977     }
00978     
00979     if(Bview.importMatrix){
00980       Bimportcol2Ccol.resize(Bview.importMatrix->ColMap_.NumMyElements());
00981       for(i=0;i<Bview.importMatrix->ColMap_.NumMyElements();i++){
00982       Bimportcol2Ccol[i]=colmap_C->LID((int_type) Bview.importMatrix->ColMap_.GID64(i));
00983       if(Bimportcol2Ccol[i]==-1) EPETRA_CHK_ERR(-12);
00984       }
00985       
00986     }
00987   }
00988 #ifdef ENABLE_MMM_TIMINGS
00989   MM=Teuchos::null;
00990 #endif
00991 
00992   // Call the appropriate core routine
00993   if(NewFlag) {
00994     EPETRA_CHK_ERR(mult_A_B_newmatrix<int_type>(A,B,Bview,Bcol2Ccol,Bimportcol2Ccol,Cremotepids,C));
00995   }
00996   else {
00997     // This always has a real map
00998     EPETRA_CHK_ERR(mult_A_B_reuse<int_type>(A,B,Bview,Bcol2Ccol,Bimportcol2Ccol,C));
00999   }
01000 
01001   // Cleanup      
01002   delete mapunion;
01003   return 0;
01004 }
01005 
01006 
01007 int MatrixMatrix::mult_A_B(const Epetra_CrsMatrix & A,
01008          CrsMatrixStruct & Aview,
01009          const Epetra_CrsMatrix & B,
01010          CrsMatrixStruct& Bview,
01011          Epetra_CrsMatrix& C,
01012          bool call_FillComplete_on_result){
01013 
01014 #ifndef EPETRA_NO_32BIT_GLOBAL_INDICES
01015   if(A.RowMap().GlobalIndicesInt() && B.RowMap().GlobalIndicesInt()) {
01016   return Tmult_A_B<int>(A, Aview, B, Bview, C, call_FillComplete_on_result);
01017   }
01018   else
01019 #endif
01020 #ifndef EPETRA_NO_64BIT_GLOBAL_INDICES
01021   if(A.RowMap().GlobalIndicesLongLong() && B.RowMap().GlobalIndicesLongLong()) {
01022   return Tmult_A_B<long long>(A, Aview, B, Bview, C, call_FillComplete_on_result);
01023   }
01024   else
01025 #endif
01026     throw "EpetraExt::MatrixMatrix::mult_A_B: GlobalIndices type unknown";
01027 }
01028 
01029 
01030 /*****************************************************************************/
01031 /*****************************************************************************/
01032 /*****************************************************************************/
01033 template<typename int_type>
01034 int jacobi_A_B_reuse(double omega,
01035          const Epetra_Vector & Dinv,
01036          const Epetra_CrsMatrix & A,
01037          const Epetra_CrsMatrix & B,
01038          CrsMatrixStruct& Bview,
01039          std::vector<int> & Bcol2Ccol,
01040          std::vector<int> & Bimportcol2Ccol,
01041          Epetra_CrsMatrix& C){
01042 
01043 #ifdef ENABLE_MMM_TIMINGS
01044   using Teuchos::TimeMonitor;
01045   Teuchos::RCP<Teuchos::TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("EpetraExt: Jacobi Reuse SerialCore")));
01046 #endif
01047 
01048   // *****************************
01049   // Improved Parallel Gustavson in Local IDs
01050   // *****************************
01051   const Epetra_Map * colmap_C = &(C.ColMap());
01052 
01053   int m=A.NumMyRows();
01054   int n=colmap_C->NumMyElements();
01055   int i,j,k;
01056 
01057   // DataPointers for A
01058   int *Arowptr, *Acolind;
01059   double *Avals;
01060   EPETRA_CHK_ERR(A.ExtractCrsDataPointers(Arowptr,Acolind,Avals));
01061 
01062   // DataPointers for B, Bimport
01063   int *Browptr, *Bcolind;
01064   double *Bvals;
01065   EPETRA_CHK_ERR(B.ExtractCrsDataPointers(Browptr,Bcolind,Bvals));
01066 
01067   int *Irowptr=0, *Icolind=0;
01068   double *Ivals=0;
01069   if(Bview.importMatrix){
01070     Irowptr = &Bview.importMatrix->rowptr_[0];
01071     Icolind = &Bview.importMatrix->colind_[0];
01072     Ivals   = &Bview.importMatrix->vals_[0];
01073   }
01074 
01075   // Data pointer for Dinv
01076   const double *Dvals = Dinv.Values();
01077 
01078   // DataPointers for C
01079   int *CSR_rowptr, *CSR_colind;
01080   double *CSR_vals;
01081   EPETRA_CHK_ERR(C.ExtractCrsDataPointers(CSR_rowptr,CSR_colind,CSR_vals));
01082 
01083 
01084   // The status array will contain the index into colind where this dude was last deposited.
01085   // c_status[i] < CSR_ip - not in the row yet.
01086   // c_status[i] >= CSR_ip, this is the entry where you can find the data
01087   // We start with this filled with -1's indicating that there are no entries yet.
01088   std::vector<int> c_status(n, -1);
01089 
01090   // Classic csr assembly
01091   int CSR_alloc=CSR_rowptr[m] - CSR_rowptr[0];
01092   int CSR_ip=0,OLD_ip=0;
01093 
01094   // For each row of C
01095   for(i=0; i<m; i++){ 
01096     double Dval = Dvals[i];
01097 
01098     // Entries of B
01099     for(k=Browptr[i]; k<Browptr[i+1]; k++){
01100       //      int Bk      = Bcolind[k];
01101       double Bval = Bvals[k];
01102       if(Bval==0) continue;
01103       int Ck=Bcol2Ccol[Bcolind[k]];
01104       
01105       // Assume no repeated entries in B
01106       c_status[Ck]=CSR_ip;
01107       CSR_colind[CSR_ip]=Ck;
01108       CSR_vals[CSR_ip]= Bvals[k];
01109       CSR_ip++;
01110     }
01111 
01112     // Entries of -omega * Dinv * A * B          
01113     for(k=Arowptr[i]; k<Arowptr[i+1]; k++){
01114       int Ak=Acolind[k];
01115       double Aval = Avals[k];
01116       if(Aval==0) continue;
01117 
01118       if(Bview.targetMapToOrigRow[Ak] != -1){
01119   // Local matrix
01120   int Bk = Bview.targetMapToOrigRow[Ak];
01121   for(j=Browptr[Bk]; j<Browptr[Bk+1]; ++j) {
01122     int Cj=Bcol2Ccol[Bcolind[j]];
01123 
01124     if(c_status[Cj]<OLD_ip){
01125       // New entry
01126       if(CSR_ip >= CSR_alloc) EPETRA_CHK_ERR(-13);
01127       c_status[Cj]=CSR_ip;
01128       CSR_colind[CSR_ip]=Cj;
01129       CSR_vals[CSR_ip]= - omega * Dval * Aval * Bvals[j];
01130       CSR_ip++;
01131     }
01132     else
01133       CSR_vals[c_status[Cj]]-= omega * Dval * Aval * Bvals[j];
01134   }
01135       }
01136       else{
01137   // Remote matrix
01138   int Ik = Bview.targetMapToImportRow[Ak];
01139   for(j=Irowptr[Ik]; j<Irowptr[Ik+1]; ++j) {
01140     int Cj=Bimportcol2Ccol[Icolind[j]];
01141 
01142     if(c_status[Cj]<OLD_ip){
01143       // New entry
01144       if(CSR_ip >= CSR_alloc) EPETRA_CHK_ERR(-14);
01145       c_status[Cj]=CSR_ip;
01146       CSR_colind[CSR_ip]=Cj;
01147       CSR_vals[CSR_ip]= - omega * Dval * Aval * Ivals[j];
01148       CSR_ip++;
01149     }
01150     else
01151       CSR_vals[c_status[Cj]]-=omega * Dval * Aval * Ivals[j];
01152   }
01153       }
01154     }
01155     OLD_ip=CSR_ip;
01156   }
01157 
01158 #ifdef ENABLE_MMM_TIMINGS
01159   MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("EpetraExt: Jacobi Reuse Final Sort")));
01160 #endif
01161   // Sort the entries
01162   Epetra_Util::SortCrsEntries(m, &CSR_rowptr[0], &CSR_colind[0], &CSR_vals[0]);
01163 
01164   return 0;
01165 }
01166 
01167 
01168 /*****************************************************************************/
01169 /*****************************************************************************/
01170 /*****************************************************************************/
01171 template<typename int_type>
01172 int jacobi_A_B_newmatrix(double omega,
01173        const Epetra_Vector & Dinv,
01174        const Epetra_CrsMatrix & A,
01175        const Epetra_CrsMatrix & B,
01176        CrsMatrixStruct& Bview,
01177        std::vector<int> & Bcol2Ccol,
01178        std::vector<int> & Bimportcol2Ccol,
01179        std::vector<int>& Cremotepids,
01180        Epetra_CrsMatrix& C)
01181 {
01182 #ifdef ENABLE_MMM_TIMINGS
01183   using Teuchos::TimeMonitor;
01184   Teuchos::RCP<Teuchos::TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("EpetraExt: Jacobi NewMatrix SerialCore")));
01185 #endif
01186 
01187   // *****************************
01188   // Improved Parallel Gustavson in Local IDs
01189   // *****************************
01190   const Epetra_Map * colmap_C = &(C.ColMap());
01191   int NumMyDiagonals=0; // Counter to speed up ESFC
01192 
01193   int m=A.NumMyRows();
01194   int n=colmap_C->NumMyElements();
01195   int i,j,k;
01196 
01197   // DataPointers for A
01198   int *Arowptr, *Acolind;
01199   double *Avals;
01200   EPETRA_CHK_ERR(A.ExtractCrsDataPointers(Arowptr,Acolind,Avals));
01201 
01202   // DataPointers for B, Bimport
01203   int *Browptr, *Bcolind;
01204   double *Bvals;
01205   EPETRA_CHK_ERR(B.ExtractCrsDataPointers(Browptr,Bcolind,Bvals));
01206 
01207   // Data pointer for Dinv
01208   const double *Dvals = Dinv.Values();
01209 
01210   int *Irowptr=0, *Icolind=0;
01211   double *Ivals=0;
01212   if(Bview.importMatrix){
01213     Irowptr = &Bview.importMatrix->rowptr_[0];
01214     Icolind = (Bview.importMatrix->colind_.size()>0)?(&Bview.importMatrix->colind_[0]):0;
01215     Ivals   = (Bview.importMatrix->vals_.size()>0)?(&Bview.importMatrix->vals_[0]):0;
01216   }
01217 
01218   // MemorySetup: If somebody else is sharing this C's graphdata, make a new one.
01219   // This is needed because I'm about to walk all over the CrsGrapData...
01220   C.ExpertMakeUniqueCrsGraphData();
01221 
01222   // The status array will contain the index into colind where this entry was last deposited.
01223   // c_status[i] < CSR_ip - not in the row yet.
01224   // c_status[i] >= CSR_ip, this is the entry where you can find the data
01225   // We start with this filled with -1's indicating that there are no entries yet.
01226   std::vector<int> c_status(n, -1);
01227 
01228   // Classic csr assembly (low memory edition)
01229   int CSR_alloc=C_estimate_nnz(A,B);
01230   if(CSR_alloc < B.NumMyNonzeros()) CSR_alloc = B.NumMyNonzeros(); // update for Jacobi
01231   int CSR_ip=0,OLD_ip=0;
01232   Epetra_IntSerialDenseVector & CSR_rowptr = C.ExpertExtractIndexOffset();
01233   Epetra_IntSerialDenseVector & CSR_colind = C.ExpertExtractIndices();  
01234   double *&                     CSR_vals   = C.ExpertExtractValues();
01235 
01236   CSR_rowptr.Resize(m+1);
01237   CSR_colind.Resize(CSR_alloc);
01238   resize_doubles(0,CSR_alloc,CSR_vals);
01239 
01240   // Static Profile stuff
01241   std::vector<int> NumEntriesPerRow(m);
01242 
01243   // For each row of C
01244   for(i=0; i<m; i++){
01245     bool found_diagonal=false;
01246     CSR_rowptr[i]=CSR_ip;
01247     double Dval = Dvals[i];
01248 
01249     // Entries of B
01250     for(k=Browptr[i]; k<Browptr[i+1]; k++){
01251       //int Bk      = Bcolind[k];
01252       double Bval = Bvals[k];
01253       if(Bval==0) continue;
01254       int Ck=Bcol2Ccol[Bcolind[k]];
01255       
01256       // Assume no repeated entries in B
01257       c_status[Ck]=CSR_ip;
01258       CSR_colind[CSR_ip]=Ck;
01259       CSR_vals[CSR_ip]= Bvals[k];
01260       CSR_ip++;
01261     }
01262 
01263     // Entries of -omega * Dinv * A * B
01264     for(k=Arowptr[i]; k<Arowptr[i+1]; k++){
01265       int Ak      = Acolind[k];
01266       double Aval = Avals[k];
01267       if(Aval==0) continue;
01268       
01269       if(Bview.targetMapToOrigRow[Ak] != -1){
01270   // Local matrix
01271   int Bk = Bview.targetMapToOrigRow[Ak];
01272   for(j=Browptr[Bk]; j<Browptr[Bk+1]; ++j) {
01273     int Cj=Bcol2Ccol[Bcolind[j]];
01274 
01275     if(Cj==i && !found_diagonal) {found_diagonal=true; NumMyDiagonals++;}
01276 
01277     if(c_status[Cj]<OLD_ip){
01278       // New entry
01279       c_status[Cj]=CSR_ip;
01280       CSR_colind[CSR_ip]=Cj;
01281       CSR_vals[CSR_ip]= - omega * Dval* Aval * Bvals[j];
01282       CSR_ip++;
01283     }
01284     else
01285       CSR_vals[c_status[Cj]]-= omega * Dval * Aval * Bvals[j];
01286   }
01287       }
01288       else{
01289   // Remote matrix
01290   int Ik = Bview.targetMapToImportRow[Ak];
01291   for(j=Irowptr[Ik]; j<Irowptr[Ik+1]; ++j) {
01292     int Cj=Bimportcol2Ccol[Icolind[j]];
01293     
01294     if(Cj==i && !found_diagonal) {found_diagonal=true; NumMyDiagonals++;}
01295 
01296     if(c_status[Cj]<OLD_ip){
01297       // New entry
01298       c_status[Cj]=CSR_ip;
01299       CSR_colind[CSR_ip]=Cj;
01300       CSR_vals[CSR_ip]= - omega * Dval * Aval * Ivals[j];
01301       CSR_ip++;
01302     }
01303     else
01304       CSR_vals[c_status[Cj]]-= omega * Dval * Aval * Ivals[j];
01305   }
01306       }
01307     }
01308     NumEntriesPerRow[i]=CSR_ip-CSR_rowptr[i];
01309 
01310     // Resize for next pass if needed
01311     if(CSR_ip + n > CSR_alloc){
01312       resize_doubles(CSR_alloc,2*CSR_alloc,CSR_vals);
01313       CSR_alloc*=2;
01314       CSR_colind.Resize(CSR_alloc);
01315     }
01316     OLD_ip=CSR_ip;
01317   }
01318 
01319   CSR_rowptr[m]=CSR_ip;
01320 
01321 #ifdef ENABLE_MMM_TIMINGS
01322   MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("EpetraExt: Jacobi NewMatrix Final Sort")));
01323 #endif
01324 
01325   // Sort the entries
01326   Epetra_Util::SortCrsEntries(m, &CSR_rowptr[0], &CSR_colind[0], &CSR_vals[0]);
01327 
01328 #ifdef ENABLE_MMM_TIMINGS
01329   MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("EpetraExt: Jacobi NewMatrix Fast IE")));
01330 #endif
01331 
01332   // Do a fast build of C's importer
01333   Epetra_Import * Cimport=0; 
01334   int *RemotePIDs = Cremotepids.size()?&Cremotepids[0]:0;
01335   int NumExports=0;
01336   int *ExportLIDs=0, *ExportPIDs=0;
01337   if(Bview.importMatrix) { 
01338     NumExports = Bview.importMatrix->ExportLIDs_.size();
01339     ExportLIDs = Bview.importMatrix->ExportLIDs_.size()?&Bview.importMatrix->ExportLIDs_[0]:0;
01340     ExportPIDs = Bview.importMatrix->ExportPIDs_.size()?&Bview.importMatrix->ExportPIDs_[0]:0;
01341   }
01342   else if(B.Importer()) {
01343     // Grab the exports from B proper
01344     NumExports = B.Importer()->NumExportIDs();
01345     ExportLIDs = B.Importer()->ExportLIDs();
01346     ExportPIDs = B.Importer()->ExportPIDs();    
01347   }
01348 
01349   if(!C.ColMap().SameAs(B.DomainMap()))
01350     Cimport = new Epetra_Import(C.ColMap(),B.DomainMap(),Cremotepids.size(),RemotePIDs,NumExports,ExportLIDs,ExportPIDs);
01351 
01352 #ifdef ENABLE_MMM_TIMINGS
01353   MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("EpetraExt: Jacobi NewMatrix ESFC")));
01354 #endif
01355 
01356   // Update the CrsGraphData
01357   C.ExpertStaticFillComplete(B.DomainMap(),A.RangeMap(),Cimport,0,NumMyDiagonals);
01358 
01359   return 0;
01360 }
01361 
01362 
01363 
01364 /*****************************************************************************/
01365 /*****************************************************************************/
01366 /*****************************************************************************/
01367 template<typename int_type>
01368 int MatrixMatrix::Tjacobi_A_B(double omega,
01369            const Epetra_Vector & Dinv,
01370            const Epetra_CrsMatrix & A,
01371            CrsMatrixStruct & Aview,
01372            const Epetra_CrsMatrix & B,
01373            CrsMatrixStruct& Bview,
01374            Epetra_CrsMatrix& C,
01375             bool call_FillComplete_on_result){
01376   int i,rv;
01377   Epetra_Map* mapunion = 0;
01378   const Epetra_Map * colmap_B = &(B.ColMap());
01379   const Epetra_Map * colmap_C = &(C.ColMap());
01380 
01381   std::vector<int> Cremotepids;
01382   std::vector<int> Bcol2Ccol(B.ColMap().NumMyElements());
01383   std::vector<int> Bimportcol2Ccol;
01384   if(Bview.importMatrix) Bimportcol2Ccol.resize(Bview.importMatrix->ColMap_.NumMyElements());
01385 
01386 #ifdef ENABLE_MMM_TIMINGS
01387   using Teuchos::TimeMonitor;
01388   Teuchos::RCP<Teuchos::TimeMonitor> MM;
01389 #endif  
01390 
01391   // If the user doesn't want us to call FillComplete, use the general routine
01392   if(!call_FillComplete_on_result) {
01393 #ifdef ENABLE_MMM_TIMINGS
01394     MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("EpetraExt: Jacobi General Multiply")));
01395 #endif
01396     throw std::runtime_error("jacobi_A_B_general not implemented");
01397     //    rv=mult_A_B_general<int_type>(A,Aview,B,Bview,C,false);
01398     return rv;
01399   }
01400 
01401   // Is this a "clean" matrix
01402   bool NewFlag=!C.IndicesAreLocal() && !C.IndicesAreGlobal();
01403 
01404   // Does ExtractCrsDataPointers work?
01405   int *C_rowptr, *C_colind;
01406   double * C_vals;
01407   C.ExtractCrsDataPointers(C_rowptr,C_colind,C_vals);
01408   bool ExtractFailFlag=!C_rowptr || !C_colind || !C_vals;
01409 
01410   // It's a new matrix that hasn't been fill-completed, use the general routine
01411   if(!NewFlag && ExtractFailFlag){
01412 #ifdef ENABLE_MMM_TIMINGS
01413     MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("EpetraExt: Jacobi General Multiply")));
01414 #endif
01415     throw std::runtime_error("jacobi_A_B_general not implemented");
01416     //    rv=mult_A_B_general<int_type>(A,Aview,B,Bview,C,call_FillComplete_on_result);
01417     return rv;
01418   }
01419 
01420 #ifdef ENABLE_MMM_TIMINGS
01421   if(NewFlag) MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("EpetraExt: Jacobi Newmatrix CMap")));
01422   else MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("EpetraExt: Jacobi Reuse CMap")));
01423 #endif
01424 
01425   // If new, build & clobber a colmap for C
01426   if(NewFlag){
01427     if(Bview.importMatrix) {
01428       EPETRA_CHK_ERR( aztecoo_and_ml_compatible_map_union<int_type>(B,*Bview.importMatrix,mapunion,Cremotepids,Bcol2Ccol,Bimportcol2Ccol) );
01429       EPETRA_CHK_ERR( C.ReplaceColMap(*mapunion) );
01430     }
01431     else  {
01432       EPETRA_CHK_ERR( C.ReplaceColMap(B.ColMap()) );
01433       for(i=0;i<colmap_B->NumMyElements();i++) Bcol2Ccol[i]=i;
01434       
01435       // Copy B's remote list (if any)
01436       if(B.Importer()) 
01437   EPETRA_CHK_ERR( Epetra_Util::GetRemotePIDs(*B.Importer(),Cremotepids));
01438     }
01439   }
01440 
01441 #ifdef ENABLE_MMM_TIMINGS
01442   if(NewFlag) MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("EpetraExt: Jacobi Newmatrix Lookups")));
01443   else MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("EpetraExt: Jacobi Reuse Lookups")));
01444 #endif
01445 
01446   // ********************************************
01447   // Setup Bcol2Ccol / Bimportcol2Ccol lookups
01448   // ********************************************
01449   // Note: If we ran the map_union, we have this information already
01450 
01451   if(!NewFlag) {
01452     if(colmap_B->SameAs(*colmap_C)){
01453       // Maps are the same: Use local IDs as the hash
01454       for(i=0;i<colmap_B->NumMyElements();i++)
01455   Bcol2Ccol[i]=i;       
01456     }
01457     else {
01458       // Maps are not the same:  Use the map's hash
01459       for(i=0;i<colmap_B->NumMyElements();i++){
01460   Bcol2Ccol[i]=colmap_C->LID((int_type) colmap_B->GID64(i));
01461   if(Bcol2Ccol[i]==-1) EPETRA_CHK_ERR(-11);
01462       }
01463     }
01464     
01465     if(Bview.importMatrix){
01466       Bimportcol2Ccol.resize(Bview.importMatrix->ColMap_.NumMyElements());
01467       for(i=0;i<Bview.importMatrix->ColMap_.NumMyElements();i++){
01468       Bimportcol2Ccol[i]=colmap_C->LID((int_type) Bview.importMatrix->ColMap_.GID64(i));
01469       if(Bimportcol2Ccol[i]==-1) EPETRA_CHK_ERR(-12);
01470       }
01471       
01472     }
01473   }
01474   
01475 
01476   // Call the appropriate core routine
01477   if(NewFlag) {
01478     EPETRA_CHK_ERR(jacobi_A_B_newmatrix<int_type>(omega,Dinv,A,B,Bview,Bcol2Ccol,Bimportcol2Ccol,Cremotepids,C));
01479   }
01480   else {
01481     // This always has a real map
01482     EPETRA_CHK_ERR(jacobi_A_B_reuse<int_type>(omega,Dinv,A,B,Bview,Bcol2Ccol,Bimportcol2Ccol,C));
01483   }
01484 
01485   // Cleanup      
01486   delete mapunion;
01487   return 0;
01488 }
01489 
01490 
01491 int MatrixMatrix::jacobi_A_B(double omega,
01492            const Epetra_Vector & Dinv,
01493            const Epetra_CrsMatrix & A,
01494            CrsMatrixStruct & Aview,
01495            const Epetra_CrsMatrix & B,
01496            CrsMatrixStruct& Bview,
01497            Epetra_CrsMatrix& C,
01498            bool call_FillComplete_on_result)
01499 {
01500 #ifndef EPETRA_NO_32BIT_GLOBAL_INDICES
01501   if(A.RowMap().GlobalIndicesInt() && B.RowMap().GlobalIndicesInt()) {
01502     return Tjacobi_A_B<int>(omega,Dinv,A,Aview,B,Bview,C,call_FillComplete_on_result);
01503   }
01504   else
01505 #endif
01506 #ifndef EPETRA_NO_64BIT_GLOBAL_INDICES
01507     if(A.RowMap().GlobalIndicesLongLong() && B.RowMap().GlobalIndicesLongLong()) {
01508       return Tjacobi_A_B<long long>(omega,Dinv,A,Aview,B,Bview,C,call_FillComplete_on_result);
01509     }
01510     else
01511 #endif
01512     throw "EpetraExt::MatrixMatrix::jacobi_A_B: GlobalIndices type unknown";
01513 }
01514 
01515 
01516 
01517 
01518 
01519 }//namespace EpetraExt
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines