Zoltan2_PartitioningSolution.hpp
00001 // @HEADER
00002 //
00003 // ***********************************************************************
00004 //
00005 //   Zoltan2: A package of combinatorial algorithms for scientific computing
00006 //                  Copyright 2012 Sandia Corporation
00007 //
00008 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
00009 // the U.S. Government retains certain rights in this software.
00010 //
00011 // Redistribution and use in source and binary forms, with or without
00012 // modification, are permitted provided that the following conditions are
00013 // met:
00014 //
00015 // 1. Redistributions of source code must retain the above copyright
00016 // notice, this list of conditions and the following disclaimer.
00017 //
00018 // 2. Redistributions in binary form must reproduce the above copyright
00019 // notice, this list of conditions and the following disclaimer in the
00020 // documentation and/or other materials provided with the distribution.
00021 //
00022 // 3. Neither the name of the Corporation nor the names of the
00023 // contributors may be used to endorse or promote products derived from
00024 // this software without specific prior written permission.
00025 //
00026 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
00027 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00028 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00029 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
00030 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00031 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00032 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00033 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00034 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00035 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00036 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00037 //
00038 // Questions? Contact Karen Devine      (kddevin@sandia.gov)
00039 //                    Erik Boman        (egboman@sandia.gov)
00040 //                    Siva Rajamanickam (srajama@sandia.gov)
00041 //
00042 // ***********************************************************************
00043 //
00044 // @HEADER
00045 
00050 #ifndef _ZOLTAN2_PARTITIONINGSOLUTION_HPP_
00051 #define _ZOLTAN2_PARTITIONINGSOLUTION_HPP_
00052 
00053 #include <Zoltan2_IdentifierMap.hpp>
00054 #include <Zoltan2_Solution.hpp>
00055 #include <Zoltan2_GreedyMWM.hpp>
00056 #include <Zoltan2_CoordinatePartitioningGraph.hpp>
00057 #include <cmath>
00058 #include <algorithm>
00059 #include <vector>
00060 #include <limits>
00061 
00062 namespace Teuchos{
00063 
00068 template <typename Ordinal, typename T>
00069 class Zoltan2_BoxBoundaries  : public ValueTypeReductionOp<Ordinal,T>
00070 {
00071 private:
00072     Ordinal size;
00073     T _EPSILON;
00074 
00075 public:
00078     Zoltan2_BoxBoundaries ():size(0), _EPSILON (std::numeric_limits<T>::epsilon()){}
00079 
00086     Zoltan2_BoxBoundaries (Ordinal s_):
00087         size(s_), _EPSILON (std::numeric_limits<T>::epsilon()){}
00088 
00091     void reduce( const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
00092     {
00093         for (Ordinal i=0; i < count; i++){
00094             if (Z2_ABS(inBuffer[i]) >  _EPSILON){
00095                 inoutBuffer[i] = inBuffer[i];
00096             }
00097         }
00098     }
00099 };
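// Usage sketch (illustrative only): this reduction op is intended to be
// passed to Teuchos::reduceAll, so that nonzero local box coordinates
// overwrite the zero-initialized entries of the global buffer.  The names
// bufferLen, localBounds and globalBounds below are hypothetical; the same
// pattern appears in getGlobalBoxBoundaries() later in this file.
//
//   Zoltan2_BoxBoundaries<int, double> op(bufferLen);
//   Teuchos::reduceAll<int, double>(*comm, op, bufferLen,
//                                   localBounds, globalBounds);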
00100 } // namespace Teuchos
00101 
00102 
00103 namespace Zoltan2 {
00104 
00105 
00106 
00107 
00122 template <typename Adapter>
00123   class PartitioningSolution : public Solution
00124 {
00125 public:
00126 
00127 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00128   typedef typename Adapter::gno_t gno_t;
00129   typedef typename Adapter::scalar_t scalar_t;
00130   typedef typename Adapter::lno_t lno_t;
00131   typedef typename Adapter::gid_t gid_t;
00132   typedef typename Adapter::part_t part_t;
00133   typedef typename Adapter::user_t user_t;
00134 #endif
00135 
00153   PartitioningSolution( RCP<const Environment> &env,
00154     RCP<const Comm<int> > &comm,
00155     RCP<const IdentifierMap<user_t> > &idMap,
00156     int nUserWeights);
00157 
00187   PartitioningSolution( RCP<const Environment> &env,
00188     RCP<const Comm<int> > &comm,
00189     RCP<const IdentifierMap<user_t> > &idMap,
00190     int nUserWeights, ArrayView<ArrayRCP<part_t> > reqPartIds,
00191     ArrayView<ArrayRCP<scalar_t> > reqPartSizes);
00192 
00194   // Information that the algorithm may wish to query.
00195 
00198   size_t getTargetGlobalNumberOfParts() const { return nGlobalParts_; }
00199 
00202   size_t getActualGlobalNumberOfParts() const { return nGlobalPartsSolution_; }
00203 
00206   size_t getLocalNumberOfParts() const { return nLocalParts_; }
00207 
00215   scalar_t getLocalFractionOfPart() const { return localFraction_; }
00216 
00226   bool oneToOnePartDistribution() const { return onePartPerProc_; }
00227 
00247   const int *getPartDistribution() const {
00248     if (partDist_.size() > 0) return &partDist_[0];
00249     else return NULL;
00250   }
00251 
00268   const part_t *getProcDistribution() const {
00269     if (procDist_.size() > 0) return &procDist_[0];
00270     else return NULL;
00271   }
00272 
00276   int getNumberOfCriteria() const { return nWeightsPerObj_; }
00277 
00278 
00285   bool criteriaHasUniformPartSizes(int idx) const { return pSizeUniform_[idx];}
00286 
00299   scalar_t getCriteriaPartSize(int idx, part_t part) const {
00300     if (pSizeUniform_[idx])
00301       return 1.0 / nGlobalParts_;
00302     else if (pCompactIndex_[idx].size())
00303       return pSize_[idx][pCompactIndex_[idx][part]];
00304     else
00305       return pSize_[idx][part];
00306   }
00307 
00321   bool criteriaHaveSamePartSizes(int c1, int c2) const;
00322 
00324   // Method used by the algorithm to set results.
00325 
00352   void setParts(ArrayRCP<const gno_t> &gnoList,
00353     ArrayRCP<part_t> &partList, bool dataDidNotMove);
00354 
00356 
00368   void RemapParts();
00369 
00371   /* Return the weight of objects staying with a given remap.
00372    * If remap is NULL, compute the weight of objects staying with the given partition.
00373    */
00374   long measure_stays(part_t *remap, int *idx, part_t *adj, long *wgt,
00375                      part_t nrhs, part_t nlhs)
00376   {
00377     long staying = 0;
00378     for (part_t i = 0; i < nrhs; i++) { 
00379       part_t k = (remap ? remap[i] : i);
00380       for (part_t j = idx[k]; j < idx[k+1]; j++) { 
00381         if (i == (adj[j]-nlhs)) {
00382           staying += wgt[j];
00383           break;
00384         }
00385       }
00386     }
00387     return staying;
00388   }
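  // Illustrative call of measure_stays (hypothetical data, not part of the
  // interface): with the CSR-style arrays below and no remap, right-hand-side
  // part 0 keeps weight 5 (its neighbor is itself once the nlhs offset is
  // removed) and part 1 keeps nothing, so the result is 5.
  //
  //   int    idx[] = {0, 1, 2};   // offsets, nrhs == 2
  //   part_t adj[] = {2, 2};      // neighbors, stored with +nlhs offset
  //   long   wgt[] = {5, 7};      // edge weights
  //   long stay = measure_stays(NULL, idx, adj, wgt, 2, 2);  // stay == 5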
00389 
00391   // Results that may be queried by the user, by migration methods,
00392   // or by metric calculation methods.
00393   // We return raw pointers so users don't have to learn about our
00394   // pointer wrappers.
00395 
00398   const RCP<const Comm<int> > &getCommunicator() const { return comm_;}
00399 
00402   size_t getLocalNumberOfIds() const { return gids_.size(); }
00403 
00406   const gid_t *getIdList() const {
00407     if (gids_.size() > 0) return gids_.getRawPtr();
00408     else                  return NULL;
00409   }
00410 
00413   const part_t *getPartList() const {
00414     if (parts_.size() > 0) return parts_.getRawPtr();
00415     else                   return NULL;
00416   }
00417 
00422   const int *getProcList() const {
00423     if (procs_.size() > 0) return procs_.getRawPtr();
00424     else                   return NULL;
00425   }
00426 
00427 
00430   void setPartBoxes(RCP < std::vector <Zoltan2::coordinateModelPartBox <scalar_t, part_t> > > outPartBoxes){
00431       this->partBoxes = outPartBoxes;
00432   }
00433 
00436   RCP < std::vector <Zoltan2::coordinateModelPartBox <scalar_t, part_t> > > getPartBoxes(){
00437       return this->partBoxes;
00438   }
00439 
00442   void getCommunicationGraph(
00443           const Teuchos::Comm<int> *comm,
00444           ArrayRCP <part_t> &comXAdj,
00445           ArrayRCP <part_t> &comAdj
00446   ) {
00447 
00448       if(comXAdj_.getRawPtr() == NULL && comAdj_.getRawPtr() == NULL){
00449 
00450           part_t ntasks =  this->getActualGlobalNumberOfParts();
00451           if (part_t (this->getTargetGlobalNumberOfParts()) > ntasks){
00452               ntasks = this->getTargetGlobalNumberOfParts();
00453           }
00454           RCP < std::vector <Zoltan2::coordinateModelPartBox <scalar_t, part_t> > > pBoxes = this->getGlobalBoxBoundaries(comm);
00455           int dim = (*pBoxes)[0].getDim();
00456           GridHash < scalar_t, part_t> grid(
00457                   pBoxes,
00458                   ntasks,
00459                   dim);
00460           grid.getAdjArrays(comXAdj_, comAdj_);
00461       }
00462       comAdj = comAdj_;
00463       comXAdj = comXAdj_;
00464 
00465   }
00466 
00467   RCP < std::vector <Zoltan2::coordinateModelPartBox <scalar_t, part_t> > > getGlobalBoxBoundaries(
00468           const Teuchos::Comm<int> *comm){
00469 
00470       part_t ntasks =  this->getActualGlobalNumberOfParts();
00471       if (part_t (this->getTargetGlobalNumberOfParts()) > ntasks){
00472           ntasks = this->getTargetGlobalNumberOfParts();
00473       }
00474 
00475       RCP < std::vector <Zoltan2::coordinateModelPartBox <scalar_t, part_t> > > pBoxes = this->getPartBoxes();
00476 
00477       int dim = (*pBoxes)[0].getDim();
00478 
00479 
00480       scalar_t *localPartBoundaries = new scalar_t[ntasks * 2 *dim];
00481 
00482       memset(localPartBoundaries, 0, sizeof(scalar_t) * ntasks * 2 *dim);
00483 
00484       scalar_t *globalPartBoundaries = new scalar_t[ntasks * 2 *dim];
00485       memset(globalPartBoundaries, 0, sizeof(scalar_t) * ntasks * 2 *dim);
00486 
00487       scalar_t *localPartMins = localPartBoundaries;
00488       scalar_t *localPartMaxs = localPartBoundaries + ntasks * dim;
00489 
00490       scalar_t *globalPartMins = globalPartBoundaries;
00491       scalar_t *globalPartMaxs = globalPartBoundaries + ntasks * dim;
00492 
00493       part_t boxCount = pBoxes->size();
00494       for (part_t i = 0; i < boxCount; ++i){
00495           part_t pId = (*pBoxes)[i].getpId();
00496           //cout << "me:" << comm->getRank() << " has:" << pId << endl;
00497 
00498           scalar_t *lmins = (*pBoxes)[i].getlmins();
00499           scalar_t *lmaxs = (*pBoxes)[i].getlmaxs();
00500 
00501           for (int j = 0; j < dim; ++j){
00502               localPartMins[dim * pId + j] = lmins[j];
00503               localPartMaxs[dim * pId + j] = lmaxs[j];
00504               /*
00505               cout << "me:" << comm->getRank()  <<
00506                       " dim * pId + j:"<< dim * pId + j <<
00507                       " localMin:" << localPartMins[dim * pId + j] <<
00508                       " localMax:" << localPartMaxs[dim * pId + j] << endl;
00509                       */
00510           }
00511       }
00512 
00513       Teuchos::Zoltan2_BoxBoundaries<int, scalar_t> reductionOp(ntasks * 2 *dim);
00514 
00515       reduceAll<int, scalar_t>(*comm, reductionOp,
00516               ntasks * 2 *dim, localPartBoundaries, globalPartBoundaries
00517       );
00518       RCP < std::vector <coordinateModelPartBox <scalar_t, part_t> > > pB(new std::vector <coordinateModelPartBox <scalar_t, part_t> > (), true) ;
00519       for (part_t i = 0; i < ntasks; ++i){
00520           Zoltan2::coordinateModelPartBox <scalar_t, part_t> tpb(
00521                   i,
00522                   dim,
00523                   globalPartMins + dim * i,
00524                   globalPartMaxs + dim * i);
00525 
00526           /*
00527           for (int j = 0; j < dim; ++j){
00528               cout << "me:" << comm->getRank()  <<
00529                       " dim * pId + j:"<< dim * i + j <<
00530                       " globalMin:" << globalPartMins[dim * i + j] <<
00531                       " globalMax:" << globalPartMaxs[dim * i + j] << endl;
00532           }
00533           */
00534           pB->push_back(tpb);
00535       }
00536       delete []localPartBoundaries;
00537       delete []globalPartBoundaries;
00538       //RCP < std::vector <Zoltan2::coordinateModelPartBox <scalar_t, part_t> > > tmpRCPBox(pB, true);
00539       this->partBoxes = pB;
00540       return this->partBoxes;
00541   }
00571   template <typename Extra>
00572     size_t convertSolutionToImportList(
00573       int numExtra,
00574       ArrayRCP<Extra> &xtraInfo,
00575       ArrayRCP<typename Adapter::gid_t> &imports,
00576       ArrayRCP<Extra> &newXtraInfo) const;
00577 
00595   void getPartsForProc(int procId, double &numParts, part_t &partMin,
00596     part_t &partMax) const
00597   {
00598     env_->localInputAssertion(__FILE__, __LINE__, "invalid process id",
00599       procId >= 0 && procId < comm_->getSize(), BASIC_ASSERTION);
00600 
00601     procToPartsMap(procId, numParts, partMin, partMax);
00602   }
00603 
00615   void getProcsForPart(part_t partId, part_t &procMin, part_t &procMax) const
00616   {
00617     env_->localInputAssertion(__FILE__, __LINE__, "invalid part id",
00618       partId >= 0 && partId < nGlobalParts_, BASIC_ASSERTION);
00619 
00620     partToProcsMap(partId, procMin, procMax);
00621   }
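  // Illustrative usage (hypothetical numbers): with 4 parts shared among
  // 10 processes as in the partDist_ example further below, calling
  // getProcsForPart(1, procMin, procMax) would yield procMin == 3 and
  // procMax == 5, while getPartsForProc(4, numParts, partMin, partMax)
  // would yield partMin == partMax == 1 and numParts == 1.0/3.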
00622 
00623 private:
00624   void partToProc(bool doCheck, bool haveNumLocalParts, bool haveNumGlobalParts,
00625     int numLocalParts, int numGlobalParts);
00626 
00627   void procToPartsMap(int procId, double &numParts, part_t &partMin,
00628     part_t &partMax) const;
00629 
00630   void partToProcsMap(part_t partId, int &procMin, int &procMax) const;
00631 
00632   void setPartDistribution();
00633 
00634   void setPartSizes(ArrayView<ArrayRCP<part_t> > reqPartIds,
00635     ArrayView<ArrayRCP<scalar_t> > reqPartSizes);
00636 
00637   void computePartSizes(int widx, ArrayView<part_t> ids,
00638     ArrayView<scalar_t> sizes);
00639 
00640   void broadcastPartSizes(int widx);
00641 
00642 
00643   RCP<const Environment> env_;             // has application communicator
00644   RCP<const Comm<int> > comm_;             // the problem communicator
00645   RCP<const IdentifierMap<user_t> > idMap_;
00646 
00647   // Part box boundaries resulting from a geometric partitioning algorithm.
00648   RCP < std::vector <Zoltan2::coordinateModelPartBox <scalar_t, part_t> > > partBoxes;
00649   ArrayRCP <part_t> comXAdj_; //communication graph xadj
00650   ArrayRCP <part_t> comAdj_; //communication graph adj.
00651 
00652   part_t nGlobalParts_;// target global number of parts
00653   part_t nLocalParts_; // number of parts to be on this process
00654 
00655   scalar_t localFraction_; // approx fraction of a part on this process
00656   int nWeightsPerObj_;      // if user has no weights, this is 1  TODO:  WHY???
00657 
00658   // If process p is to be assigned part p for all p, then onePartPerProc_
00659   // is true. Otherwise it is false, and either procDist_ or partDist_
00660   // describes the allocation of parts to processes.
00661   //
00662   // If parts are never split across processes, then procDist_ is defined
00663   // as follows:
00664   //
00665   //   partId              = procDist_[procId]
00666   //   partIdNext          = procDist_[procId+1]
00667   //   globalNumberOfParts = procDist_[numProcs]
00668   //
00669   // meaning that the parts assigned to process procId range from
00670   // [partId, partIdNext).  If partIdNext is the same as partId, then
00671   // process procId has no parts.
00672   //
00673   // If the number of parts is less than the number of processes, and the
00674   // user did not specify "num_local_parts" for each of the processes, then
00675   // parts are split across processes, and partDist_ is defined rather than
00676   // procDist_.
00677   //
00678   //   procId              = partDist_[partId]
00679   //   procIdNext          = partDist_[partId+1]
00680   //   globalNumberOfProcs = partDist_[numParts]
00681   //
00682   // which implies that the part partId is shared by the processes in
00683   // the range [procId, procIdNext).
00684   //
00685   // We use std::vector so we can use the std::upper_bound algorithm.
00686 
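  // Illustrative example (numbers are hypothetical): with 10 parts on 4
  // processes and whole parts per process,
  //
  //   procDist_ = { 0, 3, 6, 8, 10 };  // rank 0 owns parts [0,3),
  //                                    // rank 1 owns [3,6), ...,
  //                                    // procDist_[4] == 10 parts in all
  //
  // Conversely, with 4 parts spread across 10 processes,
  //
  //   partDist_ = { 0, 3, 6, 8, 10 };  // part 0 is shared by ranks [0,3),
  //                                    // part 1 by [3,6), etc.,
  //                                    // partDist_[4] == 10 processes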
00687   bool             onePartPerProc_;   // either this is true...
00688   std::vector<int>      partDist_;      // or this is defined ...
00689   std::vector<part_t> procDist_;      // or this is defined.
00690   bool procDistEquallySpread_;        // if procDist_ is used and
00691                                       // #parts > #procs and
00692                                       // num_local_parts is not specified,
00693                                       // parts are evenly distributed to procs
00694 
00695   // In order to minimize the storage required for part sizes, we
00696   // have three different representations.
00697   //
00698   // If the part sizes for weight index w are all the same, then:
00699   //    pSizeUniform_[w] = true
00700   //    pCompactIndex_[w].size() = 0
00701   //    pSize_[w].size() = 0
00702   //
00703   // and the size for part p is 1.0 / nparts.
00704   //
00705   // If part sizes differ for each part in weight index w, but there
00706   // are fewer than 64 distinct sizes:
00707   //    pSizeUniform_[w] = false
00708   //    pCompactIndex_[w].size() = number of parts
00709   //    pSize_[w].size() = number of distinct sizes
00710   //
00711   // and the size for part p is pSize_[w][pCompactIndex_[w][p]].
00712   //
00713   // If part sizes differ for each part in weight index w, and there
00714   // are 64 or more distinct sizes:
00715   //    pSizeUniform_[w] = false
00716   //    pCompactIndex_[w].size() = 0
00717   //    pSize_[w].size() = nparts
00718   //
00719   // and the size for part p is pSize_[w][p].
00720   //
00721   // NOTE: If we expect to have similar cases, i.e. a long list of scalars
00722   //   where it is highly possible that just a few unique values appear,
00723   //   then we may want to make this a class.  The advantage is that we
00724   //   store a long list of 1-byte indices instead of a long list of scalars.
00725 
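  // Illustrative example (numbers are hypothetical): for weight index w
  // with nGlobalParts_ == 6 and normalized sizes {.1, .2, .2, .1, .2, .2},
  // the compact representation would be
  //
  //   pSizeUniform_[w]  = false
  //   pSize_[w]         = { 0.1, 0.2 }          // the distinct sizes
  //   pCompactIndex_[w] = { 0, 1, 1, 0, 1, 1 }  // one 1-byte index per part
  //
  // and getCriteriaPartSize(w, p) returns pSize_[w][pCompactIndex_[w][p]].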
00726   ArrayRCP<bool> pSizeUniform_;
00727   ArrayRCP<ArrayRCP<unsigned char> > pCompactIndex_;
00728   ArrayRCP<ArrayRCP<scalar_t> > pSize_;
00729 
00731   // The algorithm sets these values upon completion.
00732 
00733   ArrayRCP<const gid_t>  gids_;   // User's global IDs
00734   ArrayRCP<part_t> parts_;      // part number assigned to gids_[i]
00735 
00736   bool haveSolution_;
00737 
00738   part_t nGlobalPartsSolution_; // global number of parts in solution
00739 
00741   // The solution calculates this from the part assignments,
00742   // unless onePartPerProc_.
00743 
00744   ArrayRCP<int> procs_;       // process rank assigned to gids_[i]
00745 };
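// Typical flow (illustrative sketch; env, comm, idMap, gnoList and partList
// are assumed to be built elsewhere, and MyAdapter is a placeholder for a
// concrete input-adapter type):
//
//   PartitioningSolution<MyAdapter> solution(env, comm, idMap, nWeights);
//   // ... a partitioning algorithm fills gnoList and partList ...
//   solution.setParts(gnoList, partList, true);
//   const MyAdapter::part_t *parts = solution.getPartList();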
00746 
00748 // Definitions
00750 
00751 template <typename Adapter>
00752   PartitioningSolution<Adapter>::PartitioningSolution(
00753     RCP<const Environment> &env,
00754     RCP<const Comm<int> > &comm,
00755     RCP<const IdentifierMap<user_t> > &idMap, int nUserWeights)
00756     : env_(env), comm_(comm), idMap_(idMap),
00757       partBoxes(),comXAdj_(), comAdj_(),
00758       nGlobalParts_(0), nLocalParts_(0),
00759       localFraction_(0),  nWeightsPerObj_(),
00760       onePartPerProc_(false), partDist_(), procDist_(),
00761       procDistEquallySpread_(false),
00762       pSizeUniform_(), pCompactIndex_(), pSize_(),
00763       gids_(), parts_(), haveSolution_(false), nGlobalPartsSolution_(0),
00764       procs_()
00765 {
00766   nWeightsPerObj_ = (nUserWeights ? nUserWeights : 1);  // TODO:  WHY??  WHY NOT ZERO?
00767 
00768   setPartDistribution();
00769 
00770   // We must call setPartSizes() because part sizes may have
00771   // been provided by the user on other processes.
00772 
00773   ArrayRCP<part_t> *noIds = new ArrayRCP<part_t> [nWeightsPerObj_];
00774   ArrayRCP<scalar_t> *noSizes = new ArrayRCP<scalar_t> [nWeightsPerObj_];
00775   ArrayRCP<ArrayRCP<part_t> > ids(noIds, 0, nWeightsPerObj_, true);
00776   ArrayRCP<ArrayRCP<scalar_t> > sizes(noSizes, 0, nWeightsPerObj_, true);
00777 
00778   setPartSizes(ids.view(0, nWeightsPerObj_), sizes.view(0, nWeightsPerObj_));
00779 
00780   env_->memory("After construction of solution");
00781 }
00782 
00783 template <typename Adapter>
00784   PartitioningSolution<Adapter>::PartitioningSolution(
00785     RCP<const Environment> &env,
00786     RCP<const Comm<int> > &comm,
00787     RCP<const IdentifierMap<user_t> > &idMap, int nUserWeights,
00788     ArrayView<ArrayRCP<part_t> > reqPartIds,
00789     ArrayView<ArrayRCP<scalar_t> > reqPartSizes)
00790     : env_(env), comm_(comm), idMap_(idMap),
00791       partBoxes(),comXAdj_(), comAdj_(),
00792       nGlobalParts_(0), nLocalParts_(0),
00793       localFraction_(0),  nWeightsPerObj_(),
00794       onePartPerProc_(false), partDist_(), procDist_(),
00795       procDistEquallySpread_(false),
00796       pSizeUniform_(), pCompactIndex_(), pSize_(),
00797       gids_(), parts_(), haveSolution_(false), nGlobalPartsSolution_(0),
00798       procs_()
00799 {
00800   nWeightsPerObj_ = (nUserWeights ? nUserWeights : 1);  // TODO:  WHY?? WHY NOT ZERO?
00801 
00802   setPartDistribution();
00803 
00804   setPartSizes(reqPartIds, reqPartSizes);
00805 
00806   env_->memory("After construction of solution");
00807 }
00808 
00809 template <typename Adapter>
00810   void PartitioningSolution<Adapter>::setPartDistribution()
00811 {
00812   // Did the caller define num_global_parts and/or num_local_parts?
00813 
00814   const ParameterList &pl = env_->getParameters();
00815   size_t haveGlobalNumParts=0, haveLocalNumParts=0;
00816   int numLocal=0, numGlobal=0;
00817   double val;
00818 
00819   const Teuchos::ParameterEntry *pe = pl.getEntryPtr("num_global_parts");
00820 
00821   if (pe){
00822     val = pe->getValue<double>(&val);  // TODO: KDD Skip this double get
00823     haveGlobalNumParts = 1;            // TODO: KDD Should be unnecessary once
00824     numGlobal = static_cast<int>(val); // TODO: KDD paramlist handles long long.
00825     nGlobalParts_ = part_t(numGlobal); // TODO: KDD  also do below.
00826   }
00827 
00828   pe = pl.getEntryPtr("num_local_parts");
00829 
00830   if (pe){
00831     val = pe->getValue<double>(&val);
00832     haveLocalNumParts = 1;
00833     numLocal = static_cast<int>(val);
00834     nLocalParts_ = part_t(numLocal);
00835   }
00836 
00837   try{
00838     // Sets onePartPerProc_, partDist_, and procDist_
00839 
00840     partToProc(true, haveLocalNumParts, haveGlobalNumParts,
00841       numLocal, numGlobal);
00842   }
00843   Z2_FORWARD_EXCEPTIONS
00844 
00845   int nprocs = comm_->getSize();
00846   int rank = comm_->getRank();
00847 
00848   if (onePartPerProc_){
00849     nGlobalParts_ = nprocs;
00850     nLocalParts_ = 1;
00851   }
00852   else if (partDist_.size() > 0){   // more procs than parts
00853     nGlobalParts_ = partDist_.size() - 1;
00854     int pstart = partDist_[0];
00855     for (part_t i=1; i <= nGlobalParts_; i++){
00856       int pend = partDist_[i];
00857       if (rank >= pstart && rank < pend){
00858         int numOwners = pend - pstart;
00859         nLocalParts_ = 1;
00860         localFraction_ = 1.0 / numOwners;
00861         break;
00862       }
00863       pstart = pend;
00864     }
00865   }
00866   else if (procDist_.size() > 0){  // more parts than procs
00867     nGlobalParts_ = procDist_[nprocs];
00868     nLocalParts_ = procDist_[rank+1] - procDist_[rank];
00869   }
00870   else {
00871     throw std::logic_error("partToProc error");
00872   }
00873 
00874 }
00875 
00876 template <typename Adapter>
00877   void PartitioningSolution<Adapter>::setPartSizes(
00878     ArrayView<ArrayRCP<part_t> > ids, ArrayView<ArrayRCP<scalar_t> > sizes)
00879 {
00880   int widx = nWeightsPerObj_;
00881   bool fail=false;
00882 
00883   size_t *countBuf = new size_t [widx*2];
00884   ArrayRCP<size_t> counts(countBuf, 0, widx*2, true);
00885 
00886   fail = ((ids.size() != widx) || (sizes.size() != widx));
00887 
00888   for (int w=0; !fail && w < widx; w++){
00889     counts[w] = ids[w].size();
00890     if (ids[w].size() != sizes[w].size()) fail=true;
00891   }
00892 
00893   env_->globalBugAssertion(__FILE__, __LINE__, "bad argument arrays", fail==0,
00894     COMPLEX_ASSERTION, comm_);
00895 
00896   // Are all part sizes the same?  This is the common case.
00897 
00898   ArrayRCP<scalar_t> *emptySizes= new ArrayRCP<scalar_t> [widx];
00899   pSize_ = arcp(emptySizes, 0, widx);
00900 
00901   ArrayRCP<unsigned char> *emptyIndices= new ArrayRCP<unsigned char> [widx];
00902   pCompactIndex_ = arcp(emptyIndices, 0, widx);
00903 
00904   bool *info = new bool [widx];
00905   pSizeUniform_ = arcp(info, 0, widx);
00906   for (int w=0; w < widx; w++)
00907     pSizeUniform_[w] = true;
00908 
00909   if (nGlobalParts_ == 1){
00910     return;   // there's only one part in the whole problem
00911   }
00912 
00913   size_t *ptr1 = counts.getRawPtr();
00914   size_t *ptr2 = counts.getRawPtr() + widx;
00915 
00916   try{
00917     reduceAll<int, size_t>(*comm_, Teuchos::REDUCE_MAX, widx, ptr1, ptr2);
00918   }
00919   Z2_THROW_OUTSIDE_ERROR(*env_);
00920 
00921   bool zero = true;
00922 
00923   for (int w=0; w < widx; w++)
00924     if (counts[widx+w] > 0){
00925       zero = false;
00926       pSizeUniform_[w] = false;
00927     }
00928 
00929   if (zero) // Part sizes for all criteria are uniform.
00930     return;
00931 
00932   // Compute the part sizes for criteria for which part sizes were
00933   // supplied.  Normalize for each criterion so part sizes sum to one.
00934 
00935   int nprocs = comm_->getSize();
00936   int rank = comm_->getRank();
00937 
00938   for (int w=0; w < widx; w++){
00939     if (pSizeUniform_[w]) continue;
00940 
00941     // Send all ids and sizes to one process.
00942     // (There is no simple gather method in Teuchos.)
00943 
00944     part_t length = ids[w].size();
00945     part_t *allLength = new part_t [nprocs];
00946     Teuchos::gatherAll<int, part_t>(*comm_, 1, &length,
00947       nprocs, allLength);
00948 
00949     if (rank == 0){
00950       int total = 0;
00951       for (int i=0; i < nprocs; i++)
00952         total += allLength[i];
00953 
00954       part_t *partNums = new part_t [total];
00955       scalar_t *partSizes = new scalar_t [total];
00956 
00957       ArrayView<part_t> idArray(partNums, total);
00958       ArrayView<scalar_t> sizeArray(partSizes, total);
00959 
00960       if (length > 0){
00961         for (int i=0; i < length; i++){
00962           *partNums++ = ids[w][i];
00963           *partSizes++ = sizes[w][i];
00964         }
00965       }
00966 
00967       for (int p=1; p < nprocs; p++){
00968         if (allLength[p] > 0){
00969           Teuchos::receive<int, part_t>(*comm_, p,
00970             allLength[p], partNums);
00971           Teuchos::receive<int, scalar_t>(*comm_, p,
00972             allLength[p], partSizes);
00973           partNums += allLength[p];
00974           partSizes += allLength[p];
00975         }
00976       }
00977 
00978       delete [] allLength;
00979 
00980       try{
00981         computePartSizes(w, idArray, sizeArray);
00982       }
00983       Z2_FORWARD_EXCEPTIONS
00984 
00985       delete [] idArray.getRawPtr();
00986       delete [] sizeArray.getRawPtr();
00987     }
00988     else{
00989       delete [] allLength;
00990       if (length > 0){
00991         Teuchos::send<int, part_t>(*comm_, length, ids[w].getRawPtr(), 0);
00992         Teuchos::send<int, scalar_t>(*comm_, length, sizes[w].getRawPtr(), 0);
00993       }
00994     }
00995 
00996     broadcastPartSizes(w);
00997   }
00998 }
00999 
01000 template <typename Adapter>
01001   void PartitioningSolution<Adapter>::broadcastPartSizes(int widx)
01002 {
01003   env_->localBugAssertion(__FILE__, __LINE__, "preallocations",
01004     pSize_.size()>widx &&
01005     pSizeUniform_.size()>widx && pCompactIndex_.size()>widx,
01006     COMPLEX_ASSERTION);
01007 
01008   int rank = comm_->getRank();
01009   int nprocs = comm_->getSize();
01010   part_t nparts = nGlobalParts_;
01011 
01012   if (nprocs < 2)
01013     return;
01014 
01015   char flag=0;
01016 
01017   if (rank == 0){
01018     if (pSizeUniform_[widx] == true)
01019       flag = 1;
01020     else if (pCompactIndex_[widx].size() > 0)
01021       flag = 2;
01022     else
01023       flag = 3;
01024   }
01025 
01026   try{
01027     Teuchos::broadcast<int, char>(*comm_, 0, 1, &flag);
01028   }
01029   Z2_THROW_OUTSIDE_ERROR(*env_);
01030 
01031   if (flag == 1){
01032     if (rank > 0)
01033       pSizeUniform_[widx] = true;
01034 
01035     return;
01036   }
01037 
01038   if (flag == 2){
01039 
01040     // broadcast the indices into the size list
01041 
01042     unsigned char *idxbuf = NULL;
01043 
01044     if (rank > 0){
01045       idxbuf = new unsigned char [nparts];
01046       env_->localMemoryAssertion(__FILE__, __LINE__, nparts, idxbuf);
01047     }
01048     else{
01049       env_->localBugAssertion(__FILE__, __LINE__, "index list size",
01050         pCompactIndex_[widx].size() == nparts, COMPLEX_ASSERTION);
01051       idxbuf = pCompactIndex_[widx].getRawPtr();
01052     }
01053 
01054     try{
01055       // broadcast of unsigned char is not supported
01056       Teuchos::broadcast<int, char>(*comm_, 0, nparts,
01057         reinterpret_cast<char *>(idxbuf));
01058     }
01059     Z2_THROW_OUTSIDE_ERROR(*env_);
01060 
01061     if (rank > 0)
01062       pCompactIndex_[widx] = arcp(idxbuf, 0, nparts, true);
01063 
01064     // broadcast the list of different part sizes
01065 
01066     unsigned char maxIdx=0;
01067     for (part_t p=0; p < nparts; p++)
01068       if (idxbuf[p] > maxIdx) maxIdx = idxbuf[p];
01069 
01070     int numSizes = maxIdx + 1;
01071 
01072     scalar_t *sizeList = NULL;
01073 
01074     if (rank > 0){
01075       sizeList = new scalar_t [numSizes];
01076       env_->localMemoryAssertion(__FILE__, __LINE__, numSizes, sizeList);
01077     }
01078     else{
01079       env_->localBugAssertion(__FILE__, __LINE__, "wrong number of sizes",
01080         numSizes == pSize_[widx].size(), COMPLEX_ASSERTION);
01081 
01082       sizeList = pSize_[widx].getRawPtr();
01083     }
01084 
01085     try{
01086       Teuchos::broadcast<int, scalar_t>(*comm_, 0, numSizes, sizeList);
01087     }
01088     Z2_THROW_OUTSIDE_ERROR(*env_);
01089 
01090     if (rank > 0)
01091       pSize_[widx] = arcp(sizeList, 0, numSizes, true);
01092 
01093     return;
01094   }
01095 
01096   if (flag == 3){
01097 
01098     // broadcast the size of each part
01099 
01100     scalar_t *sizeList = NULL;
01101 
01102     if (rank > 0){
01103       sizeList = new scalar_t [nparts];
01104       env_->localMemoryAssertion(__FILE__, __LINE__, nparts, sizeList);
01105     }
01106     else{
01107       env_->localBugAssertion(__FILE__, __LINE__, "wrong number of sizes",
01108         nparts == pSize_[widx].size(), COMPLEX_ASSERTION);
01109 
01110       sizeList = pSize_[widx].getRawPtr();
01111     }
01112 
01113     try{
01114       Teuchos::broadcast<int, scalar_t >(*comm_, 0, nparts, sizeList);
01115     }
01116     Z2_THROW_OUTSIDE_ERROR(*env_);
01117 
01118     if (rank > 0)
01119       pSize_[widx] = arcp(sizeList, 0, nparts);
01120 
01121     return;
01122   }
01123 }
01124 
01125 template <typename Adapter>
01126   void PartitioningSolution<Adapter>::computePartSizes(int widx,
01127     ArrayView<part_t> ids, ArrayView<scalar_t> sizes)
01128 {
01129   int len = ids.size();
01130 
01131   if (len == 0){
01132     pSizeUniform_[widx] = true;
01133     return;
01134   }
01135 
01136   env_->localBugAssertion(__FILE__, __LINE__, "bad array sizes",
01137     len>0 && sizes.size()==len, COMPLEX_ASSERTION);
01138 
01139   env_->localBugAssertion(__FILE__, __LINE__, "bad index",
01140     widx>=0 && widx<nWeightsPerObj_, COMPLEX_ASSERTION);
01141 
01142   env_->localBugAssertion(__FILE__, __LINE__, "preallocations",
01143     pSize_.size()>widx &&
01144     pSizeUniform_.size()>widx && pCompactIndex_.size()>widx,
01145     COMPLEX_ASSERTION);
01146 
01147   // Check ids and sizes and find min, max and average sizes.
01148   // If sizes are very close to uniform, call them uniform parts.
01149 
01150   part_t nparts = nGlobalParts_;
01151   unsigned char *buf = new unsigned char [nparts];
01152   env_->localMemoryAssertion(__FILE__, __LINE__, nparts, buf);
01153   memset(buf, 0, nparts);
01154   ArrayRCP<unsigned char> partIdx(buf, 0, nparts, true);
01155 
01156   scalar_t epsilon = 10e-5 / nparts;
01157   scalar_t min=sizes[0], max=sizes[0], sum=0;
01158 
01159   for (int i=0; i < len; i++){
01160     part_t id = ids[i];
01161     scalar_t size = sizes[i];
01162 
01163     env_->localInputAssertion(__FILE__, __LINE__, "invalid part id",
01164       id>=0 && id<nparts, BASIC_ASSERTION);
01165 
01166     env_->localInputAssertion(__FILE__, __LINE__, "invalid part size", size>=0,
01167       BASIC_ASSERTION);
01168 
01169     // TODO: we could allow users to specify multiple sizes for the same
01170     // part if we add a parameter that says what we are to do with them:
01171     // add them or take the max.
01172 
01173     env_->localInputAssertion(__FILE__, __LINE__,
01174       "multiple sizes provided for one part", partIdx[id]==0, BASIC_ASSERTION);
01175 
01176     partIdx[id] = 1;    // mark that we have a size for this part
01177 
01178     if (size < min) min = size;
01179     if (size > max) max = size;
01180     sum += size;
01181   }
01182 
01183   if (sum == 0){
01184 
01185     // The user has given us a list of parts of size 0; we'll set
01186     // the rest of them to equally sized parts.
01187 
01188     scalar_t *allSizes = new scalar_t [2];
01189     env_->localMemoryAssertion(__FILE__, __LINE__, 2, allSizes);
01190 
01191     ArrayRCP<scalar_t> sizeArray(allSizes, 0, 2, true);
01192 
01193     int numNonZero = nparts - len;
01194 
01195     allSizes[0] = 0.0;
01196     allSizes[1] = 1.0 / numNonZero;
01197 
01198     for (part_t p=0; p < nparts; p++)
01199       buf[p] = 1;                 // index to default part size
01200 
01201     for (int i=0; i < len; i++)
01202       buf[ids[i]] = 0;            // index to part size zero
01203 
01204     pSize_[widx] = sizeArray;
01205     pCompactIndex_[widx] = partIdx;
01206 
01207     return;
01208   }
01209 
01210   if (max - min <= epsilon){
01211     pSizeUniform_[widx] = true;
01212     return;
01213   }
01214 
01215   // A size for parts that were not specified:
01216   scalar_t avg = sum / nparts;
01217 
01218   // We are going to merge part sizes that are very close.  This takes
01219   // computation time now, but can save considerably in the storage of
01220   // all part sizes on each process.  For example, a common case may
01221   // be that some parts are size 1 and all the rest are size 2.
01222 
01223   scalar_t *tmp = new scalar_t [len];
01224   env_->localMemoryAssertion(__FILE__, __LINE__, len, tmp);
01225   memcpy(tmp, sizes.getRawPtr(), sizeof(scalar_t) * len);
01226   ArrayRCP<scalar_t> partSizes(tmp, 0, len, true);
01227 
01228   std::sort(partSizes.begin(), partSizes.end());
01229 
01230   // create a list of sizes that are unique within epsilon
01231 
01232   Array<scalar_t> nextUniqueSize;
01233   nextUniqueSize.push_back(partSizes[len-1]);   // largest
01234   scalar_t curr = partSizes[len-1];
01235   int avgIndex = len;
01236   bool haveAvg = false;
01237   if (curr - avg <= epsilon)
01238      avgIndex = 0;
01239 
01240   for (int i=len-2; i >= 0; i--){
01241     scalar_t val = partSizes[i];
01242     if (curr - val > epsilon){
01243       nextUniqueSize.push_back(val);  // the highest in the group
01244       curr = val;
01245       if (avgIndex==len && val > avg && val - avg <= epsilon){
01246         // the average would be in this group
01247         avgIndex = nextUniqueSize.size() - 1;
01248         haveAvg = true;
01249       }
01250     }
01251   }
01252 
01253   partSizes.clear();
01254 
01255   size_t numSizes = nextUniqueSize.size();
01256   int sizeArrayLen = numSizes;
01257 
01258   if (numSizes < 64){
01259     // We can store the size for every part in a compact way.
01260 
01261     // Create a list of all sizes in increasing order
01262 
01263     if (!haveAvg) sizeArrayLen++;   // need to include average
01264 
01265     scalar_t *allSizes = new scalar_t [sizeArrayLen];
01266     env_->localMemoryAssertion(__FILE__, __LINE__, sizeArrayLen, allSizes);
01267     ArrayRCP<scalar_t> sizeArray(allSizes, 0, sizeArrayLen, true);
01268 
01269     int newAvgIndex = sizeArrayLen;
01270 
01271     for (int i=numSizes-1, idx=0; i >= 0; i--){
01272 
01273       if (newAvgIndex == sizeArrayLen){
01274 
01275         if (haveAvg && i==avgIndex)
01276           newAvgIndex = idx;
01277 
01278         else if (!haveAvg && avg < nextUniqueSize[i]){
01279           newAvgIndex = idx;
01280           allSizes[idx++] = avg;
01281         }
01282       }
01283 
01284       allSizes[idx++] = nextUniqueSize[i];
01285     }
01286 
01287     env_->localBugAssertion(__FILE__, __LINE__, "finding average in list",
01288       newAvgIndex < sizeArrayLen, COMPLEX_ASSERTION);
01289 
01290     for (int i=0; i < nparts; i++){
01291       buf[i] = newAvgIndex;   // index to default part size
01292     }
01293 
01294     sum = (nparts - len) * allSizes[newAvgIndex];
01295 
01296     for (int i=0; i < len; i++){
01297       int id = ids[i];
01298       scalar_t size = sizes[i];
01299       int index;
01300 
01301       // Find the first size greater than or equal to this size.
01302 
01303       if (size < avg && avg - size <= epsilon)
01304         index = newAvgIndex;
01305       else{
01306         typename ArrayRCP<scalar_t>::iterator found =
01307           std::lower_bound(sizeArray.begin(), sizeArray.end(), size);
01308 
01309         env_->localBugAssertion(__FILE__, __LINE__, "size array",
01310           found != sizeArray.end(), COMPLEX_ASSERTION);
01311 
01312         index = found - sizeArray.begin();
01313       }
01314 
01315       buf[id] = index;
01316       sum += allSizes[index];
01317     }
01318 
01319     for (int i=0; i < sizeArrayLen; i++){
01320       sizeArray[i] /= sum;
01321     }
01322 
01323     pCompactIndex_[widx] = partIdx;
01324     pSize_[widx] = sizeArray;
01325   }
01326   else{
01327     // To have access to part sizes, we must store nparts scalar_ts on
01328     // every process.  We expect this is a rare case.
01329 
01330     tmp = new scalar_t [nparts];
01331     env_->localMemoryAssertion(__FILE__, __LINE__, nparts, tmp);
01332 
01333     sum += ((nparts - len) * avg);
01334 
01335     for (int i=0; i < nparts; i++){
01336       tmp[i] = avg/sum;
01337     }
01338 
01339     for (int i=0; i < len; i++){
01340       tmp[ids[i]] = sizes[i]/sum;
01341     }
01342 
01343     pSize_[widx] = arcp(tmp, 0, nparts);
01344   }
01345 }
01346 
01347 
01348 template <typename Adapter>
01349   void PartitioningSolution<Adapter>::setParts(
01350     ArrayRCP<const gno_t> &gnoList, ArrayRCP<part_t> &partList,
01351     bool dataDidNotMove)
01352 {
01353   env_->debug(DETAILED_STATUS, "Entering setParts");
01354 
01355   size_t len = partList.size();
01356 
01357   // Find the actual number of parts in the solution, which can
01358   // be more or less than the nGlobalParts_ target.
01359   // (We may want to compute the imbalance of a given solution with
01360   // respect to a desired solution.  This solution may have more or
01361   // fewer parts than the desired solution.)
01362 
01363   part_t lMax=0, lMin=0, gMax, gMin;
01364 
01365   if (len > 0)
01366     IdentifierTraits<part_t>::minMax(partList.getRawPtr(), len, lMin, lMax);
01367 
01368   IdentifierTraits<part_t>::globalMinMax(*comm_, len == 0,
01369     lMin, lMax, gMin, gMax);
01370 
01371   nGlobalPartsSolution_ = gMax - gMin + 1;
01372 
01373   if (dataDidNotMove) {
01374     // Case where the algorithm provides the part list in the same order
01375     // that it received the gnos.
01376 
01377     if (idMap_->gnosAreGids()){
01378       gids_ = Teuchos::arcp_reinterpret_cast<const gid_t>(gnoList);
01379     }
01380     else{
01381       gid_t *gidList = new gid_t [len];
01382       env_->localMemoryAssertion(__FILE__, __LINE__, len, gidList);
01383       ArrayView<gid_t> gidView(gidList, len);
01384 
01385       const gno_t *gnos = gnoList.getRawPtr();
01386       ArrayView<gno_t> gnoView(const_cast<gno_t *>(gnos), len);
01387 
01388       try{
01389         idMap_->gidTranslate(gidView, gnoView, TRANSLATE_LIB_TO_APP);
01390       }
01391       Z2_FORWARD_EXCEPTIONS
01392 
01393       gids_ = arcp<const gid_t>(gidList, 0, len);
01394     }
01395     parts_ = partList;
01396   }
01397   else {
01398     // Case where the algorithm moved the data.
01399     // KDDKDD Should we require the algorithm to return the parts in
01400     // KDDKDD the same order as the gnos provided by the model?
01401 
01402     // We send part information to the process that "owns" the global number.
01403 
01404     ArrayView<gid_t> emptyView;
01405     ArrayView<int> procList;
01406 
01407     if (len){
01408       int *tmp = new int [len];
01409       env_->localMemoryAssertion(__FILE__, __LINE__, len, tmp);
01410       procList = ArrayView<int>(tmp, len);
01411     }
01412 
01413     idMap_->gnoGlobalTranslate (gnoList.view(0,len), emptyView, procList);
01414 
01415     int remotelyOwned = 0;
01416     int rank = comm_->getRank();
01417     int nprocs = comm_->getSize();
01418 
01419     for (size_t i=0; !remotelyOwned && i < len; i++){
01420       if (procList[i] != rank)
01421         remotelyOwned = 1;
01422     }
01423 
01424     int anyRemotelyOwned=0;
01425 
01426     try{
01427       reduceAll<int, int>(*comm_, Teuchos::REDUCE_MAX, 1,
01428         &remotelyOwned, &anyRemotelyOwned);
01429     }
01430     Z2_THROW_OUTSIDE_ERROR(*env_);
01431 
01432     if (anyRemotelyOwned){
01433 
01434       // Send the owners of these gnos their part assignments.
01435 
01436       Array<int> countOutBuf(nprocs, 0);
01437       Array<int> countInBuf(nprocs, 0);
01438 
01439       Array<gno_t> outBuf(len*2, 0);
01440 
01441       if (len > 0){
01442         Array<lno_t> offsetBuf(nprocs+1, 0);
01443 
01444         for (size_t i=0; i < len; i++){
01445           countOutBuf[procList[i]]+=2;
01446         }
01447 
01448         offsetBuf[0] = 0;
01449         for (int i=0; i < nprocs; i++)
01450           offsetBuf[i+1] = offsetBuf[i] + countOutBuf[i];
01451 
01452         for (size_t i=0; i < len; i++){
01453           int p = procList[i];
01454           int off = offsetBuf[p];
01455           outBuf[off] = gnoList[i];
01456           outBuf[off+1] = static_cast<gno_t>(partList[i]);
01457           offsetBuf[p]+=2;
01458         }
01459       }
01460 
01461       ArrayRCP<gno_t> inBuf;
01462 
01463       try{
01464         AlltoAllv<gno_t>(*comm_, *env_,
01465           outBuf(), countOutBuf(), inBuf, countInBuf());
01466       }
01467       Z2_FORWARD_EXCEPTIONS;
01468 
01469       outBuf.clear();
01470       countOutBuf.clear();
01471 
01472       gno_t newLen = 0;
01473       for (int i=0; i < nprocs; i++)
01474         newLen += countInBuf[i];
01475 
01476       countInBuf.clear();
01477 
01478       newLen /= 2;
01479 
01480       ArrayRCP<part_t> parts;
01481       ArrayRCP<const gno_t> myGnos;
01482 
01483       if (newLen > 0){
01484 
01485         gno_t *tmpGno = new gno_t [newLen];
01486         env_->localMemoryAssertion(__FILE__, __LINE__, newLen, tmpGno);
01487 
01488         part_t *tmpPart = new part_t [newLen];
01489         env_->localMemoryAssertion(__FILE__, __LINE__, newLen, tmpPart);
01490 
01491         size_t next = 0;
01492         for (lno_t i=0; i < newLen; i++){
01493           tmpGno[i] = inBuf[next++];
01494           tmpPart[i] = inBuf[next++];
01495         }
01496 
01497         parts = arcp(tmpPart, 0, newLen);
01498         myGnos = arcp(tmpGno, 0, newLen);
01499       }
01500 
01501       gnoList = myGnos;
01502       partList = parts;
01503       len = newLen;
01504     }
01505 
01506     delete [] procList.getRawPtr();
01507 
01508     if (idMap_->gnosAreGids()){
01509       gids_ = Teuchos::arcp_reinterpret_cast<const gid_t>(gnoList);
01510     }
01511     else{
01512       gid_t *gidList = new gid_t [len];
01513       env_->localMemoryAssertion(__FILE__, __LINE__, len, gidList);
01514       ArrayView<gid_t> gidView(gidList, len);
01515 
01516       const gno_t *gnos = gnoList.getRawPtr();
01517       ArrayView<gno_t> gnoView(const_cast<gno_t *>(gnos), len);
01518 
01519       try{
01520         idMap_->gidTranslate(gidView, gnoView, TRANSLATE_LIB_TO_APP);
01521       }
01522       Z2_FORWARD_EXCEPTIONS
01523 
01524       gids_ = arcp<const gid_t>(gidList, 0, len);
01525     }
01526 
01527     parts_ = partList;
01528   }
01529 
01530   // Now determine which process gets each object, if not one-to-one.
01531 
01532   if (!onePartPerProc_){
01533 
01534     int *procs = new int [len];
01535     env_->localMemoryAssertion(__FILE__, __LINE__, len, procs);
01536     procs_ = arcp<int>(procs, 0, len);
01537 
01538     part_t *parts = partList.getRawPtr();
01539 
01540     if (procDist_.size() > 0){    // parts are not split across procs
01541 
01542       int procId;
01543       for (size_t i=0; i < len; i++){
01544         partToProcsMap(parts[i], procs[i], procId);
01545       }
01546     }
01547     else{  // harder - we need to split the parts across multiple procs
01548 
01549       lno_t *partCounter = new lno_t [nGlobalPartsSolution_];
01550       env_->localMemoryAssertion(__FILE__, __LINE__, nGlobalPartsSolution_,
01551         partCounter);
01552 
01553       int numProcs = comm_->getSize();
01554 
01555       //MD NOTE: there was no initialization for partCounter.
01556       //I added the line below, correct me if I am wrong.
01557       memset(partCounter, 0, sizeof(lno_t) * nGlobalPartsSolution_);
01558 
01559       for (typename ArrayRCP<part_t>::size_type i=0; i < partList.size(); i++)
01560         partCounter[parts[i]]++;
01561 
01562       lno_t *procCounter = new lno_t [numProcs];
01563       env_->localMemoryAssertion(__FILE__, __LINE__, numProcs, procCounter);
01564 
01565       int proc1;
01566       int proc2 = partDist_[0];
01567 
01568       for (part_t part=1; part < nGlobalParts_; part++){
01569         proc1 = proc2;
01570         proc2 = partDist_[part+1];
01571         int numprocs = proc2 - proc1;
01572 
01573         double dNum = partCounter[part];
01574         double dProcs = numprocs;
01575 
01576         //cout << "dNum:" << dNum << " dProcs:" << dProcs << endl;
01577         double each = floor(dNum/dProcs);
01578         double extra = fmod(dNum,dProcs);
01579 
01580         for (int proc=proc1, i=0; proc<proc2; proc++, i++){
01581           if (i < extra)
01582             procCounter[proc] = lno_t(each) + 1;
01583           else
01584             procCounter[proc] = lno_t(each);
01585         }
01586       }
01587 
01588       delete [] partCounter;
01589 
01590       for (typename ArrayRCP<part_t>::size_type i=0; i < partList.size(); i++){
01591         if (partList[i] >= nGlobalParts_){
01592           // Solution has more parts than targeted.  These
01593           // objects just remain on this process.
01594           procs[i] = comm_->getRank();
01595           continue;
01596         }
01597         part_t partNum = parts[i];
01598         proc1 = partDist_[partNum];
01599         proc2 = partDist_[partNum + 1];
01600 
01601         int proc;
01602         for (proc=proc1; proc < proc2; proc++){
01603           if (procCounter[proc] > 0){
01604             procs[i] = proc;
01605             procCounter[proc]--;
01606             break;
01607           }
01608         }
01609         env_->localBugAssertion(__FILE__, __LINE__, "part to proc",
01610           proc < proc2, COMPLEX_ASSERTION);
01611       }
01612 
01613       delete [] procCounter;
01614     }
01615   }
01616 
01617   // Now that parts_ info is back on home process, remap the parts.
01618   // TODO:  The parts will be inconsistent with the proc assignments after
01619   // TODO:  remapping.  This problem will go away after we separate process
01620   // TODO:  mapping from setParts.  But for MueLu's use case, the part
01621   // TODO:  remapping is all that matters; they do not use the process mapping.
01622   int doRemap = 0;
01623   const Teuchos::ParameterEntry *pe =
01624                  env_->getParameters().getEntryPtr("remap_parts");
01625   if (pe) doRemap = pe->getValue(&doRemap);
01626   if (doRemap) RemapParts();
01627 
01628   haveSolution_ = true;
01629 
01630   env_->memory("After Solution has processed algorithm's answer");
01631   env_->debug(DETAILED_STATUS, "Exiting setParts");
01632 }
01633 
01634 template <typename Adapter>
01635   template <typename Extra>
01636     size_t PartitioningSolution<Adapter>::convertSolutionToImportList(
01637       int numExtra, ArrayRCP<Extra> &xtraInfo,
01638       ArrayRCP<typename Adapter::gid_t> &imports,       // output
01639       ArrayRCP<Extra> &newXtraInfo) const               // output
01640 {
01641   env_->localInputAssertion(__FILE__, __LINE__, "no solution yet",
01642     haveSolution_, BASIC_ASSERTION);
01643   env_->debug(DETAILED_STATUS, "Entering convertSolutionToImportList");
01644 
01645   int numProcs                = comm_->getSize();
01646   size_t localNumIds          = gids_.size();
01647 
01648   // How many to each process?
01649   Array<int> counts(numProcs, 0);
01650   if (onePartPerProc_)
01651     for (size_t i=0; i < localNumIds; i++)
01652       counts[parts_[i]]++;
01653   else
01654     for (size_t i=0; i < localNumIds; i++)
01655       counts[procs_[i]]++;
01656 
01657   Array<gno_t> offsets(numProcs+1, 0);
01658   for (int i=1; i <= numProcs; i++){
01659     offsets[i] = offsets[i-1] + counts[i-1];
01660   }
01661 
01662   Array<gid_t> gidList(localNumIds);
01663   Array<Extra> numericInfo;
01664 
01665   if (numExtra > 0)
01666     numericInfo.resize(localNumIds);
01667 
01668   if (onePartPerProc_){
01669     for (size_t i=0; i < localNumIds; i++){
01670       lno_t idx = offsets[parts_[i]];
01671       gidList[idx] = gids_[i];
01672       if (numExtra > 0)
01673         numericInfo[idx] = xtraInfo[i];
01674       offsets[parts_[i]] = idx + 1;
01675     }
01676   }
01677   else {
01678     for (size_t i=0; i < localNumIds; i++){
01679       lno_t idx = offsets[procs_[i]];
01680       gidList[idx] = gids_[i];
01681       if (numExtra > 0)
01682         numericInfo[idx] = xtraInfo[i];
01683       offsets[procs_[i]] = idx + 1;
01684     }
01685   }
01686 
01687   Array<int> recvCounts(numProcs, 0);
01688 
01689   try{
01690     AlltoAllv<gid_t>(*comm_, *env_, gidList(),
01691       counts(), imports, recvCounts());
01692   }
01693   catch (std::exception &e){
01694     throw std::runtime_error("alltoallv 1");
01695   }
01696 
01697   if (numExtra > 0){
01698     try{
01699       AlltoAllv<Extra>(*comm_, *env_, xtraInfo(),
01700         counts(), newXtraInfo, recvCounts());
01701     }
01702     catch (std::exception &e){
01703       throw std::runtime_error("alltoallv 2");
01704     }
01705   }
01706 
01707   env_->debug(DETAILED_STATUS, "Exiting convertSolutionToImportList");
01708   return imports.size();
01709 }
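// Usage sketch (hypothetical caller, illustrative only): after setParts()
// has run, a migration step could gather the global IDs this process will
// import, optionally carrying one extra value per ID (Extra == float here):
//
//   ArrayRCP<float> xtra, newXtra;
//   ArrayRCP<typename Adapter::gid_t> imports;
//   size_t numImport =
//     solution.convertSolutionToImportList<float>(0, xtra, imports, newXtra);
//
// Passing numExtra > 0 would also exchange one value of xtra per local ID
// into newXtra, using the same counts as the ID exchange.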
01710 
01711 template <typename Adapter>
01712   void PartitioningSolution<Adapter>::procToPartsMap(int procId,
01713     double &numParts, part_t &partMin, part_t &partMax) const
01714 {
01715   if (onePartPerProc_){
01716     numParts = 1.0;
01717     partMin = partMax = procId;
01718   }
01719   else if (procDist_.size() > 0){
01720     partMin = procDist_[procId];
01721     partMax = procDist_[procId+1] - 1;
01722     numParts = procDist_[procId+1] - partMin;
01723   }
01724   else{
01725     // find the first p such that partDist_[p] > procId
01726 
01727     std::vector<int>::const_iterator entry;
01728     entry = std::upper_bound(partDist_.begin(), partDist_.end(), procId);
01729 
01730     size_t partIdx = entry - partDist_.begin();
01731     int numProcs = partDist_[partIdx] - partDist_[partIdx-1];
01732     partMin = partMax = int(partIdx) - 1;
01733     numParts = 1.0 / numProcs;
01734   }
01735 }
01736 
01737 template <typename Adapter>
01738   void PartitioningSolution<Adapter>::partToProcsMap(part_t partId,
01739     int &procMin, int &procMax) const
01740 {
01741   if (partId >= nGlobalParts_){
01742     // setParts() may be given an initial solution which uses a
01743     // different number of parts than the desired solution.  It is
01744     // still a solution.  We keep it on this process.
01745     procMin = procMax = comm_->getRank();
01746   }
01747   else if (onePartPerProc_){
01748     procMin = procMax = int(partId);
01749   }
01750   else if (procDist_.size() > 0){
01751     if (procDistEquallySpread_) {
01752       // Avoid binary search.
01753       double fProcs = comm_->getSize();
01754       double fParts = nGlobalParts_;
01755       double each = fParts / fProcs;
01756       procMin = int(partId / each);
01757       while (procDist_[procMin] > partId) procMin--;
01758       while (procDist_[procMin+1] <= partId) procMin++;
01759       procMax = procMin;
01760     }
01761     else {
01762       // find the first p such that procDist_[p] > partId.
01763       // For now, do a binary search.
01764 
01765       typename std::vector<part_t>::const_iterator entry;
01766       entry = std::upper_bound(procDist_.begin(), procDist_.end(), partId);
01767 
01768       size_t procIdx = entry - procDist_.begin();
01769       procMin = procMax = int(procIdx) - 1;
01770     }
01771   }
01772   else{
01773     procMin = partDist_[partId];
01774     procMax = partDist_[partId+1] - 1;
01775   }
01776 }
01777 
01778 template <typename Adapter>
01779   bool PartitioningSolution<Adapter>::criteriaHaveSamePartSizes(
01780     int c1, int c2) const
01781 {
01782   if (c1 < 0 || c1 >= nWeightsPerObj_ || c2 < 0 || c2 >= nWeightsPerObj_ )
01783     throw std::logic_error("criteriaHaveSamePartSizes error");
01784 
01785   bool theSame = false;
01786 
01787   if (c1 == c2)
01788     theSame = true;
01789 
01790   else if (pSizeUniform_[c1] == true && pSizeUniform_[c2] == true)
01791     theSame = true;
01792 
01793   else if (pCompactIndex_[c1].size() == pCompactIndex_[c2].size()){
01794     theSame = true;
01795     bool useIndex = pCompactIndex_[c1].size() > 0;
01796     if (useIndex){
01797       for (part_t p=0; theSame && p < nGlobalParts_; p++)
01798         if (pSize_[c1][pCompactIndex_[c1][p]] !=
01799             pSize_[c2][pCompactIndex_[c2][p]])
01800           theSame = false;
01801     }
01802     else{
01803       for (part_t p=0; theSame && p < nGlobalParts_; p++)
01804         if (pSize_[c1][p] != pSize_[c2][p])
01805           theSame = false;
01806     }
01807   }
01808 
01809   return theSame;
01810 }
01811 
01827 template <typename Adapter>
01828   void PartitioningSolution<Adapter>::partToProc(
01829     bool doCheck, bool haveNumLocalParts, bool haveNumGlobalParts,
01830     int numLocalParts, int numGlobalParts)
01831 {
01832   int nprocs = comm_->getSize();
01833   ssize_t reducevals[4];
01834   ssize_t sumHaveGlobal=0, sumHaveLocal=0;
01835   ssize_t sumGlobal=0, sumLocal=0;
01836   ssize_t maxGlobal=0, maxLocal=0;
01837   ssize_t vals[4] = {haveNumGlobalParts, haveNumLocalParts,
01838       numGlobalParts, numLocalParts};
01839 
01840   partDist_.clear();
01841   procDist_.clear();
01842 
01843   if (doCheck){
01844 
01845     try{
01846       reduceAll<int, ssize_t>(*comm_, Teuchos::REDUCE_SUM, 4, vals, reducevals);
01847     }
01848     Z2_THROW_OUTSIDE_ERROR(*env_);
01849 
01850     sumHaveGlobal = reducevals[0];
01851     sumHaveLocal = reducevals[1];
01852     sumGlobal = reducevals[2];
01853     sumLocal = reducevals[3];
01854 
01855     env_->localInputAssertion(__FILE__, __LINE__,
01856       "Either all procs specify num_global/local_parts or none do",
01857       (sumHaveGlobal == 0 || sumHaveGlobal == nprocs) &&
01858       (sumHaveLocal == 0 || sumHaveLocal == nprocs),
01859       BASIC_ASSERTION);
01860   }
01861   else{
01862     if (haveNumLocalParts)
01863       sumLocal = numLocalParts * nprocs;
01864     if (haveNumGlobalParts)
01865       sumGlobal = numGlobalParts * nprocs;
01866 
01867     sumHaveGlobal = haveNumGlobalParts ? nprocs : 0;
01868     sumHaveLocal = haveNumLocalParts ? nprocs : 0;
01869 
01870     maxLocal = numLocalParts;
01871     maxGlobal = numGlobalParts;
01872   }
01873 
01874   if (!haveNumLocalParts && !haveNumGlobalParts){
01875     onePartPerProc_ = true;   // default if user did not specify
01876     return;
01877   }
01878 
01879   if (haveNumGlobalParts){
01880     if (doCheck){
01881       vals[0] = numGlobalParts;
01882       vals[1] = numLocalParts;
01883       try{
01884         reduceAll<int, ssize_t>(
01885           *comm_, Teuchos::REDUCE_MAX, 2, vals, reducevals);
01886       }
01887       Z2_THROW_OUTSIDE_ERROR(*env_);
01888 
01889       maxGlobal = reducevals[0];
01890       maxLocal = reducevals[1];
01891 
01892       env_->localInputAssertion(__FILE__, __LINE__,
01893         "Value for num_global_parts is different on different processes.",
01894         maxGlobal * nprocs == sumGlobal, BASIC_ASSERTION);
01895     }
01896 
01897     if (sumLocal){
01898       env_->localInputAssertion(__FILE__, __LINE__,
01899         "Sum of num_local_parts does not equal requested num_global_parts",
01900         sumLocal == numGlobalParts, BASIC_ASSERTION);
01901 
01902       if (sumLocal == nprocs && maxLocal == 1){
01903         onePartPerProc_ = true;   // user specified one part per proc
01904         return;
01905       }
01906     }
01907     else{
01908       if (maxGlobal == nprocs){
01909         onePartPerProc_ = true;   // user specified num parts is num procs
01910         return;
01911       }
01912     }
01913   }
01914 
01915   // If we are here, we do not have #parts == #procs.
01916 
01917   if (sumHaveLocal == nprocs){
01918     //
01919     // We will go by the number of local parts specified.
01920     //
01921 
01922     try{
01923       procDist_.resize(nprocs+1);
01924     }
01925     catch (std::exception &e){
01926       throw(std::bad_alloc());
01927     }
01928 
01929     part_t *procArray = &procDist_[0];
01930 
01931     try{
01932       part_t tmp = part_t(numLocalParts);
01933       gatherAll<int, part_t>(*comm_, 1, &tmp, nprocs, procArray + 1);
01934     }
01935     Z2_THROW_OUTSIDE_ERROR(*env_);
01936 
01937     procArray[0] = 0;
01938 
01939     for (int proc=0; proc < nprocs; proc++)
01940       procArray[proc+1] += procArray[proc];
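    // Illustration (editor's note, hypothetical counts): if 3 ranks report
    // num_local_parts = {2, 3, 1}, the gather and prefix sum above produce
    // procDist_ = {0, 2, 5, 6}; rank r owns parts
    // [procDist_[r], procDist_[r+1]).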
01941   }
01942   else{
01943     //
01944     // We will allocate global number of parts to the processes.
01945     //
01946     double fParts = numGlobalParts;
01947     double fProcs = nprocs;
01948 
01949     if (fParts < fProcs){
01950 
01951       try{
01952         partDist_.resize(size_t(fParts+1));
01953       }
01954       catch (std::exception &e){
01955         throw(std::bad_alloc());
01956       }
01957 
01958       int *partArray = &partDist_[0];
01959 
01960       double each = floor(fProcs / fParts);
01961       double extra = fmod(fProcs, fParts);
01962       partDist_[0] = 0;
01963 
01964       for (part_t part=0; part < numGlobalParts; part++){
01965         int numOwners = int(each + ((part<extra) ? 1 : 0));
01966         partArray[part+1] = partArray[part] + numOwners;
01967       }
01968 
01969       env_->globalBugAssertion(__FILE__, __LINE__,
01970         "partDist_[numGlobalParts] != #procs",
01970         partDist_[numGlobalParts] == nprocs, COMPLEX_ASSERTION, comm_);
01971     }
01972     else if (fParts > fProcs){
01973 
01974       // User did not specify local number of parts per proc;
01975       // Distribute the parts evenly among the procs.
01976 
01977       procDistEquallySpread_ = true;
01978 
01979       try{
01980         procDist_.resize(size_t(fProcs+1));
01981       }
01982       catch (std::exception &e){
01983         throw(std::bad_alloc());
01984       }
01985 
01986       part_t *procArray = &procDist_[0];
01987 
01988       double each = floor(fParts / fProcs);
01989       double extra = fmod(fParts, fProcs);
01990       procArray[0] = 0;
01991 
01992       for (int proc=0; proc < nprocs; proc++){
01993         part_t numParts = part_t(each + ((proc<extra) ? 1 : 0));
01994         procArray[proc+1] = procArray[proc] + numParts;
01995       }
01996 
01997       env_->globalBugAssertion(__FILE__, __LINE__,
01998         "procDist_[nprocs] != num_global_parts",
01998         procDist_[nprocs] == numGlobalParts, COMPLEX_ASSERTION, comm_);
01999     }
02000     else{
02001       env_->globalBugAssertion(__FILE__, __LINE__,
02002         "should never get here", 1, COMPLEX_ASSERTION, comm_);
02003     }
02004   }
02005 }
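
// Editor's illustration (not part of the original source, hypothetical
// counts): the two distributions built above when num_local_parts is not
// supplied on every process.
//
//   More processes than parts (numGlobalParts = 4, nprocs = 10):
//     each = floor(10/4) = 2, extra = 2, so parts 0 and 1 get 3 owners and
//     parts 2 and 3 get 2, giving partDist_ = {0, 3, 6, 8, 10};
//     part p is owned by ranks [partDist_[p], partDist_[p+1]).
//
//   More parts than processes (numGlobalParts = 10, nprocs = 4):
//     each = floor(10/4) = 2, extra = 2, so ranks 0 and 1 get 3 parts and
//     ranks 2 and 3 get 2, giving procDist_ = {0, 3, 6, 8, 10};
//     rank r owns parts [procDist_[r], procDist_[r+1]).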
02006 
02008 // Remap a new part assignment vector for maximum overlap with an input
02009 // part assignment.
02010 //
02011 // Assumptions for this version:
02012 //   input part assignment == processor rank for every local object.
02013 //   nGlobalParts_ <= number of ranks.
02014 // TODO:  Write a version that takes the input part number as input;
02015 //        this change requires input parts in adapters to be provided in
02016 //        the Adapter.
02017 // TODO:  For repartitioning, compare to old remapping results; see Zoltan1.
02018 
02019 template <typename Adapter>
02020 void PartitioningSolution<Adapter>::RemapParts()
02021 {
02022   size_t len = parts_.size();
02023 
02024   part_t me = comm_->getRank();
02025   int np = comm_->getSize();
02026 
02027   if (np < nGlobalParts_) {
02028     if (me == 0)
02029       std::cout << "Remapping not yet supported for "
02030            << "num_global_parts " << nGlobalParts_
02031            << " > num procs " << np << std::endl;
02032     return;
02033   }
02034   // Build edges of a bipartite graph with np + nGlobalParts_ vertices,
02035   // and edges between a process vtx and any parts to which that process'
02036   // objects are assigned.
02037   // Weight edge[parts_[i]] by the number of objects that are going from
02038   // this rank to parts_[i].
02039   // We use a std::map, assuming the number of unique parts in the parts_ array
02040 // is small, so that the per-object map lookups stay cheap.
02041   // TODO We use the count of objects to move; should change to SIZE of objects
02042   // to move; need SIZE function in Adapter.
02043 
02044   std::map<part_t, long> edges;
02045   long lstaying = 0;  // Total num of local objects staying if we keep the
02046                       // current mapping. TODO:  change to SIZE of local objs
02047   long gstaying = 0;  // Total num of objects staying in the current partition
02048 
02049   for (size_t i = 0; i < len; i++) {
02050     edges[parts_[i]]++;                // TODO Use obj size instead of count
02051     if (parts_[i] == me) lstaying++;    // TODO Use obj size instead of count
02052   }
02053 
02054   Teuchos::reduceAll<int, long>(*comm_, Teuchos::REDUCE_SUM, 1,
02055                                 &lstaying, &gstaying);
02056 //TODO  if (gstaying == Adapter::getGlobalNumObjs()) return;  // Nothing to do
02057 
02058   part_t *remap = NULL;
02059 
02060   int nedges = edges.size();
02061 
02062   // Gather the graph to rank 0.
02063   part_t tnVtx = np + nGlobalParts_;  // total # vertices
02064   int *idx = NULL;    // Pointer index into graph adjacencies
02065   int *sizes = NULL;  // nedges per rank
02066   if (me == 0) {
02067     idx = new int[tnVtx+1];
02068     sizes = new int[np];
02069     sizes[0] = nedges;
02070   }
02071   if (np > 1)
02072     Teuchos::gather<int, int>(&nedges, 1, sizes, 1, 0, *comm_);
02073 
02074   // prefix sum to build the idx array
02075   if (me == 0) {
02076     idx[0] = 0;
02077     for (int i = 0; i < np; i++)
02078       idx[i+1] = idx[i] + sizes[i];
02079   }
02080 
02081   // prepare to send edges
02082   int cnt = 0;
02083   part_t *bufv = NULL;
02084   long *bufw = NULL;
02085   if (nedges) {
02086     bufv = new part_t[nedges];
02087     bufw = new long[nedges];
02088     // Create buffer with edges (me, part[i]) and weight edges[parts_[i]].
02089     for (typename std::map<part_t, long>::iterator it = edges.begin();
02090          it != edges.end(); it++) {
02091       bufv[cnt] = it->first;  // target part
02092       bufw[cnt] = it->second; // weight
02093       cnt++;
02094     }
02095   }
02096 
02097   // Prepare to receive edges on rank 0
02098   part_t *adj = NULL;
02099   long *wgt = NULL;
02100   if (me == 0) {
02101 //SYM    adj = new part_t[2*idx[np]];  // need 2x space to symmetrize later
02102 //SYM    wgt = new long[2*idx[np]];  // need 2x space to symmetrize later
02103     adj = new part_t[idx[np]];
02104     wgt = new long[idx[np]];
02105   }
02106 
02107   Teuchos::gatherv<int, part_t>(bufv, cnt, adj, sizes, idx, 0, *comm_);
02108   Teuchos::gatherv<int, long>(bufw, cnt, wgt, sizes, idx, 0, *comm_);
02109   delete [] bufv;
02110   delete [] bufw;
02111 
02112   // Now have constructed graph on rank 0.
02113   // Call the matching algorithm
02114 
02115   int doRemap;
02116   if (me == 0) {
02117     // We have the "LHS" vertices of the bipartite graph; need to create
02118     // "RHS" vertices.
02119     for (int i = 0; i < idx[np]; i++) {
02120       adj[i] += np;  // New RHS vertex number; offset by num LHS vertices
02121     }
02122 
02123     // Build idx for RHS vertices
02124     for (part_t i = np; i < tnVtx; i++) {
02125       idx[i+1] = idx[i];  // No edges for RHS vertices
02126     }
02127 
02128 #ifdef KDDKDD_DEBUG
02129     std::cout << "IDX ";
02130     for (part_t i = 0; i <= tnVtx; i++) std::cout << idx[i] << " ";
02131     std::cout << std::endl;
02132 
02133     std::cout << "ADJ ";
02134     for (part_t i = 0; i < idx[tnVtx]; i++) std::cout << adj[i] << " ";
02135     std::cout << std::endl;
02136 
02137     std::cout << "WGT ";
02138     for (part_t i = 0; i < idx[tnVtx]; i++) std::cout << wgt[i] << " ";
02139     std::cout << std::endl;
02140 #endif
02141 
02142     // Perform matching on the graph
02143     part_t *match = new part_t[tnVtx];
02144     for (part_t i = 0; i < tnVtx; i++) match[i] = i;
02145     part_t nmatches =
02146              Zoltan2::GreedyMWM<part_t, long>(idx, adj, wgt, tnVtx, match);
02147 
02148 #ifdef KDDKDD_DEBUG
02149     std::cout << "After matching:  " << nmatches << " found" << std::endl;
02150     for (part_t i = 0; i < tnVtx; i++)
02151       std::cout << "match[" << i << "] = " << match[i]
02152            << ((match[i] != i &&
02153                (i < np && match[i] != i+np))
02154                   ? " *" : " ")
02155            << std::endl;
02156 #endif
02157 
02158     // See whether there were nontrivial changes in the matching.
02159     bool nontrivial = false;
02160     if (nmatches) {
02161       for (part_t i = 0; i < np; i++) {
02162         if ((match[i] != i) && (match[i] != (i+np))) {
02163           nontrivial = true;
02164           break;
02165         }
02166       }
02167     }
02168 
02169     // Process the matches
02170     if (nontrivial) {
02171       remap = new part_t[nGlobalParts_];
02172       for (part_t i = 0; i < nGlobalParts_; i++) remap[i] = -1;
02173 
02174       bool *used = new bool[np];
02175       for (part_t i = 0; i < np; i++) used[i] = false;
02176 
02177       // First, process all matched parts
02178       for (part_t i = 0; i < nGlobalParts_; i++) {
02179         part_t tmp = i + np;
02180         if (match[tmp] != tmp) {
02181           remap[i] = match[tmp];
02182           used[match[tmp]] = true;
02183         }
02184       }
02185 
02186       // Second, process unmatched parts; keep same part number if possible
02187       for (part_t i = 0; i < nGlobalParts_; i++) {
02188         if (remap[i] > -1) continue;
02189         if (!used[i]) {
02190           remap[i] = i;
02191           used[i] = true;
02192         }
02193       }
02194 
02195       // Third, process unmatched parts; give them the next unused part
02196       for (part_t i = 0, uidx = 0; i < nGlobalParts_; i++) {
02197         if (remap[i] > -1) continue;
02198         while (used[uidx]) uidx++;
02199         remap[i] = uidx;
02200         used[uidx] = true;
02201       }
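      // Illustration (editor's note, hypothetical matching): with np = 3 and
      // nGlobalParts_ = 3, suppose the matching pairs rank 2 with part 0 and
      // leaves parts 1 and 2 unmatched.  The first pass sets remap[0] = 2;
      // the second keeps part 1 on the unused rank 1; the third gives part 2
      // the next unused rank, rank 0, so remap = {2, 1, 0}.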
02202       delete [] used;
02203     }
02204     delete [] match;
02205 
02206 #ifdef KDDKDD_DEBUG
02207     std::cout << "Remap vector: ";
02208     for (part_t i = 0; i < nGlobalParts_; i++) std::cout << remap[i] << " ";
02209     std::cout << std::endl;
02210 #endif
02211 
02212     long newgstaying = measure_stays(remap, idx, adj, wgt,
02213                                      nGlobalParts_, np);
02214     doRemap = (newgstaying > gstaying);
02215     std::cout << "gstaying " << gstaying << " measure(input) "
02216          << measure_stays(NULL, idx, adj, wgt, nGlobalParts_, np)
02217          << " newgstaying " << newgstaying
02218          << " nontrivial " << nontrivial
02219          << " doRemap " << doRemap << std::endl;
02220   }
02221   delete [] idx;
02222   delete [] sizes;
02223   delete [] adj;
02224   delete [] wgt;
02225 
02226   Teuchos::broadcast<int, int>(*comm_, 0, 1, &doRemap);
02227 
02228   if (doRemap) {
02229     if (me != 0) remap = new part_t[nGlobalParts_];
02230     Teuchos::broadcast<int, part_t>(*comm_, 0, nGlobalParts_, remap);
02231     for (size_t i = 0; i < len; i++) {
02232       parts_[i] = remap[parts_[i]];
02233     }
02234   }
02235 
02236   delete [] remap;  // TODO May want to keep for repartitioning as in Zoltan
02237 }
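
// Worked example (editor's illustration, not part of the original source;
// all values hypothetical): with np = 2 and nGlobalParts_ = 2, suppose
// rank 0 holds parts_ = {0, 1, 1} and rank 1 holds parts_ = {0, 0, 1}.
// Under the identity mapping only one object stays on each rank
// (gstaying = 2).  The graph gathered on rank 0 has LHS vertices {0, 1}
// (ranks), RHS vertices {2, 3} (parts offset by np), idx = {0, 2, 4, 4, 4},
// adj = {2, 3, 2, 3}, and wgt = {1, 2, 2, 1}.  A maximum-weight matching
// pairs rank 0 with part 1 and rank 1 with part 0, so remap = {1, 0};
// applying parts_[i] = remap[parts_[i]] lets four objects stay in place
// instead of two, and the remap is accepted.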
02238 
02239 
02240 }  // namespace Zoltan2
02241 
02242 #endif