Zoltan2 Version of the Day
Zoltan2_AlgPQJagged.hpp
Go to the documentation of this file.
00001 // @HEADER
00002 //
00003 // ***********************************************************************
00004 //
00005 //   Zoltan2: A package of combinatorial algorithms for scientific computing
00006 //                  Copyright 2012 Sandia Corporation
00007 //
00008 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
00009 // the U.S. Government retains certain rights in this software.
00010 //
00011 // Redistribution and use in source and binary forms, with or without
00012 // modification, are permitted provided that the following conditions are
00013 // met:
00014 //
00015 // 1. Redistributions of source code must retain the above copyright
00016 // notice, this list of conditions and the following disclaimer.
00017 //
00018 // 2. Redistributions in binary form must reproduce the above copyright
00019 // notice, this list of conditions and the following disclaimer in the
00020 // documentation and/or other materials provided with the distribution.
00021 //
00022 // 3. Neither the name of the Corporation nor the names of the
00023 // contributors may be used to endorse or promote products derived from
00024 // this software without specific prior written permission.
00025 //
00026 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
00027 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00028 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00029 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
00030 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00031 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00032 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00033 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00034 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00035 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00036 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00037 //
00038 // Questions? Contact Karen Devine      (kddevin@sandia.gov)
00039 //                    Erik Boman        (egboman@sandia.gov)
00040 //                    Siva Rajamanickam (srajama@sandia.gov)
00041 //
00042 // ***********************************************************************
00043 //
00044 // @HEADER
00045 
00050 #ifndef _ZOLTAN2_ALGPQJagged_HPP_
00051 #define _ZOLTAN2_ALGPQJagged_HPP_
00052 
00053 #include <Zoltan2_AlgRCB_methods.hpp>
00054 #include <Zoltan2_CoordinateModel.hpp>
00055 #include <Zoltan2_Metric.hpp>             // won't need thiss
00056 
00057 #include <Teuchos_ParameterList.hpp>
00058 
00059 #include <bitset>
00060 
00061 #define EPS_SCALE 1
00062 #define LEAST_SIGNIFICANCE 0.0001
00063 #define SIGNIFICANCE_MUL 1000
00064 //#define INCLUDE_ZOLTAN2_EXPERIMENTAL
00065 #ifdef HAVE_ZOLTAN2_OMP
00066 #include <omp.h>
00067 #endif
00068 //#define FIRST_TOUCH
00069 
00070 //#define BINARYCUTSEARCH
00071 
00072 #define ABS(x) ((x) >= 0 ? (x) : -(x))
00073 
00074 #define LEAF_IMBALANCE_FACTOR 0.1
00075 #define BINARYCUTOFF 32
00076 //imbalance calculation. Wreal / Wexpected - 1
00077 #define imbalanceOf(Wachieved, totalW, expectedRatio) \
00078     (Wachieved) / ((totalW) * (expectedRatio)) - 1
00079 //#define mpi_communication
00080 
00081 namespace Teuchos{
00082 template <typename Ordinal, typename T>
00083 class PQJaggedCombinedReductionOp  : public ValueTypeReductionOp<Ordinal,T>
00084 {
00085 private:
00086   Ordinal numSum_, numMin_1, numMin_2;
00087   Ordinal k;
00088 
00089 public:
00092   PQJaggedCombinedReductionOp ():numSum_(0), numMin_1(0), numMin_2(0), k(0){}
00093 
00100   PQJaggedCombinedReductionOp (Ordinal nsum, Ordinal nmin1, Ordinal nmin2, Ordinal k_):
00101     numSum_(nsum), numMin_1(nmin1), numMin_2(nmin2), k(k_){}
00102 
00105   void reduce( const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
00106   {
00107     Ordinal next=0;
00108     for(Ordinal ii = 0; ii < k ; ++ii){
00109       for (Ordinal i=0; i < numSum_; i++, next++)
00110         inoutBuffer[next] += inBuffer[next];
00111 
00112       for (Ordinal i=0; i < numMin_1; i++, next++)
00113         if (inoutBuffer[next] > inBuffer[next])
00114           inoutBuffer[next] = inBuffer[next];
00115 
00116       for (Ordinal i=0; i < numMin_2; i++, next++)
00117         if (inoutBuffer[next] > inBuffer[next])
00118           inoutBuffer[next] = inBuffer[next];
00119     }
00120   }
00121 };
00122 
00123 
00124 
00125 
00126 
00127 
00128 
00129 
00130 template <typename Ordinal, typename T>
00131 class PQJaggedCombinedMinMaxTotalReductionOp  : public ValueTypeReductionOp<Ordinal,T>
00132 {
00133 private:
00134   Ordinal numMin, numMax, numTotal;
00135 
00136 public:
00139   PQJaggedCombinedMinMaxTotalReductionOp ():numMin(0), numMax(0), numTotal(0){}
00140 
00147   PQJaggedCombinedMinMaxTotalReductionOp (Ordinal nmin, Ordinal nmax, Ordinal nTotal):
00148     numMin(nmin), numMax(nmax), numTotal(nTotal){}
00149 
00152   void reduce( const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
00153   {
00154     Ordinal next=0;
00155 
00156     for (Ordinal i=0; i < numMin; i++, next++)
00157       if (inoutBuffer[next] > inBuffer[next])
00158         inoutBuffer[next] = inBuffer[next];
00159 
00160     for (Ordinal i=0; i < numMax; i++, next++)
00161       if (inoutBuffer[next] < inBuffer[next])
00162         inoutBuffer[next] = inBuffer[next];
00163 
00164 
00165     for (Ordinal i=0; i < numTotal; i++, next++)
00166         inoutBuffer[next] += inBuffer[next];
00167   }
00168 };
00169 } // namespace Teuchos
00170 
00171 namespace Zoltan2{
00172 
00173 //diffclock for temporary timing experiments.
00174 
00175 #ifdef mpi_communication
00176   partId_t concurrent = 0;
00177   void sumMinMin(void *in, void *inout, int *count, MPI_Datatype *type) {
00178 
00179     int k = *count;
00180     int numCut = (k  - 1) / 4;
00181     int total_part_count = numCut * 2 + 1;
00182     int next = 0;
00183 
00184     float *inoutBuffer = (float *) inout;
00185     float *inBuffer = (float *) in;
00186 
00187 
00188     for(partId_t ii = 0; ii < concurrent ; ++ii){
00189     for (long i=0; i < total_part_count; i++, next++)
00190       inoutBuffer[next] += inBuffer[next];
00191 
00192     for (long i=0; i < numCut; i++, next++)
00193       if (inoutBuffer[next] > inBuffer[next])
00194         inoutBuffer[next] = inBuffer[next];
00195 
00196     for (long i=0; i < numCut; i++, next++)
00197       if (inoutBuffer[next] > inBuffer[next])
00198         inoutBuffer[next] = inBuffer[next];
00199     }
00200   }
00201 
00202 
00203   void minMaxSum(void *in, void *inout, int *count, MPI_Datatype *type) {
00204 //    long k = (*((int *) count));
00205     int k = *count;
00206     int num = k / 3;
00207     int next = 0;
00208     float *inoutBuffer = (float *) inout;
00209     float *inBuffer = (float *) in;
00210 
00211 
00212     for (long i=0; i < num; i++, next++)
00213       if (inoutBuffer[next] > inBuffer[next])
00214         inoutBuffer[next] = inBuffer[next];
00215     for (long i=0; i < num; i++, next++)
00216       if (inoutBuffer[next] < inBuffer[next])
00217         inoutBuffer[next] = inBuffer[next];
00218     for (long i=0; i < num; i++, next++)
00219       inoutBuffer[next] += inBuffer[next];
00220 
00221   }
00222 #endif
00223 
00228 template <typename lno_t, typename partId_t>
00229 class pqJagged_PartVertices{
00230 private:
00231   lno_t *linkedList; //initially filled with -1's.
00232   lno_t *partBegins; //initially filled with -1's.
00233   lno_t *partEnds; //initially filled with -1's.
00234 public:
00235 
00236   //default constructor
00237   pqJagged_PartVertices(){};
00238 
00248   void set(lno_t *linkedList_, lno_t *partBegins_, lno_t *partEnds_){
00249     linkedList = linkedList_;
00250     partBegins = partBegins_;
00251     partEnds = partEnds_;
00252   }
00253 
00254   //user is responsible from providing the correct number of part counts
00261   void inserToPart (partId_t partNo, lno_t coordinateIndex){
00262 
00263     switch (partEnds[partNo]){
00264     case -1: // this means partBegins[partNo] is also -1.
00265       partBegins[partNo] = coordinateIndex;
00266       partEnds[partNo] = coordinateIndex;
00267       break;
00268     default:
00269       linkedList[coordinateIndex] = partBegins[partNo];
00270       partBegins[partNo] = coordinateIndex;
00271       break;
00272     }
00273 
00274 
00275   }
00276 
00280   lno_t *getLinkedList(){ return linkedList;}
00281 
00285   lno_t *getPartBegins(){ return partBegins;}
00289   lno_t *getPartEnds(){ return partEnds;}
00290 
00291 };
00292 
00293 template<typename T>
00294 inline void firstTouch(T *arrayName, size_t arraySize){
00295 #ifdef HAVE_ZOLTAN2_OMP
00296 #pragma omp parallel for
00297 #endif
00298   for(size_t jj = 0; jj < arraySize; ++jj){
00299     arrayName[jj] = 0;
00300   }
00301 }
00302 
00313 template <typename scalar_t>
00314 inline scalar_t pivotPos (scalar_t * cutUpperBounds, scalar_t *cutLowerBounds,size_t currentCutIndex, scalar_t *cutUpperWeight, scalar_t *cutLowerWeight, scalar_t ew){
00315 
00316   if(cutUpperWeight[currentCutIndex] == cutLowerWeight[currentCutIndex]){
00317     return cutLowerBounds[currentCutIndex];
00318   }
00319   return ((cutUpperBounds[currentCutIndex] - cutLowerBounds[currentCutIndex]) /
00320       (cutUpperWeight[currentCutIndex] - cutLowerWeight[currentCutIndex]))  * (ew - cutLowerWeight[currentCutIndex]) + cutLowerBounds[currentCutIndex];
00321 }
00322 
00323 
00342 template <typename T>
00343 void pqJagged_getParameters(const Teuchos::ParameterList &pl, T &imbalanceTolerance,
00344     multiCriteriaNorm &mcnorm, std::bitset<NUM_RCB_PARAMS> &params,  int &numTestCuts, bool &ignoreWeights, bool &allowNonrectilinear, partId_t &concurrentPartCount,
00345     bool &force_binary, bool &force_linear){
00346 
00347   string obj;
00348 
00349   const Teuchos::ParameterEntry *pe = pl.getEntryPtr("partitioning_objective");
00350   if (pe)
00351     obj = pe->getValue(&obj);
00352 
00353   if (!pe){
00354     params.set(rcb_balanceWeight);
00355     mcnorm = normBalanceTotalMaximum;
00356   }
00357   else if (obj == string("balance_object_count")){
00358     params.set(rcb_balanceCount);
00359   }
00360   else if (obj == string("multicriteria_minimize_total_weight")){
00361     params.set(rcb_minTotalWeight);
00362     mcnorm = normMinimizeTotalWeight;
00363   }
00364   else if (obj == string("multicriteria_minimize_maximum_weight")){
00365     params.set(rcb_minMaximumWeight);
00366     mcnorm = normMinimizeMaximumWeight;
00367   }
00368   else if (obj == string("multicriteria_balance_total_maximum")){
00369     params.set(rcb_balanceTotalMaximum);
00370     mcnorm = normBalanceTotalMaximum;
00371   }
00372   else{
00373     params.set(rcb_balanceWeight);
00374     mcnorm = normBalanceTotalMaximum;
00375   }
00376 
00377   imbalanceTolerance = .1;
00378   pe = pl.getEntryPtr("imbalance_tolerance");
00379   if (pe){
00380     double tol;
00381     tol = pe->getValue(&tol);
00382     imbalanceTolerance = tol - 1.0;
00383   }
00384 
00385   if (imbalanceTolerance <= 0)
00386     imbalanceTolerance = 10e-4;
00387 
00388   force_binary = false;
00389   pe = pl.getEntryPtr("force_binary_search");
00390   if (pe){
00391     int val = 0;
00392     val = pe->getValue(&val);
00393     if (val == 1)
00394       force_binary = true;
00395   }
00396 
00397   force_linear = false;
00398   pe = pl.getEntryPtr("force_linear_search");
00399   if (pe){
00400     int val;
00401     val = pe->getValue(&val);
00402     if (val == 1)
00403       force_linear = true;
00404   }
00405 
00406   //TODO: FIX ME.
00407   //double aa = 1;
00408   pe = pl.getEntryPtr("parallel_part_calculation_count");
00409   if (pe)
00410     //aa = pe->getValue(&aa);
00411     concurrentPartCount = pe->getValue(&concurrentPartCount);
00412 
00413   //concurrentPartCount = partId_t(aa);
00414 
00415   int val = 0;
00416   pe = pl.getEntryPtr("average_cuts");
00417   if (pe)
00418     val = pe->getValue(&val);
00419 
00420   if (val == 1)
00421     params.set(rcb_averageCuts);
00422 
00423   val = 0;
00424   pe = pl.getEntryPtr("rectilinear_blocks");
00425   if (pe)
00426     val = pe->getValue(&val);
00427 
00428   if (val == 1){
00429     params.set(rcb_rectilinearBlocks);
00430     allowNonrectilinear = false;
00431   } else {
00432     allowNonrectilinear = true;
00433   }
00434 
00435   numTestCuts = 1;
00436   pe = pl.getEntryPtr("bisection_num_test_cuts");
00437   if (pe)
00438     numTestCuts = pe->getValue(&numTestCuts);
00439 
00440   ignoreWeights = params.test(rcb_balanceCount);
00441 }
00442 
00443 
00455 template <typename Adapter>
00456 void pqJagged_getCoordinateValues( const RCP<const CoordinateModel<
00457     typename Adapter::base_adapter_t> > &coords, int &coordDim,
00458     int &weightDim, size_t &numLocalCoords, global_size_t &numGlobalCoords, int &criteriaDim, const bool &ignoreWeights){
00459 
00460   coordDim = coords->getCoordinateDim();
00461   weightDim = coords->getCoordinateWeightDim();
00462   numLocalCoords = coords->getLocalNumCoordinates();
00463   numGlobalCoords = coords->getGlobalNumCoordinates();
00464   criteriaDim = (weightDim ? weightDim : 1);
00465   if (criteriaDim > 1 && ignoreWeights)
00466     criteriaDim = 1;
00467 }
00468 
00469 
00494 template <typename Adapter, typename scalar_t, typename gno_t>
00495 void pqJagged_getInputValues(
00496     const RCP<const Environment> &env, const RCP<const CoordinateModel<
00497     typename Adapter::base_adapter_t> > &coords,
00498     RCP<PartitioningSolution<Adapter> > &solution,
00499     std::bitset<NUM_RCB_PARAMS> &params,
00500     const int &coordDim,
00501     const int &weightDim,
00502     const size_t &numLocalCoords, size_t &numGlobalParts, int &pqJagged_multiVectorDim,
00503     scalar_t **pqJagged_values, const int &criteriaDim, scalar_t **pqJagged_weights, ArrayView<const gno_t> &pqJagged_gnos, bool &ignoreWeights,
00504     bool *pqJagged_uniformWeights, bool *pqJagged_uniformParts, scalar_t **pqJagged_partSizes
00505 ){
00506   typedef typename Adapter::node_t node_t;
00507   typedef typename Adapter::lno_t lno_t;
00508   typedef StridedData<lno_t, scalar_t> input_t;
00509 
00510   ArrayView<const gno_t> gnos;
00511   ArrayView<input_t>     xyz;
00512   ArrayView<input_t>     wgts;
00513 
00514   coords->getCoordinates(gnos, xyz, wgts);
00515   pqJagged_gnos = gnos;
00516 
00517 
00518   //std::cout << std::endl;
00519 
00520   for (int dim=0; dim < coordDim; dim++){
00521     ArrayRCP<const scalar_t> ar;
00522     xyz[dim].getInputArray(ar);
00523     //pqJagged coordinate values assignment
00524     pqJagged_values[dim] =  (scalar_t *)ar.getRawPtr();
00525   }
00526 
00527   if (weightDim == 0 || ignoreWeights){
00528 
00529     pqJagged_uniformWeights[0] = true;
00530     pqJagged_weights[0] = NULL;
00531   }
00532   else{
00533     for (int wdim = 0; wdim < weightDim; wdim++){
00534       if (wgts[wdim].size() == 0){
00535         pqJagged_uniformWeights[wdim] = true;
00536         pqJagged_weights[wdim] = NULL;
00537       }
00538       else{
00539         ArrayRCP<const scalar_t> ar;
00540         wgts[wdim].getInputArray(ar);
00541         pqJagged_uniformWeights[wdim] = false;
00542         pqJagged_weights[wdim] = (scalar_t *) ar.getRawPtr();
00543       }
00544     }
00545   }
00546 
00548   // From the Solution we get part information.
00549   // If the part sizes for a given criteria are not uniform,
00550   // then they are values that sum to 1.0.
00551 
00552   numGlobalParts = solution->getTargetGlobalNumberOfParts();
00553 
00554   for (int wdim = 0; wdim < criteriaDim; wdim++){
00555     if (solution->criteriaHasUniformPartSizes(wdim)){
00556       pqJagged_uniformParts[wdim] = true;
00557       pqJagged_partSizes[wdim] = NULL;
00558     }
00559     else{
00560       scalar_t *tmp = new scalar_t [numGlobalParts];
00561       env->localMemoryAssertion(__FILE__, __LINE__, numGlobalParts, tmp) ;
00562       for (size_t i=0; i < numGlobalParts; i++){
00563         tmp[i] = solution->getCriteriaPartSize(wdim, i);
00564       }
00565       pqJagged_partSizes[wdim] = tmp;
00566 
00567     }
00568   }
00569 
00570   // It may not be possible to solve the partitioning problem
00571   // if we have multiple weight dimensions with part size
00572   // arrays that differ. So let's be aware of this possibility.
00573 
00574   bool multiplePartSizeSpecs = false;
00575 
00576   if (criteriaDim > 1){
00577     for (int wdim1 = 0; wdim1 < criteriaDim; wdim1++)
00578       for (int wdim2 = wdim1+1; wdim2 < criteriaDim; wdim2++)
00579         if (!solution->criteriaHaveSamePartSizes(wdim1, wdim2)){
00580           multiplePartSizeSpecs = true;
00581           break;
00582         }
00583   }
00584 
00585   if (multiplePartSizeSpecs)
00586     params.set(rcb_multiplePartSizeSpecs);
00587 
00589   // Create the distributed data for the algorithm.
00590   //
00591   // It is a multivector containing one vector for each coordinate
00592   // dimension, plus a vector for each weight dimension that is not
00593   // uniform.
00594 
00595   pqJagged_multiVectorDim = coordDim;
00596   for (int wdim = 0; wdim < criteriaDim; wdim++){
00597     if (!pqJagged_uniformWeights[wdim]) pqJagged_multiVectorDim++;
00598   }
00599 
00600 }
00601 
00602 
00606 template <typename scalar_t, typename gno_t>
00607 void pqJagged_printInput(int coordDim, int weightDim, size_t numLocalCoords, global_size_t numGlobalCoords,
00608     int criteriaDim, scalar_t **pqJagged_values, scalar_t **pqJagged_weights,
00609     bool *pqJagged_uniformParts, bool *pqJagged_uniformWeights, gno_t *pqJagged_gnos,
00610     bool ignoreWeights,size_t numGlobalParts, scalar_t **pqJagged_partSizes){
00611 
00612   std::cout << "numLocalCoords:" << numLocalCoords << std::endl;
00613   std::cout << "coordDim:" << coordDim << std::endl;
00614   for(size_t i = 0; i < numLocalCoords; ++i){
00615     for (int ii = 0; ii < coordDim; ++ii){
00616       std::cout <<  pqJagged_values[ii][i] << " ";
00617     }
00618     std::cout << std::endl;
00619   }
00620 
00621 
00622   std::cout << "criteriaDim:" << criteriaDim << std::endl;
00623   std::cout << "weightDim:" << weightDim << std::endl;
00624   if(weightDim){
00625     for(size_t i = 0; i < numLocalCoords; ++i){
00626       for (int ii = 0; ii < weightDim; ++ii){
00627         std::cout <<  pqJagged_weights[ii][i] << " ";
00628       }
00629       std::cout << std::endl;
00630     }
00631   }
00632 
00633   std::cout << "pqJagged_uniformWeights:" << pqJagged_uniformWeights[0] << std::endl;
00634   for(int i = 0; i < criteriaDim; ++i){
00635     std::cout << pqJagged_uniformWeights[i] << " ";
00636   }
00637   std::cout << std::endl;
00638 
00639 
00640   std::cout << "gnos" << std::endl;
00641   for(size_t i = 0; i < numLocalCoords; ++i){
00642     std::cout <<  pqJagged_gnos[i] << " ";
00643   }
00644   std::cout << std::endl;
00645 
00646   std::cout << "ignoreWeights:" << ignoreWeights << std::endl;
00647 
00648   std::cout << "pqJagged_uniformParts:" << pqJagged_uniformParts[0] << std::endl;
00649   for(int i = 0; i < criteriaDim; ++i){
00650     std::cout << pqJagged_uniformParts[i] << " ";
00651   }
00652   std::cout << std::endl;
00653 
00654   std::cout << "pqJagged_partSizes:" << std::endl;
00655   std::cout << "numGlobalParts:" << numGlobalParts << std::endl;
00656   for(int i = 0; i < criteriaDim; ++i){
00657     if(!pqJagged_uniformParts[i])
00658       for(size_t ii = 0; ii < numGlobalParts; ++ii){
00659         std::cout << pqJagged_partSizes[i][ii] << " ";
00660       }
00661     std::cout << std::endl;
00662   }
00663 }
00664 
00665 
00669 int pqJagged_getNumThreads(){
00670   int numThreads = 1;
00671 
00672 
00673 #ifdef HAVE_ZOLTAN2_OMP
00674 #pragma omp parallel shared(numThreads)
00675   {
00676     numThreads = omp_get_num_threads();
00677   }
00678 
00679 #endif
00680 
00681 
00682   return numThreads;
00683 
00684 }
00685 
00686 
00707 template <typename scalar_t, typename lno_t>
00708 void pqJagged_getLocalMinMaxTotalCoord(
00709     lno_t *partitionedPointPermutations,
00710     scalar_t *pqJagged_coordinates,
00711     bool pqJagged_uniformWeights,
00712     scalar_t *pqJagged_weights,
00713 
00714 
00715     int numThreads,
00716     lno_t coordinateBegin,
00717     lno_t coordinateEnd,
00718 
00719     scalar_t *max_min_array /*sized nothreads * 2*/,
00720     scalar_t maxScalar,
00721     scalar_t minScalar,
00722     scalar_t &minCoordinate,
00723     scalar_t &maxCoordinate,
00724     scalar_t &totalWeight
00725 
00726     ){
00727 
00728 
00729   if(coordinateBegin >= coordinateEnd)
00730   {
00731     minCoordinate = maxScalar;
00732     maxCoordinate = minScalar;
00733     totalWeight = 0;
00734   }
00735   else {
00736     scalar_t mytotalWeight = 0;
00737 #ifdef HAVE_ZOLTAN2_OMP
00738 #pragma omp parallel
00739 #endif
00740     {
00741       if (pqJagged_uniformWeights) {
00742   #ifdef HAVE_ZOLTAN2_OMP
00743   #pragma omp single
00744   #endif
00745         {
00746           mytotalWeight = coordinateEnd - coordinateBegin;
00747         }
00748       } else {
00749   #ifdef HAVE_ZOLTAN2_OMP
00750   #pragma omp for reduction(+:mytotalWeight)
00751   #endif
00752         for (lno_t ii = coordinateBegin; ii < coordinateEnd; ++ii){
00753           int i = partitionedPointPermutations[ii];
00754           mytotalWeight += pqJagged_weights[i];
00755         }
00756       }
00757 
00758       int myId = 0;
00759 #ifdef HAVE_ZOLTAN2_OMP
00760       myId = omp_get_thread_num();
00761 #endif
00762       scalar_t myMin, myMax;
00763 
00764       myMin=myMax
00765            =pqJagged_coordinates[partitionedPointPermutations[coordinateBegin]];
00766 
00767 #ifdef HAVE_ZOLTAN2_OMP
00768 #pragma omp for
00769 #endif
00770       for(lno_t j = coordinateBegin + 1; j < coordinateEnd; ++j){
00771         int i = partitionedPointPermutations[j];
00772         if(pqJagged_coordinates[i] > myMax) myMax = pqJagged_coordinates[i];
00773         if(pqJagged_coordinates[i] < myMin) myMin = pqJagged_coordinates[i];
00774       }
00775       max_min_array[myId] = myMin;
00776       max_min_array[myId + numThreads] = myMax;
00777 
00778 #ifdef HAVE_ZOLTAN2_OMP
00779 #pragma omp barrier
00780 #endif
00781 
00782 #ifdef HAVE_ZOLTAN2_OMP
00783 #pragma omp single nowait
00784 #endif
00785       {
00786         minCoordinate = max_min_array[0];
00787         for(int i = 1; i < numThreads; ++i){
00788           if(max_min_array[i] < minCoordinate) minCoordinate = max_min_array[i];
00789         }
00790       }
00791 #ifdef HAVE_ZOLTAN2_OMP
00792 #pragma omp single nowait
00793 #endif
00794       {
00795         maxCoordinate = max_min_array[numThreads];
00796         for(int i = numThreads + 1; i < numThreads * 2; ++i){
00797           if(max_min_array[i] > maxCoordinate) maxCoordinate = max_min_array[i];
00798         }
00799       }
00800     }
00801     totalWeight = mytotalWeight;
00802   }
00803 }
00804 
00805 
00806 
00816 template <typename scalar_t>
00817 void pqJagged_getGlobalMinMaxTotalCoord(
00818     RCP<Comm<int> > &comm,
00819     const RCP<const Environment> &env,
00820     partId_t concurrentPartCount,
00821     scalar_t *localMinMaxTotal,
00822     scalar_t *globalMinMaxTotal){
00823 
00824 
00825   //reduce min for first concurrentPartCount elements, reduce max for next concurrentPartCount elements,
00826   //reduce sum for the last concurrentPartCount elements.
00827   if(comm->getSize()  > 1){
00828 
00829 #ifndef mpi_communication
00830   Teuchos::PQJaggedCombinedMinMaxTotalReductionOp<int, scalar_t> reductionOp(
00831      concurrentPartCount,
00832      concurrentPartCount,
00833      concurrentPartCount);
00834 #endif
00835 
00836 #ifdef mpi_communication
00837     MPI_Op myop;
00838     MPI_Op_create(minMaxSum, 0, &myop);   /* step 3 */
00839 #endif
00840 
00841   try{
00842 
00843 
00844 #ifdef mpi_communication
00845 
00846     MPI_Allreduce(localMinMaxTotal, globalMinMaxTotal, 3 * concurrentPartCount, MPI_FLOAT, myop,MPI_COMM_WORLD);
00847 #endif
00848 #ifndef mpi_communication
00849     reduceAll<int, scalar_t>(*comm, reductionOp,
00850         3 * concurrentPartCount, localMinMaxTotal, globalMinMaxTotal
00851       );
00852 #endif
00853 
00854   }
00855   Z2_THROW_OUTSIDE_ERROR(*env)
00856   }
00857   else {
00858     partId_t s = 3 * concurrentPartCount;
00859     for (partId_t i = 0; i < s; ++i){
00860       globalMinMaxTotal[i] = localMinMaxTotal[i];
00861     }
00862   }
00863 }
00864 
00865 
00876 template <typename scalar_t>
00877 void pqJagged_getCutCoord_Weights(
00878     scalar_t minCoordinate,
00879     scalar_t maxCoordinate,
00880     bool pqJagged_uniformParts,
00881     scalar_t *pqJagged_partSizes /*p sized, weight ratios of each part*/,
00882     partId_t noCuts/*p-1*/ ,
00883     scalar_t *cutCoordinates /*p - 1 sized, coordinate of each cut line*/,
00884     scalar_t *cutPartRatios /*cumulative weight ratios, at left side of each cut line. p-1 sized*/,
00885     int numThreads){
00886 
00887   scalar_t coordinateRange = maxCoordinate - minCoordinate;
00888   if(pqJagged_uniformParts){
00889     scalar_t uniform = 1. / (noCuts + 1);
00890     scalar_t slice = uniform * coordinateRange;
00891 
00892 #ifdef HAVE_ZOLTAN2_OMP
00893 #pragma omp parallel
00894 #endif
00895     {
00896       partId_t myStart = 0;
00897       partId_t myEnd = noCuts;
00898 #ifdef HAVE_ZOLTAN2_OMP
00899 #pragma omp for
00900 #endif
00901       for(partId_t i = myStart; i < myEnd; ++i){
00902         cutPartRatios[i] =  uniform * (i + 1);
00903         cutCoordinates[i] = minCoordinate + slice * (i + 1);
00904       }
00905       cutPartRatios[noCuts] = 1;
00906     }
00907 
00908 
00909   }
00910   else {
00911     //TODO fix here!!
00912     cutPartRatios[0] = pqJagged_partSizes[0];
00913     cutCoordinates[0] = coordinateRange * cutPartRatios[0];
00914     for(partId_t i = 1; i < noCuts; ++i){
00915       cutPartRatios[i] = pqJagged_partSizes[i] + cutPartRatios[i - 1];
00916       cutCoordinates[i] = coordinateRange * cutPartRatios[i];
00917     }
00918   }
00919 }
00920 
00921 
00957 template <typename scalar_t>
00958 void getNewCoordinates(
00959     const RCP<const Environment> &env,
00960     RCP<Comm<int> > &comm,
00961     const size_t &total_part_count,
00962     const partId_t &noCuts,
00963     const scalar_t &maxCoordinate,
00964     const scalar_t &minCoordinate,
00965     const scalar_t &globalTotalWeight,
00966     const scalar_t &imbalanceTolerance,
00967     scalar_t maxScalar,
00968 
00969     const scalar_t * globalPartWeights,
00970     const scalar_t * localPartWeights,
00971     const scalar_t *targetPartWeightRatios,
00972     bool *isDone,
00973 
00974     scalar_t *cutCoordinates,
00975     scalar_t *cutUpperBounds,
00976     scalar_t *cutLowerBounds,
00977     scalar_t *leftClosestDistance,
00978     scalar_t *rightClosestDistance,
00979     scalar_t * cutLowerWeight,
00980     scalar_t * cutUpperWeight,
00981     scalar_t *newCutCoordinates,
00982 
00983     bool allowNonRectelinearPart,
00984     float *nonRectelinearPartRatios,
00985     partId_t *rectilinearCutCount,
00986     scalar_t *localCutWeights,
00987     scalar_t *globalCutWeights,
00988 
00989     partId_t &myNoneDoneCount
00990 )
00991     {
00992 
00993 
00994   scalar_t seenW = 0;
00995   float expected = 0;
00996   scalar_t leftImbalance = 0, rightImbalance = 0;
00997   scalar_t _EPSILON = numeric_limits<scalar_t>::epsilon();
00998   //scalar_t _EPSILON = numeric_limits<float>::epsilon();
00999 
01000 #ifdef HAVE_ZOLTAN2_OMP
01001 #pragma omp for
01002 #endif
01003   for (partId_t i = 0; i < noCuts; i++){
01004 
01005     //if a left and right closes point is not found, set the distance to 0.
01006     if(leftClosestDistance[i] == maxScalar)
01007       leftClosestDistance[i] = 0;
01008     if(rightClosestDistance[i] == maxScalar)
01009       rightClosestDistance[i] = 0;
01010 
01011   }
01012 
01013 #ifdef HAVE_ZOLTAN2_OMP
01014 #pragma omp for
01015 #endif
01016   for (partId_t i = 0; i < noCuts; i++){
01017     globalCutWeights[i] = 0;
01018     localCutWeights[i] = 0;
01019     //if already determined at previous iterations, do nothing.
01020     if(isDone[i]) {
01021       newCutCoordinates[i] = cutCoordinates[i];
01022       continue;
01023     }
01024     //current weight of the part at the left of the cut line.
01025     seenW = globalPartWeights[i * 2];
01026 
01027 
01028     //expected ratio
01029     expected = targetPartWeightRatios[i];
01030     leftImbalance = imbalanceOf(seenW, globalTotalWeight, expected);
01031     rightImbalance = imbalanceOf(globalTotalWeight - seenW, globalTotalWeight, 1 - expected);
01032 
01033     bool isLeftValid = ABS(leftImbalance) - imbalanceTolerance < _EPSILON ;
01034     bool isRightValid = ABS(rightImbalance) - imbalanceTolerance < _EPSILON;
01035 
01036     //if the cut line reaches to desired imbalance.
01037     if(isLeftValid && isRightValid){
01038 
01039       isDone[i] = true;
01040 #ifdef HAVE_ZOLTAN2_OMP
01041 #pragma omp atomic
01042 #endif
01043       myNoneDoneCount -= 1;
01044       newCutCoordinates [i] = cutCoordinates[i];
01045       continue;
01046     }
01047     else if(leftImbalance < 0){
01048 
01049       scalar_t ew = globalTotalWeight * expected;
01050       if(allowNonRectelinearPart){
01051 
01052         if (globalPartWeights[i * 2 + 1] == ew){
01053           isDone[i] = true;
01054 #ifdef HAVE_ZOLTAN2_OMP
01055 #pragma omp atomic
01056 #endif
01057           myNoneDoneCount -= 1;
01058           newCutCoordinates [i] = cutCoordinates[i];
01059           nonRectelinearPartRatios[i] = 1;
01060           continue;
01061         }
01062         else if (globalPartWeights[i * 2 + 1] > ew){
01063           isDone[i] = true;
01064 #ifdef HAVE_ZOLTAN2_OMP
01065 #pragma omp atomic
01066 #endif
01067           *rectilinearCutCount += 1;
01068 
01069 #ifdef HAVE_ZOLTAN2_OMP
01070 #pragma omp atomic
01071 #endif
01072           myNoneDoneCount -= 1;
01073           newCutCoordinates [i] = cutCoordinates[i];
01074           scalar_t myWeightOnLine = localPartWeights[i * 2 + 1] - localPartWeights[i * 2];
01075           localCutWeights[i] = myWeightOnLine;
01076           continue;
01077         }
01078       }
01079       //when moving right, set lower bound to current line.
01080       cutLowerBounds[i] = cutCoordinates[i] + rightClosestDistance[i];
01081       cutLowerWeight[i] = seenW;
01082 
01083       //compare the upper bound with the current lines.
01084       for (partId_t ii = i + 1; ii < noCuts ; ++ii){
01085         scalar_t pw = globalPartWeights[ii * 2];
01086         scalar_t lw = globalPartWeights[ii * 2 + 1];
01087         if(pw >= ew){
01088           if(pw == ew){
01089             cutUpperBounds[i] = cutCoordinates[ii];
01090             cutUpperWeight[i] = pw;
01091             cutLowerBounds[i] = cutCoordinates[ii];
01092             cutLowerWeight[i] = pw;
01093           } else if (pw < cutUpperWeight[i]){
01094             //if a cut line is more strict than the current upper bound,
01095             //update the upper bound.
01096             cutUpperBounds[i] = cutCoordinates[ii] - leftClosestDistance[ii];
01097             cutUpperWeight[i] = pw;
01098           }
01099           break;
01100         }
01101         //if comes here then pw < ew
01102         if(lw >= ew){
01103           cutUpperBounds[i] = cutCoordinates[ii];
01104           cutUpperWeight[i] = lw;
01105           cutLowerBounds[i] = cutCoordinates[ii];
01106           cutLowerWeight[i] = pw;
01107           break;
01108         }
01109         //if a stricter lower bound is found,
01110         //update the lower bound.
01111         if (pw <= ew && pw >= cutLowerWeight[i]){
01112           cutLowerBounds[i] = cutCoordinates[ii] + rightClosestDistance[ii] ;
01113           cutLowerWeight[i] = pw;
01114         }
01115       }
01116       scalar_t newPivot = pivotPos<scalar_t> (cutUpperBounds, cutLowerBounds,i, cutUpperWeight, cutLowerWeight, ew);
01117       //if cut line does not move significantly.
01118       if (ABS(cutCoordinates[i] - newPivot) < _EPSILON * EPS_SCALE || cutUpperBounds[i] < cutLowerBounds[i]){
01119         isDone[i] = true;
01120 #ifdef HAVE_ZOLTAN2_OMP
01121 #pragma omp atomic
01122 #endif
01123         myNoneDoneCount -= 1;
01124         newCutCoordinates [i] = cutCoordinates[i];
01125       } else {
01126         newCutCoordinates [i] = newPivot;
01127       }
01128     } else {
01129       //moving to left.
01130       scalar_t ew = globalTotalWeight * expected;
01131       //moving left, set upper to current line.
01132       cutUpperBounds[i] = cutCoordinates[i] - leftClosestDistance[i];
01133       cutUpperWeight[i] = seenW;
01134 
01135       // compare the current cut line weights with previous upper and lower bounds.
01136       for (int ii = i - 1; ii >= 0; --ii){
01137         scalar_t pw = globalPartWeights[ii * 2];
01138         scalar_t lw = globalPartWeights[ii * 2 + 1];
01139         if(pw <= ew){
01140           if(pw == ew){
01141             cutUpperBounds[i] = cutCoordinates[ii];
01142             cutUpperWeight[i] = pw;
01143             cutLowerBounds[i] = cutCoordinates[ii];
01144             cutLowerWeight[i] = pw;
01145           }
01146           else if (pw > cutLowerWeight[i]){
01147             cutLowerBounds[i] = cutCoordinates[ii] + rightClosestDistance[ii];
01148             cutLowerWeight[i] = pw;
01149             if(lw > ew){
01150               cutUpperBounds[i] = cutCoordinates[ii] + rightClosestDistance[ii];
01151 
01152               cutUpperWeight[i] = lw;
01153             }
01154           }
01155           break;
01156         }
01157         if (pw >= ew && (pw < cutUpperWeight[i] || (pw == cutUpperWeight[i] && cutUpperBounds[i] > cutCoordinates[ii] - leftClosestDistance[ii]))){
01158           cutUpperBounds[i] = cutCoordinates[ii] - leftClosestDistance[ii] ;
01159 
01160           cutUpperWeight[i] = pw;
01161         }
01162       }
01163 
01164       scalar_t newPivot = pivotPos<scalar_t> (cutUpperBounds, cutLowerBounds,i, cutUpperWeight, cutLowerWeight, ew);
01165       //if cut line does not move significantly.
01166       if (ABS(cutCoordinates[i] - newPivot) < _EPSILON * EPS_SCALE  || cutUpperBounds[i] < cutLowerBounds[i]){
01167         isDone[i] = true;
01168 
01169 #ifdef HAVE_ZOLTAN2_OMP
01170 #pragma omp atomic
01171 #endif
01172         myNoneDoneCount -= 1;
01173         newCutCoordinates [ i] = cutCoordinates[i];
01174       } else {
01175         newCutCoordinates [ i] = newPivot;
01176       }
01177     }
01178   }
01179 
01180 
01181 
01182   //communication to determine the ratios of processors for the distribution of coordinates on the cut lines.
01183 #ifdef HAVE_ZOLTAN2_OMP
01184 //#pragma omp barrier
01185 #pragma omp single
01186 #endif
01187   {
01188     if(*rectilinearCutCount > 0){
01189       try{
01190         Teuchos::scan<int,scalar_t>(
01191             *comm, Teuchos::REDUCE_SUM,
01192             noCuts,localCutWeights, globalCutWeights
01193         );
01194       }
01195       Z2_THROW_OUTSIDE_ERROR(*env)
01196 
01197 
01198       for (partId_t i = 0; i < noCuts; ++i){
01199         //cout << "gw:" << globalCutWeights[i] << endl;
01200         if(globalCutWeights[i] > 0) {
01201           scalar_t ew = globalTotalWeight * targetPartWeightRatios[i];
01202           scalar_t expectedWeightOnLine = ew - globalPartWeights[i * 2];
01203           scalar_t myWeightOnLine = localCutWeights[i];
01204           scalar_t weightOnLineBefore = globalCutWeights[i];
01205           scalar_t incMe = expectedWeightOnLine - weightOnLineBefore;
01206           scalar_t mine = incMe + myWeightOnLine;
01207           if(mine < 0){
01208             nonRectelinearPartRatios[i] = 0;
01209           }
01210           else if(mine >= myWeightOnLine){
01211             nonRectelinearPartRatios[i] = 1;
01212           }
01213           else {
01214             nonRectelinearPartRatios[i] = mine / myWeightOnLine;
01215           }
01216         }
01217       }
01218       *rectilinearCutCount = 0;
01219     }
01220   }
01221 }
01222 
01223 
01254 template <typename scalar_t, typename lno_t>
01255 void pqJagged_1DPart_getPartWeights(
01256     size_t total_part_count,
01257     partId_t noCuts,
01258     scalar_t maxScalar,
01259     scalar_t _EPSILON,
01260     int numThreads,
01261     lno_t coordinateBegin,
01262     lno_t coordinateEnd,
01263     lno_t *partitionedPointPermutations,
01264     scalar_t *pqJagged_coordinates,
01265     bool pqJagged_uniformWeights,
01266     scalar_t *pqJagged_weights,
01267 
01268     scalar_t *cutCoordinates_tmp, //TODO change name
01269     bool *isDone,
01270     double *myPartWeights,
01271     scalar_t *myLeftClosest,
01272     scalar_t *myRightClosest,
01273     bool useBinarySearch,
01274     partId_t *partIds
01275 ){
01276 
01277   // initializations for part weights, left/right closest
01278     for (size_t i = 0; i < total_part_count; ++i){
01279       myPartWeights[i] = 0;
01280     }
01281 
01282 
01283   for(partId_t i = 0; i < noCuts; ++i){
01284     //if(isDone[i]) continue;
01285     myLeftClosest[i] = maxScalar;
01286     myRightClosest[i] = maxScalar;
01287   }
01288   //cout << "iter:" << endl;
01289   if(useBinarySearch){
01290 
01291     //lno_t comparison_count = 0;
01292     scalar_t minus_EPSILON = -_EPSILON;
01293 #ifdef HAVE_ZOLTAN2_OMP
01294 #pragma omp for
01295 #endif
01296     for (lno_t ii = coordinateBegin; ii < coordinateEnd; ++ii){
01297       int i = partitionedPointPermutations[ii];
01298       //partId_t j = (uc + lc) / 2;
01299       partId_t j = partIds[i] / 2;
01300 
01301       if(j >= noCuts){
01302         j = noCuts - 1;
01303       }
01304 
01305       partId_t lc = 0;
01306       partId_t uc = noCuts - 1;
01307       scalar_t w = pqJagged_uniformWeights? 1:pqJagged_weights[i];
01308       bool isInserted = false;
01309       bool onLeft = false;
01310       bool onRight = false;
01311       partId_t lastPart = -1;
01312       while(uc >= lc)
01313       {
01314       //comparison_count++;
01315         lastPart = -1;
01316         onLeft = false;
01317         onRight = false;
01318         scalar_t coord = pqJagged_coordinates[i];
01319         scalar_t cut = cutCoordinates_tmp[j];
01320         scalar_t distance = coord - cut;
01321         scalar_t absdistance = ABS(distance);
01322 
01323         if(absdistance < _EPSILON){
01324 
01325           myPartWeights[j * 2 + 1] += w;
01326           partIds[i] = j * 2 + 1;
01327 
01328           myLeftClosest[j] = 0;
01329           myRightClosest[j] = 0;
01330           partId_t kk = j + 1;
01331           while(kk < noCuts){  // Needed when cuts shared the same position
01332                                // kddnote Can this loop be disabled for RECTILINEAR BLOCKS?
01333                                // kddnote Mehmet says it is probably needed anyway.
01334             distance =ABS(cutCoordinates_tmp[kk] - cut);
01335             if(distance < _EPSILON){
01336               myPartWeights[2 * kk + 1] += w;
01337 
01338               myLeftClosest[kk] = 0;
01339               myRightClosest[kk] = 0;
01340               kk++;
01341             }
01342             else{
01343               if(myLeftClosest[kk] > distance){
01344                 myLeftClosest[kk] = distance;
01345               }
01346               break;
01347             }
01348           }
01349 
01350           kk = j - 1;
01351           while(kk >= 0){
01352             distance =ABS(cutCoordinates_tmp[kk] - cut);
01353             if(distance < _EPSILON){
01354               myPartWeights[2 * kk + 1] += w;
01355 
01356               myLeftClosest[kk] = 0;
01357               myRightClosest[kk] = 0;
01358               kk--;
01359             }
01360             else{
01361               if(myRightClosest[kk] > distance){
01362                 myRightClosest[kk] = distance;
01363               }
01364               break;
01365             }
01366           }
01367           isInserted = true;
01368           break;
01369         }
01370         else {
01371           if (distance < 0) {
01372 
01373             //TODO fix abs
01374             distance = absdistance;
01375             if (myLeftClosest[j] > distance){
01376               myLeftClosest[j] = distance;
01377             }
01378             bool _break = false;
01379             if(j > 0){
01380               distance = coord - cutCoordinates_tmp[j - 1];
01381               if(distance > _EPSILON){
01382                 if (myRightClosest[j - 1] > distance){
01383                   myRightClosest[j - 1] = distance;
01384                 }
01385                 _break = true;
01386               } else if(distance < minus_EPSILON){
01387                 distance = -distance;
01388                 if (myLeftClosest[j - 1] > distance){
01389                   myLeftClosest[j - 1] = distance;
01390                 }
01391               }
01392               else {
01393                 myLeftClosest[j - 1] = 0;
01394                 myRightClosest[j - 1 ] = 0;
01395               }
01396             }
01397             uc = j - 1;
01398             onLeft = true;
01399             lastPart = j;
01400             if(_break) break;
01401           }
01402           else {
01403             if (myRightClosest[j] > distance){
01404               myRightClosest[j] = distance;
01405             }
01406             bool _break = false;
01407             if(j < noCuts - 1){
01408               distance = coord - cutCoordinates_tmp[j + 1];
01409               if(distance > _EPSILON){
01410                 if (myRightClosest[j + 1] > distance){
01411                   myRightClosest[j + 1] = distance;
01412                 }
01413               } else if(distance < minus_EPSILON){
01414                 distance = -distance;
01415                 if (myLeftClosest[j + 1] > distance){
01416                   myLeftClosest[j + 1] = distance;
01417                 }
01418                 _break = true;
01419               }
01420               else {
01421                 myLeftClosest[j + 1] = 0;
01422                 myRightClosest[j + 1 ] = 0;
01423               }
01424             }
01425             lc = j + 1;
01426             onRight = true;
01427             lastPart = j;
01428             if(_break) break;
01429           }
01430         }
01431 
01432         j = (uc + lc) / 2;
01433       }
01434       if(!isInserted){
01435         if(onRight){
01436           myPartWeights[2 * lastPart + 2] += w;
01437           partIds[i] = 2 * lastPart + 2;
01438         }
01439         else if(onLeft){
01440           myPartWeights[2 * lastPart] += w;
01441           partIds[i] = 2 * lastPart;
01442         }
01443       }
01444     }
01445 /*
01446     //cout << "comp:" << comparison_count << " size:" << coordinateEnd- coordinateBegin << endl;
01447     // prefix sum computation.
01448     for (size_t i = 1; i < total_part_count; ++i){
01449       // if check for cuts sharing the same position; all cuts sharing a position
01450       // have the same weight == total weight for all cuts sharing the position.
01451       // don't want to accumulate that total weight more than once.
01452       if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
01453          ABS(cutCoordinates_tmp[i / 2] - cutCoordinates_tmp[i /2 - 1]) < _EPSILON){
01454         myPartWeights[i] = myPartWeights[i-2];
01455         continue;
01456       }
01457       myPartWeights[i] += myPartWeights[i-1];
01458     }
01459 
01460 */
01461   }
01462   else {
01463 
01464     //lno_t comp = 0;
01465 #ifdef HAVE_ZOLTAN2_OMP
01466 #pragma omp for
01467 #endif
01468     for (lno_t ii = coordinateBegin; ii < coordinateEnd; ++ii){
01469       int i = partitionedPointPermutations[ii];
01470 
01471       scalar_t w = pqJagged_uniformWeights ? 1 : pqJagged_weights[i];
01472 
01473       scalar_t coord = pqJagged_coordinates[i];
01474 
01475       partId_t j = partIds[i] / 2;
01476 
01477       if(j >= noCuts){
01478         j = noCuts - 1;
01479       }
01480       scalar_t cut = cutCoordinates_tmp[j];
01481       scalar_t distance = coord - cut;
01482       scalar_t absdistance = ABS(distance);
01483 
01484       //comp++;
01485       if(absdistance < _EPSILON){
01486         myPartWeights[j * 2 + 1] += w;
01487         //cout << "1to part:" << 2*j+1 << " coord:" << coord << endl;
01488         myLeftClosest[j] = 0;
01489         myRightClosest[j] = 0;
01490         partIds[i] = j * 2 + 1;
01491 
01492         //bas
01493         partId_t kk = j + 1;
01494         while(kk < noCuts){  // Needed when cuts shared the same position
01495           // kddnote Can this loop be disabled for RECTILINEAR BLOCKS?
01496           // kddnote Mehmet says it is probably needed anyway.
01497           distance =ABS(cutCoordinates_tmp[kk] - cut);
01498           if(distance < _EPSILON){
01499             //cout << "yo" << endl;
01500             myPartWeights[2 * kk + 1] += w;
01501 
01502             //cout << "2to part:" << 2*kk+1 << " coord:" << coord << endl;
01503             myLeftClosest[kk] = 0;
01504             myRightClosest[kk] = 0;
01505             kk++;
01506           }
01507           else{
01508             if(myLeftClosest[kk] > distance){
01509               myLeftClosest[kk] = distance;
01510             }
01511             break;
01512           }
01513         }
01514 
01515         kk = j - 1;
01516         while(kk >= 0){
01517           distance =ABS(cutCoordinates_tmp[kk] - cut);
01518           if(distance < _EPSILON){
01519             myPartWeights[2 * kk + 1] += w;
01520             //cout << "3to part:" << 2*kk+1 << " coord:" << coord << endl;
01521 
01522             myLeftClosest[kk] = 0;
01523             myRightClosest[kk] = 0;
01524             kk--;
01525           }
01526           else{
01527             if(myRightClosest[kk] > distance){
01528               myRightClosest[kk] = distance;
01529             }
01530             break;
01531           }
01532         }
01533       }
01534       else if (distance < 0) {
01535         while((absdistance > _EPSILON) && distance < 0){
01536           //comp++;
01537           if (myLeftClosest[j] > absdistance){
01538             myLeftClosest[j] = absdistance;
01539           }
01540           --j;
01541           if(j  < 0) break;
01542           distance = coord - cutCoordinates_tmp[j];
01543 
01544           absdistance = ABS(distance);
01545           }
01546         if(absdistance < _EPSILON)
01547         {
01548           myPartWeights[j * 2 + 1] += w;
01549           myLeftClosest[j] = 0;
01550           myRightClosest[j] = 0;
01551           cut = cutCoordinates_tmp[j];
01552           //cout << "4to part:" << 2*j+1 <<" j:" << j <<  " coord:" << coord << endl;
01553           //cout << "cut:" << cutCoordinates_tmp[j] << " dis:" << distance << endl;
01554           partIds[i] = j * 2 + 1;
01555 
01556           partId_t kk = j + 1;
01557           while(kk < noCuts){  // Needed when cuts shared the same position
01558             // kddnote Can this loop be disabled for RECTILINEAR BLOCKS?
01559             // kddnote Mehmet says it is probably needed anyway.
01560             distance =ABS(cutCoordinates_tmp[kk] - cut);
01561             //cout << "distance:" << distance << endl;
01562             if(distance < _EPSILON){
01563               myPartWeights[2 * kk + 1] += w;
01564               //cout << "5to part:" << 2*kk+1 << " kk:" << kk << " coord:" << coord << endl;
01565               //cout << "cut:" << cutCoordinates_tmp[kk] << " dis:" << distance << endl;
01566               myLeftClosest[kk] = 0;
01567               myRightClosest[kk] = 0;
01568               kk++;
01569             }
01570             else{
01571               if(myLeftClosest[kk] > distance){
01572                 myLeftClosest[kk] = distance;
01573               }
01574               break;
01575             }
01576           }
01577 
01578           kk = j - 1;
01579           while(kk >= 0){
01580             distance =ABS(cutCoordinates_tmp[kk] - cut);
01581             if(distance < _EPSILON){
01582               myPartWeights[2 * kk + 1] += w;
01583               //cout << "6to part:" << 2*kk+1 << " coord:" << coord << endl;
01584               //cout << "cut:" << cutCoordinates_tmp[kk] << " dis:" << distance << endl;
01585 
01586               myLeftClosest[kk] = 0;
01587               myRightClosest[kk] = 0;
01588               kk--;
01589             }
01590             else{
01591               if(myRightClosest[kk] > distance){
01592                 myRightClosest[kk] = distance;
01593               }
01594               break;
01595             }
01596           }
01597         }
01598         else {
01599           myPartWeights[j * 2 + 2] += w;
01600           if (j >= 0 && myRightClosest[j] > absdistance){
01601             myRightClosest[j] = absdistance;
01602           }
01603           partIds[i] = j * 2 + 2;
01604         }
01605 
01606       }
01607       //if it is on the left
01608       else {
01609 
01610         while((absdistance > _EPSILON) && distance > 0){
01611           //comp++;
01612             if ( myRightClosest[j] > absdistance){
01613               myRightClosest[j] = absdistance;
01614             }
01615           ++j;
01616           if(j  >= noCuts) break;
01617           distance = coord - cutCoordinates_tmp[j];
01618           absdistance = ABS(distance);
01619           }
01620 
01621 
01622 
01623         if(absdistance < _EPSILON)
01624         {
01625           myPartWeights[j * 2 + 1] += w;
01626           myLeftClosest[j] = 0;
01627           myRightClosest[j] = 0;
01628           partIds[i] = j * 2 + 1;
01629           cut = cutCoordinates_tmp[j];
01630           partId_t kk = j + 1;
01631           while(kk < noCuts){  // Needed when cuts shared the same position
01632             // kddnote Can this loop be disabled for RECTILINEAR BLOCKS?
01633             // kddnote Mehmet says it is probably needed anyway.
01634             distance =ABS(cutCoordinates_tmp[kk] - cut);
01635             if(distance < _EPSILON){
01636               myPartWeights[2 * kk + 1] += w;
01637               myLeftClosest[kk] = 0;
01638               myRightClosest[kk] = 0;
01639               kk++;
01640             }
01641             else{
01642               if(myLeftClosest[kk] > distance){
01643                 myLeftClosest[kk] = distance;
01644               }
01645               break;
01646             }
01647           }
01648 
01649           kk = j - 1;
01650           while(kk >= 0){
01651             distance =ABS(cutCoordinates_tmp[kk] - cut);
01652             if(distance < _EPSILON){
01653               myPartWeights[2 * kk + 1] += w;
01654 
01655               myLeftClosest[kk] = 0;
01656               myRightClosest[kk] = 0;
01657               kk--;
01658             }
01659             else{
01660               if(myRightClosest[kk] > distance){
01661                 myRightClosest[kk] = distance;
01662               }
01663               break;
01664             }
01665           }
01666         }
01667         else {
01668         myPartWeights[j * 2] += w;
01669         if (j < noCuts && myLeftClosest[j] > absdistance){
01670           myLeftClosest[j] = absdistance;
01671         }
01672         partIds[i] = j * 2 ;
01673         }
01674       }
01675     }
01676 
01677     /*
01678 #ifdef HAVE_ZOLTAN2_OMP
01679 #pragma omp for
01680 #endif
01681     for (lno_t ii = coordinateBegin; ii < coordinateEnd; ++ii){
01682       int i = partitionedPointPermutations[ii];
01683 
01684       scalar_t w = pqJagged_uniformWeights? 1:pqJagged_weights[i];
01685       //get a coordinate and compare it with cut lines from left to right.
01686       for(partId_t j = 0; j < noCuts; ++j){
01687 
01688         if(isDone[j]) continue;
01689         scalar_t distance = pqJagged_coordinates[i] - cutCoordinates_tmp[j];
01690         scalar_t absdistance = ABS(distance);
01691         if(absdistance < _EPSILON){
01692           myPartWeights[j * 2] -= w;
01693           //myPartWeights[j * 2] += w;
01694           myLeftClosest[j] = 0;
01695           myRightClosest[j] = 0;
01696           //break;
01697         }
01698         else
01699           if (distance < 0) {
01700             distance = -distance;
01701             if (myLeftClosest[j] > distance){
01702               myLeftClosest[j] = distance;
01703             }
01704             break;
01705           }
01706         //if it is on the left
01707           else {
01708             //if on the right, continue with the next line.
01709             myPartWeights[j * 2] -= w;
01710             myPartWeights[j * 2 + 1] -= w;
01711             //myPartWeights[j * 2] += w;
01712             //myPartWeights[j * 2 + 1] += w;
01713 
01714             if ( myRightClosest[j] > distance){
01715               myRightClosest[j] = distance;
01716             }
01717           }
01718       }
01719     }
01720     */
01721 
01722   }
01723 
01724     //cout << "comp:" << comp <<endl;
01725     // prefix sum computation.
01726     for (size_t i = 1; i < total_part_count; ++i){
01727       // if check for cuts sharing the same position; all cuts sharing a position
01728       // have the same weight == total weight for all cuts sharing the position.
01729       // don't want to accumulate that total weight more than once.
01730       if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
01731          ABS(cutCoordinates_tmp[i / 2] - cutCoordinates_tmp[i /2 - 1]) < _EPSILON){
01732         myPartWeights[i] = myPartWeights[i-2];
01733         continue;
01734       }
01735       myPartWeights[i] += myPartWeights[i-1];
01736       //cout << "p:" << "i:" << i<< " :" <<myPartWeights[i]  << endl;
01737     }
01738   /*
01739   for (size_t i = 0; i < total_part_count; ++i){
01740     cout << "p:" << i << ":" << myPartWeights[i] << endl;
01741   }
01742    */
01743 
01744 
01745 }
01746 
01747 
01762 template <typename scalar_t>
01763 void accumulateThreadResults(
01764     partId_t noCuts,
01765     size_t total_part_count,
01766     partId_t concurrentPartCount,
01767     int numThreads,
01768     bool *isDone,
01769     scalar_t **leftClosestDistance,
01770     scalar_t **rightClosestDistance,
01771     double **partWeights,
01772     scalar_t *localMinMaxTotal,
01773     scalar_t *totalPartWeights_leftClosest_rightCloset//,
01774     //bool useBinarySearch
01775     ){
01776 #ifdef HAVE_ZOLTAN2_OMP
01777 #pragma omp for
01778 #endif
01779   for(partId_t i = 0; i < noCuts * concurrentPartCount; ++i){
01780     if(isDone[i]) continue;
01781     scalar_t minl = leftClosestDistance[0][i], minr = rightClosestDistance[0][i];
01782 
01783     for (int j = 1; j < numThreads; ++j){
01784       if (rightClosestDistance[j][i] < minr ){
01785         minr = rightClosestDistance[j][i];
01786       }
01787       if (leftClosestDistance[j][i] < minl ){
01788         minl = leftClosestDistance[j][i];
01789       }
01790     }
01791     size_t ishift = i % noCuts + (i / noCuts) * (total_part_count + 2 * noCuts);
01792     totalPartWeights_leftClosest_rightCloset[total_part_count + ishift] = minl;
01793     totalPartWeights_leftClosest_rightCloset[total_part_count + noCuts + ishift] = minr;
01794   }
01795 
01796  // if(1 || useBinarySearch){
01797 #ifdef HAVE_ZOLTAN2_OMP
01798 #pragma omp for
01799 #endif
01800     for(size_t j = 0; j < total_part_count * concurrentPartCount; ++j){
01801       size_t actualCutInd = (j % total_part_count);
01802       partId_t cutInd = actualCutInd / 2 + (j / total_part_count) * noCuts;
01803 
01804       if(actualCutInd !=  total_part_count - 1 && isDone[cutInd]) continue;
01805       double pwj = 0;
01806       for (int i = 0; i < numThreads; ++i){
01807         pwj += partWeights[i][j];
01808       }
01809       size_t jshift = j % total_part_count + (j / total_part_count) * (total_part_count + 2 * noCuts);
01810       totalPartWeights_leftClosest_rightCloset[jshift] = pwj;
01811     }
01812 /*
01813   } else {
01814 #ifdef HAVE_ZOLTAN2_OMP
01815 #pragma omp for
01816 #endif
01817     for(size_t j = 0; j < total_part_count * concurrentPartCount; ++j){
01818       size_t actualCutInd = (j % total_part_count);
01819       partId_t cutInd = actualCutInd / 2 + (j / total_part_count) * noCuts;
01820 
01821       if(actualCutInd !=  total_part_count - 1 && isDone[cutInd]) continue;
01822       double pwj = 0;
01823       for (int i = 0; i < numThreads; ++i){
01824         pwj += partWeights[i][j];
01825       }
01826       scalar_t totalWeight = localMinMaxTotal[j / total_part_count + concurrentPartCount * 2];
01827       size_t jshift = j % total_part_count + (j / total_part_count) * (total_part_count + 2 * noCuts);
01828       totalPartWeights_leftClosest_rightCloset[jshift] = totalWeight + pwj;
01829 
01830     }
01831   }
01832 */
01833 }
01834 
01835 
01836 
01837 
01883 template <typename scalar_t, typename lno_t>
01884 void pqJagged_1D_Partition(
01885     const RCP<const Environment> &env,
01886     RCP<Comm<int> > &comm,
01887 
01888     lno_t *partitionedPointPermutations,
01889     scalar_t *pqJagged_coordinates,
01890     bool pqJagged_uniformWeights,
01891     scalar_t *pqJagged_weights,
01892 
01893     scalar_t *targetPartWeightRatios,   // the weight ratios at left side of the cuts. last is 1.
01894     scalar_t *globalMinMaxTotal,
01895     scalar_t *localMinMaxTotal,
01896 
01897     partId_t partNo,
01898     int numThreads,
01899     scalar_t maxScalar,
01900     scalar_t minScalar,
01901     scalar_t imbalanceTolerance,
01902     partId_t currentPartBeginIndex,
01903     partId_t concurrentPartCount,
01904     lno_t *inTotalCounts,
01905 
01906     scalar_t *cutCoordinates,
01907     scalar_t *cutCoordinatesWork,   // work array to manipulate coordinate of cutlines in different iterations.
01908     scalar_t **leftClosestDistance,
01909     scalar_t **rightClosestDistance,
01910     scalar_t *cutUpperBounds,  //to determine the next cut line with binary search
01911     scalar_t *cutLowerBounds,  //to determine the next cut line with binary search
01912     scalar_t *cutUpperWeight,   //to determine the next cut line with binary search
01913     scalar_t *cutLowerWeight,  //to determine the next cut line with binary search
01914     bool *isDone,
01915     double **partWeights,
01916     scalar_t *local_totalPartWeights_leftClosest_rightCloset,
01917     scalar_t *global_totalPartWeights_leftClosest_rightCloset,
01918     bool allowNonRectelinearPart,
01919     float *nonRectelinearPartRatios,
01920     scalar_t *localCutWeights,
01921     scalar_t *globalCutWeights,
01922 
01923     partId_t allDone,
01924     partId_t *myNonDoneCounts,
01925     bool useBinarySearch,
01926 //    string dimension,
01927     partId_t * partIds
01928 ){
01929 
01930   partId_t recteLinearCutCount = 0;
01931   scalar_t *cutCoordinates_tmp = cutCoordinates;
01932   partId_t noCuts = partNo - 1;
01933   size_t total_part_count = partNo + size_t (noCuts) ;
01934 
01935 
01936 #ifdef mpi_communication
01937   MPI_Op myop;
01938   MPI_Op_create(sumMinMin, 0, &myop);   /* step 3 */
01939 #endif
01940 
01941 
01942   scalar_t _EPSILON = numeric_limits<scalar_t>::epsilon();
01943 
01944 
01945 //  env->timerStart(MACRO_TIMERS, "PQJagged creation_" + dimension);
01946   Teuchos::PQJaggedCombinedReductionOp<int, scalar_t> reductionOp(
01947       total_part_count , noCuts  , noCuts , concurrentPartCount);
01948 
01949  // env->timerStop(MACRO_TIMERS, "PQJagged creation_" + dimension);
01950 
01951 #ifdef HAVE_ZOLTAN2_OMP
01952 #pragma omp parallel shared(allDone,  recteLinearCutCount)
01953 #endif
01954   {
01955     //env->timerStart(MACRO_TIMERS, "PQJagged thread_init_" + dimension);
01956     int me = 0;
01957 #ifdef HAVE_ZOLTAN2_OMP
01958     me = omp_get_thread_num();
01959 #endif
01960     double *myPartWeights = partWeights[me];
01961     scalar_t *myLeftClosest = leftClosestDistance[me];
01962     scalar_t *myRightClosest = rightClosestDistance[me];
01963 
01964 #ifdef HAVE_ZOLTAN2_OMP
01965 #pragma omp for
01966 #endif
01967     for(partId_t i = 0; i < noCuts * concurrentPartCount; ++i){
01968       isDone[i] = false;
01969       partId_t ind = i / noCuts;
01970       cutLowerBounds[i] = globalMinMaxTotal[ind];
01971       cutUpperBounds[i] = globalMinMaxTotal[ind + concurrentPartCount];
01972       cutUpperWeight[i] = globalMinMaxTotal[ind + 2 * concurrentPartCount];
01973       cutLowerWeight[i] = 0;
01974       if(allowNonRectelinearPart){
01975         nonRectelinearPartRatios[i] = 0;
01976       }
01977     }
01978 
01979     //env->timerStop(MACRO_TIMERS, "PQJagged thread_init_" + dimension);
01980 
01981 
01982 #ifdef HAVE_ZOLTAN2_OMP
01983 #pragma omp barrier
01984 #endif
01985     int iteration = 0;
01986     while (allDone != 0){
01987       iteration += 1;
01988 /*
01989       if(comm->getRank() == 0)
01990         {
01991 #pragma omp single
01992           { cout << endl << endl << "allDone:" << allDone << endl;
01993 
01994             for (size_t i = 0; i < noCuts * concurrentPartCount; ++i){
01995 
01996               if(isDone[i] == false)
01997                 cout << "i:" << i <<  " c:" << cutCoordinates_tmp[i] << " u:" << cutUpperBounds[i] << " l:" << cutLowerBounds[i] <<
01998                 " uw:" << cutUpperWeight[i] << " lw:" << cutLowerWeight[i] << " not done" << endl;
01999               else cout << "i:" << i <<  " c:" << cutCoordinates_tmp[i] <<  " done" << endl; }
02000 
02001           }
02002         }
02003 */
02004 
02005 /*
02006 #pragma omp single
02007       {
02008       env->timerStart(MACRO_TIMERS, "PQJagged 1D_" + dimension);
02009       }
02010       */
02011       for (partId_t kk = 0; kk < concurrentPartCount; ++kk){
02012         if (/*globalMinMax[kk] > globalMinMax[kk + k] ||*/ myNonDoneCounts[kk] == 0) continue;
02013         partId_t cutShift = noCuts * kk;
02014         size_t totalPartShift = total_part_count * kk;
02015 
02016         bool *currentDone = isDone + cutShift;
02017         double *myCurrentPartWeights = myPartWeights + totalPartShift;
02018         scalar_t *myCurrentLeftClosest = myLeftClosest + cutShift;
02019         scalar_t *myCurrentRightClosest = myRightClosest + cutShift;
02020 
02021         partId_t current = currentPartBeginIndex + kk;
02022         lno_t coordinateBegin = current ==0 ? 0: inTotalCounts[current -1];
02023         lno_t coordinateEnd = inTotalCounts[current];
02024         scalar_t *cutCoordinates_ = cutCoordinates_tmp + kk * noCuts;
02025 
02026         // compute part weights using existing cuts
02027         pqJagged_1DPart_getPartWeights<scalar_t, lno_t>(
02028             total_part_count,
02029             noCuts,
02030             maxScalar,
02031             _EPSILON,
02032             numThreads,
02033             coordinateBegin,
02034             coordinateEnd,
02035             partitionedPointPermutations,
02036             pqJagged_coordinates,
02037             pqJagged_uniformWeights,
02038             pqJagged_weights,
02039             cutCoordinates_,
02040             currentDone,
02041             myCurrentPartWeights,
02042             myCurrentLeftClosest,
02043             myCurrentRightClosest,
02044             useBinarySearch,
02045             partIds);
02046       }
02047       /*
02048 #pragma omp single
02049       {
02050       env->timerStop(MACRO_TIMERS, "PQJagged 1D_" + dimension);
02051       env->timerStart(MACRO_TIMERS, "PQJagged 1D_accumulation_" + dimension);
02052       }
02053       */
02054       accumulateThreadResults<scalar_t>(
02055           noCuts, total_part_count, concurrentPartCount,
02056           numThreads, isDone,
02057           leftClosestDistance, rightClosestDistance, partWeights,
02058           localMinMaxTotal,
02059           local_totalPartWeights_leftClosest_rightCloset//,
02060 //          useBinarySearch
02061       );
02062       /*
02063 #pragma omp single
02064       {
02065       env->timerStop(MACRO_TIMERS, "PQJagged 1D_accumulation_" + dimension);
02066 
02067       env->timerStart(MACRO_TIMERS, "PQJagged Communication_" + dimension);
02068       }
02069       */
02070 #ifdef HAVE_ZOLTAN2_OMP
02071 #pragma omp single
02072 #endif
02073       {
02074         if(comm->getSize() > 1){
02075         try{
02076 
02077 
02078 
02079 #ifdef mpi_communication
02080 
02081             MPI_Allreduce(local_totalPartWeights_leftClosest_rightCloset, global_totalPartWeights_leftClosest_rightCloset,
02082 
02083                           (total_part_count + 2 * noCuts) * concurrentPartCount, MPI_FLOAT, myop,MPI_COMM_WORLD);
02084 #endif
02085 #ifndef mpi_communication
02086 
02087           reduceAll<int, scalar_t>(*comm, reductionOp,
02088               (total_part_count + 2 * noCuts) * concurrentPartCount,
02089               local_totalPartWeights_leftClosest_rightCloset,
02090               global_totalPartWeights_leftClosest_rightCloset
02091           );
02092 #endif
02093 
02094           //cout << "reducing" << endl;
02095           /*
02096           MPI_Allreduce (
02097               local_totalPartWeights_leftClosest_rightCloset,
02098               global_totalPartWeights_leftClosest_rightCloset,
02099               total_part_count ,
02100               MPI_DOUBLE,
02101               MPI_SUM,
02102               MPI_COMM_WORLD);
02103 
02104           MPI_Allreduce (
02105               local_totalPartWeights_leftClosest_rightCloset + total_part_count ,
02106               global_totalPartWeights_leftClosest_rightCloset + total_part_count ,
02107               noCuts,
02108               MPI_DOUBLE,
02109               MPI_MIN,
02110               MPI_COMM_WORLD);
02111 
02112           MPI_Allreduce (
02113               local_totalPartWeights_leftClosest_rightCloset + total_part_count + noCuts,
02114               global_totalPartWeights_leftClosest_rightCloset + total_part_count + noCuts,
02115               noCuts,
02116               MPI_DOUBLE,
02117               MPI_MIN,
02118               MPI_COMM_WORLD);
02119         */
02120         }
02121 
02122         Z2_THROW_OUTSIDE_ERROR(*env)
02123         }
02124         else {
02125           size_t s = (total_part_count + 2 * noCuts) * concurrentPartCount;
02126           for(size_t i = 0; i < s; ++i){
02127             global_totalPartWeights_leftClosest_rightCloset[i] = local_totalPartWeights_leftClosest_rightCloset[i];
02128           }
02129         }
02130       }
02131       /*
02132 #pragma omp single
02133       {
02134       env->timerStop(MACRO_TIMERS, "PQJagged Communication_" + dimension);
02135 
02136       env->timerStart(MACRO_TIMERS, "PQJagged new_cut_calculation_" + dimension);
02137       }
02138       */
02139       for (partId_t kk = 0; kk < concurrentPartCount; ++kk){
02140 
02141 
02142         if (/*globalMinMax[kk] > globalMinMax[kk + k] || */myNonDoneCounts[kk] == 0) continue;
02143         partId_t cutShift = noCuts * kk;
02144         size_t tlrShift = (total_part_count + 2 * noCuts) * kk;
02145 
02146         scalar_t *localPartWeights = local_totalPartWeights_leftClosest_rightCloset  + tlrShift ;
02147         scalar_t *gtlr = global_totalPartWeights_leftClosest_rightCloset + tlrShift;
02148         scalar_t *glc = gtlr + total_part_count; //left closest points
02149         scalar_t *grc = gtlr + total_part_count + noCuts; //right closest points
02150         scalar_t *globalPartWeights = gtlr;
02151         bool *currentDone = isDone + cutShift;
02152         scalar_t *currentTargetPartWeightRatios = targetPartWeightRatios + (noCuts + 1) * kk;
02153         float *currentnonRectelinearPartRatios = nonRectelinearPartRatios + cutShift;
02154 
02155         scalar_t minCoordinate = globalMinMaxTotal[kk];
02156         scalar_t maxCoordinate = globalMinMaxTotal[kk + concurrentPartCount];
02157         scalar_t globalTotalWeight = globalMinMaxTotal[kk + concurrentPartCount * 2];
02158         scalar_t *currentcutLowerWeight = cutLowerWeight + cutShift;
02159         scalar_t *currentcutUpperWeight = cutUpperWeight + cutShift;
02160         scalar_t *currentcutUpperBounds = cutUpperBounds + cutShift;
02161         scalar_t *currentcutLowerBounds = cutLowerBounds + cutShift;
02162 
02163         partId_t prevDoneCount = myNonDoneCounts[kk];
02164         // Now compute the new cut coordinates.
02165         getNewCoordinates<scalar_t>(
02166             env, comm,
02167             total_part_count, noCuts, maxCoordinate, minCoordinate, globalTotalWeight, imbalanceTolerance, maxScalar,
02168             globalPartWeights, localPartWeights, currentTargetPartWeightRatios, currentDone,
02169 
02170             cutCoordinates_tmp + cutShift, currentcutUpperBounds, currentcutLowerBounds,
02171             glc, grc, currentcutLowerWeight, currentcutUpperWeight,
02172             cutCoordinatesWork +cutShift, //new cut coordinates
02173 
02174             allowNonRectelinearPart,
02175             currentnonRectelinearPartRatios,
02176             &recteLinearCutCount,
02177             localCutWeights,
02178             globalCutWeights,
02179             myNonDoneCounts[kk]
02180         );
02181 
02182         partId_t reduction = prevDoneCount - myNonDoneCounts[kk];
02183 #ifdef HAVE_ZOLTAN2_OMP
02184 #pragma omp single
02185 #endif
02186         {
02187           allDone -= reduction;
02188         }
02189       }
02190       /*
02191 #pragma omp single
02192       {
02193       env->timerStop(MACRO_TIMERS, "PQJagged new_cut_calculation_" + dimension);
02194       }
02195       */
02196 #ifdef HAVE_ZOLTAN2_OMP
02197 #pragma omp single
02198 #endif
02199       {
02200       scalar_t *t = cutCoordinates_tmp;
02201       cutCoordinates_tmp = cutCoordinatesWork;
02202       cutCoordinatesWork = t;
02203       }
02204     }
02205 /*
02206     if(comm->getRank() == 0)
02207       if(me == 0) cout << "it:" << iteration << endl;
02208 */
02209     // Needed only if keep_cuts; otherwise can simply swap array pointers
02210     // cutCoordinates and cutCoordinatesWork.
02211     // (at first iteration, cutCoordinates == cutCoorindates_tmp).
02212     // computed cuts must be in cutCoordinates.
02213     if (cutCoordinates != cutCoordinates_tmp){
02214 #ifdef HAVE_ZOLTAN2_OMP
02215 #pragma omp for
02216 #endif
02217       for(partId_t i = 0; i < noCuts *concurrentPartCount; ++i){
02218         cutCoordinates[i] = cutCoordinates_tmp[i];
02219       }
02220 
02221 #ifdef HAVE_ZOLTAN2_OMP
02222 #pragma omp single
02223 #endif
02224       {
02225         cutCoordinatesWork = cutCoordinates_tmp;
02226       }
02227     }
02228 
02229   }
02230 }
02231 
02232 
02233 
02234 
02235 
02261 template <typename lno_t, typename scalar_t>
02262 void getChunksFromCoordinates(
02263     partId_t partNo,
02264     int noThreads,
02265 
02266     lno_t *partitionedPointPermutations,
02267     scalar_t *pqJagged_coordinates,
02268     bool pqJagged_uniformWeights,
02269     scalar_t *coordWeights,
02270     scalar_t *cutCoordinates,
02271     lno_t coordinateBegin,
02272     lno_t coordinateEnd,
02273 
02274     bool allowNonRectelinearPart,
02275     float *actual_ratios,
02276     scalar_t *localPartWeights,
02277     double **partWeights,
02278     float **nonRectelinearRatios,
02279 
02280     //lno_t *coordinate_linked_list,
02281     //lno_t **coordinate_starts,
02282     //lno_t **coordinate_ends,
02283     lno_t ** partPointCounts,
02284 
02285     lno_t *newpartitionedPointPermutations,
02286     lno_t *totalCounts,
02287     partId_t *partIds //,
02288     //bool useBinarySearch
02289     ){
02290 
02291   //lno_t numCoordsInPart =  coordinateEnd - coordinateBegin;
02292   partId_t noCuts = partNo - 1;
02293   //size_t total_part_count = noCuts + partNo;
02294   scalar_t _EPSILON = numeric_limits<scalar_t>::epsilon();
02295 
02296 #ifdef HAVE_ZOLTAN2_OMP
02297 #pragma omp parallel
02298 #endif
02299   {
02300     int me = 0;
02301 #ifdef HAVE_ZOLTAN2_OMP
02302     me = omp_get_thread_num();
02303 #endif
02304 
02305     //lno_t *myStarts = coordinate_starts[me];
02306     //lno_t *myEnds = coordinate_ends[me];
02307     lno_t *myPartPointCounts = partPointCounts[me];
02308     float *myRatios = NULL;
02309     if (allowNonRectelinearPart){
02310 
02311 
02312       myRatios = nonRectelinearRatios[me];
02313 #ifdef HAVE_ZOLTAN2_OMP
02314 #pragma omp for
02315 #endif
02316       for (partId_t i = 0; i < noCuts; ++i){
02317         float r = actual_ratios[i];
02318 
02319         //cout << "real i:" << i << " :" << r << " " << endl;
02320         scalar_t leftWeight = r * (localPartWeights[i * 2 + 1] - localPartWeights[i * 2]);
02321         for(int ii = 0; ii < noThreads; ++ii){
02322           if(leftWeight > _EPSILON){
02323 
02324             scalar_t ithWeight = partWeights[ii][i * 2 + 1] - partWeights[ii][i * 2 ];
02325             if(ithWeight < leftWeight){
02326               nonRectelinearRatios[ii][i] = ithWeight;
02327             }
02328             else {
02329               nonRectelinearRatios[ii][i] = leftWeight ;
02330             }
02331             leftWeight -= ithWeight;
02332           }
02333           else {
02334             nonRectelinearRatios[ii][i] = 0;
02335           }
02336         }
02337       }
02338 
02339 
02340       if(noCuts > 0){
02341         for (partId_t i = noCuts - 1; i > 0 ; --i){          if(ABS(cutCoordinates[i] - cutCoordinates[i -1]) < _EPSILON){
02342             myRatios[i] -= myRatios[i - 1] ;
02343           }
02344           //cout << "i:" << i << " :" << myRatios[i] << " ";
02345           myRatios[i] = int ((myRatios[i] + LEAST_SIGNIFICANCE) * SIGNIFICANCE_MUL) / scalar_t(SIGNIFICANCE_MUL);
02346         }
02347       }
02348       /*
02349 
02350       for (partId_t i = 0; i < noCuts; ++i){
02351       cout << "r i:" << i << " :" <<  myRatios[i] << " " << endl;
02352       }
02353        */
02354 
02355     }
02356 
02357     for(partId_t ii = 0; ii < partNo; ++ii){
02358       myPartPointCounts[ii] = 0;
02359     }
02360 
02361 
02362 #ifdef HAVE_ZOLTAN2_OMP
02363 #pragma omp barrier
02364 #pragma omp for
02365 #endif
02366       for (lno_t ii = coordinateBegin; ii < coordinateEnd; ++ii){
02367 
02368         lno_t i = partitionedPointPermutations[ii];
02369         partId_t pp = partIds[i];
02370         partId_t p = pp / 2;
02371         if(pp % 2 == 1){
02372           //cout << "on: " << pp << ":" << p << " cut:" << p << endl;
02373 
02374           if(allowNonRectelinearPart && myRatios[p] > _EPSILON * EPS_SCALE){
02375             //cout << "p:" << p << endl;
02376             scalar_t w = pqJagged_uniformWeights? 1:coordWeights[i];
02377             myRatios[p] -= w;
02378             if(myRatios[p] < 0 && p < noCuts - 1 && ABS(cutCoordinates[p+1] - cutCoordinates[p]) < _EPSILON){
02379               myRatios[p + 1] += myRatios[p];
02380             }
02381             ++myPartPointCounts[p];
02382             partIds[i] = p;
02383           }
02384           else{
02385             //scalar_t currentCut = cutCoordinates[p];
02386             //TODO:currently cannot divide 1 line more than 2 parts.
02387             //bug cannot be divided, therefore this part should change.
02388             //cout << "p:" << p+1 << endl;
02389             ++myPartPointCounts[p + 1];
02390             partIds[i] = p + 1;
02391           }
02392       }
02393       else {
02394         ++myPartPointCounts[p];
02395 
02396         partIds[i] = p;
02397       }
02398     }
02399 
02400 
02401 
02402 #ifdef HAVE_ZOLTAN2_OMP
02403 #pragma omp for
02404 #endif
02405     for(partId_t j = 0; j < partNo; ++j){
02406       lno_t pwj = 0;//, prev = 0;
02407       //if(j == 0) prev = 0;
02408       //else prev = totalCounts[j - 1];
02409       //lno_t prev2 = prev;
02410       for (int i = 0; i < noThreads; ++i){
02411         lno_t threadPartPointCount = partPointCounts[i][j];
02412         partPointCounts[i][j] = pwj;
02413         pwj += threadPartPointCount;
02414 
02415         }
02416       totalCounts[j] = pwj;// + prev2; //+ coordinateBegin;
02417     }
02418 
02419 #ifdef HAVE_ZOLTAN2_OMP
02420 #pragma omp single
02421 #endif
02422     {
02423       for(partId_t j = 1; j < partNo; ++j){
02424         totalCounts[j] += totalCounts[j - 1];
02425       }
02426     }
02427 
02428     for(partId_t j = 1; j < partNo; ++j){
02429       myPartPointCounts[j] += totalCounts[j - 1] ;
02430     }
02431 
02432 
02433 #ifdef HAVE_ZOLTAN2_OMP
02434 #pragma omp for
02435 #endif
02436     for (lno_t ii = coordinateBegin; ii < coordinateEnd; ++ii){
02437       lno_t i = partitionedPointPermutations[ii];
02438       partId_t p =  partIds[i];
02439       //cout << "p:" << p << " my:" << myPartPointCounts[p] << " begin:" << coordinateBegin << endl;
02440       newpartitionedPointPermutations[coordinateBegin + myPartPointCounts[p]++] = i;
02441     }
02442 
02443     /*
02444     pqJagged_PartVertices <lno_t, partId_t> pqPV;
02445     pqPV.set(coordinate_linked_list, myStarts, myEnds);
02446     memset(myPartPointCounts, 0, sizeof(lno_t) * partNo);
02447 
02448 
02449 #ifdef HAVE_ZOLTAN2_OMP
02450 #pragma omp for
02451 #endif
02452     for (lno_t i = coordinateBegin; i < coordinateEnd; ++i){
02453       int ii = partitionedPointPermutations[i];
02454       coordinate_linked_list[ii] = -1;
02455     }
02456 
02457     for (partId_t i = 0; i < partNo; ++i){
02458       myEnds[i] = -1;
02459       myStarts[i] = -1;
02460     }
02461 
02462 
02463     //determine part of each point
02464     if(1 ||useBinarySearch){
02465 
02466 #ifdef HAVE_ZOLTAN2_OMP
02467 #pragma omp for
02468 #endif
02469       for (lno_t ii = coordinateBegin; ii < coordinateEnd; ++ii){
02470 
02471         int i = partitionedPointPermutations[ii];
02472         partId_t lc = 0;
02473         partId_t uc = noCuts - 1;
02474 
02475         //bool isInserted = false;
02476         bool onLeft = false;
02477         bool onRight = false;
02478         partId_t lastPart = -1;
02479         while(uc >= lc)
02480         {
02481           lastPart = -1;
02482           onLeft = false;
02483           onRight = false;
02484           partId_t j = (uc + lc) / 2;
02485           scalar_t distance = pqJagged_coordinates[i] - cutCoordinates[j];
02486           scalar_t absdistance = ABS(distance);
02487 
02488           if(allowNonRectelinearPart  && absdistance < _EPSILON ){
02489             scalar_t w = pqJagged_uniformWeights? 1:coordWeights[i];
02490             partId_t jj = j -1;
02491             while(jj >= 0 && ABS(cutCoordinates[jj] - cutCoordinates[j]) < _EPSILON){
02492               --jj;
02493             }
02494             ++jj;
02495             for (;jj < noCuts && ABS(cutCoordinates[jj] - cutCoordinates[j]) < _EPSILON; ++jj){
02496               if(myRatios[jj] > _EPSILON * EPS_SCALE ){
02497                 myRatios[jj] -= w;
02498                 if(myRatios[jj] < 0 && jj < noCuts - 1 && ABS(cutCoordinates[jj+1] - cutCoordinates[jj]) < _EPSILON){
02499                   myRatios[jj + 1] += myRatios[jj];
02500                 }
02501                 onLeft = true;
02502                 lastPart = jj;
02503                 break;
02504               }
02505             }
02506             if(!onLeft){
02507               onRight= true;
02508               lastPart = jj -1;
02509             }
02510 
02511             break;
02512           }
02513           else {
02514             if (distance < 0) {
02515               uc = j - 1;
02516               onLeft = true;
02517               lastPart = j;
02518             }
02519             else {
02520               lc = j + 1;
02521               onRight = true;
02522               lastPart = j;
02523             }
02524           }
02525         }
02526         if(onRight){
02527           pqPV.inserToPart(lastPart + 1, i);
02528           myPartPointCounts[lastPart + 1] +=  1;
02529         }
02530         else if(onLeft){
02531           pqPV.inserToPart(lastPart , i);
02532           myPartPointCounts[lastPart] +=  1;
02533         }
02534       }
02535       for(partId_t i = 1; i < partNo; ++i){
02536         myPartPointCounts[i] += myPartPointCounts[i-1];
02537       }
02538     } else {
02539 #ifdef HAVE_ZOLTAN2_OMP
02540 #pragma omp for
02541 #endif
02542       for (lno_t ii = coordinateBegin; ii < coordinateEnd; ++ii){
02543 
02544         lno_t i = partitionedPointPermutations[ii];
02545 
02546         bool inserted = false;
02547         for(partId_t j = 0; j < noCuts; ++j){
02548           scalar_t distance = pqJagged_coordinates[i] - cutCoordinates[j];
02549           scalar_t absdistance = ABS(distance);
02550 
02551           if (allowNonRectelinearPart && myRatios[j] > _EPSILON * EPS_SCALE && absdistance < _EPSILON ){
02552             scalar_t w = pqJagged_uniformWeights? 1:coordWeights[i];
02553             scalar_t decrease = w;
02554             myRatios[j] -= decrease;
02555 
02556             if(myRatios[j] < 0 && j < noCuts - 1 && ABS(cutCoordinates[j+1] - cutCoordinates[j]) < _EPSILON){
02557               myRatios[j + 1] += myRatios[j];
02558             }
02559             inserted = true;
02560             pqPV.inserToPart(j, i);
02561             break;
02562           }
02563           else if (distance < 0 && absdistance > _EPSILON){
02564             pqPV.inserToPart(j, i);
02565             inserted = true;
02566             break;
02567           }
02568           else {
02569             myPartPointCounts[j] -=  1;
02570           }
02571 
02572         }
02573         if(!inserted){
02574           pqPV.inserToPart(noCuts, i);
02575         }
02576       }
02577     }
02578     //accumulate the starts of each thread.
02579 
02580 #ifdef HAVE_ZOLTAN2_OMP
02581 #pragma omp for
02582 #endif
02583     for(partId_t i = 0; i < partNo; ++i){
02584       int head = 0, tail = 0;
02585       while(head < noThreads && coordinate_ends[head][i] == -1){
02586         ++head;
02587       }
02588       int firstHead = head;
02589       for(int j = head; j < noThreads; ){
02590         tail = j+1;
02591         bool foundTail = false;
02592         while(tail < noThreads){
02593           if(coordinate_starts[tail][i] == -1){
02594             ++tail;
02595           }
02596           else {
02597             foundTail = true;
02598             break;
02599           }
02600         }
02601         if(foundTail){
02602           coordinate_linked_list[coordinate_ends[head][i]] = coordinate_starts[tail][i];
02603           head = tail;
02604         }
02605         j = tail;
02606 
02607       }
02608       coordinate_starts[0][i] = firstHead >= noThreads ? -1:coordinate_starts[firstHead][i];
02609     }
02610 
02611 
02612     if(1 || useBinarySearch){
02613       //accumulate the count.
02614 #ifdef HAVE_ZOLTAN2_OMP
02615 #pragma omp for
02616 #endif
02617       for(partId_t j = 0; j < partNo; ++j){
02618         lno_t pwj = 0;
02619         for (int i = 0; i < noThreads; ++i){
02620           pwj += partPointCounts[i][j];
02621         }
02622         totalCounts[j] = pwj;
02623       }
02624     } else {
02625 #ifdef HAVE_ZOLTAN2_OMP
02626 #pragma omp for
02627 #endif
02628       for(partId_t j = 0; j < partNo; ++j){
02629         lno_t pwj = 0;
02630         for (int i = 0; i < noThreads; ++i){
02631           pwj += partPointCounts[i][j];
02632         }
02633         totalCounts[j] = pwj + numCoordsInPart;
02634       }
02635     }
02636 */
02637 
02638 /*
02639 #ifdef HAVE_ZOLTAN2_OMP
02640 #pragma omp for
02641 #endif
02642     for(partId_t i = 0; i < partNo; ++i){
02643       lno_t nextPoint = coordinate_starts[0][i];
02644       lno_t pcnt = 0;
02645 
02646       lno_t prevCount = coordinateBegin;
02647       if (i > 0) prevCount = totalCounts[i -1] + coordinateBegin;
02648 
02649       while(nextPoint != -1){
02650         newpartitionedPointPermutations[prevCount + pcnt++] = nextPoint;
02651         nextPoint = coordinate_linked_list[nextPoint];
02652       }
02653     }
02654     */
02655   }
02656 }
02657 
02658 
02659 template <typename T>
02660 T *allocMemory(size_t size){
02661   if (size > 0){
02662     T * a = new T[size];
02663     if (a == NULL) {
02664       throw  "cannot allocate memory";
02665     }
02666     return a;
02667   }
02668   else {
02669     return NULL;
02670   }
02671 }
02672 
02673 template <typename T>
02674 void freeArray(T *&array){
02675   if(array != NULL){
02676     delete [] array;
02677     array = NULL;
02678   }
02679 }
02680 
02681 template <typename tt>
02682 std::string toString(tt obj){
02683   std::stringstream ss (std::stringstream::in |std::stringstream::out);
02684   ss << obj;
02685   std::string tmp = "";
02686   ss >> tmp;
02687   return tmp;
02688 }
02689 
02690 
02691 
02692 
02703 template <typename Adapter>
02704 void AlgPQJagged(
02705     const RCP<const Environment> &env,
02706     RCP<Comm<int> > &comm,
02707     const RCP<const CoordinateModel<
02708     typename Adapter::base_adapter_t> > &coords,
02709     RCP<PartitioningSolution<Adapter> > &solution
02710 )
02711 {
02712 #ifndef INCLUDE_ZOLTAN2_EXPERIMENTAL
02713 
02714   Z2_THROW_EXPERIMENTAL("Zoltan2 PQJagged is strictly experimental software "
02715                         "while it is being developed and tested.")
02716 
02717 #else
02718 /*
02719  *   typedef typename Adapter::scalar_t scalar_t;
02720  *     typedef typename Adapter::gno_t gno_t;
02721  *       typedef typename Adapter::lno_t lno_t;
02722   if(comm->getRank() == 0){
02723     cout << "size of gno:" << sizeof(gno_t) << endl;
02724     cout << "size of lno:" << sizeof(lno_t) << endl;
02725     cout << "size of scalar_t:" << sizeof(scalar_t) << endl;
02726   }
02727  */
02728   env->timerStart(MACRO_TIMERS, "PQJagged Total");
02729 
02730 
02731   env->timerStart(MACRO_TIMERS, "PQJagged Problem_Init");
02732   typedef typename Adapter::scalar_t scalar_t;
02733   typedef typename Adapter::gno_t gno_t;
02734 
02735   typedef typename Adapter::lno_t lno_t;
02736   const Teuchos::ParameterList &pl = env->getParameters();
02737 
02738   std::bitset<NUM_RCB_PARAMS> params;
02739   int numTestCuts = 5;
02740   scalar_t imbalanceTolerance;
02741 
02742   multiCriteriaNorm mcnorm;
02743   bool ignoreWeights=false;
02744 
02745   bool allowNonRectelinearPart = false;
02746   int concurrentPartCount = 0;
02747   bool force_binary = false, force_linear = false;
02748   pqJagged_getParameters<scalar_t>(pl, imbalanceTolerance, mcnorm, params, numTestCuts, ignoreWeights,allowNonRectelinearPart,  concurrentPartCount,
02749       force_binary, force_linear);
02750 
02751   int coordDim, weightDim; size_t nlc; global_size_t gnc; int criteriaDim;
02752   pqJagged_getCoordinateValues<Adapter>( coords, coordDim, weightDim, nlc, gnc, criteriaDim, ignoreWeights);
02753   size_t numLocalCoords = nlc;
02754 
02755   //allocate only two dimensional pointer.
02756   //raw pointer addresess will be obtained from multivector.
02757   scalar_t **pqJagged_coordinates = allocMemory<scalar_t *>(coordDim);
02758   scalar_t **pqJagged_weights = allocMemory<scalar_t *>(criteriaDim);
02759   bool *pqJagged_uniformParts = allocMemory< bool >(criteriaDim); //if the partitioning results wanted to be uniform.
02760   scalar_t **pqJagged_partSizes =  allocMemory<scalar_t *>(criteriaDim); //if in a criteria dimension, uniform part is false this shows ratios of the target part weights.
02761   bool *pqJagged_uniformWeights = allocMemory< bool >(criteriaDim); //if the weights of coordinates are uniform in a criteria dimension.
02762 
02763   ArrayView<const gno_t> pqJagged_gnos;
02764   size_t numGlobalParts;
02765   int pqJagged_multiVectorDim;
02766 
02767   pqJagged_getInputValues<Adapter, scalar_t, gno_t>(
02768       env, coords, solution,params,coordDim,weightDim,numLocalCoords,
02769       numGlobalParts, pqJagged_multiVectorDim,
02770       pqJagged_coordinates,criteriaDim, pqJagged_weights,pqJagged_gnos, ignoreWeights,
02771       pqJagged_uniformWeights, pqJagged_uniformParts, pqJagged_partSizes
02772   );
02773 
02774   int numThreads = pqJagged_getNumThreads();
02775 
02776   partId_t totalDimensionCut = 0;
02777   partId_t totalPartCount = 1;
02778   partId_t maxPartNo = 0;
02779 
02780   const partId_t *partNo = pl.getPtr<Array <partId_t> >("pqParts")->getRawPtr();
02781   int partArraySize = pl.getPtr<Array <partId_t> >("pqParts")->size() - 1;
02782 
02783   for (int i = 0; i < partArraySize; ++i){
02784     totalPartCount *= partNo[i];
02785     if(partNo[i] > maxPartNo) maxPartNo = partNo[i];
02786   }
02787   totalDimensionCut = totalPartCount - 1;
02788   partId_t maxCutNo = maxPartNo - 1;
02789   partId_t maxTotalCumulativePartCount = totalPartCount / partNo[partArraySize - 1];
02790   size_t maxTotalPartCount = maxPartNo + size_t(maxCutNo);
02791   //maxPartNo is P, maxCutNo = P-1, matTotalPartcount = 2P-1
02792 
02793   if(concurrentPartCount > maxTotalCumulativePartCount){
02794     if(comm->getRank() == 0){
02795       cerr << "Warning: Concurrent part calculation count ("<< concurrentPartCount << ") has been set bigger than maximum amount that can be used." << " Setting to:" << maxTotalCumulativePartCount << "." << endl;
02796     }
02797     concurrentPartCount = maxTotalCumulativePartCount;
02798   }
02799 
02800   // coordinates of the cut lines. First one is the min, last one is max coordinate.
02801   // kddnote if (keep_cuts)
02802   // coordinates of the cut lines.
02803   //only store this much if cuts are needed to be stored.
02804   scalar_t *allCutCoordinates = allocMemory< scalar_t>(totalDimensionCut);
02805   // kddnote else
02806   //scalar_t *allCutCoordinates = allocMemory< scalar_t>(maxCutNo * concurrentPartCount);
02807 
02808   //as input indices.
02809   lno_t *partitionedPointCoordinates =  allocMemory< lno_t>(numLocalCoords);
02810   //as output indices
02811   lno_t *newpartitionedPointCoordinates = allocMemory< lno_t>(numLocalCoords);
02812   scalar_t *max_min_array =  allocMemory< scalar_t>(numThreads * 2);
02813 
02814   //initial configuration
02815   //set each pointer-i to i.
02816 #ifdef HAVE_ZOLTAN2_OMP
02817 #pragma omp parallel for
02818 #endif
02819   for(size_t i = 0; i < numLocalCoords; ++i){
02820     partitionedPointCoordinates[i] = i;
02821   }
02822 
02823 #ifdef HAVE_ZOLTAN2_OMP
02824 #ifdef FIRST_TOUCH
02825   firstTouch<lno_t>(newpartitionedPointCoordinates, numLocalCoords);
02826 #endif
02827 #endif
02828 
02829   //initially there is a single partition
02830   lno_t currentPartitionCount = 1;
02831   //single partition starts at index-0, and ends at numLocalCoords
02832 
02833   //inTotalCounts array holds the end points in partitionedPointCoordinates array
02834   //for each partition. Initially sized 1, and single element is set to numLocalCoords.
02835   lno_t *inTotalCounts = allocMemory<lno_t>(1);
02836   inTotalCounts[0] = static_cast<lno_t>(numLocalCoords);//the end of the initial partition is the end of coordinates.
02837 
02838   //the ends points of the output.
02839   lno_t *outTotalCounts = NULL;
02840 
02841   //the array holding the points in each part as linked list
02842   //lno_t *coordinate_linked_list = allocMemory<lno_t>(numLocalCoords);
02843   //the start and end coordinate of  each part.
02844   //lno_t **coordinate_starts =  allocMemory<lno_t *>(numThreads);
02845   //lno_t **coordinate_ends = allocMemory<lno_t *>(numThreads);
02846 
02847   //assign the max size to starts, as it will be reused.
02848   /*
02849   for(int i = 0; i < numThreads; ++i){
02850     coordinate_starts[i] = allocMemory<lno_t>(maxPartNo);
02851     coordinate_ends[i] = allocMemory<lno_t>(maxPartNo);
02852   }
02853    */
02854 
02855 
02856 #ifdef HAVE_ZOLTAN2_OMP
02857 #ifdef FIRST_TOUCH
02858 #pragma omp parallel
02859   {
02860     int me = omp_get_thread_num();
02861     for(partId_t ii = 0; ii < maxPartNo; ++ii){
02862       coordinate_starts[me][ii] = 0;
02863       coordinate_ends[me][ii] = 0;
02864     }
02865   }
02866 #endif
02867 #endif
02868 
02869 
02870 
02871 
02872   float *nonRectelinearPart = NULL; //how much weight percentage should a MPI put left side of the each cutline
02873   float **nonRectRatios = NULL; //how much weight percentage should each thread in MPI put left side of the each cutline
02874 
02875   if(allowNonRectelinearPart){
02876     //cout << "allowing" << endl;
02877     nonRectelinearPart = allocMemory<float>(maxCutNo * concurrentPartCount);
02878 #ifdef HAVE_ZOLTAN2_OMP
02879 #ifdef FIRST_TOUCH
02880     firstTouch<float>(nonRectelinearPart, maxCutNo);
02881 #endif
02882 #endif
02883 
02884     nonRectRatios = allocMemory<float *>(numThreads);
02885 
02886     for(int i = 0; i < numThreads; ++i){
02887       nonRectRatios[i] = allocMemory<float>(maxCutNo);
02888     }
02889 #ifdef HAVE_ZOLTAN2_OMP
02890 #ifdef FIRST_TOUCH
02891 #pragma omp parallel
02892     {
02893       int me = omp_get_thread_num();
02894       for(partId_t ii = 0; ii < maxCutNo; ++ii){
02895         nonRectRatios[me][ii] = 0;
02896       }
02897     }
02898 #endif
02899 #endif
02900 
02901   }
02902 
02903   // work array to manipulate coordinate of cutlines in different iterations.
02904   //necessary because previous cut line information is used for determining the next cutline information.
02905   //therefore, cannot update the cut work array until all cutlines are determined.
02906   scalar_t *cutCoordinatesWork = allocMemory<scalar_t>(maxCutNo * concurrentPartCount);
02907 #ifdef HAVE_ZOLTAN2_OMP
02908 #ifdef FIRST_TOUCH
02909   firstTouch<scalar_t>(cutCoordinatesWork, maxCutNo);
02910 #endif
02911 #endif
02912 
02913   //cumulative part weight ratio array.
02914   scalar_t *targetPartWeightRatios = allocMemory<scalar_t>(maxPartNo * concurrentPartCount); // the weight ratios at left side of the cuts. First is 0, last is 1.
02915 #ifdef HAVE_ZOLTAN2_OMP
02916 #ifdef FIRST_TOUCH
02917   firstTouch<scalar_t>(cutPartRatios, maxCutNo);
02918 #endif
02919 #endif
02920 
02921 
02922   scalar_t *cutUpperBounds = allocMemory<scalar_t>(maxCutNo * concurrentPartCount);  //upper bound coordinate of a cut line
02923   scalar_t *cutLowerBounds = allocMemory<scalar_t>(maxCutNo* concurrentPartCount);  //lower bound coordinate of a cut line
02924   scalar_t *cutLowerWeight = allocMemory<scalar_t>(maxCutNo* concurrentPartCount);  //lower bound weight of a cut line
02925   scalar_t *cutUpperWeight = allocMemory<scalar_t>(maxCutNo* concurrentPartCount);  //upper bound weight of a cut line
02926 
02927   scalar_t *localMinMaxTotal = allocMemory<scalar_t>(3 * concurrentPartCount); //combined array to exchange the min and max coordinate, and total weight of part.
02928   scalar_t *globalMinMaxTotal = allocMemory<scalar_t>(3 * concurrentPartCount);//global combined array with the results for min, max and total weight.
02929 
02930   //isDone is used to determine if a cutline is determined already.
02931   //If a cut line is already determined, the next iterations will skip this cut line.
02932   bool *isDone = allocMemory<bool>(maxCutNo * concurrentPartCount);
02933   //myNonDoneCount count holds the number of cutlines that have not been finalized for each part
02934   //when concurrentPartCount>1, using this information, if myNonDoneCount[x]==0, then no work is done for this part.
02935   partId_t *myNonDoneCount =  allocMemory<partId_t>(concurrentPartCount);
02936   //local part weights of each thread.
02937   double **partWeights = allocMemory<double *>(numThreads);
02938   //the work manupulation array for partweights.
02939   double **pws = allocMemory<double *>(numThreads);
02940 
02941   //leftClosesDistance to hold the min distance of a coordinate to a cutline from left (for each thread).
02942   scalar_t **leftClosestDistance = allocMemory<scalar_t *>(numThreads);
02943   //leftClosesDistance to hold the min distance of a coordinate to a cutline from right (for each thread)
02944   scalar_t **rightClosestDistance = allocMemory<scalar_t *>(numThreads);
02945 
02946   //to store how many points in each part a thread has.
02947   lno_t **partPointCounts = allocMemory<lno_t *>(numThreads);
02948 
02949   for(int i = 0; i < numThreads; ++i){
02950     //partWeights[i] = allocMemory<scalar_t>(maxTotalPartCount);
02951     partWeights[i] = allocMemory < double >(maxTotalPartCount * concurrentPartCount);
02952     rightClosestDistance[i] = allocMemory<scalar_t>(maxCutNo * concurrentPartCount);
02953     leftClosestDistance[i] = allocMemory<scalar_t>(maxCutNo * concurrentPartCount);
02954     partPointCounts[i] =  allocMemory<lno_t>(maxPartNo);
02955   }
02956 #ifdef HAVE_ZOLTAN2_OMP
02957 #ifdef FIRST_TOUCH
02958 #pragma omp parallel
02959   {
02960     int me = omp_get_thread_num();
02961     for(partId_t ii = 0; ii < maxPartNo; ++ii){
02962       partPointCounts[me][ii] = 0;
02963     }
02964     for(size_t ii = 0; ii < maxTotalPartCount; ++ii){
02965       partWeights[me][ii] = 0;
02966     }
02967     for(partId_t ii = 0; ii < maxCutNo; ++ii){
02968       rightClosestDistance[me][ii] = 0;
02969       leftClosestDistance[me][ii] = 0;
02970 
02971     }
02972   }
02973 #endif
02974 #endif
02975   // kddnote  Needed only when non-rectilinear parts.
02976   scalar_t *cutWeights = allocMemory<scalar_t>(maxCutNo);
02977   scalar_t *globalCutWeights = allocMemory<scalar_t>(maxCutNo);
02978 
02979   //for faster communication, concatanation of
02980   //totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines
02981   //leftClosest distances sized P-1, since P-1 cut lines
02982   //rightClosest distances size P-1, since P-1 cut lines.
02983   scalar_t *totalPartWeights_leftClosests_rightClosests = allocMemory<scalar_t>((maxTotalPartCount + maxCutNo * 2) * concurrentPartCount);
02984   scalar_t *global_totalPartWeights_leftClosests_rightClosests = allocMemory<scalar_t>((maxTotalPartCount + maxCutNo * 2) * concurrentPartCount);
02985 
02986   partId_t *partIds = NULL;
02987   ArrayRCP<partId_t> partId;
02988   if(numLocalCoords > 0){
02989     partIds = allocMemory<partId_t>(numLocalCoords);
02990     partId = arcp(partIds, 0, numLocalCoords, true);
02991   }
02992 
02993   scalar_t *cutCoordinates =  allCutCoordinates;
02994 
02995   //partId_t leftPartitions = totalPartCount;
02996   scalar_t maxScalar_t = numeric_limits<scalar_t>::max();
02997   scalar_t minScalar_t = -numeric_limits<scalar_t>::max();
02998 
02999   env->timerStop(MACRO_TIMERS, "PQJagged Problem_Init");
03000 
03001   env->timerStart(MACRO_TIMERS, "PQJagged Problem_Partitioning");
03002 
03003 
03004   //int myRank = comm->getRank();
03005   //int worldSize = comm->getSize();
03006 
03007   scalar_t _EPSILON = numeric_limits<scalar_t>::epsilon();
03008   for (int i = 0; i < partArraySize; ++i){
03009     if(partNo[i] == 1) continue;
03010     int coordInd = i % coordDim;
03011     string istring = toString<int>(i);
03012 
03013     env->timerStart(MACRO_TIMERS, "PQJagged Problem_Partitioning_" + istring);
03014     outTotalCounts = allocMemory<lno_t>(currentPartitionCount * partNo[i]);
03015 
03016     //the index where in the outtotalCounts will be written.
03017     partId_t currentOut = 0;
03018     //whatever is written to outTotalCounts will be added with previousEnd so that the points will be shifted.
03019     partId_t previousEnd = 0;
03020     scalar_t * pqCoord = pqJagged_coordinates[coordInd];
03021 
03022     partId_t currentWorkPart = 0;
03023     partId_t concurrentPart = min(currentPartitionCount - currentWorkPart, concurrentPartCount);
03024     bool useBinarySearch = false;
03025     if(partNo[i] > BINARYCUTOFF){
03026       useBinarySearch = true;
03027     }
03028     if(force_binary){
03029       useBinarySearch = true;
03030     }
03031     if (force_linear){
03032       useBinarySearch = false;
03033     }
03034 
03035 //    size_t total_part_count = partNo[i] * 2 + 1;
03036 
03037     //run for all available parts.
03038     for (; currentWorkPart < currentPartitionCount; currentWorkPart += concurrentPart){
03039 
03040       concurrentPart = min(currentPartitionCount - currentWorkPart, concurrentPartCount);
03041 #ifdef mpi_communication
03042       concurrent = concurrentPart;
03043 #endif
03044       /*
03045       if(myRank == 0)
03046         cout << "i: " << i << " j:" << currentWorkPart << " ";
03047        */
03048       //scalar_t used_imbalance = imbalanceTolerance * (LEAF_IMBALANCE_FACTOR + (1 - LEAF_IMBALANCE_FACTOR)   / leftPartitions) * 0.7;
03049       scalar_t used_imbalance = 0;
03050       //env->timerStart(MACRO_TIMERS, "PQJagged Problem_Partitioning_" + istring +"_min_max");
03051 
03052       for(int kk = 0; kk < concurrentPart; ++kk){
03053 
03054         partId_t currentPart = currentWorkPart + kk;
03055         lno_t coordinateEnd= inTotalCounts[currentPart];
03056         lno_t coordinateBegin = currentPart==0 ? 0: inTotalCounts[currentPart -1];
03057 
03058         pqJagged_getLocalMinMaxTotalCoord<scalar_t, lno_t>(
03059             partitionedPointCoordinates,
03060             pqCoord,
03061             pqJagged_uniformWeights,
03062             pqJagged_weights[0],
03063             numThreads,
03064             coordinateBegin,
03065             coordinateEnd,
03066             max_min_array,
03067             maxScalar_t,
03068             minScalar_t,
03069             localMinMaxTotal[kk], //min coordinate
03070             localMinMaxTotal[kk + concurrentPart], //max coordinate
03071             localMinMaxTotal[kk + 2*concurrentPart] //total weight);
03072                           );
03073       }
03074       pqJagged_getGlobalMinMaxTotalCoord<scalar_t>(comm,env, concurrentPart, localMinMaxTotal, globalMinMaxTotal);
03075 
03076       //env->timerStop(MACRO_TIMERS, "PQJagged Problem_Partitioning_" + istring +"_min_max");
03077 
03078 
03079 
03080 
03081       partId_t allDone = 0;
03082       // Compute weight ratios for parts & cuts:
03083       //  e.g., 0.25  0.25  0.5    0.5  0.75 0.75  1
03084       //       part0  cut0  part1 cut1 part2 cut2 part3
03085       for(int kk = 0; kk < concurrentPart; ++kk){
03086         scalar_t minCoordinate = globalMinMaxTotal[kk];
03087         scalar_t maxCoordinate = globalMinMaxTotal[kk + concurrentPart];
03088         scalar_t *usedCutCoordinate = cutCoordinates + (partNo[i] - 1) * kk;
03089         scalar_t *usedCutPartRatios = targetPartWeightRatios + (partNo[i]) * kk;
03090 
03091         if(minCoordinate <= maxCoordinate){
03092           allDone += partNo[i] - 1;
03093           myNonDoneCount[kk] = partNo[i] - 1;
03094           //env->timerStart(MACRO_TIMERS, "PQJagged Problem_Partitioning_" + istring + "_cut_coord");
03095           pqJagged_getCutCoord_Weights<scalar_t>(
03096               minCoordinate, maxCoordinate,
03097               pqJagged_uniformParts[0], pqJagged_partSizes[0], partNo[i] - 1,
03098               usedCutCoordinate, usedCutPartRatios, numThreads
03099           );
03100           //env->timerStop(MACRO_TIMERS, "PQJagged Problem_Partitioning_" + istring + "_cut_coord");
03101           scalar_t coordinate_range = maxCoordinate - minCoordinate;
03102 
03103           partId_t currentPart = currentWorkPart + kk;
03104           lno_t coordinateEnd= inTotalCounts[currentPart];
03105           lno_t coordinateBegin = currentPart==0 ? 0: inTotalCounts[currentPart -1];
03106 
03107           if(ABS(coordinate_range) < _EPSILON ){
03108             for(lno_t ii = coordinateBegin; ii < coordinateEnd; ++ii){
03109               partIds[partitionedPointCoordinates[ii]] = 0;
03110             }
03111           }
03112           else{
03113 
03114             scalar_t slice = coordinate_range / partNo[i];
03115 
03116 #ifdef HAVE_ZOLTAN2_OMP
03117 #pragma omp parallel for
03118 #endif
03119             for(lno_t ii = coordinateBegin; ii < coordinateEnd; ++ii){
03120 
03121               lno_t iii = partitionedPointCoordinates[ii];
03122               partId_t pp = partId_t((pqCoord[iii] - minCoordinate) / slice);
03123               //if( pp >= partNo[iii])
03124               //{
03125               //  partIds[iii] = 0;
03126               //} else {
03127               //  partIds[iii] = total_part_count - 2 * pp;
03128               //}
03129               partIds[iii] = 2 * pp;
03130             }
03131           }
03132         }
03133         else {
03134           // e.g., if have fewer coordinates than parts, don't need to do next dim.
03135           myNonDoneCount[kk] = 0;
03136         }
03137       }
03138 
03139       //cout << "partId:" << partIds[0] << endl;
03140       //env->timerStart(MACRO_TIMERS, "PQJagged Problem_Partitioning_" + istring + "_1d");
03141       // Determine cut lines for k parts here.
03142       pqJagged_1D_Partition<scalar_t, lno_t>(
03143           env, comm,
03144           partitionedPointCoordinates,
03145           pqCoord, pqJagged_uniformWeights[0], pqJagged_weights[0],
03146           targetPartWeightRatios,
03147           globalMinMaxTotal,localMinMaxTotal,
03148           partNo[i], numThreads,
03149           maxScalar_t,
03150           minScalar_t,
03151           used_imbalance,
03152           currentWorkPart,
03153           concurrentPart,
03154           inTotalCounts,
03155           cutCoordinates,
03156           cutCoordinatesWork,
03157           leftClosestDistance,
03158           rightClosestDistance,
03159           cutUpperBounds,
03160           cutLowerBounds,
03161           cutUpperWeight,
03162           cutLowerWeight,
03163           isDone,
03164           partWeights,
03165           totalPartWeights_leftClosests_rightClosests,
03166           global_totalPartWeights_leftClosests_rightClosests,
03167           allowNonRectelinearPart,
03168           nonRectelinearPart,
03169           cutWeights,
03170           globalCutWeights,
03171           allDone,
03172           myNonDoneCount,
03173           useBinarySearch, // istring,
03174           partIds
03175           );
03176 
03177 
03178       //env->timerStop(MACRO_TIMERS, "PQJagged Problem_Partitioning_" + istring + "_1d");
03179 
03180       //partId_t pEnd = currentOut + partNo[i] * concurrentPart;
03181       //for (partId_t ii = currentOut; ii < pEnd; ++ii){
03182       //  outTotalCounts[ii] = 0;
03183       //}
03184 
03185       //env->timerStart(MACRO_TIMERS, "PQJagged Problem_Partitioning_" + istring + "_chunks");
03186       for(int kk = 0; kk < concurrentPart; ++kk){
03187 
03188         if(globalMinMaxTotal[kk] > globalMinMaxTotal[kk + concurrentPart]) {
03189           for(partId_t jj = 0; jj < partNo[i]; ++jj){
03190             outTotalCounts[currentOut + kk * (partNo[i]) + jj] = 0;
03191           }
03192 
03193           continue;
03194         }
03195         partId_t curr = currentWorkPart + kk;
03196         lno_t coordinateEnd= inTotalCounts[curr];
03197         lno_t coordinateBegin = curr==0 ? 0: inTotalCounts[curr -1];
03198         partId_t cutShift = (partNo[i] - 1) * kk;
03199         scalar_t *usedCutCoordinate = cutCoordinates + cutShift;
03200         float *usednonRectelinearPart = nonRectelinearPart + cutShift;
03201 
03202         scalar_t *tlr =  totalPartWeights_leftClosests_rightClosests + (4 *(partNo[i] - 1) + 1) * kk;
03203 
03204         for(int ii = 0; ii < numThreads; ++ii){
03205           pws[ii] = partWeights[ii] +  (2 * (partNo[i] - 1) + 1) * kk;
03206         }
03207 
03208         // Rewrite the indices based on the computed cuts.
03209         getChunksFromCoordinates<lno_t,scalar_t>(
03210             partNo[i],
03211             numThreads,
03212 
03213             partitionedPointCoordinates,
03214             pqCoord,
03215             pqJagged_uniformWeights[0],
03216             pqJagged_weights[0],
03217 
03218             usedCutCoordinate,
03219             coordinateBegin,
03220             coordinateEnd,
03221 
03222             allowNonRectelinearPart,
03223             usednonRectelinearPart,
03224             tlr,
03225             pws,
03226             nonRectRatios,
03227 
03228             //coordinate_linked_list,
03229             //coordinate_starts,
03230             //coordinate_ends,
03231             partPointCounts,
03232 
03233             newpartitionedPointCoordinates,
03234             outTotalCounts + currentOut + kk * (partNo[i]),partIds//,
03235             //useBinarySearch
03236         );
03237       }
03238 
03239       //env->timerStop(MACRO_TIMERS, "PQJagged Problem_Partitioning_" + istring + "_chunks");
03240       cutCoordinates += (partNo[i] - 1) * concurrentPart;
03241 
03242       for(partId_t kk = 0; kk < concurrentPart; ++kk){
03243 
03244 
03245         for (partId_t ii = 0;ii < partNo[i] ; ++ii){
03246           outTotalCounts[currentOut+ii] += previousEnd;
03247           //cout << "out:" << outTotalCounts[currentOut+ii] << " prev:" << previousEnd<< endl;
03248         }
03249         previousEnd = outTotalCounts[currentOut + partNo[i] - 1];
03250 
03251         currentOut += partNo[i] ;
03252       }
03253 /*
03254       if(myRank == 0)
03255         cout << endl;
03256 */
03257     } // end of this partitioning dimension
03258 
03259     // swap the indices' memory
03260     lno_t * tmp = partitionedPointCoordinates;
03261     partitionedPointCoordinates = newpartitionedPointCoordinates;
03262     newpartitionedPointCoordinates = tmp;
03263 
03264     currentPartitionCount *= partNo[i];
03265     freeArray<lno_t>(inTotalCounts);
03266     inTotalCounts = outTotalCounts;
03267     env->timerStop(MACRO_TIMERS, "PQJagged Problem_Partitioning_" + istring);
03268   } // Partitioning is done
03269 
03270   env->timerStop(MACRO_TIMERS, "PQJagged Problem_Partitioning");
03271 
03272   //cout << "biter" << endl;
03273   env->timerStart(MACRO_TIMERS, "PQJagged Part_Assignment");
03274 /*
03275   partId_t *partIds = NULL;
03276   ArrayRCP<partId_t> partId;
03277   if(numLocalCoords > 0){
03278     partIds = allocMemory<partId_t>(numLocalCoords);
03279     partId = arcp(partIds, 0, numLocalCoords, true);
03280   }
03281 */
03282 
03283 #ifdef HAVE_ZOLTAN2_OMP
03284 #pragma omp parallel for
03285 #endif
03286   for(partId_t i = 0; i < totalPartCount;++i){
03287     lno_t begin = 0;
03288     lno_t end = inTotalCounts[i];
03289     if(i > 0) begin = inTotalCounts[i -1];
03290     //cout << "part:" << i << " begin:" << begin << " end:" << end << " count:" << end - begin << endl;
03291     /*
03292 #ifdef HAVE_ZOLTAN2_OMP
03293 #pragma omp for
03294 #endif
03295      */
03296     for (lno_t ii = begin; ii < end; ++ii){
03297 
03298       lno_t k = partitionedPointCoordinates[ii];
03299       partIds[k] = i;
03300 
03301       /*
03302       cout << "part of coordinate:";
03303       for(int iii = 0; iii < coordDim; ++iii){
03304         cout <<  pqJagged_coordinates[iii][k] << " ";
03305       }
03306       cout << i;
03307       cout << endl;
03308       */
03309 
03310     }
03311   }
03312   env->timerStop(MACRO_TIMERS, "PQJagged Part_Assignment");
03313   ArrayRCP<const gno_t> gnoList;
03314   if(numLocalCoords > 0){
03315     gnoList = arcpFromArrayView(pqJagged_gnos);
03316   }
03317 
03318   env->timerStart(MACRO_TIMERS, "PQJagged Solution_Part_Assignment");
03319   solution->setParts(gnoList, partId, true);
03320   env->timerStop(MACRO_TIMERS, "PQJagged Solution_Part_Assignment");
03321 
03322 
03323   env->timerStart(MACRO_TIMERS, "PQJagged Problem_Free");
03324 
03325 /*
03326   if(comm->getRank() == 0){
03327     for(partId_t i = 0; i < totalPartCount - 1;++i){
03328       cout << "i:"<< i<<" cut coordinate:" << allCutCoordinates[i] << endl;
03329     }
03330   }
03331 */
03332 
03333 
03334   for(int i = 0; i < numThreads; ++i){
03335 //    freeArray<lno_t>(coordinate_starts[i]);
03336 //    freeArray<lno_t>(coordinate_ends[i]);
03337     freeArray<lno_t>(partPointCounts[i]);
03338   }
03339 
03340   freeArray<lno_t *>(partPointCounts);
03341   //freeArray<lno_t>(coordinate_linked_list);
03342   //freeArray<lno_t *>(coordinate_starts);
03343   //freeArray<lno_t *>(coordinate_ends);
03344   freeArray<double *> (pws);
03345 
03346   if(allowNonRectelinearPart){
03347     freeArray<float>(nonRectelinearPart);
03348     for(int i = 0; i < numThreads; ++i){
03349       freeArray<float>(nonRectRatios[i]);
03350     }
03351     freeArray<float *>(nonRectRatios);
03352   }
03353 
03354   freeArray<partId_t>(myNonDoneCount);
03355   freeArray<scalar_t>(cutWeights);
03356   freeArray<scalar_t>(globalCutWeights);
03357   freeArray<scalar_t>(max_min_array);
03358   freeArray<lno_t>(outTotalCounts);
03359   freeArray<lno_t>(partitionedPointCoordinates);
03360   freeArray<lno_t>(newpartitionedPointCoordinates);
03361   freeArray<scalar_t>(allCutCoordinates);
03362   freeArray<scalar_t *>(pqJagged_coordinates);
03363   freeArray<scalar_t *>(pqJagged_weights);
03364   freeArray<bool>(pqJagged_uniformParts);
03365   freeArray<scalar_t> (localMinMaxTotal);
03366   freeArray<scalar_t> (globalMinMaxTotal);
03367   freeArray<scalar_t *>(pqJagged_partSizes);
03368   freeArray<bool>(pqJagged_uniformWeights);
03369   freeArray<scalar_t>(cutCoordinatesWork);
03370   freeArray<scalar_t>(targetPartWeightRatios);
03371   freeArray<scalar_t>(cutUpperBounds);
03372   freeArray<scalar_t>(cutLowerBounds);
03373   freeArray<scalar_t>(cutLowerWeight);
03374   freeArray<scalar_t>(cutUpperWeight);
03375   freeArray<bool>(isDone);
03376   freeArray<scalar_t>(totalPartWeights_leftClosests_rightClosests);
03377   freeArray<scalar_t>(global_totalPartWeights_leftClosests_rightClosests);
03378 
03379   for(int i = 0; i < numThreads; ++i){
03380     freeArray<double>(partWeights[i]);//freeArray<scalar_t>(partWeights[i]);
03381     freeArray<scalar_t>(rightClosestDistance[i]);
03382     freeArray<scalar_t>(leftClosestDistance[i]);
03383   }
03384 
03385   //freeArray<scalar_t *>(partWeights);
03386   freeArray<double *>(partWeights);
03387   freeArray<scalar_t *>(leftClosestDistance);
03388   freeArray<scalar_t *>(rightClosestDistance);
03389 
03390 
03391   env->timerStop(MACRO_TIMERS, "PQJagged Problem_Free");
03392   env->timerStop(MACRO_TIMERS, "PQJagged Total");
03393 
03394 #endif // INCLUDE_ZOLTAN2_EXPERIMENTAL
03395 }
03396 } // namespace Zoltan2
03397 
03398 
03399 
03400 
03401 
03402 #endif