Zoltan2
Zoltan2_AlgPQJagged.hpp
Go to the documentation of this file.
00001 // @HEADER
00002 //
00003 // ***********************************************************************
00004 //
00005 //   Zoltan2: A package of combinatorial algorithms for scientific computing
00006 //                  Copyright 2012 Sandia Corporation
00007 //
00008 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
00009 // the U.S. Government retains certain rights in this software.
00010 //
00011 // Redistribution and use in source and binary forms, with or without
00012 // modification, are permitted provided that the following conditions are
00013 // met:
00014 //
00015 // 1. Redistributions of source code must retain the above copyright
00016 // notice, this list of conditions and the following disclaimer.
00017 //
00018 // 2. Redistributions in binary form must reproduce the above copyright
00019 // notice, this list of conditions and the following disclaimer in the
00020 // documentation and/or other materials provided with the distribution.
00021 //
00022 // 3. Neither the name of the Corporation nor the names of the
00023 // contributors may be used to endorse or promote products derived from
00024 // this software without specific prior written permission.
00025 //
00026 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
00027 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00028 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00029 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
00030 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00031 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00032 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00033 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00034 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00035 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00036 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00037 //
00038 // Questions? Contact Karen Devine      (kddevin@sandia.gov)
00039 //                    Erik Boman        (egboman@sandia.gov)
00040 //                    Siva Rajamanickam (srajama@sandia.gov)
00041 //
00042 // ***********************************************************************
00043 //
00044 // @HEADER
00045 
00050 #ifndef _ZOLTAN2_ALGPQJagged_HPP_
00051 #define _ZOLTAN2_ALGPQJagged_HPP_
00052 
// Zoltan2 project headers
#include <Zoltan2_PQJagged_ReductionOps.hpp>
#include <Zoltan2_CoordinateModel.hpp>
#include <Zoltan2_Parameters.hpp>
#include <Zoltan2_CoordinatePartitioningGraph.hpp>
#include <Zoltan2_Util.hpp>

// Trilinos dependencies
#include <Tpetra_Distributor.hpp>
#include <Teuchos_ParameterList.hpp>

// C++ standard library
#include <algorithm>    // std::sort
#include <cstdlib>      // std::exit
#include <iostream>     // std::cout
#include <limits>       // std::numeric_limits
#include <new>          // ::operator new[], std::nothrow
#include <vector>
00063 
00064 #ifdef HAVE_ZOLTAN2_ZOLTAN
00065 #ifdef HAVE_ZOLTAN2_MPI
00066 #define ENABLE_ZOLTAN_MIGRATION
00067 #include "zoltan_comm_cpp.h"
00068 #include "zoltan_types.h" // for error codes
00069 #endif
00070 #endif
00071 
00072 
00073 #ifdef HAVE_ZOLTAN2_OMP
00074 #include <omp.h>
00075 #endif
00076 
00077 #define LEAST_SIGNIFICANCE 0.0001
00078 #define SIGNIFICANCE_MUL 1000
00079 
00080 //if the (last dimension reduce all count) x the mpi world size
00081 //estimated to be bigger than this number then migration will be forced
00082 //in earlier iterations.
00083 #define FUTURE_REDUCEALL_CUTOFF 1500000
00084 //if parts right before last dimension are estimated to have less than
00085 //MIN_WORK_LAST_DIM many coords, migration will be forced in earlier iterations.
00086 #define MIN_WORK_LAST_DIM 1000
00087 
00088 
00089 
00090 
#define ABS(x) ((x) >= 0 ? (x) : -(x))
//imbalance calculation. Wreal / Wexpected - 1
//NOTE: bodies are fully parenthesized so the macros bind correctly when
//expanded inside a larger arithmetic expression (e.g. `2 * imbalanceOf2(a,b)`
//previously expanded to `2 * (a)/(b) - 1`, which is not 2 * imbalance).
#define imbalanceOf(Wachieved, totalW, expectedRatio) \
        ((Wachieved) / ((totalW) * (expectedRatio)) - 1)
#define imbalanceOf2(Wachieved, wExpected) \
        ((Wachieved) / (wExpected) - 1)
00097 
00098 
00099 
00100 using std::vector;
00101 
00102 namespace Zoltan2{
00103 
/*! \brief Allocates an array of T of the given size.
 * Uses the nothrow form of operator new[] so that allocation failure is
 * reported through this function's own exception; with plain new[] the
 * NULL check below was dead code, since new[] throws std::bad_alloc and
 * never returns NULL.
 * \param size number of elements to allocate; 0 yields a NULL return.
 * \return pointer to the newly allocated array, or NULL when size == 0.
 * \throws const char* message when the allocation fails.
 */
template <typename T>
T *allocMemory(size_t size){
    if (size > 0){
        T * a = new (std::nothrow) T[size];
        if (a == NULL) {
            throw  "cannot allocate memory";
        }
        return a;
    }
    else {
        return NULL;
    }
}
00120 
/*! \brief Releases an array previously obtained from allocMemory and
 * resets the caller's pointer to NULL so it cannot dangle.
 * Calling with an already-NULL pointer is a harmless no-op.
 * \param array reference to the caller's pointer; set to NULL on return.
 */
template <typename T>
void freeArray(T *&array){
    if (array == NULL) {
        return; // nothing to release
    }
    delete [] array;
    array = NULL;
}
00131 
/*! \brief Converts an object to its stream representation and returns it
 * as a string. Note that extraction uses operator>>, so only the first
 * whitespace-delimited token of the streamed form is returned.
 * \param obj the object to convert; must be streamable via operator<<.
 * \return the first token of obj's stream representation.
 */
template <typename tt>
std::string toString(tt obj){
    std::stringstream buffer (std::stringstream::in | std::stringstream::out);
    buffer << obj;
    std::string token = "";
    buffer >> token;
    return token;
}
00143 
/*! \brief Sort item whose key is a vector of values compared
 * lexicographically with an epsilon tolerance; exact ties are broken by
 * the item's original index, making the ordering deterministic.
 * The val pointer is NOT owned by this class: the caller keeps the key
 * array alive and is responsible for freeing it (see the commented-out
 * freeArray call in the destructor).
 */
template <typename IT, typename CT, typename WT>
class uMultiSortItem
{
public:
    IT index;    // original position of the item; deterministic tie breaker
    CT count;    // number of comparison keys in val
    //unsigned int val;
    WT *val;     // non-owning pointer to the comparison keys
    WT _EPSILON; // keys closer than this compare as equal

    uMultiSortItem(){
        this->index = 0;
        this->count = 0;
        this->val = NULL;
        this->_EPSILON = std::numeric_limits<WT>::epsilon() * 100;
    }


    uMultiSortItem(IT index_ ,CT count_, WT *vals_){
        this->index = index_;
        this->count = count_;
        this->val = vals_;
        this->_EPSILON = std::numeric_limits<WT>::epsilon() * 100;
    }

    uMultiSortItem( const uMultiSortItem<IT,CT,WT>& other ){
        this->index = other.index;
        this->count = other.count;
        this->val = other.val; // shallow copy by design; keys are shared
        this->_EPSILON = other._EPSILON;
    }

    ~uMultiSortItem(){
        //freeArray<WT>(this->val);
    }

    void set(IT index_ ,CT count_, WT *vals_){
        this->index = index_;
        this->count = count_;
        this->val = vals_;
    }

    // Canonical assignment: returns *this by reference rather than by
    // value, avoiding a needless copy of the item on every assignment.
    uMultiSortItem<IT,CT,WT> &operator=(const uMultiSortItem<IT,CT,WT>& other){
        this->index = other.index;
        this->count = other.count;
        this->val = other.val;
        return *(this);
    }

    bool operator<(const uMultiSortItem<IT,CT,WT>& other) const{
        assert (this->count == other.count);
        for(CT i = 0; i < this->count; ++i){
            //if the values are equal (within _EPSILON), go to the next one.
            WT diff = this->val[i] - other.val[i];
            if (diff < 0) diff = -diff;
            if (diff < this->_EPSILON){
                continue;
            }
            //otherwise the first differing key decides the order.
            return this->val[i] < other.val[i];
        }
        //if they are totally equal, order by the original index.
        return this->index < other.index;
    }
    bool operator>(const uMultiSortItem<IT,CT,WT>& other) const{
        assert (this->count == other.count);
        for(CT i = 0; i < this->count; ++i){
            //if the values are equal (within _EPSILON), go to the next one.
            WT diff = this->val[i] - other.val[i];
            if (diff < 0) diff = -diff;
            if (diff < this->_EPSILON){
                continue;
            }
            //otherwise the first differing key decides the order.
            return this->val[i] > other.val[i];
        }
        //if they are totally equal, order by the original index.
        return this->index > other.index;
    }
};// uMultiSortItem
00241 
/*! \brief Plain sortable record used by uqsort below:
 * 'id' is the caller's handle for the item and 'val' is the sort key.
 */
template <class IT, class WT>
struct uSortItem
{
    IT id;   // caller-defined identifier carried along during the sort
    //unsigned int val;
    WT val;  // sort key; uqsort orders items ascending by this field
};// uSortItem;
00252 
00256 template <class IT, class WT>
00257 void uqsort(IT n, uSortItem<IT, WT> * arr)
00258 {
00259 #define SWAP(a,b,temp) temp=(a);(a)=(b);(b)=temp;
00260     int NSTACK = 50;
00261     int M = 7;
00262     IT         i, ir=n, j, k, l=1;
00263     IT         jstack=0, istack[50];
00264     WT aval;
00265     uSortItem<IT,WT>    a, temp;
00266 
00267     --arr;
00268     for (;;)
00269     {
00270         if (ir-l < M)
00271         {
00272             for (j=l+1;j<=ir;j++)
00273             {
00274                 a=arr[j];
00275                 aval = a.val;
00276                 for (i=j-1;i>=1;i--)
00277                 {
00278                     if (arr[i].val <= aval)
00279                         break;
00280                     arr[i+1] = arr[i];
00281                 }
00282                 arr[i+1]=a;
00283             }
00284             if (jstack == 0)
00285                 break;
00286             ir=istack[jstack--];
00287             l=istack[jstack--];
00288         }
00289         else
00290         {
00291             k=(l+ir) >> 1;
00292             SWAP(arr[k],arr[l+1], temp)
00293             if (arr[l+1].val > arr[ir].val)
00294             {
00295                 SWAP(arr[l+1],arr[ir],temp)
00296             }
00297             if (arr[l].val > arr[ir].val)
00298             {
00299                 SWAP(arr[l],arr[ir],temp)
00300             }
00301             if (arr[l+1].val > arr[l].val)
00302             {
00303                 SWAP(arr[l+1],arr[l],temp)
00304             }
00305             i=l+1;
00306             j=ir;
00307             a=arr[l];
00308             aval = a.val;
00309             for (;;)
00310             {
00311                 do i++; while (arr[i].val < aval);
00312                 do j--; while (arr[j].val > aval);
00313                 if (j < i) break;
00314                 SWAP(arr[i],arr[j],temp);
00315             }
00316             arr[l]=arr[j];
00317             arr[j]=a;
00318             jstack += 2;
00319             if (jstack > NSTACK){
00320                 cout << "uqsort: NSTACK too small in sort." << endl;
00321                 exit(1);
00322             }
00323             if (ir-i+1 >= j-l)
00324             {
00325                 istack[jstack]=ir;
00326                 istack[jstack-1]=i;
00327                 ir=j-1;
00328             }
00329             else
00330             {
00331                 istack[jstack]=j-1;
00332                 istack[jstack-1]=l;
00333                 l=i;
00334             }
00335         }
00336     }
00337 }
00338 
00339 
00340 
/*! \brief Multi-jagged (MJ) coordinate partitioning algorithm.
 * Recursively partitions coordinates dimension by dimension: each pass
 * computes cut lines along one coordinate axis (mj_1D_part and its
 * helpers), optionally migrating coordinates between processes when a
 * pass leaves the work badly distributed (mj_perform_migration and the
 * assignment helpers). Declarations only; definitions follow below.
 */
template <typename pq_scalar_t, typename pq_lno_t, typename pq_gno_t>
class AlgMJ{
private:

    RCP<const Environment> mj_env; //the environment object
    RCP<Comm<int> > mj_problemComm; //initial comm object

    double imbalance_tolerance; //input imbalance tolerance.
    partId_t *part_no_array; //input part array specifying num part to divide along each dim.
    int recursion_depth; //the number of steps that partitioning will be solved in.
    int coord_dim, num_weights_per_coord; //coordinate dim and # of weights per coord

    size_t initial_num_loc_coords; //initial num local coords.
    global_size_t initial_num_glob_coords; //initial num global coords.

    pq_lno_t num_local_coords; //number of local coords.
    pq_gno_t num_global_coords; //number of global coords.

    pq_scalar_t **mj_coordinates; //two dimension coordinate array
    pq_scalar_t **mj_weights; //two dimension weight array
    bool *mj_uniform_parts; //if the target parts are uniform
    pq_scalar_t **mj_part_sizes; //target part weight sizes.
    bool *mj_uniform_weights; //if the coordinates have uniform weights.

    ArrayView<const pq_gno_t> mj_gnos; //global ids of the coordinates, comes from the input
    size_t num_global_parts; //the targeted number of parts

    pq_gno_t *initial_mj_gnos; //initial global ids of the coordinates.
    pq_gno_t *current_mj_gnos; //current global ids of the coordinates, might change during migration.
    int *owner_of_coordinate; //the actual processor owner of the coordinate, to track after migrations.

    pq_lno_t *coordinate_permutations; //permutation of coordinates, for partitioning.
    pq_lno_t *new_coordinate_permutations; //permutation work array.
    partId_t *assigned_part_ids; //the part ids assigned to coordinates.

    pq_lno_t *part_xadj; //beginning and end of each part.
    pq_lno_t *new_part_xadj; // work array for beginning and end of each part.

    //mj specific parameters.
    bool distribute_points_on_cut_lines; //if partitioning can distribute points on same coordinate to different parts.
    int max_concurrent_part_calculation; // how many parts we can calculate concurrently.

    int mj_run_as_rcb; //if this is set, then recursion depth is adjusted to its maximum value.
    int mj_user_recursion_depth; //the recursion depth value provided by user.
    int mj_keep_part_boxes; //if the boxes need to be kept.

    int check_migrate_avoid_migration_option; //whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
    pq_scalar_t minimum_migration_imbalance; //when MJ decides whether to migrate, the minimum imbalance for migration.
    int num_threads; //num threads

    partId_t total_num_cut ; //how many cuts will be totally
    partId_t total_num_part;    //how many parts will be totally

    partId_t max_num_part_along_dim ;         //maximum part count along a dimension.
    partId_t max_num_cut_along_dim; //maximum cut count along a dimension.
    size_t max_num_total_part_along_dim; //maximum part+cut count along a dimension.

    partId_t total_dim_num_reduce_all;    //estimate on #reduceAlls can be done.
    partId_t last_dim_num_part; //max no of parts that might occur
                                //during the partition before the
                                //last partitioning dimension.

    RCP<Comm<int> > comm; //comm object that can be altered during execution
    float fEpsilon; //epsilon for float
    pq_scalar_t sEpsilon; //epsilon for pq_scalar_t

    pq_scalar_t maxScalar_t; //max possible scalar
    pq_scalar_t minScalar_t; //min scalar

    pq_scalar_t *all_cut_coordinates; //coordinates of all cut lines.
    pq_scalar_t *max_min_coords; //work array for min/max coordinate exchange.
    pq_scalar_t *process_cut_line_weight_to_put_left; //how much weight should a MPI put left side of the each cutline
    pq_scalar_t **thread_cut_line_weight_to_put_left; //how much weight percentage should each thread in MPI put left side of the each cutline

    // work array to manipulate coordinate of cutlines in different iterations.
    //necessary because previous cut line information is used for determining
    //the next cutline information. therefore, cannot update the cut work array
    //until all cutlines are determined.
    pq_scalar_t *cut_coordinates_work_array;

    //cumulative part weight array.
    pq_scalar_t *target_part_weights;

    pq_scalar_t *cut_upper_bound_coordinates ;  //upper bound coordinate of a cut line
    pq_scalar_t *cut_lower_bound_coordinates ;  //lower bound coordinate of a cut line
    pq_scalar_t *cut_lower_bound_weights ;  //lower bound weight of a cut line
    pq_scalar_t *cut_upper_bound_weights ;  //upper bound weight of a cut line

    pq_scalar_t *process_local_min_max_coord_total_weight ; //combined array to exchange the min and max coordinate, and total weight of part.
    pq_scalar_t *global_min_max_coord_total_weight ;//global combined array with the results for min, max and total weight.

    //isDone is used to determine if a cutline is determined already.
    //If a cut line is already determined, the next iterations will skip this cut line.
    bool *is_cut_line_determined;
    //my_incomplete_cut_count count holds the number of cutlines that have not been finalized for each part
    //when concurrentPartCount>1, using this information, if my_incomplete_cut_count[x]==0, then no work is done for this part.
    partId_t *my_incomplete_cut_count;
    //local part weights of each thread.
    double **thread_part_weights;
    //the work manipulation array for partweights.
    double **thread_part_weight_work;

    //thread_cut_left_closest_point to hold the closest coordinate to a cutline from left (for each thread).
    pq_scalar_t **thread_cut_left_closest_point;
    //thread_cut_right_closest_point to hold the closest coordinate to a cutline from right (for each thread)
    pq_scalar_t **thread_cut_right_closest_point;

    //to store how many points in each part a thread has.
    pq_lno_t **thread_point_counts;

    pq_scalar_t *process_rectelinear_cut_weight; //per-process weight sitting exactly on a cut line (rectilinear case).
    pq_scalar_t *global_rectelinear_cut_weight; //global reduction of the above.

    //for faster communication, concatenation of
    //totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines
    //leftClosest distances sized P-1, since P-1 cut lines
    //rightClosest distances size P-1, since P-1 cut lines.
    pq_scalar_t *total_part_weight_left_right_closests ;
    pq_scalar_t *global_total_part_weight_left_right_closests;

    RCP < vector <coordinateModelPartBox <pq_scalar_t, partId_t> > > kept_boxes; //part boxes kept when requested.
    int myRank, myActualRank; //processor rank, and initial rank

    /* \brief Either the mj array (part_no_array) or num_global_parts should be provided in
     * the input. part_no_array takes
     * precedence if both are provided.
     * Depending on these parameters, total cut/part number,
     * maximum part/cut number along a dimension, estimated number of reduceAlls,
     * and the number of parts before the last dimension is calculated.
     * */
    void set_part_specifications();

    /* \brief Tries to determine the part number for current dimension,
     * by trying to make the partitioning as square as possible.
     * \param num_total_future how many more partitionings are required.
     * \param root how many more recursion depth is left.
     */
    inline partId_t get_part_count(
        partId_t num_total_future,
        double root);

    /* \brief Allocates the all required memory for the mj partitioning algorithm.
     *
     */
    void allocate_set_work_memory();

    /* \brief for part communication we keep track of the box boundaries.
     * This is performed when either asked specifically, or when geometric mapping is performed afterwards.
     * This function initializes a single box with all global min and max coordinates.
     * \param initial_partitioning_boxes the input and output vector for boxes.
     */
    void init_part_boxes(
        RCP < vector <coordinateModelPartBox <pq_scalar_t, partId_t> > > & outPartBoxes);

    /* \brief Function returns how many parts that will be obtained after this dimension partitioning.
     * It sets how many parts each current part will be partitioned into in this dimension to num_partitioning_in_current_dim vector,
     * sets how many total future parts each obtained part will be partitioned into in next_future_num_parts_in_parts vector,
     * If part boxes are kept, then sets initializes the output_part_boxes as its ancestor.
     *
     *  \param num_partitioning_in_current_dim: output. How many parts each current part will be partitioned into.
     *  \param future_num_part_in_parts: input, how many future parts each current part will be partitioned into.
     *  \param next_future_num_parts_in_parts: output, how many future parts each obtained part will be partitioned into.
     *  \param future_num_parts: output, max number of future parts that will be obtained from a single
     *  \param current_num_parts: input, how many parts are there currently.
     *  \param current_iteration: input, current dimension iteration number.
     *  \param input_part_boxes: input, if boxes are kept, current boxes.
     *  \param output_part_boxes: output, if boxes are kept, the initial box boundaries for obtained parts.
     */
    partId_t update_part_num_arrays(
        vector <partId_t> &num_partitioning_in_current_dim, //assumes this vector is empty.
        vector<partId_t> *future_num_part_in_parts,
        vector<partId_t> *next_future_num_parts_in_parts, //assumes this vector is empty.
        partId_t &future_num_parts,
        partId_t current_num_parts,
        int current_iteration,
        RCP < vector <coordinateModelPartBox <pq_scalar_t, partId_t> > > input_part_boxes,
        RCP < vector <coordinateModelPartBox <pq_scalar_t, partId_t> > > output_part_boxes);

    /* \brief Computes the local min and max coordinate, and the local total
     * weight, of the coordinates in [coordinate_begin_index, coordinate_end_index)
     * of the given permutation along the current dimension.
     */
    void mj_get_local_min_max_coord_totW(
        pq_lno_t coordinate_begin_index,
        pq_lno_t coordinate_end_index,
        pq_lno_t *mj_current_coordinate_permutations,
        pq_scalar_t *mj_current_dim_coords,
        pq_scalar_t &min_coordinate,
        pq_scalar_t &max_coordinate,
        pq_scalar_t &total_weight);

    /* \brief Reduces the local min/max coordinate and total weight arrays into
     * their global counterparts for the concurrently processed parts.
     */
    void mj_get_global_min_max_coord_totW(
        partId_t current_concurrent_num_parts,
        pq_scalar_t *local_min_max_total,
        pq_scalar_t *global_min_max_total);

    /* \brief Computes the initial guesses for the cut-line coordinates and the
     * cumulative target weight at the left of each cut, for one concurrent part.
     */
    void mj_get_initial_cut_coords_target_weights(
        pq_scalar_t min_coord,
        pq_scalar_t max_coord,
        partId_t num_cuts/*p-1*/ ,
        pq_scalar_t global_weight,
        pq_scalar_t *initial_cut_coords /*p - 1 sized, coordinate of each cut line*/,
        pq_scalar_t *target_part_weights /*cumulative weights, at left side of each cut line. p-1 sized*/,

        vector <partId_t> *future_num_part_in_parts, //the vector of future part counts
        vector <partId_t> *next_future_num_parts_in_parts,
        partId_t concurrent_current_part,
        partId_t obtained_part_index);

    /* \brief Initializes the part ids of the coordinates in the given range
     * before the 1D partitioning of the current concurrent part begins.
     */
    void set_initial_coordinate_parts(
        pq_scalar_t &max_coordinate,
        pq_scalar_t &min_coordinate,
        partId_t &concurrent_current_part_index,
        pq_lno_t coordinate_begin_index,
        pq_lno_t coordinate_end_index,
        pq_lno_t *mj_current_coordinate_permutations,
        pq_scalar_t *mj_current_dim_coords,
        partId_t *mj_part_ids,
        partId_t &partition_count);

    /* \brief Drives the 1D partitioning along the current dimension: iterates
     * until all cut lines for the concurrently processed parts are determined
     * within the given imbalance tolerance.
     */
    void mj_1D_part(
        pq_scalar_t *mj_current_dim_coords,
        pq_scalar_t imbalanceTolerance,
        partId_t current_work_part,
        partId_t current_concurrent_num_parts,
        pq_scalar_t *current_cut_coordinates,
        partId_t total_incomplete_cut_count,
        vector <partId_t> &num_partitioning_in_current_dim);

    /* \brief Accumulates, for one thread, the part weights and the closest
     * coordinates to each (still undetermined) cut line from left and right,
     * over the coordinate range [coordinate_begin_index, coordinate_end_index).
     */
    void mj_1D_part_get_thread_part_weights(
        size_t total_part_count,
        partId_t num_cuts,
        pq_scalar_t max_coord,
        pq_scalar_t min_coord,
        pq_lno_t coordinate_begin_index,
        pq_lno_t coordinate_end_index,
        pq_scalar_t *mj_current_dim_coords,
        pq_scalar_t *temp_current_cut_coords,
        bool *current_cut_status,
        double *my_current_part_weights,
        pq_scalar_t *my_current_left_closest,
        pq_scalar_t *my_current_right_closest);

    /* \brief Combines the per-thread part weights and left/right closest
     * points into process-level results for the concurrently processed parts.
     */
    void mj_accumulate_thread_results(
        const vector <partId_t> &num_partitioning_in_current_dim,
        partId_t current_work_part,
        partId_t current_concurrent_num_parts);

    /* \brief Given the current global part weights and cut bounds, computes
     * new positions for the cut lines that are not yet within the imbalance
     * tolerance, and marks the ones that are as determined.
     */
    void mj_get_new_cut_coordinates(
        const size_t &num_total_part,
        const partId_t &num_cuts,
        const pq_scalar_t &max_coordinate,
        const pq_scalar_t &min_coordinate,
        const pq_scalar_t &global_total_weight,
        const pq_scalar_t &used_imbalance_tolerance,
        pq_scalar_t * current_global_part_weights,
        const pq_scalar_t * current_local_part_weights,
        const pq_scalar_t *current_part_target_weights,
        bool *current_cut_line_determined,
        pq_scalar_t *current_cut_coordinates,
        pq_scalar_t *current_cut_upper_bounds,
        pq_scalar_t *current_cut_lower_bounds,
        pq_scalar_t *current_global_left_closest_points,
        pq_scalar_t *current_global_right_closest_points,
        pq_scalar_t * current_cut_lower_bound_weights,
        pq_scalar_t * current_cut_upper_weights,
        pq_scalar_t *new_current_cut_coordinates,
        pq_scalar_t *current_part_cut_line_weight_to_put_left,
        partId_t *rectelinear_cut_count,
        partId_t &my_num_incomplete_cut);

    /* \brief Interpolates a new cut position between the given lower and
     * upper bounds from their weights and the expected weight at the cut.
     */
    void mj_calculate_new_cut_position (
      pq_scalar_t cut_upper_bound,
        pq_scalar_t cut_lower_bound,
        pq_scalar_t cut_upper_weight,
        pq_scalar_t cut_lower_weight,
        pq_scalar_t expected_weight,
        pq_scalar_t &new_cut_position);

    /* \brief Splits the coordinates in [coordinate_begin, coordinate_end)
     * into num_parts new parts according to the determined cut lines, writing
     * the new part boundaries into out_part_xadj.
     */
    void mj_create_new_partitions(
        partId_t num_parts,
        pq_scalar_t *mj_current_dim_coords,
        pq_scalar_t *current_concurrent_cut_coordinate,
        pq_lno_t coordinate_begin,
        pq_lno_t coordinate_end,
        pq_scalar_t *used_local_cut_line_weight_to_left,
        double **used_thread_part_weight_work,
        pq_lno_t *out_part_xadj);

    /* \brief Decides whether to migrate coordinates between processes after
     * this dimension's partitioning and, if so, performs the migration.
     * \return true when a migration was performed.
     */
    bool mj_perform_migration(
        partId_t in_num_parts, //current umb parts
        partId_t &out_num_parts, //output umb parts.
        vector<partId_t> *next_future_num_parts_in_parts,
        partId_t &output_part_begin_index,
        size_t migration_reduce_all_population,
        pq_lno_t num_coords_for_last_dim_part,
        string iteration,
        RCP < vector <coordinateModelPartBox <pq_scalar_t, partId_t> > > &input_part_boxes,
        RCP < vector <coordinateModelPartBox <pq_scalar_t, partId_t> > > &output_part_boxes);

    /* \brief Gathers, for every processor, the number of its points falling
     * in each part, into num_points_in_all_processor_parts.
     */
    void get_processor_num_points_in_parts(
        partId_t num_procs,
        partId_t num_parts,
        pq_gno_t *&num_points_in_all_processor_parts);

    /* \brief Decides whether migration is worthwhile, based on the
     * migration option, the reduceAll population estimate and the expected
     * work in the last partitioning dimension.
     * \return true when migration should be performed.
     */
    bool mj_check_to_migrate(
        size_t migration_reduce_all_population,
        pq_lno_t num_coords_for_last_dim_part,
        partId_t num_procs,
        partId_t num_parts,
        pq_gno_t *num_points_in_all_processor_parts);


    /* \brief Top-level part-to-processor assignment for migration: chooses
     * between assigning processors to parts or parts to processors, and fills
     * the send counts and per-coordinate destinations accordingly.
     */
    void mj_migration_part_proc_assignment(
        pq_gno_t * num_points_in_all_processor_parts,
        partId_t num_parts,
        partId_t num_procs,
        pq_lno_t *send_count_to_each_proc,
        vector<partId_t> &processor_ranks_for_subcomm,
        vector<partId_t> *next_future_num_parts_in_parts,
        partId_t &out_num_part,
        vector<partId_t> &out_part_indices,
        partId_t &output_part_numbering_begin_index,
        int *coordinate_destionations);

    /* \brief Assigns groups of processors to parts (used when there are at
     * least as many processors as parts), producing the subcommunicator ranks
     * and the destination processor of each local coordinate.
     */
    void mj_assign_proc_to_parts(
        pq_gno_t * num_points_in_all_processor_parts,
        partId_t num_parts,
        partId_t num_procs,
        pq_lno_t *send_count_to_each_proc,
        vector<partId_t> &processor_ranks_for_subcomm,
        vector<partId_t> *next_future_num_parts_in_parts,
        partId_t &out_part_index,
        partId_t &output_part_numbering_begin_index,
        int *coordinate_destionations);

    /* \brief Writes the destination processor of each local coordinate from
     * the part-to-processor chain assignment.
     */
    void assign_send_destinations(
        partId_t num_parts,
        partId_t *part_assignment_proc_begin_indices,
        partId_t *processor_chains_in_parts,
        pq_lno_t *send_count_to_each_proc,
        int *coordinate_destionations);

    /* \brief Writes the destination processor of each local coordinate from
     * the sorted part-to-processor assignment (parts-to-procs case), and
     * computes the output part numbering shift.
     */
    void assign_send_destionations2(
        partId_t num_parts,
        uSortItem<partId_t, partId_t> * sort_item_part_to_proc_assignment, //input sorted wrt processors
        int *coordinate_destionations,
        partId_t &output_part_numbering_begin_index,
        vector<partId_t> *next_future_num_parts_in_parts);

    /* \brief Assigns parts to processors (used when there are more parts
     * than processors); each processor receives one or more whole parts.
     */
    void mj_assign_parts_to_procs(
        pq_gno_t * num_points_in_all_processor_parts,
        partId_t num_parts,
        partId_t num_procs,
        pq_lno_t *send_count_to_each_proc, //output: sized nprocs, show the number of send point counts to each proc.
        vector<partId_t> *next_future_num_parts_in_parts,//input how many more partitions the part will be partitioned into.
        partId_t &out_num_part, //output, how many parts the processor will have. this is always 1 for this function.
        vector<partId_t> &out_part_indices, //output: the part indices which the processor is assigned to.
        partId_t &output_part_numbering_begin_index, //output: how much the part number should be shifted when setting the solution
        int *coordinate_destionations);

    /* \brief Performs the actual exchange of the coordinates (and their
     * weights, gnos and part ids) to the previously computed destinations.
     */
    void mj_migrate_coords(
        partId_t num_procs,
        pq_gno_t &num_new_local_points,
        string iteration,
        int *coordinate_destionations,
        partId_t num_parts);

    /* \brief Creates the subcommunicator containing only the given processor
     * ranks, and replaces this->comm with it.
     */
    void create_sub_communicator(vector<partId_t> &processor_ranks_for_subcomm);


    /* \brief Rebuilds the coordinate permutation and part_xadj arrays after
     * a migration, for the parts owned by this process.
     */
    void fill_permutation_array(
        partId_t output_num_parts,
        partId_t num_parts);

    /* \brief Produces the final part ids and gnos of the solution, shifting
     * part numbers by output_part_begin_index and accounting for any
     * migrations that happened during the run.
     */
    void set_final_parts(
        partId_t current_num_parts,
        partId_t output_part_begin_index,
        RCP < vector <coordinateModelPartBox <pq_scalar_t, partId_t> > > output_part_boxes,
        bool is_data_ever_migrated);
    /* \brief Frees all work memory allocated by allocate_set_work_memory. */
    void free_work_memory();
    /* \brief Variant of mj_create_new_partitions used by the sequential task
     * partitioner: splits [coordinate_begin, coordinate_end) into num_parts
     * consistent chunks along coordinate dimension coordInd.
     */
    void create_consistent_chunks(
        partId_t num_parts,
        pq_scalar_t *mj_current_dim_coords,
        pq_scalar_t *current_concurrent_cut_coordinate,
        pq_lno_t coordinate_begin,
        pq_lno_t coordinate_end,
        pq_scalar_t *used_local_cut_line_weight_to_left,
        pq_lno_t *out_part_xadj,
        int coordInd);
public:
    AlgMJ();

    /* \brief Main entry point: runs the multi-jagged partitioning on the
     * given coordinates and returns the resulting part id and gno arrays
     * through the last two reference parameters.
     */
    void multi_jagged_part(
        const RCP<const Environment> &env,
          RCP<Comm<int> > &problemComm,

          double imbalance_tolerance,
          size_t num_global_parts,
          partId_t *part_no_array,
          int recursion_depth,

          int coord_dim,
          pq_lno_t num_local_coords,
          pq_gno_t num_global_coords,
          const pq_gno_t *initial_mj_gnos,
          pq_scalar_t **mj_coordinates,

          int num_weights_per_coord,
          bool *mj_uniform_weights,
          pq_scalar_t **mj_weights,
          bool *mj_uniform_parts,
          pq_scalar_t **mj_part_sizes,

          partId_t *&result_assigned_part_ids,
          pq_gno_t *&result_mj_gnos

        );
    /* \brief Sets the MJ-specific tuning parameters (cut-line point
     * distribution, concurrency, migration option and threshold).
     */
    void set_partitioning_parameters(
        bool distribute_points_on_cut_lines_,
        int max_concurrent_part_calculation_,
        int check_migrate_avoid_migration_option_,
        pq_scalar_t minimum_migration_imbalance_);
    /* \brief Requests that the part boxes be computed and kept. */
    void set_to_keep_part_boxes();
    /* \brief Returns the kept part boxes (valid after set_to_keep_part_boxes). */
    RCP < vector <coordinateModelPartBox <pq_scalar_t, partId_t> > > get_part_boxes();

    /* \brief Special sequential, single-process entry point that partitions
     * the selected coordinates into num_target_part parts, writing the
     * resulting permutation and part offsets into the output arrays.
     */
    void sequential_task_partitioning(
        const RCP<const Environment> &env,
        pq_lno_t num_total_coords,
        pq_lno_t num_selected_coords,
        size_t num_target_part,
        int coord_dim,
        pq_scalar_t **mj_coordinates,
        pq_lno_t *initial_selected_coords_output_permutation,
        pq_lno_t *output_xadj,
        int recursion_depth,
        const partId_t *part_no_array);

};
01137 
/*! \brief Sequential (single-process) multi-jagged partitioning, used for task
 * mapping. Partitions the first \c num_selected_coords entries of the given
 * permutation into \c num_target_part parts, entirely on a serial communicator
 * and with a single thread.
 *
 * \param env environment object (used for this->mj_env).
 * \param num_total_coords total number of coordinates in the input arrays.
 * \param num_selected_coords number of coordinates to actually partition;
 *        they form the single initial part (part_xadj[0] below).
 * \param num_target_part requested number of output parts.
 * \param coord_dim_ dimension of the coordinates.
 * \param mj_coordinates_ coordinate arrays, one pointer per dimension.
 * \param inital_adjList_output_adjlist in: initial permutation of coordinate
 *        indices; out: final permutation after partitioning.
 * \param output_xadj out: prefix array of part ends into the permutation;
 *        part i spans [output_xadj[i-1], output_xadj[i]).
 * \param rd recursion depth (number of dimension iterations).
 * \param part_no_array_ optional per-level part counts (may be NULL).
 */
template <typename pq_scalar_t, typename pq_lno_t, typename pq_gno_t>
void AlgMJ<pq_scalar_t, pq_lno_t, pq_gno_t>::sequential_task_partitioning(
    const RCP<const Environment> &env,
    pq_lno_t num_total_coords,
    pq_lno_t num_selected_coords,
    size_t num_target_part,
    int coord_dim_,
    pq_scalar_t **mj_coordinates_,
    pq_lno_t *inital_adjList_output_adjlist,
    pq_lno_t *output_xadj,
    int rd,
    const partId_t *part_no_array_
){

  //run on a self-contained serial communicator regardless of the caller's comm.
  this->mj_env = env;
  const RCP<Comm<int> > commN;
  this->comm = this->mj_problemComm =  Teuchos::rcp_const_cast<Comm<int> >
  (Teuchos::DefaultComm<int>::getDefaultSerialComm(commN));
  //NOTE(review): rank is forced to 1 here (not 0) — presumably only used to
  //suppress rank-0-only diagnostics; confirm no code relies on rank==comm rank.
  this->myActualRank = this->myRank = 1;

#ifdef HAVE_ZOLTAN2_OMP
  //force single-threaded execution; the thread count is restored at the end.
  int actual_num_threads = omp_get_num_threads();
  omp_set_num_threads(1);
#endif

    //weights are uniform for task mapping
 
    //parts are uniform for task mapping
    //as input indices.

    this->imbalance_tolerance = 0;
    this->num_global_parts = num_target_part;
    this->part_no_array = (partId_t *)part_no_array_;
    this->recursion_depth = rd;

    this->coord_dim = coord_dim_;
    this->num_local_coords = num_total_coords;
    this->num_global_coords = num_total_coords;
    this->mj_coordinates = mj_coordinates_;  //will copy the memory to this->mj_coordinates.

    //temporary gno array; freed again at the end of this function.
    this->initial_mj_gnos = allocMemory<pq_gno_t>(this->num_local_coords);

    //no explicit weights: one uniform-weight slot is enough.
    this->num_weights_per_coord = 0;
    bool *tmp_mj_uniform_weights = new bool[1];
    this->mj_uniform_weights = tmp_mj_uniform_weights ;
    this->mj_uniform_weights[0] = true;

    pq_scalar_t **tmp_mj_weights = new pq_scalar_t *[1];
    this->mj_weights = tmp_mj_weights; //will copy the memory to this->mj_weights

    bool *tmp_mj_uniform_parts = new bool[1];
    this->mj_uniform_parts = tmp_mj_uniform_parts;
    this->mj_uniform_parts[0] = true;

    pq_scalar_t **tmp_mj_part_sizes = new pq_scalar_t * [1];
    this->mj_part_sizes = tmp_mj_part_sizes;
    this->mj_part_sizes[0] = NULL;

    this->num_threads = 1;
    this->set_part_specifications();

    this->allocate_set_work_memory();
    //the end of the initial partition is the end of coordinates.
    this->part_xadj[0] = static_cast<pq_lno_t>(num_selected_coords);
    for(size_t i = 0; i < static_cast<size_t>(num_total_coords); ++i){
        this->coordinate_permutations[i] = inital_adjList_output_adjlist[i];
    }

    partId_t current_num_parts = 1;

    pq_scalar_t *current_cut_coordinates =  this->all_cut_coordinates;

    partId_t future_num_parts = this->total_num_part;

    vector<partId_t> *future_num_part_in_parts = new vector<partId_t> ();
    vector<partId_t> *next_future_num_parts_in_parts = new vector<partId_t> ();
    next_future_num_parts_in_parts->push_back(this->num_global_parts);
    //part boxes are not tracked in the sequential path; pass empty RCPs.
    RCP < vector <coordinateModelPartBox <pq_scalar_t, partId_t> > > t1;
    RCP < vector <coordinateModelPartBox <pq_scalar_t, partId_t> > > t2;

    //one iteration per recursion level; each level cuts along one dimension.
    for (int i = 0; i < this->recursion_depth; ++i){

        //partitioning array. size will be as the number of current partitions and this
        //holds how many parts that each part will be in the current dimension partitioning.
        vector <partId_t> num_partitioning_in_current_dim;

        //number of parts that will be obtained at the end of this partitioning.
        //future_num_part_in_parts is as the size of current number of parts.
        //holds how many more parts each should be divided in the further
        //iterations. this will be used to calculate num_partitioning_in_current_dim,
        //as the number of parts that the part will be partitioned
        //in the current dimension partitioning.

        //next_future_num_parts_in_parts will be as the size of outnumParts,
        //and this will hold how many more parts that each output part
        //should be divided. this array will also be used to determine the weight ratios
        //of the parts.
        //swap the arrays to use iteratively..
        vector<partId_t> *tmpPartVect= future_num_part_in_parts;
        future_num_part_in_parts = next_future_num_parts_in_parts;
        next_future_num_parts_in_parts = tmpPartVect;

        //clear next_future_num_parts_in_parts array as
        //getPartitionArrays expects it to be empty.
        //it also expects num_partitioning_in_current_dim to be empty as well.
        next_future_num_parts_in_parts->clear();


        //returns the total number of output parts for this dimension partitioning.
        partId_t output_part_count_in_dimension =
            this->update_part_num_arrays(
                num_partitioning_in_current_dim,
                future_num_part_in_parts,
                next_future_num_parts_in_parts,
                future_num_parts,
                current_num_parts,
                i,
                t1,
                t2);

        //if the number of obtained parts equal to current number of parts,
        //skip this dimension. For example, this happens when 1 is given in the input
        //part array is given. P=4,5,1,2
        if(output_part_count_in_dimension == current_num_parts) {
            //swap back so the next iteration re-uses the same future arrays.
            tmpPartVect= future_num_part_in_parts;
            future_num_part_in_parts = next_future_num_parts_in_parts;
            next_future_num_parts_in_parts = tmpPartVect;
            continue;
        }

        //get the coordinate axis along which the partitioning will be done.
        int coordInd = i % this->coord_dim;
        pq_scalar_t * mj_current_dim_coords = this->mj_coordinates[coordInd];
        //convert i to string to be used for debugging purposes.
        string istring = toString<int>(i);

        //alloc Memory to point the indices
        //of the parts in the permutation array.
        this->new_part_xadj = allocMemory<pq_lno_t>(output_part_count_in_dimension);

        //the index where in the outtotalCounts will be written.
        partId_t output_part_index = 0;
        //whatever is written to outTotalCounts will be added with previousEnd
        //so that the points will be shifted.
        partId_t output_coordinate_end_index = 0;

        partId_t current_work_part = 0;
        partId_t current_concurrent_num_parts = min(current_num_parts - current_work_part,
                                         this->max_concurrent_part_calculation);

        partId_t obtained_part_index = 0;

        //run for all available parts.
        for (; current_work_part < current_num_parts;
                     current_work_part += current_concurrent_num_parts){

            current_concurrent_num_parts = min(current_num_parts - current_work_part,
            this->max_concurrent_part_calculation);

            partId_t actual_work_part_count = 0;
            //initialization for 1D partitioning.
            //get the min and max coordinates of each part
            //together with the part weights of each part.
            for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
                partId_t current_work_part_in_concurrent_parts = current_work_part + kk;

                //if this part wont be partitioned any further
                //dont do any work for this part.
                if (num_partitioning_in_current_dim[current_work_part_in_concurrent_parts] == 1){
                    continue;
                }
                ++actual_work_part_count;
                pq_lno_t coordinate_end_index= this->part_xadj[current_work_part_in_concurrent_parts];
                pq_lno_t coordinate_begin_index = current_work_part_in_concurrent_parts==0 ? 0: this->part_xadj[current_work_part_in_concurrent_parts -1];

                /*
                cout << "i:" << i << " j:" << current_work_part + kk
                    << " coordinate_begin_index:" << coordinate_begin_index
                    << " coordinate_end_index:" << coordinate_end_index
                    << " total:" << coordinate_end_index - coordinate_begin_index<< endl;
                    */
                this->mj_get_local_min_max_coord_totW(
                    coordinate_begin_index,
                    coordinate_end_index,
                    this->coordinate_permutations,
                    mj_current_dim_coords,
                    this->process_local_min_max_coord_total_weight[kk], //min coordinate
                        this->process_local_min_max_coord_total_weight[kk + current_concurrent_num_parts], //max coordinate
                        this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts] //total weight);
                );
            }

            //1D partitioning
            if (actual_work_part_count > 0){
                //obtain global Min max of the part.
              this->mj_get_global_min_max_coord_totW(
                    current_concurrent_num_parts,
                    this->process_local_min_max_coord_total_weight,
                    this->global_min_max_coord_total_weight);

                //represents the total number of cutlines
                //whose coordinate should be determined.
                partId_t total_incomplete_cut_count = 0;

                //Compute weight ratios for parts & cuts:
                //e.g., 0.25  0.25  0.5    0.5  0.75 0.75  1
                //part0  cut0  part1 cut1 part2 cut2 part3
                partId_t concurrent_part_cut_shift = 0;
                partId_t concurrent_part_part_shift = 0;
                for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
                    pq_scalar_t min_coordinate = this->global_min_max_coord_total_weight[kk];
                    pq_scalar_t max_coordinate = this->global_min_max_coord_total_weight[kk +
                                                     current_concurrent_num_parts];
                    pq_scalar_t global_total_weight =
                              this->global_min_max_coord_total_weight[kk +
                                                     2 * current_concurrent_num_parts];

                    partId_t concurrent_current_part_index = current_work_part + kk;

                    partId_t partition_count = num_partitioning_in_current_dim[concurrent_current_part_index];

                    pq_scalar_t *usedCutCoordinate = current_cut_coordinates + concurrent_part_cut_shift;
                    pq_scalar_t *current_target_part_weights = this->target_part_weights +
                                                                     concurrent_part_part_shift;
                    //shift the usedCutCoordinate array as noCuts.
                    concurrent_part_cut_shift += partition_count - 1;
                    //shift the partRatio array as noParts.
                    concurrent_part_part_shift += partition_count;

                    //calculate only if part is not empty,
                    //and part will be further partitioend.
                    if(partition_count > 1 && min_coordinate <= max_coordinate){

                        //increase allDone by the number of cuts of the current
                        //part's cut line number.
                        total_incomplete_cut_count += partition_count - 1;
                        //set the number of cut lines that should be determined
                        //for this part.
                        this->my_incomplete_cut_count[kk] = partition_count - 1;

                        //get the target weights of the parts.
                        this->mj_get_initial_cut_coords_target_weights(
                            min_coordinate,
                            max_coordinate,
                            partition_count - 1,
                            global_total_weight,
                            usedCutCoordinate,
                            current_target_part_weights,
                            future_num_part_in_parts,
                            next_future_num_parts_in_parts,
                            concurrent_current_part_index,
                            obtained_part_index);

                        pq_lno_t coordinate_end_index= this->part_xadj[concurrent_current_part_index];
                        pq_lno_t coordinate_begin_index = concurrent_current_part_index==0 ? 0: this->part_xadj[concurrent_current_part_index -1];

                        //get the initial estimated part assignments of the coordinates.
                        this->set_initial_coordinate_parts(
                            max_coordinate,
                            min_coordinate,
                            concurrent_current_part_index,
                            coordinate_begin_index, coordinate_end_index,
                            this->coordinate_permutations,
                            mj_current_dim_coords,
                            this->assigned_part_ids,
                            partition_count);
                    }
                    else {
                        // e.g., if have fewer coordinates than parts, don't need to do next dim.
                        this->my_incomplete_cut_count[kk] = 0;
                    }
                    obtained_part_index += partition_count;
                }

                //used imbalance, it is always 0, as it is difficult to estimate a range.
                pq_scalar_t used_imbalance = 0;

                // Determine cut lines for k parts here.
                this->mj_1D_part(
                    mj_current_dim_coords,
                    used_imbalance,
                    current_work_part,
                    current_concurrent_num_parts,
                    current_cut_coordinates,
                    total_incomplete_cut_count,
                    num_partitioning_in_current_dim);
            }

            //create part chunks
            {

                partId_t output_array_shift = 0;
                partId_t cut_shift = 0;
                size_t tlr_shift = 0;
                size_t partweight_array_shift = 0;

                for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
                    partId_t current_concurrent_work_part = current_work_part + kk;
                    partId_t num_parts = num_partitioning_in_current_dim[current_concurrent_work_part];

                    //if the part is empty, skip the part.
                    //(min > max signals an empty part from the global reduction.)
                    if((num_parts != 1  ) && this->global_min_max_coord_total_weight[kk] >
                             this->global_min_max_coord_total_weight[kk + current_concurrent_num_parts]) {

                        //all sub-parts of an empty part are empty: zero-length entries.
                        for(partId_t jj = 0; jj < num_parts; ++jj){
                            this->new_part_xadj[output_part_index + output_array_shift + jj] = 0;
                        }
                        cut_shift += num_parts - 1;
                        tlr_shift += (4 *(num_parts - 1) + 1);
                        output_array_shift += num_parts;
                        partweight_array_shift += (2 * (num_parts - 1) + 1);
                        continue;
                    }

                    pq_lno_t coordinate_end = this->part_xadj[current_concurrent_work_part];
                    pq_lno_t coordinate_begin = current_concurrent_work_part==0 ? 0: this->part_xadj[current_concurrent_work_part
                                                             -1];
                    pq_scalar_t *current_concurrent_cut_coordinate = current_cut_coordinates + cut_shift;
                    pq_scalar_t *used_local_cut_line_weight_to_left = this->process_cut_line_weight_to_put_left +
                                                         cut_shift;

                    for(int ii = 0; ii < this->num_threads; ++ii){
                        this->thread_part_weight_work[ii] = this->thread_part_weights[ii] +  partweight_array_shift;
                    }

                    if(num_parts > 1){
                        // Rewrite the indices based on the computed cuts.
                      this->create_consistent_chunks(
                            num_parts,
                            mj_current_dim_coords,
                            current_concurrent_cut_coordinate,
                            coordinate_begin,
                            coordinate_end,
                            used_local_cut_line_weight_to_left,
                            this->new_part_xadj + output_part_index + output_array_shift,
                            coordInd );
                    }
                    else {
                        //if this part is partitioned into 1 then just copy
                        //the old values.
                        pq_lno_t part_size = coordinate_end - coordinate_begin;
                        *(this->new_part_xadj + output_part_index + output_array_shift) = part_size;
                        memcpy(this->new_coordinate_permutations + coordinate_begin,
                        this->coordinate_permutations + coordinate_begin,
                        part_size * sizeof(pq_lno_t));
                    }
                    cut_shift += num_parts - 1;
                    tlr_shift += (4 *(num_parts - 1) + 1);
                    output_array_shift += num_parts;
                    partweight_array_shift += (2 * (num_parts - 1) + 1);
                }

                //shift cut coordinates so that all cut coordinates are stored.
                //current_cut_coordinates += cutShift;

                //getChunks from coordinates partitioned the parts and
                //wrote the indices as if there were a single part.
                //now we need to shift the beginning indices.
                for(partId_t kk = 0; kk < current_concurrent_num_parts; ++kk){
                    partId_t num_parts = num_partitioning_in_current_dim[ current_work_part + kk];
                    for (partId_t ii = 0;ii < num_parts ; ++ii){
                        //shift it by previousCount
                        this->new_part_xadj[output_part_index+ii] += output_coordinate_end_index;
                    }
                    //increase the previous count by current end.
                    output_coordinate_end_index = this->new_part_xadj[output_part_index + num_parts - 1];
                    //increase the current out.
                    output_part_index += num_parts ;
                }
            }
        }
        // end of this partitioning dimension

        //set the current num parts for next dim partitioning
        current_num_parts = output_part_count_in_dimension;

        //swap the coordinate permutations for the next dimension.
        pq_lno_t * tmp = this->coordinate_permutations;
        this->coordinate_permutations = this->new_coordinate_permutations;
        this->new_coordinate_permutations = tmp;

        freeArray<pq_lno_t>(this->part_xadj);
        this->part_xadj = this->new_part_xadj;
    }

    //copy the final permutation back into the caller's array.
    for(pq_lno_t i = 0; i < num_total_coords; ++i){
      inital_adjList_output_adjlist[i] = this->coordinate_permutations[i];
    }

    //export the part boundaries (prefix sums) to the caller.
    for(size_t i = 0; i < this->num_global_parts ; ++i){
        output_xadj[i] = this->part_xadj[i];
    }

    delete future_num_part_in_parts;
    delete next_future_num_parts_in_parts;

    //free the extra memory that we allocated.
    freeArray<partId_t>(this->assigned_part_ids);
    freeArray<pq_gno_t>(this->initial_mj_gnos);
    freeArray<pq_gno_t>(this->current_mj_gnos);
    freeArray<bool>(tmp_mj_uniform_weights);
    freeArray<bool>(tmp_mj_uniform_parts);
    freeArray<pq_scalar_t *>(tmp_mj_weights);
    freeArray<pq_scalar_t *>(tmp_mj_part_sizes);

    this->free_work_memory();

#ifdef HAVE_ZOLTAN2_OMP
    //restore the caller's OpenMP thread count.
    omp_set_num_threads(actual_num_threads);
#endif
}
01574 
/*! \brief Default constructor. Zero/empty-initializes every member in the
 * initializer list; only the floating-point tolerances and the scalar
 * extrema (fEpsilon, sEpsilon, maxScalar_t, minScalar_t) are computed in
 * the body from numeric_limits.
 */
template <typename pq_scalar_t, typename pq_lno_t, typename pq_gno_t>
AlgMJ<pq_scalar_t, pq_lno_t, pq_gno_t>::AlgMJ():
  mj_env(), mj_problemComm(), imbalance_tolerance(0),
  part_no_array(NULL), recursion_depth(0), coord_dim(0),
  num_weights_per_coord(0), initial_num_loc_coords(0), initial_num_glob_coords(0),
  num_local_coords(0), num_global_coords(0), mj_coordinates(NULL),
  mj_weights(NULL), mj_uniform_parts(NULL), mj_part_sizes(NULL),
  mj_uniform_weights(NULL), mj_gnos(), num_global_parts(1),
  initial_mj_gnos(NULL), current_mj_gnos(NULL), owner_of_coordinate(NULL),
  coordinate_permutations(NULL), new_coordinate_permutations(NULL),
  assigned_part_ids(NULL), part_xadj(NULL), new_part_xadj(NULL),
  distribute_points_on_cut_lines(true), max_concurrent_part_calculation(1),
  mj_run_as_rcb(0), mj_user_recursion_depth(0), mj_keep_part_boxes(0),
  check_migrate_avoid_migration_option(0), minimum_migration_imbalance(0.30),
  num_threads(1), total_num_cut(0), total_num_part(0), max_num_part_along_dim(0),
  max_num_cut_along_dim(0), max_num_total_part_along_dim(0), total_dim_num_reduce_all(0),
  last_dim_num_part(0), comm(), fEpsilon(0), sEpsilon(0), maxScalar_t(0), minScalar_t(0),
  all_cut_coordinates(NULL), max_min_coords(NULL), process_cut_line_weight_to_put_left(NULL),
  thread_cut_line_weight_to_put_left(NULL), cut_coordinates_work_array(NULL),
  target_part_weights(NULL), cut_upper_bound_coordinates(NULL), cut_lower_bound_coordinates(NULL),
  cut_lower_bound_weights(NULL), cut_upper_bound_weights(NULL),
  process_local_min_max_coord_total_weight(NULL), global_min_max_coord_total_weight(NULL),
  is_cut_line_determined(NULL), my_incomplete_cut_count(NULL),
  thread_part_weights(NULL), thread_part_weight_work(NULL),
  thread_cut_left_closest_point(NULL), thread_cut_right_closest_point(NULL),
  thread_point_counts(NULL), process_rectelinear_cut_weight(NULL),
  global_rectelinear_cut_weight(NULL),total_part_weight_left_right_closests(NULL),
  global_total_part_weight_left_right_closests(NULL),kept_boxes(),
  myRank(0), myActualRank(0)
  {
  //tolerance for comparing part-count arithmetic (float granularity).
  this->fEpsilon = numeric_limits<float>::epsilon();
  //tolerance for comparing coordinates/weights, scaled for safety.
  this->sEpsilon = numeric_limits<pq_scalar_t>::epsilon() * 100;

    this->maxScalar_t = numeric_limits<pq_scalar_t>::max();
    //use -max() rather than min() so it is a large negative value for floats.
    this->minScalar_t = -numeric_limits<pq_scalar_t>::max();

}
01615 
01619 template <typename pq_scalar_t, typename pq_lno_t, typename pq_gno_t>
01620 RCP < vector <coordinateModelPartBox <pq_scalar_t, partId_t> > > AlgMJ<pq_scalar_t, pq_lno_t, pq_gno_t>::get_part_boxes(){
01621 
01622   if (this->mj_keep_part_boxes){
01623     return this->kept_boxes;
01624   } else {
01625     cerr << "Warning: part boxes are not stored - "
01626         << "too store part boxes call set_to_keep_part_boxes() before partitioning" << endl;
01627     RCP < vector <coordinateModelPartBox <pq_scalar_t, partId_t> > > a;
01628     return a;
01629   }
01630 }
01634 template <typename pq_scalar_t, typename pq_lno_t, typename pq_gno_t>
01635 void AlgMJ<pq_scalar_t, pq_lno_t, pq_gno_t>::set_to_keep_part_boxes(){
01636   this->mj_keep_part_boxes = 1;
01637 }
01638 
01639 
01640 
01641 
01642 
01643 /* \brief Either the mj array (part_no_array) or num_global_parts should be provided in
01644  * the input. part_no_array takes
01645  * precedence if both are provided.
01646  * Depending on these parameters, total cut/part number,
01647  * maximum part/cut number along a dimension, estimated number of reduceAlls,
01648  * and the number of parts before the last dimension is calculated.
01649  * */
template <typename pq_scalar_t, typename pq_lno_t, typename pq_gno_t>
void AlgMJ<pq_scalar_t, pq_lno_t, pq_gno_t>::set_part_specifications(){

  this->total_num_cut = 0; //how many cuts will be totally
  this->total_num_part = 1;    //how many parts will be totally
  this->max_num_part_along_dim = 0;         //maximum part count along a dimension.
  this->total_dim_num_reduce_all = 0;    //estimate on #reduceAlls can be done.
  this->last_dim_num_part = 1; //max no of parts that might occur
  //during the partition before the
  //last partitioning dimension.
  this->max_num_cut_along_dim = 0;
  this->max_num_total_part_along_dim = 0;

  if (this->part_no_array){
    //if user provided part array, traverse the array and set variables.
    for (int i = 0; i < this->recursion_depth; ++i){
      //each level performs one reduceAll per current part.
      this->total_dim_num_reduce_all += this->total_num_part;
      this->total_num_part *= this->part_no_array[i];
      if(this->part_no_array[i] > this->max_num_part_along_dim) {
        this->max_num_part_along_dim = this->part_no_array[i];
      }
    }
    //NOTE(review): assumes recursion_depth >= 1 and every entry >= 1;
    //confirm that the array is validated upstream before this division.
    this->last_dim_num_part = this->total_num_part / this->part_no_array[recursion_depth-1];
    this->num_global_parts = this->total_num_part;
  } else {
    partId_t future_num_parts = this->num_global_parts;

    //we need to calculate the part numbers now, to determine the maximum along the dimensions.
    for (int i = 0; i < this->recursion_depth; ++i){

      //part count for level i: the (recursion_depth - i)-th root of the
      //remaining part count, rounded up.
      partId_t maxNoPartAlongI = this->get_part_count(
          future_num_parts, 1.0f / (this->recursion_depth - i));

      if (maxNoPartAlongI > this->max_num_part_along_dim){
        this->max_num_part_along_dim = maxNoPartAlongI;
      }

      //ceiling division: parts remaining for the following levels.
      partId_t nfutureNumParts = future_num_parts / maxNoPartAlongI;
      if (future_num_parts % maxNoPartAlongI){
        ++nfutureNumParts;
      }
      future_num_parts = nfutureNumParts;
    }
    this->total_num_part = this->num_global_parts;
    //estimate reduceAll Count here.
    //we find the upperbound instead.
    partId_t p = 1;
    for (int i = 0; i < this->recursion_depth; ++i){
      this->total_dim_num_reduce_all += p;
      p *= this->max_num_part_along_dim;
    }

    //upper bound on the part count before the last dimension.
    this->last_dim_num_part  = p / this->max_num_part_along_dim;
  }

  this->total_num_cut = this->total_num_part - 1;
  this->max_num_cut_along_dim = this->max_num_part_along_dim - 1;
  this->max_num_total_part_along_dim = this->max_num_part_along_dim + size_t(this->max_num_cut_along_dim);
  //maxPartNo is P, maxCutNo = P-1, matTotalPartcount = 2P-1

  //refine the concurrent part count, if it is given bigger than the maximum possible part count.
    if(this->max_concurrent_part_calculation > this->last_dim_num_part){
        if(this->mj_problemComm->getRank() == 0){
            cerr << "Warning: Concurrent part count ("<< this->max_concurrent_part_calculation <<
            ") has been set bigger than maximum amount that can be used." <<
            " Setting to:" << this->last_dim_num_part << "." << endl;
        }
        this->max_concurrent_part_calculation = this->last_dim_num_part;
    }

}
01721 /* \brief Tries to determine the part number for current dimension,
01722  * by trying to make the partitioning as square as possible.
01723  * \param num_total_future how many more partitionings are required.
01724  * \param root the exponent used to take the root, i.e., 1.0 / (number of remaining recursion levels).
01725  */
01726 template <typename pq_scalar_t, typename pq_lno_t, typename pq_gno_t>
01727 inline partId_t AlgMJ<pq_scalar_t, pq_lno_t, pq_gno_t>::get_part_count(
01728     partId_t num_total_future,
01729     double root){
01730   double fp = pow(num_total_future, root);
01731   partId_t ip = partId_t (fp);
01732   if (fp - ip < this->fEpsilon * 100){
01733     return ip;
01734   }
01735   else {
01736     return ip  + 1;
01737   }
01738 }
01739 
01740 /* \brief Function returns how many parts that will be obtained after this dimension partitioning.
01741  * It sets how many parts each current part will be partitioned into in this dimension to num_partitioning_in_current_dim vector,
01742  * sets how many total future parts each obtained part will be partitioned into in next_future_num_parts_in_parts vector,
01743  * If part boxes are kept, then initializes each entry of output_part_boxes from its ancestor box.
01744  *
01745  *  \param num_partitioning_in_current_dim: output. How many parts each current part will be partitioned into.
01746  *  \param future_num_part_in_parts: input, how many future parts each current part will be partitioned into.
01747  *  \param next_future_num_parts_in_parts: output, how many future parts each obtained part will be partitioned into.
01748  *  \param future_num_parts: output, max number of future parts that will be obtained from a single part.
01749  *  \param current_num_parts: input, how many parts are there currently.
01750  *  \param current_iteration: input, current dimension iteration number.
01751  *  \param input_part_boxes: input, if boxes are kept, current boxes.
01752  *  \param output_part_boxes: output, if boxes are kept, the initial box boundaries for obtained parts.
01753  */
template <typename pq_scalar_t, typename pq_lno_t, typename pq_gno_t>
partId_t AlgMJ<pq_scalar_t, pq_lno_t, pq_gno_t>::update_part_num_arrays(
  vector <partId_t> &num_partitioning_in_current_dim, //assumes this vector is empty.
    vector<partId_t> *future_num_part_in_parts,
    vector<partId_t> *next_future_num_parts_in_parts, //assumes this vector is empty.
    partId_t &future_num_parts,
    partId_t current_num_parts,
    int current_iteration,
    RCP < vector <coordinateModelPartBox <pq_scalar_t, partId_t> > > input_part_boxes,
    RCP < vector <coordinateModelPartBox <pq_scalar_t, partId_t> > > output_part_boxes
){
  //how many parts that will be obtained after this dimension.
    partId_t output_num_parts = 0;
    if(this->part_no_array){
        //when the partNo array is provided as input,
        //each current partition will be partition to the same number of parts.
        //we dont need to use the future_num_part_in_parts vector in this case.

        partId_t p = this->part_no_array[current_iteration];
        //a non-positive entry in the user-supplied array is a hard error.
        if (p < 1){
            cout << "i:" << current_iteration << " p is given as:" << p << endl;
            exit(1);
        }
        //p == 1 means this dimension does not split anything; the output
        //vectors are intentionally left untouched and the caller keeps
        //the current part count.
        if (p == 1){
            return current_num_parts;
        }

        //every current part splits into exactly p parts in this dimension.
        for (partId_t ii = 0; ii < current_num_parts; ++ii){
            num_partitioning_in_current_dim.push_back(p);

        }
        //cout << "me:" << this->myRank << " current_iteration" << current_iteration <<
        //" current_num_parts:" << current_num_parts << endl;
        //cout << "num_partitioning_in_current_dim[0]:" << num_partitioning_in_current_dim[0] << endl;
        //set the new value of future_num_parts.

        /*
        cout << "\tfuture_num_parts:" << future_num_parts
            << " num_partitioning_in_current_dim[0]:" << num_partitioning_in_current_dim[0]
            << future_num_parts/ num_partitioning_in_current_dim[0] << endl;
        */

        //integer division; assumes future_num_parts is divisible by p,
        //which holds when the part_no_array entries multiply to the
        //total global part count.
        future_num_parts /= num_partitioning_in_current_dim[0];
        output_num_parts = current_num_parts * num_partitioning_in_current_dim[0];

        if (this->mj_keep_part_boxes){
            for (partId_t k = 0; k < current_num_parts; ++k){
              //initialized the output boxes as its ancestor.
              for (partId_t j = 0; j < num_partitioning_in_current_dim[0]; ++j){
                    output_part_boxes->push_back((*input_part_boxes)[k]);
                }
            }
        }

        //set the how many more parts each part will be divided.
        //this is obvious when partNo array is provided as input.
        //however, fill this so that weights will be calculated according to this array.
        for (partId_t ii = 0; ii < output_num_parts; ++ii){
            next_future_num_parts_in_parts->push_back(future_num_parts);
        }
    }
    else {
        //if partNo array is not provided as input,
        //future_num_part_in_parts  holds how many parts each part should be divided.
        //initially it holds a single number equal to the total number of global parts.

        //calculate the future_num_parts from beginning,
        //since each part might be divided into different number of parts.
        future_num_parts = 1;

        //cout << "i:" << i << endl;

        for (partId_t ii = 0; ii < current_num_parts; ++ii){
            //get how many parts a part should be divided.
            partId_t future_num_parts_of_part_ii = (*future_num_part_in_parts)[ii];

            //get the ideal number of parts that is close to the
            //(recursion_depth - i) root of the future_num_parts_of_part_ii.
            partId_t num_partitions_in_current_dim =
                      this->get_part_count(
                          future_num_parts_of_part_ii,
                          1.0 / (this->recursion_depth - current_iteration)
                                  );

            //sanity check: the precomputed upper bound should never be exceeded.
            if (num_partitions_in_current_dim > this->max_num_part_along_dim){
                cerr << "ERROR: maxPartNo calculation is wrong." << endl;
                exit(1);
            }
            //add this number to num_partitioning_in_current_dim vector.
            num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);


            //increase the output number of parts.
            output_num_parts += num_partitions_in_current_dim;

            //ideal number of future partitions for each part.
            partId_t ideal_num_future_parts_in_part = future_num_parts_of_part_ii / num_partitions_in_current_dim;
            for (partId_t iii = 0; iii < num_partitions_in_current_dim; ++iii){
                partId_t num_future_parts_for_part_iii = ideal_num_future_parts_in_part;

                //if there is a remainder in the part increase the part weight.
                //the first (remainder) obtained parts each absorb one extra future part.
                if (iii < future_num_parts_of_part_ii % num_partitions_in_current_dim){
                    //if not uniform, add 1 for the extra parts.
                    ++num_future_parts_for_part_iii;
                }
                next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii);

                //if part boxes are stored, initialize the box of the parts as the ancestor.
                if (this->mj_keep_part_boxes){
                    output_part_boxes->push_back((*input_part_boxes)[ii]);
                }

                //set num future_num_parts to maximum in this part.
                if (num_future_parts_for_part_iii > future_num_parts) future_num_parts = num_future_parts_for_part_iii;
            }
        }
    }
    return output_num_parts;
}
01873 
01874 
01875 /* \brief Allocates and initializes the work memory that will be used by MJ.
01876  *
01877  * */
template <typename pq_scalar_t, typename pq_lno_t, typename pq_gno_t>
void AlgMJ<pq_scalar_t, pq_lno_t, pq_gno_t>::allocate_set_work_memory(){

  //points to process that initially owns the coordinate.
  this->owner_of_coordinate  = NULL;

  //Throughout the partitioning execution,
  //instead of the moving the coordinates, hold a permutation array for parts.
  //coordinate_permutations holds the current permutation.
  this->coordinate_permutations =  allocMemory< pq_lno_t>(this->num_local_coords);
  //initial configuration, set each pointer-i to i.
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
  for(pq_lno_t i = 0; i < this->num_local_coords; ++i){
    this->coordinate_permutations[i] = i;
  }

  //new_coordinate_permutations holds the permutation being built for the
  //next iteration; the two arrays are swapped between iterations.
  this->new_coordinate_permutations = allocMemory< pq_lno_t>(this->num_local_coords);

  this->assigned_part_ids = NULL;
  if(this->num_local_coords > 0){
    this->assigned_part_ids = allocMemory<partId_t>(this->num_local_coords);
  }

  //single partition starts at index-0, and ends at numLocalCoords
  //inTotalCounts array holds the end points in coordinate_permutations array
  //for each partition. Initially sized 1, and single element is set to numLocalCoords.
  this->part_xadj = allocMemory<pq_lno_t>(1);
  this->part_xadj[0] = static_cast<pq_lno_t>(this->num_local_coords);//the end of the initial partition is the end of coordinates.
  //the ends points of the output, this is allocated later.
  this->new_part_xadj = NULL;

  // only store this much if cuts are needed to be stored.
  //this->all_cut_coordinates = allocMemory< pq_scalar_t>(this->total_num_cut);

  //cut coordinates for all concurrently processed parts.
  this->all_cut_coordinates  = allocMemory< pq_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);

  //per-thread scratch: first num_threads entries hold mins, next num_threads hold maxs.
  this->max_min_coords =  allocMemory< pq_scalar_t>(this->num_threads * 2);

  this->process_cut_line_weight_to_put_left = NULL; //how much weight percentage should a MPI put left side of the each cutline
  this->thread_cut_line_weight_to_put_left = NULL; //how much weight percentage should each thread in MPI put left side of the each cutline
  //distribute_points_on_cut_lines = false;
  //these arrays are only needed when points lying exactly on a cut line
  //may be split between the two sides.
  if(this->distribute_points_on_cut_lines){
    this->process_cut_line_weight_to_put_left = allocMemory<pq_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
    this->thread_cut_line_weight_to_put_left = allocMemory<pq_scalar_t *>(this->num_threads);
    for(int i = 0; i < this->num_threads; ++i){
      this->thread_cut_line_weight_to_put_left[i] = allocMemory<pq_scalar_t>(this->max_num_cut_along_dim);
    }
      this->process_rectelinear_cut_weight = allocMemory<pq_scalar_t>(this->max_num_cut_along_dim);
      this->global_rectelinear_cut_weight = allocMemory<pq_scalar_t>(this->max_num_cut_along_dim);
  }


  // work array to manipulate coordinate of cutlines in different iterations.
  //necessary because previous cut line information is used for determining
  //the next cutline information. therefore, cannot update the cut work array
  //until all cutlines are determined.
  this->cut_coordinates_work_array = allocMemory<pq_scalar_t>(this->max_num_cut_along_dim *
      this->max_concurrent_part_calculation);


  //cumulative part weight array.
  this->target_part_weights = allocMemory<pq_scalar_t>(
          this->max_num_part_along_dim * this->max_concurrent_part_calculation);
  // the weight from left to right.

    this->cut_upper_bound_coordinates = allocMemory<pq_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);  //upper bound coordinate of a cut line
    this->cut_lower_bound_coordinates = allocMemory<pq_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation);  //lower bound coordinate of a cut line
    this->cut_lower_bound_weights = allocMemory<pq_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation);  //lower bound weight of a cut line
    this->cut_upper_bound_weights = allocMemory<pq_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation);  //upper bound weight of a cut line

    this->process_local_min_max_coord_total_weight = allocMemory<pq_scalar_t>(3 * this->max_concurrent_part_calculation); //combined array to exchange the min and max coordinate, and total weight of part.
    this->global_min_max_coord_total_weight = allocMemory<pq_scalar_t>(3 * this->max_concurrent_part_calculation);//global combined array with the results for min, max and total weight.

    //is_cut_line_determined is used to determine if a cutline is determined already.
    //If a cut line is already determined, the next iterations will skip this cut line.
    this->is_cut_line_determined = allocMemory<bool>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
    //my_incomplete_cut_count count holds the number of cutlines that have not been finalized for each part
    //when concurrentPartCount>1, using this information, if my_incomplete_cut_count[x]==0, then no work is done for this part.
    this->my_incomplete_cut_count =  allocMemory<partId_t>(this->max_concurrent_part_calculation);
    //local part weights of each thread.
    this->thread_part_weights = allocMemory<double *>(this->num_threads);
    //the work manupulation array for partweights.
    this->thread_part_weight_work = allocMemory<double *>(this->num_threads);

    //thread_cut_left_closest_point to hold the closest coordinate to a cutline from left (for each thread).
    this->thread_cut_left_closest_point = allocMemory<pq_scalar_t *>(this->num_threads);
    //thread_cut_right_closest_point to hold the closest coordinate to a cutline from right (for each thread)
    this->thread_cut_right_closest_point = allocMemory<pq_scalar_t *>(this->num_threads);

    //to store how many points in each part a thread has.
    this->thread_point_counts = allocMemory<pq_lno_t *>(this->num_threads);

    //per-thread second-level allocations for the arrays above.
    for(int i = 0; i < this->num_threads; ++i){
        //partWeights[i] = allocMemory<pq_scalar_t>(maxTotalPartCount);
        this->thread_part_weights[i] = allocMemory < double >(this->max_num_total_part_along_dim * this->max_concurrent_part_calculation);
        this->thread_cut_right_closest_point[i] = allocMemory<pq_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
        this->thread_cut_left_closest_point[i] = allocMemory<pq_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
        this->thread_point_counts[i] =  allocMemory<pq_lno_t>(this->max_num_part_along_dim);
    }
    //for faster communication, concatanation of
    //totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines
    //leftClosest distances sized P-1, since P-1 cut lines
    //rightClosest distances size P-1, since P-1 cut lines.
    this->total_part_weight_left_right_closests = allocMemory<pq_scalar_t>((this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
    this->global_total_part_weight_left_right_closests = allocMemory<pq_scalar_t>((this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);

    //deep-copy the input coordinates so the algorithm can permute/overwrite
    //them without touching the caller's data.
    pq_scalar_t **coord = allocMemory<pq_scalar_t *>(this->coord_dim);
    for (int i=0; i < this->coord_dim; i++){
      coord[i] = allocMemory<pq_scalar_t>(this->num_local_coords);
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
      for (pq_lno_t j=0; j < this->num_local_coords; j++)
        coord[i][j] = this->mj_coordinates[i][j];
    }
    this->mj_coordinates = coord;

    //deep-copy the weights as well; at least one criteria slot is kept
    //even when no weights are provided.
    int criteria_dim = (this->num_weights_per_coord ? this->num_weights_per_coord : 1);
    pq_scalar_t **weights = allocMemory<pq_scalar_t *>(criteria_dim);

    for (int i=0; i < criteria_dim; i++){
      weights[i] = NULL;
    }
    for (int i=0; i < this->num_weights_per_coord; i++){
      weights[i] = allocMemory<pq_scalar_t>(this->num_local_coords);
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
      for (pq_lno_t j=0; j < this->num_local_coords; j++)
        weights[i][j] = this->mj_weights[i][j];

    }
  this->mj_weights = weights;
    //working copy of the global numbers; updated as coordinates migrate.
    this->current_mj_gnos = allocMemory<pq_gno_t>(this->num_local_coords);
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
    for (pq_lno_t j=0; j < this->num_local_coords; j++)
      this->current_mj_gnos[j] = this->initial_mj_gnos[j];

    //record this rank as the initial owner of every local coordinate.
    this->owner_of_coordinate = allocMemory<int>(this->num_local_coords);

#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
    for (pq_lno_t j=0; j < this->num_local_coords; j++)
      this->owner_of_coordinate[j] = this->myActualRank;
}
02032 
02033 /* \brief for part communication we keep track of the box boundaries.
02034  * This is performed when either asked specifically, or when geometric mapping is performed afterwards.
02035  * This function initializes a single box with all global min and max coordinates.
02036  * \param initial_partitioning_boxes the input and output vector for boxes.
02037  */
02038 template <typename pq_scalar_t, typename pq_lno_t, typename pq_gno_t>
02039 void AlgMJ<pq_scalar_t, pq_lno_t, pq_gno_t>::init_part_boxes(
02040     RCP < vector <coordinateModelPartBox <pq_scalar_t, partId_t> > > & initial_partitioning_boxes
02041     ){
02042 
02043   //local min coords
02044     pq_scalar_t *mins = allocMemory<pq_scalar_t>(this->coord_dim);
02045     //global min coords
02046     pq_scalar_t *gmins = allocMemory<pq_scalar_t>(this->coord_dim);
02047     //local max coords
02048     pq_scalar_t *maxs = allocMemory<pq_scalar_t>(this->coord_dim);
02049     //global max coords
02050     pq_scalar_t *gmaxs = allocMemory<pq_scalar_t>(this->coord_dim);
02051 
02052     for (int i = 0; i < this->coord_dim; ++i){
02053         pq_scalar_t localMin = this->mj_coordinates[i][0];
02054         pq_scalar_t localMax = this->mj_coordinates[i][0];
02055         for (pq_lno_t j = 1; j < this->num_local_coords; ++j){
02056             //cout << " pqJagged_coordinates[i][i]:" <<
02057             //pqJagged_coordinates[i][j] << endl;
02058             if (this->mj_coordinates[i][j] < localMin){
02059                 localMin = this->mj_coordinates[i][j];
02060             }
02061             if (this->mj_coordinates[i][j] > localMax){
02062                 localMax = this->mj_coordinates[i][j];
02063             }
02064         }
02065         //cout << " localMin:" << localMin << endl;
02066         //cout << " localMax:" << localMax << endl;
02067         mins[i] = localMin;
02068         maxs[i] = localMax;
02069     }
02070     reduceAll<int, pq_scalar_t>(*this->comm, Teuchos::REDUCE_MIN,
02071             this->coord_dim, mins, gmins
02072     );
02073 
02074     reduceAll<int, pq_scalar_t>(*this->comm, Teuchos::REDUCE_MAX,
02075             this->coord_dim, maxs, gmaxs
02076     );
02077 
02078     //create single box with all areas.
02079     coordinateModelPartBox <pq_scalar_t, partId_t> tmpBox (0, this->coord_dim,
02080                                                         gmins, gmaxs);
02081     //coordinateModelPartBox <pq_scalar_t, partId_t> tmpBox (0, coordDim);
02082     freeArray<pq_scalar_t>(mins);
02083     freeArray<pq_scalar_t>(gmins);
02084     freeArray<pq_scalar_t>(maxs);
02085     freeArray<pq_scalar_t>(gmaxs);
02086     initial_partitioning_boxes->push_back(tmpBox);
02087 }
02088 
02099 template <typename pq_scalar_t, typename pq_lno_t, typename pq_gno_t>
02100 void AlgMJ<pq_scalar_t, pq_lno_t, pq_gno_t>::mj_get_local_min_max_coord_totW(
02101     pq_lno_t coordinate_begin_index,
02102     pq_lno_t coordinate_end_index,
02103     pq_lno_t *mj_current_coordinate_permutations,
02104     pq_scalar_t *mj_current_dim_coords,
02105     pq_scalar_t &min_coordinate,
02106     pq_scalar_t &max_coordinate,
02107     pq_scalar_t &total_weight){
02108 
02109     //if the part is empty.
02110     //set the min and max coordinates as reverse.
02111     if(coordinate_begin_index >= coordinate_end_index)
02112     {
02113         min_coordinate = this->maxScalar_t;
02114         max_coordinate = this->minScalar_t;
02115         total_weight = 0;
02116     }
02117     else {
02118         pq_scalar_t my_total_weight = 0;
02119 #ifdef HAVE_ZOLTAN2_OMP
02120 #pragma omp parallel
02121 #endif
02122         {
02123             //if uniform weights are used, then weight is equal to count.
02124             if (this->mj_uniform_weights[0]) {
02125 #ifdef HAVE_ZOLTAN2_OMP
02126 #pragma omp single
02127 #endif
02128                 {
02129                     my_total_weight = coordinate_end_index - coordinate_begin_index;
02130                 }
02131 
02132             }
02133             else {
02134                 //if not uniform, then weights are reducted from threads.
02135 #ifdef HAVE_ZOLTAN2_OMP
02136 #pragma omp for reduction(+:my_total_weight)
02137 #endif
02138                 for (pq_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
02139                     int i = mj_current_coordinate_permutations[ii];
02140                     my_total_weight += this->mj_weights[0][i];
02141                 }
02142             }
02143 
02144             int my_thread_id = 0;
02145 #ifdef HAVE_ZOLTAN2_OMP
02146             my_thread_id = omp_get_thread_num();
02147 #endif
02148             pq_scalar_t my_thread_min_coord, my_thread_max_coord;
02149             my_thread_min_coord=my_thread_max_coord
02150                 =mj_current_dim_coords[mj_current_coordinate_permutations[coordinate_begin_index]];
02151 
02152 
02153 #ifdef HAVE_ZOLTAN2_OMP
02154 #pragma omp for
02155 #endif
02156             for(pq_lno_t j = coordinate_begin_index + 1; j < coordinate_end_index; ++j){
02157                 int i = mj_current_coordinate_permutations[j];
02158                 if(mj_current_dim_coords[i] > my_thread_max_coord)
02159                     my_thread_max_coord = mj_current_dim_coords[i];
02160                 if(mj_current_dim_coords[i] < my_thread_min_coord)
02161                     my_thread_min_coord = mj_current_dim_coords[i];
02162             }
02163             this->max_min_coords[my_thread_id] = my_thread_min_coord;
02164             this->max_min_coords[my_thread_id + this->num_threads] = my_thread_max_coord;
02165 
02166 #ifdef HAVE_ZOLTAN2_OMP
02167 //we need a barrier here, because max_min_array might not be filled by some of the threads.
02168 #pragma omp barrier
02169 #pragma omp single nowait
02170 #endif
02171             {
02172                 min_coordinate = this->max_min_coords[0];
02173                 for(int i = 1; i < this->num_threads; ++i){
02174                     if(this->max_min_coords[i] < min_coordinate)
02175                         min_coordinate = this->max_min_coords[i];
02176                 }
02177             }
02178 
02179 #ifdef HAVE_ZOLTAN2_OMP
02180 #pragma omp single nowait
02181 #endif
02182             {
02183                 max_coordinate = this->max_min_coords[this->num_threads];
02184                 for(int i = this->num_threads + 1; i < this->num_threads * 2; ++i){
02185                     if(this->max_min_coords[i] > max_coordinate)
02186                         max_coordinate = this->max_min_coords[i];
02187                 }
02188             }
02189         }
02190         total_weight = my_total_weight;
02191     }
02192 }
02193 
02194 
02202 template <typename pq_scalar_t, typename pq_lno_t, typename pq_gno_t>
02203 void AlgMJ<pq_scalar_t, pq_lno_t, pq_gno_t>::mj_get_global_min_max_coord_totW(
02204     partId_t current_concurrent_num_parts,
02205     pq_scalar_t *local_min_max_total,
02206     pq_scalar_t *global_min_max_total){
02207 
02208   //reduce min for first current_concurrent_num_parts elements, reduce max for next
02209   //concurrentPartCount elements,
02210   //reduce sum for the last concurrentPartCount elements.
02211   if(this->comm->getSize()  > 1){
02212     Teuchos::PQJaggedCombinedMinMaxTotalReductionOp<int, pq_scalar_t>
02213       reductionOp(
02214           current_concurrent_num_parts,
02215           current_concurrent_num_parts,
02216           current_concurrent_num_parts);
02217     try{
02218       reduceAll<int, pq_scalar_t>(
02219           *(this->comm),
02220           reductionOp,
02221           3 * current_concurrent_num_parts,
02222           local_min_max_total,
02223           global_min_max_total);
02224     }
02225     Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
02226   }
02227   else {
02228     partId_t s = 3 * current_concurrent_num_parts;
02229     for (partId_t i = 0; i < s; ++i){
02230       global_min_max_total[i] = local_min_max_total[i];
02231     }
02232   }
02233 }
02234 
02235 
02236 
02255 template <typename pq_scalar_t, typename pq_lno_t, typename pq_gno_t>
02256 void AlgMJ<pq_scalar_t, pq_lno_t, pq_gno_t>::mj_get_initial_cut_coords_target_weights(
02257     pq_scalar_t min_coord,
02258     pq_scalar_t max_coord,
02259     partId_t num_cuts/*p-1*/ ,
02260     pq_scalar_t global_weight,
02261     pq_scalar_t *initial_cut_coords /*p - 1 sized, coordinate of each cut line*/,
02262     pq_scalar_t *current_target_part_weights /*cumulative weights, at left side of each cut line. p-1 sized*/,
02263 
02264     vector <partId_t> *future_num_part_in_parts, //the vecto
02265     vector <partId_t> *next_future_num_parts_in_parts,
02266     partId_t concurrent_current_part,
02267     partId_t obtained_part_index
02268 ){
02269 
02270     pq_scalar_t coord_range = max_coord - min_coord;
02271     if(this->mj_uniform_parts[0]){
02272         {
02273             partId_t cumulative = 0;
02274             //how many total future parts the part will be partitioned into.
02275             pq_scalar_t total_future_part_count_in_part = pq_scalar_t((*future_num_part_in_parts)[concurrent_current_part]);
02276 
02277 
02278             //how much each part should weigh in ideal case.
02279             pq_scalar_t unit_part_weight = global_weight / total_future_part_count_in_part;
02280             /*
02281             cout << "total_future_part_count_in_part:" << total_future_part_count_in_part << endl;
02282             cout << "global_weight:" << global_weight << endl;
02283             cout << "unit_part_weight" << unit_part_weight <<endl;
02284             */
02285             for(partId_t i = 0; i < num_cuts; ++i){
02286                 cumulative += (*next_future_num_parts_in_parts)[i + obtained_part_index];
02287 
02288                 /*
02289                 cout << "obtained_part_index:" << obtained_part_index <<
02290                     " (*next_future_num_parts_in_parts)[i + obtained_part_index]:" << (*next_future_num_parts_in_parts)[i + obtained_part_index] <<
02291                     " cumulative:" << cumulative << endl;
02292                 */
02293                 //set target part weight.
02294                 current_target_part_weights[i] = cumulative * unit_part_weight;
02295                 //cout <<"i:" << i << " current_target_part_weights:" << current_target_part_weights[i] << endl;
02296                 //set initial cut coordinate.
02297                 initial_cut_coords[i] = min_coord + (coord_range *
02298                                          cumulative) / total_future_part_count_in_part;
02299             }
02300             current_target_part_weights[num_cuts] = 1;
02301         }
02302 
02303         //round the target part weights.
02304         if (this->mj_uniform_weights[0]){
02305           for(partId_t i = 0; i < num_cuts + 1; ++i){
02306                 current_target_part_weights[i] = long(current_target_part_weights[i] + 0.5);
02307             }
02308         }
02309     }
02310     else {
02311       cerr << "MJ does not support non uniform part weights" << endl;
02312       exit(1);
02313     }
02314 }
02315 
02316 
02329 template <typename pq_scalar_t, typename pq_lno_t, typename pq_gno_t>
02330 void AlgMJ<pq_scalar_t, pq_lno_t, pq_gno_t>::set_initial_coordinate_parts(
02331     pq_scalar_t &max_coordinate,
02332     pq_scalar_t &min_coordinate,
02333     partId_t &concurrent_current_part_index,
02334     pq_lno_t coordinate_begin_index,
02335     pq_lno_t coordinate_end_index,
02336     pq_lno_t *mj_current_coordinate_permutations,
02337     pq_scalar_t *mj_current_dim_coords,
02338     partId_t *mj_part_ids,
02339     partId_t &partition_count
02340 ){
02341     pq_scalar_t coordinate_range = max_coordinate - min_coordinate;
02342 
02343     //if there is single point, or if all points are along a line.
02344     //set initial part to 0 for all.
02345     if(ABS(coordinate_range) < this->sEpsilon ){
02346 #ifdef HAVE_ZOLTAN2_OMP
02347 #pragma omp parallel for
02348 #endif
02349         for(pq_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
02350           mj_part_ids[mj_current_coordinate_permutations[ii]] = 0;
02351         }
02352     }
02353     else{
02354 
02355         //otherwise estimate an initial part for each coordinate.
02356         //assuming uniform distribution of points.
02357         pq_scalar_t slice = coordinate_range / partition_count;
02358 
02359 #ifdef HAVE_ZOLTAN2_OMP
02360 #pragma omp parallel for
02361 #endif
02362         for(pq_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
02363 
02364             pq_lno_t iii = mj_current_coordinate_permutations[ii];
02365             partId_t pp = partId_t((mj_current_dim_coords[iii] - min_coordinate) / slice);
02366             mj_part_ids[iii] = 2 * pp;
02367         }
02368     }
02369 }
02370 
02371 
02382 template <typename pq_scalar_t, typename pq_lno_t, typename pq_gno_t>
02383 void AlgMJ<pq_scalar_t, pq_lno_t, pq_gno_t>::mj_1D_part(
02384     pq_scalar_t *mj_current_dim_coords,
02385     pq_scalar_t used_imbalance_tolerance,
02386     partId_t current_work_part,
02387     partId_t current_concurrent_num_parts,
02388     pq_scalar_t *current_cut_coordinates,
02389     partId_t total_incomplete_cut_count,
02390     vector <partId_t> &num_partitioning_in_current_dim
02391 ){
02392 
02393 
02394     partId_t rectelinear_cut_count = 0;
02395     pq_scalar_t *temp_cut_coords = current_cut_coordinates;
02396 
02397     Teuchos::PQJaggedCombinedReductionOp<partId_t, pq_scalar_t>
02398                  *reductionOp = NULL;
02399     reductionOp = new Teuchos::PQJaggedCombinedReductionOp
02400                      <partId_t, pq_scalar_t>(
02401                          &num_partitioning_in_current_dim ,
02402                          current_work_part ,
02403                          current_concurrent_num_parts);
02404 
02405     size_t total_reduction_size = 0;
02406 #ifdef HAVE_ZOLTAN2_OMP
02407 #pragma omp parallel shared(total_incomplete_cut_count,  rectelinear_cut_count)
02408 #endif
02409     {
02410         int me = 0;
02411 #ifdef HAVE_ZOLTAN2_OMP
02412         me = omp_get_thread_num();
02413 #endif
02414         double *my_thread_part_weights = this->thread_part_weights[me];
02415         pq_scalar_t *my_thread_left_closest = this->thread_cut_left_closest_point[me];
02416         pq_scalar_t *my_thread_right_closest = this->thread_cut_right_closest_point[me];
02417 
02418 #ifdef HAVE_ZOLTAN2_OMP
02419 #pragma omp single
02420 #endif
02421             {
02422                 //initialize the lower and upper bounds of the cuts.
02423                 partId_t next = 0;
02424                 for(partId_t i = 0; i < current_concurrent_num_parts; ++i){
02425 
02426                     partId_t num_part_in_dim =  num_partitioning_in_current_dim[current_work_part + i];
02427                     partId_t num_cut_in_dim = num_part_in_dim - 1;
02428                     total_reduction_size += (4 * num_cut_in_dim + 1);
02429 
02430                     for(partId_t ii = 0; ii < num_cut_in_dim; ++ii){
02431                         this->is_cut_line_determined[next] = false;
02432                         this->cut_lower_bound_coordinates[next] = global_min_max_coord_total_weight[i]; //min coordinate
02433                         this->cut_upper_bound_coordinates[next] = global_min_max_coord_total_weight[i + current_concurrent_num_parts]; //max coordinate
02434 
02435                         this->cut_upper_bound_weights[next] = global_min_max_coord_total_weight[i + 2 * current_concurrent_num_parts]; //total weight
02436                         this->cut_lower_bound_weights[next] = 0;
02437 
02438                         if(this->distribute_points_on_cut_lines){
02439                             this->process_cut_line_weight_to_put_left[next] = 0;
02440                         }
02441                         ++next;
02442                     }
02443                 }
02444             }
02445 
02446         //no need to have barrier here.
02447         //pragma omp single have implicit barrier.
02448 
02449         int iteration = 0;
02450         while (total_incomplete_cut_count != 0){
02451             iteration += 1;
02452             //cout << "\niteration:" << iteration  << " ";
02453             partId_t concurrent_cut_shifts = 0;
02454             size_t total_part_shift = 0;
02455 
02456             for (partId_t kk = 0; kk < current_concurrent_num_parts; ++kk){
02457                 partId_t num_parts =  -1;
02458                 num_parts =  num_partitioning_in_current_dim[current_work_part + kk];
02459 
02460                 partId_t num_cuts = num_parts - 1;
02461                 size_t total_part_count = num_parts + size_t (num_cuts) ;
02462                 if (this->my_incomplete_cut_count[kk] > 0){
02463 
02464                     //although isDone shared, currentDone is private and same for all.
02465                     bool *current_cut_status = this->is_cut_line_determined + concurrent_cut_shifts;
02466                     double *my_current_part_weights = my_thread_part_weights + total_part_shift;
02467                     pq_scalar_t *my_current_left_closest = my_thread_left_closest + concurrent_cut_shifts;
02468                     pq_scalar_t *my_current_right_closest = my_thread_right_closest + concurrent_cut_shifts;
02469 
02470                     partId_t conccurent_current_part = current_work_part + kk;
02471                     pq_lno_t coordinate_begin_index = conccurent_current_part ==0 ? 0: this->part_xadj[conccurent_current_part -1];
02472                     pq_lno_t coordinate_end_index = this->part_xadj[conccurent_current_part];
02473                     pq_scalar_t *temp_current_cut_coords = temp_cut_coords + concurrent_cut_shifts;
02474 
02475                     pq_scalar_t min_coord = global_min_max_coord_total_weight[kk];
02476                     pq_scalar_t max_coord = global_min_max_coord_total_weight[kk + current_concurrent_num_parts];
02477 
02478                     // compute part weights using existing cuts
02479                     this->mj_1D_part_get_thread_part_weights(
02480                         total_part_count,
02481                         num_cuts,
02482                         max_coord,//globalMinMaxTotal[kk + concurrentPartCount],//maxScalar,
02483                         min_coord,//globalMinMaxTotal[kk]//minScalar,
02484                         coordinate_begin_index,
02485                         coordinate_end_index,
02486                         mj_current_dim_coords,
02487                         temp_current_cut_coords,
02488                         current_cut_status,
02489                         my_current_part_weights,
02490                         my_current_left_closest,
02491                         my_current_right_closest);
02492 
02493                 }
02494 
02495                 concurrent_cut_shifts += num_cuts;
02496                 total_part_shift += total_part_count;
02497             }
02498 
02499             //sum up the results of threads
02500             this->mj_accumulate_thread_results(
02501                 num_partitioning_in_current_dim,
02502                 current_work_part,
02503                 current_concurrent_num_parts);
02504 
02505             //now sum up the results of mpi processors.
02506 #ifdef HAVE_ZOLTAN2_OMP
02507 #pragma omp single
02508 #endif
02509             {
02510                 if(this->comm->getSize() > 1){
02511                     try{
02512 
02513 
02514                         reduceAll<int, pq_scalar_t>( *(this->comm), *reductionOp,
02515                             total_reduction_size,
02516                             this->total_part_weight_left_right_closests,
02517                             this->global_total_part_weight_left_right_closests);
02518 
02519                     }
02520                     Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
02521                 }
02522                 else {
02523                         memcpy(
02524                           this->global_total_part_weight_left_right_closests,
02525                             this->total_part_weight_left_right_closests,
02526                             total_reduction_size * sizeof(pq_scalar_t));
02527                 }
02528             }
02529 
02530             //how much cut will be shifted for the next part in the concurrent part calculation.
02531             partId_t cut_shift = 0;
02532 
02533             //how much the concantaneted array will be shifted for the next part in concurrent part calculation.
02534             size_t tlr_shift = 0;
02535             for (partId_t kk = 0; kk < current_concurrent_num_parts; ++kk){
02536               partId_t num_parts =  num_partitioning_in_current_dim[current_work_part + kk];
02537               partId_t num_cuts = num_parts - 1;
02538               size_t num_total_part = num_parts + size_t (num_cuts) ;
02539 
02540               //if the cuts of this cut has already been completed.
02541               //nothing to do for this part.
02542               //just update the shift amount and proceed.
02543               if (this->my_incomplete_cut_count[kk] == 0) {
02544                 cut_shift += num_cuts;
02545                 tlr_shift += (num_total_part + 2 * num_cuts);
02546                 continue;
02547               }
02548 
02549               pq_scalar_t *current_local_part_weights = this->total_part_weight_left_right_closests  + tlr_shift ;
02550               pq_scalar_t *current_global_tlr = this->global_total_part_weight_left_right_closests + tlr_shift;
02551               pq_scalar_t *current_global_left_closest_points = current_global_tlr + num_total_part; //left closest points
02552               pq_scalar_t *current_global_right_closest_points = current_global_tlr + num_total_part + num_cuts; //right closest points
02553               pq_scalar_t *current_global_part_weights = current_global_tlr;
02554               bool *current_cut_line_determined = this->is_cut_line_determined + cut_shift;
02555 
02556               pq_scalar_t *current_part_target_weights = this->target_part_weights + cut_shift + kk;
02557               pq_scalar_t *current_part_cut_line_weight_to_put_left = this->process_cut_line_weight_to_put_left + cut_shift;
02558 
02559               pq_scalar_t min_coordinate = global_min_max_coord_total_weight[kk];
02560               pq_scalar_t max_coordinate = global_min_max_coord_total_weight[kk + current_concurrent_num_parts];
02561               pq_scalar_t global_total_weight = global_min_max_coord_total_weight[kk + current_concurrent_num_parts * 2];
02562               pq_scalar_t *current_cut_lower_bound_weights = this->cut_lower_bound_weights + cut_shift;
02563               pq_scalar_t *current_cut_upper_weights = this->cut_upper_bound_weights + cut_shift;
02564               pq_scalar_t *current_cut_upper_bounds = this->cut_upper_bound_coordinates + cut_shift;
02565               pq_scalar_t *current_cut_lower_bounds = this->cut_lower_bound_coordinates + cut_shift;
02566 
02567               partId_t initial_incomplete_cut_count = this->my_incomplete_cut_count[kk];
02568 
02569               // Now compute the new cut coordinates.
02570               this->mj_get_new_cut_coordinates(
02571                   num_total_part,
02572                   num_cuts,
02573                   max_coordinate,
02574                   min_coordinate,
02575                   global_total_weight,
02576                   used_imbalance_tolerance,
02577                   current_global_part_weights,
02578                   current_local_part_weights,
02579                   current_part_target_weights,
02580                   current_cut_line_determined,
02581                   temp_cut_coords + cut_shift,
02582                   current_cut_upper_bounds,
02583                   current_cut_lower_bounds,
02584                   current_global_left_closest_points,
02585                   current_global_right_closest_points,
02586                   current_cut_lower_bound_weights,
02587                   current_cut_upper_weights,
02588                   this->cut_coordinates_work_array +cut_shift, //new cut coordinates
02589                   current_part_cut_line_weight_to_put_left,
02590                   &rectelinear_cut_count,
02591                   this->my_incomplete_cut_count[kk]);
02592 
02593               cut_shift += num_cuts;
02594               tlr_shift += (num_total_part + 2 * num_cuts);
02595               partId_t iteration_complete_cut_count = initial_incomplete_cut_count - this->my_incomplete_cut_count[kk];
02596 #ifdef HAVE_ZOLTAN2_OMP
02597 #pragma omp single
02598 #endif
02599               {
02600                 total_incomplete_cut_count -= iteration_complete_cut_count;
02601               }
02602 
02603             }
02604 #ifdef HAVE_ZOLTAN2_OMP
02605 #pragma omp barrier
02606 #pragma omp single
02607 #endif
02608             {
02609               //swap the cut coordinates for next iteration.
02610                 pq_scalar_t *t = temp_cut_coords;
02611                 temp_cut_coords = this->cut_coordinates_work_array;
02612                 this->cut_coordinates_work_array = t;
02613             }
02614         }
02615 
02616         // Needed only if keep_cuts; otherwise can simply swap array pointers
02617         // cutCoordinates and cutCoordinatesWork.
02618         // (at first iteration, cutCoordinates == cutCoorindates_tmp).
02619         // computed cuts must be in cutCoordinates.
02620         if (current_cut_coordinates != temp_cut_coords){
02621 #ifdef HAVE_ZOLTAN2_OMP
02622 #pragma omp single
02623 #endif
02624           {
02625             partId_t next = 0;
02626             for(partId_t i = 0; i < current_concurrent_num_parts; ++i){
02627               partId_t num_parts = -1;
02628               num_parts = num_partitioning_in_current_dim[current_work_part + i];
02629               partId_t num_cuts = num_parts - 1;
02630 
02631               for(partId_t ii = 0; ii < num_cuts; ++ii){
02632                 current_cut_coordinates[next + ii] = temp_cut_coords[next + ii];
02633               }
02634               next += num_cuts;
02635             }
02636           }
02637 
02638 #ifdef HAVE_ZOLTAN2_OMP
02639 #pragma omp single
02640 #endif
02641             {
02642                 this->cut_coordinates_work_array = temp_cut_coords;
02643             }
02644         }
02645     }
02646     delete reductionOp;
02647 }
02648 
02649 
/*! \brief Computes this thread's partial part weights and closest-point
 * information for the current set of cut lines.
 *
 * Called from inside an OpenMP parallel region; each thread writes only to
 * its own weight/closest arrays, so no synchronization is required here.
 * For every coordinate owned by this thread, a binary search over the cut
 * lines determines which part (or cut line) the coordinate falls on, and the
 * coordinate's weight is accumulated accordingly. Parts and cuts are
 * interleaved in my_current_part_weights: even indices are parts, odd
 * indices are cut lines themselves (coordinates lying exactly on a cut).
 *
 * \param total_part_count number of parts plus number of cuts
 *        (num_parts + num_cuts) for the part being split.
 * \param num_cuts number of cut lines (num_parts - 1).
 * \param max_coord global maximum coordinate in this part's range.
 * \param min_coord global minimum coordinate in this part's range.
 * \param coordinate_begin_index start index (inclusive) into
 *        coordinate_permutations for this part's coordinates.
 * \param coordinate_end_index end index (exclusive) into
 *        coordinate_permutations for this part's coordinates.
 * \param mj_current_dim_coords coordinate values along the current dimension.
 * \param temp_current_cut_coords current (tentative) cut coordinates.
 * \param current_cut_status per-cut "is determined" flags
 *        (NOTE(review): not referenced in this body — presumably kept for
 *        interface symmetry; confirm against callers before removing).
 * \param my_current_part_weights output: this thread's interleaved
 *        part/cut weight array, converted to a prefix sum at the end.
 * \param my_current_left_closest output: per cut, the closest coordinate
 *        strictly on its left seen by this thread.
 * \param my_current_right_closest output: per cut, the closest coordinate
 *        strictly on its right seen by this thread.
 */
template <typename pq_scalar_t, typename pq_lno_t, typename pq_gno_t>
void AlgMJ<pq_scalar_t, pq_lno_t, pq_gno_t>::mj_1D_part_get_thread_part_weights(
    size_t total_part_count,
    partId_t num_cuts,
    pq_scalar_t max_coord,
    pq_scalar_t min_coord,
    pq_lno_t coordinate_begin_index,
    pq_lno_t coordinate_end_index,
    pq_scalar_t *mj_current_dim_coords,
    pq_scalar_t *temp_current_cut_coords,
    bool *current_cut_status,
    double *my_current_part_weights,
    pq_scalar_t *my_current_left_closest,
    pq_scalar_t *my_current_right_closest){

  // Reset this thread's accumulated part/cut weights.
  for (size_t i = 0; i < total_part_count; ++i){
    my_current_part_weights[i] = 0;
  }

  // Initialize left/right closest coordinates to sentinels strictly outside
  // the [min_coord, max_coord] range, so any real coordinate replaces them.
  for(partId_t i = 0; i < num_cuts; ++i){
    my_current_left_closest[i] = min_coord - 1;
    my_current_right_closest[i] = max_coord + 1;
  }
  //pq_lno_t comparison_count = 0;
  pq_scalar_t minus_EPSILON = -this->sEpsilon;
#ifdef HAVE_ZOLTAN2_OMP
  //no need for a barrier here as all threads use only their local memory.
  //dont change the static scheduling here, as it is assumed when the new
  //partitions are created later.
#pragma omp for
#endif
  for (pq_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
    // NOTE(review): i is declared int while coordinate_permutations holds
    // pq_lno_t values — potential narrowing for very large local counts;
    // confirm pq_lno_t fits in int for supported problem sizes.
    int i = this->coordinate_permutations[ii];

    //accesses to assigned_part_ids are thread safe since each coordinate
    //is assigned to only a single thread (static schedule above).
    //Divide by 2 to recover the cut index hint from the interleaved
    //part/cut encoding of the previous iteration; used as the binary
    //search starting point.
    partId_t j = this->assigned_part_ids[i] / 2;

    if(j >= num_cuts){
      j = num_cuts - 1;
    }

    partId_t lower_cut_index = 0;
    partId_t upper_cut_index = num_cuts - 1;

    // Weight of this coordinate: 1 for uniform weights, else the first
    // weight dimension.
    pq_scalar_t w = this->mj_uniform_weights[0]? 1:this->mj_weights[0][i];
    bool is_inserted = false;
    bool is_on_left_of_cut = false;
    bool is_on_right_of_cut = false;
    partId_t last_compared_part = -1;

    pq_scalar_t coord = mj_current_dim_coords[i];

    // Binary search over cut lines, warm-started at j.
    while(upper_cut_index >= lower_cut_index)
    {
      //comparison_count++;
      last_compared_part = -1;
      is_on_left_of_cut = false;
      is_on_right_of_cut = false;
      pq_scalar_t cut = temp_current_cut_coords[j];
      pq_scalar_t distance_to_cut = coord - cut;
      pq_scalar_t abs_distance_to_cut = ABS(distance_to_cut);

      //if the coordinate lies (within sEpsilon) on the cut line itself.
      if(abs_distance_to_cut < this->sEpsilon){

        // Odd index 2j+1 is the "on cut j" slot.
        my_current_part_weights[j * 2 + 1] += w;
        this->assigned_part_ids[i] = j * 2 + 1;

        //the point is on the cut: it is simultaneously the closest point
        //on both sides of cut j.
        my_current_left_closest[j] = coord;
        my_current_right_closest[j] = coord;
        //several cuts may share the same coordinate; if so, the weight of
        //this point must be added to every cut at that coordinate.
        //First walk right from j.
        partId_t kk = j + 1;
        while(kk < num_cuts){
          // Needed when cuts share the same position.
          distance_to_cut =ABS(temp_current_cut_coords[kk] - cut);
          if(distance_to_cut < this->sEpsilon){
            my_current_part_weights[2 * kk + 1] += w;
            my_current_left_closest[kk] = coord;
            my_current_right_closest[kk] = coord;
            kk++;
          }
          else{
            //next cut is far away on the right:
            //this coordinate may still be its closest left point.
            if(coord - my_current_left_closest[kk] > this->sEpsilon){
              my_current_left_closest[kk] = coord;
            }
            break;
          }
        }


        kk = j - 1;
        //then walk left from j for cuts sharing the same coordinate.
        while(kk >= 0){
          distance_to_cut =ABS(temp_current_cut_coords[kk] - cut);
          if(distance_to_cut < this->sEpsilon){
            my_current_part_weights[2 * kk + 1] += w;
            //record the partId as the leftmost coincident cut.
            this->assigned_part_ids[i] = kk * 2 + 1;
            my_current_left_closest[kk] = coord;
            my_current_right_closest[kk] = coord;
            kk--;
          }
          else{
            //cut kk is far away on the left:
            //this coordinate may still be its closest right point.
            if(my_current_right_closest[kk] - coord > this->sEpsilon){
              my_current_right_closest[kk] = coord;
            }
            break;
          }
        }

        is_inserted = true;
        break;
      }
      else {
        //the coordinate is strictly on the left of cut j.
        if (distance_to_cut < 0) {
          bool _break = false;
          if(j > 0){
            //if it is also strictly on the right of cut j-1, we have found
            //its part: between cuts j-1 and j.
            pq_scalar_t distance_to_next_cut = coord - temp_current_cut_coords[j - 1];
            if(distance_to_next_cut > this->sEpsilon){
              _break = true;
            }
          }
          //otherwise narrow the search to cuts left of j.
          upper_cut_index = j - 1;
          //remember the last comparison: left of cut j.
          is_on_left_of_cut = true;
          last_compared_part = j;
          if(_break) break;
        }
        else {
          //the coordinate is strictly on the right of cut j.
          bool _break = false;
          if(j < num_cuts - 1){
            //if it is also strictly on the left of cut j+1, we have found
            //its part: between cuts j and j+1.
            pq_scalar_t distance_to_next_cut = coord - temp_current_cut_coords[j + 1];
            if(distance_to_next_cut < minus_EPSILON){
                       _break = true;
                     }
          }

          //otherwise narrow the search to cuts right of j.
          lower_cut_index = j + 1;
          //remember the last comparison: right of cut j.
          is_on_right_of_cut = true;
          last_compared_part = j;
          if(_break) break;
        }
      }

      j = (upper_cut_index + lower_cut_index) / 2;
    }
    // The coordinate was not on any cut: assign it to the part adjacent to
    // the last compared cut, on the side recorded above.
    if(!is_inserted){
      if(is_on_right_of_cut){

        //part to the right of cut k is slot 2k+2.
        my_current_part_weights[2 * last_compared_part + 2] += w;
        this->assigned_part_ids[i] = 2 * last_compared_part + 2;

        //update the right closest point of the last compared cut.
        if(my_current_right_closest[last_compared_part] - coord > this->sEpsilon){
          my_current_right_closest[last_compared_part] = coord;
        }
        //update the left closest point of the cut on the right of the last compared cut.
        if(last_compared_part+1 < num_cuts){

          if(coord - my_current_left_closest[last_compared_part + 1] > this->sEpsilon){
            my_current_left_closest[last_compared_part + 1] = coord;
          }
        }

      }
      else if(is_on_left_of_cut){

        //part to the left of cut k is slot 2k.
        my_current_part_weights[2 * last_compared_part] += w;
        this->assigned_part_ids[i] = 2 * last_compared_part;


        //update the left closest point of the last compared cut.
        if(coord - my_current_left_closest[last_compared_part] > this->sEpsilon){
          my_current_left_closest[last_compared_part] = coord;
        }

        //update the right closest point of the cut on the left of the last compared cut.
        if(last_compared_part-1 >= 0){
          if(my_current_right_closest[last_compared_part -1] - coord > this->sEpsilon){
            my_current_right_closest[last_compared_part -1] = coord;
          }
        }
      }
    }
  }

  // Convert per-slot weights to a prefix sum: cut positions are decided from
  // cumulative weight to the left of each slot.
  for (size_t i = 1; i < total_part_count; ++i){
    // Cuts sharing the same position must all carry the same cumulative
    // weight; skip the accumulation for a cut whose coordinate equals the
    // previous cut's coordinate so that shared weight is not counted twice.
    if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
        ABS(temp_current_cut_coords[i / 2] - temp_current_cut_coords[i /2 - 1])
    < this->sEpsilon){
      //even i (> 0) represents cut i/2; copy the cumulative weight of the
      //coincident previous cut instead of adding again.
      my_current_part_weights[i] = my_current_part_weights[i-2];
      continue;
    }
    //otherwise do the prefix sum.
    my_current_part_weights[i] += my_current_part_weights[i-1];
  }
}
02897 
02898 
02906 template <typename pq_scalar_t, typename pq_lno_t, typename pq_gno_t>
02907 void AlgMJ<pq_scalar_t, pq_lno_t, pq_gno_t>::mj_accumulate_thread_results(
02908     const vector <partId_t> &num_partitioning_in_current_dim,
02909     partId_t current_work_part,
02910     partId_t current_concurrent_num_parts){
02911 
02912 #ifdef HAVE_ZOLTAN2_OMP
02913   //needs barrier here, as it requires all threads to finish mj_1D_part_get_thread_part_weights
02914   //using parallel region here reduces the performance because of the cache invalidates.
02915 #pragma omp barrier
02916 #pragma omp single
02917 #endif
02918   {
02919     size_t tlr_array_shift = 0;
02920     partId_t cut_shift = 0;
02921 
02922     //iterate for all concurrent parts to find the left and right closest points in the process.
02923     for(partId_t i = 0; i < current_concurrent_num_parts; ++i){
02924 
02925       partId_t num_parts_in_part =  num_partitioning_in_current_dim[current_work_part + i];
02926       partId_t num_cuts_in_part = num_parts_in_part - 1;
02927       size_t num_total_part_in_part = num_parts_in_part + size_t (num_cuts_in_part) ;
02928 
02929       //iterate for cuts in a single part.
02930       for(partId_t ii = 0; ii < num_cuts_in_part ; ++ii){
02931         partId_t next = tlr_array_shift + ii;
02932         partId_t cut_index = cut_shift + ii;
02933         if(this->is_cut_line_determined[cut_index]) continue;
02934         pq_scalar_t left_closest_in_process = this->thread_cut_left_closest_point[0][cut_index],
02935             right_closest_in_process = this->thread_cut_right_closest_point[0][cut_index];
02936 
02937         //find the closest points from left and right for the cut in the process.
02938         for (int j = 1; j < this->num_threads; ++j){
02939           if (this->thread_cut_right_closest_point[j][cut_index] < right_closest_in_process ){
02940             right_closest_in_process = this->thread_cut_right_closest_point[j][cut_index];
02941           }
02942           if (this->thread_cut_left_closest_point[j][cut_index] > left_closest_in_process ){
02943             left_closest_in_process = this->thread_cut_left_closest_point[j][cut_index];
02944           }
02945         }
02946         //store the left and right closes points.
02947         this->total_part_weight_left_right_closests[num_total_part_in_part +
02948                                                     next] = left_closest_in_process;
02949         this->total_part_weight_left_right_closests[num_total_part_in_part +
02950                                                     num_cuts_in_part + next] = right_closest_in_process;
02951       }
02952       //set the shift position in the arrays
02953       tlr_array_shift += (num_total_part_in_part + 2 * num_cuts_in_part);
02954       cut_shift += num_cuts_in_part;
02955     }
02956 
02957     tlr_array_shift = 0;
02958     cut_shift = 0;
02959     size_t total_part_array_shift = 0;
02960 
02961     //iterate for all concurrent parts to find the total weight in the process.
02962     for(partId_t i = 0; i < current_concurrent_num_parts; ++i){
02963 
02964       partId_t num_parts_in_part =  num_partitioning_in_current_dim[current_work_part + i];
02965       partId_t num_cuts_in_part = num_parts_in_part - 1;
02966       size_t num_total_part_in_part = num_parts_in_part + size_t (num_cuts_in_part) ;
02967 
02968       for(size_t j = 0; j < num_total_part_in_part; ++j){
02969 
02970         partId_t cut_ind = j / 2 + cut_shift;
02971 
02972         //need to check j !=  num_total_part_in_part - 1
02973             // which is same as j/2 != num_cuts_in_part.
02974         //we cannot check it using cut_ind, because of the concurrent part concantanetion.
02975         if(j !=  num_total_part_in_part - 1 && this->is_cut_line_determined[cut_ind]) continue;
02976         double pwj = 0;
02977         for (int k = 0; k < this->num_threads; ++k){
02978           pwj += this->thread_part_weights[k][total_part_array_shift + j];
02979         }
02980         //size_t jshift = j % total_part_count + i * (total_part_count + 2 * noCuts);
02981         this->total_part_weight_left_right_closests[tlr_array_shift + j] = pwj;
02982       }
02983       cut_shift += num_cuts_in_part;
02984       tlr_array_shift += num_total_part_in_part + 2 * num_cuts_in_part;
02985       total_part_array_shift += num_total_part_in_part;
02986     }
02987   }
02988   //the other threads needs to wait here.
02989   //but we don't need a pragma omp barrier.
02990   //as omp single has already have implicit barrier.
02991 }
02992 
02993 
03003 template <typename pq_scalar_t, typename pq_lno_t, typename pq_gno_t>
03004 void AlgMJ<pq_scalar_t, pq_lno_t, pq_gno_t>::mj_calculate_new_cut_position (
03005   pq_scalar_t cut_upper_bound,
03006     pq_scalar_t cut_lower_bound,
03007     pq_scalar_t cut_upper_weight,
03008     pq_scalar_t cut_lower_weight,
03009     pq_scalar_t expected_weight,
03010     pq_scalar_t &new_cut_position){
03011 
03012     if(ABS(cut_upper_bound - cut_lower_bound) < this->sEpsilon){
03013       new_cut_position = cut_upper_bound; //or lower bound does not matter.
03014     }
03015 
03016 
03017     if(ABS(cut_upper_weight - cut_lower_weight) < this->sEpsilon){
03018       new_cut_position = cut_lower_bound;
03019     }
03020 
03021     pq_scalar_t coordinate_range = (cut_upper_bound - cut_lower_bound);
03022     pq_scalar_t weight_range = (cut_upper_weight - cut_lower_weight);
03023     pq_scalar_t my_weight_diff = (expected_weight - cut_lower_weight);
03024 
03025     pq_scalar_t required_shift = (my_weight_diff / weight_range);
03026     int scale_constant = 20;
03027     int shiftint= int (required_shift * scale_constant);
03028     if (shiftint == 0) shiftint = 1;
03029     required_shift = pq_scalar_t (shiftint) / scale_constant;
03030     new_cut_position = coordinate_range * required_shift + cut_lower_bound;
03031 }
03032 
03033 
03044 template <typename pq_scalar_t, typename pq_lno_t, typename pq_gno_t>
03045 void AlgMJ<pq_scalar_t, pq_lno_t, pq_gno_t>::mj_create_new_partitions(
03046     partId_t num_parts,
03047     pq_scalar_t *mj_current_dim_coords,
03048     pq_scalar_t *current_concurrent_cut_coordinate,
03049     pq_lno_t coordinate_begin,
03050     pq_lno_t coordinate_end,
03051     pq_scalar_t *used_local_cut_line_weight_to_left,
03052     double **used_thread_part_weight_work,
03053     pq_lno_t *out_part_xadj){
03054 
03055   partId_t num_cuts = num_parts - 1;
03056 
03057 #ifdef HAVE_ZOLTAN2_OMP
03058 #pragma omp parallel
03059 #endif
03060   {
03061     int me = 0;
03062 #ifdef HAVE_ZOLTAN2_OMP
03063     me = omp_get_thread_num();
03064 #endif
03065 
03066     pq_lno_t *thread_num_points_in_parts = this->thread_point_counts[me];
03067     pq_scalar_t *my_local_thread_cut_weights_to_put_left = NULL;
03068 
03069     //now if the rectelinear partitioning is allowed we decide how
03070     //much weight each thread should put to left and right.
03071     if (this->distribute_points_on_cut_lines){
03072       my_local_thread_cut_weights_to_put_left = this->thread_cut_line_weight_to_put_left[me];
03073       // this for assumes the static scheduling in mj_1D_part calculation.
03074 #ifdef HAVE_ZOLTAN2_OMP
03075 #pragma omp for
03076 #endif
03077       for (partId_t i = 0; i < num_cuts; ++i){
03078         //the left to be put on the left of the cut.
03079         pq_scalar_t left_weight = used_local_cut_line_weight_to_left[i];
03080         for(int ii = 0; ii < this->num_threads; ++ii){
03081           if(left_weight > this->sEpsilon){
03082             //the weight of thread ii on cut.
03083             pq_scalar_t thread_ii_weight_on_cut = used_thread_part_weight_work[ii][i * 2 + 1] - used_thread_part_weight_work[ii][i * 2 ];
03084             if(thread_ii_weight_on_cut < left_weight){
03085               //if left weight is bigger than threads weight on cut.
03086               this->thread_cut_line_weight_to_put_left[ii][i] = thread_ii_weight_on_cut;
03087             }
03088             else {
03089               //if thread's weight is bigger than space, then put only a portion.
03090               this->thread_cut_line_weight_to_put_left[ii][i] = left_weight ;
03091             }
03092             left_weight -= thread_ii_weight_on_cut;
03093           }
03094           else {
03095             this->thread_cut_line_weight_to_put_left[ii][i] = 0;
03096           }
03097         }
03098       }
03099 
03100       if(num_cuts > 0){
03101         //this is a special case. If cutlines share the same coordinate, their weights are equal.
03102         //we need to adjust the ratio for that.
03103         for (partId_t i = num_cuts - 1; i > 0 ; --i){
03104           if(ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
03105             my_local_thread_cut_weights_to_put_left[i] -= my_local_thread_cut_weights_to_put_left[i - 1] ;
03106           }
03107           my_local_thread_cut_weights_to_put_left[i] = int ((my_local_thread_cut_weights_to_put_left[i] + LEAST_SIGNIFICANCE) * SIGNIFICANCE_MUL)
03108                                 / pq_scalar_t(SIGNIFICANCE_MUL);
03109         }
03110       }
03111     }
03112 
03113     for(partId_t ii = 0; ii < num_parts; ++ii){
03114       thread_num_points_in_parts[ii] = 0;
03115     }
03116 
03117 
03118 #ifdef HAVE_ZOLTAN2_OMP
03119     //dont change static scheduler. the static partitioner used later as well.
03120 #pragma omp for
03121 #endif
03122     for (pq_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
03123 
03124       pq_lno_t coordinate_index = this->coordinate_permutations[ii];
03125       pq_scalar_t coordinate_weight = this->mj_uniform_weights[0]? 1:this->mj_weights[0][coordinate_index];
03126       partId_t coordinate_assigned_place = this->assigned_part_ids[coordinate_index];
03127       partId_t coordinate_assigned_part = coordinate_assigned_place / 2;
03128       if(coordinate_assigned_place % 2 == 1){
03129         //if it is on the cut.
03130         if(this->distribute_points_on_cut_lines
03131             && my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] > this->sEpsilon){
03132           //if the rectelinear partitioning is allowed,
03133           //and the thread has still space to put on the left of the cut
03134           //then thread puts the vertex to left.
03135           my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] -= coordinate_weight;
03136           //if putting the vertex to left increased the weight more than expected.
03137           //and if the next cut is on the same coordinate,
03138           //then we need to adjust how much weight next cut puts to its left as well,
03139           //in order to take care of the imbalance.
03140           if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] < 0
03141               && coordinate_assigned_part < num_cuts - 1
03142               && ABS(current_concurrent_cut_coordinate[coordinate_assigned_part+1] -
03143                   current_concurrent_cut_coordinate[coordinate_assigned_part]) < this->sEpsilon){
03144             my_local_thread_cut_weights_to_put_left[coordinate_assigned_part + 1] += my_local_thread_cut_weights_to_put_left[coordinate_assigned_part];
03145           }
03146           ++thread_num_points_in_parts[coordinate_assigned_part];
03147           this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
03148         }
03149         else{
03150           //if there is no more space on the left, put the coordinate to the right of the cut.
03151           ++coordinate_assigned_part;
03152           //this while loop is necessary when a line is partitioned into more than 2 parts.
03153           while(this->distribute_points_on_cut_lines &&
03154               coordinate_assigned_part < num_cuts){
03155             //traverse all the cut lines having the same partitiong
03156             if(ABS(current_concurrent_cut_coordinate[coordinate_assigned_part] -
03157                 current_concurrent_cut_coordinate[coordinate_assigned_part - 1])
03158                 < this->sEpsilon){
03159               //if line has enough space on left, put it there.
03160               if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] >
03161               this->sEpsilon &&
03162               my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] >=
03163               ABS(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] - coordinate_weight)){
03164                 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] -= coordinate_weight;
03165                 //Again if it put too much on left of the cut,
03166                 //update how much the next cut sharing the same coordinate will put to its left.
03167                 if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] < 0 &&
03168                     coordinate_assigned_part < num_cuts - 1 &&
03169                     ABS(current_concurrent_cut_coordinate[coordinate_assigned_part+1] -
03170                         current_concurrent_cut_coordinate[coordinate_assigned_part]) < this->sEpsilon){
03171                   my_local_thread_cut_weights_to_put_left[coordinate_assigned_part + 1] += my_local_thread_cut_weights_to_put_left[coordinate_assigned_part];
03172                 }
03173                 break;
03174               }
03175             }
03176             else {
03177               break;
03178             }
03179             ++coordinate_assigned_part;
03180           }
03181           ++thread_num_points_in_parts[coordinate_assigned_part];
03182           this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
03183         }
03184       }
03185       else {
03186         //if it is already assigned to a part, then just put it to the corresponding part.
03187         ++thread_num_points_in_parts[coordinate_assigned_part];
03188         this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
03189       }
03190     }
03191 
03192 
03193 
03194     //now we calculate where each thread will write in new_coordinate_permutations array.
03195     //first we find the out_part_xadj, by marking the begin and end points of each part found.
03196     //the below loop find the number of points in each part, and writes it to out_part_xadj
03197 #ifdef HAVE_ZOLTAN2_OMP
03198 #pragma omp for
03199 #endif
03200     for(partId_t j = 0; j < num_parts; ++j){
03201       pq_lno_t num_points_in_part_j_upto_thread_i = 0;
03202       for (int i = 0; i < this->num_threads; ++i){
03203         pq_lno_t thread_num_points_in_part_j = this->thread_point_counts[i][j];
03204         //prefix sum to thread point counts, so that each will have private space to write.
03205         this->thread_point_counts[i][j] = num_points_in_part_j_upto_thread_i;
03206         num_points_in_part_j_upto_thread_i += thread_num_points_in_part_j;
03207 
03208       }
03209       out_part_xadj[j] = num_points_in_part_j_upto_thread_i;// + prev2; //+ coordinateBegin;
03210     }
03211 
03212     //now we need to do a prefix sum to out_part_xadj[j], to point begin and end of each part.
03213 #ifdef HAVE_ZOLTAN2_OMP
03214 #pragma omp single
03215 #endif
03216     {
03217       //perform prefix sum for num_points in parts.
03218       for(partId_t j = 1; j < num_parts; ++j){
03219         out_part_xadj[j] += out_part_xadj[j - 1];
03220       }
03221     }
03222 
03223     //shift the num points in threads thread to obtain the
03224     //beginning index of each thread's private space.
03225     for(partId_t j = 1; j < num_parts; ++j){
03226       thread_num_points_in_parts[j] += out_part_xadj[j - 1] ;
03227     }
03228 
03229 
03230     //now thread gets the coordinate and writes the index of coordinate to the permutation array
03231     //using the part index we calculated.
03232 #ifdef HAVE_ZOLTAN2_OMP
03233 #pragma omp for
03234 #endif
03235     for (pq_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
03236       pq_lno_t i = this->coordinate_permutations[ii];
03237       partId_t p =  this->assigned_part_ids[i];
03238       this->new_coordinate_permutations[coordinate_begin +
03239                                         thread_num_points_in_parts[p]++] = i;
03240     }
03241   }
03242 }
03243 
03244 
03245 
/*! \brief Computes the next position of every cut line for one iteration of
 * the multi-jagged cut search.
 *
 * Runs inside an OpenMP parallel region when HAVE_ZOLTAN2_OMP is defined:
 * the per-cut loops are work-shared with "omp for", the shared counters
 * (my_num_incomplete_cut, *rectelinear_cut_count) are decremented/incremented
 * under "omp atomic", and the final inter-process scan plus the weight
 * distribution bookkeeping is executed by one thread under "omp single".
 *
 * \param num_total_part  number of parts + cuts (not referenced in this body).
 * \param num_cuts        number of cut lines searched in this part.
 * \param max_coordinate  maximum coordinate in the part.
 * \param min_coordinate  minimum coordinate in the part.
 * \param global_total_weight  total weight of the part over all processes.
 * \param used_imbalance_tolerance  imbalance at which a cut is declared done.
 * \param current_global_part_weights  global weights; index 2i is the weight
 *        strictly left of cut i, index 2i+1 additionally includes the weight
 *        of coordinates lying exactly on cut i.
 * \param current_local_part_weights   same layout, this process only.
 * \param current_part_target_weights  desired cumulative weight left of each cut.
 * \param current_cut_line_determined  per-cut "search finished" flags (in/out).
 * \param current_cut_coordinates      current cut positions (input).
 * \param current_cut_upper_bounds, current_cut_lower_bounds  coordinate
 *        bracket maintained for each cut's search (in/out).
 * \param current_global_left_closest_points, current_global_right_closest_points
 *        closest coordinate to each cut from its left/right side (in/out).
 * \param current_cut_lower_bound_weights, current_cut_upper_weights  weights
 *        observed at the bracket ends (in/out).
 * \param new_current_cut_coordinates  output: next position of each cut.
 * \param current_part_cut_line_weight_to_put_left  output: for cuts resolved
 *        by splitting coincident coordinates, how much of this process's
 *        on-cut weight goes to the left part.
 * \param rectelinear_cut_count  shared count of cuts needing the coincident
 *        point-splitting treatment this iteration (reset to 0 at the end).
 * \param my_num_incomplete_cut  shared count of cuts still being searched.
 */
template <typename pq_scalar_t, typename pq_lno_t, typename pq_gno_t>
void AlgMJ<pq_scalar_t, pq_lno_t, pq_gno_t>::mj_get_new_cut_coordinates(
    const size_t &num_total_part,
    const partId_t &num_cuts,
    const pq_scalar_t &max_coordinate,
    const pq_scalar_t &min_coordinate,
    const pq_scalar_t &global_total_weight,
    const pq_scalar_t &used_imbalance_tolerance,
    pq_scalar_t * current_global_part_weights,
    const pq_scalar_t * current_local_part_weights,
    const pq_scalar_t *current_part_target_weights,
    bool *current_cut_line_determined,
    pq_scalar_t *current_cut_coordinates,
    pq_scalar_t *current_cut_upper_bounds,
    pq_scalar_t *current_cut_lower_bounds,
    pq_scalar_t *current_global_left_closest_points,
    pq_scalar_t *current_global_right_closest_points,
    pq_scalar_t * current_cut_lower_bound_weights,
    pq_scalar_t * current_cut_upper_weights,
    pq_scalar_t *new_current_cut_coordinates,
    pq_scalar_t *current_part_cut_line_weight_to_put_left,
    partId_t *rectelinear_cut_count,
    partId_t &my_num_incomplete_cut){

  //seen weight in the part (cumulative weight left of the cut under test).
  pq_scalar_t seen_weight_in_part = 0;
  //expected weight for part.
  pq_scalar_t expected_weight_in_part = 0;
  //imbalance for the left and right side of the cut.
  pq_scalar_t imbalance_on_left = 0, imbalance_on_right = 0;


#ifdef HAVE_ZOLTAN2_OMP
#pragma omp for
#endif
  for (partId_t i = 0; i < num_cuts; i++){
    //if left and right closest points are not set yet,
    //set it to the cut itself.
    if(min_coordinate - current_global_left_closest_points[i] > this->sEpsilon)
      current_global_left_closest_points[i] = current_cut_coordinates[i];
    if(current_global_right_closest_points[i] - max_coordinate > this->sEpsilon)
      current_global_right_closest_points[i] = current_cut_coordinates[i];

  }
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp for
#endif
  for (partId_t i = 0; i < num_cuts; i++){

    if(this->distribute_points_on_cut_lines){
      //init the weight on the cut.
      this->global_rectelinear_cut_weight[i] = 0;
      this->process_rectelinear_cut_weight[i] = 0;
    }
    //if already determined at previous iterations,
    //then just write the coordinate to new array, and proceed.
    if(current_cut_line_determined[i]) {
      new_current_cut_coordinates[i] = current_cut_coordinates[i];
      continue;
    }

    //current weight of the part at the left of the cut line.
    seen_weight_in_part = current_global_part_weights[i * 2];

    /*
    cout << "seen_weight_in_part:" << i << " is "<< seen_weight_in_part << endl;
    cout << "\tcut:" << current_cut_coordinates[i]
           << " current_cut_lower_bounds:" << current_cut_lower_bounds[i]
               << " current_cut_upper_bounds:" << current_cut_upper_bounds[i] << endl;
               */
    //expected ratio
    expected_weight_in_part = current_part_target_weights[i];
    //leftImbalance = imbalanceOf(seenW, globalTotalWeight, expected);
    imbalance_on_left = imbalanceOf2(seen_weight_in_part, expected_weight_in_part);
    //rightImbalance = imbalanceOf(globalTotalWeight - seenW, globalTotalWeight, 1 - expected);
    imbalance_on_right = imbalanceOf2(global_total_weight - seen_weight_in_part, global_total_weight - expected_weight_in_part);

    //a side is balanced when its |imbalance| is within the tolerance
    //(sEpsilon absorbs floating point noise in the comparison).
    bool is_left_imbalance_valid = ABS(imbalance_on_left) - used_imbalance_tolerance < this->sEpsilon ;
    bool is_right_imbalance_valid = ABS(imbalance_on_right) - used_imbalance_tolerance < this->sEpsilon;

    //if the cut line reaches to desired imbalance.
    if(is_left_imbalance_valid && is_right_imbalance_valid){
      current_cut_line_determined[i] = true;
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp atomic
#endif
      my_num_incomplete_cut -= 1;
      new_current_cut_coordinates [i] = current_cut_coordinates[i];
      continue;
    }
    else if(imbalance_on_left < 0){
      //if left imbalance < 0 then we need to move the cut to right.

      if(this->distribute_points_on_cut_lines){
        //if it is okay to distribute the coordinate on
        //the same coordinate to left and right.
        //then check if we can reach to the target weight by including the
        //coordinates in the part.
        if (current_global_part_weights[i * 2 + 1] == expected_weight_in_part){
          //if it is we are done.
          current_cut_line_determined[i] = true;
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp atomic
#endif
          my_num_incomplete_cut -= 1;

          //then assign everything on the cut to the left of the cut.
          new_current_cut_coordinates [i] = current_cut_coordinates[i];

          //for this cut all the weight on cut will be put to left.

          current_part_cut_line_weight_to_put_left[i] = current_local_part_weights[i * 2 + 1] - current_local_part_weights[i * 2];
          continue;
        }
        else if (current_global_part_weights[i * 2 + 1] > expected_weight_in_part){

          //if the weight is larger than the expected weight,
          //then we need to distribute some points to left, some to right.
          current_cut_line_determined[i] = true;
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp atomic
#endif
          *rectelinear_cut_count += 1;
          //increase the num cuts to be determined with rectelinear partitioning.

#ifdef HAVE_ZOLTAN2_OMP
#pragma omp atomic
#endif
          my_num_incomplete_cut -= 1;
          new_current_cut_coordinates [i] = current_cut_coordinates[i];
          //remember this process's on-cut weight; the "omp single" section
          //below decides how much of it actually goes left.
          this->process_rectelinear_cut_weight[i] = current_local_part_weights[i * 2 + 1] -
              current_local_part_weights[i * 2];
          continue;
        }
      }
      //we need to move further right,so set lower bound to current line, and shift it to the closes point from right.
      current_cut_lower_bounds[i] = current_global_right_closest_points[i];
      //set the lower bound weight to the weight we have seen.
      current_cut_lower_bound_weights[i] = seen_weight_in_part;

      //compare the upper bound with what has been found in the last iteration.
      //we try to make more strict bounds for the cut here.
      for (partId_t ii = i + 1; ii < num_cuts ; ++ii){
        pq_scalar_t p_weight = current_global_part_weights[ii * 2];
        pq_scalar_t line_weight = current_global_part_weights[ii * 2 + 1];

        if(p_weight >= expected_weight_in_part){
          //if a cut on the right has the expected weight, then we found
          //our cut position. Set up and low coordiantes to this new cut coordinate.
          //but we need one more iteration to finalize the cut position,
          //as wee need to update the part ids.
          if(p_weight == expected_weight_in_part){
            current_cut_upper_bounds[i] = current_cut_coordinates[ii];
            current_cut_upper_weights[i] = p_weight;
            current_cut_lower_bounds[i] = current_cut_coordinates[ii];
            current_cut_lower_bound_weights[i] = p_weight;
          } else if (p_weight < current_cut_upper_weights[i]){
            //if a part weight is larger then my expected weight,
            //but lower than my upper bound weight, update upper bound.
            current_cut_upper_bounds[i] = current_global_left_closest_points[ii];
            current_cut_upper_weights[i] = p_weight;
          }
          break;
        }
        //if comes here then pw < ew
        //then compare the weight against line weight.
        if(line_weight >= expected_weight_in_part){
          //if the line is larger than the expected weight,
          //then we need to reach to the balance by distributing coordinates on this line.
          current_cut_upper_bounds[i] = current_cut_coordinates[ii];
          current_cut_upper_weights[i] = line_weight;
          current_cut_lower_bounds[i] = current_cut_coordinates[ii];
          current_cut_lower_bound_weights[i] = p_weight;
          break;
        }
        //if a stricter lower bound is found,
        //update the lower bound.
        if (p_weight <= expected_weight_in_part && p_weight >= current_cut_lower_bound_weights[i]){
          current_cut_lower_bounds[i] = current_global_right_closest_points[ii] ;
          current_cut_lower_bound_weights[i] = p_weight;
        }
      }


      //interpolate a new cut position inside the [lower, upper] bracket.
      pq_scalar_t new_cut_position = 0;
      this->mj_calculate_new_cut_position(
          current_cut_upper_bounds[i],
          current_cut_lower_bounds[i],
          current_cut_upper_weights[i],
          current_cut_lower_bound_weights[i],
          expected_weight_in_part, new_cut_position);

      //if cut line does not move significantly.
      //then finalize the search.
      if (ABS(current_cut_coordinates[i] - new_cut_position) < this->sEpsilon
        /*|| current_cut_lower_bounds[i] - current_cut_upper_bounds[i] > this->sEpsilon*/
        ){
        current_cut_line_determined[i] = true;
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp atomic
#endif
        my_num_incomplete_cut -= 1;

        //set the cut coordinate and proceed.
        new_current_cut_coordinates [i] = current_cut_coordinates[i];
      } else {
        new_current_cut_coordinates [i] = new_cut_position;
      }
    } else {

      //need to move the cut line to left.
      //set upper bound to current line.
      current_cut_upper_bounds[i] = current_global_left_closest_points[i];
      current_cut_upper_weights[i] = seen_weight_in_part;

      // compare the current cut line weights with previous upper and lower bounds.
      for (int ii = i - 1; ii >= 0; --ii){
        pq_scalar_t p_weight = current_global_part_weights[ii * 2];
        pq_scalar_t line_weight = current_global_part_weights[ii * 2 + 1];
        if(p_weight <= expected_weight_in_part){
          if(p_weight == expected_weight_in_part){
            //if the weight of the part is my expected weight
            //then we find the solution.
            current_cut_upper_bounds[i] = current_cut_coordinates[ii];
            current_cut_upper_weights[i] = p_weight;
            current_cut_lower_bounds[i] = current_cut_coordinates[ii];
            current_cut_lower_bound_weights[i] = p_weight;
          }
          else if (p_weight > current_cut_lower_bound_weights[i]){
            //if found weight is bigger than the lower bound
            //then update the lower bound.
            current_cut_lower_bounds[i] = current_global_right_closest_points[ii];
            current_cut_lower_bound_weights[i] = p_weight;

            //at the same time, if weight of line is bigger than the
            //expected weight, then update the upper bound as well.
            //in this case the balance will be obtained by distributing weightss
            //on this cut position.
            if(line_weight > expected_weight_in_part){
              current_cut_upper_bounds[i] = current_global_right_closest_points[ii];
              current_cut_upper_weights[i] = line_weight;
            }
          }
          break;
        }
        //if the weight of the cut on the left is still bigger than my weight,
        //and also if the weight is smaller than the current upper weight,
        //or if the weight is equal to current upper weight, but on the left of
        // the upper weight, then update upper bound.
        if (p_weight >= expected_weight_in_part &&
            (p_weight < current_cut_upper_weights[i] ||
                (p_weight == current_cut_upper_weights[i] &&
                    current_cut_upper_bounds[i] > current_global_left_closest_points[ii]
                )
            )
          ){
          current_cut_upper_bounds[i] = current_global_left_closest_points[ii] ;
          current_cut_upper_weights[i] = p_weight;
        }
      }
      //interpolate a new cut position inside the [lower, upper] bracket.
      pq_scalar_t new_cut_position = 0;
      this->mj_calculate_new_cut_position(
          current_cut_upper_bounds[i],
          current_cut_lower_bounds[i],
          current_cut_upper_weights[i],
          current_cut_lower_bound_weights[i],
          expected_weight_in_part,
          new_cut_position);

      //if cut line does not move significantly.
      if (ABS(current_cut_coordinates[i] - new_cut_position) < this->sEpsilon
          /*|| current_cut_lower_bounds[i] - current_cut_upper_bounds[i] > this->sEpsilon*/ ){
        current_cut_line_determined[i] = true;
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp atomic
#endif
        my_num_incomplete_cut -= 1;
        //set the cut coordinate and proceed.
        new_current_cut_coordinates [ i] = current_cut_coordinates[i];
      } else {
        new_current_cut_coordinates [ i] = new_cut_position;
      }
    }
  }

  //communication to determine the ratios of processors for the distribution
  //of coordinates on the cut lines.
#ifdef HAVE_ZOLTAN2_OMP
  //no need barrier here as it is implicit.
#pragma omp single
#endif
  {
    if(*rectelinear_cut_count > 0){

      //prefix-sum (scan) of per-process on-cut weights so that each process
      //knows the total on-cut weight of all ranks up to and including itself.
      try{
        Teuchos::scan<int,pq_scalar_t>(
            *comm, Teuchos::REDUCE_SUM,
            num_cuts,
            this->process_rectelinear_cut_weight,
            this->global_rectelinear_cut_weight
        );
      }
      Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))

      for (partId_t i = 0; i < num_cuts; ++i){
        //if cut line weight to be distributed.
        if(this->global_rectelinear_cut_weight[i] > 0) {
          //expected weight to go to left of the cut.
          pq_scalar_t expected_part_weight = current_part_target_weights[i];
          //the weight that should be put to left of the cut.
          pq_scalar_t necessary_weight_on_line_for_left = expected_part_weight - current_global_part_weights[i * 2];
          //the weight of the cut in the process
          pq_scalar_t my_weight_on_line = this->process_rectelinear_cut_weight[i];
          //the sum of the cut weights upto this process, including the weight of this process.
          pq_scalar_t weight_on_line_upto_process_inclusive = this->global_rectelinear_cut_weight[i];
          //the space on the left side of the cut after all processes before this process (including this process)
          //puts their weights on cut to left.
          pq_scalar_t space_to_put_left = necessary_weight_on_line_for_left - weight_on_line_upto_process_inclusive;
          //add my weight to this space to find out how much space is left to me.
          pq_scalar_t space_left_to_me = space_to_put_left + my_weight_on_line;

          /*
          cout << "expected_part_weight:" << expected_part_weight
              << " necessary_weight_on_line_for_left:" << necessary_weight_on_line_for_left
              << " my_weight_on_line" << my_weight_on_line
              << " weight_on_line_upto_process_inclusive:" << weight_on_line_upto_process_inclusive
              << " space_to_put_left:" << space_to_put_left
              << " space_left_to_me" << space_left_to_me << endl;
           */
          if(space_left_to_me < 0){
            //space_left_to_me is negative and i dont need to put anything to left.
            current_part_cut_line_weight_to_put_left[i] = 0;
          }
          else if(space_left_to_me >= my_weight_on_line){
            //space left to me is bigger than the weight of the processor on cut.
            //so put everything to left.
            current_part_cut_line_weight_to_put_left[i] = my_weight_on_line;
            //cout << "setting current_part_cut_line_weight_to_put_left to my_weight_on_line:" << my_weight_on_line << endl;
          }
          else {
            //put only the weight as much as the space.
            current_part_cut_line_weight_to_put_left[i] = space_left_to_me ;

            //cout << "setting current_part_cut_line_weight_to_put_left to space_left_to_me:" << space_left_to_me << endl;
          }

        }
      }
      //reset the counter for the next round of the cut search.
      *rectelinear_cut_count = 0;
    }
  }
}
03626 
03636 template <typename pq_scalar_t, typename pq_lno_t, typename pq_gno_t>
03637 void AlgMJ<pq_scalar_t, pq_lno_t, pq_gno_t>::get_processor_num_points_in_parts(
03638     partId_t num_procs,
03639     partId_t num_parts,
03640     pq_gno_t *&num_points_in_all_processor_parts){
03641 
03642   //initially allocation_size is num_parts
03643   size_t allocation_size = num_parts * (num_procs + 1);
03644 
03645   //this will be output
03646   //holds how many each processor has in each part.
03647   //last portion is the sum of all processor points in each part.
03648 
03649   //allocate memory for the local num coordinates in each part.
03650   pq_gno_t *num_local_points_in_each_part_to_reduce_sum = allocMemory<pq_gno_t>(allocation_size);
03651 
03652 
03653   //this is the portion of the memory which will be used
03654   //at the summation to obtain total number of processors' points in each part.
03655   pq_gno_t *my_local_points_to_reduce_sum = num_local_points_in_each_part_to_reduce_sum + num_procs * num_parts;
03656   //this is the portion of the memory where each stores its local number.
03657   //this information is needed by other processors.
03658   pq_gno_t *my_local_point_counts_in_each_art = num_local_points_in_each_part_to_reduce_sum + this->myRank * num_parts;
03659 
03660   //initialize the array with 0's.
03661   memset(num_local_points_in_each_part_to_reduce_sum, 0, sizeof(pq_gno_t)*allocation_size);
03662 
03663   //write the number of coordinates in each part.
03664   for (partId_t i = 0; i < num_parts; ++i){
03665     pq_lno_t part_begin_index = 0;
03666     if (i > 0){
03667       part_begin_index = this->new_part_xadj[i - 1];
03668     }
03669     pq_lno_t part_end_index = this->new_part_xadj[i];
03670     my_local_points_to_reduce_sum[i] = part_end_index - part_begin_index;
03671   }
03672 
03673   //copy the local num parts to the last portion of array,
03674   //so that this portion will represent the global num points in each part after the reduction.
03675   memcpy (my_local_point_counts_in_each_art,
03676       my_local_points_to_reduce_sum,
03677       sizeof(pq_gno_t) * (num_parts) );
03678 
03679 
03680   //reduceAll operation.
03681   //the portion that belongs to a processor with index p
03682   //will start from myRank * num_parts.
03683   //the global number of points will be held at the index
03684   try{
03685     reduceAll<int, pq_gno_t>(
03686         *(this->comm),
03687         Teuchos::REDUCE_SUM,
03688         allocation_size,
03689         num_local_points_in_each_part_to_reduce_sum,
03690         num_points_in_all_processor_parts);
03691   }
03692   Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
03693   freeArray<pq_gno_t>(num_local_points_in_each_part_to_reduce_sum);
03694 }
03695 
03696 
03697 
03710 template <typename pq_scalar_t, typename pq_lno_t, typename pq_gno_t>
03711 bool AlgMJ<pq_scalar_t, pq_lno_t, pq_gno_t>::mj_check_to_migrate(
03712     size_t migration_reduce_all_population,
03713     pq_lno_t num_coords_for_last_dim_part,
03714     partId_t num_procs,
03715     partId_t num_parts,
03716     pq_gno_t *num_points_in_all_processor_parts){
03717 
03718   //if reduce all count and population in the last dim is too high
03719     if (migration_reduce_all_population > FUTURE_REDUCEALL_CUTOFF) return true;
03720     //if the work in a part per processor in the last dim is too low.
03721     if (num_coords_for_last_dim_part < MIN_WORK_LAST_DIM) return true;
03722 
03723   //if migration is to be checked and the imbalance is too high
03724     if (this->check_migrate_avoid_migration_option == 0){
03725       double global_imbalance = 0;
03726       //global shift to reach the sum of coordiante count in each part.
03727       size_t global_shift = num_procs * num_parts;
03728 
03729       for (partId_t ii = 0; ii < num_procs; ++ii){
03730         for (partId_t i = 0; i < num_parts; ++i){
03731           double ideal_num = num_points_in_all_processor_parts[global_shift + i]
03732                     / double(num_procs);
03733 
03734           global_imbalance += ABS(ideal_num -
03735               num_points_in_all_processor_parts[ii * num_parts + i]) /  (ideal_num);
03736         }
03737       }
03738       global_imbalance /= num_parts;
03739       global_imbalance /= num_procs;
03740 
03741     /*
03742       if (this->myRank == 0) {
03743         cout << "imbalance for next iteration:" << global_imbalance << endl;
03744       }
03745       */
03746 
03747       if(global_imbalance <= this->minimum_migration_imbalance){
03748         return false;
03749       }
03750       else {
03751         return true;
03752       }
03753     }
03754     else {
03755       //if migration is forced
03756         return true;
03757     }
03758 }
03759 
03760 
03770 template <typename pq_scalar_t, typename pq_lno_t, typename pq_gno_t>
03771 void AlgMJ<pq_scalar_t, pq_lno_t, pq_gno_t>::assign_send_destinations(
03772     partId_t num_parts,
03773     partId_t *part_assignment_proc_begin_indices,
03774     partId_t *processor_chains_in_parts,
03775     pq_lno_t *send_count_to_each_proc,
03776     int *coordinate_destionations){
03777 
03778     for (partId_t p = 0; p < num_parts; ++p){
03779         pq_lno_t part_begin = 0;
03780         if (p > 0) part_begin = this->new_part_xadj[p - 1];
03781         pq_lno_t part_end = this->new_part_xadj[p];
03782 
03783         //get the first part that current processor will send its part-p.
03784         partId_t proc_to_sent = part_assignment_proc_begin_indices[p];
03785         //initialize how many point I sent to this processor.
03786         pq_lno_t num_total_send = 0;
03787         for (pq_lno_t j=part_begin; j < part_end; j++){
03788             pq_lno_t local_ind = this->new_coordinate_permutations[j];
03789             while (num_total_send >= send_count_to_each_proc[proc_to_sent]){
03790               //then get the next processor to send the points in part p.
03791                 num_total_send = 0;
03792                 //assign new processor to part_assign_begin[p]
03793                 part_assignment_proc_begin_indices[p] = processor_chains_in_parts[proc_to_sent];
03794                 //remove the previous processor
03795                 processor_chains_in_parts[proc_to_sent] = -1;
03796                 //choose the next processor as the next one to send.
03797                 proc_to_sent = part_assignment_proc_begin_indices[p];
03798             }
03799             //write the gno index to corresponding position in sendBuf.
03800             coordinate_destionations[local_ind] = proc_to_sent;
03801             ++num_total_send;
03802         }
03803     }
03804 }
03805 
03820 template <typename pq_scalar_t, typename pq_lno_t, typename pq_gno_t>
03821 void AlgMJ<pq_scalar_t, pq_lno_t, pq_gno_t>::mj_assign_proc_to_parts(
03822     pq_gno_t * num_points_in_all_processor_parts,
03823     partId_t num_parts,
03824     partId_t num_procs,
03825     pq_lno_t *send_count_to_each_proc,
03826     vector<partId_t> &processor_ranks_for_subcomm,
03827     vector<partId_t> *next_future_num_parts_in_parts,
03828     partId_t &out_part_index,
03829     partId_t &output_part_numbering_begin_index,
03830     int *coordinate_destionations){
03831 
03832 
03833     pq_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * num_parts;
03834     partId_t *num_procs_assigned_to_each_part = allocMemory<partId_t>(num_parts);
03835 
03836     //boolean variable if the process finds its part to be assigned.
03837     bool did_i_find_my_group = false;
03838 
03839     partId_t num_free_procs = num_procs;
03840     partId_t minimum_num_procs_required_for_rest_of_parts = num_parts - 1;
03841 
03842     double max_imbalance_difference = 0;
03843     partId_t max_differing_part = 0;
03844 
03845     //find how many processor each part requires.
03846     for (partId_t i=0; i < num_parts; i++){
03847 
03848         //scalar portion of the required processors
03849         double scalar_required_proc = num_procs *
03850                 (double (global_num_points_in_parts[i]) / double (this->num_global_coords));
03851 
03852         //round it to closest integer.
03853         partId_t required_proc = static_cast<partId_t> (0.5 + scalar_required_proc);
03854 
03855         //if assigning the required num procs, creates problems for the rest of the parts.
03856         //then only assign {num_free_procs - (minimum_num_procs_required_for_rest_of_parts)} procs to this part.
03857         if (num_free_procs - required_proc < minimum_num_procs_required_for_rest_of_parts){
03858             required_proc = num_free_procs - (minimum_num_procs_required_for_rest_of_parts);
03859         }
03860 
03861         //reduce the free processor count
03862         num_free_procs -= required_proc;
03863         //reduce the free minimum processor count required for the rest of the part by 1.
03864         --minimum_num_procs_required_for_rest_of_parts;
03865 
03866         //part (i) is assigned to (required_proc) processors.
03867         num_procs_assigned_to_each_part[i] = required_proc;
03868 
03869         //because of the roundings some processors might be left as unassigned.
03870         //we want to assign those processors to the part with most imbalance.
03871         //find the part with the maximum imbalance here.
03872         double imbalance_wrt_ideal = (scalar_required_proc - required_proc) /  required_proc;
03873         if (imbalance_wrt_ideal > max_imbalance_difference){
03874             max_imbalance_difference = imbalance_wrt_ideal;
03875             max_differing_part = i;
03876         }
03877     }
03878 
03879     //assign extra processors to the part with maximum imbalance than the ideal.
03880     if (num_free_procs > 0){
03881         num_procs_assigned_to_each_part[max_differing_part] +=  num_free_procs;
03882     }
03883 
03884     //now find what are the best processors with least migration for each part.
03885 
03886     //part_assignment_proc_begin_indices ([i]) is the array that holds the beginning
03887     //index of a processor that processor sends its data for part - i
03888     partId_t *part_assignment_proc_begin_indices = allocMemory<partId_t>(num_parts);
03889     //the next processor send is found in processor_chains_in_parts, in linked list manner.
03890     partId_t *processor_chains_in_parts = allocMemory<partId_t>(num_procs);
03891     partId_t *processor_part_assignments = allocMemory<partId_t>(num_procs);
03892 
03893     //initialize the assignment of each processor.
03894     //this has a linked list implementation.
03895     //the beginning of processors assigned
03896     //to each part is hold at  part_assignment_proc_begin_indices[part].
03897     //then the next processor assigned to that part is located at
03898     //proc_part_assignments[part_assign_begins[part]], this is a chain
03899     //until the value of -1 is reached.
03900     for (int i = 0; i < num_procs; ++i ){
03901         processor_part_assignments[i] = -1;
03902         processor_chains_in_parts[i] = -1;
03903     }
03904     for (int i = 0; i < num_parts; ++i ){
03905         part_assignment_proc_begin_indices[i] = -1;
03906     }
03907 
03908 
03909     //Allocate memory for sorting data structure.
03910     uSortItem<partId_t, pq_lno_t> * sort_item_num_part_points_in_procs = allocMemory <uSortItem<partId_t, partId_t> > (num_procs);
03911     for(partId_t i = 0; i < num_parts; ++i){
03912         //the algorithm tries to minimize the cost of migration,
03913       //by assigning the processors with highest number of coordinates on that part.
03914       //here we might want to implement a maximum weighted bipartite matching algorithm.
03915       for(partId_t ii = 0; ii < num_procs; ++ii){
03916         sort_item_num_part_points_in_procs[ii].id = ii;
03917         //if processor is not assigned yet.
03918         //add its num points to the sort data structure.
03919         if (processor_part_assignments[ii] == -1){
03920           sort_item_num_part_points_in_procs[ii].val =
03921               num_points_in_all_processor_parts[ii * num_parts + i];
03922         }
03923         else {
03924           //if processor is already assigned, insert -nLocal - 1 so that it won't be selected again.
03925           //would be same if we simply set it to -1,
03926           //but more information with no extra cost (which is used later) is provided.
03927           sort_item_num_part_points_in_procs[ii].val = -num_points_in_all_processor_parts[ii * num_parts + i] - 1;
03928         }
03929       }
03930         //sort the processors in the part.
03931         uqsort<partId_t, partId_t>(num_procs, sort_item_num_part_points_in_procs);
03932 
03933         partId_t required_proc_count =  num_procs_assigned_to_each_part[i];
03934         pq_gno_t total_num_points_in_part = global_num_points_in_parts[i];
03935         pq_gno_t ideal_num_points_in_a_proc =
03936                 ceil (total_num_points_in_part / double (required_proc_count));
03937 
03938         //starts sending to least heaviest part.
03939         partId_t next_proc_to_send_index = num_procs - required_proc_count;
03940         partId_t next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
03941         pq_lno_t space_left_in_sent_proc =  ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
03942 
03943         //find the processors that will be assigned to this part, which are the heaviest
03944         //non assigned processors.
03945         for(partId_t ii = num_procs - 1; ii >= num_procs - required_proc_count; --ii){
03946             partId_t proc_id = sort_item_num_part_points_in_procs[ii].id;
03947             //assign processor to part - i.
03948             processor_part_assignments[proc_id] = i;
03949         }
03950 
03951         bool did_change_sign = false;
03952         //if processor has a minus count, reverse it.
03953         for(partId_t ii = 0; ii < num_procs; ++ii){
03954             if (sort_item_num_part_points_in_procs[ii].val < 0){
03955                 did_change_sign = true;
03956                 sort_item_num_part_points_in_procs[ii].val = -sort_item_num_part_points_in_procs[ii].val - 1;
03957             }
03958             else {
03959                 break;
03960             }
03961         }
03962         if(did_change_sign){
03963             //resort the processors in the part for the rest of the processors that is not assigned.
03964             uqsort<partId_t, partId_t>(num_procs - required_proc_count, sort_item_num_part_points_in_procs);
03965         }
03966 
03967         //check if this processors is one of the procs assigned to this part.
03968         //if it is, then get the group.
03969         if (!did_i_find_my_group){
03970             for(partId_t ii = num_procs - 1; ii >= num_procs - required_proc_count; --ii){
03971 
03972               partId_t proc_id_to_assign = sort_item_num_part_points_in_procs[ii].id;
03973               //add the proc to the group.
03974                 processor_ranks_for_subcomm.push_back(proc_id_to_assign);
03975 
03976                 if(proc_id_to_assign == this->myRank){
03977                   //if the assigned process is me, then I find my group.
03978                     did_i_find_my_group = true;
03979                     //set the beginning of part i to my rank.
03980                     part_assignment_proc_begin_indices[i] = this->myRank;
03981                     processor_chains_in_parts[this->myRank] = -1;
03982 
03983                     //set send count to myself to the number of points that I have in part i.
03984                     send_count_to_each_proc[this->myRank] = sort_item_num_part_points_in_procs[ii].val;
03985 
03986                     //calculate the shift required for the output_part_numbering_begin_index
03987                     for (partId_t in = 0; in < i; ++in){
03988                         output_part_numbering_begin_index += (*next_future_num_parts_in_parts)[in];
03989                     }
03990                     out_part_index = i;
03991                 }
03992             }
03993             //if these was not my group,
03994             //clear the subcomminicator processor array.
03995             if (!did_i_find_my_group){
03996                 processor_ranks_for_subcomm.clear();
03997             }
03998         }
03999 
04000         //send points of the nonassigned coordinates to the assigned coordinates.
04001         //starts from the heaviest nonassigned processor.
04002         //TODO we might want to play with this part, that allows more computational imbalance
04003         //but having better communication balance.
04004         for(partId_t ii = num_procs - required_proc_count - 1; ii >= 0; --ii){
04005             partId_t nonassigned_proc_id = sort_item_num_part_points_in_procs[ii].id;
04006             pq_lno_t num_points_to_sent = sort_item_num_part_points_in_procs[ii].val;
04007 
04008             //we set number of points to -to_sent - 1 for the assigned processors.
04009             //we reverse it here. This should not happen, as we have already reversed them above.
04010 #ifdef MJ_DEBUG
04011             if (num_points_to_sent < 0) {
04012               cout << "Migration - processor assignments - for part:" << i << "from proc:" << nonassigned_proc_id << " num_points_to_sent:" << num_points_to_sent << endl;
04013               exit(1);
04014             }
04015 #endif
04016 
04017             //now sends the points to the assigned processors.
04018             while (num_points_to_sent > 0){
04019                 //if the processor has enough space.
04020                 if (num_points_to_sent <= space_left_in_sent_proc){
04021                   //reduce the space left in the processor.
04022                   space_left_in_sent_proc -= num_points_to_sent;
04023                   //if my rank is the one that is sending the coordinates.
04024                     if (this->myRank == nonassigned_proc_id){
04025                         //set my sent count to the sent processor.
04026                         send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
04027                         //save the processor in the list (processor_chains_in_parts and part_assignment_proc_begin_indices)
04028                         //that the processor will send its point in part-i.
04029                         partId_t prev_begin = part_assignment_proc_begin_indices[i];
04030                         part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
04031                         processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
04032                     }
04033                     num_points_to_sent = 0;
04034                 }
04035                 else {
04036                     //there might be no space left in the processor.
04037                     if(space_left_in_sent_proc > 0){
04038                         num_points_to_sent -= space_left_in_sent_proc;
04039 
04040                         //send as the space left in the processor.
04041                         if (this->myRank == nonassigned_proc_id){
04042                           //send as much as the space in this case.
04043                             send_count_to_each_proc[next_proc_to_send_id] = space_left_in_sent_proc;
04044                             partId_t prev_begin = part_assignment_proc_begin_indices[i];
04045                             part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
04046                             processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
04047 
04048                         }
04049                     }
04050                     //change the sent part
04051                     ++next_proc_to_send_index;
04052 
04053 #ifdef MJ_DEBUG
04054                     if(next_part_to_send_index <  nprocs - required_proc_count ){
04055                       cout << "Migration - processor assignments - for part:"
04056                           << i
04057                           <<  " next_part_to_send :" << next_part_to_send_index
04058                           << " nprocs:" << nprocs
04059                           << " required_proc_count:" << required_proc_count
04060                           << " Error: next_part_to_send_index <  nprocs - required_proc_count" << endl;
04061                       exit(1)l
04062 
04063                     }
04064 #endif
04065                     //send the new id.
04066                     next_proc_to_send_id =  sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
04067                     //set the new space in the processor.
04068                     space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
04069                 }
04070             }
04071         }
04072     }
04073 
04074 
04075 
04076     this->assign_send_destinations(
04077         num_parts,
04078             part_assignment_proc_begin_indices,
04079             processor_chains_in_parts,
04080             send_count_to_each_proc,
04081             coordinate_destionations);
04082 
04083     freeArray<partId_t>(part_assignment_proc_begin_indices);
04084     freeArray<partId_t>(processor_chains_in_parts);
04085     freeArray<partId_t>(processor_part_assignments);
04086     freeArray<uSortItem<partId_t, partId_t> > (sort_item_num_part_points_in_procs);
04087     freeArray<partId_t > (num_procs_assigned_to_each_part);
04088 
04089 }
04090 
04091 
04104 template <typename pq_scalar_t, typename pq_lno_t, typename pq_gno_t>
04105 void AlgMJ<pq_scalar_t, pq_lno_t, pq_gno_t>::assign_send_destionations2(
04106     partId_t num_parts,
04107     uSortItem<partId_t, partId_t> * sort_item_part_to_proc_assignment, //input sorted wrt processors
04108     int *coordinate_destionations,
04109     partId_t &output_part_numbering_begin_index,
04110     vector<partId_t> *next_future_num_parts_in_parts){
04111 
04112     partId_t part_shift_amount = output_part_numbering_begin_index;
04113     partId_t previous_processor = -1;
04114     for(partId_t i = 0; i < num_parts; ++i){
04115         partId_t p = sort_item_part_to_proc_assignment[i].id;
04116         //assigned processors are sorted.
04117         pq_lno_t part_begin_index = 0;
04118         if (p > 0) part_begin_index = this->new_part_xadj[p - 1];
04119         pq_lno_t part_end_index = this->new_part_xadj[p];
04120 
04121         partId_t assigned_proc = sort_item_part_to_proc_assignment[i].val;
04122         if (this->myRank == assigned_proc && previous_processor != assigned_proc){
04123             output_part_numbering_begin_index =  part_shift_amount;
04124         }
04125         previous_processor = assigned_proc;
04126         part_shift_amount += (*next_future_num_parts_in_parts)[p];
04127 
04128         for (pq_lno_t j=part_begin_index; j < part_end_index; j++){
04129             pq_lno_t localInd = this->new_coordinate_permutations[j];
04130             coordinate_destionations[localInd] = assigned_proc;
04131         }
04132     }
04133 }
04134 
04135 
template <typename pq_scalar_t, typename pq_lno_t, typename pq_gno_t>
void AlgMJ<pq_scalar_t, pq_lno_t, pq_gno_t>::mj_assign_parts_to_procs(
    pq_gno_t * num_points_in_all_processor_parts, //input: (proc,part) point counts; last num_parts entries hold global per-part totals.
    partId_t num_parts,
    partId_t num_procs,
    pq_lno_t *send_count_to_each_proc, //output: sized nprocs, show the number of send point counts to each proc. NOTE(review): assumes caller zero-initialized this array, since it is only incremented below — confirm at call site.
    vector<partId_t> *next_future_num_parts_in_parts,//input how many more partitions the part will be partitioned into.
    partId_t &out_num_part, //output: the number of parts assigned to this processor (can be more than one, since num_parts >= num_procs here).
    vector<partId_t> &out_part_indices, //output: the part indices which the processor is assigned to.
    partId_t &output_part_numbering_begin_index, //output: how much the part number should be shifted when setting the solution
    int *coordinate_destionations){ //output: for each local coordinate, the rank it should be sent to.
    out_num_part = 0;

    //global per-part totals are stored after the num_procs * num_parts block.
    pq_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * num_parts;
    out_part_indices.clear();

    //to sort the parts that is assigned to the processors.
    //id is the part number, sort value is the assigned processor id.
    uSortItem<partId_t, partId_t> * sort_item_part_to_proc_assignment  = allocMemory <uSortItem<partId_t, partId_t> >(num_parts);
    uSortItem<partId_t, pq_gno_t> * sort_item_num_points_of_proc_in_part_i = allocMemory <uSortItem<partId_t, pq_gno_t> >(num_procs);


    //calculate the optimal number of coordinates that should be assigned to each processor.
    pq_lno_t work_each = pq_lno_t (this->num_global_coords / (double (num_procs)) + 0.5f);
    //to hold the left space as the number of coordinates to the optimal number in each proc.
    pq_lno_t *space_in_each_processor = allocMemory <pq_lno_t>(num_procs);
    //initialize left space in each.
    for (partId_t i = 0; i < num_procs; ++i){
        space_in_each_processor[i] = work_each;
    }

    //we keep track of how many parts each processor is assigned to.
    //because in some weird inputs, it might be possible that some
    //processors is not assigned to any part. Using these variables,
    //we force each processor to have at least one part.
    partId_t *num_parts_proc_assigned = allocMemory <partId_t>(num_procs);
    memset(num_parts_proc_assigned, 0, sizeof(partId_t) * num_procs);
    int empty_proc_count = num_procs;

    //to sort the parts with decreasing order of their coordiantes.
    //id are the part numbers, sort value is the number of points in each.
    uSortItem<partId_t, pq_gno_t> * sort_item_point_counts_in_parts  = allocMemory <uSortItem<partId_t, pq_gno_t> >(num_parts);

    //initially we will sort the parts according to the number of coordinates they have.
    //so that we will start assigning with the part that has the most number of coordinates.
    for (partId_t i = 0; i < num_parts; ++i){
        sort_item_point_counts_in_parts[i].id = i;
        sort_item_point_counts_in_parts[i].val = global_num_points_in_parts[i];
    }
    //sort parts with increasing order of loads.
    uqsort<partId_t, pq_gno_t>(num_parts, sort_item_point_counts_in_parts);


    //assigning parts to the processors
    //traverse the part win decreasing order of load.
    //first assign the heaviest part.
    for (partId_t j = 0; j < num_parts; ++j){
        //sorted with increasing order, traverse inverse.
        partId_t i = sort_item_point_counts_in_parts[num_parts - 1 - j].id;
        //load of the part
        pq_gno_t load = global_num_points_in_parts[i];

        //assigned processors
        partId_t assigned_proc = -1;
        //if no processor has enough space, fall back to this candidate
        //(the one with the most remaining space seen so far).
        partId_t best_proc_to_assign = 0;


        //sort processors with increasing number of points in this part.
        for (partId_t ii = 0; ii < num_procs; ++ii){
            sort_item_num_points_of_proc_in_part_i[ii].id = ii;

            //if there are still enough parts to fill empty processors, than proceed normally.
            //but if empty processor count is equal to the number of part, then
            //we force to part assignments only to empty processors.
            if (empty_proc_count < num_parts - j || num_parts_proc_assigned[ii] == 0){
              //how many points processor ii has in part i?
              sort_item_num_points_of_proc_in_part_i[ii].val =  num_points_in_all_processor_parts[ii * num_parts + i];
            }
            else {
              //exclude already-loaded processors from consideration:
              //-1 sorts below any real (non-negative) point count.
              sort_item_num_points_of_proc_in_part_i[ii].val  = -1;
            }
        }
        uqsort<partId_t, pq_gno_t>(num_procs, sort_item_num_points_of_proc_in_part_i);

        //traverse all processors with decreasing load.
        //prefer the processor that already owns the most points of this part
        //(minimizes migration volume) among those that still have space.
        for (partId_t iii = num_procs - 1; iii >= 0; --iii){
            partId_t ii = sort_item_num_points_of_proc_in_part_i[iii].id;
            pq_lno_t left_space = space_in_each_processor[ii] - load;
            //if enought space, assign to this part.
            if(left_space >= 0 ){
                assigned_proc = ii;
                break;
            }
            //if space is not enough, store the best candidate part.
            if (space_in_each_processor[best_proc_to_assign] < space_in_each_processor[ii]){
                best_proc_to_assign = ii;
            }
        }

        //if none had enough space, then assign it to best part.
        if (assigned_proc == -1){
            assigned_proc = best_proc_to_assign;
        }

        //a processor receiving its first part is no longer empty.
        if (num_parts_proc_assigned[assigned_proc]++ == 0){
          --empty_proc_count;
        }
        space_in_each_processor[assigned_proc] -= load;
        //to sort later, part-i is assigned to the proccessor - assignment.
        sort_item_part_to_proc_assignment[j].id = i; //part i
        sort_item_part_to_proc_assignment[j].val = assigned_proc; //assigned to processor - assignment.


        //if assigned processor is me, increase the number.
        if (assigned_proc == this->myRank){
            out_num_part++;//assigned_part_count;
            out_part_indices.push_back(i);
        }
        //increase the send to that processor by the number of points in that part.
        //as everyone send their coordiantes in this part to the processor assigned to this part.
        send_count_to_each_proc[assigned_proc] += num_points_in_all_processor_parts[this->myRank * num_parts + i];
    }
    freeArray<partId_t>(num_parts_proc_assigned);
    freeArray< uSortItem<partId_t, pq_gno_t> > (sort_item_num_points_of_proc_in_part_i);
    freeArray<uSortItem<partId_t, pq_gno_t> >(sort_item_point_counts_in_parts);
    freeArray<pq_lno_t >(space_in_each_processor);


    //sort assignments with respect to the assigned processors.
    uqsort<partId_t, partId_t>(num_parts, sort_item_part_to_proc_assignment);
    //fill sendBuf.


    this->assign_send_destionations2(
            num_parts,
            sort_item_part_to_proc_assignment,
            coordinate_destionations,
            output_part_numbering_begin_index,
            next_future_num_parts_in_parts);

    freeArray<uSortItem<partId_t, partId_t> >(sort_item_part_to_proc_assignment);
}
04295 
04296 
04314 template <typename pq_scalar_t, typename pq_lno_t, typename pq_gno_t>
04315 void AlgMJ<pq_scalar_t, pq_lno_t, pq_gno_t>::mj_migration_part_proc_assignment(
04316     pq_gno_t * num_points_in_all_processor_parts,
04317     partId_t num_parts,
04318     partId_t num_procs,
04319     pq_lno_t *send_count_to_each_proc,
04320     vector<partId_t> &processor_ranks_for_subcomm,
04321     vector<partId_t> *next_future_num_parts_in_parts,
04322     partId_t &out_num_part,
04323     vector<partId_t> &out_part_indices,
04324     partId_t &output_part_numbering_begin_index,
04325     int *coordinate_destionations){
04326 
04327 
04328 
04329   processor_ranks_for_subcomm.clear();
04330   // if (this->num_local_coords > 0)
04331   if (num_procs > num_parts){
04332     //if there are more processors than the number of current part
04333     //then processors share the existing parts.
04334     //at the end each processor will have a single part,
04335     //but a part will be shared by a group of processors.
04336     partId_t out_part_index = 0;
04337     this->mj_assign_proc_to_parts(
04338         num_points_in_all_processor_parts,
04339         num_parts,
04340         num_procs,
04341         send_count_to_each_proc,
04342         processor_ranks_for_subcomm,
04343         next_future_num_parts_in_parts,
04344         out_part_index,
04345         output_part_numbering_begin_index,
04346         coordinate_destionations
04347     );
04348 
04349     out_num_part = 1;
04350     out_part_indices.clear();
04351     out_part_indices.push_back(out_part_index);
04352   }
04353   else {
04354 
04355     //there are more parts than the processors.
04356     //therefore a processor will be assigned multiple parts,
04357     //the subcommunicators will only have a single processor.
04358     processor_ranks_for_subcomm.push_back(this->myRank);
04359 
04360     //since there are more parts then procs,
04361     //assign multiple parts to processors.
04362     this->mj_assign_parts_to_procs(
04363         num_points_in_all_processor_parts,
04364         num_parts,
04365         num_procs,
04366         send_count_to_each_proc,
04367         next_future_num_parts_in_parts,
04368         out_num_part,
04369         out_part_indices,
04370         output_part_numbering_begin_index,
04371         coordinate_destionations);
04372   }
04373 }
04374 
04387 template <typename pq_scalar_t, typename pq_lno_t, typename pq_gno_t>
04388 void AlgMJ<pq_scalar_t, pq_lno_t, pq_gno_t>::mj_migrate_coords(
04389     partId_t num_procs,
04390     pq_gno_t &num_new_local_points,
04391     string iteration,
04392     int *coordinate_destionations,
04393     partId_t num_parts){
04394 #ifdef ENABLE_ZOLTAN_MIGRATION
04395   ZOLTAN_COMM_OBJ *plan = NULL;
04396   MPI_Comm mpi_comm = Teuchos2MPI (this->comm);
04397   pq_lno_t num_incoming_gnos = 0;
04398   int message_tag = 7859;
04399 
04400   this->mj_env->timerStart(MACRO_TIMERS, "PQJagged - Migration Z1PlanCreating-" + iteration);
04401   int ierr = Zoltan_Comm_Create(
04402       &plan,
04403       this->num_local_coords,
04404       coordinate_destionations,
04405       mpi_comm,
04406       message_tag,
04407       &num_incoming_gnos);
04408   Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
04409   this->mj_env->timerStop(MACRO_TIMERS, "PQJagged - Migration Z1PlanCreating-" + iteration);
04410 
04411   this->mj_env->timerStart(MACRO_TIMERS, "PQJagged - Migration Z1Migration-" + iteration);
04412   pq_gno_t *incoming_gnos = allocMemory< pq_gno_t>(num_incoming_gnos);
04413 
04414   //migrate gnos.
04415   message_tag++;
04416   ierr = Zoltan_Comm_Do(
04417       plan,
04418       message_tag,
04419       (char *) this->current_mj_gnos,
04420       sizeof(pq_gno_t),
04421       (char *) incoming_gnos);
04422   Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
04423 
04424   freeArray<pq_gno_t>(this->current_mj_gnos);
04425   this->current_mj_gnos = incoming_gnos;
04426 
04427 
04428   //migrate coordinates
04429   for (int i = 0; i < this->coord_dim; ++i){
04430     message_tag++;
04431     pq_scalar_t *coord = this->mj_coordinates[i];
04432 
04433     this->mj_coordinates[i] = allocMemory<pq_scalar_t>(num_incoming_gnos);
04434     ierr = Zoltan_Comm_Do(
04435         plan,
04436         message_tag,
04437         (char *) coord,
04438         sizeof(pq_scalar_t),
04439         (char *) this->mj_coordinates[i]);
04440     Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
04441     freeArray<pq_scalar_t>(coord);
04442   }
04443 
04444   //migrate weights.
04445   for (int i = 0; i < this->num_weights_per_coord; ++i){
04446     message_tag++;
04447     pq_scalar_t *weight = this->mj_weights[i];
04448 
04449     this->mj_weights[i] = allocMemory<pq_scalar_t>(num_incoming_gnos);
04450     ierr = Zoltan_Comm_Do(
04451         plan,
04452         message_tag,
04453         (char *) weight,
04454         sizeof(pq_scalar_t),
04455         (char *) this->mj_weights[i]);
04456     Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
04457     freeArray<pq_scalar_t>(weight);
04458   }
04459 
04460 
04461   //migrate owners.
04462   int *coord_own = allocMemory<int>(num_incoming_gnos);
04463   message_tag++;
04464   ierr = Zoltan_Comm_Do(
04465       plan,
04466       message_tag,
04467       (char *) this->owner_of_coordinate,
04468       sizeof(int), (char *) coord_own);
04469   Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
04470   freeArray<int>(this->owner_of_coordinate);
04471   this->owner_of_coordinate = coord_own;
04472 
04473 
04474   //if num procs is less than num parts,
04475   //we need the part assigment arrays as well, since
04476   //there will be multiple parts in processor.
04477   partId_t *new_parts = allocMemory<int>(num_incoming_gnos);
04478   if(num_procs < num_parts){
04479     message_tag++;
04480     ierr = Zoltan_Comm_Do(
04481         plan,
04482         message_tag,
04483         (char *) this->assigned_part_ids,
04484         sizeof(partId_t),
04485         (char *) new_parts);
04486     Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
04487   }
04488   freeArray<partId_t>(this->assigned_part_ids);
04489   this->assigned_part_ids = new_parts;
04490 
04491   ierr = Zoltan_Comm_Destroy(&plan);
04492   Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
04493   num_new_local_points = num_incoming_gnos;
04494   this->mj_env->timerStop(MACRO_TIMERS, "PQJagged - Migration Z1Migration-" + iteration);
04495 #else
04496 
04497   this->mj_env->timerStart(MACRO_TIMERS, "PQJagged - Migration DistributorPlanCreating-" + iteration);
04498   Tpetra::Distributor distributor(this->comm);
04499   ArrayView<const partId_t> destinations( coordinate_destionations, this->num_local_coords);
04500   pq_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
04501   this->mj_env->timerStop(MACRO_TIMERS, "PQJagged - Migration DistributorPlanCreating-" + iteration);
04502 
04503   this->mj_env->timerStart(MACRO_TIMERS, "PQJagged - Migration DistributorMigration-" + iteration);
04504   {
04505     //migrate gnos.
04506     ArrayRCP<pq_gno_t> recieved_gnos(num_incoming_gnos);
04507     ArrayView<pq_gno_t> sent_gnos(this->current_mj_gnos, this->num_local_coords);
04508     distributor.doPostsAndWaits<pq_gno_t>(sent_gnos, 1, recieved_gnos());
04509     freeArray<pq_gno_t>(this->current_mj_gnos);
04510     this->current_mj_gnos = allocMemory<pq_gno_t>(num_incoming_gnos);
04511     memcpy(
04512         this->current_mj_gnos,
04513         recieved_gnos.getRawPtr(),
04514         num_incoming_gnos * sizeof(pq_gno_t));
04515   }
04516   //migrate coordinates
04517   for (int i = 0; i < this->coord_dim; ++i){
04518 
04519     ArrayView<pq_scalar_t> sent_coord(this->mj_coordinates[i], this->num_local_coords);
04520     ArrayRCP<pq_scalar_t> recieved_coord(num_incoming_gnos);
04521     distributor.doPostsAndWaits<pq_scalar_t>(sent_coord, 1, recieved_coord());
04522     freeArray<pq_scalar_t>(this->mj_coordinates[i]);
04523     this->mj_coordinates[i] = allocMemory<pq_scalar_t>(num_incoming_gnos);
04524     memcpy(
04525         this->mj_coordinates[i],
04526         recieved_coord.getRawPtr(),
04527         num_incoming_gnos * sizeof(pq_scalar_t));
04528   }
04529 
04530   //migrate weights.
04531   for (int i = 0; i < this->num_weights_per_coord; ++i){
04532 
04533     ArrayView<pq_scalar_t> sent_weight(this->mj_weights[i], this->num_local_coords);
04534     ArrayRCP<pq_scalar_t> recieved_weight(num_incoming_gnos);
04535     distributor.doPostsAndWaits<pq_scalar_t>(sent_weight, 1, recieved_weight());
04536     freeArray<pq_scalar_t>(this->mj_weights[i]);
04537     this->mj_weights[i] = allocMemory<pq_scalar_t>(num_incoming_gnos);
04538     memcpy(
04539         this->mj_weights[i],
04540         recieved_weight.getRawPtr(),
04541         num_incoming_gnos * sizeof(pq_scalar_t));
04542   }
04543 
04544   {
04545     //migrate the owners of the coordinates
04546     ArrayView<int> sent_owners(this->owner_of_coordinate, this->num_local_coords);
04547     ArrayRCP<int> recieved_owners(num_incoming_gnos);
04548     distributor.doPostsAndWaits<int>(sent_owners, 1, recieved_owners());
04549     freeArray<int>(this->owner_of_coordinate);
04550     this->owner_of_coordinate = allocMemory<int>(num_incoming_gnos);
04551     memcpy(
04552             this->owner_of_coordinate,
04553             recieved_owners.getRawPtr(),
04554             num_incoming_gnos * sizeof(int));
04555   }
04556 
04557   //if num procs is less than num parts,
04558   //we need the part assigment arrays as well, since
04559   //there will be multiple parts in processor.
04560   if(num_procs < num_parts){
04561     ArrayView<partId_t> sent_partids(this->assigned_part_ids, this->num_local_coords);
04562     ArrayRCP<partId_t> recieved_partids(num_incoming_gnos);
04563     distributor.doPostsAndWaits<partId_t>(sent_partids, 1, recieved_partids());
04564     freeArray<partId_t>(this->assigned_part_ids);
04565     this->assigned_part_ids = allocMemory<partId_t>(num_incoming_gnos);
04566     memcpy(
04567         this->assigned_part_ids,
04568         recieved_partids.getRawPtr(),
04569         num_incoming_gnos * sizeof(partId_t));
04570   }
04571   else {
04572     partId_t *new_parts = allocMemory<int>(num_incoming_gnos);
04573     freeArray<partId_t>(this->assigned_part_ids);
04574     this->assigned_part_ids = new_parts;
04575   }
04576   this->mj_env->timerStop(MACRO_TIMERS, "PQJagged - Migration DistributorMigration-" + iteration);
04577   num_new_local_points = num_incoming_gnos;
04578 
04579 #endif
04580 }
04581 
04588 template <typename pq_scalar_t, typename pq_lno_t, typename pq_gno_t>
04589 void AlgMJ<pq_scalar_t, pq_lno_t, pq_gno_t>::create_sub_communicator(vector<partId_t> &processor_ranks_for_subcomm){
04590     partId_t group_size = processor_ranks_for_subcomm.size();
04591     partId_t *ids = allocMemory<partId_t>(group_size);
04592     for(partId_t i = 0; i < group_size; ++i) {
04593         ids[i] = processor_ranks_for_subcomm[i];
04594     }
04595     ArrayView<const partId_t> idView(ids, group_size);
04596     this->comm = this->comm->createSubcommunicator(idView);
04597     freeArray<partId_t>(ids);
04598 }
04599 
04600 
template <typename pq_scalar_t, typename pq_lno_t, typename pq_gno_t>
void AlgMJ<pq_scalar_t, pq_lno_t, pq_gno_t>::fill_permutation_array(
    partId_t output_num_parts,
    partId_t num_parts){
  //fills this->new_coordinate_permutations and this->new_part_xadj after a
  //migration, using the (old) part ids stored in this->assigned_part_ids.
  //this is a counting sort on the part id of each local coordinate.
  //if there is single output part, then simply fill the permutation array.
    if (output_num_parts == 1){
        for(pq_lno_t i = 0; i < this->num_local_coords; ++i){
            this->new_coordinate_permutations[i] = i;
        }
        this->new_part_xadj[0] = this->num_local_coords;
    }
    else {

      //otherwise we need to count how many points are there in each part.
      //we allocate here as num_parts, because the sent partids are up to num_parts,
      //although there are output_num_parts different parts.
        pq_lno_t *num_points_in_parts = allocMemory<pq_lno_t>(num_parts);
        //part shift holds which new (compacted) part number an old part
        //number corresponds to.
        partId_t *part_shifts = allocMemory<partId_t>(num_parts);

        memset(num_points_in_parts, 0, sizeof(pq_lno_t) * num_parts);

        //histogram: count local points per (old) part id.
        for(pq_lno_t i = 0; i < this->num_local_coords; ++i){
            partId_t ii = this->assigned_part_ids[i];
            ++num_points_in_parts[ii];
        }

        //write the end points of the parts. only non-empty old parts get a
        //new compacted index p; part_shifts maps old index -> compacted index.
        partId_t p = 0;
        pq_lno_t prev_index = 0;
        for(partId_t i = 0; i < num_parts; ++i){
            if(num_points_in_parts[i] > 0)  {
                this->new_part_xadj[p] =  prev_index + num_points_in_parts[i];
                prev_index += num_points_in_parts[i];
                part_shifts[i] = p++;
            }
        }

        //for the rest of the parts write the end index as end point.
        partId_t assigned_num_parts = p - 1;
        //NOTE(review): if no part received any point (p == 0, which can only
        //happen when num_local_coords == 0), assigned_num_parts is -1 and the
        //loop below would read new_part_xadj[-1]. presumably callers never
        //reach this branch with zero local coordinates — verify.
        for (;p < num_parts; ++p){
            this->new_part_xadj[p] =  this->new_part_xadj[assigned_num_parts];
        }
        //reuse num_points_in_parts as a cursor array: it now holds the end
        //offset of each compacted part.
        for(partId_t i = 0; i < output_num_parts; ++i){
            num_points_in_parts[i] = this->new_part_xadj[i];
        }

        //write the permutation array here.
        //get the part of the coordinate i, shift it to obtain the new part number.
        //assign it to the end of the new part numbers pointer.
        //iterating backwards and pre-decrementing the cursor fills each
        //part's slice from its end toward its beginning.
        for(pq_lno_t i = this->num_local_coords - 1; i >= 0; --i){
            partId_t part = part_shifts[partId_t(this->assigned_part_ids[i])];
            this->new_coordinate_permutations[--num_points_in_parts[part]] = i;
        }

        freeArray<pq_lno_t>(num_points_in_parts);
        freeArray<partId_t>(part_shifts);
    }
}
04665 
04666 
template <typename pq_scalar_t, typename pq_lno_t, typename pq_gno_t>
bool AlgMJ<pq_scalar_t, pq_lno_t, pq_gno_t>::mj_perform_migration(
    partId_t input_num_parts, //current umb parts
    partId_t &output_num_parts, //output umb parts.
    vector<partId_t> *next_future_num_parts_in_parts,
    partId_t &output_part_begin_index,
    size_t migration_reduce_all_population,
    pq_lno_t num_coords_for_last_dim_part,
    string iteration,
    RCP < vector <coordinateModelPartBox <pq_scalar_t, partId_t> > > &input_part_boxes,
    RCP < vector <coordinateModelPartBox <pq_scalar_t, partId_t> > > &output_part_boxes){

  //decides whether migrating the coordinates is worthwhile; if not, returns
  //false leaving the data in place. otherwise redistributes coordinates to
  //their assigned processors, shrinks the communicator, rebuilds the
  //permutation arrays, and returns true.
  partId_t num_procs = this->comm->getSize();
  this->myRank = this->comm->getRank();


  //this array holds how many points each processor has in each part.
  //to access how many points processor i has on part j,
  //num_points_in_all_processor_parts[i * num_parts + j]
  //the extra (+1) row at offset num_procs * input_num_parts holds the global
  //per-part totals (aliased as global_num_points_in_parts below).
  pq_gno_t *num_points_in_all_processor_parts = allocMemory<pq_gno_t>(input_num_parts * (num_procs + 1));

  //get the number of coordinates in each part in each processor.
  this->get_processor_num_points_in_parts(
      num_procs,
      input_num_parts,
      num_points_in_all_processor_parts);


  //check if migration will be performed or not.
  if (!this->mj_check_to_migrate(
      migration_reduce_all_population,
      num_coords_for_last_dim_part,
      num_procs,
      input_num_parts,
      num_points_in_all_processor_parts)){
    freeArray<pq_gno_t>(num_points_in_all_processor_parts);
    return false;
  }


  //send_count_to_each_proc[p] = how many coordinates this process sends to p;
  //coordinate_destionations[i] = destination rank of local coordinate i.
  //both are filled by mj_migration_part_proc_assignment below.
  pq_lno_t *send_count_to_each_proc = NULL;
  int *coordinate_destionations = allocMemory<int>(this->num_local_coords);
  send_count_to_each_proc = allocMemory<pq_lno_t>(num_procs);
  for (int i = 0; i < num_procs; ++i) send_count_to_each_proc[i] = 0;

  vector<partId_t> processor_ranks_for_subcomm;
  vector<partId_t> out_part_indices;

  //determine which processors are assigned to which parts
  this->mj_migration_part_proc_assignment(
      num_points_in_all_processor_parts,
      input_num_parts,
      num_procs,
      send_count_to_each_proc,
      processor_ranks_for_subcomm,
      next_future_num_parts_in_parts,
      output_num_parts,
      out_part_indices,
      output_part_begin_index,
      coordinate_destionations);




  freeArray<pq_lno_t>(send_count_to_each_proc);
  vector <partId_t> tmpv;

  //outP = number of parts this process (group) will own after migration.
  std::sort (out_part_indices.begin(), out_part_indices.end());
  partId_t outP = out_part_indices.size();

  pq_gno_t new_global_num_points = 0;
  pq_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * input_num_parts;

  if (this->mj_keep_part_boxes){
    input_part_boxes->clear();
  }

  //now we calculate the new values for next_future_num_parts_in_parts.
  //same for the part boxes.
  //tmpv collects the future-part counts of only the parts kept by this group.
  for (partId_t i = 0; i < outP; ++i){
    partId_t ind = out_part_indices[i];
    new_global_num_points += global_num_points_in_parts[ind];
    tmpv.push_back((*next_future_num_parts_in_parts)[ind]);
    if (this->mj_keep_part_boxes){
      input_part_boxes->push_back((*output_part_boxes)[ind]);
    }
  }
  //swap the input and output part boxes.
  if (this->mj_keep_part_boxes){
    RCP < vector <coordinateModelPartBox <pq_scalar_t, partId_t> > > tmpPartBoxes = input_part_boxes;
    input_part_boxes = output_part_boxes;
    output_part_boxes = tmpPartBoxes;
  }
  //replace next_future_num_parts_in_parts with the kept subset.
  next_future_num_parts_in_parts->clear();
  for (partId_t i = 0; i < outP; ++i){
    partId_t p = tmpv[i];
    next_future_num_parts_in_parts->push_back(p);
  }

  freeArray<pq_gno_t>(num_points_in_all_processor_parts);

  pq_gno_t num_new_local_points = 0;


  //perform the actual migration operation here.
  //num_new_local_points is set to the post-migration local count.
  this->mj_migrate_coords(
      num_procs,
      num_new_local_points,
      iteration,
      coordinate_destionations,
      input_num_parts);


  freeArray<int>(coordinate_destionations);

  //resize the permutation work arrays if the local count changed.
  if(this->num_local_coords != num_new_local_points){
    freeArray<pq_lno_t>(this->new_coordinate_permutations);
    freeArray<pq_lno_t>(this->coordinate_permutations);

    this->new_coordinate_permutations = allocMemory<pq_lno_t>(num_new_local_points);
    this->coordinate_permutations = allocMemory<pq_lno_t>(num_new_local_points);
  }
  this->num_local_coords = num_new_local_points;
  this->num_global_coords = new_global_num_points;



  //create subcommunicator.
  this->create_sub_communicator(processor_ranks_for_subcomm);
  processor_ranks_for_subcomm.clear();

  //fill the new permutation arrays.
  this->fill_permutation_array(
      output_num_parts,
      input_num_parts);
  return true;
}
04826 
04827 
template <typename pq_scalar_t, typename pq_lno_t, typename pq_gno_t>
void AlgMJ<pq_scalar_t, pq_lno_t, pq_gno_t>::create_consistent_chunks(
    partId_t num_parts,
    pq_scalar_t *mj_current_dim_coords,
    pq_scalar_t *current_concurrent_cut_coordinate,
    pq_lno_t coordinate_begin,
    pq_lno_t coordinate_end,
    pq_scalar_t *used_local_cut_line_weight_to_left,
    pq_lno_t *out_part_xadj,
    int coordInd){

  //assigns the coordinates in [coordinate_begin, coordinate_end) to the
  //num_parts parts defined by the cut lines, resolving points that lie
  //exactly on a cut with a deterministic ordering (sorted by the remaining
  //dimensions and index) instead of a random split.
  //NOTE(review): mj_current_dim_coords is not referenced in this body.
  //pq_lno_t numCoordsInPart =  coordinateEnd - coordinateBegin;
  partId_t no_cuts = num_parts - 1;



  //this routine runs single-threaded; only thread 0's scratch is used.
  int me = 0;
  pq_lno_t *thread_num_points_in_parts = this->thread_point_counts[me];
  pq_scalar_t *my_local_thread_cut_weights_to_put_left = NULL;


  //now if the rectilinear partitioning is allowed we decide how
  //much weight each thread should put to left and right.
  if (this->distribute_points_on_cut_lines){

    my_local_thread_cut_weights_to_put_left = this->thread_cut_line_weight_to_put_left[me];
    for (partId_t i = 0; i < no_cuts; ++i){
      //the weight to be put on the left of the cut.
      pq_scalar_t left_weight = used_local_cut_line_weight_to_left[i];
      //cout << "i:" << i << " left_weight:" << left_weight << endl;
      //greedily let each thread contribute its on-cut weight until the
      //required left weight is exhausted.
      for(int ii = 0; ii < this->num_threads; ++ii){
        if(left_weight > this->sEpsilon){
          //the weight of thread ii on cut.
          pq_scalar_t thread_ii_weight_on_cut = this->thread_part_weight_work[ii][i * 2 + 1] - this->thread_part_weight_work[ii][i * 2 ];
          if(thread_ii_weight_on_cut < left_weight){
            this->thread_cut_line_weight_to_put_left[ii][i] = thread_ii_weight_on_cut;
          }
          else {
            this->thread_cut_line_weight_to_put_left[ii][i] = left_weight ;
          }
          left_weight -= thread_ii_weight_on_cut;
        }
        else {
          this->thread_cut_line_weight_to_put_left[ii][i] = 0;
        }
      }
    }

    if(no_cuts > 0){
      //this is a special case. If cutlines share the same coordinate, their weights are equal.
      //we need to adjust the ratio for that.
      for (partId_t i = no_cuts - 1; i > 0 ; --i){
        if(ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
          my_local_thread_cut_weights_to_put_left[i] -= my_local_thread_cut_weights_to_put_left[i - 1] ;
        }
        //round to a fixed number of significant digits so the comparison
        //below is stable across processes.
        my_local_thread_cut_weights_to_put_left[i] = int ((my_local_thread_cut_weights_to_put_left[i] + LEAST_SIGNIFICANCE) * SIGNIFICANCE_MUL)
                                        / pq_scalar_t(SIGNIFICANCE_MUL);
      }
    }
  }

  //reset the per-part point counters.
  for(partId_t ii = 0; ii < num_parts; ++ii){
    thread_num_points_in_parts[ii] = 0;
  }

  //for this specific case we dont want to distribute the points along the cut position
  //randomly, as we need a specific ordering of them. Instead,
  //we put the coordinates into a sort item, where we sort those
  //using the coordinates of points on other dimensions and the index.


  //some of the cuts might share the same position.
  //in this case, if cut i and cut j share the same position
  //cut_map[i] = cut_map[j] = sort item index.
  partId_t *cut_map = allocMemory<partId_t> (no_cuts);


  typedef uMultiSortItem<pq_lno_t, int, pq_scalar_t> multiSItem;
  typedef std::vector< multiSItem > multiSVector;
  typedef std::vector<multiSVector> multiS2Vector;

  //to keep track of the memory allocated.
  std::vector<pq_scalar_t *>allocated_memory;

  //vector for which the coordinates will be sorted.
  multiS2Vector sort_vector_points_on_cut;

  //the number of cuts that have different coordinates.
  partId_t different_cut_count = 1;
  cut_map[0] = 0;

  //now we insert 1 sort vector for all cuts on the different
  //positions. if multiple cuts are on the same position, they share sort vectors.
  multiSVector tmpMultiSVector;
  sort_vector_points_on_cut.push_back(tmpMultiSVector);

  for (partId_t i = 1; i < no_cuts ; ++i){
    //if cuts share the same cut coordinates
    //set the cutmap accordingly.
    if(ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
      cut_map[i] = cut_map[i-1];
    }
    else {
      cut_map[i] = different_cut_count++;
      multiSVector tmp2MultiSVector;
      sort_vector_points_on_cut.push_back(tmp2MultiSVector);
    }
  }


  //now the actual part assignment.
  //assigned_part_ids arrives encoded: pp / 2 is the candidate part, and an
  //odd pp marks a coordinate lying exactly on cut pp / 2.
  for (pq_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){

    pq_lno_t i = this->coordinate_permutations[ii];

    partId_t pp = this->assigned_part_ids[i];
    partId_t p = pp / 2;
    //if the coordinate is on a cut.
    if(pp % 2 == 1 ){
      pq_scalar_t *vals = allocMemory<pq_scalar_t>(this->coord_dim -1);
      allocated_memory.push_back(vals);

      //we insert the coordinates to the sort item here.
      //the current dimension (coordInd) is excluded; the remaining
      //dimensions are taken starting right after it, wrapping around.
      int val_ind = 0;
      for(int dim = coordInd + 1; dim < this->coord_dim; ++dim){
        vals[val_ind++] = this->mj_coordinates[dim][i];
      }
      for(int dim = 0; dim < coordInd; ++dim){
        vals[val_ind++] = this->mj_coordinates[dim][i];
      }
      multiSItem tempSortItem(i, this->coord_dim -1, vals);
      //insert the point to the sort vector pointed by the cut_map[p].
      partId_t cmap = cut_map[p];
      sort_vector_points_on_cut[cmap].push_back(tempSortItem);
    }
    else {
      //if it is not on the cut, simple sorting.
      ++thread_num_points_in_parts[p];
      this->assigned_part_ids[i] = p;
    }
  }

  //sort all the sort vectors.
  for (partId_t i = 0; i < different_cut_count; ++i){
    std::sort (sort_vector_points_on_cut[i].begin(), sort_vector_points_on_cut[i].end());
  }

  //we do the part assignment for the points on cuts here.
  partId_t previous_cut_map = cut_map[0];

  //this is how much previous part owns the weight of the current part.
  //when target part weight is 1.6, and the part on the left is given 2,
  //the left has an extra 0.4, while the right has missing 0.4 from the previous cut.
  //this parameter is used to balance this issues.
  //in the above example weight_stolen_from_previous_part will be 0.4.
  //if the left part target is 2.2 but it is given 2,
  //then weight_stolen_from_previous_part will be -0.2.
  pq_scalar_t weight_stolen_from_previous_part = 0;
  for (partId_t p = 0; p < no_cuts; ++p){

    partId_t mapped_cut = cut_map[p];

    //if previous cut map is done, and it does not have the same index,
    //then assign all points left on that cut to its right.
    if (previous_cut_map != mapped_cut){
      pq_lno_t sort_vector_end = (pq_lno_t)sort_vector_points_on_cut[previous_cut_map].size() - 1;
      for (; sort_vector_end >= 0; --sort_vector_end){
        multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
        pq_lno_t i = t.index;
        ++thread_num_points_in_parts[p];
        this->assigned_part_ids[i] = p;
      }
      sort_vector_points_on_cut[previous_cut_map].clear();
    }
    pq_lno_t sort_vector_end = (pq_lno_t)sort_vector_points_on_cut[mapped_cut].size() - 1;

    //consume points from the back of the sorted vector while part p still
    //has weight capacity on its left side.
    for (; sort_vector_end >= 0; --sort_vector_end){
      multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_end];
      pq_lno_t i = t.index;
      pq_scalar_t w = this->mj_uniform_weights[0]? 1:this->mj_weights[0][i];


      //part p has enough space for point i, then put it to point i.
      if( my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part> this->sEpsilon &&
        my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part - ABS(my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part - w)
          > this->sEpsilon){

        my_local_thread_cut_weights_to_put_left[p] -= w;
        sort_vector_points_on_cut[mapped_cut].pop_back();
        ++thread_num_points_in_parts[p];
        this->assigned_part_ids[i] = p;
        //if putting this weight to left overweights the left cut, then
        //increase the space for the next cut using weight_stolen_from_previous_part.
        if(p < no_cuts - 1 && my_local_thread_cut_weights_to_put_left[p] < this->sEpsilon){
          if(mapped_cut == cut_map[p + 1] ){
            //if the cut before the cut indexed at p was also at the same position
            //special case, as we handle the weight differently here.
            if (previous_cut_map != mapped_cut){
              weight_stolen_from_previous_part = my_local_thread_cut_weights_to_put_left[p];
            }
            else {
              //if the cut before the cut indexed at p was also at the same position
              //we assign extra weights cumulatively in this case.
              weight_stolen_from_previous_part += my_local_thread_cut_weights_to_put_left[p];
            }
          }
          else{
            weight_stolen_from_previous_part = -my_local_thread_cut_weights_to_put_left[p];
          }
          //end assignment for part p
          break;
        }
      } else {
        //if part p does not have enough space for this point
        //and if there is another cut sharing the same position,
        //again increase the space for the next
        if(p < no_cuts - 1 && mapped_cut == cut_map[p + 1]){
          if (previous_cut_map != mapped_cut){
            weight_stolen_from_previous_part = my_local_thread_cut_weights_to_put_left[p];
          }
          else {
            weight_stolen_from_previous_part += my_local_thread_cut_weights_to_put_left[p];
          }
        }
        else{
          weight_stolen_from_previous_part = -my_local_thread_cut_weights_to_put_left[p];
        }
        //end assignment for part p
        break;
      }
    }
    previous_cut_map = mapped_cut;
  }
  //put everything left on the last cut to the last part.
  pq_lno_t sort_vector_end = (pq_lno_t)sort_vector_points_on_cut[previous_cut_map].size() - 1;
  for (; sort_vector_end >= 0; --sort_vector_end){
    multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
    pq_lno_t i = t.index;
    ++thread_num_points_in_parts[no_cuts];
    this->assigned_part_ids[i] = no_cuts;
  }
  sort_vector_points_on_cut[previous_cut_map].clear();
  freeArray<partId_t> (cut_map);

  //free the memory allocated for vertex sort items .
  pq_lno_t vSize = (pq_lno_t) allocated_memory.size();
  for(pq_lno_t i = 0; i < vSize; ++i){
    freeArray<pq_scalar_t> (allocated_memory[i]);
  }

  //creation of part_xadj as in usual case.
  //first convert the per-thread counts to per-thread starting offsets.
  for(partId_t j = 0; j < num_parts; ++j){
    pq_lno_t num_points_in_part_j_upto_thread_i = 0;
    for (int i = 0; i < this->num_threads; ++i){
      pq_lno_t thread_num_points_in_part_j = this->thread_point_counts[i][j];
      this->thread_point_counts[i][j] = num_points_in_part_j_upto_thread_i;
      num_points_in_part_j_upto_thread_i += thread_num_points_in_part_j;

    }
    out_part_xadj[j] = num_points_in_part_j_upto_thread_i;// + prev2; //+ coordinateBegin;
  }

  //perform prefix sum for num_points in parts.
  for(partId_t j = 1; j < num_parts; ++j){
    out_part_xadj[j] += out_part_xadj[j - 1];
  }


  //shift the num points in threads thread to obtain the
  //beginning index of each thread's private space.
  for(partId_t j = 1; j < num_parts; ++j){
    thread_num_points_in_parts[j] += out_part_xadj[j - 1] ;
  }

  //now thread gets the coordinate and writes the index of coordinate to the permutation array
  //using the part index we calculated.
  //offsets are relative to coordinate_begin since out_part_xadj is local to
  //this chunk.
  for (pq_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
    pq_lno_t i = this->coordinate_permutations[ii];
    partId_t p =  this->assigned_part_ids[i];
    this->new_coordinate_permutations[coordinate_begin +
                                      thread_num_points_in_parts[p]++] = i;
  }
}
05124 
05125 
05126 
template <typename pq_scalar_t, typename pq_lno_t, typename pq_gno_t>
void AlgMJ<pq_scalar_t, pq_lno_t, pq_gno_t>::set_final_parts(
    partId_t current_num_parts,
    partId_t output_part_begin_index,
    RCP < vector <coordinateModelPartBox <pq_scalar_t, partId_t> > > output_part_boxes,
    bool is_data_ever_migrated){
    //writes the final (global) part id of every local coordinate into
    //this->assigned_part_ids, and — if the data was migrated during
    //partitioning — sends gnos and part ids back to their original owners.
    this->mj_env->timerStart(MACRO_TIMERS, "PQJagged - Part_Assignment");

#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
    for(partId_t i = 0; i < current_num_parts;++i){

      //part_xadj is a prefix-sum array: coordinates of part i occupy the
      //slice [part_xadj[i-1], part_xadj[i]) of coordinate_permutations.
      pq_lno_t begin = 0;
      pq_lno_t end = this->part_xadj[i];

      if(i > 0) begin = this->part_xadj[i -1];
      //shift local part index i into the global output part numbering.
      partId_t part_to_set_index = i + output_part_begin_index;
      if (this->mj_keep_part_boxes){
        (*output_part_boxes)[i].setpId(part_to_set_index);
      }
      for (pq_lno_t ii = begin; ii < end; ++ii){
        pq_lno_t k = this->coordinate_permutations[ii];
        this->assigned_part_ids[k] = part_to_set_index;
      }
    }

    //ArrayRCP<const pq_gno_t> gnoList;
    if(!is_data_ever_migrated){
      //data never left its original owner; nothing to send back.
      //freeArray<pq_gno_t>(this->current_mj_gnos);
        //if(this->num_local_coords > 0){
        //    gnoList = arcpFromArrayView(this->mj_gnos);
        //}
    }
    else {
#ifdef ENABLE_ZOLTAN_MIGRATION
      //if data is migrated, then send part numbers to the original owners.
      //this branch uses the Zoltan1 unstructured communication plan;
      //owner_of_coordinate[k] is the original rank of local coordinate k.
      ZOLTAN_COMM_OBJ *plan = NULL;
      MPI_Comm mpi_comm = Teuchos2MPI (this->mj_problemComm);

      pq_lno_t incoming = 0;
      int message_tag = 7856;

      this->mj_env->timerStart(MACRO_TIMERS, "PQJagged - Final Z1PlanCreating");
      int ierr = Zoltan_Comm_Create( &plan, this->num_local_coords,
          this->owner_of_coordinate, mpi_comm, message_tag,
          &incoming);
      Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
      this->mj_env->timerStop(MACRO_TIMERS, "PQJagged - Final Z1PlanCreating" );

      pq_gno_t *incoming_gnos = allocMemory< pq_gno_t>(incoming);

      //ship the global numbers back to their original owners.
      message_tag++;
      this->mj_env->timerStart(MACRO_TIMERS, "PQJagged - Final Z1PlanComm");
      ierr = Zoltan_Comm_Do( plan, message_tag, (char *) this->current_mj_gnos,
          sizeof(pq_gno_t), (char *) incoming_gnos);
      Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);

      freeArray<pq_gno_t>(this->current_mj_gnos);
      this->current_mj_gnos = incoming_gnos;

      partId_t *incoming_partIds = allocMemory< partId_t>(incoming);

      //ship the final part ids back with the same plan.
      message_tag++;
      ierr = Zoltan_Comm_Do( plan, message_tag, (char *) this->assigned_part_ids,
          sizeof(partId_t), (char *) incoming_partIds);
      Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
      freeArray<partId_t>(this->assigned_part_ids);
      this->assigned_part_ids = incoming_partIds;

      this->mj_env->timerStop(MACRO_TIMERS, "PQJagged - Final Z1PlanComm");
      ierr = Zoltan_Comm_Destroy(&plan);
      Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);

      this->num_local_coords = incoming;
      //gnoList = arcp(this->current_mj_gnos, 0, this->num_local_coords, true);
#else

      //if data is migrated, then send part numbers to the original owners.
      //Tpetra::Distributor path: the distributor is built from the original
      //owner of each local coordinate.
      this->mj_env->timerStart(MACRO_TIMERS, "PQJagged - Final DistributorPlanCreating");
      Tpetra::Distributor distributor(this->mj_problemComm);
      ArrayView<const partId_t> owners_of_coords(this->owner_of_coordinate, this->num_local_coords);
      pq_lno_t incoming = distributor.createFromSends(owners_of_coords);
      this->mj_env->timerStop(MACRO_TIMERS, "PQJagged - Final DistributorPlanCreating" );

      this->mj_env->timerStart(MACRO_TIMERS, "PQJagged - Final DistributorPlanComm");
    //migrate gnos to actual owners.
    ArrayRCP<pq_gno_t> recieved_gnos(incoming);
    ArrayView<pq_gno_t> sent_gnos(this->current_mj_gnos, this->num_local_coords);
    distributor.doPostsAndWaits<pq_gno_t>(sent_gnos, 1, recieved_gnos());
    freeArray<pq_gno_t>(this->current_mj_gnos);
    this->current_mj_gnos = allocMemory<pq_gno_t>(incoming);
    memcpy(
        this->current_mj_gnos,
        recieved_gnos.getRawPtr(),
        incoming * sizeof(pq_gno_t));

    //migrate part ids to actual owners.
    ArrayView<partId_t> sent_partids(this->assigned_part_ids, this->num_local_coords);
    ArrayRCP<partId_t> recieved_partids(incoming);
    distributor.doPostsAndWaits<partId_t>(sent_partids, 1, recieved_partids());
    freeArray<partId_t>(this->assigned_part_ids);
    this->assigned_part_ids = allocMemory<partId_t>(incoming);
    memcpy(
        this->assigned_part_ids,
        recieved_partids.getRawPtr(),
        incoming * sizeof(partId_t));

      this->num_local_coords = incoming;
      this->mj_env->timerStop(MACRO_TIMERS, "PQJagged - Final DistributorPlanComm");

#endif
    }

    this->mj_env->timerStop(MACRO_TIMERS, "PQJagged - Part_Assignment");

    this->mj_env->timerStart(MACRO_TIMERS, "PQJagged - Solution_Part_Assignment");

    //ArrayRCP<partId_t> partId;
    //partId = arcp(this->assigned_part_ids, 0, this->num_local_coords, true);

    if (this->mj_keep_part_boxes){
      this->kept_boxes = output_part_boxes;
    }

    this->mj_env->timerStop(MACRO_TIMERS, "PQJagged - Solution_Part_Assignment");
}
05263 
05266 template <typename pq_scalar_t, typename pq_lno_t, typename pq_gno_t>
05267 void AlgMJ<pq_scalar_t, pq_lno_t, pq_gno_t>::free_work_memory(){
05268   this->mj_env->timerStart(MACRO_TIMERS, "PQJagged - Problem_Free");
05269 
05270   for (int i=0; i < this->coord_dim; i++){
05271 
05272     freeArray<pq_scalar_t>(this->mj_coordinates[i]);
05273   }
05274   freeArray<pq_scalar_t *>(this->mj_coordinates);
05275 
05276   for (int i=0; i < this->num_weights_per_coord; i++){
05277     freeArray<pq_scalar_t>(this->mj_weights[i]);
05278   }
05279   freeArray<pq_scalar_t *>(this->mj_weights);
05280 
05281   freeArray<int>(this->owner_of_coordinate);
05282 
05283   for(int i = 0; i < this->num_threads; ++i){
05284     freeArray<pq_lno_t>(this->thread_point_counts[i]);
05285   }
05286 
05287   freeArray<pq_lno_t *>(this->thread_point_counts);
05288   freeArray<double *> (this->thread_part_weight_work);
05289 
05290   if(this->distribute_points_on_cut_lines){
05291     freeArray<pq_scalar_t>(this->process_cut_line_weight_to_put_left);
05292     for(int i = 0; i < this->num_threads; ++i){
05293       freeArray<pq_scalar_t>(this->thread_cut_line_weight_to_put_left[i]);
05294     }
05295     freeArray<pq_scalar_t *>(this->thread_cut_line_weight_to_put_left);
05296     freeArray<pq_scalar_t>(this->process_rectelinear_cut_weight);
05297     freeArray<pq_scalar_t>(this->global_rectelinear_cut_weight);
05298   }
05299 
05300   freeArray<partId_t>(this->my_incomplete_cut_count);
05301 
05302   freeArray<pq_scalar_t>(this->max_min_coords);
05303 
05304   freeArray<pq_lno_t>(this->new_part_xadj);
05305 
05306   freeArray<pq_lno_t>(this->coordinate_permutations);
05307 
05308   freeArray<pq_lno_t>(this->new_coordinate_permutations);
05309 
05310   freeArray<pq_scalar_t>(this->all_cut_coordinates);
05311 
05312   freeArray<pq_scalar_t> (this->process_local_min_max_coord_total_weight);
05313 
05314   freeArray<pq_scalar_t> (this->global_min_max_coord_total_weight);
05315 
05316   freeArray<pq_scalar_t>(this->cut_coordinates_work_array);
05317 
05318   freeArray<pq_scalar_t>(this->target_part_weights);
05319 
05320   freeArray<pq_scalar_t>(this->cut_upper_bound_coordinates);
05321 
05322   freeArray<pq_scalar_t>(this->cut_lower_bound_coordinates);
05323 
05324   freeArray<pq_scalar_t>(this->cut_lower_bound_weights);
05325   freeArray<pq_scalar_t>(this->cut_upper_bound_weights);
05326   freeArray<bool>(this->is_cut_line_determined);
05327   freeArray<pq_scalar_t>(this->total_part_weight_left_right_closests);
05328   freeArray<pq_scalar_t>(this->global_total_part_weight_left_right_closests);
05329 
05330   for(int i = 0; i < this->num_threads; ++i){
05331     freeArray<double>(this->thread_part_weights[i]);
05332     freeArray<pq_scalar_t>(this->thread_cut_right_closest_point[i]);
05333     freeArray<pq_scalar_t>(this->thread_cut_left_closest_point[i]);
05334   }
05335 
05336   freeArray<double *>(this->thread_part_weights);
05337   freeArray<pq_scalar_t *>(this->thread_cut_left_closest_point);
05338   freeArray<pq_scalar_t *>(this->thread_cut_right_closest_point);
05339 
05340   this->mj_env->timerStop(MACRO_TIMERS, "PQJagged - Problem_Free");
05341 }
05342 
05350 template <typename pq_scalar_t, typename pq_lno_t, typename pq_gno_t>
05351 void AlgMJ<pq_scalar_t, pq_lno_t, pq_gno_t>::set_partitioning_parameters(
05352     bool distribute_points_on_cut_lines_,
05353     int max_concurrent_part_calculation_,
05354     int check_migrate_avoid_migration_option_,
05355     pq_scalar_t minimum_migration_imbalance_){
05356   this->distribute_points_on_cut_lines = distribute_points_on_cut_lines_;
05357   this->max_concurrent_part_calculation = max_concurrent_part_calculation_;
05358   this->check_migrate_avoid_migration_option = check_migrate_avoid_migration_option_;
05359   this->minimum_migration_imbalance = minimum_migration_imbalance_;
05360 
05361 }
05362 
05363 
/*! \brief Core Multi Jagged coordinate partitioning.
 *
 *  Recursively partitions the coordinates into num_global_parts_ parts by
 *  making one pass per recursion level, each pass cutting every current
 *  part along a single coordinate axis.  Data may be migrated between
 *  processes between levels.
 *
 *  \param env library configuration and problem parameters.
 *  \param problemComm the communicator for the problem.
 *  \param imbalance_tolerance_ input imbalance tolerance.
 *  \param num_global_parts_ the targeted number of parts.
 *  \param part_no_array_ input part array specifying num part to divide
 *         along each dim (may be NULL).
 *  \param recursion_depth_ the number of steps the partitioning is solved in.
 *  \param coord_dim_ coordinate dimension.
 *  \param num_local_coords_ number of local coordinates.
 *  \param num_global_coords_ number of global coordinates.
 *  \param initial_mj_gnos_ initial global ids of the coordinates.
 *  \param mj_coordinates_ two dimensional coordinate array; index 0 selects
 *         the dimension (see mj_coordinates[coordInd] below).
 *  \param num_weights_per_coord_ number of weights per coordinate.
 *  \param mj_uniform_weights_ per-weight flags: true if weights are uniform.
 *  \param mj_weights_ two dimensional weight array.
 *  \param mj_uniform_parts_ per-criterion flags: true if target parts are uniform.
 *  \param mj_part_sizes_ target part weight sizes.
 *  \param result_assigned_part_ids_ output: part id assigned to each coordinate.
 *  \param result_mj_gnos_ output: global ids matching result_assigned_part_ids_.
 */
template <typename pq_scalar_t, typename pq_lno_t, typename pq_gno_t>
void AlgMJ<pq_scalar_t, pq_lno_t, pq_gno_t>::multi_jagged_part(

    const RCP<const Environment> &env,
      RCP<Comm<int> > &problemComm,

      double imbalance_tolerance_,
      size_t num_global_parts_,
      partId_t *part_no_array_,
      int recursion_depth_,

      int coord_dim_,
      pq_lno_t num_local_coords_,
      pq_gno_t num_global_coords_,
      const pq_gno_t *initial_mj_gnos_,
      pq_scalar_t **mj_coordinates_,

      int num_weights_per_coord_,
      bool *mj_uniform_weights_,
      pq_scalar_t **mj_weights_,
      bool *mj_uniform_parts_,
      pq_scalar_t **mj_part_sizes_,

      partId_t *&result_assigned_part_ids_,
      pq_gno_t *&result_mj_gnos_

    ){

#ifndef INCLUDE_ZOLTAN2_EXPERIMENTAL

    Z2_THROW_EXPERIMENTAL("Zoltan2 PQJagged is strictly experimental software "
            "while it is being developed and tested.")

#else

#ifdef print_debug
    // NOTE(review): 'comm' here is the member this->comm, which is not
    // assigned until this->mj_problemComm->duplicate() further below.
    // Under -Dprint_debug this likely should read problemComm — confirm.
    if(comm->getRank() == 0){
      cout << "size of gno:" << sizeof(pq_gno_t) << endl;
      cout << "size of lno:" << sizeof(pq_lno_t) << endl;
      cout << "size of pq_scalar_t:" << sizeof(pq_scalar_t) << endl;
    }
#endif

  // Cache the environment/communicator and copy all inputs into members so
  // the helper routines can reach them.
  this->mj_env = env;
    this->mj_problemComm = problemComm;
    this->myActualRank = this->myRank = this->mj_problemComm->getRank();

    this->mj_env->timerStart(MACRO_TIMERS, "PQJagged - Total");
    this->mj_env->debug(3, "In PQ Jagged");

    {
      this->imbalance_tolerance = imbalance_tolerance_;
      this->num_global_parts = num_global_parts_;
      this->part_no_array =  part_no_array_;
      this->recursion_depth = recursion_depth_;

      this->coord_dim = coord_dim_;
      this->num_local_coords = num_local_coords_;
      this->num_global_coords = num_global_coords_;
      this->mj_coordinates = mj_coordinates_; //will copy the memory to this->mj_coordinates.
      this->initial_mj_gnos = (pq_gno_t *) initial_mj_gnos_; //will copy the memory to this->current_mj_gnos[j].

      this->num_weights_per_coord = num_weights_per_coord_;
      this->mj_uniform_weights = mj_uniform_weights_;
      this->mj_weights = mj_weights_; //will copy the memory to this->mj_weights
      this->mj_uniform_parts = mj_uniform_parts_;
      this->mj_part_sizes = mj_part_sizes_;

      // Detect the OpenMP thread count (stays 1 when OpenMP is disabled).
      this->num_threads = 1;
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel

      {
        this->num_threads = omp_get_num_threads();
      }
#endif
    }
    //this->set_input_data();
    this->set_part_specifications();

    this->allocate_set_work_memory();

    //We duplicate the comm as we create subcommunicators during migration.
    //We keep the problemComm as it is, while comm changes after each migration.
    this->comm = this->mj_problemComm->duplicate();

    //initially there is a single partition
    partId_t current_num_parts = 1;
    pq_scalar_t *current_cut_coordinates =  this->all_cut_coordinates;

    this->mj_env->timerStart(MACRO_TIMERS, "PQJagged - Problem_Partitioning");

    partId_t output_part_begin_index = 0;
    partId_t future_num_parts = this->total_num_part;
    bool is_data_ever_migrated = false;

    // Work vectors swapped between recursion levels; deleted after the loop.
    vector<partId_t> *future_num_part_in_parts = new vector<partId_t> ();
    vector<partId_t> *next_future_num_parts_in_parts = new vector<partId_t> ();
    next_future_num_parts_in_parts->push_back(this->num_global_parts);

    RCP < vector <coordinateModelPartBox <pq_scalar_t, partId_t> > >
     input_part_boxes(new vector <coordinateModelPartBox <pq_scalar_t, partId_t> >
      (), true) ;
    RCP < vector <coordinateModelPartBox <pq_scalar_t, partId_t> > >
     output_part_boxes(new vector <coordinateModelPartBox <pq_scalar_t, partId_t> >
      (), true);

    if(this->mj_keep_part_boxes){
      this->init_part_boxes(output_part_boxes);
    }

    // Main loop: one partitioning step per recursion level.  Each iteration
    // splits every current part along a single coordinate axis.
    for (int i = 0; i < this->recursion_depth; ++i){
        //partitioning array. size will be as the number of current partitions and this
        //holds how many parts that each part will be in the current dimension partitioning.
        vector <partId_t> num_partitioning_in_current_dim;

        //number of parts that will be obtained at the end of this partitioning.
        //future_num_part_in_parts is as the size of current number of parts.
        //holds how many more parts each should be divided in the further
        //iterations. this will be used to calculate num_partitioning_in_current_dim,
        //as the number of parts that the part will be partitioned
        //in the current dimension partitioning.

        //next_future_num_parts_in_parts will be as the size of outnumParts,
        //and this will hold how many more parts that each output part
        //should be divided. this array will also be used to determine the weight ratios
        //of the parts.
        //swap the arrays to use iteratively..
        vector<partId_t> *tmpPartVect= future_num_part_in_parts;
        future_num_part_in_parts = next_future_num_parts_in_parts;
        next_future_num_parts_in_parts = tmpPartVect;

        //clear next_future_num_parts_in_parts array as
        //getPartitionArrays expects it to be empty.
        //it also expects num_partitioning_in_current_dim to be empty as well.
        next_future_num_parts_in_parts->clear();

        if(this->mj_keep_part_boxes){
            RCP < vector <coordinateModelPartBox <pq_scalar_t, partId_t> > >
                                                 tmpPartBoxes = input_part_boxes;
            input_part_boxes = output_part_boxes;
            output_part_boxes = tmpPartBoxes;
            output_part_boxes->clear();
        }

        //returns the total no. of output parts for this dimension partitioning.
        partId_t output_part_count_in_dimension =
            this->update_part_num_arrays(
                num_partitioning_in_current_dim,
                future_num_part_in_parts,
                next_future_num_parts_in_parts,
                future_num_parts,
                current_num_parts,
                i,
                input_part_boxes,
                output_part_boxes);

        //if the number of obtained parts equal to current number of parts,
        //skip this dimension. For example, this happens when 1 is given in the input
        //part array is given. P=4,5,1,2
        if(output_part_count_in_dimension == current_num_parts) {
          //still need to swap the input output arrays.
            tmpPartVect= future_num_part_in_parts;
            future_num_part_in_parts = next_future_num_parts_in_parts;
            next_future_num_parts_in_parts = tmpPartVect;

            if(this->mj_keep_part_boxes){
                RCP < vector <coordinateModelPartBox <pq_scalar_t, partId_t> > >
                                             tmpPartBoxes = input_part_boxes;
                input_part_boxes = output_part_boxes;
                output_part_boxes = tmpPartBoxes;
            }
            continue;
        }


        //get the coordinate axis along which the partitioning will be done.
        int coordInd = i % this->coord_dim;
        pq_scalar_t * mj_current_dim_coords = this->mj_coordinates[coordInd];

        //convert i to string to be used for debugging purposes.
        string istring = toString<int>(i);
        this->mj_env->timerStart(MACRO_TIMERS, "PQJagged - Problem_Partitioning_" + istring);

        //alloc Memory to point the indices
        //of the parts in the permutation array.
        this->new_part_xadj = allocMemory<pq_lno_t>(output_part_count_in_dimension);

        //the index where in the new_part_xadj will be written.
        partId_t output_part_index = 0;
        //whatever is written to output_part_index will be added with putput_coordinate_end_index
        //so that the points will be shifted.
        partId_t output_coordinate_end_index = 0;

        partId_t current_work_part = 0;
        partId_t current_concurrent_num_parts =
            min(current_num_parts - current_work_part, this->max_concurrent_part_calculation);

        partId_t obtained_part_index = 0;

        //run for all available parts.
        for (; current_work_part < current_num_parts;
                 current_work_part += current_concurrent_num_parts){

            current_concurrent_num_parts = min(current_num_parts - current_work_part,
                                 this->max_concurrent_part_calculation);

            partId_t actual_work_part_count = 0;
            //initialization for 1D partitioning.
            //get the min and max coordinates of each part
            //together with the part weights of each part.
            for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
                partId_t current_work_part_in_concurrent_parts = current_work_part + kk;

                //if this part wont be partitioned any further
                //dont do any work for this part.
                if (num_partitioning_in_current_dim[current_work_part_in_concurrent_parts] == 1){
                    continue;
                }
                ++actual_work_part_count;
                // part_xadj holds end indices; part kk's coordinates are the
                // permutation entries in [begin_index, end_index).
                pq_lno_t coordinate_end_index= this->part_xadj[current_work_part_in_concurrent_parts];
                pq_lno_t coordinate_begin_index = current_work_part_in_concurrent_parts==0 ? 0: this->part_xadj[current_work_part_in_concurrent_parts -1];

/*
                cout << "i:" << i << " j:" << current_work_part + kk
                    << " coordinate_begin_index:" << coordinate_begin_index
                    << " coordinate_end_index:" << coordinate_end_index
                    << " total:" << coordinate_end_index - coordinate_begin_index<< endl;
                    */
                this->mj_get_local_min_max_coord_totW(
                        coordinate_begin_index,
                        coordinate_end_index,
                        this->coordinate_permutations,
                        mj_current_dim_coords,
                            this->process_local_min_max_coord_total_weight[kk], //min_coordinate
                            this->process_local_min_max_coord_total_weight[kk + current_concurrent_num_parts], //max_coordinate
                            this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts]); //total_weight

            }

            //1D partitioning
            if (actual_work_part_count > 0){
                //obtain global Min max of the part.
                this->mj_get_global_min_max_coord_totW(
                    current_concurrent_num_parts,
                    this->process_local_min_max_coord_total_weight,
                    this->global_min_max_coord_total_weight);

                //represents the total number of cutlines
                //whose coordinate should be determined.
                partId_t total_incomplete_cut_count = 0;

                //Compute weight ratios for parts & cuts:
                //e.g., 0.25  0.25  0.5    0.5  0.75 0.75  1
                //part0  cut0  part1 cut1 part2 cut2 part3
                partId_t concurrent_part_cut_shift = 0;
                partId_t concurrent_part_part_shift = 0;
                for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
                    pq_scalar_t min_coordinate = this->global_min_max_coord_total_weight[kk];
                    pq_scalar_t max_coordinate = this->global_min_max_coord_total_weight[kk +
                                                     current_concurrent_num_parts];

                    pq_scalar_t global_total_weight = this->global_min_max_coord_total_weight[kk +
                                                        2 * current_concurrent_num_parts];

                    partId_t concurrent_current_part_index = current_work_part + kk;

                    partId_t partition_count = num_partitioning_in_current_dim[concurrent_current_part_index];

                    pq_scalar_t *usedCutCoordinate = current_cut_coordinates + concurrent_part_cut_shift;
                    pq_scalar_t *current_target_part_weights = this->target_part_weights +
                                                        concurrent_part_part_shift;
                    //shift the usedCutCoordinate array as noCuts.
                    concurrent_part_cut_shift += partition_count - 1;
                    //shift the partRatio array as noParts.
                    concurrent_part_part_shift += partition_count;


                    //calculate only if part is not empty,
                    //and part will be further partitioned.
                    if(partition_count > 1 && min_coordinate <= max_coordinate){

                        //increase num_cuts_do_be_determined by the number of cuts of the current
                        //part's cut line number.
                        total_incomplete_cut_count += partition_count - 1;
                        //set the number of cut lines that should be determined
                        //for this part.
                        this->my_incomplete_cut_count[kk] = partition_count - 1;

                        //get the target weights of the parts.
                        this->mj_get_initial_cut_coords_target_weights(
                            min_coordinate,
                            max_coordinate,
                            partition_count - 1,
                            global_total_weight,
                            usedCutCoordinate,
                            current_target_part_weights,
                            future_num_part_in_parts,
                            next_future_num_parts_in_parts,
                            concurrent_current_part_index,
                            obtained_part_index);

                        pq_lno_t coordinate_end_index= this->part_xadj[concurrent_current_part_index];
                        pq_lno_t coordinate_begin_index = concurrent_current_part_index==0 ? 0: this->part_xadj[concurrent_current_part_index -1];

                        //get the initial estimated part assignments of the
                        //coordinates.
                        this->set_initial_coordinate_parts(
                            max_coordinate,
                            min_coordinate,
                            concurrent_current_part_index,
                            coordinate_begin_index, coordinate_end_index,
                            this->coordinate_permutations,
                            mj_current_dim_coords,
                            this->assigned_part_ids,
                            partition_count);
                    }
                    else {
                        // e.g., if have fewer coordinates than parts, don't need to do next dim.
                        this->my_incomplete_cut_count[kk] = 0;
                    }
                    obtained_part_index += partition_count;
                }



                //used imbalance, it is always 0, as it is difficult to
                //estimate a range.
                pq_scalar_t used_imbalance = 0;


                // Determine cut lines for all concurrent parts parts here.
                this->mj_1D_part(
                    mj_current_dim_coords,
                    used_imbalance,
                    current_work_part,
                    current_concurrent_num_parts,
                    current_cut_coordinates,
                    total_incomplete_cut_count,
                    num_partitioning_in_current_dim);
            }

            //create new part chunks
            {
                partId_t output_array_shift = 0;
                partId_t cut_shift = 0;
                size_t tlr_shift = 0;
                size_t partweight_array_shift = 0;

                for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
                    partId_t current_concurrent_work_part = current_work_part + kk;
                    partId_t num_parts = num_partitioning_in_current_dim[current_concurrent_work_part];

                    //if the part is empty, skip the part.
                    // (an empty part is detected by min > max in the global
                    // min/max array, as written by the reduction above)
                    if((num_parts != 1  )
                        &&
                        this->global_min_max_coord_total_weight[kk] >
                             this->global_min_max_coord_total_weight[kk + current_concurrent_num_parts]) {

                      //we still need to write the begin and end point of the
                      //empty part. simply set it zero, the array indices will be shifted later.
                      for(partId_t jj = 0; jj < num_parts; ++jj){
                        this->new_part_xadj[output_part_index + output_array_shift + jj] = 0;
                      }
                        cut_shift += num_parts - 1;
                        tlr_shift += (4 *(num_parts - 1) + 1);
                        output_array_shift += num_parts;
                        partweight_array_shift += (2 * (num_parts - 1) + 1);
                        continue;
                    }

                    pq_lno_t coordinate_end= this->part_xadj[current_concurrent_work_part];
                    pq_lno_t coordinate_begin = current_concurrent_work_part==0 ? 0: this->part_xadj[
                                                                current_concurrent_work_part -1];
                    pq_scalar_t *current_concurrent_cut_coordinate = current_cut_coordinates + cut_shift;
                    pq_scalar_t *used_local_cut_line_weight_to_left = this->process_cut_line_weight_to_put_left +
                                                            cut_shift;

                    //pq_scalar_t *used_tlr_array =  this->total_part_weight_left_right_closests + tlr_shift;

                    for(int ii = 0; ii < this->num_threads; ++ii){
                        this->thread_part_weight_work[ii] = this->thread_part_weights[ii] +  partweight_array_shift;
                    }

                    if(num_parts > 1){
                        if(this->mj_keep_part_boxes){
                          //if part boxes are to be stored update the boundaries.
                            for (partId_t j = 0; j < num_parts - 1; ++j){
                                (*output_part_boxes)[output_array_shift + output_part_index +
                                 j].updateMinMax(current_concurrent_cut_coordinate[j], 1
                                  /*update max*/, coordInd);

                                (*output_part_boxes)[output_array_shift + output_part_index + j +
                                 1].updateMinMax(current_concurrent_cut_coordinate[j], 0
                                  /*update min*/, coordInd);
                            }
                        }

                        // Rewrite the indices based on the computed cuts.
                        this->mj_create_new_partitions(
                            num_parts,
                            mj_current_dim_coords,
                            current_concurrent_cut_coordinate,
                            coordinate_begin,
                            coordinate_end,
                            used_local_cut_line_weight_to_left,
                            this->thread_part_weight_work,
                            this->new_part_xadj + output_part_index + output_array_shift
                            );

                    }
                    else {
                        //if this part is partitioned into 1 then just copy
                        //the old values.
                        pq_lno_t part_size = coordinate_end - coordinate_begin;
                        *(this->new_part_xadj + output_part_index + output_array_shift) = part_size;
                        memcpy(
                          this->new_coordinate_permutations + coordinate_begin,
                            this->coordinate_permutations + coordinate_begin,
                            part_size * sizeof(pq_lno_t));
                    }
                    cut_shift += num_parts - 1;
                    tlr_shift += (4 *(num_parts - 1) + 1);
                    output_array_shift += num_parts;
                    partweight_array_shift += (2 * (num_parts - 1) + 1);
                }

                //shift cut coordinates so that all cut coordinates are stored.
                //no shift now because we dont keep the cuts.
                //current_cut_coordinates += cut_shift;

                //mj_create_new_partitions from coordinates partitioned the parts and
                //write the indices as if there were a single part.
                //now we need to shift the beginning indices.
                for(partId_t kk = 0; kk < current_concurrent_num_parts; ++kk){
                    partId_t num_parts = num_partitioning_in_current_dim[ current_work_part + kk];
                    for (partId_t ii = 0;ii < num_parts ; ++ii){
                        //shift it by previousCount
                        this->new_part_xadj[output_part_index+ii] += output_coordinate_end_index;
                    }
                    //increase the previous count by current end.
                    output_coordinate_end_index = this->new_part_xadj[output_part_index + num_parts - 1];
                    //increase the current out.
                    output_part_index += num_parts ;
                }
            }
        }
        // end of this partitioning dimension


        int current_world_size = this->comm->getSize();
        long migration_reduce_all_population = this->total_dim_num_reduce_all * current_world_size;


        bool is_migrated_in_current_dimension = false;

        //we migrate if there are more partitionings to be done after this step
        //and if the migration is not forced to be avoided.
        //and the operation is not sequential.
        if (future_num_parts > 1 &&
            this->check_migrate_avoid_migration_option >= 0 &&
            current_world_size > 1){

          this->mj_env->timerStart(MACRO_TIMERS, "PQJagged - Problem_Migration-" + istring);
          partId_t num_parts = output_part_count_in_dimension;
          if ( this->mj_perform_migration(
                  num_parts,
                  current_num_parts, //output
                  next_future_num_parts_in_parts, //output
                  output_part_begin_index,
                  migration_reduce_all_population,
                  this->num_local_coords / (future_num_parts * current_num_parts),
                  istring,
                  input_part_boxes, output_part_boxes) ) {
            is_migrated_in_current_dimension = true;
            is_data_ever_migrated = true;
            this->mj_env->timerStop(MACRO_TIMERS, "PQJagged - Problem_Migration-" +
                istring);
            //since data is migrated, we reduce the number of reduceAll operations for the last part.
            this->total_dim_num_reduce_all /= num_parts;
          }
          else {
            is_migrated_in_current_dimension = false;
            this->mj_env->timerStop(MACRO_TIMERS, "PQJagged - Problem_Migration-" + istring);
          }
        }

        //swap the coordinate permutations for the next dimension.
        pq_lno_t * tmp = this->coordinate_permutations;
        this->coordinate_permutations = this->new_coordinate_permutations;
        this->new_coordinate_permutations = tmp;

        if(!is_migrated_in_current_dimension){
            this->total_dim_num_reduce_all -= current_num_parts;
            current_num_parts = output_part_count_in_dimension;
        }
        freeArray<pq_lno_t>(this->part_xadj);
        this->part_xadj = this->new_part_xadj;

        this->mj_env->timerStop(MACRO_TIMERS, "PQJagged - Problem_Partitioning_" + istring);
    }

    // Partitioning is done
    delete future_num_part_in_parts;
    delete next_future_num_parts_in_parts;

    this->mj_env->timerStop(MACRO_TIMERS, "PQJagged - Problem_Partitioning");


    //get the final parts of each initial coordinate
    //the results will be written to
    //this->assigned_part_ids for gnos given in this->current_mj_gnos
    this->set_final_parts(
        current_num_parts,
        output_part_begin_index,
        output_part_boxes,
        is_data_ever_migrated);

  // Expose the computed assignment and the matching gnos to the caller.
  result_assigned_part_ids_ = this->assigned_part_ids;
  result_mj_gnos_ = this->current_mj_gnos;

    this->free_work_memory();
    this->mj_env->timerStop(MACRO_TIMERS, "PQJagged - Total");
    this->mj_env->debug(3, "Out of PQ Jagged");
#endif // INCLUDE_ZOLTAN2_EXPERIMENTAL

}
05920 
05921 
/*! \brief Zoltan2 interface of the Multi Jagged partitioner.
 *
 *  Extracts the inputs from the Zoltan2 coordinate model and the
 *  environment's parameter list, runs the core AlgMJ partitioner, and
 *  writes the result into the PartitioningSolution.
 */
template <typename Adapter>
class Zoltan2_AlgMJ{
private:

#ifndef DOXYGEN_SHOULD_SKIP_THIS
  typedef typename Adapter::scalar_t pq_scalar_t;
  typedef typename Adapter::gno_t pq_gno_t;
  typedef typename Adapter::lno_t pq_lno_t;
  typedef typename Adapter::node_t pq_node_t;
#endif
  AlgMJ<pq_scalar_t, pq_lno_t, pq_gno_t> mj_partitioner; //the core multi-jagged partitioner

  RCP<const Environment> mj_env; //the environment object
  RCP<const CoordinateModel<typename Adapter::base_adapter_t> > mj_coords; //coordinate adapter
  RCP<PartitioningSolution<Adapter> > mj_solution; //solution object

  //PARAMETERS
  double imbalance_tolerance; //input imbalance tolerance.
  size_t num_global_parts; //the targeted number of parts
  partId_t *part_no_array; //input part array specifying num part to divide along each dim.
  int recursion_depth; //the number of steps that partitioning will be solved in.

  int coord_dim; // coordinate dimension.
    pq_lno_t num_local_coords; //number of local coords.
    pq_gno_t num_global_coords; //number of global coords.
    const pq_gno_t *initial_mj_gnos; //initial global ids of the coordinates.
    pq_scalar_t **mj_coordinates; //two dimension coordinate array

    int num_weights_per_coord; // number of weights per coordinate
    bool *mj_uniform_weights; //if the coordinates have uniform weights.
    pq_scalar_t **mj_weights; //two dimensional weight array
    bool *mj_uniform_parts; //if the target parts are uniform
    pq_scalar_t **mj_part_sizes; //target part weight sizes.

    bool distribute_points_on_cut_lines; //if partitioning can distribute points on same coordinate to different parts.
    int max_concurrent_part_calculation; // how many parts we can calculate concurrently.
    int check_migrate_avoid_migration_option; //whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
    pq_scalar_t minimum_migration_imbalance; //when MJ decides whether to migrate, the minimum imbalance for migration.
    int mj_keep_part_boxes; //if the boxes need to be kept.

    int num_threads; //number of threads (set from OpenMP when available)

    int mj_run_as_rcb; //if this is set, then recursion depth is adjusted to its maximum value.

  //! Fills the mj_* input members from mj_coords/mj_solution (defined elsewhere).
  void set_up_partitioning_data();

  //! Reads the MJ-related entries from the given parameter list (defined elsewhere).
  void set_input_parameters(const Teuchos::ParameterList &p);

  //! Frees the work arrays allocated for the input data.
  void free_work_memory();
public:
  //! Default-initializes all parameters; real values are read later from the
  //! environment's parameter list in multi_jagged_part().
  Zoltan2_AlgMJ():mj_partitioner(),mj_env(),
      mj_coords(), mj_solution(), imbalance_tolerance(0),
      num_global_parts(1), part_no_array(NULL), recursion_depth(0),
      coord_dim(0),num_local_coords(0), num_global_coords(0),
      initial_mj_gnos(NULL), mj_coordinates(NULL), num_weights_per_coord(0),
      mj_uniform_weights(NULL), mj_weights(NULL), mj_uniform_parts(NULL),
      mj_part_sizes(NULL), distribute_points_on_cut_lines(true),
      max_concurrent_part_calculation(1), check_migrate_avoid_migration_option(0),
      minimum_migration_imbalance(0.30), mj_keep_part_boxes(0), num_threads(1), mj_run_as_rcb(0)
  {}
  ~Zoltan2_AlgMJ(){}

  /*! \brief Partitioning entry point: computes the multi-jagged partition of
   *  the coordinates in \c mj_coords and stores it in \c solution.
   *
   *  \param env library configuration and problem parameters.
   *  \param problemComm the communicator for the problem.
   *  \param mj_coords coordinate model holding the input geometry.
   *  \param solution partitioning solution to be filled.
   */
  void multi_jagged_part(
      const RCP<const Environment> &env,
      RCP<Comm<int> > &problemComm,
      const RCP<const CoordinateModel<typename Adapter::base_adapter_t> > &mj_coords,
      RCP<PartitioningSolution<Adapter> > &solution);
};
06002 
06003 
06013 template <typename Adapter>
06014 void Zoltan2_AlgMJ<Adapter>::multi_jagged_part(
06015     const RCP<const Environment> &env,
06016     RCP<Comm<int> > &problemComm,
06017     const RCP<const CoordinateModel<typename Adapter::base_adapter_t> > &coords,
06018     RCP<PartitioningSolution<Adapter> > &solution){
06019     this->mj_env = env;
06020     this->mj_coords = coords;
06021     this->mj_solution = solution;
06022   this->set_up_partitioning_data();
06023     this->set_input_parameters(this->mj_env->getParameters());
06024     if (this->mj_keep_part_boxes){
06025       this->mj_partitioner.set_to_keep_part_boxes();
06026     }
06027     this->mj_partitioner.set_partitioning_parameters(
06028         this->distribute_points_on_cut_lines,
06029         this->max_concurrent_part_calculation,
06030         this->check_migrate_avoid_migration_option,
06031         this->minimum_migration_imbalance);
06032 
06033   partId_t *result_assigned_part_ids = NULL;
06034   pq_gno_t *result_mj_gnos = NULL;
06035     this->mj_partitioner.multi_jagged_part(
06036         env,
06037         problemComm,
06038 
06039         this->imbalance_tolerance,
06040         this->num_global_parts,
06041         this->part_no_array,
06042         this->recursion_depth,
06043 
06044         this->coord_dim,
06045         this->num_local_coords,
06046         this->num_global_coords,
06047         this->initial_mj_gnos,
06048         this->mj_coordinates,
06049 
06050         this->num_weights_per_coord,
06051         this->mj_uniform_weights,
06052         this->mj_weights,
06053         this->mj_uniform_parts,
06054         this->mj_part_sizes,
06055 
06056         result_assigned_part_ids,
06057           result_mj_gnos
06058         );
06059 
06060     ArrayRCP<const pq_gno_t> gnoList = arcp(result_mj_gnos, 0, this->num_local_coords, true);
06061     ArrayRCP<partId_t> partId = arcp(result_assigned_part_ids, 0, this->num_local_coords, true);
06062     this->mj_solution->setParts(gnoList, partId, true);
06063     if (this->mj_keep_part_boxes){
06064       RCP < vector <coordinateModelPartBox <pq_scalar_t, partId_t> > > output_part_boxes = this->mj_partitioner.get_part_boxes();
06065         this->mj_solution->setPartBoxes(output_part_boxes);
06066     }
06067     this->free_work_memory();
06068 }
06069 /* \brief Freeing the memory allocated.
06070  * */
06071 template <typename Adapter>
06072 void Zoltan2_AlgMJ<Adapter>::free_work_memory(){
06073   freeArray<pq_scalar_t *>(this->mj_coordinates);
06074   freeArray<pq_scalar_t *>(this->mj_weights);
06075   freeArray<bool>(this->mj_uniform_parts);
06076   freeArray<pq_scalar_t *>(this->mj_part_sizes);
06077   freeArray<bool>(this->mj_uniform_weights);
06078 
06079 }
06080 
06081 /* \brief Sets the partitioning data for multijagged algorithm.
06082  * */
06083 template <typename Adapter>
06084 void Zoltan2_AlgMJ<Adapter>::set_up_partitioning_data(){
06085 
06086   this->coord_dim = this->mj_coords->getCoordinateDim();
06087   this->num_weights_per_coord = this->mj_coords->getNumWeightsPerCoordinate();
06088   this->num_local_coords = this->mj_coords->getLocalNumCoordinates();
06089   this->num_global_coords = this->mj_coords->getGlobalNumCoordinates();
06090   int criteria_dim = (this->num_weights_per_coord ? this->num_weights_per_coord : 1);
06091 
06092   // From the Solution we get part information.
06093   // If the part sizes for a given criteria are not uniform,
06094   // then they are values that sum to 1.0.
06095   this->num_global_parts = this->mj_solution->getTargetGlobalNumberOfParts();
06096   //allocate only two dimensional pointer.
06097   //raw pointer addresess will be obtained from multivector.
06098   this->mj_coordinates = allocMemory<pq_scalar_t *>(this->coord_dim);
06099   this->mj_weights = allocMemory<pq_scalar_t *>(criteria_dim);
06100 
06101   //if the partitioning results are to be uniform.
06102   this->mj_uniform_parts = allocMemory< bool >(criteria_dim);
06103   //if in a criteria dimension, uniform part is false this shows ratios of
06104   //the target part weights.
06105   this->mj_part_sizes =  allocMemory<pq_scalar_t *>(criteria_dim);
06106   //if the weights of coordinates are uniform in a criteria dimension.
06107   this->mj_uniform_weights = allocMemory< bool >(criteria_dim);
06108 
06109   typedef StridedData<pq_lno_t, pq_scalar_t> input_t;
06110   ArrayView<const pq_gno_t> gnos;
06111   ArrayView<input_t> xyz;
06112   ArrayView<input_t> wgts;
06113 
06114   this->mj_coords->getCoordinates(gnos, xyz, wgts);
06115   //obtain global ids.
06116   ArrayView<const pq_gno_t> mj_gnos = gnos;
06117   this->initial_mj_gnos = mj_gnos.getRawPtr();
06118 
06119   //extract coordinates from multivector.
06120   for (int dim=0; dim < this->coord_dim; dim++){
06121     ArrayRCP<const pq_scalar_t> ar;
06122     xyz[dim].getInputArray(ar);
06123     //pqJagged coordinate values assignment
06124     this->mj_coordinates[dim] =  (pq_scalar_t *)ar.getRawPtr();
06125   }
06126 
06127   //if no weights are provided set uniform weight.
06128   if (this->num_weights_per_coord == 0){
06129     this->mj_uniform_weights[0] = true;
06130     this->mj_weights[0] = NULL;
06131   }
06132   else{
06133     //if weights are provided get weights for all weight indices
06134     for (int wdim = 0; wdim < this->num_weights_per_coord; wdim++){
06135       ArrayRCP<const pq_scalar_t> ar;
06136       wgts[wdim].getInputArray(ar);
06137       this->mj_uniform_weights[wdim] = false;
06138       this->mj_weights[wdim] = (pq_scalar_t *) ar.getRawPtr();
06139     }
06140   }
06141 
06142   for (int wdim = 0; wdim < criteria_dim; wdim++){
06143     if (this->mj_solution->criteriaHasUniformPartSizes(wdim)){
06144       this->mj_uniform_parts[wdim] = true;
06145       this->mj_part_sizes[wdim] = NULL;
06146     }
06147     else{
06148       cerr << "MJ does not support non uniform target part weights" << endl;
06149       exit(1);
06150     }
06151   }
06152 }
06153 
06154 /* \brief Sets the partitioning parameters for multijagged algorithm.
06155  * \param pl: is the parameter list provided to zoltan2 call
06156  * */
06157 template <typename Adapter>
06158 void Zoltan2_AlgMJ<Adapter>::set_input_parameters(const Teuchos::ParameterList &pl){
06159 
06160   const Teuchos::ParameterEntry *pe = pl.getEntryPtr("imbalance_tolerance");
06161   if (pe){
06162     double tol;
06163     tol = pe->getValue(&tol);
06164     this->imbalance_tolerance = tol - 1.0;
06165   }
06166 
06167     // TODO: May be a more relaxed tolerance is needed. RCB uses 10%
06168   if (this->imbalance_tolerance <= 0)
06169     this->imbalance_tolerance= 10e-4;
06170 
06171   //if an input partitioning array is provided.
06172   this->part_no_array = NULL;
06173   //the length of the input partitioning array.
06174   this->recursion_depth = 0;
06175 
06176   if (pl.getPtr<Array <partId_t> >("mj_parts")){
06177     this->part_no_array = (partId_t *) pl.getPtr<Array <partId_t> >("mj_parts")->getRawPtr();
06178     this->recursion_depth = pl.getPtr<Array <partId_t> >("mj_parts")->size() - 1;
06179     this->mj_env->debug(2, "mj_parts provided by user");
06180   }
06181 
06182   //get mj specific parameters.
06183   this->distribute_points_on_cut_lines = true;
06184   this->max_concurrent_part_calculation = 1;
06185 
06186   this->mj_run_as_rcb = 0;
06187   int mj_user_recursion_depth = -1;
06188   this->mj_keep_part_boxes = 0;
06189   this->check_migrate_avoid_migration_option = 0;
06190   this->minimum_migration_imbalance = 0.35;
06191 
06192   pe = pl.getEntryPtr("mj_minimum_migration_imbalance");
06193   if (pe){
06194     double imb;
06195     imb = pe->getValue(&imb);
06196     this->minimum_migration_imbalance = imb - 1.0;
06197   }
06198 
06199   pe = pl.getEntryPtr("mj_migration_option");
06200   if (pe){
06201     this->check_migrate_avoid_migration_option = pe->getValue(&this->check_migrate_avoid_migration_option);
06202   }else {
06203     this->check_migrate_avoid_migration_option = 0;
06204   }
06205   if (this->check_migrate_avoid_migration_option > 1) this->check_migrate_avoid_migration_option = -1;
06206 
06207 
06208   pe = pl.getEntryPtr("mj_concurrent_part_count");
06209   if (pe){
06210     this->max_concurrent_part_calculation = pe->getValue(&this->max_concurrent_part_calculation);
06211   }else {
06212     this->max_concurrent_part_calculation = 1; // Set to 1 if not provided.
06213   }
06214 
06215   pe = pl.getEntryPtr("mj_keep_part_boxes");
06216   if (pe){
06217     this->mj_keep_part_boxes = pe->getValue(&this->mj_keep_part_boxes);
06218   }else {
06219     this->mj_keep_part_boxes = 0; // Set to invalid value
06220   }
06221 
06222   //need to keep part boxes if mapping type is geometric.
06223   if (this->mj_keep_part_boxes == 0){
06224     pe = pl.getEntryPtr("mapping_type");
06225     if (pe){
06226       int mapping_type = -1;
06227       mapping_type = pe->getValue(&mapping_type);
06228       if (mapping_type == 0){
06229         mj_keep_part_boxes  = 1;
06230       }
06231     }
06232   }
06233 
06234   //need to keep part boxes if mapping type is geometric.
06235   pe = pl.getEntryPtr("mj_enable_rcb");
06236   if (pe){
06237     this->mj_run_as_rcb = pe->getValue(&this->mj_run_as_rcb);
06238   }else {
06239     this->mj_run_as_rcb = 0; // Set to invalid value
06240   }
06241 
06242   pe = pl.getEntryPtr("mj_recursion_depth");
06243   if (pe){
06244     mj_user_recursion_depth = pe->getValue(&mj_user_recursion_depth);
06245   }else {
06246     mj_user_recursion_depth = -1; // Set to invalid value
06247   }
06248 
06249   int val = 0;
06250   pe = pl.getEntryPtr("rectilinear_blocks");
06251   if (pe)
06252     val = pe->getValue(&val);
06253 
06254     // TODO: RCB default is false ? Should we change ?
06255   if (val == 1){
06256     this->distribute_points_on_cut_lines = false;
06257   } else {
06258     this->distribute_points_on_cut_lines = true;
06259   }
06260 
06261   if (this->mj_run_as_rcb){
06262     mj_user_recursion_depth = (int)(ceil(log ((this->num_global_parts)) / log (2.0)));
06263   }
06264   if (this->recursion_depth < 1){
06265     if (mj_user_recursion_depth > 0){
06266       this->recursion_depth = mj_user_recursion_depth;
06267     }
06268     else {
06269       this->recursion_depth = this->coord_dim;
06270     }
06271   }
06272 
06273   this->num_threads = 1;
06274 #ifdef HAVE_ZOLTAN2_OMP
06275 #pragma omp parallel
06276   {
06277     this->num_threads = omp_get_num_threads();
06278   }
06279 #endif
06280 
06281 }
06282 
06283 } // namespace Zoltan2
06284 
06285 #endif