RTOpToMPI.c

/*
// @HEADER
// ***********************************************************************
//
// Moocho: Multi-functional Object-Oriented arCHitecture for Optimization
//                  Copyright (2003) Sandia Corporation
//
// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
// license for use of this work by or on behalf of the U.S. Government.
//
// This library is free software; you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as
// published by the Free Software Foundation; either version 2.1 of the
// License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
// USA
// Questions? Contact Roscoe A. Bartlett (rabartl@sandia.gov)
//
// ***********************************************************************
// @HEADER
*/

#include "RTOpToMPI.h"

#include <stdlib.h>  /* malloc() and free() */
#include <assert.h>  /* assert() */

#ifdef RTOP_TO_MPI_SHOW_TIMES
#include <time.h>
#include <stdio.h>
#endif

void RTOp_MPI_type_signature(
  const int num_values
  ,const int num_indexes
  ,const int num_chars
  ,int* num_entries
  ,int block_lengths[]
  ,MPI_Aint displacements[]
  ,MPI_Datatype datatypes[]
  )
{
  int k = 0, off = 0;
  /* values */
  block_lengths[k] = 3 + num_values; /* must carry size information */
  displacements[k] = 0;
  datatypes[k]     = RTOpMPI_VALUE_TYPE;
  ++k;
  off += (3 + num_values) * sizeof(RTOp_value_type);
  /* indexes */
  if( num_indexes ) {
    block_lengths[k] = num_indexes;
    displacements[k] = off;
    datatypes[k]     = RTOpMPI_INDEX_TYPE;
    ++k;
    off += num_indexes * sizeof(RTOp_index_type);
  }
  /* chars */
  if( num_chars ) {
    block_lengths[k] = num_chars;
    displacements[k] = off;
    datatypes[k]     = RTOpMPI_CHAR_TYPE;
    ++k;
  }
  *num_entries = k;
}
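
The three arrays filled in by RTOp_MPI_type_signature() are intended to be
passed straight to MPI's derived-datatype routines, which is exactly what
RTOp_MPI_apply_op() does below.  The following minimal sketch (illustrative
only, not part of this file) shows that pattern; the helper name
example_build_reduct_type is hypothetical, and MPI is assumed to have been
initialized already.

#include <mpi.h>
#include "RTOpToMPI.h"

/* Build and commit an MPI datatype that matches the packed external
 * representation of a reduction object with the given numbers of values,
 * indexes and chars.  Returns the MPI error code. */
static int example_build_reduct_type(
  int num_values, int num_indexes, int num_chars
  ,MPI_Datatype* mpi_reduct_ext_type
  )
{
  int          num_entries = 0;
  int          block_lengths[3];
  MPI_Aint     displacements[3];
  MPI_Datatype datatypes[3];
  int          err = 0;
  RTOp_MPI_type_signature(
    num_values, num_indexes, num_chars
    ,&num_entries, block_lengths, displacements, datatypes
    );
  err = MPI_Type_struct(
    num_entries, block_lengths, displacements, datatypes
    ,mpi_reduct_ext_type
    );
  if(err) return err;
  return MPI_Type_commit( mpi_reduct_ext_type );
}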

int RTOp_extract_reduct_obj_ext_state(
  const struct RTOp_RTOp*   op
  ,RTOp_ReductTarget        reduct_obj
  ,int                      num_values
  ,int                      num_indexes
  ,int                      num_chars
  ,void*                    reduct_obj_ext
  )
{
  int err = 0;
  int
    num_values_off  = 0,
    num_indexes_off = num_values_off  + sizeof(RTOp_value_type),
    num_chars_off   = num_indexes_off + sizeof(RTOp_value_type),
    values_off      = num_chars_off   + sizeof(RTOp_value_type),
    indexes_off     = values_off      + num_values  * sizeof(RTOp_value_type),
    chars_off       = indexes_off     + num_indexes * sizeof(RTOp_index_type);
  *(RTOp_value_type*)((char*)reduct_obj_ext + num_values_off)  = num_values;
  *(RTOp_value_type*)((char*)reduct_obj_ext + num_indexes_off) = num_indexes;
  *(RTOp_value_type*)((char*)reduct_obj_ext + num_chars_off)   = num_chars;
  err = RTOp_extract_reduct_obj_state(
    op,reduct_obj
    ,num_values,  (RTOp_value_type*)((char*)reduct_obj_ext + values_off)
    ,num_indexes, (RTOp_index_type*)((char*)reduct_obj_ext + indexes_off)
    ,num_chars,   (RTOp_char_type*)((char*)reduct_obj_ext  + chars_off)
    );
  return err;
}

int RTOp_load_reduct_obj_ext_state(
  const struct RTOp_RTOp*   op
  ,const void*              reduct_obj_ext
  ,RTOp_ReductTarget        reduct_obj
  )
{
  int err = 0;
  int
    num_values_off  = 0,
    num_values      = *(RTOp_value_type*)((char*)reduct_obj_ext + num_values_off),
    num_indexes_off = num_values_off  + sizeof(RTOp_value_type),
    num_indexes     = *(RTOp_value_type*)((char*)reduct_obj_ext + num_indexes_off),
    num_chars_off   = num_indexes_off + sizeof(RTOp_value_type),
    num_chars       = *(RTOp_value_type*)((char*)reduct_obj_ext + num_chars_off),
    values_off      = num_chars_off   + sizeof(RTOp_value_type),
    indexes_off     = values_off      + num_values  * sizeof(RTOp_value_type),
    chars_off       = indexes_off     + num_indexes * sizeof(RTOp_index_type);
  err = RTOp_load_reduct_obj_state(
    op
    ,num_values,  (RTOp_value_type*)((char*)reduct_obj_ext + values_off)
    ,num_indexes, (RTOp_index_type*)((char*)reduct_obj_ext + indexes_off)
    ,num_chars,   (RTOp_char_type*)((char*)reduct_obj_ext  + chars_off)
    ,reduct_obj
    );
  return err;
}
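
The two functions above define the packed external layout used throughout
this file: three leading size entries stored as RTOp_value_type, followed by
the value, index and char data.  The sketch below (illustrative only, not
part of this file) shows a round trip through that representation for a
single reduction object; the helper name example_round_trip is hypothetical
and error checking is omitted for brevity.

#include <stdlib.h>
#include "RTOpToMPI.h"

static void example_round_trip(
  const struct RTOp_RTOp* op, RTOp_ReductTarget reduct_obj
  )
{
  int num_values = 0, num_indexes = 0, num_chars = 0;
  int ext_size = 0;
  char* ext = NULL;
  RTOp_get_reduct_type_num_entries( op, &num_values, &num_indexes, &num_chars );
  /* Same sizing that RTOp_MPI_apply_op() uses below */
  ext_size =
    (3 + num_values) * sizeof(RTOp_value_type) +
    num_indexes      * sizeof(RTOp_index_type) +
    num_chars        * sizeof(RTOp_char_type);
  ext = malloc( ext_size );
  RTOp_extract_reduct_obj_ext_state(
    op, reduct_obj, num_values, num_indexes, num_chars, ext );
  /* ... the buffer 'ext' would be reduced across processors here ... */
  RTOp_load_reduct_obj_ext_state( op, ext, reduct_obj );
  free( ext );
}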

int RTOp_MPI_apply_op(
  MPI_Comm comm, const struct RTOp_RTOp* op, int root_rank
  ,const int num_cols
  ,const int num_vecs, const struct RTOp_SubVector sub_vecs[]
  ,const int num_targ_vecs, const struct RTOp_MutableSubVector sub_targ_vecs[]
  ,RTOp_ReductTarget reduct_objs[]
  )
{
  int err = 0;
  int num_reduct_type_values = 0, num_reduct_type_indexes = 0,
    num_reduct_type_chars = 0,  num_reduct_type_entries = 0;
  RTOp_ReductTarget           *i_reduct_objs = NULL;
  int                         target_type_block_lengths[3];
  MPI_Aint                    target_type_displacements[3];
  RTOp_Datatype               target_type_datatypes[3];
  MPI_Datatype                mpi_reduct_ext_type = MPI_DATATYPE_NULL;
  int                         reduct_obj_ext_size = 0;
  char                        *i_reduct_objs_ext = NULL;
  char                        *i_reduct_objs_tmp = NULL;
  RTOp_reduct_op_func_ptr_t   reduct_op_func_ptr  = NULL;
  MPI_Op                      mpi_op = MPI_OP_NULL;
  int                         rank = -1;
  int                         k;
  int                         kc;
#ifdef RTOP_TO_MPI_SHOW_TIMES
  const double secs_per_tick = ((double)1.0) / CLOCKS_PER_SEC;
  clock_t ticks_start_start, ticks_start = 0, ticks_end = 0;
#endif

  if( comm == MPI_COMM_NULL ) {
    /* Just do the reduction on this processor and be done with it! */
    if( sub_vecs || sub_targ_vecs ) {
      for( kc = 0; kc < num_cols; ++kc ) {
        err = RTOp_apply_op(
          op, num_vecs, sub_vecs+kc*num_vecs, num_targ_vecs, sub_targ_vecs+kc*num_targ_vecs
          ,reduct_objs ? reduct_objs[kc] : RTOp_REDUCT_OBJ_NULL
          );
        if (err) goto ERR_LABEL;
      }
    }
    return 0; /* Success! */
  }

  /*
   * Get the description of the datatype of the target object.  We need this
   * in order to map it to an MPI user-defined datatype so that the reduction
   * operations can be performed over all of the processors.
   *
   * First, get the number of entries in the arrays that describe the target
   * type's datatype.
   */
  err = RTOp_get_reduct_type_num_entries(
    op, &num_reduct_type_values, &num_reduct_type_indexes, &num_reduct_type_chars );
  if(err) goto ERR_LABEL;
  num_reduct_type_entries
    = (num_reduct_type_values ? 1 : 0)
    + (num_reduct_type_indexes ? 1 : 0)
    + (num_reduct_type_chars ? 1 : 0);

  if( num_reduct_type_entries ) {
#ifdef RTOP_TO_MPI_SHOW_TIMES
    if(RTOp_MPI_apply_op_print_timings) {
      printf("RTOp_MPI_apply_op(...) : timing various MPI calls and other activities\n");
      ticks_start_start = clock();
    }
#endif
    /*
     * There is a non-null reduction target object so we need to reduce
     * it across processors
     *
     * Allocate the intermediate target object and perform the reduction for the
     * vector elements on this processor.
     */
    i_reduct_objs = malloc( sizeof(RTOp_ReductTarget) * num_cols );
    for( kc = 0; kc < num_cols; ++kc ) {
      i_reduct_objs[kc] = RTOp_REDUCT_OBJ_NULL;
      RTOp_reduct_obj_create( op, i_reduct_objs + kc );
    }
#ifdef RTOP_TO_MPI_SHOW_TIMES
    if(RTOp_MPI_apply_op_print_timings) {
      printf("calling RTOp_apply_op(...)");
      ticks_start = clock();
    }
#endif
    if( sub_vecs || sub_targ_vecs ) {
      for( kc = 0; kc < num_cols; ++kc ) {
        err = RTOp_apply_op(
          op, num_vecs, sub_vecs+kc*num_vecs, num_targ_vecs, sub_targ_vecs+kc*num_targ_vecs
          ,i_reduct_objs[kc]
          );
        if (err) goto ERR_LABEL;
      }
    }
#ifdef RTOP_TO_MPI_SHOW_TIMES
    if(RTOp_MPI_apply_op_print_timings) {
      ticks_end = clock();
      printf(" : cpu time = %g s\n", (ticks_end-ticks_start)*secs_per_tick );
    }
#endif
    /* Get the arrays that describe the reduction object datatype */
    if( num_reduct_type_values == 0 ) ++num_reduct_type_entries; /* Add the block for the sizes */
    RTOp_MPI_type_signature(
      num_reduct_type_values, num_reduct_type_indexes, num_reduct_type_chars
      ,&num_reduct_type_entries
      ,target_type_block_lengths, target_type_displacements, target_type_datatypes
      );
    /* Translate this target datatype description to an MPI datatype */
#ifdef RTOP_TO_MPI_SHOW_TIMES
    if(RTOp_MPI_apply_op_print_timings) {
      printf("calling MPI_Type_struct(...)");
      ticks_start = clock();
    }
#endif
    err = MPI_Type_struct( num_reduct_type_entries
                 , target_type_block_lengths, target_type_displacements
                 , target_type_datatypes, &mpi_reduct_ext_type );
#ifdef RTOP_TO_MPI_SHOW_TIMES
    if(RTOp_MPI_apply_op_print_timings) {
      ticks_end = clock();
      printf(" : cpu time = %g s\n", (ticks_end-ticks_start)*secs_per_tick );
    }
#endif
    if(err) goto ERR_LABEL;
#ifdef RTOP_TO_MPI_SHOW_TIMES
    if(RTOp_MPI_apply_op_print_timings) {
      printf("calling MPI_Type_commit(...)");
      ticks_start = clock();
    }
#endif
    err = MPI_Type_commit( &mpi_reduct_ext_type );
#ifdef RTOP_TO_MPI_SHOW_TIMES
    if(RTOp_MPI_apply_op_print_timings) {
      ticks_end = clock();
      printf(" : cpu time = %g s\n", (ticks_end-ticks_start)*secs_per_tick );
    }
#endif
    if(err) goto ERR_LABEL;
    /*
     * Create an external contiguous representation for the intermediate
     * reduction object that MPI can use.  This is very low level but at
     * least the user does not have to do it.
     */
    reduct_obj_ext_size =
      (3 + num_reduct_type_values) * sizeof(RTOp_value_type) +
      num_reduct_type_indexes      * sizeof(RTOp_index_type) +
      num_reduct_type_chars        * sizeof(RTOp_char_type);
    i_reduct_objs_ext = malloc( reduct_obj_ext_size * num_cols );
    for( kc = 0; kc < num_cols; ++kc ) {
      RTOp_extract_reduct_obj_ext_state(
        op,i_reduct_objs[kc],num_reduct_type_values,num_reduct_type_indexes,num_reduct_type_chars
        ,i_reduct_objs_ext+kc*reduct_obj_ext_size
        );
    }
    /*
     * Now perform a global reduction over all of the processors.
     *
     * Get the user-defined reduction operation for MPI to use.
     */
    RTOp_get_reduct_op( op, &reduct_op_func_ptr );
    if( reduct_op_func_ptr != NULL ) {
      /*
       * The operator object has returned an MPI-compatible reduction
       * function, so MPI will be of great help in performing the global
       * reduction :-).
       *
       * Create the MPI reduction operator object.
       */
#ifdef RTOP_TO_MPI_SHOW_TIMES
      if(RTOp_MPI_apply_op_print_timings) {
        printf("calling MPI_Op_create(...)");
        ticks_start = clock();
      }
#endif
      err = MPI_Op_create(
        reduct_op_func_ptr
        ,1                   /* The reduction op is commutative (1 == yes) */
        ,&mpi_op
        );
#ifdef RTOP_TO_MPI_SHOW_TIMES
      if(RTOp_MPI_apply_op_print_timings) {
        ticks_end = clock();
        printf(" : cpu time = %g s\n", (ticks_end-ticks_start)*secs_per_tick );
      }
#endif
      if(err) goto ERR_LABEL;
      if( root_rank >= 0 ) {
        MPI_Comm_rank( comm, &rank );
        /* Apply the reduction operation over all of the processors and
         * reduce to one target object only on the root processor. */
        i_reduct_objs_tmp = malloc( reduct_obj_ext_size * num_cols );
        err = MPI_Reduce(
          i_reduct_objs_ext
          ,rank == root_rank ? i_reduct_objs_tmp : NULL  /* recvbuf is only significant on the root process */
          ,num_cols, mpi_reduct_ext_type
          ,mpi_op, root_rank, comm
          );
        if(err) goto ERR_LABEL;
        if( rank == root_rank ) { /* bit-by-bit copy */
          for( k = 0; k < reduct_obj_ext_size * num_cols; ++k ) i_reduct_objs_ext[k] = i_reduct_objs_tmp[k];
        }
      }
      else {
        /* Apply the reduction operation over all of the processors and
         * reduce to one target object on each processor. */
        i_reduct_objs_tmp = malloc( reduct_obj_ext_size * num_cols );
#ifdef RTOP_TO_MPI_SHOW_TIMES
        if(RTOp_MPI_apply_op_print_timings) {
          printf("calling MPI_Allreduce(...)");
          ticks_start = clock();
        }
#endif
        err = MPI_Allreduce(
          i_reduct_objs_ext, i_reduct_objs_tmp, num_cols
          ,mpi_reduct_ext_type, mpi_op, comm
          );
#ifdef RTOP_TO_MPI_SHOW_TIMES
        if(RTOp_MPI_apply_op_print_timings) {
          ticks_end = clock();
          printf(" : cpu time = %g s\n", (ticks_end-ticks_start)*secs_per_tick );
        }
#endif
        if(err) goto ERR_LABEL;
        for( k = 0; k < reduct_obj_ext_size * num_cols; ++k ) i_reduct_objs_ext[k] = i_reduct_objs_tmp[k];
      }
      /* Load the updated state of the reduction target object and reduce. */
      for( kc = 0; kc < num_cols; ++kc ) {
        RTOp_load_reduct_obj_ext_state( op, i_reduct_objs_ext+kc*reduct_obj_ext_size, i_reduct_objs[kc] );
        RTOp_reduce_reduct_objs( op, i_reduct_objs[kc], reduct_objs[kc] );
      }
    }
    else {
      /*
       * We must do without the MPI compatible reduction function :-(  We must
       * manually perform the reduction operation ourselves.  Note, this will
       * not be as efficient as when MPI would do it but it helps take some
       * of the burden off of the developers of operator classes.
       *
       * What we need to do is to gather all of the intermediate reduction
       * objects to the root process and then do the reduction pair-wise.
       */
      assert( reduct_op_func_ptr ); /* ToDo: Implement! */
    }
  }
  else {
    /*
     * There is a null reduction target object so we don't need to reduce
     * it across processors.
     */
    if( sub_vecs || sub_targ_vecs ) {
      for( kc = 0; kc < num_cols; ++kc ) {
        err = RTOp_apply_op(
          op, num_vecs, sub_vecs+kc*num_vecs, num_targ_vecs, sub_targ_vecs+kc*num_targ_vecs
          ,RTOp_REDUCT_OBJ_NULL
          );
        if (err) goto ERR_LABEL;
      }
    }
  }

ERR_LABEL:

  /* Clean up dynamically allocated memory! */

  if( i_reduct_objs_tmp != NULL )
    free( i_reduct_objs_tmp );
  if( mpi_op != MPI_OP_NULL )
    MPI_Op_free( &mpi_op );
  if( i_reduct_objs_ext != NULL )
    free( i_reduct_objs_ext );
  if( mpi_reduct_ext_type != MPI_DATATYPE_NULL )
    MPI_Type_free( &mpi_reduct_ext_type );
  if( i_reduct_objs != NULL ) {
    for( kc = 0; kc < num_cols; ++kc )
      RTOp_reduct_obj_free( op, &i_reduct_objs[kc] ); /* free each intermediate reduction object */
    free( i_reduct_objs );
  }

  return err;
}

#ifdef RTOP_TO_MPI_SHOW_TIMES
int RTOp_MPI_apply_op_print_timings = 0;
#endif
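
A typical way to invoke RTOp_MPI_apply_op() for a pure reduction (no mutable
target vectors) is sketched below.  This is illustrative only and not part of
this file: the wrapper name example_all_reduce is hypothetical, the caller is
assumed to have already filled in one RTOp_SubVector per vector argument with
this processor's local elements, and to have created the reduction object
with RTOp_reduct_obj_create().

#include <mpi.h>
#include "RTOpToMPI.h"

static int example_all_reduce(
  MPI_Comm comm, const struct RTOp_RTOp* op
  ,int num_vecs, const struct RTOp_SubVector sub_vecs[]
  ,RTOp_ReductTarget reduct_objs[]  /* one already-created reduction object */
  )
{
  /* root_rank < 0 selects the MPI_Allreduce() path above, so every
   * processor ends up with the fully reduced target object;
   * num_cols == 1 means a single set of sub-vectors. */
  return RTOp_MPI_apply_op(
    comm, op, -1 /* root_rank */, 1 /* num_cols */
    ,num_vecs, sub_vecs
    ,0, NULL /* no mutable (target) sub-vectors */
    ,reduct_objs
    );
}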
