00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00028 #ifndef util_ParallelReduce_hpp
00029 #define util_ParallelReduce_hpp
00030
00031 #include <cstddef>
00032 #include <iosfwd>
00033 #include <string>
00034 #include <util/Parallel.hpp>
00035 #include <util/SimpleArrayOps.hpp>
00036
00037
00038
00039 namespace phdmesh {
00040
00044 void all_write_string( ParallelMachine ,
00045 std::ostream & ,
00046 const std::string & );
00047
00048 void all_reduce_sum( ParallelMachine ,
00049 const double * local , double * global , unsigned count );
00050
00051 void all_reduce_sum( ParallelMachine ,
00052 const float * local , float * global , unsigned count );
00053
00054 void all_reduce_sum( ParallelMachine ,
00055 const int * local , int * global , unsigned count );
00056
00057 void all_reduce_bor( ParallelMachine ,
00058 const unsigned * local ,
00059 unsigned * global , unsigned count );
00060
00072 }
00073
00074
00075
00076
00077 namespace phdmesh {
00078
00079 extern "C" {
00080 typedef void (*ParallelReduceOp)
00081 ( void * inv , void * outv , int * , ParallelDatatype * );
00082 }
00083
00084 void all_reduce_internal( ParallelMachine arg_comm ,
00085 ParallelReduceOp arg_op ,
00086 void * arg_in ,
00087 void * arg_out ,
00088 unsigned arg_len );
00089
00090 namespace {
00091
00092
00093
00094
00095 struct ReduceEnd {
00096 struct BufferType {};
00097 void copyin( BufferType & ) const {}
00098 void copyout( BufferType & ) const {}
00099 static void op( BufferType & , BufferType & ) {}
00100 };
00101
00102
00103
00104 template < class Oper , class Next = ReduceEnd >
00105 struct Reduce {
00106 typedef typename Oper::type Type ;
00107 enum { N = Oper::N };
00108
00109 struct BufferType {
00110 Type m_value[N];
00111 typename Next::BufferType m_next ;
00112 };
00113
00114 Next m_next ;
00115 Type * m_ptr ;
00116
00117 Next & set( const Oper & arg ) { m_ptr = arg.ptr ; return m_next ; }
00118
00119 void reduce( ParallelMachine comm ) const ;
00120
00121 void copyin( BufferType & b ) const
00122 { Copy<N>( b.m_value , m_ptr ); m_next.copyin( b.m_next ); }
00123
00124 void copyout( BufferType & b ) const
00125 { Copy<N>( m_ptr , b.m_value ); m_next.copyout( b.m_next ); }
00126
00127 static void op( BufferType & dst , BufferType & src )
00128 { Oper::op(dst.m_value,src.m_value); Next::op(dst.m_next,src.m_next); }
00129
00130 static void void_op( void*inv, void*inoutv, int*, ParallelDatatype*);
00131 };
00132
00133 template <class Oper, class Next>
00134 void Reduce<Oper,Next>::void_op( void*inv, void*inoutv,int*,ParallelDatatype*)
00135 {
00136 op( * reinterpret_cast<BufferType*>( inoutv ) ,
00137 * reinterpret_cast<BufferType*>( inv ) );
00138 }
00139
00140 template <class Oper, class Next>
00141 void Reduce<Oper,Next>::reduce( ParallelMachine comm ) const
00142 {
00143 ParallelReduceOp f = reinterpret_cast<ParallelReduceOp>( & void_op );
00144 BufferType inbuf , outbuf ;
00145 copyin( inbuf );
00146 all_reduce_internal( comm , f , & inbuf , & outbuf , sizeof(BufferType) );
00147 copyout( outbuf );
00148 }
00149
00150 }
00151 }
00152
00153
00154
00155
00156 namespace phdmesh {
00157
00158 template < class Op1 >
00159 inline
00160 void all_reduce( ParallelMachine comm , const Op1 & op1 )
00161 {
00162 Reduce< Op1 > work ;
00163 work.set( op1 );
00164 work.reduce( comm );
00165 }
00166
00167 template < class Op1 , class Op2 >
00168 inline
00169 void all_reduce( ParallelMachine comm , const Op1 & op1 ,
00170 const Op2 & op2 )
00171 {
00172 Reduce< Op1 ,
00173 Reduce< Op2 > > work ;
00174 work.set( op1 ).set( op2 );
00175 work.reduce( comm );
00176 }
00177
00178 template < class Op1 , class Op2 , class Op3 >
00179 inline
00180 void all_reduce( ParallelMachine comm , const Op1 & op1 ,
00181 const Op2 & op2 ,
00182 const Op3 & op3 )
00183 {
00184 Reduce< Op1 ,
00185 Reduce< Op2 ,
00186 Reduce< Op3 > > > work ;
00187 work.set( op1 ).set( op2 ).set( op3 );
00188 work.reduce( comm );
00189 }
00190
00191 template < class Op1 , class Op2 , class Op3 , class Op4 >
00192 inline
00193 void all_reduce( ParallelMachine comm , const Op1 & op1 ,
00194 const Op2 & op2 ,
00195 const Op3 & op3 ,
00196 const Op4 & op4 )
00197 {
00198 Reduce< Op1 ,
00199 Reduce< Op2 ,
00200 Reduce< Op3 ,
00201 Reduce< Op4 > > > > work ;
00202 work.set( op1 ).set( op2 ).set( op3 ).set( op4 );
00203 work.reduce( comm );
00204 }
00205
00206 template < class Op1 , class Op2 , class Op3 , class Op4 ,
00207 class Op5 >
00208 inline
00209 void all_reduce( ParallelMachine comm , const Op1 & op1 ,
00210 const Op2 & op2 ,
00211 const Op3 & op3 ,
00212 const Op4 & op4 ,
00213 const Op5 & op5 )
00214 {
00215 Reduce< Op1 ,
00216 Reduce< Op2 ,
00217 Reduce< Op3 ,
00218 Reduce< Op4 ,
00219 Reduce< Op5 > > > > > work ;
00220 work.set( op1 ).set( op2 ).set( op3 ).set( op4 ).set( op5 );
00221 work.reduce( comm );
00222 }
00223
00224 template < class Op1 , class Op2 , class Op3 , class Op4 ,
00225 class Op5 , class Op6 >
00226 inline
00227 void all_reduce( ParallelMachine comm , const Op1 & op1 ,
00228 const Op2 & op2 ,
00229 const Op3 & op3 ,
00230 const Op4 & op4 ,
00231 const Op5 & op5 ,
00232 const Op6 & op6 )
00233 {
00234 Reduce< Op1 ,
00235 Reduce< Op2 ,
00236 Reduce< Op3 ,
00237 Reduce< Op4 ,
00238 Reduce< Op5 ,
00239 Reduce< Op6 > > > > > > work ;
00240 work.set( op1 ).set( op2 ).set( op3 ).set( op4 ).set( op5 ).set( op6 );
00241 work.reduce( comm );
00242 }
00243
00244 }
00245
00246
00247
00248 #endif
00249