|
RTOpPack: Extra C/C++ Code for Vector Reduction/Transformation Operators Version of the Day
|
00001 /* 00002 // @HEADER 00003 // *********************************************************************** 00004 // 00005 // Moocho: Multi-functional Object-Oriented arCHitecture for Optimization 00006 // Copyright (2003) Sandia Corporation 00007 // 00008 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive 00009 // license for use of this work by or on behalf of the U.S. Government. 00010 // 00011 // This library is free software; you can redistribute it and/or modify 00012 // it under the terms of the GNU Lesser General Public License as 00013 // published by the Free Software Foundation; either version 2.1 of the 00014 // License, or (at your option) any later version. 00015 // 00016 // This library is distributed in the hope that it will be useful, but 00017 // WITHOUT ANY WARRANTY; without even the implied warranty of 00018 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00019 // Lesser General Public License for more details. 00020 // 00021 // You should have received a copy of the GNU Lesser General Public 00022 // License along with this library; if not, write to the Free Software 00023 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 00024 // USA 00025 // Questions? Contact Roscoe A. Bartlett (rabartl@sandia.gov) 00026 // 00027 // *********************************************************************** 00028 // @HEADER 00029 */ 00030 00031 #include "RTOpToMPI.h" 00032 00033 #include <stdlib.h> 00034 00035 #ifdef RTOP_TO_MPI_SHOW_TIMES 00036 #include <time.h> 00037 #include <stdio.h> 00038 #endif 00039 00040 void RTOp_MPI_type_signature( 00041 const int num_values 00042 ,const int num_indexes 00043 ,const int num_chars 00044 ,int* num_entries 00045 ,int block_lengths[] 00046 ,MPI_Aint displacements[] 00047 ,MPI_Datatype datatypes[] 00048 ) 00049 { 00050 int k = 0, off = 0; 00051 /* values */ 00052 block_lengths[k] = 3 + num_values; /* must carry size information */ 00053 displacements[k] = 0; 00054 datatypes[k] = RTOpMPI_VALUE_TYPE; 00055 ++k; 00056 off += (3 + num_values) * sizeof(RTOp_value_type); 00057 /* indexes */ 00058 if( num_indexes ) { 00059 block_lengths[k] = num_indexes; 00060 displacements[k] = off; 00061 datatypes[k] = RTOpMPI_INDEX_TYPE; 00062 ++k; 00063 off += num_indexes * sizeof(RTOp_index_type); 00064 } 00065 /* chars */ 00066 if( num_chars ) { 00067 block_lengths[k] = num_chars; 00068 displacements[k] = off; 00069 datatypes[k] = RTOpMPI_CHAR_TYPE; 00070 ++k; 00071 } 00072 *num_entries = k; 00073 } 00074 00075 int RTOp_extract_reduct_obj_ext_state( 00076 const struct RTOp_RTOp* op 00077 ,RTOp_ReductTarget reduct_obj 00078 ,int num_values 00079 ,int num_indexes 00080 ,int num_chars 00081 ,void* reduct_obj_ext 00082 ) 00083 { 00084 int err = 0; 00085 int 00086 num_values_off = 0, 00087 num_indexes_off = num_values_off + sizeof(RTOp_value_type), 00088 num_chars_off = num_indexes_off + sizeof(RTOp_value_type), 00089 values_off = num_chars_off + sizeof(RTOp_value_type), 00090 indexes_off = values_off + num_values * sizeof(RTOp_value_type), 00091 chars_off = indexes_off + num_indexes * sizeof(RTOp_index_type); 00092 *(RTOp_value_type*)((char*)reduct_obj_ext + num_values_off) = num_values; 00093 *(RTOp_value_type*)((char*)reduct_obj_ext + num_indexes_off) = num_indexes; 00094 *(RTOp_value_type*)((char*)reduct_obj_ext + num_chars_off) = num_chars; 00095 err = RTOp_extract_reduct_obj_state( 00096 op,reduct_obj 00097 ,num_values, (RTOp_value_type*)((char*)reduct_obj_ext + values_off) 00098 ,num_indexes, (RTOp_index_type*)((char*)reduct_obj_ext + indexes_off) 00099 ,num_chars, (RTOp_char_type*)((char*)reduct_obj_ext + chars_off) 00100 ); 00101 return err; 00102 } 00103 00104 int RTOp_load_reduct_obj_ext_state( 00105 const struct RTOp_RTOp* op 00106 ,const void* reduct_obj_ext 00107 ,RTOp_ReductTarget reduct_obj 00108 ) 00109 { 00110 int err = 0; 00111 int 00112 num_values_off = 0, 00113 num_values = *(RTOp_value_type*)((char*)reduct_obj_ext + num_values_off), 00114 num_indexes_off = num_values_off + sizeof(RTOp_value_type), 00115 num_indexes = *(RTOp_value_type*)((char*)reduct_obj_ext + num_indexes_off), 00116 num_chars_off = num_indexes_off + sizeof(RTOp_value_type), 00117 num_chars = *(RTOp_value_type*)((char*)reduct_obj_ext + num_chars_off), 00118 values_off = num_chars_off + sizeof(RTOp_value_type), 00119 indexes_off = values_off + num_values * sizeof(RTOp_value_type), 00120 chars_off = indexes_off + num_indexes * sizeof(RTOp_index_type); 00121 err = RTOp_load_reduct_obj_state( 00122 op 00123 ,num_values, (RTOp_value_type*)((char*)reduct_obj_ext + values_off) 00124 ,num_indexes, (RTOp_index_type*)((char*)reduct_obj_ext + indexes_off) 00125 ,num_chars, (RTOp_char_type*)((char*) reduct_obj_ext + chars_off) 00126 ,reduct_obj 00127 ); 00128 return err; 00129 } 00130 00131 int RTOp_MPI_apply_op( 00132 MPI_Comm comm, const struct RTOp_RTOp* op, int root_rank 00133 ,const int num_cols 00134 ,const int num_vecs, const struct RTOp_SubVector sub_vecs[] 00135 ,const int num_targ_vecs, const struct RTOp_MutableSubVector sub_targ_vecs[] 00136 ,RTOp_ReductTarget reduct_objs[] 00137 ) 00138 { 00139 int err = 0; 00140 int num_reduct_type_values = 0, num_reduct_type_indexes = 0, 00141 num_reduct_type_chars = 0, num_reduct_type_entries = 0; 00142 RTOp_ReductTarget *i_reduct_objs = NULL; 00143 int target_type_block_lengths[3]; 00144 MPI_Aint target_type_displacements[3]; 00145 RTOp_Datatype target_type_datatypes[3]; 00146 MPI_Datatype mpi_reduct_ext_type = MPI_DATATYPE_NULL; 00147 int reduct_obj_ext_size = 0; 00148 char *i_reduct_objs_ext = NULL; 00149 char *i_reduct_objs_tmp = NULL; 00150 RTOp_reduct_op_func_ptr_t reduct_op_func_ptr = NULL; 00151 MPI_Op mpi_op = MPI_OP_NULL; 00152 int rank = -1; 00153 int k; 00154 int kc; 00155 #ifdef RTOP_TO_MPI_SHOW_TIMES 00156 const double secs_per_tick = ((double)1.0) / CLOCKS_PER_SEC; 00157 clock_t ticks_start_start, ticks_start=0, ticks_end = 0; 00158 #endif 00159 00160 if( comm == MPI_COMM_NULL ) { 00161 /* Just do the reduction on this processor and be done with it! */ 00162 if( sub_vecs || sub_targ_vecs ) { 00163 for( kc = 0; kc < num_cols; ++kc ) { 00164 err = RTOp_apply_op( 00165 op, num_vecs, sub_vecs+kc*num_vecs, num_targ_vecs, sub_targ_vecs+kc*num_targ_vecs 00166 ,reduct_objs ? reduct_objs[kc] : RTOp_REDUCT_OBJ_NULL 00167 ); 00168 if (err) goto ERR_LABEL; 00169 } 00170 } 00171 return 0; /* Success! */ 00172 } 00173 00174 /* 00175 * Get the description of the datatype of the target object. 00176 * We need this in order to map it to an MPI user defined 00177 * data type so that the reduction operations can be performed 00178 * over all of the of processors. 00179 * 00180 * Get the number of the entries in the array that describes 00181 * target type's datatypes 00182 */ 00183 err = RTOp_get_reduct_type_num_entries( 00184 op, &num_reduct_type_values, &num_reduct_type_indexes, &num_reduct_type_chars ); 00185 if(err) goto ERR_LABEL; 00186 num_reduct_type_entries 00187 = (num_reduct_type_values ? 1 : 0) 00188 + (num_reduct_type_indexes ? 1 : 0) 00189 + (num_reduct_type_chars ? 1 : 0); 00190 00191 if( num_reduct_type_entries ) { 00192 #ifdef RTOP_TO_MPI_SHOW_TIMES 00193 if(RTOp_MPI_apply_op_print_timings) { 00194 printf("RTOp_MPI_apply_op(...) : timing various MPI calls and other activities\n"); 00195 ticks_start_start = clock(); 00196 } 00197 #endif 00198 /* 00199 * There is a non-null reduction target object so we need to reduce 00200 * it across processors 00201 * 00202 * Allocate the intermediate target object and perform the reduction for the 00203 * vector elements on this processor. 00204 */ 00205 i_reduct_objs = malloc( sizeof(RTOp_ReductTarget) * num_cols ); 00206 for( kc = 0; kc < num_cols; ++kc ) { 00207 i_reduct_objs[kc] = RTOp_REDUCT_OBJ_NULL; 00208 RTOp_reduct_obj_create( op, i_reduct_objs + kc ); 00209 } 00210 #ifdef RTOP_TO_MPI_SHOW_TIMES 00211 if(RTOp_MPI_apply_op_print_timings) { 00212 printf("calling RTOp_apply_op(...)"); 00213 ticks_start = clock(); 00214 } 00215 #endif 00216 if( sub_vecs || sub_targ_vecs ) { 00217 for( kc = 0; kc < num_cols; ++kc ) { 00218 err = RTOp_apply_op( 00219 op, num_vecs, sub_vecs+kc*num_vecs, num_targ_vecs, sub_targ_vecs+kc*num_targ_vecs 00220 ,i_reduct_objs[kc] 00221 ); 00222 if (err) goto ERR_LABEL; 00223 } 00224 } 00225 #ifdef RTOP_TO_MPI_SHOW_TIMES 00226 if(RTOp_MPI_apply_op_print_timings) { 00227 ticks_end = clock(); 00228 printf(" : cpu time = %g s\n", (ticks_end-ticks_start)*secs_per_tick ); 00229 } 00230 #endif 00231 /* Get the arrays that describe the reduction object datatype */ 00232 if( num_reduct_type_values == 0 ) ++num_reduct_type_entries; /* Add the block for the sizes */ 00233 RTOp_MPI_type_signature( 00234 num_reduct_type_values, num_reduct_type_indexes, num_reduct_type_chars 00235 ,&num_reduct_type_entries 00236 ,target_type_block_lengths, target_type_displacements, target_type_datatypes 00237 ); 00238 /* Translate this target datatype description to an MPI datatype */ 00239 #ifdef RTOP_TO_MPI_SHOW_TIMES 00240 if(RTOp_MPI_apply_op_print_timings) { 00241 printf("calling MPI_Type_struct(...)"); 00242 ticks_start = clock(); 00243 } 00244 #endif 00245 err = MPI_Type_struct( num_reduct_type_entries 00246 , target_type_block_lengths, target_type_displacements 00247 , target_type_datatypes, &mpi_reduct_ext_type ); 00248 #ifdef RTOP_TO_MPI_SHOW_TIMES 00249 if(RTOp_MPI_apply_op_print_timings) { 00250 ticks_end = clock(); 00251 printf(" : cpu time = %g s\n", (ticks_end-ticks_start)*secs_per_tick ); 00252 } 00253 #endif 00254 if(err) goto ERR_LABEL; 00255 #ifdef RTOP_TO_MPI_SHOW_TIMES 00256 if(RTOp_MPI_apply_op_print_timings) { 00257 printf("calling MPI_Type_commit(...)"); 00258 ticks_start = clock(); 00259 } 00260 #endif 00261 err = MPI_Type_commit( &mpi_reduct_ext_type ); 00262 #ifdef RTOP_TO_MPI_SHOW_TIMES 00263 if(RTOp_MPI_apply_op_print_timings) { 00264 ticks_end = clock(); 00265 printf(" : cpu time = %g s\n", (ticks_end-ticks_start)*secs_per_tick ); 00266 } 00267 #endif 00268 if(err) goto ERR_LABEL; 00269 /* */ 00270 /* Create an external contiguous representation for the intermediate reduction object */ 00271 /* that MPI can use. This is very low level but at least the user */ 00272 /* does not have to do it. */ 00273 /* */ 00274 reduct_obj_ext_size = 00275 (3 + num_reduct_type_values) * sizeof(RTOp_value_type) + 00276 num_reduct_type_indexes * sizeof(RTOp_index_type) + 00277 num_reduct_type_chars * sizeof(RTOp_char_type); 00278 i_reduct_objs_ext = malloc( reduct_obj_ext_size * num_cols ); 00279 for( kc = 0; kc < num_cols; ++kc ) { 00280 RTOp_extract_reduct_obj_ext_state( 00281 op,i_reduct_objs[kc],num_reduct_type_values,num_reduct_type_indexes,num_reduct_type_chars 00282 ,i_reduct_objs_ext+kc*reduct_obj_ext_size 00283 ); 00284 } 00285 /* */ 00286 /* Now perform a global reduction over all of the processors. */ 00287 /* */ 00288 /* Get the user defined reduction operation for MPI to use */ 00289 RTOp_get_reduct_op( op, &reduct_op_func_ptr ); 00290 if( reduct_op_func_ptr != NULL ) { 00291 /* */ 00292 /* The operator object has returned an MPI compatible reduction function so */ 00293 /* MPI will be of great help in preforming the global reduction :-). */ 00294 /* */ 00295 /* Create the MPI reduction operator object */ 00296 /* */ 00297 #ifdef RTOP_TO_MPI_SHOW_TIMES 00298 if(RTOp_MPI_apply_op_print_timings) { 00299 printf("calling MPI_Op_create(...)"); 00300 ticks_start = clock(); 00301 } 00302 #endif 00303 err = MPI_Op_create( 00304 reduct_op_func_ptr 00305 ,1 /* The op is communitive? yes == 1! */ 00306 ,&mpi_op 00307 ); 00308 #ifdef RTOP_TO_MPI_SHOW_TIMES 00309 if(RTOp_MPI_apply_op_print_timings) { 00310 ticks_end = clock(); 00311 printf(" : cpu time = %g s\n", (ticks_end-ticks_start)*secs_per_tick ); 00312 } 00313 #endif 00314 if(err) goto ERR_LABEL; 00315 if( root_rank >= 0 ) { 00316 MPI_Comm_rank( comm, &rank ); 00317 /* Apply the reduction operation over all of the processors */ 00318 /* and reduce to one target object only on the root processor */ 00319 i_reduct_objs_tmp = malloc( reduct_obj_ext_size * num_cols ); 00320 err = MPI_Reduce( 00321 i_reduct_objs_ext 00322 ,rank == root_rank ? i_reduct_objs_tmp : NULL /* I don't know if this is MPI legal? */ 00323 ,num_cols, mpi_reduct_ext_type 00324 ,mpi_op, root_rank, comm 00325 ); 00326 if(err) goto ERR_LABEL; 00327 if( rank == root_rank ) { /* bit-by-bit copy */ 00328 for( k = 0; k < reduct_obj_ext_size * num_cols; ++k ) i_reduct_objs_ext[k] = i_reduct_objs_tmp[k]; 00329 } 00330 } 00331 else { 00332 /* Apply the reduction operation over all of the processors */ 00333 /* and reduce to one target object on each processor */ 00334 i_reduct_objs_tmp = malloc( reduct_obj_ext_size * num_cols ); 00335 #ifdef RTOP_TO_MPI_SHOW_TIMES 00336 if(RTOp_MPI_apply_op_print_timings) { 00337 printf("calling MPI_Allreduce(...)"); 00338 ticks_start = clock(); 00339 } 00340 #endif 00341 err = MPI_Allreduce( 00342 i_reduct_objs_ext, i_reduct_objs_tmp, num_cols 00343 ,mpi_reduct_ext_type, mpi_op, comm 00344 ); 00345 #ifdef RTOP_TO_MPI_SHOW_TIMES 00346 if(RTOp_MPI_apply_op_print_timings) { 00347 ticks_end = clock(); 00348 printf(" : cpu time = %g s\n", (ticks_end-ticks_start)*secs_per_tick ); 00349 } 00350 #endif 00351 if(err) goto ERR_LABEL; 00352 for( k = 0; k < reduct_obj_ext_size * num_cols; ++k ) i_reduct_objs_ext[k] = i_reduct_objs_tmp[k]; 00353 } 00354 /* Load the updated state of the reduction target object and reduce. */ 00355 for( kc = 0; kc < num_cols; ++kc ) { 00356 RTOp_load_reduct_obj_ext_state( op, i_reduct_objs_ext+kc*reduct_obj_ext_size, i_reduct_objs[kc] ); 00357 RTOp_reduce_reduct_objs( op, i_reduct_objs[kc], reduct_objs[kc] ); 00358 } 00359 } 00360 else { 00361 /* */ 00362 /* We must do without the MPI compatible reduction function :-( We must */ 00363 /* manually perform the reduction operation ourselves. Note, this will */ 00364 /* not be as efficient as when MPI would do it but it helps take some */ 00365 /* of the burden off of the developers of operator classes. */ 00366 /* */ 00367 /* What we need to do is to gather all of the intermediate reduction */ 00368 /* objects to the root process and then do the reduction pair-wise. */ 00369 /* */ 00370 assert( reduct_op_func_ptr ); /* ToDo: Implement! */ 00371 } 00372 } 00373 else { 00374 /* */ 00375 /* There is a null reduction target object so we don't need to reduce */ 00376 /* it across processors */ 00377 /* */ 00378 if( sub_vecs || sub_targ_vecs ) { 00379 for( kc = 0; kc < num_cols; ++kc ) { 00380 err = RTOp_apply_op( 00381 op, num_vecs, sub_vecs+kc*num_vecs, num_targ_vecs, sub_targ_vecs+kc*num_targ_vecs 00382 ,RTOp_REDUCT_OBJ_NULL 00383 ); 00384 if (err) goto ERR_LABEL; 00385 } 00386 } 00387 } 00388 00389 ERR_LABEL: 00390 00391 /* Clean up dynamically allocated memory! */ 00392 00393 if( i_reduct_objs_tmp != NULL ) 00394 free( i_reduct_objs_tmp ); 00395 if( mpi_op != MPI_OP_NULL ) 00396 MPI_Op_free( &mpi_op ); 00397 if( i_reduct_objs_ext != NULL ) 00398 free( i_reduct_objs_ext ); 00399 if( mpi_reduct_ext_type != MPI_DATATYPE_NULL ) 00400 MPI_Type_free( &mpi_reduct_ext_type ); 00401 if( i_reduct_objs != NULL ) { 00402 for( kc = 0; kc < num_cols; ++kc ) 00403 RTOp_reduct_obj_free( op, &i_reduct_objs[0] ); 00404 free( i_reduct_objs ); 00405 } 00406 00407 return err; 00408 } 00409 00410 #ifdef RTOP_TO_MPI_SHOW_TIMES 00411 int RTOp_MPI_apply_op_print_timings = 0; 00412 #endif
1.7.4