#include "colls_private.h"
//#include <star-reduction.c>
-/* change number of core per smp-node
- we assume that number of core per process will be the same for all implementations */
-#ifndef NUM_CORE
-#define NUM_CORE 8
-#endif
/*
This function performs an all-reduce operation as follows.
int tag = COLL_TAG_ALLREDUCE;
int mask, src, dst;
MPI_Status status;
- int num_core = simcall_host_get_core(SIMIX_host_self());
- // do we use the default one or the number of cores in the platform ?
- // if the number of cores is one, the platform may be simulated with 1 node = 1 core
- if (num_core == 1) num_core = NUM_CORE;
+ if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
+ smpi_comm_init_smp(comm);
+ }
+ int num_core=1;
+ if (smpi_comm_is_uniform(comm)){
+ num_core = smpi_comm_size(smpi_comm_get_intra_comm(comm));
+ }
comm_size = smpi_comm_size(comm);
rank = smpi_comm_rank(comm);
MPI_Aint extent;
extent = smpi_datatype_get_extent(dtype);
- tmp_buf = (void *) xbt_malloc(count * extent);
+ tmp_buf = (void *) smpi_get_tmp_sendbuffer(count * extent);
int intra_rank, inter_rank;
intra_rank = rank % num_core;
recv_chunk = extent * count / (comm_size / num_core);
mask = 1;
- i = 0;
curr_count = count / 2;
int phase = 0;
base_offset = 0;
- send_base_offset = 0;
- recv_base_offset = 0;
while (mask < (comm_size / num_core)) {
dst = inter_rank ^ mask;
// compute offsets
- send_base_offset = base_offset;
-
// right-handside
if (inter_rank & mask) {
recv_base_offset = base_offset + curr_count;
}
- free(tmp_buf);
+ smpi_free_tmp_buffer(tmp_buf);
return MPI_SUCCESS;
}