MPI_Datatype datatype,
int root, MPI_Comm comm)
{
+ int mpi_errno = MPI_SUCCESS;
+ int comm_size/*, rank*/;
+ int two_level_bcast = 1;
+ size_t nbytes = 0;
+ int range = 0;
+ int range_threshold = 0;
+ int range_threshold_intra = 0;
+ int is_homogeneous, is_contig;
+ MPI_Aint type_size;
+ //, position;
+ void *tmp_buf = NULL;
+ MPI_Comm shmem_comm;
+ //MPID_Datatype *dtp;
+
+ if (count == 0)
+ return MPI_SUCCESS;
+ if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
+ smpi_comm_init_smp(comm);
+ }
+ if(!mv2_bcast_thresholds_table)
+ init_mv2_bcast_tables_stampede();
+ comm_size = smpi_comm_size(comm);
+ //rank = smpi_comm_rank(comm);
+
+ is_contig=1;
+/* if (HANDLE_GET_KIND(datatype) == HANDLE_KIND_BUILTIN)*/
+/* is_contig = 1;*/
+/* else {*/
+/* MPID_Datatype_get_ptr(datatype, dtp);*/
+/* is_contig = dtp->is_contig;*/
+/* }*/
+
+ is_homogeneous = 1;
+
+ /* MPI_Type_size() might not give the accurate size of the packed
+ * datatype for heterogeneous systems (because of padding, encoding,
+ * etc). On the other hand, MPI_Pack_size() can become very
+ * expensive, depending on the implementation, especially for
+ * heterogeneous systems. We want to use MPI_Type_size() wherever
+ * possible, and MPI_Pack_size() in other places.
+ */
+ //if (is_homogeneous) {
+ type_size=smpi_datatype_size(datatype);
+
+ /* } else {
+ MPIR_Pack_size_impl(1, datatype, &type_size);
+ }*/
+ nbytes = (size_t) (count) * (type_size);
+
+ /* Search for the corresponding system size inside the tuning table */
+ while ((range < (mv2_size_bcast_tuning_table - 1)) &&
+ (comm_size > mv2_bcast_thresholds_table[range].numproc)) {
+ range++;
+ }
+ /* Search for corresponding inter-leader function */
+ while ((range_threshold < (mv2_bcast_thresholds_table[range].size_inter_table - 1))
+ && (nbytes >
+ mv2_bcast_thresholds_table[range].inter_leader[range_threshold].max)
+ && (mv2_bcast_thresholds_table[range].inter_leader[range_threshold].max != -1)) {
+ range_threshold++;
+ }
+
+ /* Search for corresponding intra-node function */
+ while ((range_threshold_intra <
+ (mv2_bcast_thresholds_table[range].size_intra_table - 1))
+ && (nbytes >
+ mv2_bcast_thresholds_table[range].intra_node[range_threshold_intra].max)
+ && (mv2_bcast_thresholds_table[range].intra_node[range_threshold_intra].max !=
+ -1)) {
+ range_threshold_intra++;
+ }
- //TODO : Bcast really needs intra/inter phases in mvapich. Default to mpich if not available
- return smpi_coll_tuned_bcast_mpich(buffer, count, datatype, root, comm);
+ MV2_Bcast_function =
+ mv2_bcast_thresholds_table[range].inter_leader[range_threshold].
+ MV2_pt_Bcast_function;
+
+ MV2_Bcast_intra_node_function =
+ mv2_bcast_thresholds_table[range].
+ intra_node[range_threshold_intra].MV2_pt_Bcast_function;
+
+/* if (mv2_user_bcast_intra == NULL && */
+/* MV2_Bcast_intra_node_function == &MPIR_Knomial_Bcast_intra_node_MV2) {*/
+/* MV2_Bcast_intra_node_function = &MPIR_Shmem_Bcast_MV2;*/
+/* }*/
+
+ if (mv2_bcast_thresholds_table[range].inter_leader[range_threshold].
+ zcpy_pipelined_knomial_factor != -1) {
+ zcpy_knomial_factor =
+ mv2_bcast_thresholds_table[range].inter_leader[range_threshold].
+ zcpy_pipelined_knomial_factor;
+ }
+
+ if (mv2_pipelined_zcpy_knomial_factor != -1) {
+ zcpy_knomial_factor = mv2_pipelined_zcpy_knomial_factor;
+ }
+
+ if(MV2_Bcast_intra_node_function == NULL) {
+ /* If the tuning table has no intra-node selection, set the function
+ ** pointer to the default one for mcast intra node */
+ MV2_Bcast_intra_node_function = &MPIR_Shmem_Bcast_MV2;
+ }
+
+ /* Set value of pipeline segment size */
+ bcast_segment_size = mv2_bcast_thresholds_table[range].bcast_segment_size;
+
+ /* Set value of inter node knomial factor */
+ mv2_inter_node_knomial_factor = mv2_bcast_thresholds_table[range].inter_node_knomial_factor;
+
+ /* Set value of intra node knomial factor */
+ mv2_intra_node_knomial_factor = mv2_bcast_thresholds_table[range].intra_node_knomial_factor;
+
+ /* Check if we will use a two level algorithm or not */
+ two_level_bcast =
+#if defined(_MCST_SUPPORT_)
+ mv2_bcast_thresholds_table[range].is_two_level_bcast[range_threshold]
+ || comm->ch.is_mcast_ok;
+#else
+ mv2_bcast_thresholds_table[range].is_two_level_bcast[range_threshold];
+#endif
+ if (two_level_bcast == 1) {
+ if (!is_contig || !is_homogeneous) {
+ tmp_buf=(void *)xbt_malloc(nbytes);
+
+/* position = 0;*/
+/* if (rank == root) {*/
+/* mpi_errno =*/
+/* MPIR_Pack_impl(buffer, count, datatype, tmp_buf, nbytes, &position);*/
+/* if (mpi_errno)*/
+/* MPIU_ERR_POP(mpi_errno);*/
+/* }*/
+ }
+#ifdef CHANNEL_MRAIL_GEN2
+ if ((mv2_enable_zcpy_bcast == 1) &&
+ (&MPIR_Pipelined_Bcast_Zcpy_MV2 == MV2_Bcast_function)) {
+ if (!is_contig || !is_homogeneous) {
+ mpi_errno = MPIR_Pipelined_Bcast_Zcpy_MV2(tmp_buf, nbytes, MPI_BYTE,
+ root, comm);
+ } else {
+ mpi_errno = MPIR_Pipelined_Bcast_Zcpy_MV2(buffer, count, datatype,
+ root, comm);
+ }
+ } else
+#endif /* defined(CHANNEL_MRAIL_GEN2) */
+ {
+ shmem_comm = smpi_comm_get_intra_comm(comm);
+ if (!is_contig || !is_homogeneous) {
+ mpi_errno =
+ MPIR_Bcast_tune_inter_node_helper_MV2(tmp_buf, nbytes, MPI_BYTE,
+ root, comm);
+ } else {
+ mpi_errno =
+ MPIR_Bcast_tune_inter_node_helper_MV2(buffer, count, datatype, root,
+ comm);
+ }
+
+ /* We are now done with the inter-node phase */
+
+
+ root = INTRA_NODE_ROOT;
+
+
+ if (!is_contig || !is_homogeneous) {
+ mpi_errno = MV2_Bcast_intra_node_function(tmp_buf, nbytes,
+ MPI_BYTE, root, shmem_comm);
+ } else {
+ mpi_errno = MV2_Bcast_intra_node_function(buffer, count,
+ datatype, root, shmem_comm);
+
+ }
+ }
+/* if (!is_contig || !is_homogeneous) {*/
+/* if (rank != root) {*/
+/* position = 0;*/
+/* mpi_errno = MPIR_Unpack_impl(tmp_buf, nbytes, &position, buffer,*/
+/* count, datatype);*/
+/* }*/
+/* }*/
+ } else {
+ /* We use Knomial for intra node */
+ MV2_Bcast_intra_node_function = &MPIR_Knomial_Bcast_intra_node_MV2;
+/* if (mv2_enable_shmem_bcast == 0) {*/
+ /* Fall back to non-tuned version */
+/* MPIR_Bcast_intra_MV2(buffer, count, datatype, root, comm);*/
+/* } else {*/
+ mpi_errno = MV2_Bcast_function(buffer, count, datatype, root,
+ comm);
+
+/* }*/
+ }
+
+
+ return mpi_errno;
}
void *recvbuf,
int recvcnt,
MPI_Datatype recvtype,
- int root, MPI_Comm comm_ptr)
+ int root, MPI_Comm comm)
{
int range = 0, range_threshold = 0, range_threshold_intra = 0;
int mpi_errno = MPI_SUCCESS;
int recvtype_size, sendtype_size;
int partial_sub_ok = 0;
int conf_index = 0;
- // int local_size = -1;
- // int i;
- // MPI_Comm shmem_comm;
+ int local_size = -1;
+ int i;
+ MPI_Comm shmem_comm;
// MPID_Comm *shmem_commptr=NULL;
if(mv2_scatter_thresholds_table==NULL)
init_mv2_scatter_tables_stampede();
- comm_size = smpi_comm_size(comm_ptr);
+ if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
+ smpi_comm_init_smp(comm);
+ }
+
+ comm_size = smpi_comm_size(comm);
- rank = smpi_comm_rank(comm_ptr);
+ rank = smpi_comm_rank(comm);
if (rank == root) {
sendtype_size=smpi_datatype_size(sendtype);
recvtype_size=smpi_datatype_size(recvtype);
nbytes = recvcnt * recvtype_size;
}
- /*
+
// check if safe to use partial subscription mode
- if (comm_ptr->ch.shmem_coll_ok == 1 && comm_ptr->ch.is_uniform) {
+ if (smpi_comm_is_uniform(comm)) {
- shmem_comm = comm_ptr->ch.shmem_comm;
- MPID_Comm_get_ptr(shmem_comm, shmem_commptr);
- local_size = shmem_commptr->local_size;
+ shmem_comm = smpi_comm_get_intra_comm(comm);
+ local_size = smpi_comm_size(shmem_comm);
i = 0;
if (mv2_scatter_table_ppn_conf[0] == -1) {
// Indicating user defined tuning
conf_index = 0;
- goto conf_check_end;
+ }else{
+ do {
+ if (local_size == mv2_scatter_table_ppn_conf[i]) {
+ conf_index = i;
+ partial_sub_ok = 1;
+ break;
+ }
+ i++;
+ } while(i < mv2_scatter_num_ppn_conf);
}
- do {
- if (local_size == mv2_scatter_table_ppn_conf[i]) {
- conf_index = i;
- partial_sub_ok = 1;
- break;
- }
- i++;
- } while(i < mv2_scatter_num_ppn_conf);
}
- */
+
if (partial_sub_ok != 1) {
conf_index = 0;
}
if(MV2_Scatter_function == &MPIR_Scatter_mcst_wrap_MV2) {
#if defined(_MCST_SUPPORT_)
- if(comm_ptr->ch.is_mcast_ok == 1
+ if(comm->ch.is_mcast_ok == 1
&& mv2_use_mcast_scatter == 1
- && comm_ptr->ch.shmem_coll_ok == 1) {
+ && comm->ch.shmem_coll_ok == 1) {
MV2_Scatter_function = &MPIR_Scatter_mcst_MV2;
} else
#endif /*#if defined(_MCST_SUPPORT_) */
if( (MV2_Scatter_function == &MPIR_Scatter_MV2_two_level_Direct) ||
(MV2_Scatter_function == &MPIR_Scatter_MV2_two_level_Binomial)) {
- /* if( comm_ptr->ch.shmem_coll_ok == 1 &&
- comm_ptr->ch.is_global_block == 1 ) {
+ if( smpi_comm_is_blocked(comm)) {
MV2_Scatter_intra_function = mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra]
.MV2_pt_Scatter_function;
mpi_errno =
MV2_Scatter_function(sendbuf, sendcnt, sendtype,
recvbuf, recvcnt, recvtype, root,
- comm_ptr);
- } else {*/
+ comm);
+ } else {
mpi_errno = MPIR_Scatter_MV2_Binomial(sendbuf, sendcnt, sendtype,
recvbuf, recvcnt, recvtype, root,
- comm_ptr);
+ comm);
- //}
+ }
} else {
mpi_errno = MV2_Scatter_function(sendbuf, sendcnt, sendtype,
recvbuf, recvcnt, recvtype, root,
- comm_ptr);
+ comm);
}
return (mpi_errno);
}