+ int mpi_errno = MPI_SUCCESS;
+ int comm_size/*, rank*/;
+ int two_level_bcast = 1;
+ size_t nbytes = 0;
+ int range = 0;
+ int range_threshold = 0;
+ int range_threshold_intra = 0;
+ int is_homogeneous, is_contig;
+ MPI_Aint type_size;
+ //, position;
+ void *tmp_buf = NULL;
+ MPI_Comm shmem_comm;
+ //MPID_Datatype *dtp;
+
+ if (count == 0)
+ return MPI_SUCCESS;
+ if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
+ smpi_comm_init_smp(comm);
+ }
+ if(!mv2_bcast_thresholds_table)
+ init_mv2_bcast_tables_stampede();
+ comm_size = smpi_comm_size(comm);
+ //rank = smpi_comm_rank(comm);
+
+ is_contig=1;
+/* if (HANDLE_GET_KIND(datatype) == HANDLE_KIND_BUILTIN)*/
+/* is_contig = 1;*/
+/* else {*/
+/* MPID_Datatype_get_ptr(datatype, dtp);*/
+/* is_contig = dtp->is_contig;*/
+/* }*/
+
+ is_homogeneous = 1;
+
+ /* MPI_Type_size() might not give the accurate size of the packed
+ * datatype for heterogeneous systems (because of padding, encoding,
+ * etc). On the other hand, MPI_Pack_size() can become very
+ * expensive, depending on the implementation, especially for
+ * heterogeneous systems. We want to use MPI_Type_size() wherever
+ * possible, and MPI_Pack_size() in other places.
+ */
+ //if (is_homogeneous) {
+ type_size=smpi_datatype_size(datatype);
+
+ /* } else {
+ MPIR_Pack_size_impl(1, datatype, &type_size);
+ }*/
+ nbytes = (size_t) (count) * (type_size);
+
+ /* Search for the corresponding system size inside the tuning table */
+ while ((range < (mv2_size_bcast_tuning_table - 1)) &&
+ (comm_size > mv2_bcast_thresholds_table[range].numproc)) {
+ range++;
+ }
+ /* Search for corresponding inter-leader function */
+ while ((range_threshold < (mv2_bcast_thresholds_table[range].size_inter_table - 1))
+ && (nbytes >
+ mv2_bcast_thresholds_table[range].inter_leader[range_threshold].max)
+ && (mv2_bcast_thresholds_table[range].inter_leader[range_threshold].max != -1)) {
+ range_threshold++;
+ }
+
+ /* Search for corresponding intra-node function */
+ while ((range_threshold_intra <
+ (mv2_bcast_thresholds_table[range].size_intra_table - 1))
+ && (nbytes >
+ mv2_bcast_thresholds_table[range].intra_node[range_threshold_intra].max)
+ && (mv2_bcast_thresholds_table[range].intra_node[range_threshold_intra].max !=
+ -1)) {
+ range_threshold_intra++;
+ }
+
+ MV2_Bcast_function =
+ mv2_bcast_thresholds_table[range].inter_leader[range_threshold].
+ MV2_pt_Bcast_function;
+
+ MV2_Bcast_intra_node_function =
+ mv2_bcast_thresholds_table[range].
+ intra_node[range_threshold_intra].MV2_pt_Bcast_function;
+
+/* if (mv2_user_bcast_intra == NULL && */
+/* MV2_Bcast_intra_node_function == &MPIR_Knomial_Bcast_intra_node_MV2) {*/
+/* MV2_Bcast_intra_node_function = &MPIR_Shmem_Bcast_MV2;*/
+/* }*/
+
+ if (mv2_bcast_thresholds_table[range].inter_leader[range_threshold].
+ zcpy_pipelined_knomial_factor != -1) {
+ zcpy_knomial_factor =
+ mv2_bcast_thresholds_table[range].inter_leader[range_threshold].
+ zcpy_pipelined_knomial_factor;
+ }
+
+ if (mv2_pipelined_zcpy_knomial_factor != -1) {
+ zcpy_knomial_factor = mv2_pipelined_zcpy_knomial_factor;
+ }
+
+ if(MV2_Bcast_intra_node_function == NULL) {
+ /* If the tuning table does not have any intra-node selection, set the
+ ** function pointer to the default one for mcast intra node */
+ MV2_Bcast_intra_node_function = &MPIR_Shmem_Bcast_MV2;
+ }
+
+ /* Set value of pipeline segment size */
+ bcast_segment_size = mv2_bcast_thresholds_table[range].bcast_segment_size;
+
+ /* Set value of inter node knomial factor */
+ mv2_inter_node_knomial_factor = mv2_bcast_thresholds_table[range].inter_node_knomial_factor;