src/smpi/colls/bcast/bcast-ompi-pipeline.cpp
/* Copyright (c) 2013-2019. The SimGrid Team.
 * All rights reserved.                                                     */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */

#include "../coll_tuned_topo.hpp"
#include "../colls_private.hpp"

#define MAXTREEFANOUT 32

namespace simgrid{
namespace smpi{
int Coll_bcast_ompi_pipeline::bcast( void* buffer,
                                      int original_count,
                                      MPI_Datatype datatype,
                                      int root,
                                      MPI_Comm comm)
{
    int count_by_segment = original_count;
    size_t type_size;
    size_t segsize = 1024 << 7;
    //mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    //mca_coll_tuned_comm_t *data = tuned_module->tuned_data;

//    return ompi_coll_tuned_bcast_intra_generic( buffer, count, datatype, root, comm, module,
//                                                count_by_segment, data->cached_pipeline );
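    /* A chain built with fanout 1 is a simple linear chain rooted at `root`:
     * every rank has at most one child, which is what turns the segmented
     * broadcast below into a pipeline. */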
    ompi_coll_tree_t * tree = ompi_coll_tuned_topo_build_chain( 1, comm, root );
    int i;
    int rank, size;
    int segindex;
    int num_segments; /* Number of segments */
    int sendcount;    /* number of elements sent in this segment */
    size_t realsegsize;
    char *tmpbuf;
    ptrdiff_t extent;
    MPI_Request recv_reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
    MPI_Request *send_reqs = NULL;
    int req_index;

    /**
     * Determine number of elements sent per operation.
     */
    type_size = datatype->size();

    size = comm->size();
    rank = comm->rank();
    if (size == 1) {
      /* Nothing to broadcast; free the chain before returning. */
      ompi_coll_tuned_topo_destroy_tree(&tree);
      return MPI_SUCCESS;
    }


    const double a_p16  = 3.2118e-6; /* [1 / byte] */
    const double b_p16  = 8.7936;
    const double a_p64  = 2.3679e-6; /* [1 / byte] */
    const double b_p64  = 1.1787;
    const double a_p128 = 1.6134e-6; /* [1 / byte] */
    const double b_p128 = 2.1102;
    size_t message_size;

    /* We need the message size in bytes for the decision function below. */
    message_size = type_size * (unsigned long)original_count;

    if (size < (a_p128 * message_size + b_p128)) {
        // Pipeline with 128KB segments
        segsize = 1024 << 7;
    } else if (size < (a_p64 * message_size + b_p64)) {
        // Pipeline with 64KB segments
        segsize = 1024 << 6;
    } else if (size < (a_p16 * message_size + b_p16)) {
        // Pipeline with 16KB segments
        segsize = 1024 << 4;
    }
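    /* Illustrative example (not from the original source): for a 4 MiB
     * broadcast, the three thresholds above evaluate to roughly 8.9, 11.1 and
     * 22.3 ranks, so an 8-rank communicator gets 128KB segments, a 16-rank
     * one 16KB segments, and larger communicators keep the 128KB default set
     * at initialization. */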
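    /* COLL_TUNED_COMPUTED_SEGCOUNT shrinks count_by_segment (initially the
     * full count) so that one segment occupies roughly `segsize` bytes.
     * For instance, with MPI_DOUBLE (8 bytes) and 128KB segments this gives
     * 16384 elements per segment, provided the message is at least that long. */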
    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, type_size, count_by_segment );

    XBT_DEBUG("coll:tuned:bcast_intra_pipeline rank %d ss %5zu type_size %lu count_by_segment %d",
                 comm->rank(), segsize, (unsigned long)type_size, count_by_segment);

    extent = datatype->get_extent();
    num_segments = (original_count + count_by_segment - 1) / count_by_segment;
    realsegsize = count_by_segment * extent;
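    /* For example (illustrative): broadcasting 100000 doubles with
     * count_by_segment = 16384 gives num_segments = 7, the last segment
     * carrying only 1696 elements; realsegsize is the byte stride between
     * consecutive segments in the user buffer. */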

    /* Set the buffer pointers */
    tmpbuf = (char *) buffer;

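    /* One request slot per child, so all isends of a given segment can be
     * issued together and completed with a single waitall. Leaf ranks
     * (tree_nextsize == 0) never allocate this array. */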
    if( tree->tree_nextsize != 0 ) {
      send_reqs = new MPI_Request[tree->tree_nextsize];
    }

    /* Root code */
    if( rank == root ) {
        /*
           For each segment:
           - send the segment to all children.
             The last segment may have fewer elements than the other segments.
        */
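        /* Note: the root waits for all isends of segment k before starting
         * segment k+1, so at most one segment is in flight from the root at
         * any time; the overlap happens further down the chain. */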
        sendcount = count_by_segment;
        for( segindex = 0; segindex < num_segments; segindex++ ) {
            if( segindex == (num_segments - 1) ) {
                sendcount = original_count - segindex * count_by_segment;
            }
            for( i = 0; i < tree->tree_nextsize; i++ ) {
                send_reqs[i] = Request::isend(tmpbuf, sendcount, datatype,
                                         tree->tree_next[i],
                                         COLL_TAG_BCAST, comm);
            }

            /* complete the sends before starting the next sends */
            Request::waitall( tree->tree_nextsize, send_reqs,
                                         MPI_STATUSES_IGNORE );

            /* update tmp buffer */
            tmpbuf += realsegsize;
        }
    }

    /* Intermediate nodes code */
    else if( tree->tree_nextsize > 0 ) {
        /*
           Create the pipeline.
           1) Post the first receive
           2) For segments 1 .. num_segments-1
              - post a new receive
              - wait on the previous receive to complete
              - send this data to children
           3) Wait on the last segment
           4) Compute the number of elements in the last segment.
           5) Send the last segment to children
         */
        req_index = 0;
        recv_reqs[req_index] = Request::irecv(tmpbuf, count_by_segment, datatype,
                           tree->tree_prev, COLL_TAG_BCAST,
                           comm);

        for( segindex = 1; segindex < num_segments; segindex++ ) {

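            /* Two receive requests are used in a double-buffering fashion:
             * while segment k is being forwarded to the children, the receive
             * of segment k+1 from the parent is already posted. */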
            req_index = req_index ^ 0x1;

            /* post new irecv */
            recv_reqs[req_index] = Request::irecv( tmpbuf + realsegsize, count_by_segment,
                                datatype, tree->tree_prev,
                                COLL_TAG_BCAST,
                                comm);

            /* wait for and forward the previous segment to children */
            Request::wait( &recv_reqs[req_index ^ 0x1],
                                     MPI_STATUS_IGNORE );

            for( i = 0; i < tree->tree_nextsize; i++ ) {
                send_reqs[i] = Request::isend(tmpbuf, count_by_segment, datatype,
                                         tree->tree_next[i],
                                         COLL_TAG_BCAST, comm );
            }

            /* complete the sends before starting the next iteration */
            Request::waitall( tree->tree_nextsize, send_reqs,
                                         MPI_STATUSES_IGNORE );

            /* Update the receive buffer */
            tmpbuf += realsegsize;
        }

        /* Process the last segment */
        Request::wait( &recv_reqs[req_index], MPI_STATUS_IGNORE );
        sendcount = original_count - (num_segments - 1) * count_by_segment;
        for( i = 0; i < tree->tree_nextsize; i++ ) {
            send_reqs[i] = Request::isend(tmpbuf, sendcount, datatype,
                                     tree->tree_next[i],
                                     COLL_TAG_BCAST, comm);
        }

        Request::waitall( tree->tree_nextsize, send_reqs,
                                     MPI_STATUSES_IGNORE );
    }

    /* Leaf nodes */
    else {
        /*
           Receive all segments from the parent in a loop:
           1) post irecv for the first segment
           2) for segments 1 .. num_segments-1
              - post irecv for the next segment
              - wait on the previous segment to arrive
           3) wait for the last segment
        */
        req_index = 0;
        recv_reqs[req_index] = Request::irecv(tmpbuf, count_by_segment, datatype,
                                 tree->tree_prev, COLL_TAG_BCAST,
                                 comm);

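        /* As on intermediate nodes, two alternating receive requests keep the
         * receive of the next segment posted while waiting for the current
         * one, so a leaf always stays one segment ahead. */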
        for( segindex = 1; segindex < num_segments; segindex++ ) {
            req_index = req_index ^ 0x1;
            tmpbuf += realsegsize;
            /* post receive for the next segment */
            recv_reqs[req_index] = Request::irecv(tmpbuf, count_by_segment, datatype,
                                     tree->tree_prev, COLL_TAG_BCAST,
                                     comm);
            /* wait on the previous segment */
            Request::wait( &recv_reqs[req_index ^ 0x1],
                                     MPI_STATUS_IGNORE );
        }

        Request::wait( &recv_reqs[req_index], MPI_STATUS_IGNORE );
    }

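    /* send_reqs is still nullptr on leaf ranks; delete[] on a null pointer is
     * a no-op. The chain built above is freed on every call. */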
    delete[] send_reqs;
    ompi_coll_tuned_topo_destroy_tree(&tree);

    return MPI_SUCCESS;
}

} // namespace smpi
} // namespace simgrid