2 * 2.5D Block Matrix Multiplication example
6 #include "Matrix_init.h"
/* Default XBT log category for this file: all XBT_INFO/XBT_DEBUG calls
 * below log through it. */
13 XBT_LOG_NEW_DEFAULT_CATEGORY(25D_MM,
14 "Messages specific for this msg example");
/* 2.5D block matrix multiplication over MPI (SUMMA inside each replication
 * group, then a reduction across groups).
 * Visible parameters: matrix extents m, k, n; the SUMMA block size; this
 * process's replication-group id and communicator-split key; the process
 * grid extents (size_row x size_col); and the number of replication groups.
 * NOTE(review): this is an excerpt -- the function's return type/name,
 * several interior statements, and the closing brace lie outside this view,
 * so comments below only describe the visible lines. */
17 size_t m, size_t k, size_t n,
18 size_t Block_size, size_t group, size_t key,
19 size_t size_row, size_t size_col, size_t NB_groups ){
/* a, b: local operand panels; c: local partial result;
 * res: reduction target, only allocated on group 0 when NB_groups > 1 */
20 double *a, *b, *c, *res;
21 /* Split the communicator into groups */
23 /* Find out my identity in the default communicator */
/* Timing accumulators, filled from MPI_Wtime() below */
30 double time, communication_time = 0;
31 double start_time, end_time; // time measurement
32 double end_time_intern; // time measurement
33 double start_time_reduce, end_time_reduce; // time measurement
/* Ranks whose group id is out of range split off with color 0; valid
 * workers take color 1, so my_world contains only the working ranks. */
37 if ( group >= NB_groups ){
39 "Not enough group NB_groups : %zu my group id : %zu\n",
41 MPI_Comm_split(MPI_COMM_WORLD, 0, key, &my_world);
44 MPI_Comm_split(MPI_COMM_WORLD, 1, key, &my_world);
47 MPI_Comm_size (my_world, &NB_proc);
/* Abort path when fewer ranks are available than the
 * size_row * size_col * NB_groups grid requires */
49 if ( NB_proc < (int)(size_row*size_col*NB_groups) ){
51 "Not enough processors NB_proc : %d required : %zu\n",
52 NB_proc, size_row*size_col*NB_groups);
/* One communicator per replication group, ranks ordered by `key` */
59 MPI_Comm_split(my_world, group, key, &group_comm);
61 MPI_Comm_rank(group_comm, &myrank);
62 MPI_Comm_size (group_comm, &NB_proc);
63 /* each group executes its own share of the multiplication */
66 NB_proc=size_row*size_col;
/* 2D coordinates of this rank inside its group's process grid
 * (row-major over size_row columns) */
67 size_t row = myrank / size_row;
68 size_t col = myrank % size_row;
73 /*-------------------------Check some mandatory conditions------------------*/
/* k must be an exact multiple of the SUMMA block size */
74 size_t NB_Block = k / Block_size;
75 if ( k % Block_size != 0 ){
77 "The matrix size has to be proportionnal to the number\
78 of blocks: %zu\n", NB_Block);
/* The process grid cannot exceed the block grid in either direction */
82 if ( size_row > NB_Block || size_col > NB_Block ){
84 "Number of blocks is too small compare to the number of"
85 " processors (%zu,%zu) in a row or a col (%zu)\n",
86 size_col, size_row, NB_Block);
/* Each row/column of processes must own a whole number of blocks */
90 if ( NB_Block % size_row != 0 || NB_Block % size_col != 0){
91 XBT_INFO("The number of Block by processor is not an %s",
/* Ranks that fall outside the size_col x size_row grid drop out here */
98 if(row >= size_col || col >= size_row){
99 XBT_INFO( "I'm useless bye!!! col: %zu row: %zu, "
100 "size_col: %zu , size_row: %zu \n",
101 col,row,size_col,size_row);
107 /*----------------------Prepare the Communication Layer-------------------*/
108 /* add useless processor on a new color to execute the matrix
109 * multiplication with the other processors*/
111 /* Split comm size_to row and column comms */
112 MPI_Comm row_comm, col_comm, group_line;
/* NOTE(review): on this (surplus-rank) path the splits pass MPI_UNDEFINED
 * as the key argument; these ranks only match the working ranks' collective
 * MPI_Comm_split calls below. Confirm against the full source -- several
 * surrounding lines are elided here. */
113 MPI_Comm_split(my_world, myrank, MPI_UNDEFINED, &group_line);
114 /* color by row, rank by column */
115 MPI_Comm_split(group_comm, size_row, MPI_UNDEFINED, &row_comm);
116 /* color by column, rank by row */
117 MPI_Comm_split(group_comm, size_col, MPI_UNDEFINED, &col_comm);
118 /*------------------------Communication Layer can be used-----------------*/
122 XBT_DEBUG( "I'm initialized col: %zu row: %zu, "
123 "size_col: %zu , size_row: %zu, my rank: %d \n"
124 ,col,row,size_col,size_row, myrank);
128 /*------------------------Initialize the matrices---------------------------*/
130 /* think about a common interface
131 * int pdgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans,
132 * m, n, k, alpha, a, ia, ja, lda, b, ib, jb, ldb,
133 * beta, c, ldc, Comm, rank );
136 /*------------------------Prepare the Communication Layer-------------------*/
137 /* Split comm size_to row and column comms */
138 MPI_Comm row_comm, col_comm, group_line;
/* group_line links the ranks holding the same grid position across all
 * replication groups (color = position, rank = group id) */
139 MPI_Comm_split(my_world, myrank, group, &group_line);
140 /* color by row, rank by column */
141 MPI_Comm_split(group_comm, row, col, &row_comm);
142 /* color by column, rank by row */
143 MPI_Comm_split(group_comm, col, row, &col_comm);
144 /*-------------------------Communication Layer can be used------------------*/
/* Per-process panel widths: each process column owns k/size_row columns of
 * A and each process row owns k/size_col rows of B */
149 size_t k_a = k / size_row;
150 size_t k_b = k / size_col;
154 /*only on the group 0*/
156 matrices_initialisation(&a, &b, &c, m, k_a, k_b, n, row, col);
/* NOTE(review): malloc result is unchecked, and res stays uninitialized
 * when NB_groups == 1 -- presumably the elided lines guard its later uses
 * (check_result below); verify against the full source. */
157 if( NB_groups > 1 ) res = malloc( sizeof(double)*m*n );
158 } else matrices_allocation(&a, &b, &c, m, k_a, k_b, n);
161 /*-------------------Configuration for Summa algorithm--------------------*/
162 /*--------------------Allocation of matrices block-------------------------*/
163 double *a_Summa, *b_Summa;
164 blocks_initialisation(&a_Summa, &b_Summa, m, Block_size, n);
166 /*--------------------Communication types for MPI--------------------------*/
167 MPI_Datatype Block_a;
168 MPI_Datatype Block_a_local;
169 MPI_Datatype Block_b;
/* Block_a: m runs of Block_size doubles strided by the local row length k_a
 * (a column-block of A); Block_a_local: the same block stored contiguously;
 * Block_b: Block_size contiguous rows of n doubles (a row-block of B). */
170 MPI_Type_vector(m , Block_size, k_a, MPI_DOUBLE, &Block_a);
171 MPI_Type_vector(m , Block_size, Block_size, MPI_DOUBLE, &Block_a_local);
172 MPI_Type_vector(Block_size, n, n, MPI_DOUBLE, &Block_b);
173 MPI_Type_commit(&Block_a);
174 MPI_Type_commit(&Block_a_local);
175 MPI_Type_commit(&Block_b);
176 /*-------------Communication types for MPI are configured------------------*/
/* Replicate the operands from rank 0 of each group_line to the other
 * groups, and time that broadcast phase. */
181 MPI_Barrier(my_world);
182 start_time = MPI_Wtime();
183 if( NB_groups > 1 ) {
184 err = MPI_Bcast(a, m*k_a, MPI_DOUBLE, 0, group_line);
185 if (err != MPI_SUCCESS) {
/* NOTE(review): perror() reports errno, which MPI calls do not set; an
 * MPI_Error_string()-based message would be accurate. Same below. */
186 perror("Error Bcast A\n");
189 err = MPI_Bcast(b, n*k_b, MPI_DOUBLE, 0, group_line);
190 if (err != MPI_SUCCESS) {
191 perror("Error Bcast B\n");
194 MPI_Barrier(my_world);
196 end_time_intern = MPI_Wtime();
/* NOTE(review): start - end yields a NEGATIVE duration (MPI_Wtime is
 * monotonically increasing); the operands look swapped -- should be
 * end_time_intern - start_time. The same inverted pattern recurs for
 * `time` and `reduce_time` below. */
197 communication_time += start_time - end_time_intern;
199 XBT_INFO( "group %zu NB_block: %zu, NB_groups %zu\n"
200 ,group,NB_Block, NB_groups);
202 "m %zu, k_a %zu, k_b %zu, n %zu, Block_size %zu, "
203 "group*NB_Block/NB_groups %zu, "
204 "(group+1)*NB_Block/NB_groups %zu, row %zu, col %zu,"
205 "size_row %zu, size_col %zu\n" ,
206 m, k_a, k_b, n, Block_size,
207 group*NB_Block/NB_groups, (group+1)*NB_Block/NB_groups,
208 row, col, size_row, size_col);
/* Each group runs SUMMA on its own slice of the block range,
 * [group*NB_Block/NB_groups, (group+1)*NB_Block/NB_groups). */
211 Summa( a, b, c, k_a, n, n, m, k_a, k_b, n, Block_size,
212 group*NB_Block/NB_groups, (group+1)*NB_Block/NB_groups,
213 row, col, size_row, size_col, a_Summa, b_Summa, Block_a,
214 Block_a_local, Block_b, row_comm, col_comm, 0);
216 /*-------------------------End Summa algorithm-----------------------------*/
/* From here on, myrank is the rank inside group_line (i.e. the group id) */
218 MPI_Comm_rank(group_line, &myrank);
/* Sum the per-group partial products onto group 0, timed */
220 MPI_Barrier(my_world);
221 start_time_reduce = MPI_Wtime();
222 if( NB_groups > 1 ) {
223 // TODO(review): would a gather be cheaper than a reduce here?
224 err = MPI_Reduce(c, res, m*n, MPI_DOUBLE, MPI_SUM, 0, group_line);
225 if (err != MPI_SUCCESS) {
/* NOTE(review): message says "Bcast A" but this failure is the Reduce of C */
226 perror("Error Bcast A\n");
234 MPI_Barrier(my_world);
235 end_time_reduce = MPI_Wtime();
237 MPI_Barrier(my_world);
238 end_time = MPI_Wtime();
/* NOTE(review): negative durations again (see above); also the printf
 * below labels the values "nanoseconds" while MPI_Wtime() returns seconds. */
239 time = start_time - end_time;
240 double reduce_time = start_time_reduce - end_time_reduce;
241 printf("communication time: %le reduce time: %le nanoseconds, "
242 "total time: %le nanoseconds\n",communication_time,reduce_time,time);
243 MPI_Barrier(my_world);
247 check_result(res, a, b, m, n, k_a, k_b, row, col, size_row, size_col);
250 // shut the program down cleanly: free MPI datatypes and communicators
252 MPI_Type_free(&Block_a);
253 MPI_Type_free(&Block_a_local);
254 MPI_Type_free(&Block_b);
262 if( NB_groups > 1 ) {
267 MPI_Barrier(MPI_COMM_WORLD);
268 MPI_Comm_free(&my_world);
269 MPI_Comm_free(&group_comm);
270 MPI_Comm_free(&group_line);
271 MPI_Comm_free(&row_comm);
272 MPI_Comm_free(&col_comm);