 * 2.5D Block Matrix Multiplication example
 * Authors: Quintin Jean-Noël
#include "Matrix_init.h"
XBT_LOG_NEW_DEFAULT_CATEGORY(MM_25D,
                             "Messages specific for this example");
                 size_t m, size_t k, size_t n,
                 size_t Block_size, size_t group, size_t key,
                 size_t size_row, size_t size_col, size_t NB_groups ){
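  /* Parameter roles, as inferred from how they are used below:
   *   m, k, n    : global dimensions of C = A*B (A is m x k, B is k x n)
   *   Block_size : width of the panels exchanged by the SUMMA kernel
   *   group, key : replication group of this process and its ordering key
   *   size_row, size_col : dimensions of the processor grid inside one group
   *   NB_groups  : number of replication groups (the ".5" of the 2.5D scheme) */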
  /* Split the communicator into groups */
  /* Find out my identity in the default communicator */
  double time, communication_time = 0;
  struct timespec start_time, end_time;               // time measurement
  struct timespec end_time_intern;                    // time measurement
  struct timespec start_time_reduce, end_time_reduce; // time measurement
  if ( group >= NB_groups ){
    XBT_INFO("Not enough groups: NB_groups = %zu, my group id = %zu\n",
             NB_groups, group);
    MPI_Comm_split(MPI_COMM_WORLD, 0, key, &my_world);
    return -1;  /* this process does not belong to any group */
  } else {
    MPI_Comm_split(MPI_COMM_WORLD, 1, key, &my_world);
  }
  MPI_Comm_size(my_world, &NB_proc);
  if ( NB_proc < (int)(size_row*size_col*NB_groups) ){
    XBT_INFO("Not enough processors: NB_proc = %d, required = %zu\n",
             NB_proc, size_row*size_col*NB_groups);
  MPI_Comm_split(my_world, group, key, &group_comm);
  MPI_Comm_rank(group_comm, &myrank);
  MPI_Comm_size(group_comm, &NB_proc);
  /* each group now starts the execution of its own part */
  NB_proc = size_row*size_col;
  size_t row = myrank / size_row;
  size_t col = myrank % size_row;
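  /* Ranks inside a group are laid out row-major on a size_col x size_row grid:
   * size_row is the number of processes per grid row, hence the divisions above. */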
  /*-------------------------Check some mandatory conditions------------------*/
  size_t NB_Block = k / Block_size;
  if ( k % Block_size != 0 ){
    XBT_INFO("The matrix dimension k has to be a multiple of the block size "
             "(number of blocks: %zu)\n", NB_Block);
  if ( size_row > NB_Block || size_col > NB_Block ){
    XBT_INFO("The number of blocks is too small compared to the number of"
             " processors (%zu,%zu) in a row or a column (%zu)\n",
             size_col, size_row, NB_Block);
  if ( NB_Block % size_row != 0 || NB_Block % size_col != 0 ){
    XBT_INFO("The number of blocks per processor is not an integer\n");
  if( row >= size_col || col >= size_row ){
    XBT_INFO("I'm useless, bye!!! col: %zu row: %zu, "
             "size_col: %zu, size_row: %zu\n",
             col, row, size_col, size_row);
    /*----------------------Prepare the Communication Layer-------------------*/
    /* put the useless processors on their own color so that the matrix
     * multiplication can run on the other processors */

    /* Split the group communicator into row and column communicators */
    MPI_Comm row_comm, col_comm, group_line;
    MPI_Comm_split(my_world, myrank, MPI_UNDEFINED, &group_line);
    /* color by row, rank by column */
    MPI_Comm_split(group_comm, size_row, MPI_UNDEFINED, &row_comm);
    /* color by column, rank by row */
    MPI_Comm_split(group_comm, size_col, MPI_UNDEFINED, &col_comm);
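    /* Presumably these splits only mirror the ones performed by the working ranks
     * further below, so that every rank of my_world and group_comm takes part in
     * the collective MPI_Comm_split calls; the communicators created here are not
     * used on this path. */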
    /*------------------------Communication Layer can be used-----------------*/
  XBT_DEBUG("I'm initialized col: %zu row: %zu, "
            "size_col: %zu, size_row: %zu, my rank: %d\n",
            col, row, size_col, size_row, myrank);
  /*------------------------Initialize the matrices---------------------------*/

  /* think about a common interface:
   *   int pdgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans,
   *               m, n, k, alpha, a, ia, ja, lda, b, ib, jb, ldb,
   *               beta, c, ldc, Comm, rank );
   */
  /*------------------------Prepare the Communication Layer-------------------*/
  /* Split the group communicator into row and column communicators */
  MPI_Comm row_comm, col_comm, group_line;
  MPI_Comm_split(my_world, myrank, group, &group_line);
  /* color by row, rank by column */
  MPI_Comm_split(group_comm, row, col, &row_comm);
  /* color by column, rank by row */
  MPI_Comm_split(group_comm, col, row, &col_comm);
  /*-------------------------Communication Layer can be used------------------*/
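  /* group_line links the processes that hold the same (row, col) position across
   * the NB_groups replicas; row_comm and col_comm link the processes of one grid
   * row / column inside a group and presumably carry the panel broadcasts of the
   * SUMMA kernel. */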
  size_t k_a = k / size_row;
  size_t k_b = k / size_col;
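  /* Each process stores an m x k_a panel of A (A is split column-wise along a grid
   * row) and a k_b x n panel of B (B is split row-wise along a grid column). */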
  /* only on group 0 */
  if ( group == 0 ){
    matrices_initialisation(&a, &b, &c, m, k_a, k_b, n, row, col);
    if ( NB_groups > 1 ) res = malloc( sizeof(double)*m*n );
  } else matrices_allocation(&a, &b, &c, m, k_a, k_b, n);
  /*-------------------Configuration for the SUMMA algorithm------------------*/
  /*--------------------Allocation of the matrix blocks-----------------------*/
  double *a_Summa, *b_Summa;
  blocks_initialisation(&a_Summa, &b_Summa, m, Block_size, n);
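  /* a_Summa (m x Block_size) and b_Summa (Block_size x n) presumably serve as the
   * receive buffers in which each SUMMA step stores the broadcast panels of A and B
   * before the local multiply. */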
  /*--------------------Communication types for MPI---------------------------*/
  MPI_Datatype Block_a;
  MPI_Datatype Block_a_local;
  MPI_Datatype Block_b;
  MPI_Type_vector(m, Block_size, k_a, MPI_DOUBLE, &Block_a);
  MPI_Type_vector(m, Block_size, Block_size, MPI_DOUBLE, &Block_a_local);
  MPI_Type_vector(Block_size, n, n, MPI_DOUBLE, &Block_b);
  MPI_Type_commit(&Block_a);
  MPI_Type_commit(&Block_a_local);
  MPI_Type_commit(&Block_b);
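  /* Layout of the derived types, as given by the MPI_Type_vector arguments:
   *  - Block_a       : an m x Block_size column panel read out of the local
   *                    m x k_a storage of A (stride k_a between rows),
   *  - Block_a_local : the same panel stored contiguously (stride Block_size),
   *  - Block_b       : Block_size contiguous rows of the local n-column B panel. */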
  /*-------------Communication types for MPI are configured-------------------*/
  MPI_Barrier(my_world);
  get_time(&start_time);
  if ( NB_groups > 1 ) {
    /* replicate A and B from group 0 to the other groups */
    err = MPI_Bcast(a, m*k_a, MPI_DOUBLE, 0, group_line);
    if (err != MPI_SUCCESS) {
      perror("Error Bcast A\n");
    }
    err = MPI_Bcast(b, n*k_b, MPI_DOUBLE, 0, group_line);
    if (err != MPI_SUCCESS) {
      perror("Error Bcast B\n");
    }
    MPI_Barrier(my_world);
  }
  get_time(&end_time_intern);
  communication_time += get_timediff(&start_time, &end_time_intern);
  XBT_INFO("group %zu, NB_Block: %zu, NB_groups: %zu\n",
           group, NB_Block, NB_groups);
  XBT_DEBUG("m %zu, k_a %zu, k_b %zu, n %zu, Block_size %zu, "
            "group*NB_Block/NB_groups %zu, "
            "(group+1)*NB_Block/NB_groups %zu, row %zu, col %zu, "
            "size_row %zu, size_col %zu\n",
            m, k_a, k_b, n, Block_size,
            group*NB_Block/NB_groups, (group+1)*NB_Block/NB_groups,
            row, col, size_row, size_col);
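  /* Each group runs the SUMMA kernel only on its own slice of the k dimension,
   * i.e. the blocks in [group*NB_Block/NB_groups, (group+1)*NB_Block/NB_groups);
   * the per-group partial results in c are combined by the reduction below. */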
  Summa( a, b, c, k_a, n, n, m, k_a, k_b, n, Block_size,
         group*NB_Block/NB_groups, (group+1)*NB_Block/NB_groups,
         row, col, size_row, size_col, a_Summa, b_Summa, Block_a,
         Block_a_local, Block_b, row_comm, col_comm, 0);

  /*-------------------------End of the SUMMA algorithm-----------------------*/
  MPI_Comm_rank(group_line, &myrank);

  MPI_Barrier(my_world);
  get_time(&start_time_reduce);
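  /* Every group produced an m x n partial result for the same C tile (each covering
   * a different slice of k), so an element-wise MPI_SUM over group_line yields the
   * complete tile on the root. */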
  if ( NB_groups > 1 ) {
    // a gather is better?
    err = MPI_Reduce(c, res, m*n, MPI_DOUBLE, MPI_SUM, 0, group_line);
    if (err != MPI_SUCCESS) {
      perror("Error Reduce C\n");
  MPI_Barrier(my_world);
  get_time(&end_time_reduce);

  MPI_Barrier(my_world);
  get_time(&end_time);
  time = get_timediff(&start_time, &end_time);
  double reduce_time = get_timediff(&start_time_reduce, &end_time_reduce);
  printf("communication time: %le nanoseconds, reduce time: %le nanoseconds, "
         "total time: %le nanoseconds\n", communication_time, reduce_time, time);
  MPI_Barrier(my_world);

  check_result(res, a, b, m, n, k_a, k_b, row, col, size_row, size_col);
  // properly close the program
  MPI_Type_free(&Block_a);
  MPI_Type_free(&Block_a_local);
  MPI_Type_free(&Block_b);
  if ( NB_groups > 1 ) {
    free(res);  /* res was only allocated when NB_groups > 1 */
  }
  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Comm_free(&my_world);
  MPI_Comm_free(&group_comm);
  MPI_Comm_free(&group_line);
  MPI_Comm_free(&row_comm);
  MPI_Comm_free(&col_comm);