+++ /dev/null
-/* Copyright (c) 2012, 2014. The SimGrid Team.
- * All rights reserved. */
-
-/* This program is free software; you can redistribute it and/or modify it
- * under the terms of the license (GNU LGPL) which comes with this package. */
-
-/* Block Matrix Multiplication example */
-
-#include "Matrix_init.h"
-#include "Summa.h"
-#include "xbt/log.h"
-#include <xbt/str.h>
-
-#include <stdio.h>
-#include <string.h>
-#include <mpi.h>
-#include <math.h>
-#include <getopt.h>
-
-#define CHECK_25D 1
-
-XBT_LOG_NEW_DEFAULT_CATEGORY(MM_mpi, "Messages specific for this msg example");
-
-/**
- * Run one 2.5D block matrix multiplication on the calling MPI process.
- *
- * The processes of MPI_COMM_WORLD are split into groups; inside a group the rank is mapped onto a grid with
- * row = rank / size_row and col = rank % size_row. Group 0 initializes the operands. When NB_groups > 1 the
- * operands are broadcast over group_line (root 0), every group runs Summa() on the block range
- * [group*NB_Block/NB_groups, (group+1)*NB_Block/NB_groups), and the partial products are summed with MPI_Reduce
- * onto root 0 of group_line.
- *
- * @param m, k, n     matrix dimensions before they are divided among the process grid
- * @param Block_size  block size along k; must be non-zero and divide k
- * @param group       group index of this process
- * @param key         key handed to MPI_Comm_split when building the world and group communicators
- * @param size_row    number of processes per grid row; must be non-zero
- * @param size_col    number of processes per grid column; must be non-zero
- * @param NB_groups   number of groups
- * @return 0 when the run completed (or this process had nothing to compute), -1 on an invalid configuration or
- *         when an MPI call reported an error
- *
- * NOTE: the early-return paths do not release the communicators and buffers created so far.
- */
-static double two_dot_five(size_t m, size_t k, size_t n, size_t Block_size, size_t group, size_t key, size_t size_row,
-                           size_t size_col, size_t NB_groups ){
-  double *a, *b, *c;
-  double *res = NULL;
-
-  /* Identity of this process in the successive communicators */
-  int myrank;
-  int NB_proc;
-  int err;
-
-  double total_time, communication_time = 0;
-  double start_time, end_time;               // overall time measurement
-  double end_time_intern;                    // end of the initial broadcast phase
-  double start_time_reduce, end_time_reduce; // reduction time measurement
-
-  MPI_Comm my_world;
-
-  /* Processes whose group index is out of range take part in the split with another color and leave */
-  if ( group >= NB_groups ){
-    XBT_DEBUG("Not enough group NB_groups : %zu my group id : %zu\n", NB_groups, group);
-    MPI_Comm_split(MPI_COMM_WORLD, 0, key, &my_world);
-    return -1;
-  }else{
-    MPI_Comm_split(MPI_COMM_WORLD, 1, key, &my_world);
-  }
-
-  MPI_Comm_size (my_world, &NB_proc);
-
-  if ( NB_proc < (int)(size_row*size_col*NB_groups) ){
-    XBT_INFO("Not enough processors NB_proc : %d required : %zu\n", NB_proc, size_row*size_col*NB_groups);
-    return -1;
-  }
-
-  /* Split the communicator into groups */
-  MPI_Comm group_comm;
-  MPI_Comm_split(my_world, group, key, &group_comm);
-
-  MPI_Comm_rank(group_comm, &myrank);
-  MPI_Comm_size (group_comm, &NB_proc);
-
-  /* These values are used as divisors below: reject zero before dividing */
-  if ( Block_size == 0 || size_row == 0 || size_col == 0 ){
-    XBT_INFO("Block size and processor grid sizes must not be zero (Block_size: %zu, size_row: %zu, size_col: %zu)\n",
-             Block_size, size_row, size_col);
-    return -1;
-  }
-
-  /* Position of this process in the grid of its group */
-  size_t row = myrank / size_row;
-  size_t col = myrank % size_row;
-
-  /*-------------------------Check some mandatory conditions------------------*/
-  size_t NB_Block = k / Block_size;
-  if ( k % Block_size != 0 ){
-    XBT_INFO("The matrix size has to be proportionnal to the number of blocks: %zu\n", NB_Block);
-    return -1;
-  }
-
-  if ( size_row > NB_Block || size_col > NB_Block ){
-    XBT_INFO("Number of blocks is too small compare to the number of processors (%zu,%zu) in a row or a col (%zu)\n",
-             size_col, size_row, NB_Block);
-    return -1;
-  }
-
-  if ( NB_Block % size_row != 0 || NB_Block % size_col != 0){
-    XBT_INFO("The number of Block by processor is not an integer\n");
-    return -1;
-  }
-
-  if(row >= size_col || col >= size_row){
-    XBT_INFO( "I'm useless bye!!! col: %zu row: %zu, size_col: %zu , size_row: %zu \n", col,row,size_col,size_row);
-
-    /*----------------------Prepare the Communication Layer-------------------*/
-    /* add useless processor on a new color to execute the matrix
-     * multiplication with the other processors*/
-
-    /* Take part in the same three splits as the working processes */
-    MPI_Comm row_comm, col_comm, group_line;
-    MPI_Comm_split(my_world, myrank, MPI_UNDEFINED, &group_line);
-    /* color by row, rank by column */
-    MPI_Comm_split(group_comm, size_row, MPI_UNDEFINED, &row_comm);
-    /* color by column, rank by row */
-    MPI_Comm_split(group_comm, size_col, MPI_UNDEFINED, &col_comm);
-    /*------------------------Communication Layer can be used-----------------*/
-
-    return 0;
-  }
-  XBT_DEBUG("I'm initialized col: %zu row: %zu, size_col: %zu , size_row: %zu, my rank: %d \n", col,row,size_col,
-            size_row, myrank);
-
-  /*------------------------Initialize the matrices---------------------------*/
-  /* think about a common interface
-   * int pdgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, alpha, a, ia, ja, lda, b, ib, jb, ldb,
-   *             beta, c, ldc, Comm, rank );
-   */
-
-  /*------------------------Prepare the Communication Layer-------------------*/
-  /* Split the communicators into row and column communicators */
-  MPI_Comm row_comm, col_comm, group_line;
-  /* one communicator per grid position, ranked by group index */
-  MPI_Comm_split(my_world, myrank, group, &group_line);
-  /* color by row, rank by column */
-  MPI_Comm_split(group_comm, row, col, &row_comm);
-  /* color by column, rank by row */
-  MPI_Comm_split(group_comm, col, row, &col_comm);
-  /*-------------------------Communication Layer can be used------------------*/
-
-  // local matrix sizes
-  m = m / size_col;
-  n = n / size_row;
-  size_t k_a = k / size_row;
-  size_t k_b = k / size_col;
-
-  /* only group 0 fills the operands; the other groups just allocate them */
-  if( group == 0 ) {
-    matrices_initialisation(&a, &b, &c, m, k_a, k_b, n, row, col);
-    if( NB_groups > 1 ) {
-      /* receive buffer of the final reduction */
-      res = malloc( sizeof(double)*m*n );
-      if ( res == NULL ){
-        perror("Error allocation Matrix res");
-        exit(-1);
-      }
-    }
-  } else matrices_allocation(&a, &b, &c, m, k_a, k_b, n);
-
-  /*-------------------Configuration for Summa algorithm--------------------*/
-  /*--------------------Allocation of matrices block-------------------------*/
-  double *a_Summa, *b_Summa;
-  blocks_initialisation(&a_Summa, &b_Summa, m, Block_size, n);
-
-  /*--------------------Communication types for MPI--------------------------*/
-  MPI_Datatype Block_a;
-  MPI_Datatype Block_a_local;
-  MPI_Datatype Block_b;
-  MPI_Type_vector(m , Block_size, k_a, MPI_DOUBLE, &Block_a);
-  MPI_Type_vector(m , Block_size, Block_size, MPI_DOUBLE, &Block_a_local);
-  MPI_Type_vector(Block_size, n, n, MPI_DOUBLE, &Block_b);
-  MPI_Type_commit(&Block_a);
-  MPI_Type_commit(&Block_a_local);
-  MPI_Type_commit(&Block_b);
-  /*-------------Communication types for MPI are configured------------------*/
-
-  MPI_Barrier(my_world);
-  start_time = MPI_Wtime();
-  if( NB_groups > 1 ) {
-    err = MPI_Bcast(a, m*k_a, MPI_DOUBLE, 0, group_line);
-    if (err != MPI_SUCCESS) {
-      perror("Error Bcast A\n");
-      return -1;
-    }
-    err = MPI_Bcast(b, n*k_b, MPI_DOUBLE, 0, group_line);
-    if (err != MPI_SUCCESS) {
-      perror("Error Bcast B\n");
-      return -1;
-    }
-    MPI_Barrier(my_world);
-  }
-  end_time_intern = MPI_Wtime();
-  communication_time += end_time_intern - start_time;
-
-  XBT_INFO( "group %zu NB_block: %zu, NB_groups %zu\n",group,NB_Block, NB_groups);
-  XBT_INFO("m %zu, k_a %zu, k_b %zu, n %zu, Block_size %zu, group*NB_Block/NB_groups %zu, "
-           "(group+1)*NB_Block/NB_groups %zu, row %zu, col %zu, size_row %zu, size_col %zu\n",m, k_a, k_b, n,
-           Block_size, group*NB_Block/NB_groups, (group+1)*NB_Block/NB_groups,row, col, size_row, size_col);
-
-
-  Summa(a, b, c, k_a, n, n, m, k_a, k_b, n, Block_size, group*NB_Block/NB_groups, (group+1)*NB_Block/NB_groups,
-        row, col, size_row, size_col, a_Summa, b_Summa, Block_a, Block_a_local, Block_b, row_comm, col_comm, 0);
-
-  /*-------------------------End Summa algorithm-----------------------------*/
-
-  /* from here on myrank is the rank inside group_line */
-  MPI_Comm_rank(group_line, &myrank);
-
-  MPI_Barrier(my_world);
-  start_time_reduce = MPI_Wtime();
-  if( NB_groups > 1 ) {
-    // a gather is better?
-    err = MPI_Reduce(c, res, m*n, MPI_DOUBLE, MPI_SUM, 0, group_line);
-    if (err != MPI_SUCCESS) {
-      perror("Error Reduce C\n");
-      return -1;
-    }
-  }else{
-    /* single group: the local product is already the result */
-    double *swap= c;
-    c = res;
-    res=swap;
-  }
-  MPI_Barrier(my_world);
-  end_time_reduce = MPI_Wtime();
-
-  MPI_Barrier(my_world);
-  end_time = MPI_Wtime();
-  total_time = end_time - start_time;
-  double reduce_time = end_time_reduce - start_time_reduce;
-  printf("communication time: %e reduce time: %e seconds, total time: %e seconds\n", communication_time, reduce_time,
-         total_time);
-  MPI_Barrier(my_world);
-
-#if CHECK_25D
-  if(myrank == 0)
-    check_result(res, a, b, m, n, k_a, k_b, row, col, size_row, size_col);
-#endif
-
-  // close the program properly
-  MPI_Type_free(&Block_a);
-  MPI_Type_free(&Block_a_local);
-  MPI_Type_free(&Block_b);
-
-  free(a_Summa);
-  free(b_Summa);
-
-  free( a );
-  free( b );
-  /* c is NULL after the single-group swap above; free(NULL) is a no-op */
-  free( c );
-  free(res);
-
-  MPI_Barrier(MPI_COMM_WORLD);
-  MPI_Comm_free(&my_world);
-  MPI_Comm_free(&group_comm);
-  MPI_Comm_free(&group_line);
-  MPI_Comm_free(&row_comm);
-  MPI_Comm_free(&col_comm);
-  return 0;
-}
-
-/**
- * Entry point: initializes MPI, derives a default process grid from the number of processes, applies the
- * command-line overrides and runs two_dot_five().
- *
- * Options: -M/-N/-K matrix sizes, -B block size, -G number of groups, -g group index, -k group rank,
- * -r/-c process grid sizes, -h help. Option -P is accepted by the option string but has no effect.
- *
- * @return 0 in every case; the status returned by two_dot_five() is not propagated
- */
-int main(int argc, char ** argv)
-{
-  size_t m = 1024 , n = 1024 , k = 1024;
-  size_t NB_Block = 16;
-  size_t Block_size = k/NB_Block ;
-  size_t NB_groups = 1, group = 0, key = 0;
-  /* x index on M
-     y index on N
-     Z index on K */
-
-  int myrank;
-  int NB_proc;
-  size_t size_row, size_col; //description: virtual processor topology
-
-  MPI_Init(&argc, &argv);
-
-  /* Find out my identity in the default communicator */
-  MPI_Comm_rank ( MPI_COMM_WORLD, &myrank );
-  MPI_Comm_size ( MPI_COMM_WORLD, &NB_proc );
-
-  /* Default grid: size_col is the largest divisor of NB_proc that is <= NB_proc/2 */
-  if(NB_proc != 1){
-    for (size_col=NB_proc/2; NB_proc%size_col; size_col--){
-      /* the search is done entirely in the loop header */
-    }
-  }else{
-    size_col = 1;
-  }
-
-  /* keep size_col >= size_row */
-  size_row = NB_proc/size_col;
-  if (size_row > size_col){
-    size_col = size_row;
-    size_row = NB_proc/size_col;
-  }
-
-#if DEBUG_MPI
-  /* Spin until a debugger clears the flag; volatile keeps the compiler from removing the loop */
-  volatile size_t loop=1;
-  while(loop==1);
-#endif
-
-  int opt = 0;
-  optind = 1;
-
-  //get the parameters from the command line
-  while ((opt = getopt(argc, argv, "hr:c:M:N:K:B:G:g:k:P:")) != -1) {
-    switch(opt) {
-      case 'h':
-        /* report the grid sizes computed above as the defaults of -r and -c */
-        XBT_INFO("Usage: mxm_cblas_test [options]\n"
-                 " -M I M size (default: %zu)\n"
-                 " -N I N size (default: %zu)\n"
-                 " -K I K size (default: %zu)\n"
-                 " -B I Block size on the k dimension (default: %zu)\n"
-                 " -G I Number of processor groups (default: %zu)\n"
-                 " -g I group index (default: %zu)\n"
-                 " -k I group rank (default: %zu)\n"
-                 " -r I processor row size (default: %zu)\n"
-                 " -c I processor col size (default: %zu)\n"
-                 " -h help\n", m, n, k, Block_size, NB_groups, group, key, size_row, size_col);
-        /* MPI was initialized above, so shut it down before leaving */
-        MPI_Finalize();
-        return 0;
-      case 'M':
-        m = xbt_str_parse_int(optarg, "Invalid M size: %s");
-        break;
-      case 'N':
-        n = xbt_str_parse_int(optarg, "Invalid N size: %s");
-        break;
-      case 'K':
-        k = xbt_str_parse_int(optarg, "Invalid K size: %s");
-        break;
-      case 'B':
-        Block_size = xbt_str_parse_int(optarg, "Invalid block size: %s");
-        break;
-      case 'G':
-        NB_groups = xbt_str_parse_int(optarg, "Invalid number of processor groups: %s");
-        break;
-      case 'g':
-        group = xbt_str_parse_int(optarg, "Invalid group index: %s");
-        break;
-      case 'k':
-        key = xbt_str_parse_int(optarg, "Invalid group rank: %s");
-        break;
-      case 'r':
-        size_row = xbt_str_parse_int(optarg, "Invalid processor row size: %s");
-        break;
-      case 'c':
-        size_col = xbt_str_parse_int(optarg, "Invalid processor col size: %s");
-        break;
-      default:
-        /* 'P' and options rejected by getopt: nothing to store */
-        break;
-    }
-  }
-
-  two_dot_five( m, k, n, Block_size, group, key, size_row, size_col, NB_groups);
-
-  // close the program properly
-  MPI_Barrier(MPI_COMM_WORLD);
-  MPI_Finalize();
-  return 0;
-}
+++ /dev/null
-/* Copyright (c) 2012, 2014. The SimGrid Team.
- * All rights reserved. */
-
-/* This program is free software; you can redistribute it and/or modify it
- * under the terms of the license (GNU LGPL) which comes with this package. */
-
-#include "Matrix_init.h"
-#include <math.h>
-#include <stdio.h>
-#include "xbt/log.h"
- XBT_LOG_NEW_DEFAULT_CATEGORY(MM_init, "Messages specific for this msg example");
-#define _unused(x) ((void)x)
-
-/**
- * Allocate the three local matrices and fill them with their initial values.
- *
- * a holds m x k_a doubles, b holds k_b x n doubles and c holds m x n doubles. Without SIMPLE_MATRIX, every entry of
- * a is its column index plus col*n and every entry of b is its column index; with SIMPLE_MATRIX both are filled
- * with 1. c is filled with 0. The process exits with status -1 if an allocation fails.
- *
- * @param p_a, p_b, p_c  receive the newly allocated matrices
- * @param row            unused
- * @param col            offset factor applied to the values of a (unused with SIMPLE_MATRIX)
- */
-void matrices_initialisation(double ** p_a, double ** p_b, double ** p_c, size_t m, size_t k_a, size_t k_b, size_t n,
-                             size_t row, size_t col)
-{
-  _unused(row);
-
-  double *mat_a = malloc(sizeof(double) * m * k_a);
-  if ( mat_a == NULL ){
-    perror("Error allocation Matrix A");
-    exit(-1);
-  }
-
-  double *mat_b = malloc(sizeof(double) * k_b * n);
-  if ( mat_b == NULL ){
-    perror("Error allocation Matrix B");
-    exit(-1);
-  }
-
-  double *mat_c = malloc(sizeof(double) * m * n);
-  if ( mat_c == NULL ){
-    perror("Error allocation Matrix C");
-    exit(-1);
-  }
-
-  *p_a = mat_a;
-  *p_b = mat_b;
-  *p_c = mat_c;
-
-  /* fill a, one line at a time (leading dimension k_a) */
-  for (size_t i = 0; i < m; i++) {
-    double *line_a = mat_a + i * k_a;
-    for (size_t j = 0; j < k_a; j++) {
-#ifdef SIMPLE_MATRIX
-      line_a[j] = 1;
-#else
-      line_a[j] = (double)(j+col*n);
-#endif
-    }
-  }
-
-  /* fill b (leading dimension n) */
-  for (size_t i = 0; i < k_b; i++) {
-    double *line_b = mat_b + i * n;
-    for (size_t j = 0; j < n; j++) {
-#ifdef SIMPLE_MATRIX
-      line_b[j] = 1;
-#else
-      line_b[j] = (double)(j);
-#endif
-    }
-  }
-
-  /* clear c (leading dimension n) */
-  for (size_t i = 0; i < m; i++) {
-    double *line_c = mat_c + i * n;
-    for (size_t j = 0; j < n; j++) {
-      line_c[j] = 0;
-    }
-  }
-}
-
-/* Allocate rows x cols doubles; on failure report error_message with perror and exit with status -1. */
-static double *allocate_matrix_or_exit(size_t rows, size_t cols, const char *error_message)
-{
-  double *matrix = malloc(sizeof(double) * rows * cols);
-  if ( matrix == NULL ){
-    perror(error_message);
-    exit(-1);
-  }
-  return matrix;
-}
-
-/**
- * Allocate the three local matrices without initializing their contents.
- *
- * a holds m x k_a doubles, b holds k_b x n doubles and c holds m x n doubles. The process exits with status -1 if
- * an allocation fails.
- *
- * @param p_a, p_b, p_c  receive the newly allocated matrices
- */
-void matrices_allocation( double ** p_a, double ** p_b, double ** p_c, size_t m, size_t k_a, size_t k_b, size_t n)
-{
-  double *mat_a = allocate_matrix_or_exit(m, k_a, "Error allocation Matrix A");
-  double *mat_b = allocate_matrix_or_exit(k_b, n, "Error allocation Matrix B");
-  double *mat_c = allocate_matrix_or_exit(m, n, "Error allocation Matrix C");
-
-  *p_a = mat_a;
-  *p_b = mat_b;
-  *p_c = mat_c;
-}
-
-/**
- * Allocate the two block buffers and fill them with 0.0.
- *
- * The first buffer holds m x B_k doubles, the second one B_k x n doubles. The process exits with status -1 if an
- * allocation fails.
- *
- * @param p_a_local, p_b_local  receive the newly allocated buffers
- */
-void blocks_initialisation( double ** p_a_local, double ** p_b_local, size_t m, size_t B_k, size_t n)
-{
-  double *block_a = malloc(sizeof(double) * m * B_k);
-  if ( block_a == NULL ){
-    perror("Error allocation Matrix A");
-    exit(-1);
-  }
-
-  double *block_b = malloc(sizeof(double) * B_k * n);
-  if ( block_b == NULL ){
-    perror("Error allocation Matrix B");
-    exit(-1);
-  }
-
-  *p_a_local = block_a;
-  *p_b_local = block_b;
-
-  /* clear the first buffer, line by line (leading dimension B_k) */
-  for (size_t i = 0; i < m; i++) {
-    double *line = block_a + i * B_k;
-    for (size_t j = 0; j < B_k; j++) {
-      line[j] = 0.0;
-    }
-  }
-
-  /* clear the second buffer, line by line (leading dimension n) */
-  for (size_t i = 0; i < B_k; i++) {
-    double *line = block_b + i * n;
-    for (size_t j = 0; j < n; j++) {
-      line[j] = 0.0;
-    }
-  }
-}
-
-/**
- * Compare every entry of the m x n matrix c with the value expected from the matrix initialization and log the
- * verdict.
- *
- * With SIMPLE_MATRIX the expected value is size_row*k_a; otherwise it is y*(size_col*m)*((size_col*m)-1)/2 for
- * column y. The scan stops at the first entry that differs from its expected value by more than 1e-7.
- *
- * @param c  matrix to check (leading dimension n)
- * @param a, b, k_b, row, col  unused; they could be used to check the result against the matrix initialization
- */
-void check_result(double *c, double *a, double *b, size_t m, size_t n, size_t k_a, size_t k_b,
-                  size_t row, size_t col, size_t size_row, size_t size_col)
-{
-  size_t ldc = n;
-  _unused(a);
-  _unused(b);
-  _unused(k_b);
-  _unused(k_a);
-  _unused(row);
-  _unused(col);
-  _unused(size_row);
-
-  /* Display one sample value for checking */
-#ifdef SIMPLE_MATRIX
-  XBT_INFO("Value get : %f excepted %zu multiply by y\n", c[((int)m/2)*ldc+1],size_row*k_a );
-#else
-  XBT_INFO("Value get : %f excepted %zu multiply by y\n", c[((int)m/2)*ldc+1], 1*(size_col*m)*((size_col*m)-1)/2) ;
-#endif
-
-  for (size_t x = 0; x < m; x++) {
-    for (size_t y = 0; y < n; y++) {
-#ifdef SIMPLE_MATRIX
-      size_t expected = size_row*k_a;
-#else
-      size_t expected = y*(size_col*m)*((size_col*m)-1)/2;
-#endif
-      /* WARNING: comparing doubles this way may report errors caused by rounding alone */
-      if ( fabs(c[x*ldc + y] - expected) > 0.0000001) {
-        XBT_INFO( "%f\t%zu, y : %zu x : %zu \n", c[x*ldc+y], expected, y, x);
-        XBT_INFO("result check not ok\nWARNING the test could be lead to some errors ( precision with double )\n");
-        return;
-      }
-    }
-  }
-
-  XBT_INFO("result check: ok\n");
-}
+++ /dev/null
-/* Copyright (c) 2012-2014. The SimGrid Team.
- * All rights reserved. */
-
-/* This program is free software; you can redistribute it and/or modify it
- * under the terms of the license (GNU LGPL) which comes with this package. */
-
-/* Classical Block Matrix Multiplication example */
-
-#include "Matrix_init.h"
-#include "Summa.h"
-#include "xbt/log.h"
-#include <stdio.h>
-
-XBT_LOG_NEW_DEFAULT_CATEGORY(MM_Summa, "Messages specific for this msg example");
-
-/**
- * SUMMA block matrix multiplication: accumulates into c the products of the blocks start..end-1.
- *
- * For every block index, the process owning the pivot column broadcasts its block of a on row_comm, the process
- * owning the pivot row broadcasts its block of b on col_comm, and every process adds the product of the two blocks
- * to its local c. When size_row (resp. size_col) is 1, the block of a (resp. b) is read in place without broadcast.
- *
- * @param a, b, c           local matrices with leading dimensions lda, ldb and ldc
- * @param m, k_a, k_b, n    local sizes: a is m x k_a, b is k_b x n, c is m x n
- * @param Block_size        block size along the k dimension
- * @param start, end        half-open range of block indices to process
- * @param row, col          position of this process in the process grid
- * @param size_row, size_col  sizes of the process grid
- * @param a_local, b_local  receive buffers for the blocks broadcast by other processes
- * @param Block_a, Block_a_local, Block_b  MPI datatypes describing one block in a, in a_local and in b
- * @param row_comm, col_comm  communicators used for the block broadcasts
- * @param subs              when 1, block positions are computed relative to start
- * @return the elapsed time in seconds, or -1 when the broadcast of a block of a fails
- *
- * NOTE: a failed broadcast of a block of b finalizes MPI and exits the process instead of returning.
- */
-double Summa(double *a, double *b, double *c, size_t lda, size_t ldb, size_t ldc,
-             size_t m, size_t k_a, size_t k_b, size_t n, size_t Block_size, size_t start, size_t end,
-             size_t row, size_t col, size_t size_row, size_t size_col, double *a_local, double *b_local,
-             MPI_Datatype Block_a, MPI_Datatype Block_a_local, MPI_Datatype Block_b,
-             MPI_Comm row_comm, MPI_Comm col_comm, int subs)
-{
-  double *B_a , *B_b ; //matrix blocks
-  int err;             //MPI return code
-  //double alpha = 1, beta = 1; //C := alpha * a * b + beta * c
-  size_t B_proc_row; // Number of blocks (row or col) on one processor
-#ifndef CYCLIC
-  size_t B_proc_col;
-  B_proc_col = k_b / Block_size; // Number of blocks on one processor
-#endif
-  B_proc_row = k_a / Block_size; // Number of blocks on one processor
-
-  //size_t lda = k_a, ldb = n, ldc = n;
-  size_t lda_local = lda;
-  size_t ldb_local = ldb;
-
-  double time, computation_time = 0, communication_time = 0;
-  double start_time, end_time; //time measurement
-  double start_time_intern, end_time_intern; //time measurement
-
-  start_time = MPI_Wtime();
-
-  /*-------------Distributed Matrix Multiplication algorithm-----------------*/
-  size_t iter;
-  for( iter = start; iter < end; iter++ ){
-    size_t pivot_row, pivot_col, pos_a, pos_b;
-#ifdef CYCLIC
-    // pivot row on processor layer
-    pivot_row = (iter % size_col);
-    pivot_col = (iter % size_row);
-    //position of the block
-    if(subs == 1){
-      pos_a = (size_t)((iter - start) / size_row) * Block_size;
-      pos_b = (size_t)((iter - start) / size_col) * ldb * Block_size;
-    }else{
-      pos_a = (size_t)(iter / size_row) * Block_size;
-      pos_b = (size_t)(iter / size_col) * ldb * Block_size;
-    }
-#else
-    // pivot row on processor layer
-    pivot_row = (size_t)(iter / B_proc_col) % size_col;
-    pivot_col = (size_t)(iter / B_proc_row) % size_row;
-    //position of the block
-    if(subs == 1){
-      pos_a = ((iter - start) % B_proc_row) * Block_size;
-      pos_b = ((iter - start) % B_proc_col) * ldb * Block_size;
-    }else{
-      pos_a = (iter % B_proc_row) * Block_size;
-      pos_b = (iter % B_proc_col) * ldb * Block_size;
-    }
-#endif
-    XBT_DEBUG( "pivot: %zu, iter: %zu, B_proc_row: %zu, size_col:%zu, size_row: %zu\n",
-               pivot_row, iter, B_proc_row,size_col,size_row);
-/*    MPI_Barrier(row_comm);*/
-/*    MPI_Barrier(col_comm);*/
-
-    start_time_intern = MPI_Wtime();
-    //Broadcast the row
-    if(size_row > 1){
-      MPI_Datatype * Block;
-      if( pivot_col != col ){
-        /* receiver: the block lands contiguously in a_local */
-        B_a = a_local;
-        lda_local = Block_size;
-        XBT_DEBUG("recieve B_a %zu,%zu \n",m , Block_size);
-        Block = &Block_a_local;
-      }else{
-        /* sender: the block is read in place inside a */
-        B_a = a + pos_a;
-        lda_local = lda;
-        XBT_DEBUG("sent B_a %zu,%zu \n",m , Block_size);
-        Block = &Block_a;
-      }
-      err = MPI_Bcast(B_a, 1, *Block, pivot_col, row_comm);
-      if (err != MPI_SUCCESS) {
-        perror("Error Bcast A\n");
-        return -1;
-      }
-    }else{
-      B_a = a + pos_a;
-      XBT_DEBUG("position of B_a: %zu \n", pos_a);
-    }
-
-    //Broadcast the col
-    if(size_col > 1){
-      if( pivot_row == row ){
-        B_b = b + pos_b;
-        XBT_DEBUG("sent B_b Block_size: %zu, pos:%zu \n", ldb, pos_b);
-      }else{
-        B_b = b_local;
-        XBT_DEBUG("recieve B_b %zu,%zu \n", Block_size,n);
-      }
-      err = MPI_Bcast(B_b, 1, Block_b, pivot_row, col_comm );
-      if (err != MPI_SUCCESS) {
-        perror("Error Bcast B\n");
-        MPI_Finalize();
-        exit(-1);
-      }
-    }else{
-      B_b = b + pos_b;
-      XBT_DEBUG("position of B_b: %zu \n", pos_b);
-    }
-    end_time_intern = MPI_Wtime();
-    communication_time += end_time_intern - start_time_intern;
-
-/*    MPI_Barrier(row_comm);*/
-/*    MPI_Barrier(col_comm);*/
-    start_time_intern = MPI_Wtime();
-    XBT_DEBUG("execute Gemm number: %zu\n", iter);
-    //We have received a line of blocks and a column
-    //  cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans,
-    //               m, n, Block_size, alpha, B_a, lda_local, B_b, ldb_local,
-    //               beta, c, ldc );
-    /* c += B_a * B_b; the indices are size_t like the bounds they are compared with */
-    for(size_t i = 0; i < m; i++) {
-      for(size_t j = 0; j < n; j++) {
-        for(size_t l = 0; l < Block_size; l++) {
-          c[i*ldc+j] += B_a[i*lda_local+l]*B_b[l*ldb_local+j];
-        }
-      }
-    }
-
-    end_time_intern = MPI_Wtime();
-    computation_time += end_time_intern - start_time_intern;
-
-  }
-  MPI_Barrier(row_comm);
-  MPI_Barrier(col_comm);
-
-  end_time = MPI_Wtime();
-  time = end_time - start_time ;
-  printf("communication time: %e seconds, computation time: %e seconds\n", communication_time, computation_time);
-
-  return time;
-}