/*
 ********************************************************************
    Example 22            (mm_mult_cannon.c)

    Objective            : Matrix-matrix multiplication
                           (using Cartesian topology, Cannon's algorithm)

    Input                : Read files (mdata1.inp) for the first input matrix
                           and (mdata2.inp) for the second input matrix

    Output               : Result of matrix-matrix multiplication on process 0.

    Necessary Condition  : Number of processes should be a perfect square and
                           less than or equal to 8.  Matrices A and B should be
                           evenly striped, that is, their row and column sizes
                           should be exactly divisible by the square root of
                           the number of processes used.
 ********************************************************************
*/

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "mpi.h"
#include "cart.h"

/* Communication block set up for mesh topology */
void SetUp_Mesh(MESH_INFO_TYPE *);

int main(int argc, char *argv[])
{
   int istage, irow, icol, jrow, jcol, iproc, jproc, index, Proc_Id, Root = 0;
   int A_Bloc_MatrixSize, B_Bloc_MatrixSize;
   int NoofRows_A, NoofCols_A, NoofRows_B, NoofCols_B;
   int NoofRows_BlocA, NoofCols_BlocA, NoofRows_BlocB, NoofCols_BlocB;
   int Local_Index, Global_Row_Index, Global_Col_Index;
   int Matrix_Size[4];
   int source, destination, send_tag, recv_tag, Bcast_root;

   float **Matrix_A, **Matrix_B, **Matrix_C;
   float *A_Bloc_Matrix, *B_Bloc_Matrix, *C_Bloc_Matrix, *Temp_BufferA;
   float *MatA_array, *MatB_array, *MatC_array;

   FILE *fp;
   int MatrixA_FileStatus = 1, MatrixB_FileStatus = 1;

   MESH_INFO_TYPE grid;
   MPI_Status status;

   /* Initialising */
   MPI_Init(&argc, &argv);

   /* Set up MPI_COMM_WORLD and the Cartesian topology */
   SetUp_Mesh(&grid);

   /* Reading input matrices on the Root process */
   if (grid.MyRank == Root) {

      if ((fp = fopen("./data/mdata1.inp", "r")) == NULL) {
         MatrixA_FileStatus = 0;
      }
      if (MatrixA_FileStatus != 0) {
         fscanf(fp, "%d %d\n", &NoofRows_A, &NoofCols_A);
         Matrix_Size[0] = NoofRows_A;
         Matrix_Size[1] = NoofCols_A;

         Matrix_A = (float **) malloc(NoofRows_A * sizeof(float *));
         for (irow = 0; irow < NoofRows_A; irow++) {
            Matrix_A[irow] = (float *) malloc(NoofCols_A * sizeof(float));
            for (icol = 0; icol < NoofCols_A; icol++)
               fscanf(fp, "%f", &Matrix_A[irow][icol]);
         }
         fclose(fp);
      }

      if ((fp = fopen("./data/mdata2.inp", "r")) == NULL) {
         MatrixB_FileStatus = 0;
      }
      if (MatrixB_FileStatus != 0) {
         fscanf(fp, "%d %d\n", &NoofRows_B, &NoofCols_B);
         Matrix_Size[2] = NoofRows_B;
         Matrix_Size[3] = NoofCols_B;

         Matrix_B = (float **) malloc(NoofRows_B * sizeof(float *));
         for (irow = 0; irow < NoofRows_B; irow++) {
            Matrix_B[irow] = (float *) malloc(NoofCols_B * sizeof(float));
            for (icol = 0; icol < NoofCols_B; icol++)
               fscanf(fp, "%f", &Matrix_B[irow][icol]);
         }
         fclose(fp);
      }
   }  /* MyRank == Root */

   /* Broadcast the file status and the matrix sizes to all processes */
   MPI_Barrier(grid.Comm);

   MPI_Bcast(&MatrixA_FileStatus, 1, MPI_INT, 0, grid.Comm);
   if (MatrixA_FileStatus == 0) {
      if (grid.MyRank == 0)
         printf("Can't open input file for Matrix A ..\n");
      MPI_Finalize();
      exit(-1);
   }

   MPI_Bcast(&MatrixB_FileStatus, 1, MPI_INT, 0, grid.Comm);
   if (MatrixB_FileStatus == 0) {
      if (grid.MyRank == 0)
         printf("Can't open input file for Matrix B ..\n");
      MPI_Finalize();
      exit(-1);
   }

   MPI_Bcast(Matrix_Size, 4, MPI_INT, 0, grid.Comm);
   NoofRows_A = Matrix_Size[0];
   NoofCols_A = Matrix_Size[1];
   NoofRows_B = Matrix_Size[2];
   NoofCols_B = Matrix_Size[3];

   if (NoofCols_A != NoofRows_B) {
      MPI_Finalize();
      if (grid.MyRank == Root) {
         printf("Matrix dimensions are incompatible for multiplication\n");
      }
      exit(-1);
   }

   if (NoofRows_A % grid.p_proc != 0 || NoofCols_A % grid.p_proc != 0 ||
       NoofRows_B % grid.p_proc != 0 || NoofCols_B % grid.p_proc != 0) {
      MPI_Finalize();
      if (grid.MyRank == Root) {
         printf("Matrices can't be divided among processors equally\n");
      }
      exit(-1);
   }

   NoofRows_BlocA = NoofRows_A / grid.p_proc;
   NoofCols_BlocA = NoofCols_A / grid.p_proc;
   NoofRows_BlocB = NoofRows_B / grid.p_proc;
   NoofCols_BlocB = NoofCols_B / grid.p_proc;

   A_Bloc_MatrixSize = NoofRows_BlocA * NoofCols_BlocA;
   B_Bloc_MatrixSize = NoofRows_BlocB * NoofCols_BlocB;

   /* Memory allocation for the local block matrices */
   A_Bloc_Matrix = (float *) malloc(A_Bloc_MatrixSize * sizeof(float));
   B_Bloc_Matrix = (float *) malloc(B_Bloc_MatrixSize * sizeof(float));

   /* Memory for the arrangement of the data in one-dimensional arrays
      before MPI_Scatter */
   MatA_array = (float *) malloc(sizeof(float) * NoofRows_A * NoofCols_A);
   MatB_array = (float *) malloc(sizeof(float) * NoofRows_B * NoofCols_B);

   /* Rearrange the input matrices in one-dimensional arrays so that the
      block destined for each process is contiguous */
   if (grid.MyRank == Root) {

      /* Rearranging Matrix A */
      for (iproc = 0; iproc < grid.p_proc; iproc++) {
         for (jproc = 0; jproc < grid.p_proc; jproc++) {
            Proc_Id = iproc * grid.p_proc + jproc;
            for (irow = 0; irow < NoofRows_BlocA; irow++) {
               Global_Row_Index = iproc * NoofRows_BlocA + irow;
               for (icol = 0; icol < NoofCols_BlocA; icol++) {
                  Local_Index = (Proc_Id * A_Bloc_MatrixSize) +
                                (irow * NoofCols_BlocA) + icol;
                  Global_Col_Index = jproc * NoofCols_BlocA + icol;
                  MatA_array[Local_Index] =
                     Matrix_A[Global_Row_Index][Global_Col_Index];
               }
            }
         }
      }

      /* Rearranging Matrix B */
      for (iproc = 0; iproc < grid.p_proc; iproc++) {
         for (jproc = 0; jproc < grid.p_proc; jproc++) {
            Proc_Id = iproc * grid.p_proc + jproc;
            for (irow = 0; irow < NoofRows_BlocB; irow++) {
               Global_Row_Index = iproc * NoofRows_BlocB + irow;
               for (icol = 0; icol < NoofCols_BlocB; icol++) {
                  Local_Index = (Proc_Id * B_Bloc_MatrixSize) +
                                (irow * NoofCols_BlocB) + icol;
                  Global_Col_Index = jproc * NoofCols_BlocB + icol;
                  MatB_array[Local_Index] =
                     Matrix_B[Global_Row_Index][Global_Col_Index];
               }
            }
         }
      }
   }  /* MyRank == Root */

   MPI_Barrier(grid.Comm);

   /* Scatter the blocks to all processes */
   MPI_Scatter(MatA_array, A_Bloc_MatrixSize, MPI_FLOAT,
               A_Bloc_Matrix, A_Bloc_MatrixSize, MPI_FLOAT, 0, grid.Comm);
   MPI_Scatter(MatB_array, B_Bloc_MatrixSize, MPI_FLOAT,
               B_Bloc_Matrix, B_Bloc_MatrixSize, MPI_FLOAT, 0, grid.Comm);

   /* Initial alignment (skew) of the matrices : block A(i,j) moves i
      positions left along its row, block B(i,j) moves j positions up
      along its column, with wraparound */
   if (grid.Row != 0) {
      source      = (grid.Col + grid.Row) % grid.p_proc;
      destination = (grid.Col + grid.p_proc - grid.Row) % grid.p_proc;
      recv_tag = 0;
      send_tag = 0;
      MPI_Sendrecv_replace(A_Bloc_Matrix, A_Bloc_MatrixSize, MPI_FLOAT,
                           destination, send_tag, source, recv_tag,
                           grid.Row_comm, &status);
   }
   if (grid.Col != 0) {
      source      = (grid.Row + grid.Col) % grid.p_proc;
      destination = (grid.Row + grid.p_proc - grid.Col) % grid.p_proc;
      recv_tag = 0;
      send_tag = 0;
      MPI_Sendrecv_replace(B_Bloc_Matrix, B_Bloc_MatrixSize, MPI_FLOAT,
                           destination, send_tag, source, recv_tag,
                           grid.Col_comm, &status);
   }

   /* Allocate memory for the Bloc C array and initialise it to zero */
   C_Bloc_Matrix = (float *) malloc(NoofRows_BlocA * NoofCols_BlocB *
                                    sizeof(float));
   for (index = 0; index < NoofRows_BlocA * NoofCols_BlocB; index++)
      C_Bloc_Matrix[index] = 0.0;

   /* Main loop of Cannon's algorithm : p_proc stages, each a local block
      multiplication followed by a circular shift of the A blocks one
      position left along the row and of the B blocks one position up
      along the column */
   for (istage = 0; istage < grid.p_proc; istage++) {

      /* Multiply the local blocks : C = C + A * B */
      for (irow = 0; irow < NoofRows_BlocA; irow++) {
         for (icol = 0; icol < NoofCols_BlocB; icol++) {
            for (jcol = 0; jcol < NoofCols_BlocA; jcol++) {
               C_Bloc_Matrix[irow * NoofCols_BlocB + icol] +=
                  A_Bloc_Matrix[irow * NoofCols_BlocA + jcol] *
                  B_Bloc_Matrix[jcol * NoofCols_BlocB + icol];
            }
         }
      }

      send_tag = 0;
      recv_tag = 0;

      /* Circular shift of the A blocks within the row communicator */
      source      = (grid.Col + 1) % grid.p_proc;
      destination = (grid.Col + grid.p_proc - 1) % grid.p_proc;
      MPI_Sendrecv_replace(A_Bloc_Matrix, A_Bloc_MatrixSize, MPI_FLOAT,
                           destination, send_tag, source, recv_tag,
                           grid.Row_comm, &status);

      /* Circular shift of the B blocks within the column communicator */
      source      = (grid.Row + 1) % grid.p_proc;
      destination = (grid.Row + grid.p_proc - 1) % grid.p_proc;
      MPI_Sendrecv_replace(B_Bloc_Matrix, B_Bloc_MatrixSize, MPI_FLOAT,
                           destination, send_tag, source, recv_tag,
                           grid.Col_comm, &status);
   }

   /* Gather the C blocks on the Root process */
   MatC_array = (float *) malloc(sizeof(float) * NoofRows_A * NoofCols_B);

   MPI_Barrier(grid.Comm);
   MPI_Gather(C_Bloc_Matrix, NoofRows_BlocA * NoofCols_BlocB, MPI_FLOAT,
              MatC_array, NoofRows_BlocA * NoofCols_BlocB, MPI_FLOAT,
              Root, grid.Comm);

   /* Rearrange the gathered blocks into the result matrix and print it */
   if (grid.MyRank == Root) {
      Matrix_C = (float **) malloc(NoofRows_A * sizeof(float *));
      for (irow = 0; irow < NoofRows_A; irow++)
         Matrix_C[irow] = (float *) malloc(NoofCols_B * sizeof(float));

      for (iproc = 0; iproc < grid.p_proc; iproc++) {
         for (jproc = 0; jproc < grid.p_proc; jproc++) {
            Proc_Id = iproc * grid.p_proc + jproc;
            for (irow = 0; irow < NoofRows_BlocA; irow++) {
               Global_Row_Index = iproc * NoofRows_BlocA + irow;
               for (icol = 0; icol < NoofCols_BlocB; icol++) {
                  Local_Index = (Proc_Id * NoofRows_BlocA * NoofCols_BlocB) +
                                (irow * NoofCols_BlocB) + icol;
                  Global_Col_Index = jproc * NoofCols_BlocB + icol;
                  Matrix_C[Global_Row_Index][Global_Col_Index] =
                     MatC_array[Local_Index];
               }
            }
         }
      }

      printf("---------- MATRIX MULTIPLICATION RESULT ----------\n");
      for (irow = 0; irow < NoofRows_A; irow++) {
         for (icol = 0; icol < NoofCols_B; icol++)
            printf("%7.3f ", Matrix_C[irow][icol]);
         printf("\n");
      }
   }

   MPI_Finalize();
   return 0;
}
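/*
   Note : The helper header "cart.h" used above is not part of this listing.
   A minimal sketch of what it is assumed to contain, inferred from the
   structure members and the NDIMENSIONS constant referenced in this file,
   is shown below; the header distributed with the original example may
   differ in names or layout.

      #define NDIMENSIONS 2

      typedef struct {
         int      Size;        // total number of processes
         int      p_proc;      // processes per mesh dimension, sqrt(Size)
         int      Row, Col;    // Cartesian coordinates of this process
         int      MyRank;      // rank of this process
         MPI_Comm Comm;        // 2-D Cartesian communicator
         MPI_Comm Row_comm;    // communicator for this process's mesh row
         MPI_Comm Col_comm;    // communicator for this process's mesh column
      } MESH_INFO_TYPE;
*/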
/* Function : Sets up the square process mesh (Cartesian topology) and the
   associated row and column communicators */
void SetUp_Mesh(MESH_INFO_TYPE *grid)
{
   int Periods[NDIMENSIONS];      /* Periodicity (wraparound) of the mesh  */
   int Dimensions[NDIMENSIONS];   /* Number of processes in each dimension */
   int Coordinates[NDIMENSIONS];  /* Cartesian coordinates of this process */
   int Remain_dims[NDIMENSIONS];  /* Dimensions kept by MPI_Cart_sub       */

   MPI_Comm_size(MPI_COMM_WORLD, &(grid->Size));
   MPI_Comm_rank(MPI_COMM_WORLD, &(grid->MyRank));

   /* For a square mesh */
   grid->p_proc = (int) sqrt((double) grid->Size);
   if (grid->p_proc * grid->p_proc != grid->Size) {
      MPI_Finalize();
      if (grid->MyRank == 0) {
         printf("Number of Processors should be perfect square\n");
      }
      exit(-1);
   }

   Dimensions[0] = Dimensions[1] = grid->p_proc;

   /* Wraparound mesh in both dimensions */
   Periods[0] = Periods[1] = 1;

   /* Create the Cartesian topology in two dimensions and the Cartesian
      decomposition of the processes */
   MPI_Cart_create(MPI_COMM_WORLD, NDIMENSIONS, Dimensions, Periods, 1,
                   &(grid->Comm));

   /* Ranks may have been reordered by MPI_Cart_create; from here on use
      the rank in the Cartesian communicator */
   MPI_Comm_rank(grid->Comm, &(grid->MyRank));

   MPI_Cart_coords(grid->Comm, grid->MyRank, NDIMENSIONS, Coordinates);
   grid->Row = Coordinates[0];
   grid->Col = Coordinates[1];

   /* Construction of the row and column communicators
      (use the Cartesian sub-grid mechanism to get Row/Col communicators) */

   /* The output communicator represents the row containing this process */
   Remain_dims[0] = 0;
   Remain_dims[1] = 1;
   MPI_Cart_sub(grid->Comm, Remain_dims, &(grid->Row_comm));

   /* The output communicator represents the column containing this process */
   Remain_dims[0] = 1;
   Remain_dims[1] = 0;
   MPI_Cart_sub(grid->Comm, Remain_dims, &(grid->Col_comm));
}
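/*
   Usage sketch (assumptions, not part of the original example) :

   The fscanf calls above imply that each input file begins with the number
   of rows and columns, followed by the matrix entries in row-major order.
   For example, a 2 x 2 matrix in ./data/mdata1.inp could look like :

      2 2
      1.0  2.0
      3.0  4.0

   With a perfect-square number of processes that divides both matrix
   dimensions (4 processes for the 2 x 2 case above), a typical build and
   run with an MPICH or Open MPI style toolchain might be :

      mpicc -o mm_mult_cannon mm_mult_cannon.c -lm
      mpirun -np 4 ./mm_mult_cannon

   The exact compiler wrapper, flags, and launcher depend on the MPI
   installation used.
*/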