1. Write 3 MPI Programs for Matrix Multiplication
Assume all processes start with an initial portion of the data and hold their results at the end
Version One: All-to-all broadcast
Version Two: Row/column broadcast
Version Three: Cannon's Algorithm
2. Compare
Run on 16 processes
with matrix size N = 256
Compare the running time of each version
3. Deliverables
Source Code
Analysis of running time
Graph of running times
Solution
The following is a matrix multiplication program written in MPI (Message Passing Interface) that can be run on a CPU cluster for parallel processing. It has been tested successfully with two square matrices of size 1500×1500; the listing below uses 12×12 matrices, which is controlled by the #define constants at the top and can be changed as needed.
#include<stdio.h>
 #include<mpi.h>
 #define NUM_ROWS_A 12 //rows of input [A]
 #define NUM_COLUMNS_A 12 //columns of input [A]
 #define NUM_ROWS_B 12 //rows of input [B]
 #define NUM_COLUMNS_B 12 //columns of input [B]
 #define MASTER_TO_SLAVE_TAG 1 //tag for messages sent from master to slaves
 #define SLAVE_TO_MASTER_TAG 4 //tag for messages sent from slaves to master
 void makeAB(); //makes the [A] and [B] matrices
 void printArray(); //print the content of output matrix [C];
 int rank; //process rank
 int size; //number of processes
 int i, j, k; //helper variables
 double mat_a[NUM_ROWS_A][NUM_COLUMNS_A]; //declare input [A]
 double mat_b[NUM_ROWS_B][NUM_COLUMNS_B]; //declare input [B]
 double mat_result[NUM_ROWS_A][NUM_COLUMNS_B]; //declare output [C]
 double start_time; //hold start time
 double end_time; // hold end time
 int low_bound; //low bound of the number of rows of [A] allocated to a slave
 int upper_bound; //upper bound of the number of rows of [A] allocated to a slave
 int portion; //portion of the number of rows of [A] allocated to a slave
 MPI_Status status; // store status of a MPI_Recv
 MPI_Request request; //capture request of a MPI_Isend
 int main(int argc, char *argv[])
 {
 MPI_Init(&argc, &argv); //initialize MPI operations
 MPI_Comm_rank(MPI_COMM_WORLD, &rank); //get the rank
 MPI_Comm_size(MPI_COMM_WORLD, &size); //get number of processes
 /* master initializes work*/
 if (rank == 0) {
 makeAB();
 start_time = MPI_Wtime();
 for (i = 1; i < size; i++) {//for each slave other than the master
 portion = (NUM_ROWS_A / (size - 1)); // calculate portion without master
 low_bound = (i - 1) * portion;
 if (((i + 1) == size) && ((NUM_ROWS_A % (size - 1)) != 0)) {//if rows of [A] cannot be equally divided among slaves
 upper_bound = NUM_ROWS_A; //last slave gets all the remaining rows
 } else {
 upper_bound = low_bound + portion; //rows of [A] are equally divisible among slaves
 }
 //send the low bound first without blocking, to the intended slave
 MPI_Isend(&low_bound, 1, MPI_INT, i, MASTER_TO_SLAVE_TAG, MPI_COMM_WORLD, &request);
 //next send the upper bound without blocking, to the intended slave
 MPI_Isend(&upper_bound, 1, MPI_INT, i, MASTER_TO_SLAVE_TAG + 1, MPI_COMM_WORLD, &request);
 //finally send the allocated row portion of [A] without blocking, to the intended slave
 MPI_Isend(&mat_a[low_bound][0], (upper_bound - low_bound) * NUM_COLUMNS_A, MPI_DOUBLE, i, MASTER_TO_SLAVE_TAG + 2, MPI_COMM_WORLD, &request);
 }
 }
 //broadcast [B] to all the slaves
 MPI_Bcast(&mat_b, NUM_ROWS_B*NUM_COLUMNS_B, MPI_DOUBLE, 0, MPI_COMM_WORLD);
 /* work done by slaves*/
 if (rank > 0) {
 //receive low bound from the master
 MPI_Recv(&low_bound, 1, MPI_INT, 0, MASTER_TO_SLAVE_TAG, MPI_COMM_WORLD, &status);
 //next receive upper bound from the master
 MPI_Recv(&upper_bound, 1, MPI_INT, 0, MASTER_TO_SLAVE_TAG + 1, MPI_COMM_WORLD, &status);
 //finally receive row portion of [A] to be processed from the master
 MPI_Recv(&mat_a[low_bound][0], (upper_bound - low_bound) * NUM_COLUMNS_A, MPI_DOUBLE, 0, MASTER_TO_SLAVE_TAG + 2, MPI_COMM_WORLD, &status);
 for (i = low_bound; i < upper_bound; i++) {//iterate through a given set of rows of [A]
 for (j = 0; j < NUM_COLUMNS_B; j++) {//iterate through columns of [B]
 for (k = 0; k < NUM_ROWS_B; k++) {//iterate through rows of [B]
 mat_result[i][j] += (mat_a[i][k] * mat_b[k][j]);
 }
 }
 }
 //send back the low bound first without blocking, to the master
 MPI_Isend(&low_bound, 1, MPI_INT, 0, SLAVE_TO_MASTER_TAG, MPI_COMM_WORLD, &request);
 //send the upper bound next without blocking, to the master
 MPI_Isend(&upper_bound, 1, MPI_INT, 0, SLAVE_TO_MASTER_TAG + 1, MPI_COMM_WORLD, &request);
 //finally send the processed portion of data without blocking, to the master
 MPI_Isend(&mat_result[low_bound][0], (upper_bound - low_bound) * NUM_COLUMNS_B, MPI_DOUBLE, 0, SLAVE_TO_MASTER_TAG + 2, MPI_COMM_WORLD, &request);
 }
 /* master gathers processed work*/
 if (rank == 0) {
 for (i = 1; i < size; i++) {// until all slaves have handed back the processed data
 //receive low bound from a slave
 MPI_Recv(&low_bound, 1, MPI_INT, i, SLAVE_TO_MASTER_TAG, MPI_COMM_WORLD, &status);
 //receive upper bound from a slave
 MPI_Recv(&upper_bound, 1, MPI_INT, i, SLAVE_TO_MASTER_TAG + 1, MPI_COMM_WORLD, &status);
 //receive processed data from a slave
 MPI_Recv(&mat_result[low_bound][0], (upper_bound - low_bound) * NUM_COLUMNS_B, MPI_DOUBLE, i, SLAVE_TO_MASTER_TAG + 2, MPI_COMM_WORLD, &status);
 }
 end_time = MPI_Wtime();
 printf(\"\ Running Time = %f\ \ \", end_time - start_time);
 printArray();
 }
 MPI_Finalize(); //finalize MPI operations
 return 0;
 }
 void makeAB()
 {
 for (i = 0; i < NUM_ROWS_A; i++) {
 for (j = 0; j < NUM_COLUMNS_A; j++) {
 mat_a[i][j] = i + j;
 }
 }
 for (i = 0; i < NUM_ROWS_B; i++) {
 for (j = 0; j < NUM_COLUMNS_B; j++) {
 mat_b[i][j] = i*j;
 }
 }
 }
 void printArray()
 {
 for (i = 0; i < NUM_ROWS_A; i++) {
 printf(\"\ \");
 for (j = 0; j < NUM_COLUMNS_A; j++)
 printf(\"%8.2f \", mat_a[i][j]);
 }
 printf(\"\ \ \ \");
 for (i = 0; i < NUM_ROWS_B; i++) {
 printf(\"\ \");
 for (j = 0; j < NUM_COLUMNS_B; j++)
 printf(\"%8.2f \", mat_b[i][j]);
 }
 printf(\"\ \ \ \");
 for (i = 0; i < NUM_ROWS_A; i++) {
 printf(\"\ \");
 for (j = 0; j < NUM_COLUMNS_B; j++)
 printf(\"%8.2f \", mat_result[i][j]);
 }
 printf(\"\ \ \");
 }
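One caveat about the listing above: every MPI_Isend reuses the single request variable, and the low_bound/upper_bound buffers are overwritten on the next loop iteration without waiting for the pending sends to complete, which the MPI standard does not guarantee to be safe. Below is a minimal sketch of a safer master loop that completes the three sends with MPI_Waitall before the buffers are reused; the names reqs and bounds are illustrative additions, while the remaining variables and macros come from the listing above.

 // Hedged sketch (assumed names reqs/bounds): complete the non-blocking sends
 // before their buffers are reused in the next loop iteration.
 MPI_Request reqs[3];
 int bounds[2]; // bounds[0] = low bound, bounds[1] = upper bound
 for (i = 1; i < size; i++) {
     portion = NUM_ROWS_A / (size - 1);
     bounds[0] = (i - 1) * portion;
     bounds[1] = ((i + 1) == size) ? NUM_ROWS_A : bounds[0] + portion;
     MPI_Isend(&bounds[0], 1, MPI_INT, i, MASTER_TO_SLAVE_TAG, MPI_COMM_WORLD, &reqs[0]);
     MPI_Isend(&bounds[1], 1, MPI_INT, i, MASTER_TO_SLAVE_TAG + 1, MPI_COMM_WORLD, &reqs[1]);
     MPI_Isend(&mat_a[bounds[0]][0], (bounds[1] - bounds[0]) * NUM_COLUMNS_A,
               MPI_DOUBLE, i, MASTER_TO_SLAVE_TAG + 2, MPI_COMM_WORLD, &reqs[2]);
     MPI_Waitall(3, reqs, MPI_STATUSES_IGNORE); // buffers may now be reused safely
 }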
Row-wise Matrix-Vector Multiply
 void RowMatrixVectorMultiply(int n, double *a, double *local_b,
                              double *b, double *x, MPI_Comm comm)
 {
     int i, j, nlocal, size, rank;
     MPI_Comm_size(comm, &size);
     MPI_Comm_rank(comm, &rank);
     nlocal = n / size;
     /* gather the distributed vector b so every process holds a full copy */
     MPI_Allgather(local_b, nlocal, MPI_DOUBLE, b, nlocal, MPI_DOUBLE, comm);
     /* each process multiplies its nlocal rows of a by the full vector b */
     for (i = 0; i < nlocal; i++) {
         x[i] = 0.0;
         for (j = 0; j < n; j++)
             x[i] += a[i * n + j] * b[j];
     }
 }
The Program Code of the Parallel Application for Matrix Multiplication (Fox's Algorithm)
 #include <stdio.h>
 #include <stdlib.h>
 #include <time.h>
 #include <math.h>
 #include <mpi.h>
 int ProcNum = 0; // Number of available processes
 int ProcRank = 0; // Rank of current process
 int GridSize; // Size of virtual processor grid
 int GridCoords[2]; // Coordinates of current processor in grid
 MPI_Comm GridComm; // Grid communicator
 MPI_Comm ColComm; // Column communicator
 MPI_Comm RowComm; // Row communicator
 /// Function for simple initialization of matrix elements
 void DummyDataInitialization (double* pAMatrix, double* pBMatrix,int Size){
 int i, j; // Loop variables
 for (i=0; i<Size; i++)
 for (j=0; j<Size; j++) {
 pAMatrix[i*Size+j] = 1;
 pBMatrix[i*Size+j] = 1;
 }
 }
 // Function for random initialization of matrix elements
 void RandomDataInitialization (double* pAMatrix, double* pBMatrix,
 int Size) {
 int i, j; // Loop variables
 srand(unsigned(clock()));
 for (i=0; i<Size; i++)
 for (j=0; j<Size; j++) {
 pAMatrix[i*Size+j] = rand()/double(1000);
 pBMatrix[i*Size+j] = rand()/double(1000);
 }
 }
 // Function for formatted matrix output
 void PrintMatrix (double* pMatrix, int RowCount, int ColCount) {
 int i, j; // Loop variables
 for (i=0; i<RowCount; i++) {
 for (j=0; j<ColCount; j++)
 printf(\"%7.4f \", pMatrix[i*ColCount+j]);
 printf(\"\ \");
 }
 }
 // Function for matrix multiplication
 void SerialResultCalculation(double* pAMatrix, double* pBMatrix,
 double* pCMatrix, int Size) {
 int i, j, k; // Loop variables
 for (i=0; i<Size; i++) {
 for (j=0; j<Size; j++)
 for (k=0; k<Size; k++)
 pCMatrix[i*Size+j] += pAMatrix[i*Size+k]*pBMatrix[k*Size+j];
 }
 }
 // Function for block multiplication
 void BlockMultiplication(double* pAblock, double* pBblock,
 double* pCblock, int Size) {
 SerialResultCalculation(pAblock, pBblock, pCblock, Size);
 }
 // Function for creating the two-dimensional grid communicator
 // and communicators for each row and each column of the grid
 void CreateGridCommunicators() {
 int DimSize[2]; // Number of processes in each dimension of the grid
 int Periodic[2]; // =1, if the grid dimension should be periodic
 int Subdims[2]; // =1, if the grid dimension should be fixed
DimSize[0] = GridSize;
 DimSize[1] = GridSize;
 Periodic[0] = 0;
 Periodic[1] = 0;
 // Creation of the Cartesian communicator
 MPI_Cart_create(MPI_COMM_WORLD, 2, DimSize, Periodic, 1, &GridComm);
 // Determination of the cartesian coordinates for every process
 MPI_Cart_coords(GridComm, ProcRank, 2, GridCoords);
// Creating communicators for rows
 Subdims[0] = 0; // Dimensionality fixing
 Subdims[1] = 1; // The presence of the given dimension in the subgrid
 MPI_Cart_sub(GridComm, Subdims, &RowComm);
// Creating communicators for columns
 Subdims[0] = 1;
 Subdims[1] = 0;
 MPI_Cart_sub(GridComm, Subdims, &ColComm);
 }
 // Function for memory allocation and data initialization
 void ProcessInitialization (double* &pAMatrix, double* &pBMatrix,
 double* &pCMatrix, double* &pAblock, double* &pBblock, double* &pCblock,
 double* &pTemporaryAblock, int &Size, int &BlockSize ) {
 if (ProcRank == 0) {
 do {
 printf(\"\ Enter the size of matrices: \");
 scanf(\"%d\", &Size);
if (Size%GridSize != 0) {
 printf (\"Size of matrices must be divisible by the grid size!\ \");
 }
 }
 while (Size%GridSize != 0);
 }
 MPI_Bcast(&Size, 1, MPI_INT, 0, MPI_COMM_WORLD);
 BlockSize = Size/GridSize;
 pAblock = new double [BlockSize*BlockSize];
 pBblock = new double [BlockSize*BlockSize];
 pCblock = new double [BlockSize*BlockSize];
 pTemporaryAblock = new double [BlockSize*BlockSize];
 for (int i=0; i<BlockSize*BlockSize; i++) {
 pCblock[i] = 0;
 }
 if (ProcRank == 0) {
 pAMatrix = new double [Size*Size];
 pBMatrix = new double [Size*Size];
 pCMatrix = new double [Size*Size];
 DummyDataInitialization(pAMatrix, pBMatrix, Size);
 //RandomDataInitialization(pAMatrix, pBMatrix, Size);
 }
 }
 // Function for checkerboard matrix decomposition
 void CheckerboardMatrixScatter(double* pMatrix, double* pMatrixBlock,
 int Size, int BlockSize) {
 double * MatrixRow = new double [BlockSize*Size];
 if (GridCoords[1] == 0) {
 MPI_Scatter(pMatrix, BlockSize*Size, MPI_DOUBLE, MatrixRow,
 BlockSize*Size, MPI_DOUBLE, 0, ColComm);
 }
 for (int i=0; i<BlockSize; i++) {
 MPI_Scatter(&MatrixRow[i*Size], BlockSize, MPI_DOUBLE,
 &(pMatrixBlock[i*BlockSize]), BlockSize, MPI_DOUBLE, 0, RowComm);
 }
 delete [] MatrixRow;
 }
 // Data distribution among the processes
 void DataDistribution(double* pAMatrix, double* pBMatrix, double*
 pMatrixAblock, double* pBblock, int Size, int BlockSize) {
 // Scatter the matrix among the processes of the first grid column
 CheckerboardMatrixScatter(pAMatrix, pMatrixAblock, Size, BlockSize);
 CheckerboardMatrixScatter(pBMatrix, pBblock, Size, BlockSize);
 }
 // Function for gathering the result matrix
 void ResultCollection (double* pCMatrix, double* pCblock, int Size,
 int BlockSize) {
 double * pResultRow = new double [Size*BlockSize];
 for (int i=0; i<BlockSize; i++) {
 MPI_Gather( &pCblock[i*BlockSize], BlockSize, MPI_DOUBLE,
 &pResultRow[i*Size], BlockSize, MPI_DOUBLE, 0, RowComm);
 }
 if (GridCoords[1] == 0) {
 MPI_Gather(pResultRow, BlockSize*Size, MPI_DOUBLE, pCMatrix,
 BlockSize*Size, MPI_DOUBLE, 0, ColComm);
 }
 delete [] pResultRow;
 }
 // Broadcasting blocks of the matrix A to process grid rows
 void ABlockCommunication (int iter, double *pAblock, double* pMatrixAblock,
 int BlockSize) {
 // Defining the leading process of the process grid row
 int Pivot = (GridCoords[0] + iter) % GridSize;
// Copying the transmitted block in a separate memory buffer
 if (GridCoords[1] == Pivot) {
 for (int i=0; i<BlockSize*BlockSize; i++)
 pAblock[i] = pMatrixAblock[i];
 }
// Block broadcasting
 MPI_Bcast(pAblock, BlockSize*BlockSize, MPI_DOUBLE, Pivot, RowComm);
 }
 // Function for cyclic shifting the blocks of the matrix B
 void BblockCommunication (double *pBblock, int BlockSize) {
 MPI_Status Status;
 int NextProc = GridCoords[0] + 1;
 if ( GridCoords[0] == GridSize-1 ) NextProc = 0;
 int PrevProc = GridCoords[0] - 1;
 if ( GridCoords[0] == 0 ) PrevProc = GridSize-1;
 MPI_Sendrecv_replace( pBblock, BlockSize*BlockSize, MPI_DOUBLE,
 NextProc, 0, PrevProc, 0, ColComm, &Status);
 }
 // Function for parallel execution of the Fox method
 void ParallelResultCalculation(double* pAblock, double* pMatrixAblock,
 double* pBblock, double* pCblock, int BlockSize) {
 for (int iter = 0; iter < GridSize; iter ++) {
 // Sending blocks of matrix A to the process grid rows
 ABlockCommunication (iter, pAblock, pMatrixAblock, BlockSize);
 // Block multiplication
 BlockMultiplication(pAblock, pBblock, pCblock, BlockSize);
 // Cyclic shift of blocks of matrix B in process grid columns
 BblockCommunication(pBblock, BlockSize);
 }
 }
 // Test printing of the matrix block
 void TestBlocks (double* pBlock, int BlockSize, char str[]) {
 MPI_Barrier(MPI_COMM_WORLD);
 if (ProcRank == 0) {
 printf(\"%s \ \", str);
 }
 for (int i=0; i<ProcNum; i++) {
 if (ProcRank == i) {
 printf (\"ProcRank = %d \ \", ProcRank);
 PrintMatrix(pBlock, BlockSize, BlockSize);
 }
 MPI_Barrier(MPI_COMM_WORLD);
 }
 }
 // Function for testing the matrix multiplication result
 void TestResult(double* pAMatrix, double* pBMatrix, double* pCMatrix,
 int Size) {
 double* pSerialResult; // Result matrix of serial multiplication
 double Accuracy = 1.e-6; // Comparison accuracy
 int equal = 0; // =1, if the matrices are not equal
 int i; // Loop variable
 if (ProcRank == 0) {
 pSerialResult = new double [Size*Size];
 for (i=0; i<Size*Size; i++) {
 pSerialResult[i] = 0;
 }
 BlockMultiplication(pAMatrix, pBMatrix, pSerialResult, Size);
 for (i=0; i<Size*Size; i++) {
 if (fabs(pSerialResult[i]-pCMatrix[i]) >= Accuracy)
 equal = 1;
 }
 if (equal == 1)
 printf("The results of serial and parallel algorithms are NOT "
 "identical. Check your code.\n");
 else
 printf("The results of serial and parallel algorithms are "
 "identical.\n");
 delete [] pSerialResult; // free the serial result matrix
 }
 }
 // Function for computational process termination
 void ProcessTermination (double* pAMatrix, double* pBMatrix,
 double* pCMatrix, double* pAblock, double* pBblock, double* pCblock,
 double* pMatrixAblock) {
 if (ProcRank == 0) {
 delete [] pAMatrix;
 delete [] pBMatrix;
 delete [] pCMatrix;
 }
 delete [] pAblock;
 delete [] pBblock;
 delete [] pCblock;
 delete [] pMatrixAblock;
 }
 int main(int argc, char* argv[]) {
 double* pAMatrix; // First argument of matrix multiplication
 double* pBMatrix; // Second argument of matrix multiplication
 double* pCMatrix; // Result matrix
 int Size; // Size of matrices
 int BlockSize; // Sizes of matrix blocks
 double *pAblock; // Initial block of matrix A
 double *pBblock; // Initial block of matrix B
 double *pCblock; // Block of result matrix C
 double *pMatrixAblock;
 double Start, Finish, Duration;
 setvbuf(stdout, 0, _IONBF, 0);
 MPI_Init(&argc, &argv);
 MPI_Comm_size(MPI_COMM_WORLD, &ProcNum);
 MPI_Comm_rank(MPI_COMM_WORLD, &ProcRank);
 GridSize = (int)sqrt((double)ProcNum);
 if (ProcNum != GridSize*GridSize) {
 if (ProcRank == 0) {
 printf (\"Number of processes must be a perfect square \ \");
 }
 }
 else {
 if (ProcRank == 0)
 printf(\"Parallel matrix multiplication program\ \");
 // Creating the cartesian grid, row and column communcators
 CreateGridCommunicators();
// Memory allocation and initialization of matrix elements
 ProcessInitialization ( pAMatrix, pBMatrix, pCMatrix, pAblock, pBblock,
 pCblock, pMatrixAblock, Size, BlockSize );
 DataDistribution(pAMatrix, pBMatrix, pMatrixAblock, pBblock, Size,
 BlockSize);
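 // Assumed addition (not in the original listing): time the parallel part with
 // the otherwise unused Start, Finish and Duration variables, as the comparison
 // task in the assignment requires
 Start = MPI_Wtime();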
 // Execution of the Fox method
 ParallelResultCalculation(pAblock, pMatrixAblock, pBblock,
 pCblock, BlockSize);
 // Gathering the result matrix
 ResultCollection(pCMatrix, pCblock, Size, BlockSize);
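 Finish = MPI_Wtime();
 Duration = Finish - Start; // assumed addition: elapsed time of the Fox method and result gathering
 if (ProcRank == 0)
 printf("Time of execution = %f\n", Duration);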
 TestResult(pAMatrix, pBMatrix, pCMatrix, Size);
 // Process Termination
 ProcessTermination (pAMatrix, pBMatrix, pCMatrix, pAblock, pBblock,
 pCblock, pMatrixAblock);
 }
 MPI_Finalize();
 return 0;
 }
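The Fox listing above implements the block checkerboard scheme; Version Three of the assignment asks for Cannon's algorithm, which uses the same block decomposition but aligns and cyclically shifts the blocks instead of broadcasting them. The following is a minimal, self-contained sketch of Cannon's algorithm in C, assuming the number of processes is a perfect square (such as the 16 processes in the comparison task) and that N is divisible by the grid size; the dummy data, the constant N, and the helper name block_multiply are illustrative choices, not part of the original listings.

 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <mpi.h>

 #define N 256 /* matrix size used in the comparison task */

 /* c += a*b for nb x nb blocks stored row-major */
 static void block_multiply(const double *a, const double *b, double *c, int nb) {
     for (int i = 0; i < nb; i++)
         for (int k = 0; k < nb; k++)
             for (int j = 0; j < nb; j++)
                 c[i*nb + j] += a[i*nb + k] * b[k*nb + j];
 }

 int main(int argc, char *argv[]) {
     int rank, p;
     MPI_Init(&argc, &argv);
     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
     MPI_Comm_size(MPI_COMM_WORLD, &p);
     int q = (int)sqrt((double)p); /* the process grid is q x q */
     if (q * q != p) {
         if (rank == 0) printf("Number of processes must be a perfect square\n");
         MPI_Finalize();
         return 1;
     }
     int nb = N / q; /* each process owns one nb x nb block of A, B and C */

     int dims[2] = {q, q}, periods[2] = {1, 1}, coords[2];
     MPI_Comm grid;
     MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 1, &grid);
     MPI_Comm_rank(grid, &rank);
     MPI_Cart_coords(grid, rank, 2, coords);

     double *a = malloc(nb*nb*sizeof *a);
     double *b = malloc(nb*nb*sizeof *b);
     double *c = calloc(nb*nb, sizeof *c);
     for (int i = 0; i < nb*nb; i++) { a[i] = 1.0; b[i] = 1.0; } /* dummy data */

     int src, dst;
     double t0 = MPI_Wtime();

     /* initial alignment: shift row i of A left by i, column j of B up by j */
     MPI_Cart_shift(grid, 1, -coords[0], &src, &dst);
     MPI_Sendrecv_replace(a, nb*nb, MPI_DOUBLE, dst, 0, src, 0, grid, MPI_STATUS_IGNORE);
     MPI_Cart_shift(grid, 0, -coords[1], &src, &dst);
     MPI_Sendrecv_replace(b, nb*nb, MPI_DOUBLE, dst, 0, src, 0, grid, MPI_STATUS_IGNORE);

     /* q steps: multiply local blocks, then shift A one step left and B one step up */
     for (int step = 0; step < q; step++) {
         block_multiply(a, b, c, nb);
         MPI_Cart_shift(grid, 1, -1, &src, &dst);
         MPI_Sendrecv_replace(a, nb*nb, MPI_DOUBLE, dst, 0, src, 0, grid, MPI_STATUS_IGNORE);
         MPI_Cart_shift(grid, 0, -1, &src, &dst);
         MPI_Sendrecv_replace(b, nb*nb, MPI_DOUBLE, dst, 0, src, 0, grid, MPI_STATUS_IGNORE);
     }

     double t1 = MPI_Wtime();
     if (rank == 0)
         printf("Cannon's algorithm, %d processes, N = %d: %f s\n", p, N, t1 - t0);

     free(a); free(b); free(c);
     MPI_Comm_free(&grid);
     MPI_Finalize();
     return 0;
 }

Running this version and the two broadcast-based versions with 16 processes and N = 256, and recording the printed MPI_Wtime differences, gives the timing data needed for the analysis and graph in the deliverables.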








