1. Write 3 MPI Programs for Matrix Multiplication
Assume all processes start with an initial portion of the data and hold the results at the end.
Version One: All-to-all broadcast
Version Two: Row/column broadcast
Version Three: Cannon's Algorithm
2. Compare
Run on 16 processes
With matrix size N = 256
Compare the running time of each version
3. Deliverables
Source Code
Analysis of running time
Graph of time
Solution
The following is a matrix multiplication program written in MPI (Message Passing Interface) that can be run on a CPU cluster for parallel processing. It has been successfully tested with two square matrices, each of size 1500×1500; the matrix dimensions are set by the #define constants at the top of the listing.
#include<stdio.h>
#include<mpi.h>
#define NUM_ROWS_A 12 //rows of input [A]
#define NUM_COLUMNS_A 12 //columns of input [A]
#define NUM_ROWS_B 12 //rows of input [B]
#define NUM_COLUMNS_B 12 //columns of input [B]
#define MASTER_TO_SLAVE_TAG 1 //tag for messages sent from master to slaves
#define SLAVE_TO_MASTER_TAG 4 //tag for messages sent from slaves to master
void makeAB(); //makes the [A] and [B] matrices
void printArray(); //prints the contents of matrices [A], [B] and [C]
int rank; //process rank
int size; //number of processes
int i, j, k; //helper variables
double mat_a[NUM_ROWS_A][NUM_COLUMNS_A]; //declare input [A]
double mat_b[NUM_ROWS_B][NUM_COLUMNS_B]; //declare input [B]
double mat_result[NUM_ROWS_A][NUM_COLUMNS_B]; //declare output [C]
double start_time; //hold start time
double end_time; // hold end time
int low_bound; //low bound of the number of rows of [A] allocated to a slave
int upper_bound; //upper bound of the number of rows of [A] allocated to a slave
int portion; //portion of the number of rows of [A] allocated to a slave
MPI_Status status; // store status of a MPI_Recv
MPI_Request request; //capture request of a MPI_Isend
int main(int argc, char *argv[])
{
MPI_Init(&argc, &argv); //initialize MPI operations
MPI_Comm_rank(MPI_COMM_WORLD, &rank); //get the rank
MPI_Comm_size(MPI_COMM_WORLD, &size); //get number of processes
/* master initializes work*/
if (rank == 0) {
makeAB();
start_time = MPI_Wtime();
for (i = 1; i < size; i++) {//for each slave other than the master
portion = (NUM_ROWS_A / (size - 1)); // calculate portion without master
low_bound = (i - 1) * portion;
if (((i + 1) == size) && ((NUM_ROWS_A % (size - 1)) != 0)) {//if rows of [A] cannot be equally divided among slaves
upper_bound = NUM_ROWS_A; //last slave gets all the remaining rows
} else {
upper_bound = low_bound + portion; //rows of [A] are equally divisible among slaves
}
//send the low bound first without blocking, to the intended slave
MPI_Isend(&low_bound, 1, MPI_INT, i, MASTER_TO_SLAVE_TAG, MPI_COMM_WORLD, &request);
//next send the upper bound without blocking, to the intended slave
MPI_Isend(&upper_bound, 1, MPI_INT, i, MASTER_TO_SLAVE_TAG + 1, MPI_COMM_WORLD, &request);
//finally send the allocated row portion of [A] without blocking, to the intended slave
MPI_Isend(&mat_a[low_bound][0], (upper_bound - low_bound) * NUM_COLUMNS_A, MPI_DOUBLE, i, MASTER_TO_SLAVE_TAG + 2, MPI_COMM_WORLD, &request);
}
}
//broadcast [B] to all the slaves
MPI_Bcast(&mat_b, NUM_ROWS_B*NUM_COLUMNS_B, MPI_DOUBLE, 0, MPI_COMM_WORLD);
/* work done by slaves*/
if (rank > 0) {
//receive low bound from the master
MPI_Recv(&low_bound, 1, MPI_INT, 0, MASTER_TO_SLAVE_TAG, MPI_COMM_WORLD, &status);
//next receive upper bound from the master
MPI_Recv(&upper_bound, 1, MPI_INT, 0, MASTER_TO_SLAVE_TAG + 1, MPI_COMM_WORLD, &status);
//finally receive row portion of [A] to be processed from the master
MPI_Recv(&mat_a[low_bound][0], (upper_bound - low_bound) * NUM_COLUMNS_A, MPI_DOUBLE, 0, MASTER_TO_SLAVE_TAG + 2, MPI_COMM_WORLD, &status);
for (i = low_bound; i < upper_bound; i++) {//iterate through a given set of rows of [A]
for (j = 0; j < NUM_COLUMNS_B; j++) {//iterate through columns of [B]
for (k = 0; k < NUM_ROWS_B; k++) {//iterate through rows of [B]
mat_result[i][j] += (mat_a[i][k] * mat_b[k][j]);
}
}
}
//send back the low bound first without blocking, to the master
MPI_Isend(&low_bound, 1, MPI_INT, 0, SLAVE_TO_MASTER_TAG, MPI_COMM_WORLD, &request);
//send the upper bound next without blocking, to the master
MPI_Isend(&upper_bound, 1, MPI_INT, 0, SLAVE_TO_MASTER_TAG + 1, MPI_COMM_WORLD, &request);
//finally send the processed portion of data without blocking, to the master
MPI_Isend(&mat_result[low_bound][0], (upper_bound - low_bound) * NUM_COLUMNS_B, MPI_DOUBLE, 0, SLAVE_TO_MASTER_TAG + 2, MPI_COMM_WORLD, &request);
}
/* master gathers processed work*/
if (rank == 0) {
for (i = 1; i < size; i++) {// until all slaves have handed back the processed data
//receive low bound from a slave
MPI_Recv(&low_bound, 1, MPI_INT, i, SLAVE_TO_MASTER_TAG, MPI_COMM_WORLD, &status);
//receive upper bound from a slave
MPI_Recv(&upper_bound, 1, MPI_INT, i, SLAVE_TO_MASTER_TAG + 1, MPI_COMM_WORLD, &status);
//receive processed data from a slave
MPI_Recv(&mat_result[low_bound][0], (upper_bound - low_bound) * NUM_COLUMNS_B, MPI_DOUBLE, i, SLAVE_TO_MASTER_TAG + 2, MPI_COMM_WORLD, &status);
}
end_time = MPI_Wtime();
printf("\nRunning Time = %f\n\n", end_time - start_time);
printArray();
}
MPI_Finalize(); //finalize MPI operations
return 0;
}
void makeAB()
{
for (i = 0; i < NUM_ROWS_A; i++) {
for (j = 0; j < NUM_COLUMNS_A; j++) {
mat_a[i][j] = i + j;
}
}
for (i = 0; i < NUM_ROWS_B; i++) {
for (j = 0; j < NUM_COLUMNS_B; j++) {
mat_b[i][j] = i*j;
}
}
}
void printArray()
{
for (i = 0; i < NUM_ROWS_A; i++) {
printf("\n");
for (j = 0; j < NUM_COLUMNS_A; j++)
printf("%8.2f ", mat_a[i][j]);
}
printf("\n\n\n");
for (i = 0; i < NUM_ROWS_B; i++) {
printf("\n");
for (j = 0; j < NUM_COLUMNS_B; j++)
printf("%8.2f ", mat_b[i][j]);
}
printf("\n\n\n");
for (i = 0; i < NUM_ROWS_A; i++) {
printf("\n");
for (j = 0; j < NUM_COLUMNS_B; j++)
printf("%8.2f ", mat_result[i][j]);
}
printf("\n\n");
}
Row-wise Matrix-Vector Multiply
void RowMatrixVectorMultiply(int n, double *a, double *local_b, double *b,
                             double *x, MPI_Comm comm)
{
    int i, j, nlocal, size, rank;
    MPI_Comm_size(comm, &size);
    MPI_Comm_rank(comm, &rank);
    nlocal = n/size; //number of rows of [A] and elements of b stored locally
    //gather the complete vector b on every process
    MPI_Allgather(local_b, nlocal, MPI_DOUBLE, b, nlocal, MPI_DOUBLE, comm);
    //multiply the locally stored rows of [A] by the full vector b
    for (i = 0; i < nlocal; i++) {
        x[i] = 0.0;
        for (j = 0; j < n; j++)
            x[i] += a[i*n + j]*b[j];
    }
}
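To show how this routine is typically invoked, a minimal driver is sketched below. The all-ones test data, the choice n = 16, and the assumption that n is divisible by the number of processes are illustrative and not part of the original fragment.

#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

int main(int argc, char *argv[])
{
    int n = 16, rank, size, nlocal, i;
    double *a, *local_b, *b, *x;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    nlocal = n/size; //assumes n is divisible by the number of processes
    a = malloc(nlocal*n*sizeof(double)); //local block of rows of [A]
    local_b = malloc(nlocal*sizeof(double)); //local piece of the vector b
    b = malloc(n*sizeof(double)); //buffer for the gathered full vector b
    x = malloc(nlocal*sizeof(double)); //local piece of the result x = A*b
    for (i = 0; i < nlocal*n; i++) a[i] = 1.0; //dummy data: all ones
    for (i = 0; i < nlocal; i++) local_b[i] = 1.0;
    RowMatrixVectorMultiply(n, a, local_b, b, x, MPI_COMM_WORLD);
    if (rank == 0) printf("x[0] = %f\n", x[0]); //with all-ones data x[0] should equal n
    free(a); free(local_b); free(b); free(x);
    MPI_Finalize();
    return 0;
}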
The Program Code of the Parallel Application for Matrix Multiplication (Fox's Method)
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <mpi.h>
int ProcNum = 0; // Number of available processes
int ProcRank = 0; // Rank of current process
int GridSize; // Size of virtual processor grid
int GridCoords[2]; // Coordinates of current processor in grid
MPI_Comm GridComm; // Grid communicator
MPI_Comm ColComm; // Column communicator
MPI_Comm RowComm; // Row communicator
/// Function for simple initialization of matrix elements
void DummyDataInitialization (double* pAMatrix, double* pBMatrix,int Size){
int i, j; // Loop variables
for (i=0; i<Size; i++)
for (j=0; j<Size; j++) {
pAMatrix[i*Size+j] = 1;
pBMatrix[i*Size+j] = 1;
}
}
// Function for random initialization of matrix elements
void RandomDataInitialization (double* pAMatrix, double* pBMatrix,
int Size) {
int i, j; // Loop variables
srand(unsigned(clock()));
for (i=0; i<Size; i++)
for (j=0; j<Size; j++) {
pAMatrix[i*Size+j] = rand()/double(1000);
pBMatrix[i*Size+j] = rand()/double(1000);
}
}
// Function for formatted matrix output
void PrintMatrix (double* pMatrix, int RowCount, int ColCount) {
int i, j; // Loop variables
for (i=0; i<RowCount; i++) {
for (j=0; j<ColCount; j++)
printf("%7.4f ", pMatrix[i*ColCount+j]);
printf("\n");
}
}
// Function for matrix multiplication
void SerialResultCalculation(double* pAMatrix, double* pBMatrix,
double* pCMatrix, int Size) {
int i, j, k; // Loop variables
for (i=0; i<Size; i++) {
for (j=0; j<Size; j++)
for (k=0; k<Size; k++)
pCMatrix[i*Size+j] += pAMatrix[i*Size+k]*pBMatrix[k*Size+j];
}
}
// Function for block multiplication
void BlockMultiplication(double* pAblock, double* pBblock,
double* pCblock, int Size) {
SerialResultCalculation(pAblock, pBblock, pCblock, Size);
}
// Function for creating the two-dimensional grid communicator
// and communicators for each row and each column of the grid
void CreateGridCommunicators() {
int DimSize[2]; // Number of processes in each dimension of the grid
int Periodic[2]; // =1, if the grid dimension should be periodic
int Subdims[2]; // =1, if the grid dimension should be fixed
DimSize[0] = GridSize;
DimSize[1] = GridSize;
Periodic[0] = 0;
Periodic[1] = 0;
// Creation of the Cartesian communicator
MPI_Cart_create(MPI_COMM_WORLD, 2, DimSize, Periodic, 1, &GridComm);
// Determination of the cartesian coordinates for every process
MPI_Cart_coords(GridComm, ProcRank, 2, GridCoords);
// Creating communicators for rows
Subdims[0] = 0; // Dimensionality fixing
Subdims[1] = 1; // The presence of the given dimension in the subgrid
MPI_Cart_sub(GridComm, Subdims, &RowComm);
// Creating communicators for columns
Subdims[0] = 1;
Subdims[1] = 0;
MPI_Cart_sub(GridComm, Subdims, &ColComm);
}
// Function for memory allocation and data initialization
void ProcessInitialization (double* &pAMatrix, double* &pBMatrix,
double* &pCMatrix, double* &pAblock, double* &pBblock, double* &pCblock,
double* &pTemporaryAblock, int &Size, int &BlockSize ) {
if (ProcRank == 0) {
do {
printf("\nEnter the size of matrices: ");
scanf("%d", &Size);
if (Size%GridSize != 0) {
printf("Size of matrices must be divisible by the grid size!\n");
}
}
while (Size%GridSize != 0);
}
MPI_Bcast(&Size, 1, MPI_INT, 0, MPI_COMM_WORLD);
BlockSize = Size/GridSize;
pAblock = new double [BlockSize*BlockSize];
pBblock = new double [BlockSize*BlockSize];
pCblock = new double [BlockSize*BlockSize];
pTemporaryAblock = new double [BlockSize*BlockSize];
for (int i=0; i<BlockSize*BlockSize; i++) {
pCblock[i] = 0;
}
if (ProcRank == 0) {
pAMatrix = new double [Size*Size];
pBMatrix = new double [Size*Size];
pCMatrix = new double [Size*Size];
DummyDataInitialization(pAMatrix, pBMatrix, Size);
//RandomDataInitialization(pAMatrix, pBMatrix, Size);
}
}
// Function for checkerboard matrix decomposition
void CheckerboardMatrixScatter(double* pMatrix, double* pMatrixBlock,
int Size, int BlockSize) {
double * MatrixRow = new double [BlockSize*Size];
if (GridCoords[1] == 0) {
MPI_Scatter(pMatrix, BlockSize*Size, MPI_DOUBLE, MatrixRow,
BlockSize*Size, MPI_DOUBLE, 0, ColComm);
}
for (int i=0; i<BlockSize; i++) {
MPI_Scatter(&MatrixRow[i*Size], BlockSize, MPI_DOUBLE,
&(pMatrixBlock[i*BlockSize]), BlockSize, MPI_DOUBLE, 0, RowComm);
}
delete [] MatrixRow;
}
// Data distribution among the processes
void DataDistribution(double* pAMatrix, double* pBMatrix, double*
pMatrixAblock, double* pBblock, int Size, int BlockSize) {
// Scatter the matrix among the processes of the first grid column
CheckerboardMatrixScatter(pAMatrix, pMatrixAblock, Size, BlockSize);
CheckerboardMatrixScatter(pBMatrix, pBblock, Size, BlockSize);
}
// Function for gathering the result matrix
void ResultCollection (double* pCMatrix, double* pCblock, int Size,
int BlockSize) {
double * pResultRow = new double [Size*BlockSize];
for (int i=0; i<BlockSize; i++) {
MPI_Gather( &pCblock[i*BlockSize], BlockSize, MPI_DOUBLE,
&pResultRow[i*Size], BlockSize, MPI_DOUBLE, 0, RowComm);
}
if (GridCoords[1] == 0) {
MPI_Gather(pResultRow, BlockSize*Size, MPI_DOUBLE, pCMatrix,
BlockSize*Size, MPI_DOUBLE, 0, ColComm);
}
delete [] pResultRow;
}
// Broadcasting blocks of the matrix A to process grid rows
void ABlockCommunication (int iter, double *pAblock, double* pMatrixAblock,
int BlockSize) {
// Defining the leading process of the process grid row
int Pivot = (GridCoords[0] + iter) % GridSize;
// Copying the transmitted block in a separate memory buffer
if (GridCoords[1] == Pivot) {
for (int i=0; i<BlockSize*BlockSize; i++)
pAblock[i] = pMatrixAblock[i];
}
// Block broadcasting
MPI_Bcast(pAblock, BlockSize*BlockSize, MPI_DOUBLE, Pivot, RowComm);
}
// Function for cyclic shifting the blocks of the matrix B
void BblockCommunication (double *pBblock, int BlockSize) {
MPI_Status Status;
int NextProc = GridCoords[0] + 1;
if ( GridCoords[0] == GridSize-1 ) NextProc = 0;
int PrevProc = GridCoords[0] - 1;
if ( GridCoords[0] == 0 ) PrevProc = GridSize-1;
MPI_Sendrecv_replace( pBblock, BlockSize*BlockSize, MPI_DOUBLE,
NextProc, 0, PrevProc, 0, ColComm, &Status);
}
// Function for parallel execution of the Fox method
void ParallelResultCalculation(double* pAblock, double* pMatrixAblock,
double* pBblock, double* pCblock, int BlockSize) {
for (int iter = 0; iter < GridSize; iter ++) {
// Sending blocks of matrix A to the process grid rows
ABlockCommunication (iter, pAblock, pMatrixAblock, BlockSize);
// Block multiplication
BlockMultiplication(pAblock, pBblock, pCblock, BlockSize);
// Cyclic shift of blocks of matrix B in process grid columns
BblockCommunication(pBblock, BlockSize);
}
}
// Test printing of the matrix block
void TestBlocks (double* pBlock, int BlockSize, char str[]) {
MPI_Barrier(MPI_COMM_WORLD);
if (ProcRank == 0) {
printf("%s \n", str);
}
for (int i=0; i<ProcNum; i++) {
if (ProcRank == i) {
printf("ProcRank = %d \n", ProcRank);
PrintMatrix(pBlock, BlockSize, BlockSize);
}
MPI_Barrier(MPI_COMM_WORLD);
}
}
// Function for testing the matrix multiplication result
void TestResult(double* pAMatrix, double* pBMatrix, double* pCMatrix,
int Size) {
double* pSerialResult; // Result matrix of serial multiplication
double Accuracy = 1.e-6; // Comparison accuracy
int equal = 0; // =1, if the matrices are not equal
int i; // Loop variable
if (ProcRank == 0) {
pSerialResult = new double [Size*Size];
for (i=0; i<Size*Size; i++) {
pSerialResult[i] = 0;
}
BlockMultiplication(pAMatrix, pBMatrix, pSerialResult, Size);
for (i=0; i<Size*Size; i++) {
if (fabs(pSerialResult[i]-pCMatrix[i]) >= Accuracy)
equal = 1;
}
if (equal == 1)
printf("The results of serial and parallel algorithms are NOT "
"identical. Check your code.\n");
else
printf("The results of serial and parallel algorithms are "
"identical.\n");
}
}
// Function for computational process termination
void ProcessTermination (double* pAMatrix, double* pBMatrix,
double* pCMatrix, double* pAblock, double* pBblock, double* pCblock,
double* pMatrixAblock) {
if (ProcRank == 0) {
delete [] pAMatrix;
delete [] pBMatrix;
delete [] pCMatrix;
}
delete [] pAblock;
delete [] pBblock;
delete [] pCblock;
delete [] pMatrixAblock;
}
int main(int argc, char* argv[]) {
double* pAMatrix; // First argument of matrix multiplication
double* pBMatrix; // Second argument of matrix multiplication
double* pCMatrix; // Result matrix
int Size; // Size of matrices
int BlockSize; // Sizes of matrix blocks
double *pAblock; // Initial block of matrix A
double *pBblock; // Initial block of matrix B
double *pCblock; // Block of result matrix C
double *pMatrixAblock;
double Start, Finish, Duration;
setvbuf(stdout, 0, _IONBF, 0);
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &ProcNum);
MPI_Comm_rank(MPI_COMM_WORLD, &ProcRank);
GridSize = (int)sqrt((double)ProcNum);
if (ProcNum != GridSize*GridSize) {
if (ProcRank == 0) {
printf("Number of processes must be a perfect square\n");
}
}
else {
if (ProcRank == 0)
printf("Parallel matrix multiplication program\n");
// Creating the Cartesian grid, row and column communicators
CreateGridCommunicators();
// Memory allocation and initialization of matrix elements
ProcessInitialization ( pAMatrix, pBMatrix, pCMatrix, pAblock, pBblock,
pCblock, pMatrixAblock, Size, BlockSize );
DataDistribution(pAMatrix, pBMatrix, pMatrixAblock, pBblock, Size,
BlockSize);
// Execution of the Fox method (timed so the running time can be compared)
Start = MPI_Wtime();
ParallelResultCalculation(pAblock, pMatrixAblock, pBblock,
pCblock, BlockSize);
// Gathering the result matrix
ResultCollection(pCMatrix, pCblock, Size, BlockSize);
Finish = MPI_Wtime();
Duration = Finish - Start;
if (ProcRank == 0)
printf("Time of execution = %f\n", Duration);
TestResult(pAMatrix, pBMatrix, pCMatrix, Size);
// Process Termination
ProcessTermination (pAMatrix, pBMatrix, pCMatrix, pAblock, pBblock,
pCblock, pMatrixAblock);
}
MPI_Finalize();
return 0;
}
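Version Three of the assignment (Cannon's algorithm) is not covered by the listings above, so a minimal, self-contained sketch is given below. It keeps the same one-block-per-process layout as the Fox code, but instead of broadcasting A-blocks along grid rows it performs an initial skew and then cyclic shifts of both A and B with MPI_Sendrecv_replace on a periodic Cartesian grid. The function names (CannonMultiply, LocalBlockMultiply), the all-ones test data, and the BlockSize = 64 setting (which gives N = 256 on 16 processes, matching the comparison in the assignment) are illustrative assumptions, not part of the original solution.

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <mpi.h>

// Multiply one pair of BlockSize x BlockSize blocks: c += a*b
void LocalBlockMultiply(double* a, double* b, double* c, int BlockSize) {
    int i, j, k;
    for (i = 0; i < BlockSize; i++)
        for (j = 0; j < BlockSize; j++)
            for (k = 0; k < BlockSize; k++)
                c[i*BlockSize+j] += a[i*BlockSize+k] * b[k*BlockSize+j];
}

// Cannon's algorithm: every process owns one block of A, B and C on a
// GridSize x GridSize periodic Cartesian grid (GridComm)
void CannonMultiply(double* ablock, double* bblock, double* cblock,
                    int BlockSize, int GridSize, MPI_Comm GridComm) {
    int rank, coords[2], src, dst, iter;
    MPI_Status status;
    MPI_Comm_rank(GridComm, &rank);
    MPI_Cart_coords(GridComm, rank, 2, coords);
    // initial skew: shift row i of A left by i and column j of B up by j
    MPI_Cart_shift(GridComm, 1, -coords[0], &src, &dst);
    MPI_Sendrecv_replace(ablock, BlockSize*BlockSize, MPI_DOUBLE,
                         dst, 1, src, 1, GridComm, &status);
    MPI_Cart_shift(GridComm, 0, -coords[1], &src, &dst);
    MPI_Sendrecv_replace(bblock, BlockSize*BlockSize, MPI_DOUBLE,
                         dst, 2, src, 2, GridComm, &status);
    for (iter = 0; iter < GridSize; iter++) {
        LocalBlockMultiply(ablock, bblock, cblock, BlockSize);
        // after each multiplication shift A one step left and B one step up
        MPI_Cart_shift(GridComm, 1, -1, &src, &dst);
        MPI_Sendrecv_replace(ablock, BlockSize*BlockSize, MPI_DOUBLE,
                             dst, 1, src, 1, GridComm, &status);
        MPI_Cart_shift(GridComm, 0, -1, &src, &dst);
        MPI_Sendrecv_replace(bblock, BlockSize*BlockSize, MPI_DOUBLE,
                             dst, 2, src, 2, GridComm, &status);
    }
}

int main(int argc, char* argv[]) {
    int ProcNum, ProcRank, GridSize, i;
    int BlockSize = 64; // with 16 processes (GridSize = 4) this gives N = 256
    int DimSize[2], Periodic[2] = {1, 1}; // both grid dimensions must be periodic
    MPI_Comm GridComm;
    double *a, *b, *c, Start, Duration;
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &ProcNum);
    MPI_Comm_rank(MPI_COMM_WORLD, &ProcRank);
    GridSize = (int)(sqrt((double)ProcNum) + 0.5);
    if (GridSize*GridSize != ProcNum) {
        if (ProcRank == 0) printf("Number of processes must be a perfect square\n");
        MPI_Finalize();
        return 1;
    }
    DimSize[0] = DimSize[1] = GridSize;
    MPI_Cart_create(MPI_COMM_WORLD, 2, DimSize, Periodic, 1, &GridComm);
    a = malloc(BlockSize*BlockSize*sizeof(double));
    b = malloc(BlockSize*BlockSize*sizeof(double));
    c = calloc(BlockSize*BlockSize, sizeof(double));
    for (i = 0; i < BlockSize*BlockSize; i++) { a[i] = 1.0; b[i] = 1.0; } // dummy data
    Start = MPI_Wtime();
    CannonMultiply(a, b, c, BlockSize, GridSize, GridComm);
    Duration = MPI_Wtime() - Start;
    if (ProcRank == 0)
        printf("Cannon running time = %f, c[0] = %f (expected %d)\n",
               Duration, c[0], GridSize*BlockSize);
    free(a); free(b); free(c);
    MPI_Finalize();
    return 0;
}

Unlike the Fox method, Cannon's algorithm uses only point-to-point shifts after the initial alignment, so each process exchanges blocks with at most two neighbours per step; its running time can be measured with MPI_Wtime in the same way as in the other versions for the comparison on 16 processes.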