## Wednesday 13 May 2015

### Cuda program for multiplication of two matrices

For Cuda multiplication using shared memory, check Next Post.

Watch following video:

In this post of "Cuda program for multiplication of two matrices", I have considered two cases:
1. Two dimensional blocks and one thread per block.
2. One block and two dimensional threads in that block.

1. Two dimensional blocks and one thread per block.
#include<stdio.h>
#include<cuda.h>
#define row1 2 /* Number of rows of first matrix */
#define col1 3 /* Number of columns of first matrix */
#define row2 3 /* Number of rows of second matrix */
#define col2 2 /* Number of columns of second matrix */

__global__ void matproduct(int *l,int *m, int *n)
{
int x=blockIdx.x;
int y=blockIdx.y;
int k;

n[col2*y+x]=0;
for(k=0;k<col1;k++)
{
n[col2*y+x]=n[col2*y+x]+l[col1*y+k]*m[col2*k+x];
}
}

int main()
{
int a[row1][col1];
int b[row2][col2];
int c[row1][col2];
int *d,*e,*f;
int i,j;

printf("\n Enter elements of first matrix of size 2*3\n");
for(i=0;i<row1;i++)
{
for(j=0;j<col1;j++)
{
scanf("%d",&a[i][j]);
}
}
printf("\n Enter elements of second matrix of size 3*2\n");
for(i=0;i<row2;i++)
{
for(j=0;j<col2;j++)
{
scanf("%d",&b[i][j]);
}
}

cudaMalloc((void **)&d,row1*col1*sizeof(int));
cudaMalloc((void **)&e,row2*col2*sizeof(int));
cudaMalloc((void **)&f,row1*col2*sizeof(int));

cudaMemcpy(d,a,row1*col1*sizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(e,b,row2*col2*sizeof(int),cudaMemcpyHostToDevice);

dim3 grid(col2,row1);
/* Here we are defining two dimensional Grid(collection of blocks) structure. Syntax is dim3 grid(no. of columns,no. of rows) */

matproduct<<<grid,1>>>(d,e,f);

cudaMemcpy(c,f,row1*col2*sizeof(int),cudaMemcpyDeviceToHost);
printf("\nProduct of two matrices:\n ");
for(i=0;i<row1;i++)
{
for(j=0;j<col2;j++)
{
printf("%d\t",c[i][j]);
}
printf("\n");
}

cudaFree(d);
cudaFree(e);
cudaFree(f);

return 0;
}

Output:
Enter elements of first matrix of size 2*3
1 2 3 4 5 6

Enter elements of second matrix of size 3*2
7 8 9 10 11 12

Product of two matrices:
58    64
139    154

2. One block and two dimensional threads in that block.
#include<stdio.h>
#include<cuda.h>
#define row1 2 /* Number of rows of first matrix */
#define col1 3 /* Number of columns of first matrix */
#define row2 3 /* Number of rows of second matrix */
#define col2 2 /* Number of columns of second matrix */

__global__ void matadd(int *l,int *m, int *n)
{

int k;

n[col2*y+x]=0;
for(k=0;k<col1;k++)
{
n[col2*y+x]=n[col2*y+x]+l[col1*y+k]*m[col2*k+x];
}
}

int main()
{
int a[row1][col1];
int b[row2][col2];
int c[row1][col2];
int *d,*e,*f;
int i,j;

printf("\n Enter elements of first matrix of size 2*3\n");
for(i=0;i<row1;i++)
{
for(j=0;j<col1;j++)
{
scanf("%d",&a[i][j]);
}
}
printf("\n Enter elements of second matrix of size 3*2\n");
for(i=0;i<row2;i++)
{
for(j=0;j<col2;j++)
{
scanf("%d",&b[i][j]);
}
}

cudaMalloc((void **)&d,row1*col1*sizeof(int));
cudaMalloc((void **)&e,row2*col2*sizeof(int));
cudaMalloc((void **)&f,row1*col2*sizeof(int));

cudaMemcpy(d,a,row1*col1*sizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(e,b,row2*col2*sizeof(int),cudaMemcpyHostToDevice);

/* Here we are defining two dimensional Grid(collection of blocks) structure. Syntax is dim3 grid(no. of columns,no. of rows) */

cudaMemcpy(c,f,row1*col2*sizeof(int),cudaMemcpyDeviceToHost);

printf("\nProduct of two matrices:\n ");
for(i=0;i<row1;i++)
{
for(j=0;j<col2;j++)
{
printf("%d\t",c[i][j]);
}
printf("\n");
}

cudaFree(d);
cudaFree(e);
cudaFree(f);

return 0;
}

Output:
Enter elements of first matrix of size 2*3
1 2 3 4 5 6

Enter elements of second matrix of size 3*2
7 8 9 10 11 12

Product of two matrices:
58    64
139    154