In this post, we will see matrix multiplication in CUDA using shared memory, with a complete code example and a walkthrough of how the grid, blocks, and threads are laid out.
Watch the following video:
Watch on YouTube: https://www.youtube.com/watch?v=XeR400_QFXQ
In the last post, we saw matrix multiplication in CUDA. In this post, we will see matrix multiplication using shared memory.
Here, I have considered two matrices of sizes row1*col1 and row2*col2. The resultant matrix (the product) will be of size row1*col2. That is why I have used a two-dimensional grid of row1*col2 blocks: each block is responsible for calculating one element of the product. Computing each element takes col1 (equivalently, row2) multiplications, so each block has col1 threads, one thread per multiplication. In short, the total number of blocks is row1*col2, and each block has col1 (or row2) threads.
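As a concrete trace of this layout for the 2*3 and 3*2 matrices used below: the grid is dim3(col2, row1) = dim3(2, 2), i.e. four blocks, one per element of the product, each running col1 = 3 threads. The block with blockIdx.x = 1, blockIdx.y = 0 computes c[0][1]: thread k multiplies a[0][k] by b[k][1], so with the sample input the shared array holds 1*2, 2*4 and 3*6, and their sum 2 + 8 + 18 = 28 matches the output at the end of the post.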
I have used the shared array p[] to hold the intermediate multiplication values. Within each block, each thread k computes p[k]. __syncthreads() makes sure all threads have finished their computation, i.e. all values of p[] are available; after the barrier, one thread adds them together to produce one element of the product.
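For the small matrices used here, letting a single thread do the final sum is the simplest correct approach; if every thread updated the same element of n[] concurrently, the updates would race. A common refinement, which the program below does not use, is a tree reduction in shared memory. A minimal sketch, assuming the number of threads per block has been padded up to a power of two:

for (int stride = blockDim.x / 2; stride > 0; stride >>= 1)
{
    if (k < stride)
        p[k] += p[k + stride];   /* fold the upper half of p[] into the lower half */
    __syncthreads();             /* every thread must reach the barrier each round */
}
if (k == 0)
    n[col2*y + x] = p[0];        /* p[0] now holds the complete dot product */

This cuts the summation from col1 sequential additions down to about log2(col1) rounds, which matters for larger matrices.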
Program:
#include<stdio.h>
#include<cuda.h>
#define row1 2 /* Number of rows of first matrix */
#define col1 3 /* Number of columns of first matrix */
#define row2 3 /* Number of rows of second matrix */
#define col2 2 /* Number of columns of second matrix */
__global__ void matproductsharedmemory(int *l, int *m, int *n)
{
    int x = blockIdx.x;       /* column of the product element this block computes */
    int y = blockIdx.y;       /* row of the product element this block computes */
    int k = threadIdx.x;      /* each thread handles one of the col1 multiplications */
    int i;
    __shared__ int p[col1];   /* one intermediate product per thread */
    p[k] = l[col1*y + k] * m[col2*k + x];   /* a[y][k] * b[k][x] */
    __syncthreads();          /* wait until every slot of p[] has been written */
    /* Let a single thread add the partial products; having every thread
       update n[] concurrently would be a race. */
    if (k == 0)
    {
        int sum = 0;
        for (i = 0; i < col1; i++)
            sum = sum + p[i];
        n[col2*y + x] = sum;
    }
}
int main()
{
    int a[row1][col1];
    int b[row2][col2];
    int c[row1][col2];
    int *d, *e, *f;   /* device copies of a, b and c */
    int i, j;
printf("\n Enter elements of first matrix of size 2*3\n");
for(i=0;i<row1;i++)
{
for(j=0;j<col1;j++)
{
scanf("%d",&a[i][j]);
}
}
printf("\n Enter elements of second matrix of size 3*2\n");
for(i=0;i<row2;i++)
{
for(j=0;j<col2;j++)
{
scanf("%d",&b[i][j]);
}
}
    /* Allocate device memory and copy both input matrices to the GPU */
    cudaMalloc((void **)&d, row1*col1*sizeof(int));
    cudaMalloc((void **)&e, row2*col2*sizeof(int));
    cudaMalloc((void **)&f, row1*col2*sizeof(int));
    cudaMemcpy(d, a, row1*col1*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(e, b, row2*col2*sizeof(int), cudaMemcpyHostToDevice);
    /* A two-dimensional grid (a collection of blocks), one block per product
       element. The syntax is dim3 grid(number of columns, number of rows). */
    dim3 grid(col2, row1);
    matproductsharedmemory<<<grid, col1>>>(d, e, f);   /* col1 threads per block */
    /* Copy the product back to the host and print it */
    cudaMemcpy(c, f, row1*col2*sizeof(int), cudaMemcpyDeviceToHost);
    printf("\nProduct of two matrices:\n");
    for (i = 0; i < row1; i++)
    {
        for (j = 0; j < col2; j++)
        {
            printf("%d\t", c[i][j]);
        }
        printf("\n");
    }
    cudaFree(d);
    cudaFree(e);
    cudaFree(f);
    return 0;
}
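The CUDA API calls above ignore their return codes to keep the listing short. In a larger program you would check them; a minimal sketch using the runtime's error query (the message text is just illustrative):

cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
    printf("CUDA error: %s\n", cudaGetErrorString(err));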
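To compile and run the program with NVIDIA's nvcc compiler (the file name matmul_shared.cu is only an example):

nvcc matmul_shared.cu -o matmul_shared
./matmul_shared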
Output:
Enter elements of first matrix of size 2*3
1 2 3 4 5 6
Enter elements of second matrix of size 3*2
1 2 3 4 5 6
Product of two matrices:
22 28
49 64
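These values agree with a hand computation: for example, the first element is 1*1 + 2*3 + 3*5 = 22, and the last is 4*2 + 5*4 + 6*6 = 64.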