CUDA programming: matrix multiplication and matrix addition

1: Experimental platform environment

OS: Windows
IDE: Visual Studio 2022

2: Experimental process

2.1: Check GPU hardware configuration

First, check the GPU hardware configuration of the computer. The code snippet is:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
int main() {
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);
    int dev;
    for (dev = 0; dev < deviceCount; dev++)
    {
        int driver_version(0), runtime_version(0);
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);
        if (dev == 0)
            if (deviceProp.minor == 9999 && deviceProp.major == 9999)
                printf("\n");
        printf("\nDevice%d:\"%s\"\n", dev, deviceProp.name);
        cudaDriverGetVersion(&driver_version);
        printf("CUDA Driver version:% d.% d \ n ", driver_version / 1000, (driver_version% 1000) / 10);
        cudaRuntimeGetVersion(&runtime_version);
        printf("CUDA Runtime version:%d.%d\n ", runtime\u version / 1000, (runtime\u version% 1000) / 10);
        printf("Device computing capacity:% d.%d\n", deviceProp.major, deviceProp.minor);
        printf("Total amount of Global Memory:                  %u bytes\n", deviceProp.totalGlobalMem);
        printf("Number of SMs:                                  %d\n", deviceProp.multiProcessorCount);
        printf("Total amount of Constant Memory:                %u bytes\n", deviceProp.totalConstMem);
        printf("Total amount of Shared Memory per block:        %u bytes\n", deviceProp.sharedMemPerBlock);
        printf("Total number of registers available per block:  %d\n", deviceProp.regsPerBlock);
        printf("Warp size:                                      %d\n", deviceProp.warpSize);
        printf("Maximum number of threads per SM:               %d\n", deviceProp.maxThreadsPerMultiProcessor);
        printf("Maximum number of threads per block:            %d\n", deviceProp.maxThreadsPerBlock);
        printf("Maximum size of each dimension of a block:      %d x %d x %d\n", deviceProp.maxThreadsDim[0],
            deviceProp.maxThreadsDim[1],
            deviceProp.maxThreadsDim[2]);
        printf("Maximum size of each dimension of a grid:       %d x %d x %d\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]);
        printf("Maximum memory pitch:                           %u bytes\n", deviceProp.memPitch);
        printf("Texture alignmemt:                              %u bytes\n", deviceProp.texturePitchAlignment);
        printf("Clock rate:                                     %.2f GHz\n", deviceProp.clockRate * 1e-6f);
        printf("Memory Clock rate:                              %.0f MHz\n", deviceProp.memoryClockRate * 1e-3f);
        printf("Memory Bus Width:                               %d-bit\n", deviceProp.memoryBusWidth);
    }
    system("pause");
    //return 0;
}
*/

The test results are:

It can be seen that the GPU is an NVIDIA GeForce GTX 950M with 5 streaming multiprocessors (SMs), 49152 bytes of shared memory per block, a maximum of 1024 threads per block (with per-block dimension limits of 1024 x 1024 x 64), and maximum grid dimensions of 2147483647 x 65535 x 65535.

2.2: Matrix multiplication

Define the matrix structure and its set and get functions as follows:
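// Matrix type, row-major: M(row, col) = *(M.elements + row * M.width + col)
struct Matrix
{
    int width;
    int height;
    float *elements;
};

// Get the (row, col) element of matrix A
__device__ float getElement(Matrix *A, int row, int col)
{
    return A->elements[row * A->width + col];
}

// Assign a value to the (row, col) element of matrix A
__device__ void setElement(Matrix *A, int row, int col, float value)
{
    A->elements[row * A->width + col] = value;
}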

Since these set and get methods are called only on the GPU and never from the CPU, they are declared with the __device__ qualifier, so they can be called only from device code.

When memory is allocated separately on the host and on the device, data has to be copied between them, which is error prone. Unified memory avoids this: it provides managed memory that is shared by host and device, and data is migrated between them automatically. In CUDA, managed memory is allocated with the cudaMallocManaged function:
cudaError_t cudaMallocManaged(void **devPtr, size_t size, unsigned int flags = cudaMemAttachGlobal);

After the memory is no longer needed, it must be released with the cudaFree function:
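For example, a minimal sketch (the buffer name and size are only illustrative):

float *data = nullptr;
cudaMallocManaged((void**)&data, 1024 * sizeof(float));  // accessible from both host and device
// ... use data on the host and/or pass it to a kernel ...
cudaFree(data);                                          // release the managed allocation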

Because kernel launches are asynchronous, the cudaDeviceSynchronize() function must be called to synchronize the device and the host, so that the results of the kernel computation can be accessed correctly afterwards.
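In the multiplication program below this looks like:

matMulKernel<<<gridSize, blockSize>>>(A, B, C);  // the launch returns immediately
cudaDeviceSynchronize();                         // wait for the kernel to finish before reading C on the host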

Let matrices A and B be 2^10 x 2^10 matrices, let the block size be 32 x 32 (so the grid size is also 32 x 32), and let each thread compute one element of matrix C.
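The corresponding execution configuration, taken from the complete listing below, rounds the grid size up so that it also covers matrix sizes that are not a multiple of the block size:

dim3 blockSize(32, 32);
dim3 gridSize((width + blockSize.x - 1) / blockSize.x,
    (height + blockSize.y - 1) / blockSize.y);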

The kernel function computes one element of the result matrix per thread:
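// Matrix multiplication kernel, 2-D, one element per thread
__global__ void matMulKernel(Matrix *A, Matrix *B, Matrix *C)
{
    float Cvalue = 0.0;
    int row = threadIdx.y + blockIdx.y * blockDim.y;
    int col = threadIdx.x + blockIdx.x * blockDim.x;
    // Dot product of row `row` of A and column `col` of B
    for (int i = 0; i < A->width; ++i)
    {
        Cvalue += getElement(A, row, i) * getElement(B, i, col);
    }
    setElement(C, row, col, Cvalue);
}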

Finally, check the result of the matrix multiplication:
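Since A is filled with 1.0 and B with 2.0, every element of C should equal 2 * width, and the check compares against that value:

float maxError = 0.0;
for (int i = 0; i < width * height; ++i)
    maxError = fmax(maxError, fabs(C->elements[i] - 2 * width));
std::cout << "maximum error: " << maxError << std::endl;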

After running, the maximum error is 0:

Finally, the nvprof tool is used to profile the kernel execution:
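For example (the executable name here is only an assumption):

nvprof .\matMul.exe

nvprof reports, among other things, the number of calls and the average, minimum and maximum time of each kernel; the running times quoted below are the averages it reports.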

It can be seen that when the matrix size is 2^10 x 2^10 and the block size is 32 x 32, the average running time is 1.06175 s.
With the matrix size fixed at 2^10 x 2^10, the running times for block sizes of 4 x 4, 8 x 8, 16 x 16 and 32 x 32 are:

block size \ matrix size    2^10 x 2^10 (time in s)
4 x 4                       2.13349
8 x 8                       1.05293
16 x 16                     1.05353
32 x 32                     1.06175

As the block size increases, the running time first decreases and then rises slightly, so a larger block is not always better; a moderate block size is a good compromise.
The complete code segment is as follows:

#include <stdio.h>
#include <iostream>
#include <cmath>   // fmax / fabs used in the error check

// Matrix type, row first, M(row, col) = *(M.elements + row * M.width + col)
struct Matrix
{
    int width;
    int height;
    float *elements;
};

 // Get the (row, col) element of matrix A
__device__ float getElement(Matrix *A, int row, int col)
{
    return A->elements[row * A->width + col];
}

// Assign a value to the (row, col) element of matrix A
__device__ void setElement(Matrix *A, int row, int col, float value)
{
    A->elements[row * A->width + col] = value;
}

// Matrix multiplication kernel, 2-D, one element per thread
__global__ void matMulKernel(Matrix *A, Matrix *B, Matrix *C)
{
    float Cvalue = 0.0;
    int row = threadIdx.y + blockIdx.y * blockDim.y;
    int col = threadIdx.x + blockIdx.x * blockDim.x;
    for (int i = 0; i < A->width; ++i)
    {
        Cvalue += getElement(A, row, i) * getElement(B, i, col);
    }
    setElement(C, row, col, Cvalue);
}

int main()
{
    int width = 1 << 10;
    int height = 1 << 10;
    Matrix *A, *B, *C;
    // Request managed memory
    cudaMallocManaged((void**)&A, sizeof(Matrix));
    cudaMallocManaged((void**)&B, sizeof(Matrix));
    cudaMallocManaged((void**)&C, sizeof(Matrix));
    int nBytes = width * height * sizeof(float);
    cudaMallocManaged((void**)&A->elements, nBytes);
    cudaMallocManaged((void**)&B->elements, nBytes);
    cudaMallocManaged((void**)&C->elements, nBytes);

    // Initialization data
    A->height = height;
    A->width = width;
    B->height = height;
    B->width = width;
    C->height = height;
    C->width = width;
    for (int i = 0; i < width * height; ++i)
    {
        A->elements[i] = 1.0;
        B->elements[i] = 2.0;
    }

    // Define the execution configuration of the kernel
    dim3 blockSize(32, 32);
    dim3 gridSize((width + blockSize.x - 1) / blockSize.x, 
        (height + blockSize.y - 1) / blockSize.y);
    // Execute kernel
    matMulKernel<<<gridSize, blockSize>>>(A, B, C);


    // Synchronize the device to ensure that the results can be accessed correctly
    cudaDeviceSynchronize();
    // Check execution results
    float maxError = 0.0;
    for (int i = 0; i < width * height; ++i)
        maxError = fmax(maxError, fabs(C->elements[i] - 2 * width));
    std::cout << "maximum error: " << maxError << std::endl;

    // Free memory (release the element buffers before the structs that point to them)
    cudaFree(A->elements);
    cudaFree(B->elements);
    cudaFree(C->elements);
    cudaFree(A);
    cudaFree(B);
    cudaFree(C);

    return 0;
}

2.3: Matrix addition

The matrix addition program has the same structure as the multiplication program; only the kernel changes: each thread adds one pair of corresponding elements. The complete code is as follows:

#include <stdio.h>
#include <iostream>
#include <cmath>   // fmax / fabs used in the error check

// Matrix type, row first, M(row, col) = *(M.elements + row * M.width + col)
struct Matrix
{
    int width;
    int height;
    float *elements;
};

 // Get the (row, col) element of matrix A
__device__ float getElement(Matrix *A, int row, int col)
{
    return A->elements[row * A->width + col];
}

// Assign a value to the (row, col) element of matrix A
__device__ void setElement(Matrix *A, int row, int col, float value)
{
    A->elements[row * A->width + col] = value;
}

// Matrix addition kernel, 2-D, one element per thread
__global__ void matAddKernel(Matrix *A, Matrix *B, Matrix *C)
{
    int row = threadIdx.y + blockIdx.y * blockDim.y;
    int col = threadIdx.x + blockIdx.x * blockDim.x;
    float Cvalue = getElement(A, row, col) + getElement(B, row, col);
    setElement(C, row, col, Cvalue);
}

int main()
{
    int width = 1 << 10;
    int height = 1 << 10;
    Matrix *A, *B, *C;
    // Request managed memory
    cudaMallocManaged((void**)&A, sizeof(Matrix));
    cudaMallocManaged((void**)&B, sizeof(Matrix));
    cudaMallocManaged((void**)&C, sizeof(Matrix));
    int nBytes = width * height * sizeof(float);
    cudaMallocManaged((void**)&A->elements, nBytes);
    cudaMallocManaged((void**)&B->elements, nBytes);
    cudaMallocManaged((void**)&C->elements, nBytes);

    // Initialization data
    A->height = height;
    A->width = width;
    B->height = height;
    B->width = width;
    C->height = height;
    C->width = width;
    for (int i = 0; i < width * height; ++i)
    {
        A->elements[i] = 1.0;
        B->elements[i] = 2.0;
    }

    // Define the execution configuration of the kernel
    dim3 blockSize(32, 32);
    dim3 gridSize((width + blockSize.x - 1) / blockSize.x, 
        (height + blockSize.y - 1) / blockSize.y);
    // Execute kernel
    matAddKernel<<<gridSize, blockSize>>>(A, B, C);


    // Synchronize the device to ensure that the results can be accessed correctly
    cudaDeviceSynchronize();
    // Check execution results
    float maxError = 0.0;
    for (int i = 0; i < width * height; ++i)
        maxError = fmax(maxError, fabs(C->elements[i] - 3));
    std::cout << "maximum error: " << maxError << std::endl;

    
    // Free memory (release the element buffers before the structs that point to them)
    cudaFree(A->elements);
    cudaFree(B->elements);
    cudaFree(C->elements);
    cudaFree(A);
    cudaFree(B);
    cudaFree(C);


    return 0;
}
