cuda

nvidia-smi 显示GPU信息

// Host-side counterpart of the GPU kernel: executes on the CPU
// and writes a fixed message to stdout.
void CPUFunction()
{
    fputs("This function is defined to run on the CPU.\n", stdout);
}

// Kernel entry point: prints a message from every GPU thread.
// Launched below as <<<2, 4>>> (2 blocks x 4 threads each), so the
// message appears once per thread. Device-side printf is for
// debugging only — it is serialized and slow.
__global__ void GPUFunction() // a __global__ function must return void
{
printf("This function is defined to run on the GPU.\n");
}

int main()
{
    CPUFunction();

    // Kernel launches are asynchronous: control returns to the host
    // immediately after the launch is queued.
    GPUFunction<<<2, 4>>>(); // 2 blocks, 4 threads / block

    // A launch returns no status directly; launch-configuration errors
    // are surfaced by cudaGetLastError().
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        printf("Launch error: %s\n", cudaGetErrorString(err));
    }

    // Block until the kernel finishes; this also surfaces asynchronous
    // execution errors that occurred inside the kernel.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
    {
        printf("Sync error: %s\n", cudaGetErrorString(err));
    }

    return 0;
}

核函数启动方式为异步

<<< NUMBER_OF_BLOCKS, NUMBER_OF_THREADS_PER_BLOCK>>>

gridDim.x grid中的block数量
blockIdx.x grid中当前block的index(从0开始)
blockDim.x block中线程数量
threadIdx.x block中线程的index(从0开始)

32个线程一个warp

鉴于 GPU 的硬件特性,所含线程的数量为 32 的倍数的线程块是为理想的选择

动态计算block的数量,N为问题规模

size_t number_of_blocks = (N + threads_per_block - 1) / threads_per_block;

grid中的线程数小于问题规模N的情况

在核函数中加入网格跨度循环, 一个线程完成index, index + threads_per_grid, index + threads_per_grid*2 …的任务

// Grid-stride loop: each thread handles elements index, index + stride,
// index + 2*stride, ... so correctness does not depend on the grid being
// large enough to cover all N elements.
// Fix: the qualifier was written `__global` — it must be `__global__`.
__global__ void kernel(int *a, int N)
{
    // Flat global index of this thread within the whole grid.
    int indexWithinTheGrid = threadIdx.x + blockIdx.x * blockDim.x;
    // Total number of threads in the grid = stride between iterations.
    int gridStride = gridDim.x * blockDim.x;

    for (int i = indexWithinTheGrid; i < N; i += gridStride)
    {
        // do work on a[i];
    }
}

错误处理

// Checking the status code returned by a CUDA runtime API call.
// Fix: the original snippet was missing the semicolon after the
// cudaMallocManaged(...) call.
cudaError_t err;
err = cudaMallocManaged(&a, N); // Assume the existence of `a` and `N`.

if (err != cudaSuccess) // `cudaSuccess` is provided by CUDA.
{
    printf("Error: %s\n", cudaGetErrorString(err)); // `cudaGetErrorString` is provided by CUDA.
}
// A kernel launch itself returns no error code. Configuration errors
// (such as the invalid thread count below) are reported by the next
// call to cudaGetLastError(), which reads AND clears the sticky error.
someKernel<<<1, -1>>>();  // -1 is not a valid number of threads.

cudaError_t err;
err = cudaGetLastError(); // `cudaGetLastError` will return the error from above.
if (err != cudaSuccess)
{
printf("Error: %s\n", cudaGetErrorString(err));
}

封装成检查函数(也可以再套一层宏以自动记录文件名和行号)

// Helper that validates a CUDA runtime status code: on failure it
// reports the error text on stderr and trips an assert; the original
// status is always passed back to the caller.
inline cudaError_t checkCuda(cudaError_t result)
{
    if (result == cudaSuccess)
        return result;

    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    assert(result == cudaSuccess);
    return result;
}

核函数在流中运行,
cuda中有一个默认流和多个非默认流
流中的核函数顺序执行,
不同的非默认流中的核函数可以同时运行(无法保证顺序)
默认流中的核函数不能与其他非默认流的核函数同时运行
默认流会受到阻碍,并在其他所有流完成之后方可运行,但其亦会阻碍其他流的运行直至其自身已运行完毕

// Create a non-default stream and launch a kernel into it.
cudaStream_t stream;
cudaStreamCreate(&stream);

// The 4th launch-configuration argument selects the stream.
// Fix: the original line ended with `>>>;` — a kernel launch still
// needs its argument list `()` after the <<<...>>> configuration.
kernel<<<number_of_blocks, threads_per_block, bytes_of_shared_memory, stream>>>();

// some things to do

// Wait for all work queued in the stream to finish before tearing it down.
cudaStreamSynchronize(stream);
cudaStreamDestroy(stream);
number_of_blocks/threads_per_block 256 512 1024
256 129.34ms 106.04ms 113.95ms
512 100.13ms 129.93ms 109.08ms
1024 106.67ms 102.97ms 104.36ms
2048 100.05ms 99.280ms 109.68ms
4096 100.65ms 102.35ms 113.03ms