批改娘 10108. Streams and Concurrency II (CUDA)

contents

  1. 1. 題目描述
  2. 2. 規格
  3. 3. Solution

題目描述

Demo

規格

  • Accepted 判斷基準:單一 Device 是否同時執行兩個以上的 kernel。
  • 目前只有舊的 GPU 可以提供 Judge,請確定程式運行在第三個 GPU 上。意即
1
2
int device = 2;
cudaSetDevice(device);

Solution

這一題要讓數個 kernel function 可以同時運行。單一 kernel 往往無法用滿所有的 core,在仍有閒置 core 的情況下,就可以將下一個 kernel function 帶進來一起運作,這時候效能就可以大幅度提升。

隨便寫一個測試即可。但是在設計這一題時,發現新版的 GTX 980Ti 並不支援,即使藉由 CUDA 環境變數也仍然無法看出;即使是最新版的 nvvp,仍然無法針對 GTX 980Ti 裝置進行分析,最後只能在舊版的 GPU 上,利用舊有的 nvprof 進行分析。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>
#include <omp.h>
// Number of uint32_t elements in every host and device buffer; main also
// passes it to pipelTest as the element count n.
#define MAXN (64)
// Number of CUDA streams, and of input/output device buffer pairs, that main
// creates.
#define nStreams 4
/*
 * Synthetic workload kernel: each thread repeatedly adds its own input
 * element into an accumulator and stores the result.
 *
 *   IN  - input array, indexed by the flat global thread index
 *   m   - work multiplier; the thread performs m * 500000 additions
 *   OUT - output array; OUT[i] receives the wrapped (mod 2^32) sum
 *
 * There is no bounds check: the launch must supply exactly one thread per
 * valid element of IN/OUT.
 */
__global__ void test_global1(uint32_t IN[], int m, uint32_t OUT[]) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int rounds = m * 500000;
    uint32_t acc = 0;
    int k = 0;
    while (k < rounds) {
        acc += IN[idx];
        ++k;
    }
    OUT[idx] = acc;
}

// Host staging buffers: main fills hostIn with random values before copying it
// to a device input buffer, and copies each device output buffer back into
// hostOut to sum it.
uint32_t hostIn[MAXN], hostOut[MAXN];
// Wraps a CUDA runtime call so that a failing status is reported together
// with the file and line of the call site.
#define CheckCuda(status) { gpuAssert((status), __FILE__, __LINE__); }
/*
 * Reports a CUDA error on stderr and, unless told otherwise, terminates.
 *
 *   code  - status returned by a CUDA runtime call
 *   file  - source file of the call site
 *   line  - source line of the call site
 *   abort - when non-zero (the default), exit the process with `code` as the
 *           exit status after printing the message
 *
 * Does nothing when code is cudaSuccess.
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, int abort=true) {
    if (code == cudaSuccess)
        return;
    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
        exit(code);
}
// Streams used by pipelTest, one per kernel launch; created in main.
cudaStream_t stream[nStreams];
/*
 * Launches one test_global1 kernel on each of the nStreams streams so that
 * the kernels are eligible to execute concurrently on the device.
 *
 *   cuIn  - device input buffers, one per stream (at least nStreams entries)
 *   cuOut - device output buffers, one per stream (at least nStreams entries)
 *   n     - number of elements per buffer; one single-thread block is
 *           launched per element
 *   m     - per-stream work multipliers (at least nStreams entries)
 *
 * The launches are asynchronous; the caller is responsible for synchronising
 * before reading cuOut. A launch-configuration error aborts via CheckCuda.
 */
void pipelTest(uint32_t *cuIn[], uint32_t *cuOut[], int n, int m[]) {
    // A zero or negative element count would produce an invalid grid.
    if (n <= 0)
        return;
    dim3 cuBlock(1);
    dim3 cuGrid(n);
    for (int i = 0; i < nStreams; i++) {
        test_global1<<<cuGrid, cuBlock, 0, stream[i]>>>(cuIn[i], m[i], cuOut[i]);
        // Kernel launches return no status; query it explicitly. This call
        // does not synchronise, so the streams can still overlap.
        CheckCuda(cudaGetLastError());
    }
}
/*
 * Test driver: selects the third GPU, reports whether it supports concurrent
 * kernels, runs five rounds of pipelTest over nStreams streams, then prints
 * the sum of each stream's output buffer.
 *
 * Every CUDA runtime call is checked; any failure aborts via CheckCuda.
 * Returns 0 on success.
 */
int main() {
    // The judge requires the program to run on the third GPU (index 2).
    int device = 2;
    CheckCuda(cudaSetDevice(device));

    // Report the clock rate of the device actually in use (clockRate is in kHz).
    cudaDeviceProp deviceProp;
    CheckCuda(cudaGetDevice(&device));
    CheckCuda(cudaGetDeviceProperties(&deviceProp, device));
    int clockRate = deviceProp.clockRate;
    printf("Device clock rate: %.3f GHz\n", (float)clockRate/1000000);

    // Check card supports concurrency
    if (deviceProp.concurrentKernels == 0) {
        printf("GPU does not support concurrent kernel execution\n");
        printf("CUDA kernel runs will be serialised\n");
    }

    // Create one stream and one input/output buffer pair per stream, and
    // upload fresh random input for each.
    srand(time(NULL));
    uint32_t *cuIn[nStreams];
    uint32_t *cuOut[nStreams];
    for (int i = 0; i < nStreams; i++) {
        CheckCuda(cudaStreamCreate(&stream[i]));
        CheckCuda(cudaMalloc((void **)&cuIn[i], MAXN*sizeof(uint32_t)));
        CheckCuda(cudaMalloc((void **)&cuOut[i], MAXN*sizeof(uint32_t)));
        for (int j = 0; j < MAXN; j++)
            hostIn[j] = rand();
        CheckCuda(cudaMemcpy(cuIn[i], hostIn, MAXN*sizeof(uint32_t), cudaMemcpyHostToDevice));
    }

    // Per-stream work multipliers; pipelTest reads one entry per stream.
    int m[] = {1, 2, 4, 8};
    assert((int)(sizeof(m) / sizeof(m[0])) >= nStreams);

    for (int i = 0; i < 5; i++) {
        pipelTest(cuIn, cuOut, MAXN, m);
        // Wait for the whole round before starting the next one.
        // cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize;
        // after the last round it also guarantees the outputs are complete.
        CheckCuda(cudaDeviceSynchronize());
    }

    for (int i = 0; i < nStreams; i++) {
        CheckCuda(cudaMemcpy(hostOut, cuOut[i], MAXN*sizeof(uint32_t), cudaMemcpyDeviceToHost));
        uint32_t sum = 0;
        for (int j = 0; j < MAXN; j++)
            sum += hostOut[j];
        printf("%u\n", sum);
    }

    // Release every device buffer and stream created above.
    for (int i = 0; i < nStreams; i++) {
        CheckCuda(cudaFree(cuIn[i]));
        CheckCuda(cudaFree(cuOut[i]));
        CheckCuda(cudaStreamDestroy(stream[i]));
    }
    return 0;
}