批改娘 10090. Dot Product (OpenCL)

contents

  1. 1. 題目描述
    1. 1.1. main.c
    2. 1.2. utils.h
  2. 2. 範例輸入
  3. 3. 範例輸出
  4. 4. 編譯參數
  5. 5. Solution
    1. 5.1. main.c
    2. 5.2. vecdot.cl

題目描述

請用 OpenCL 改寫下段的計算:

main.c

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
#include <stdio.h>
#include <assert.h>
#include <omp.h>
#include <inttypes.h>
#include "utils.h"
#define MAXGPU 8
#define MAXCODESZ 32767
#define MAXN 16777216
static cl_uint A[MAXN], B[MAXN], C[MAXN];
int main(int argc, char *argv[]) {
omp_set_num_threads(4);
int N;
uint32_t key1, key2;
while (scanf("%d %" PRIu32 " %" PRIu32, &N, &key1, &key2) == 3) {
int chunk = N / 4;
for (int i = 0; i < N; i++) {
A[i] = encrypt(i, key1);
B[i] = encrypt(i, key2);
}
for (int i = 0; i < N; i++)
C[i] = A[i] * B[i];
uint32_t sum = 0;
for (int i = 0; i < N; i++)
sum += C[i];
printf("%" PRIu32 "\n", sum);
}
return 0;
}

utils.h

1
2
3
4
5
6
7
8
9
10
#ifndef _UTILS_H
#define _UTILS_H
#include <stdint.h>
static inline uint32_t rotate_left(uint32_t x, uint32_t n) {
return (x << n) | (x >> (32-n));
}
static inline uint32_t encrypt(uint32_t m, uint32_t key) {
return (rotate_left(m, key&31) + key)^key;
}
#endif

範例輸入

1
2
16777216 1 2
16777216 3 5

範例輸出

1
2
2885681152
2147483648

編譯參數

1
gcc -std=c99 -O2 main.c -lOpenCL -fopenmp

Solution

這一題藉由兩個亂數產生長度為 $N$ 的兩個向量,計算內積結果為何。

由於這是第一份計算 OpenCL 的應用,特別注意 Memory Leak 的問題,確定每一次執行都有正常釋放資源,可以透過 $ htop 或者 $ top 指令監控,若在 nvidia 平台下,可以使用 $ nvidia-smi 觀察程式佔有的記憶體量已經排隊情況,同時要小心繁重工作導致熱當機。

這一題原本預設要從 CPU 產生兩個向量,再傳送到 GPU 上面計算,同學一問就不小心將加密的檔案一起釋出,結果就能直接在 GPU 上產生,並且內積完使用 $O(\log N)$ 進行 work-group 內部進行加總,這大幅度地降低需要回到 CPU 計算總和的時間。

特別注意實驗環境最多允許一個 work-group 有 1024 個 work-item,從效率結果上來看,work-item 並不是越多越好,因為牽涉到 register 數量以及 memory access 的效率,這部分編譯器無法幫忙,全權交給程序員決定。而且 GPU 還有嚴重的 bank conflict,在計算一個 work-group 總和時,特殊的寫法減少 bank conflict 的發生。

main.c

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#include <stdio.h>
#include <assert.h>
#include <inttypes.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <CL/cl.h>
#include "utils.h"
#include <omp.h>
#define MAXGPU 1
#define MAXN 16777216
#define GPULOCAL 256
uint32_t hostC[MAXN/GPULOCAL];
int N;
uint32_t keyA, keyB;
char clSrcFormat[1024] = "";
char clSrc[1024] = "";
char clSrcMain[1024] = "vecdot";
// -- start working with OpenCL
cl_context clCtx;
cl_program clPrg;
cl_kernel clKrn;
cl_command_queue clQue;
cl_mem clMemOut;
#define CheckFailAndExit(status) \
if (status != CL_SUCCESS) { \
fprintf(stderr, "Error %d: Line %u in file %s\n\n", status, __LINE__, __FILE__), \
destroyGPU(clCtx, clPrg, clKrn, clQue, clMemOut); \
}
#define clFuncArgs cl_context *clCtx, cl_program *clPrg, cl_kernel *clKrn, \
cl_command_queue *clQue, cl_mem *clMemOut
#define clCallFunc &clCtx, &clPrg, &clKrn, &clQue, &clMemOut
void destroyGPU(clFuncArgs) {
fprintf(stderr, "Starting Cleanup ...\n\n");
if (*clMemOut) clReleaseMemObject(*clMemOut);
if (*clKrn) clReleaseKernel(*clKrn);
if (*clPrg) clReleaseProgram(*clPrg);
if (*clQue) clReleaseCommandQueue(*clQue);
if (*clCtx) clReleaseContext(*clCtx);
exit(0);
}
int initAllGPU(char fileName[], clFuncArgs) {
// -- generate kernel code
FILE *codefin = fopen(fileName, "r");
assert(codefin != NULL);
size_t clSrcLen = fread(clSrc, 1, 1024, codefin);
cl_int clStat;
cl_uint clPlatN, clGPUN;
cl_platform_id clPlatID;
cl_device_id clGPUID[MAXGPU];
const char *clSrcPtr = clSrc;
// -- basic OpenCL setup
clGetPlatformIDs(1, &clPlatID, &clPlatN);
clGetDeviceIDs(clPlatID, CL_DEVICE_TYPE_GPU, MAXGPU, clGPUID, NULL);
*clCtx = clCreateContext(NULL, 1, clGPUID, NULL, NULL, &clStat);
CheckFailAndExit(clStat);
*clQue = clCreateCommandQueue(*clCtx, clGPUID[0], 0, &clStat);
CheckFailAndExit(clStat);
*clPrg = clCreateProgramWithSource(*clCtx, 1, &clSrcPtr, &clSrcLen, &clStat);
CheckFailAndExit(clStat);
clStat = clBuildProgram(*clPrg, 1, clGPUID, NULL, NULL, NULL);
if (clStat != CL_SUCCESS) {
fprintf(stderr, "Error: Line %u in file %s\n\n", __LINE__, __FILE__);
size_t log_size;
clGetProgramBuildInfo(*clPrg, clGPUID[0],
CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
char *program_log = (char *) calloc(log_size+1, sizeof(char));
clGetProgramBuildInfo(*clPrg, clGPUID[0],
CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL);
printf("%s", program_log);
free(program_log);
CheckFailAndExit(CL_BUILD_PROGRAM_FAILURE);
}
*clKrn = clCreateKernel(*clPrg, clSrcMain, &clStat);
CheckFailAndExit(clStat);
// -- create all buffers
cl_mem_flags clOutBuffFlag = CL_MEM_WRITE_ONLY;
*clMemOut = clCreateBuffer(*clCtx, clOutBuffFlag, sizeof(uint32_t)*MAXN/GPULOCAL,
hostC, &clStat);
CheckFailAndExit(clStat);
return 1;
}
int executeGPU(clFuncArgs) {
uint32_t padding = 0;
while (N%GPULOCAL) {
padding += encrypt(N, keyA) * encrypt(N, keyB);
N++;
}
cl_int clStat;
size_t globalOffset[] = {0};
size_t globalSize[] = {N};
size_t localSize[] = {GPULOCAL};
// -- set argument to kernel
clStat = clSetKernelArg(*clKrn, 0, sizeof(cl_uint), (void *) &keyA);
CheckFailAndExit(clStat);
clStat = clSetKernelArg(*clKrn, 1, sizeof(cl_uint), (void *) &keyB);
CheckFailAndExit(clStat);
clStat = clSetKernelArg(*clKrn, 2, sizeof(cl_mem), (void *) clMemOut);
CheckFailAndExit(clStat);
// -- execute
clStat = clEnqueueNDRangeKernel(*clQue, *clKrn, 1, globalOffset,
globalSize, localSize, 0, NULL, NULL);
CheckFailAndExit(clStat);
// -- read back
clEnqueueReadBuffer(*clQue, *clMemOut, CL_TRUE, 0, sizeof(uint32_t)*N/GPULOCAL,
hostC, 0, NULL, NULL);
uint32_t sum = 0;
omp_set_num_threads(4);
#pragma omp parallel for reduction(+: sum)
for (int i = 0; i < N/GPULOCAL; i++)
sum += hostC[i];
printf("%u\n", sum - padding);
return 1;
}
int readIn() {
int has = 0;
if (scanf("%d %u %u", &N, &keyA, &keyB) != 3)
return 0;
return 1;
}
void onStart() {
initAllGPU("vecdot.cl", clCallFunc);
while (readIn())
executeGPU(clCallFunc);
destroyGPU(clCallFunc);
}
void sigHandler(int signo) {
printf("God Bless Me\n");
destroyGPU(clCallFunc);
exit(0);
}
int main(int argc, char *argv[]) {
const char sigErr[] = "I can't catch signal.\n";
if (signal(SIGTRAP, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
if (signal(SIGSEGV, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
if (signal(SIGILL, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
if (signal(SIGFPE, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
if (signal(SIGKILL, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
if (signal(SIGINT, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
onStart();
return 0;
}

vecdot.cl

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
#define uint32_t unsigned int
inline uint32_t rotate_left(uint32_t x, uint32_t n) {
return (x << n) | (x >> (32-n));
}
inline uint32_t encrypt(uint32_t m, uint32_t key) {
return (rotate_left(m, key&31) + key)^key;
}
__kernel void vecdot(uint32_t keyA, uint32_t keyB, __global int* C) {
__local int buf[256];
int globalId = get_global_id(0);
int groupId = get_group_id(0);
int localId = get_local_id(0);
int localSz = get_local_size(0);
buf[localId] = encrypt(globalId, keyA) * encrypt(globalId, keyB);
barrier(CLK_LOCAL_MEM_FENCE);
for (int i = localSz>>1; i; i >>= 1) {
if (localId < i)
buf[localId] += buf[localId + i];
barrier(CLK_LOCAL_MEM_FENCE);
}
if (localId == 0)
C[groupId] = buf[0];
}