批改娘 10096. Fast Game of Life (OpenCL)

contents

  1. 1. 題目描述
  2. 2. 輸入格式
  3. 3. 輸出格式
  4. 4. 範例輸入 1
  5. 5. 範例輸出 1
  6. 6. 範例輸入 2
  7. 7. 範例輸出 2
  8. 8. 編譯參數
  9. 9. 備註
  10. 10. Solution
    1. 10.1. partial local memory
    2. 10.2. full local memory
    3. 10.3. 最終優化

題目描述

生命遊戲中,對於任意細胞,規則如下:
每個細胞有兩種狀態-存活或死亡,每個細胞與以自身為中心的周圍八格細胞產生互動。

  • 當前細胞為存活狀態時,當周圍低於 2 個 (不包含 2 個) 存活細胞時,該細胞變成死亡狀態。
  • 當前細胞為存活狀態時,當周圍有 2 個或 3 個存活細胞時, 該細胞保持原樣。
  • 當前細胞為存活狀態時,當周圍有 3 個以上的存活細胞時,該細胞變成死亡狀態。
  • 當前細胞為死亡狀態時,當周圍有 3 個存活細胞時,該細胞變成存活狀態。

可以把最初的細胞結構定義為種子,當所有在種子中的細胞同時被以上規則處理後,可以得到第一代細胞圖。按規則繼續處理當前的細胞圖,可以得到下一代的細胞圖,周而復始。

輸入格式

輸入第一行有兩個整數 $N$, $M$,表示盤面大小為 $N \times N$,模擬週期次數 $M$。接下來會有 $N$ 行,每一行上會有 $N$ 個字符,以 0 表示 $(i, j)$ 格子上的細胞屬於死亡狀態,反之 1 為存活狀態。

  • $1 \le N \le 2000$
  • $0 \le M \le 5000$

輸出格式

對於每一組測資輸出 $N$ 行,每一行上有 $N$ 個字元表示模擬 $M$ 次的最終盤面結果。

範例輸入 1

1
2
3
4
5
6
5 1
10001
00100
01110
00100
01010

範例輸出 1

1
2
3
4
5
00000
00100
01010
00000
00100

範例輸入 2

1
2
3
4
5
6
5 3
10001
00100
01110
00100
01010

範例輸出 2

1
2
3
4
5
00000
00000
01110
00000
00000

編譯參數

1
2
$ gcc -std=c99 -O2 main.c -lOpenCL -fopenmp -o main
$ ./main

備註

  • 2016/05/07 放寬時間限制,請減少 clCreateBuffer 數量並重複使用那些已經建立好的。
  • 2016/05/09 提供測資下載

by Morris

Solution

簡單的模擬題目,平行化只需要套用滾動數組即可。

當我們拚命優化 local memory 存取,卻在替同學 debug 時發現意外地加速,於是新境界到來,順便跟同學交流一下加速部份,甚至連開檔時間都要省!一起追尋神乎其技的感覺非常不賴。

3571 ms (24-core CPU) -> 2567 ms (GPU, partial local memory) -> 2472 ms (GPU, full local memory) -> 1675 ms (GPU, full local memory + work group opt) -> 967 ms (GPU, global memory + I/O opt + embedded kernel code)

partial local memory

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#define N %d
#define binN %d
#define CTYPE char
__kernel void simulate(__global CTYPE *IN,
__global CTYPE *OUT) {
int x = get_global_id(0);
int y = get_global_id(1);
int localX = get_local_id(0);
int localY = get_local_id(1);
int localSz = get_local_size(0);
__local char g[16][16];
const int dx[] = {-1, -1, -1, 0, 0, 1, 1, 1};
const int dy[] = {-1, 0, 1, -1, 1, -1, 0, 1};
char t = IN[x * binN + y];
g[localX][localY] = t;
barrier(CLK_LOCAL_MEM_FENCE);
int adj = 0;
for (int i = 0; i < 8; i++) {
int cx = localX + dx[i];
int cy = localY + dy[i];
int tx = x + dx[i];
int ty = y + dy[i];
if (tx < 0 || ty < 0 || tx >= N || ty >= N)
continue;
if (cx >= 0 && cx < localSz && cy >= 0 && cy < localSz) {
adj += g[cx][cy];
} else {
adj += IN[tx * binN + ty];
}
}
OUT[x * binN + y] = (t == 0 && adj == 3) || (t == 1 && (adj == 2 || adj == 3));
}

full local memory

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#define N %d
#define binN %d
#define localN %d
#define CTYPE char
inline void move_border(__local char g[][localN+2], __global CTYPE *IN,
int localX, int localY, int localSz, int x, int y) {
if (localX == 1) {
g[localX-1][localY] = IN[(x-1) * binN + y];
if (localY == 1)
g[localX-1][localY-1] = IN[(x-1) * binN + (y-1)];
if (localY == localSz)
g[localX-1][localY+1] = IN[(x-1) * binN + (y+1)];
}
if (localY == 1) g[localX][localY-1] = IN[x * binN + (y-1)];
if (localY == localSz) g[localX][localY+1] = IN[x * binN + (y+1)];
if (localX == localSz) {
g[localX+1][localY] = IN[(x+1) * binN + y];
if (localY == 1)
g[localX+1][localY-1] = IN[(x+1) * binN + (y-1)];
if (localY == localSz)
g[localX+1][localY+1] = IN[(x+1) * binN + (y+1)];
}
}
__kernel void simulate(__global CTYPE *IN,
__global CTYPE *OUT) {
int x = get_global_id(0)+1;
int y = get_global_id(1)+1;
int localX = get_local_id(0)+1;
int localY = get_local_id(1)+1;
int localSz = get_local_size(0);
__local char g[localN+2][localN+2];
const int dx[] = {-1, -1, -1, 0, 0, 1, 1, 1};
const int dy[] = {-1, 0, 1, -1, 1, -1, 0, 1};
// move itself to local
char t = IN[x * binN + y];
g[localX][localY] = t;
// move border to local
move_border(g, IN, localX, localY, localSz, x, y);
barrier(CLK_LOCAL_MEM_FENCE);
if (x > N || y > N) return ;
int adj = 0;
for (int i = 0; i < 8; i++) {
int cx = localX + dx[i];
int cy = localY + dy[i];
adj += g[cx][cy];
}
OUT[x * binN + y] = (t == 0 && adj == 3) || (t == 1 && (adj == 2 || adj == 3));
}

最終優化

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
#include <stdio.h>
#include <assert.h>
#include <inttypes.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <CL/cl.h>
#include <omp.h>
#define OPENCL_MAXGPU 2
#define KERNEL_CODE_LEN 32767
#define MAXN 2048
#define MAXM 2
char hostMtx[2][MAXN*MAXN];
int N, M, binN;
// -- start working with OpenCL
const int clNeedDevCnt = 1;
#define CheckFailAndExit(status) \
if (status != CL_SUCCESS) { \
fprintf(stderr, "Error %d: Line %u in file %s\n", status, __LINE__, __FILE__), \
destroyGPU(clCtx, clPrg, clKrn, clQue, clMemIn); \
}
#define clFuncArgs cl_context clCtx[], cl_program clPrg[], cl_kernel clKrn[], \
cl_command_queue clQue[], cl_mem clMemIn[][MAXM]
#define clCallFunc clCtx, clPrg, clKrn, clQue, clMemIn
#define clCallFuncOuter clCtx, clPrg, clKrn, clQue, clMemIn
void destroyGPU(clFuncArgs) {
fprintf(stderr, "Starting Cleanup ...\n\n");
for (int i = 0; i < clNeedDevCnt; i++) {
for (int j = 0; j < M; j++) {
if (clMemIn[i][j])
clReleaseMemObject(clMemIn[i][j]);
}
}
for (int i = 0; i < clNeedDevCnt; i++) {
if (clKrn[i])
clReleaseKernel(clKrn[i]);
if (clPrg[i])
clReleaseProgram(clPrg[i]);
}
for (int i = 0; i < clNeedDevCnt; i++) {
if (clQue[i])
clReleaseCommandQueue(clQue[i]);
}
for (int i = 0; i < clNeedDevCnt; i++) {
if (clCtx[i])
clReleaseContext(clCtx[i]);
}
exit(0);
}
int initAllGPU(char fileName[], clFuncArgs) {
static char clSrcFormat[KERNEL_CODE_LEN] =
"#define N %d\n"
"#define M %d\n"
"#define CTYPE char\n"
"__kernel void simulate(__global CTYPE *IN,\n"
" __global CTYPE *OUT) {\n"
" int id = get_global_id(0);\n"
" int x = id / M+1, y = id % M +1;\n"
"#define G(x, y) IN[(x) * N + (y)]\n"
" char t = G(x, y);\n"
" char adj = G(x-1, y-1) + G(x-1, y) + G(x-1, y+1) + G(x, y-1) + G(x, y+1)\n"
" + G(x+1, y-1) + G(x+1, y) + G(x+1, y+1);\n"
" OUT[x * N + y] = (t == 0 && adj == 3) || (t == 1 && (adj == 2 || adj == 3));\n"
"}";
static char clSrc[KERNEL_CODE_LEN] = "";
// -- generate kernel code
// FILE *codefin = fopen(fileName, "r");
// assert(codefin != NULL);
// assert(fread(clSrcFormat, 1, KERNEL_CODE_LEN, codefin) < KERNEL_CODE_LEN);
sprintf(clSrc, clSrcFormat, N+2, N);
size_t clSrcLen = strlen(clSrc);
// fclose(codefin);
cl_int clStat;
cl_uint clPlatN, clGPUN, clDevN;
cl_platform_id clPlatID;
cl_device_id clGPUID[OPENCL_MAXGPU];
const char *clSrcPtr = clSrc;
// -- basic OpenCL setup
clGetPlatformIDs(1, &clPlatID, &clPlatN);
clGetDeviceIDs(clPlatID, CL_DEVICE_TYPE_GPU, OPENCL_MAXGPU, clGPUID, &clDevN);
assert(clDevN >= clNeedDevCnt);
for (int i = 0; i < clNeedDevCnt; i++) {
clCtx[i] = clCreateContext(NULL, 1, clGPUID+i, NULL, NULL, &clStat);
CheckFailAndExit(clStat);
}
for (int i = 0; i < clNeedDevCnt; i++) {
clQue[i] = clCreateCommandQueue(clCtx[i], clGPUID[i],
0, &clStat);
CheckFailAndExit(clStat);
}
for (int i = 0; i < clNeedDevCnt; i++) {
clPrg[i] = clCreateProgramWithSource(clCtx[i], 1, &clSrcPtr, &clSrcLen, &clStat);
CheckFailAndExit(clStat);
clStat = clBuildProgram(clPrg[i], 1, clGPUID+i, "-cl-fast-relaxed-math", NULL, NULL);
if (clStat != CL_SUCCESS) {
fprintf(stderr, "Error: Line %u in file %s\n\n", __LINE__, __FILE__);
size_t log_size;
clGetProgramBuildInfo(*clPrg, clGPUID[0],
CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
char *program_log = (char *) calloc(log_size+1, sizeof(char));
clGetProgramBuildInfo(*clPrg, clGPUID[0],
CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL);
printf("%s", program_log);
free(program_log);
CheckFailAndExit(CL_BUILD_PROGRAM_FAILURE);
}
clKrn[i] = clCreateKernel(clPrg[i], "simulate", &clStat);
CheckFailAndExit(clStat);
}
// -- create all buffers
cl_mem_flags clInBuffFlag = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR;
for (int d = 0; d < clNeedDevCnt; d++) {
for (int i = 0; i < 2; i++) {
clMemIn[d][i] = clCreateBuffer(clCtx[d], clInBuffFlag,
sizeof(char)*binN*binN, hostMtx[i], &clStat);
CheckFailAndExit(clStat);
}
}
return 1;
}
int executeGPU(clFuncArgs) {
cl_int clStat;
size_t globalOffset[] = {0};
size_t globalSize[] = {N*N};
int flag = 0;
for (int it = 0; it < M; it++) {
// -- set argument to kernel
clStat = clSetKernelArg(clKrn[0], 0, sizeof(cl_mem), &clMemIn[0][flag]);
CheckFailAndExit(clStat);
clStat = clSetKernelArg(clKrn[0], 1, sizeof(cl_mem), &clMemIn[0][!flag]);
CheckFailAndExit(clStat);
// -- execute
clStat = clEnqueueNDRangeKernel(clQue[0], clKrn[0], 1, globalOffset,
globalSize, 0, 0, NULL, NULL);
CheckFailAndExit(clStat);
flag = !flag;
}
// -- read back
clStat = clEnqueueReadBuffer(clQue[0], clMemIn[0][flag], CL_TRUE, 0,
sizeof(char)*binN*binN, hostMtx[flag], 0, NULL, NULL);
for (int i = 1; i <= N; i++) {
for (int j = 1; j <= N; j++)
hostMtx[flag][i*binN+j] += '0';
puts(hostMtx[flag]+i*binN+1);
}
return 1;
}
void onStart(clFuncArgs) {
assert(scanf("%d %d", &N, &M) == 2);
while (getchar() != '\n');
static char str[2048][2048];
for (int i = 1; i <= N; i++)
assert(fgets(str[i]+1, 2048, stdin) != NULL);
binN = N+2;
for (int i = 1; i <= N; i++) {
for (int j = 1; j <= N; j++)
hostMtx[0][i*binN + j] = str[i][j] - '0';
}
initAllGPU("game-of-life.cl", clCallFunc);
executeGPU(clCallFunc);
return ;
}
cl_context clCtx[OPENCL_MAXGPU];
cl_program clPrg[OPENCL_MAXGPU];
cl_kernel clKrn[OPENCL_MAXGPU];
cl_command_queue clQue[OPENCL_MAXGPU];
cl_mem clMemIn[OPENCL_MAXGPU][MAXM];
void sigHandler(int signo) {
printf("God Bless Me\n");
destroyGPU(clCallFuncOuter);
exit(0);
}
int main(int argc, char *argv[]) {
const char sigErr[] = "I can't catch signal.\n";
if (signal(SIGTRAP, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
if (signal(SIGSEGV, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
if (signal(SIGILL, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
if (signal(SIGFPE, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
if (signal(SIGKILL, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
if (signal(SIGINT, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
onStart(clCallFuncOuter);
return 0;
}