批改娘 10091. Fast Matrix Multiplication (OpenCL)

contents

  1. Problem Description
    1.1. sequence.c
  2. Input Format
  3. Output Format
  4. Sample Input
  5. Sample Output
  6. Compilation Flags
  7. Solution
    7.1. main.c
    7.2. matrixmul.cl

Problem Description

Compute the product $C = A \times B$ of two $N \times N$ matrices $A, \; B$. To save input/output time, the matrices are generated from random seeds; see the reference code below and rewrite it as an OpenCL version for speedup.

Profiling can be done through the NVIDIA Visual Profiler (GUI); for a remote session, connect with ssh -X username@host. Downloads: nvprof.sh, nvvp.cfg.

sequence.c

#include <stdio.h>
#include <stdint.h>
// #define DEBUG
#define UINT uint32_t
#define MAXN 1024
void multiply(int N, UINT A[][MAXN], UINT B[][MAXN], UINT C[][MAXN]) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            UINT sum = 0; // overflow, let it go.
            for (int k = 0; k < N; k++)
                sum += A[i][k] * B[k][j];
            C[i][j] = sum;
        }
    }
}
void rand_gen(UINT c, int N, UINT A[][MAXN]) {
    UINT x = 2, n = N*N;
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            x = (x * x + c + i + j) % n;
            A[i][j] = x;
        }
    }
}
void print_matrix(int N, UINT A[][MAXN]) {
    for (int i = 0; i < N; i++) {
        fprintf(stderr, "[");
        for (int j = 0; j < N; j++)
            fprintf(stderr, " %u", A[i][j]);
        fprintf(stderr, " ]\n");
    }
}
UINT signature(int N, UINT A[][MAXN]) {
    UINT h = 0;
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++)
            h = (h + A[i][j]) * 2654435761LU;
    }
    return h;
}
UINT A[MAXN][MAXN], B[MAXN][MAXN], C[MAXN][MAXN];
int main() {
    int N;
    uint32_t S1, S2;
    scanf("%d %u %u", &N, &S1, &S2);
    rand_gen(S1, N, A);
    rand_gen(S2, N, B);
    multiply(N, A, B, C);
#ifdef DEBUG
    print_matrix(N, A);
    print_matrix(N, B);
    print_matrix(N, C);
#endif
    printf("%u\n", signature(N, C));
    return 0;
}

Input Format

There is a single test case containing three integers $N, S_1, S_2$: the dimension of the $N \times N$ matrices and the random seeds used to generate matrices $A$ and $B$.

  • $64 \le N \le 1024$, and $N \equiv 0 \pmod{64}$ is guaranteed
  • $0 \le S_1, \; S_2 < 2^{31}$
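
(For reference, rand_gen() in sequence.c fills each matrix in row-major order with $x \leftarrow (x^2 + c + i + j) \bmod N^2$, starting from $x = 2$, where $c$ is the corresponding seed.)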

Output Format

Output a single line with the hash value $H$; see sequence.c for the exact procedure.
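
Concretely, signature() starts from $h = 0$, walks $C$ in row-major order, and applies $h \leftarrow (h + C_{i,j}) \cdot 2654435761 \bmod 2^{32}$; the modulus is implicit in the 32-bit unsigned arithmetic.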

Sample Input

64 1 2

Sample Output

3376147904

Compilation Flags

gcc -std=c99 -O2 main.c -lm -lOpenCL -fopenmp -I/usr/include/CL

Solution

For multiplying two $N \times N$ matrices, the approaches commonly found online partition the output matrix directly, and they mostly fall into three patterns:

  • Each thread computes a single dot product, so $N^2$ mutually independent threads are needed; a minimal sketch follows this list. Note that in GPU programs the index of a single dimension cannot exceed 65535, but splitting the work across two dimensions removes that cap.
  • Each thread still computes a single dot product, but several threads are assigned to the same block and cooperate to stage data from global memory into on-chip local/shared memory. Note that shared memory is on-chip and typically very small; even though data reuse speeds up access, its limited size puts a ceiling on the attainable speedup.
  • Each thread computes all the values of one or several rows.

All of these involve warp scheduling and memory coalescing: a theoretical analysis may show high parallelism, yet actual performance still depends on the warp size and on how much branching occurs. The program below follows the third approach.
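
For contrast with the row-based solution below, here is a minimal sketch of the first approach; the kernel name matrixMulNaive and the explicit n argument are illustrative, not part of the code that follows. It is launched with a 2-D global size of $\{N, N\}$, one work-item per output element:

__kernel void matrixMulNaive(__global const unsigned int *A,
                             __global const unsigned int *B,
                             __global unsigned int *C,
                             const int n) {
    int i = get_global_id(0);   // row of C
    int j = get_global_id(1);   // column of C
    unsigned int sum = 0;       // overflow is fine, as in sequence.c
    for (int k = 0; k < n; k++)
        sum += A[i * n + k] * B[k * n + j];
    C[i * n + j] = sum;
}

Each work-item issues $2N$ global loads with no data reuse, which is exactly what the second and third approaches try to avoid.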

main.c

#include <stdio.h>
#include <assert.h>
#include <inttypes.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <CL/cl.h>
#define MAXGPU 1
#define MAXN 1024
uint32_t hostA[MAXN*MAXN], hostB[MAXN*MAXN], hostC[MAXN*MAXN];
int N = MAXN;
char clSrcFormat[1024];               // raw kernel template (assumed to fit in 1 KiB)
char clSrc[1024] = "";                // kernel source after substituting N
char clSrcMain[1024] = "matrixMul";
// -- start working with OpenCL
cl_context clCtx;
cl_program clPrg;
cl_kernel clKrn;
cl_command_queue clQue;
cl_mem clMemIn1, clMemIn2, clMemOut;
// Inside initAllGPU/executeGPU the parameter names shadow the globals above,
// so this macro passes the caller's pointers straight through.
#define CheckFailAndExit(state) \
    if (state != CL_SUCCESS) { \
        printf("Error: Line %u in file %s\n\n", __LINE__, __FILE__), \
        destroyGPU(clCtx, clPrg, clKrn, clQue, clMemIn1, clMemIn2, clMemOut); \
    }
#define clFuncArgs cl_context *clCtx, cl_program *clPrg, cl_kernel *clKrn, \
    cl_command_queue *clQue, cl_mem *clMemIn1, cl_mem *clMemIn2, \
    cl_mem *clMemOut
#define clCallFunc &clCtx, &clPrg, &clKrn, &clQue, &clMemIn1, &clMemIn2, &clMemOut
void destroyGPU(clFuncArgs) {
    fprintf(stderr, "Starting Cleanup ...\n\n");
    if (*clMemOut) clReleaseMemObject(*clMemOut);
    if (*clMemIn2) clReleaseMemObject(*clMemIn2);
    if (*clMemIn1) clReleaseMemObject(*clMemIn1);
    if (*clKrn) clReleaseKernel(*clKrn);
    if (*clPrg) clReleaseProgram(*clPrg);
    if (*clQue) clReleaseCommandQueue(*clQue);
    if (*clCtx) clReleaseContext(*clCtx);
    exit(0);
}
int initAllGPU(char fileName[], clFuncArgs) {
    // -- read the kernel template and substitute N for its %d placeholder
    FILE *codefin = fopen(fileName, "r");
    assert(codefin != NULL);
    fread(clSrcFormat, 1, 1024, codefin);
    sprintf(clSrc, clSrcFormat, N);
    size_t clSrcLen = strlen(clSrc);  // length of the formatted source, not the template
    fclose(codefin);
    cl_int clStat;
    cl_uint clPlatN, clGPUN;
    cl_platform_id clPlatID;
    cl_device_id clGPUID[MAXGPU];
    const char *clSrcPtr = clSrc;
    // -- basic OpenCL setup
    clGetPlatformIDs(1, &clPlatID, &clPlatN);
    clGetDeviceIDs(clPlatID, CL_DEVICE_TYPE_GPU, MAXGPU, clGPUID, &clGPUN);
    *clCtx = clCreateContext(NULL, 1, clGPUID, NULL, NULL, &clStat);
    CheckFailAndExit(clStat);
    *clQue = clCreateCommandQueue(*clCtx, clGPUID[0], 0, &clStat);
    CheckFailAndExit(clStat);
    *clPrg = clCreateProgramWithSource(*clCtx, 1, &clSrcPtr, &clSrcLen, &clStat);
    CheckFailAndExit(clStat);
    clStat = clBuildProgram(*clPrg, 1, clGPUID, NULL, NULL, NULL);
    if (clStat != CL_SUCCESS) {
        // -- dump the build log before cleaning up
        printf("Error in clBuildProgram, Line %u in file %s\n\n", __LINE__, __FILE__);
        size_t log_size;
        clGetProgramBuildInfo(*clPrg, clGPUID[0], CL_PROGRAM_BUILD_STATUS,
                              sizeof(cl_build_status), &clStat, NULL);
        clGetProgramBuildInfo(*clPrg, clGPUID[0],
                              CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
        char *program_log = (char *) calloc(log_size+1, sizeof(char));
        clGetProgramBuildInfo(*clPrg, clGPUID[0],
                              CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL);
        printf("%s", program_log);
        free(program_log);
        CheckFailAndExit(CL_BUILD_PROGRAM_FAILURE);
    }
    *clKrn = clCreateKernel(*clPrg, clSrcMain, &clStat);
    CheckFailAndExit(clStat);
    // -- create all buffers
    cl_mem_flags clInBuffFlag = CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR;
    cl_mem_flags clOutBuffFlag = CL_MEM_WRITE_ONLY;
    *clMemIn1 = clCreateBuffer(*clCtx, clInBuffFlag, sizeof(uint32_t)*N*N,
                               hostA, &clStat);
    CheckFailAndExit(clStat);
    *clMemIn2 = clCreateBuffer(*clCtx, clInBuffFlag, sizeof(uint32_t)*N*N,
                               hostB, &clStat);
    CheckFailAndExit(clStat);
    // no host pointer here: passing one without CL_MEM_USE_HOST_PTR or
    // CL_MEM_COPY_HOST_PTR yields CL_INVALID_HOST_PTR; results come back
    // through clEnqueueReadBuffer instead
    *clMemOut = clCreateBuffer(*clCtx, clOutBuffFlag, sizeof(uint32_t)*N*N,
                               NULL, &clStat);
    CheckFailAndExit(clStat);
    // -- set kernel arguments
    clStat = clSetKernelArg(*clKrn, 0, sizeof(cl_mem), (void *) clMemIn1);
    CheckFailAndExit(clStat);
    clStat = clSetKernelArg(*clKrn, 1, sizeof(cl_mem), (void *) clMemIn2);
    CheckFailAndExit(clStat);
    clStat = clSetKernelArg(*clKrn, 2, sizeof(cl_mem), (void *) clMemOut);
    CheckFailAndExit(clStat);
    return 1;
}
int min(int x, int y) {
    return x < y ? x : y;
}
int executeGPU(clFuncArgs) {
    cl_int clStat;
    size_t globalOffset[] = {0, 0};
    size_t globalSize[] = {N};          // one work-item per row of C
    size_t localSize[] = {min(N, 64)};
    clStat = clEnqueueNDRangeKernel(*clQue, *clKrn, 1, globalOffset,
                                    globalSize, localSize, 0, NULL, NULL);
    CheckFailAndExit(clStat);
    clFinish(*clQue);
    // -- read back
    clEnqueueReadBuffer(*clQue, *clMemOut, CL_TRUE, 0, sizeof(uint32_t)*N*N,
                        hostC, 0, NULL, NULL);
    return 1;
}
void readIn() {
    uint32_t c1, c2;
    assert(scanf("%d %u %u", &N, &c1, &c2) == 3);
    uint32_t x, n = N*N;
    x = 2;
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            // same recurrence as sequence.c; a bitmask &(n-1) would only
            // match this when N*N happens to be a power of two
            x = (x * x + c1 + i + j) % n;
            hostA[i*N+j] = x;
        }
    }
    x = 2;
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            x = (x * x + c2 + i + j) % n;
            hostB[i*N+j] = x;
        }
    }
}
void writeOut() {
    uint32_t h = 0;
    uint32_t *Cend = hostC + N*N, *C = hostC;
    for (; C != Cend; C++)
        h = (h + *C) * 2654435761LU;
    printf("%u\n", h);
}
void onStart() {
    readIn();
    initAllGPU("matrixmul.cl", clCallFunc);
    executeGPU(clCallFunc);
    writeOut();
    destroyGPU(clCallFunc);
}
void sigHandler(int signo) {
    printf("God Bless Me\n");
    destroyGPU(clCallFunc);
    exit(0);
}
int main(int argc, char *argv[]) {
    const char sigErr[] = "I can't catch signal.\n";
    if (signal(SIGTRAP, sigHandler) == SIG_ERR)
        fprintf(stderr, sigErr);
    if (signal(SIGSEGV, sigHandler) == SIG_ERR)
        fprintf(stderr, sigErr);
    if (signal(SIGILL, sigHandler) == SIG_ERR)
        fprintf(stderr, sigErr);
    if (signal(SIGFPE, sigHandler) == SIG_ERR)
        fprintf(stderr, sigErr);
    if (signal(SIGKILL, sigHandler) == SIG_ERR)  // SIGKILL can never be caught, so this always reports failure
        fprintf(stderr, sigErr);
    if (signal(SIGINT, sigHandler) == SIG_ERR)
        fprintf(stderr, sigErr);
    onStart();
    return 0;
}

matrixmul.cl

Some readers may wonder why the cooperative copy of an array in global memory uses the pattern for (int cr = localID; cr < N; cr += localSz). The reason is memory coalescing: when a warp executes a load, access works best when its threads touch consecutive addresses, so the contiguous chunk of memory that one coalesced transaction brings in gets fully used. If instead each thread read its own contiguous segment, the addresses issued together would be scattered; with the strided pattern, the reads look contiguous from the warp scheduler's point of view.
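
As a sketch of the difference (src here stands in for any array in global memory; chunk is illustrative), compare the addresses the work-items of one warp issue in a single step:

// strided, coalesced: in step 0, thread 0 touches src[0], thread 1 touches src[1], ...
for (int cr = localID; cr < N; cr += localSz)
    cbuf[cr] = src[cr];

// per-thread chunks, uncoalesced: in step 0, thread 0 touches src[0],
// thread 1 touches src[chunk], thread 2 touches src[2*chunk], ...
int chunk = N / localSz;
for (int i = 0; i < chunk; i++)
    cbuf[localID * chunk + i] = src[localID * chunk + i];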

#define N %d                    // placeholder: the host substitutes N via sprintf
#define CTYPE unsigned int
#define UNLOOP 8
__kernel void matrixMul(__global CTYPE *in1,
                        __global CTYPE *in2,
                        __global CTYPE *out) {
    CTYPE rbuf[N];              // private copy of this work-item's row of A
    int r = get_global_id(0);
    int localID = get_local_id(0);
    int localSz = get_local_size(0);
    __local CTYPE cbuf[N];      // one column of B, staged by the whole work-group
    for (int i = 0; i < N; i++)
        rbuf[i] = in1[r * N + i];
    for (int c = 0; c < N; c++) {
        // cooperative, strided load of column c (coalesced within a warp)
        for (int cr = localID; cr < N; cr += localSz)
            cbuf[cr] = in2[cr * N + c];
        barrier(CLK_LOCAL_MEM_FENCE);   // all loads must finish before any read
        CTYPE sum = 0;
        // manual 8-way unroll; N is a multiple of 64, so every k is covered
        for (int k = 0; k+UNLOOP-1 < N; k += UNLOOP) {
            sum += rbuf[k+0] * cbuf[k+0];
            sum += rbuf[k+1] * cbuf[k+1];
            sum += rbuf[k+2] * cbuf[k+2];
            sum += rbuf[k+3] * cbuf[k+3];
            sum += rbuf[k+4] * cbuf[k+4];
            sum += rbuf[k+5] * cbuf[k+5];
            sum += rbuf[k+6] * cbuf[k+6];
            sum += rbuf[k+7] * cbuf[k+7];
        }
        out[r * N + c] = sum;
        barrier(CLK_LOCAL_MEM_FENCE);   // keep fast work-items from overwriting
                                        // cbuf for column c+1 while others read
    }
}
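
Both barriers in the column loop matter: the first guarantees every element of cbuf has been written before any work-item reads it, and the second keeps a work-item that has finished column c from overwriting cbuf with column c+1 data while slower members of the group are still reading. Note also that rbuf is private memory, so at $N = 1024$ each work-item carries 4 KB of row data, which may spill out of registers on some devices.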