批改娘 10113. Longest Common Subsequence II (CUDA)

Problem Description

Given two strings $X$ and $Y$, the longest sequence that appears in both strings as a subsequence is their longest common subsequence (LCS).

Input Format

There are multiple test cases. Each test case consists of two lines with the strings $X$ and $Y$, which are composed only of the four letters A, T, C, G.

  • $1 \le |X|, |Y| \le 60000$

Output Format

For each test case, output one line with the length of the LCS of $X$ and $Y$, followed by one line with any one such LCS.

Sample Input

TCA
GTA
TGGAC
TATCT

Sample Output

2
TA
3
TAC

Compilation

$ nvcc -Xcompiler "-O2 -fopenmp" main.cu -o main

Solution

The code below is the older design; the CUDA version could be redesigned starting from the OpenMP solution.

The old design picks among three modes (sequential CPU, parallel CPU, and GPU) using thresholds. Adding a direct $O(N^2)$ table with backtracking for sufficiently small inputs, plus a tuned threshold formula for the OpenMP dispatch, speeds things up another two to three times. Overall it is still not necessarily faster than pure OpenMP, because of the GPU start-up overhead; the difference only shows once there are enough test cases.

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <algorithm>
#include <omp.h>
using namespace std;
#define MAXN 60005
#define CHARSET 4
#define max(x, y) ((x) > (y) ? (x) : (y)) /* fully parenthesized so uses like max(na, nb) < 2048 parse as intended */
typedef unsigned short uint16;
static char A[MAXN], B[MAXN];
static int c2i[128];
static char i2c[CHARSET+1] = "ACGT";
static int *cuP;
static char *cuB;
static uint16 *cuDP;
__global__ void prepare(int *P, char *B, int nb) {
int i = threadIdx.x; // one thread per alphabet letter: P[i][j] = last position <= j where B matches that letter, else 0
int *p = P + i*MAXN;
char c = "ACGT"[i];
p[0] = 0;
for (int j = 1; j <= nb; j++)
p[j] = (B[j] == c) ? j : p[j-1];
}
__global__ void run(int nb, uint16 *dpIn, uint16 *dpOut, int *P) {
int j = blockDim.x*blockIdx.x+threadIdx.x+1; // one thread per DP column
if (j > nb) return ;
int pos = P[j];
uint16 t1 = pos ? dpIn[pos-1]+1 : 0;
uint16 t2 = dpIn[j];
dpOut[j] = t1 > t2 ? t1 : t2;
}
int lcs_len_seq(const char *A, int na, const char *B, int nb, uint16 dpf[]) {
static uint16 dp[2][MAXN];
memset(dp[0], 0, sizeof(uint16)*(nb+1));
dp[1][0] = 0, dpf[0] = 0;
for (int i = 1; i <= na; i++) {
for (int j = 1; j <= nb; j++) {
if (A[i-1] == B[j-1])
dp[1][j] = dp[0][j-1]+1;
else
dp[1][j] = max(dp[1][j-1], dp[0][j]);
}
memcpy(dp[0], dp[1], sizeof(uint16)*(nb+1));
}
for (int i = 0; i <= nb; i++)
dpf[i] = dp[0][i];
return dpf[nb];
}
int lcs_len_omp(const char *A, int na, const char *B, int nb, uint16 dpf[]) {
static int P[CHARSET][MAXN];
static uint16 dp[2][MAXN];
A--, B--;
#pragma omp parallel for
for (int i = 0; i < CHARSET; i++) {
memset(P[i], 0, sizeof(int)*(nb+1));
for (int j = 1; j <= nb; j++)
P[i][j] = (B[j] == i2c[i])? j : P[i][j-1];
}
for (int i = 0; i < 2; i++)
memset(dp[i], 0, sizeof(uint16)*(nb+1));
#pragma omp parallel
for (int i = 1; i <= na; i++) {
int *Pv = P[c2i[A[i]]];
#pragma omp for
for (int j = 1; j <= nb; j++) {
int last_match = Pv[j];
uint16 tmp = last_match ? dp[i&1^1][last_match-1]+1 : 0;
dp[i&1][j] = max(dp[i&1^1][j], tmp);
}
}
for (int i = 0; i <= nb; i++)
dpf[i] = dp[na&1][i];
return dpf[nb];
}
int lcs_len(const char *A, int na, const char *B, int nb, uint16 dpf[]) {
if (max(na, nb) < 2048)
return lcs_len_seq(A, na, B, nb, dpf);
if (nb < 10000)
return lcs_len_omp(A, na, B, nb, dpf);
B--;
cudaMemcpy(cuB, B, sizeof(char)*(nb+1), cudaMemcpyHostToDevice);
cudaMemset(cuDP, 0, sizeof(uint16)*(nb+1));
cudaMemset(cuDP+MAXN, 0, sizeof(uint16)*(nb+1));
int bsz = 1024;
dim3 bb(bsz);
dim3 gg((nb+bsz-1)/bsz);
prepare<<<1, 4>>>(cuP, cuB, nb);
for (int i = 0; i < na; i++) {
int v = c2i[A[i]];
run<<<gg, bb>>>(nb, cuDP+(i&1)*MAXN, cuDP+((i&1)^1)*MAXN, cuP+v*MAXN);
}
cudaMemcpy(dpf, cuDP+(na&1)*MAXN, sizeof(uint16)*(nb+1), cudaMemcpyDeviceToHost);
return dpf[nb];
}
char* alloc_str(int sz) {
return (char *) calloc(sz, sizeof(char));
}
char* substr(const char *s, int pos, int len) {
char *t = alloc_str(len+1);
memcpy(t, s+pos, len);
return t;
}
char* cat(const char *sa, const char *sb) {
int na = strlen(sa), nb = strlen(sb);
char *t = alloc_str(na + nb + 1);
memcpy(t, sa, na);
memcpy(t+na, sb, nb);
return t;
}
char* reverse(const char *s, int len) {
char *t = substr(s, 0, len);
char *l = t, *r = t + len - 1;
char tmp;
while (l < r) {
tmp = *l, *l = *r, *r = tmp;
l++, r--;
}
return t;
}
char* find_lcs(const char *a, int na, const char *b, int nb) {
if (na > nb) {
const char *c; int t;
c = a, a = b, b = c;
t = na, na = nb, nb = t;
}
if (na == 0)
return alloc_str(1);
if (na == 1) {
for (int i = 0; i < nb; i++) {
if (a[0] == b[i])
return substr(a, 0, 1);
}
return alloc_str(1);
}
static uint16 t1[MAXN];
static uint16 t2[MAXN];
int len = lcs_len(a, na, b, nb, t1);
if (len == 0)
return alloc_str(1);
int half_len = na / 2;
char *la = substr(a, 0, half_len);
char *ra = substr(a, half_len, na - half_len);
char *tb = reverse(b, nb);
char *ta = reverse(ra, na - half_len);
lcs_len(la, half_len, b, nb, t1);
lcs_len(ta, na - half_len, tb, nb, t2);
int split = -1;
for (int i = 0; i <= nb; i++) {
if (t1[i] + t2[nb-i] == len) {
split = i;
break;
}
}
char *lb = substr(b, 0, split);
char *rb = substr(b, split, nb - split);
char *sl = find_lcs(la, half_len, lb, split);
char *sr = find_lcs(ra, na - half_len, rb, nb - split);
char *ret = cat(sl, sr);
free(la), free(ra), free(ta);
free(lb), free(rb), free(tb);
free(sl), free(sr);
return ret;
}
int main() {
for (int i = 0; i < CHARSET; i++)
c2i[i2c[i]] = i;
cudaMalloc(&cuB, sizeof(char)*MAXN);
cudaMalloc(&cuP, sizeof(int)*MAXN*4);
cudaMalloc(&cuDP, sizeof(uint16)*MAXN*2);
static uint16 dpf[MAXN];
while (scanf("%s %s", A, B) == 2) {
int na = strlen(A);
int nb = strlen(B);
int len = lcs_len(A, na, B, nb, dpf);
char *str = find_lcs(A, na, B, nb);
printf("%d\n", len);
printf("%s\n", str);
free(str);
}
cudaFree(cuB);
cudaFree(cuP);
cudaFree(cuDP);
return 0;
}

批改娘 10112. Longest Common Subsequence (CUDA)

Problem Description

Given two strings $X$ and $Y$, the longest sequence that appears in both strings as a subsequence is their longest common subsequence (LCS).

Input Format

There are multiple test cases. Each test case consists of two lines with the strings $X$ and $Y$, which are composed only of the four letters A, T, C, G.

  • $1 \le |X|, |Y| \le 60000$

Output Format

For each test case, output one line with the length of the LCS of $X$ and $Y$.

Sample Input

TCA
GTA
TGGAC
TATCT

Sample Output

2
3

Compilation

$ nvcc -Xcompiler "-O2 -fopenmp" main.cu -o main

Solution

Compared with the OpenMP version: the GPU's caches are much weaker than the CPU's, so the failure-pointer trick we used there to optimize away the branch is actually slower on the GPU, because the extra lookup goes through global memory. It turns out to be faster to just let the branch happen and let the warp run the divergent paths in several passes, since each block has enough warps to hide the latency. The CPU and GPU versions therefore have to be optimized separately.
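
For contrast, here is a minimal sketch of the branch-free "failure pointer" kernel (my reconstruction from the OpenMP post further down, not code shipped with this post); this is the variant that measured slower on the GPU, because the lookup through P lands in global memory:

__global__ void run_nobranch(int nb, unsigned short *dpIn, unsigned short *dpOut, int *P) {
    // assumes P[j] = j-1 on a match, otherwise P of the previous column,
    // with P[0] = nb+1 and dpIn[nb+1] = 0xFFFF acting as a sentinel
    int j = blockDim.x*blockIdx.x + threadIdx.x + 1;
    if (j > nb) return;
    unsigned short t1 = dpIn[P[j]] + 1;  // wraps to 0 at the sentinel, no branch needed
    unsigned short t2 = dpIn[j];
    dpOut[j] = t1 > t2 ? t1 : t2;
}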

#include <cstdio>
#include <cstdlib>
#include <cstring>
#define MAXN 60005
#define CHARSET 4
typedef unsigned short uint16;
static char A[MAXN], B[MAXN];
static int c2i[128];
static char i2c[CHARSET+1] = "ACGT";
static int *cuP;
static char *cuB;
static uint16 *cuDP;
__global__ void prepare(int *P, char *B, int nb) {
int i = threadIdx.x;
int *p = P + i*MAXN;
char c = "ACGT"[i];
p[0] = 0;
for (int j = 1; j <= nb; j++)
p[j] = (B[j] == c) ? j : p[j-1];
}
__global__ void run(int nb, uint16 *dpIn, uint16 *dpOut, int *P) {
int j = blockDim.x*blockIdx.x+threadIdx.x+1;
if (j > nb) return ;
int pos = P[j];
uint16 t1 = pos ? dpIn[pos-1]+1 : 0;
uint16 t2 = dpIn[j];
dpOut[j] = t1 > t2 ? t1 : t2;
}
int lcs_len(const char *A, int na, const char *B, int nb, uint16 dpf[]) {
B--;
cudaMemcpy(cuB, B, sizeof(char)*(nb+1), cudaMemcpyHostToDevice);
cudaMemset(cuDP, 0, sizeof(uint16)*(nb+1));
cudaMemset(cuDP+MAXN, 0, sizeof(uint16)*(nb+1));
int bsz = 1024;
dim3 bb(bsz);
dim3 gg((nb+bsz-1)/bsz);
prepare<<<1, 4>>>(cuP, cuB, nb);
for (int i = 0; i < na; i++) {
int v = c2i[A[i]];
run<<<gg, bb>>>(nb, cuDP+(i&1)*MAXN, cuDP+((i&1)^1)*MAXN, cuP+v*MAXN);
}
cudaMemcpy(dpf, cuDP+(na&1)*MAXN, sizeof(uint16)*(nb+1), cudaMemcpyDeviceToHost);
return dpf[nb];
}
int main() {
for (int i = 0; i < CHARSET; i++)
c2i[i2c[i]] = i;
cudaMalloc(&cuB, sizeof(char)*MAXN);
cudaMalloc(&cuP, sizeof(int)*MAXN*4);
cudaMalloc(&cuDP, sizeof(uint16)*MAXN*2);
static uint16 dpf[MAXN];
while (scanf("%s %s", A, B) == 2) {
int len = lcs_len(A, strlen(A), B, strlen(B), dpf);
printf("%d\n", len);
}
cudaFree(cuB);
cudaFree(cuP);
cudaFree(cuDP);
return 0;
}

批改娘 10111. Longest Common Subsequence II (OpenMP)

Problem Description

Given two strings $X$ and $Y$, the longest sequence that appears in both strings as a subsequence is their longest common subsequence (LCS).

Input Format

There are multiple test cases. Each test case consists of two lines with the strings $X$ and $Y$, which are composed only of the four letters A, T, C, G.

  • $1 \le |X|, |Y| \le 60000$

Output Format

For each test case, output one line with the length of the LCS of $X$ and $Y$, followed by one line with any one such LCS.

Sample Input

TCA
GTA
TGGAC
TATCT

Sample Output

2
TA
3
TAC

Solution

When the data is small, an $O(N^2)$ table lets us backtrack to recover an actual LCS, but when $N$ is very large that table can no longer be stored. Instead the answer can be recovered by divide and conquer, which is Hirschberg's algorithm; a related demo is linked here.
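
Concretely, the divide step fixes the midpoint of $A$ and searches for the matching split of $B$ (notation mine):

$$\mathit{LCS}(A, B) = \max_{0 \le k \le m} \Big( \mathit{LCS}(A[1..\lfloor n/2 \rfloor], B[1..k]) + \mathit{LCS}(A[\lfloor n/2 \rfloor + 1..n], B[k+1..m]) \Big)$$

Both terms need only the final row of their DP tables, the first computed normally and the second on the reversed strings, so finding the split costs $O(nm)$ time and $O(m)$ space, after which the two halves recurse independently.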

The total time is $O(NM)$: the recursion halves $A$ at each level, so the work forms a geometric series summing to about twice the cost of filling the full table once, while the space drops to $O(N)$. For the parallel part, a fine-grained design pays the overhead of repeatedly spawning thread teams but reaches load balance more easily; the alternative is to split the recursive subproblems across threads, which is friendlier to the cache.

In practice, the fine-grained version performs much better than the coarse-grained one.

fine-grain

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <omp.h>
#define MAXN 60005
#define MAXSEQ 500
#define CHARSET 4
#define max(x, y) ((x) > (y) ? (x) : (y))
static __thread int c2i[128] = {['A'] = 0, ['C'] = 1, ['G'] = 2, ['T'] = 3};
int lcs_len(const char *A, int na, const char *B, int nb, int inv_flag, int dpf[]) {
static int P[CHARSET][MAXN];
int dp[2][MAXN] = {};
A--, B--;
if (!inv_flag) {
#pragma omp parallel for
for (int i = 0; i < CHARSET; i++) {
P[i][0] = nb+1;
for (int j = 1; j <= nb; j++)
P[i][j] = (B[j] == "ACGT"[i]) ? j-1 : P[i][j-1];
}
for (int i = 0; i < 2; i++)
dp[i][nb+1] = -1;
#pragma omp parallel
for (int i = 1; i <= na; i++) {
int *Pv = P[c2i[A[i]]];
int *dpIn = dp[i&1^1];
int *dpOut = dp[i&1];
#pragma omp for
for (int j = 1; j <= nb; j++) {
int t1 = dpIn[Pv[j]]+1;
int t2 = dpIn[j];
dpOut[j] = t1 > t2 ? t1 : t2;
}
}
memcpy(dpf, dp[na&1], sizeof(int)*(nb+1));
dpf[nb+1] = dpf[0] = 0;
return dpf[nb];
}
// inverse version
#pragma omp parallel for
for (int i = 0; i < CHARSET; i++) {
P[i][nb+1] = 0;
for (int j = nb; j >= 1; j--)
P[i][j] = (B[j] == "ACGT"[i]) ? j+1 : P[i][j+1];
}
for (int i = 0; i < 2; i++)
dp[i][0] = -1;
#pragma omp parallel
for (int i = na; i >= 1; i--) {
int *Pv = P[c2i[A[i]]];
int *dpIn = dp[i&1^1];
int *dpOut = dp[i&1];
#pragma omp for
for (int j = 1; j <= nb; j++) {
int t1 = dpIn[Pv[j]]+1;
int t2 = dpIn[j];
dpOut[j] = t1 > t2 ? t1 : t2;
}
}
memcpy(dpf, dp[1&1], sizeof(int)*(nb+1));
dpf[nb+1] = dpf[0] = 0;
return dpf[nb];
}
char* alloc_str(int sz) {
return (char *) calloc(sz, sizeof(char));
}
char* substr(const char *s, int pos, int len) {
char *t = alloc_str(len+1);
memcpy(t, s+pos, len);
return t;
}
char* cat(const char *sa, const char *sb) {
int na = strlen(sa), nb = strlen(sb);
char *t = alloc_str(na + nb + 1);
memcpy(t, sa, na);
memcpy(t+na, sb, nb);
return t;
}
char* find_lcs_seq(const char *A, int na, const char *B, int nb) {
static int P[CHARSET][MAXSEQ];
static char fa[MAXSEQ][MAXSEQ];
int dp[2][MAXSEQ] = {};
A--, B--;
for (int i = 0; i < CHARSET; i++) {
P[i][0] = nb+1;
for (int j = 1; j <= nb; j++)
P[i][j] = (B[j] == "ACGT"[i]) ? j-1 : P[i][j-1];
}
for (int i = 0; i < 2; i++)
dp[i][nb+1] = -1;
for (int i = 1; i <= na; i++) {
int *Pv = P[c2i[A[i]]];
int *dpIn = dp[i&1^1];
int *dpOut = dp[i&1];
for (int j = 1; j <= nb; j++) {
int t1 = dpIn[Pv[j]]+1;
int t2 = dpIn[j];
if (t1 > t2)
dpOut[j] = t1, fa[i][j] = 0;
else
dpOut[j] = t2, fa[i][j] = 1;
}
}
int sz = dp[na&1][nb];
char *ret = alloc_str(sz+1);
for (int i = na, j = nb; sz && i >= 1; i--) {
if (fa[i][j] == 0)
ret[--sz] = A[i], j = P[c2i[A[i]]][j];
}
return ret;
}
char* find_lcs(const char *a, int na, const char *b, int nb) {
if (na > nb) {
const char *c; int t;
c = a, a = b, b = c;
t = na, na = nb, nb = t;
}
if (na == 0)
return alloc_str(1);
if (na < MAXSEQ && nb < MAXSEQ)
return find_lcs_seq(a, na, b, nb);
int t1[MAXN];
int t2[MAXN];
int len = -1;
int half_len = na / 2;
char *la = substr(a, 0, half_len);
char *ra = substr(a, half_len, na - half_len);
lcs_len(la, half_len, b, nb, 0, t1);
lcs_len(ra, na - half_len, b, nb, 1, t2);
int split = -1;
for (int i = 0; i <= nb; i++) {
if (t1[i] + t2[i+1] > len)
split = i, len = t1[i] + t2[i+1];
}
if (len == 0)
return alloc_str(1);
assert(split != -1);
char *lb = substr(b, 0, split);
char *rb = substr(b, split, nb - split);
char *sl = t1[split] ? find_lcs(la, half_len, lb, split) : alloc_str(1);
char *sr = t2[split+1] ? find_lcs(ra, na - half_len, rb, nb - split) : alloc_str(1);
char *ret = cat(sl, sr);
free(la), free(ra);
free(lb), free(rb);
free(sl), free(sr);
return ret;
}
int main() {
static char A[MAXN], B[MAXN];
int dp[MAXN];
while (scanf("%s %s", A, B) == 2) {
int na = strlen(A);
int nb = strlen(B);
char *str = find_lcs(A, na, B, nb);
printf("%d\n", strlen(str));
printf("%s\n", str);
free(str);
}
return 0;
}

coarse-grain

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <omp.h>
#define MAXN 60005
#define CHARSET 4
#define max(x, y) ((x) > (y) ? (x) : (y))
typedef unsigned short uint16;
int lcs_len_seq(const char *A, int na, const char *B, int nb, int dpf[]) {
uint16 dp[2][MAXN];
memset(dp[0], 0, sizeof(uint16)*(nb+1));
dp[1][0] = 0;
for (int i = 1; i <= na; i++) {
for (int j = 1; j <= nb; j++) {
if (A[i-1] == B[j-1])
dp[1][j] = dp[0][j-1]+1;
else
dp[1][j] = max(dp[1][j-1], dp[0][j]);
}
memcpy(dp[0], dp[1], sizeof(uint16)*(nb+1));
}
for (int i = 0; i <= nb; i++)
dpf[i] = dp[0][i];
return dpf[nb];
}
int lcs_len(const char *A, int na, const char *B, int nb, int dpf[]) {
if (nb < 256)
return lcs_len_seq(A, na, B, nb, dpf);
int c2i[128] = {['A'] = 0, ['C'] = 1, ['G'] = 2, ['T'] = 3};
char i2c[CHARSET] = {'A', 'C', 'G', 'T'};
int P[CHARSET][MAXN];
uint16 dp[2][MAXN];
A--, B--;
for (int i = 0; i < CHARSET; i++) {
P[i][0] = nb+1;
for (int j = 1; j <= nb; j++)
P[i][j] = (B[j] == i2c[i]) ? j-1 : P[i][j-1];
}
for (int i = 0; i < 2; i++) {
memset(dp[i], 0, sizeof(uint16)*(nb+1));
dp[i][nb+1] = -1;
}
for (int i = 1; i <= na; i++) {
int *Pv = P[c2i[A[i]]];
uint16 *dpIn = dp[i&1^1];
uint16 *dpOut = dp[i&1];
for (int j = 1; j <= nb; j++) {
uint16 t1 = dpIn[Pv[j]]+1;
uint16 t2 = dpIn[j];
dpOut[j] = t1 > t2 ? t1 : t2;
}
}
for (int i = 0; i <= nb; i++)
dpf[i] = dp[na&1][i];
return dpf[nb];
}
char* alloc_str(int sz) {
return (char *) calloc(sz, sizeof(char));
}
char* substr(const char *s, int pos, int len) {
char *t = alloc_str(len+1);
memcpy(t, s+pos, len);
return t;
}
char* cat(const char *sa, const char *sb) {
int na = strlen(sa), nb = strlen(sb);
char *t = alloc_str(na + nb + 1);
memcpy(t, sa, na);
memcpy(t+na, sb, nb);
return t;
}
char* reverse(const char *s, int len) {
char *t = substr(s, 0, len);
char *l = t, *r = t + len - 1;
char tmp;
while (l < r) {
tmp = *l, *l = *r, *r = tmp;
l++, r--;
}
return t;
}
char* find_lcs(const char *a, int na, const char *b, int nb, int dep) {
if (na > nb) {
const char *c; int t;
c = a, a = b, b = c;
t = na, na = nb, nb = t;
}
if (na == 0)
return alloc_str(1);
if (na == 1) {
for (int i = 0; i < nb; i++) {
if (a[0] == b[i])
return substr(a, 0, 1);
}
return alloc_str(1);
}
int t1[MAXN];
int t2[MAXN];
int len = -1;
int half_len = na / 2;
char *la = substr(a, 0, half_len);
char *ra = substr(a, half_len, na - half_len);
char *tb = reverse(b, nb);
char *ta = reverse(ra, na - half_len);
#pragma omp task untied shared(t1)
lcs_len(la, half_len, b, nb, t1);
#pragma omp task untied shared(t2)
lcs_len(ta, na - half_len, tb, nb, t2);
#pragma omp taskwait
int split = -1;
for (int i = 0; i <= nb; i++) {
if (t1[i] + t2[nb-i] > len)
split = i, len = t1[i] + t2[nb-i];
}
if (len == 0)
return alloc_str(1);
char *lb = substr(b, 0, split);
char *rb = substr(b, split, nb - split);
char *sl, *sr;
#pragma omp task untied shared(sl)
sl = find_lcs(la, half_len, lb, split, dep+1);
#pragma omp task untied shared(sr)
sr = find_lcs(ra, na - half_len, rb, nb - split, dep+1);
#pragma omp taskwait
char *ret = cat(sl, sr);
free(la), free(ra), free(ta);
free(lb), free(rb), free(tb);
free(sl), free(sr);
return ret;
}
int main() {
static char A[MAXN], B[MAXN];
int dp[MAXN];
while (scanf("%s %s", A, B) == 2) {
int na = strlen(A);
int nb = strlen(B);
char *str;
#pragma omp parallel
{
#pragma omp single
str = find_lcs(A, na, B, nb, 0);
}
printf("%d\n", strlen(str));
printf("%s\n", str);
free(str);
}
return 0;
}

批改娘 10110. Longest Common Subsequence (OpenMP)

Problem Description

Given two strings $X$ and $Y$, the longest sequence that appears in both strings as a subsequence is their longest common subsequence (LCS).

Input Format

There are multiple test cases. Each test case consists of two lines with the strings $X$ and $Y$, which are composed only of the four letters A, T, C, G.

  • $1 \le |X|, |Y| \le 60000$

Output Format

For each test case, output one line with the length of the LCS of $X$ and $Y$.

Sample Input

TCA
GTA
TGGAC
TATCT

Sample Output

2
3

Solution

A classmate brought up this application in class and I was a bit skeptical of the reported performance. The problem itself is quite simple algorithmically; the hard part is parallelizing the DP.

Basic parallelization

Looking at the recurrence,

$$dp[x][y] = \begin{cases} dp[x-1][y-1] + 1 & \text{if } A_x = B_y \\ \max(dp[x-1][y],\; dp[x][y-1]) & \text{otherwise} \end{cases}$$

the standard fix is to rotate the table 45 degrees and sweep it by anti-diagonals, combined with a rolling buffer; three diagonals must be kept to carry out the transition.
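
To make the dependence explicit (my notation): index each cell by its anti-diagonal $d = x + y$ and column $y$. The recurrence becomes

$$dp_d[y] = \begin{cases} dp_{d-2}[y-1] + 1 & \text{if } A_{d-y} = B_y \\ \max(dp_{d-1}[y-1],\; dp_{d-1}[y]) & \text{otherwise} \end{cases}$$

so every cell on diagonal $d$ depends only on diagonals $d-1$ and $d-2$; all cells of one diagonal can be computed in parallel, and three rolling diagonals are exactly enough state.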

After the rotation, however, the available parallelism per anti-diagonal forms a wave: it grows and then shrinks, which is bad for the caches. When iterations are spread across different CPUs, the data a core loaded for the previous diagonal may not be what it needs for the current one, because the growing and shrinking ranges mean its assigned segment no longer overlaps the previous one, and migrating data between CPUs is very slow. If all the cores sit on a single CPU the working set can stay inside the shared L3 cache, but then the degree of parallelism is limited.

On our test machine a single CPU has 6 cores, so the speedup tops out at 6x.

#include <cstdio>
#include <cstring>
#include <iostream>
#include <algorithm>
#include <omp.h>
using namespace std;
const int MAXN = 65536;
static char A[MAXN], B[MAXN];
#define DP_TYPE unsigned short
int lcs_len(const char *A, int na, const char *B, int nb, int dpf[]) {
if (na > nb)
swap(A, B), swap(na, nb);
static DP_TYPE dp[3][MAXN<<1];
for (int i = 0; i < 3; i++)
memset(dp[i], 0, sizeof(DP_TYPE)*(nb+na+2));
memset(dpf, 0, sizeof(int)*(nb+1)); // dpf is an int array; sizeof(DP_TYPE) here only cleared half of it
omp_set_num_threads(4);
int last_l = 0, last_r = 0;
for (int i = 0; i < na+nb-1; i++) {
int l = max(0, i-na+1), r = min(i, nb-1);
#pragma omp parallel for schedule(static) firstprivate(na, A, B)
for (int j = l; j <= r; j++) {
int x = i-j, y = j;
if (A[x] == B[y])
dp[2][j+1] = dp[0][j]+1;
else
dp[2][j+1] = dp[1][j] > dp[1][j+1] ? dp[1][j] : dp[1][j+1];
}
if (i-l == na-1)
dpf[l+1] = dp[2][l+1];
memcpy(dp[0]+last_l+1, dp[1]+last_l+1, sizeof(DP_TYPE)*(last_r-last_l+1));
memcpy(dp[1]+l+1, dp[2]+l+1, sizeof(DP_TYPE)*(r-l+1));
last_l = l, last_r = r;
}
return dpf[nb];
}
int main() {
int dp[MAXN];
while (scanf("%s %s", A, B) == 2) {
string a = A, b = B;
int len = lcs_len(a.c_str(), strlen(A), b.c_str(), strlen(B), dp);
printf("%d\n", len);
}
return 0;
}

High-speed parallelization

  • Reference: the paper An Efficient Parallel Algorithm for Longest Common Subsequence Problem on GPUs

Thanks to R04922075 古耕竹 for providing the pointers that kept making this problem faster.

The paper rewrites the original LCS recurrence with the help of an auxiliary array of size $O(|\alpha| \cdot N)$, where $|\alpha|$ is the alphabet size. As before, $dp[i][j]$ is the LCS length of the first $i$ characters of $A$ and the first $j$ characters of $B$; the auxiliary array $P[c][j]$ gives the position of the last occurrence of character $c$ at or before position $j$ of $B$.

The recurrence becomes (boundary conditions omitted for now):

$$dp[x][y] = \max(dp[x-1][y],dp[x-1][P[A[x]][y]-1]+1)$$

From the optimal-substructure view, the two cases split on whether $A[x]$ ends up matched: if not, take $dp[x-1][y]$; if it does, it can only match the last occurrence of $A[x]$ within $B[1..y]$, giving $dp[x-1][P[A[x]][y]-1]+1$.
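
As a concrete check (my worked example, 1-indexed, with the convention $P[c][0] = 0$ used in the code below): for $B = \texttt{TATCT}$ we get $P[\texttt{T}] = [0, 1, 1, 3, 3, 5]$ and $P[\texttt{A}] = [0, 0, 2, 2, 2, 2]$; each entry is the position of the last occurrence of the character at or before column $j$, or $0$ if there is none, which is exactly what the match case of the recurrence consumes.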

The designated-initializer array syntax used below is courtesy of -std=c99, the standard commonly used on Linux.

#include <stdio.h>
#include <string.h>
#include <omp.h>
#define MAXN 60005
#define CHARSET 4
typedef unsigned short uint16;
static char A[MAXN], B[MAXN];
int lcs_len(const char *A, int na, const char *B, int nb, int dpf[]) {
static int c2i[128] = {['A'] = 0, ['C'] = 1, ['G'] = 2, ['T'] = 3};
static char i2c[CHARSET] = {'A', 'C', 'G', 'T'};
static int P[CHARSET][MAXN] = {};
static uint16 dp[2][MAXN];
A--, B--;
#pragma omp parallel for
for (int i = 0; i < CHARSET; i++) {
for (int j = 1; j <= nb; j++)
P[i][j] = (B[j] == i2c[i])? j : P[i][j-1];
}
for (int i = 0; i < 2; i++)
memset(dp[i], 0, sizeof(uint16)*(nb+1));
#pragma omp parallel
for (int i = 1; i <= na; i++) {
int *Pv = P[c2i[A[i]]];
uint16 *dpIn = dp[i&1^1];
uint16 *dpOut = dp[i&1];
#pragma omp for
for (int j = 1; j <= nb; j++) {
int last_match = Pv[j];
uint16 t1 = last_match ? dpIn[last_match-1]+1 : 0;
uint16 t2 = dpIn[j];
dpOut[j] = t1 > t2 ? t1 : t2;
}
}
for (int i = 0; i <= nb; i++)
dpf[i] = dp[na&1][i];
return dpf[nb];
}
int main() {
int dp[MAXN];
while (scanf("%s %s", A, B) == 2) {
int len = lcs_len(A, strlen(A), B, strlen(B), dp);
printf("%d\n", len);
}
return 0;
}

Final optimization

This part is exclusively sponsored by the Advanced Compiler course (Advanced Compiler Support).

OpenMP code performs a lot of shared-memory accesses, so inside parallel regions many expressions get re-evaluated even under -O2; unlike ordinary sequential code, the compiler cannot see that strength reduction or common-subexpression elimination would be safe. The programmer therefore has to write the C close to assembly to reach peak performance.

Concretely, the recurrence accesses 2-D arrays constantly. Assuming the rows neither overlap nor alias, we keep direct row pointers: if the inner loop keeps evaluating ... += dp[i][j] ..., rewriting it as ... += dpI[j] ... saves the i * SIZE2D offset computation on every access. Likewise, hoisting loop invariants such as A[x] into locals cuts the number of shared-memory accesses.
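
A minimal sketch of the transformation (illustrative names, not lifted from the code below):

#include <stdint.h>
#define MAXN 60005
typedef uint16_t uint16;
static uint16 dp[2][MAXN];

/* Before: every access dp[i&1][j] recomputes the row offset. */
void row_before(int i, int nb) {
    for (int j = 1; j <= nb; j++)
        dp[i & 1][j] = dp[(i & 1) ^ 1][j];
}

/* After: the two row pointers are hoisted once per row. */
void row_after(int i, int nb) {
    uint16 *dpOut = dp[i & 1];
    const uint16 *dpIn = dp[(i & 1) ^ 1];
    for (int j = 1; j <= nb; j++)
        dpOut[j] = dpIn[j];
}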

Handling the boundary conditions would normally need a few branches; here we avoid them with preprocessing and failure pointers, which buys another small performance bump. On an Intel CPU this can be confirmed with VTune Amplifier, one of the Analyzers in Intel Parallel Studio XE 2016.

Even that is not the limit: one can add static __thread, or use #pragma omp threadprivate(var) for copy optimization to improve data locality. This interacts with the cache size, though, so it is a machine-dependent optimization; whether it pays off depends on the hardware.
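
A minimal sketch of the threadprivate variant (my example; the code below does not use it):

#include <omp.h>
#include <string.h>
#define MAXN 60005
#define CHARSET 4

/* Each thread keeps its own copy of the occurrence table, trading
 * memory (and one copy pass) for locality; machine dependent. */
static int Pcopy[CHARSET][MAXN];
#pragma omp threadprivate(Pcopy)

void make_private_copies(const int P[CHARSET][MAXN]) {
    #pragma omp parallel
    {
        memcpy(Pcopy, P, sizeof(Pcopy)); /* one full copy per thread */
    }
}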

#include <stdio.h>
#include <string.h>
#include <omp.h>
#define MAXN 60005
#define CHARSET 4
typedef unsigned short uint16;
static char A[MAXN], B[MAXN];
int lcs_len(const char *A, int na, const char *B, int nb, int dpf[]) {
static int c2i[128] = {['A'] = 0, ['C'] = 1, ['G'] = 2, ['T'] = 3};
static char i2c[CHARSET] = {'A', 'C', 'G', 'T'};
static int P[CHARSET][MAXN] = {};
static uint16 dp[2][MAXN];
A--, B--;
#pragma omp parallel for
for (int i = 0; i < CHARSET; i++) {
P[i][0] = nb+1;
for (int j = 1; j <= nb; j++)
P[i][j] = (B[j] == i2c[i]) ? j-1 : P[i][j-1];
}
for (int i = 0; i < 2; i++) {
memset(dp[i], 0, sizeof(uint16)*(nb+1));
dp[i][nb+1] = -1; // sentinel 0xFFFF: dpIn[Pv[j]]+1 wraps to 0 when no previous match exists
}
#pragma omp parallel
for (int i = 1; i <= na; i++) {
int *Pv = P[c2i[A[i]]];
uint16 *dpIn = dp[i&1^1];
uint16 *dpOut = dp[i&1];
#pragma omp for
for (int j = 1; j <= nb; j++) {
uint16 t1 = dpIn[Pv[j]]+1;
uint16 t2 = dpIn[j];
dpOut[j] = t1 > t2 ? t1 : t2;
}
}
for (int i = 0; i <= nb; i++)
dpf[i] = dp[na&1][i];
return dpf[nb];
}
int main() {
int dp[MAXN];
while (scanf("%s %s", A, B) == 2) {
int len = lcs_len(A, strlen(A), B, strlen(B), dp);
printf("%d\n", len);
}
return 0;
}

批改娘 10109. Sorting (CUDA)

Problem Description

Speed up the following program.

#include <bits/stdc++.h>
using namespace std;
#define MAXN 16777216
uint32_t A[MAXN];
inline uint32_t encrypt(uint32_t m, uint32_t key) {
return (m*m + key)%key;
}
int main() {
int N, K;
while (scanf("%d %d", &N, &K) == 2) {
assert((N & (-N)) == N); // power-of-two check; the original N&(-N) == N parsed as N & ((-N) == N)
for (int i = 0; i < N; i++)
A[i] = encrypt(i, K);
sort(A, A+N);
uint32_t sum = 0;
#pragma omp parallel for reduction(+:sum)
for (int i = 0; i < N; i++)
sum += A[i] * i;
printf("%u\n", sum);
}
return 0;
}

Input Format

There are multiple lines of test data. Each case is one line with two integers $N$ and $K$: sort $N$ integers generated from the random seed $K$.

  • $N = 2^i, \; 0 \le i \le 24$
  • $1 \le K \le N$

Output Format

Output one line with $\sum i \cdot A_i$.

Sample Input

8 8
8 7

Sample Output

66
75

Solution

bitonic sort

Implement bitonic sort. The simplest version does everything through global memory, but once the stride in the recursion becomes small enough, the remaining steps of a round can be done in shared memory, and then performance gets very good.
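
For scale (my arithmetic, matching the double loop in the code below): sorting $N = 2^k$ elements takes

$$\sum_{i=1}^{k} i = \frac{k(k+1)}{2} = O(\log^2 N)$$

compare-exchange stages of $N/2$ comparators each, for $O(N \log^2 N)$ total work; the shared-memory fast path fires once the stride fits inside a block and finishes all remaining stages of that round in a single kernel launch.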

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <algorithm>
#include <assert.h>
#include <omp.h>
using namespace std;
#define MAXN 16777216
#define MAXBLK 1024
#define CheckErr(status) { gpuAssert((status), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, int abort=true) {
if (code != cudaSuccess) {
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
uint32_t A[MAXN];
__device__ inline uint32_t encrypt(uint32_t m, uint32_t key) {
return (m*m + key)%key;
}
__device__ inline void swap(uint32_t &x, uint32_t &y) {
uint32_t t = x; x = y; y = t;
}
__global__ void bitonic_step(uint32_t *A, int p, int q) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
int up = ((i >> p)&2) == 0;
int d = 1 << (p - q);
if ((i & d) == 0 && (A[i] > A[i|d]) == up)
swap(A[i], A[i|d]);
}
__global__ void bitonic_step_fit(uint32_t *A, int p, int q) {
extern __shared__ uint32_t buff[];
int i = blockIdx.x * blockDim.x + threadIdx.x;
int up = ((i >> p)&2) == 0;
buff[threadIdx.x] = A[i];
__syncthreads();
for (; q <= p; q++) {
int d = 1 << (p - q);
if ((i & d) == 0 && (buff[threadIdx.x] > buff[threadIdx.x|d]) == up)
swap(buff[threadIdx.x], buff[threadIdx.x|d]);
__syncthreads();
}
A[i] = buff[threadIdx.x];
}
__global__ void sum_arr(uint32_t *A, uint32_t *B, int N) {
extern __shared__ uint32_t buff[];
int i = blockIdx.x * blockDim.x + threadIdx.x;
buff[threadIdx.x] = A[i] * i;
__syncthreads();
for (int i = blockDim.x>>1; i; i >>= 1) {
if (threadIdx.x < i)
buff[threadIdx.x] += buff[threadIdx.x + i];
__syncthreads();
}
if (threadIdx.x == 0)
B[blockIdx.x] = buff[0];
}
__global__ void rand_gen(uint32_t *A, int N, int K) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
A[i] = encrypt(i, K);
}
void output(int N, uint32_t *cuA, uint32_t *cuB) {
dim3 cuBlock(min(MAXBLK, N));
dim3 cuGrid(N / min(MAXBLK, N));
CheckErr(cudaGetLastError());
sum_arr<<<cuGrid, cuBlock, sizeof(uint32_t)*min(MAXBLK, N)>>>(cuA, cuB, N);
CheckErr(cudaGetLastError());
cudaMemcpy(A, cuB, sizeof(uint32_t) * N / min(MAXBLK, N), cudaMemcpyDeviceToHost);
CheckErr(cudaGetLastError());
uint32_t sum = 0;
for (int i = 0; i < N / min(MAXBLK, N); i++)
sum += A[i];
printf("%u\n", sum);
}
void sort_test(int N, int K, uint32_t *cuA, uint32_t *cuB) {
assert((N&-N) == N);
dim3 cuBlock(min(MAXBLK, N));
dim3 cuGrid(N / min(MAXBLK, N));
rand_gen<<<cuGrid, cuBlock>>>(cuA, N, K);
CheckErr(cudaGetLastError());
int logN = 1;
while ((1 << logN) < N) logN++;
for (int i = 0; i < logN; i++) {
for (int j = 0; j <= i; j++) {
if ((1 << (i - j) >= MAXBLK)) {
bitonic_step<<<cuGrid, cuBlock>>>(cuA, i, j);
CheckErr(cudaGetLastError());
} else {
bitonic_step_fit<<<cuGrid, cuBlock, sizeof(uint32_t)*(MAXBLK)>>>(cuA, i, j);
CheckErr(cudaGetLastError());
break;
}
}
}
output(N, cuA, cuB);
}
int main() {
uint32_t *cuA, *cuB;
cudaMalloc((void **) &cuA, sizeof(uint32_t) * MAXN);
cudaMalloc((void **) &cuB, sizeof(uint32_t) * MAXN / MAXBLK);
CheckErr(cudaGetLastError());
int N, K;
while(scanf("%d %d", &N, &K) == 2) {
sort_test(N, K, cuA, cuB);
}
cudaFree(cuA);
cudaFree(cuB);
return 0;
}

Built-in thrust

Of course a routine this common has a built-in counterpart: the thrust library provides the sort. The first invocation is noticeably slow, possibly because the library code still has to be brought into cache.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <algorithm>
#include <vector>
#include <assert.h>
#include <omp.h>
#include <thrust/sort.h>
#include <thrust/device_ptr.h>
using namespace std;
#define MAXN 16777216
#define MAXBLK 1024
#define CheckErr(status) { gpuAssert((status), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, int abort=true) {
if (code != cudaSuccess) {
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
uint32_t A[MAXN];
__device__ inline uint32_t encrypt(uint32_t m, uint32_t key) {
return (m*m + key)%key;
}
__device__ inline void swap(uint32_t &x, uint32_t &y) {
uint32_t t = x; x = y; y = t;
}
__global__ void sum_arr(uint32_t *A, uint32_t *B, int N) {
extern __shared__ uint32_t buff[];
int i = blockIdx.x * blockDim.x + threadIdx.x;
buff[threadIdx.x] = A[i] * i;
__syncthreads();
for (int i = blockDim.x>>1; i; i >>= 1) {
if (threadIdx.x < i)
buff[threadIdx.x] += buff[threadIdx.x + i];
__syncthreads();
}
if (threadIdx.x == 0)
B[blockIdx.x] = buff[0];
}
__global__ void rand_gen(uint32_t *A, int N, int K) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
A[i] = encrypt(i, K);
}
void output(int N, uint32_t *cuA, uint32_t *cuB) {
dim3 cuBlock(min(MAXBLK, N));
dim3 cuGrid(N / min(MAXBLK, N));
CheckErr(cudaGetLastError());
sum_arr<<<cuGrid, cuBlock, sizeof(uint32_t)*min(MAXBLK, N)>>>(cuA, cuB, N);
CheckErr(cudaGetLastError());
cudaMemcpy(A, cuB, sizeof(uint32_t) * N / min(MAXBLK, N), cudaMemcpyDeviceToHost);
CheckErr(cudaGetLastError());
uint32_t sum = 0;
for (int i = 0; i < N / min(MAXBLK, N); i++)
sum += A[i];
printf("%u\n", sum);
}
void cpu_compute(int N, int K) {
vector<int> A;
for (int i = 0; i < N; i++)
A.push_back((i*i + K)%K);
sort(A.begin(), A.end());
uint32_t sum = 0;
for (int i = 0; i < A.size(); i++)
sum += A[i] * i;
printf("%u\n", sum);
return ;
}
void sort_test(int N, int K, uint32_t *cuA, uint32_t *cuB) {
assert((N&-N) == N);
if (N < MAXBLK) {
cpu_compute(N, K);
return ;
}
dim3 cuBlock(min(MAXBLK, N));
dim3 cuGrid(N / min(MAXBLK, N));
rand_gen<<<cuGrid, cuBlock>>>(cuA, N, K);
CheckErr(cudaGetLastError());
thrust::device_ptr<uint32_t> cuAptr(cuA);
thrust::sort(cuAptr, cuAptr+N);
output(N, cuA, cuB);
}
int main() {
uint32_t *cuA, *cuB;
cudaMalloc((void **) &cuA, sizeof(uint32_t) * MAXN);
cudaMalloc((void **) &cuB, sizeof(uint32_t) * MAXN / MAXBLK);
CheckErr(cudaGetLastError());
int N, K;
while(scanf("%d %d", &N, &K) == 2)
sort_test(N, K, cuA, cuB);
cudaFree(cuA);
cudaFree(cuB);
return 0;
}

批改娘 10108. Streams and Concurrency II (CUDA)

Problem Description

Demo

Specification

  • Accepted criterion: whether a single device executes two or more kernels at the same time.
  • Only the older GPU is currently available for judging, so make sure the program runs on the third GPU, i.e.
int device = 2;
cudaSetDevice(device);

Solution

The goal here is to have several kernel functions running concurrently. A kernel does not always keep all the cores busy, and while cores are idle the next kernel function can be brought in to run, which raises performance substantially.

Any quick test program will do. While designing this problem I found that the newer GTX 980 Ti does not support it, and even the CUDA debug environment variables revealed nothing; in the end the analysis had to be done on the older GPU with the old nvprof, since even the newest nvvp cannot profile on the GTX 980 Ti.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <assert.h>
#include <stdint.h>
#include <cuda.h>
#include <omp.h>
#define MAXN (64)
#define nStreams 4
__global__ void test_global1(uint32_t IN[], int m, uint32_t OUT[]) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
uint32_t sum = 0;
int LOCALIT = m * 500000;
for (int i = 0; i < LOCALIT; i++) {
sum += IN[x];
}
OUT[x] = sum;
}
uint32_t hostIn[MAXN], hostOut[MAXN];
#define CheckCuda(status) { gpuAssert((status), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, int abort=true) {
if (code != cudaSuccess) {
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
cudaStream_t stream[nStreams];
void pipelTest(uint32_t *cuIn[], uint32_t *cuOut[], int n, int m[]) {
dim3 cuBlock(1);
dim3 cuGrid(n / 1);
test_global1<<<cuGrid, cuBlock, 0, stream[0]>>>(cuIn[0], m[0], cuOut[0]);
test_global1<<<cuGrid, cuBlock, 0, stream[1]>>>(cuIn[1], m[1], cuOut[1]);
test_global1<<<cuGrid, cuBlock, 0, stream[2]>>>(cuIn[2], m[2], cuOut[2]);
test_global1<<<cuGrid, cuBlock, 0, stream[3]>>>(cuIn[3], m[3], cuOut[3]);
}
int main() {
int device = 2;
cudaSetDevice(device);
// Find device clock rate to calculate number of cycles (for 10ms)
cudaDeviceProp deviceProp;
cudaGetDevice(&device);
cudaGetDeviceProperties(&deviceProp, device);
int clockRate = deviceProp.clockRate;
printf("Device clock rate: %.3f GHz\n", (float)clockRate/1000000);
// Check card supports concurrency
if (deviceProp.concurrentKernels == 0) {
printf("GPU does not support concurrent kernel execution\n");
printf("CUDA kernel runs will be serialised\n");
}
//
srand(time(NULL));
uint32_t *cuIn[nStreams];
uint32_t *cuOut[nStreams];
for (int i = 0; i < nStreams; i++) {
CheckCuda(cudaStreamCreate(&stream[i]));
CheckCuda(cudaMalloc((void **)&cuIn[i], MAXN*sizeof(uint32_t)));
CheckCuda(cudaMalloc((void **)&cuOut[i], MAXN*sizeof(uint32_t)));
for (int j = 0; j < MAXN; j++)
hostIn[j] = rand();
cudaMemcpy(cuIn[i], hostIn, MAXN*sizeof(uint32_t), cudaMemcpyHostToDevice);
}
int m[] = {1, 2, 4, 8};
for (int i = 0; i < 5; i++) {
pipelTest(cuIn, cuOut, MAXN, m);
CheckCuda(cudaThreadSynchronize());
}
CheckCuda(cudaDeviceSynchronize());
for (int i = 0; i < nStreams; i++) {
cudaMemcpy(hostOut, cuOut[i], MAXN*sizeof(uint32_t), cudaMemcpyDeviceToHost);
uint32_t sum = 0;
for (int j = 0; j < MAXN; j++)
sum += hostOut[j];
printf("%u\n", sum);
}
for (int i = 0; i < nStreams; i++) {
cudaFree(cuIn[i]);
cudaFree(cuOut[i]); // also release the output buffers
}
return 0;
}

批改娘 10107. Sparse Matrix Multiplication (CUDA)

Problem Description

A sparse matrix is a matrix in which most elements are zero; large sparse matrices frequently arise when solving linear models in science and engineering. Given the most common Coordinate format (COO), compute the product of two matrices.

Given matrices $A_{n, m}$ and $B_{m, r}$, compute the sparse matrix product.

$$A_{4,4} = \begin{bmatrix} 0 & 0 & 0 & 0 \\ 5 & 8 & 0 & 0 \\ 0 & 0 & 3 & 0 \\ 0 & 6 & 0 & 0 \\ \end{bmatrix}, \qquad B_{4,4} = \begin{bmatrix} 0 & 0 & 1 & 3 \\ 0 & 5 & 2 & 0 \\ 3 & 5 & 0 & 0 \\ 0 & 2 & 0 & 0 \\ \end{bmatrix}$$

Converting all the non-zero elements of matrices $A$ and $B$ into COO format gives the tables below.

COO of Matrix $A$

row_index col_index value
1 0 5
1 1 8
2 2 3
3 1 6

COO of Matrix $B$

row_index col_index value
0 2 1
0 3 3
1 1 5
1 2 2
2 0 3
2 1 5
3 1 2

Input Format

There is a single test case. The first line has three integers $N, \; M, \; R$, giving the dimensions of matrices $A$ and $B$. The next line has two integers $N_A, \; N_B$; the following $N_A$ lines give matrix $A$ in COO format, and the $N_B$ lines after that give matrix $B$ in COO format.

  • $0 < N, \; M, \; R \le 10^6$
  • $0 < N_A, \; N_B \le 10^6$

The COO entries are given sorted by row_index, with ties sorted by col_index; there are no duplicates, and every value $v$ lies between $1$ and $2^{31}-1$.

Output Format

Output the hash value of $C_{n, r} = A_{n, m} \times B_{m, r}$.

Define $\mathit{hash}(C_{n, r}) = \sum\nolimits_{e_{i, j} \in C \text{ and } e_{i, j} \neq 0} \mathit{encrypt}((i+1)*(j+1), e_{i, j})$. The reference flow below shows the intended computation, though of course you cannot actually allocate an $N \times M$ array:

static inline uint32_t rotate_left(uint32_t x, uint32_t n) {
return (x << n) | (x >> (32-n));
}
static inline uint32_t encrypt(uint32_t m, uint32_t key) {
return (rotate_left(m, key&31) + key)^key;
}
#define MAXN 1024
uint32_t A[MAXN][MAXN], B[MAXN][MAXN];
int main() {
int x, y, v;
int N, M, R, nA, nB;
scanf("%d %d %d", &N, &M, &R);
scanf("%d %d", &nA, &nB);
for (int i = 0; i < nA; i++)
scanf("%d %d %d", &x, &y, &v), A[x][y] = v;
for (int i = 0; i < nB; i++)
scanf("%d %d %d", &x, &y, &v), B[x][y] = v;
uint32_t hash = 0;
for (int i = 0; i < N; i++) {
for (int j = 0; j < R; j++) {
uint32_t sum = 0;
for (int k = 0; k < M; k++)
sum += A[i][k] * B[k][j];
if (sum)
hash += encrypt((i+1)*(j+1), sum);
}
}
printf("%u\n", hash);
return 0;
}

Sample Input

4 4 4
4 7
1 0 5
1 1 8
2 2 3
3 1 6
0 2 1
0 3 3
1 1 5
1 2 2
2 0 3
2 1 5
3 1 2

Sample Output

13093438

Sample Explanation

$$A_{n, m} \times B_{m, r} = C_{4,4}=\begin{bmatrix} 0 & 0 & 0 & 0 \\ 0 & 40 & 21 & 15 \\ 9 & 15 & 0 & 0 \\ 0 & 30 & 12 & 0 \\ \end{bmatrix}$$

Solution

Transposed version

Transpose matrix $B$, sort the entries, and re-pack them into COO format. Launch enough threads that each thread takes all the values of one row of $A$, then accumulate the answers with a merge-sort-like two-pointer scan. This version is not very efficient: there is too much branching, the data partitioning yields very poor load balance, and there is a pile of global-memory accesses.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <algorithm>
using namespace std;
#define MAXN 1048576
#define MAXL (MAXN<<2)
#define CheckErr(status) { gpuAssert((status), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, int abort=true) {
if (code != cudaSuccess) {
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__device__ uint32_t rotate_left(uint32_t x, uint32_t n) {
return (x << n) | (x >> (32-n));
}
__device__ uint32_t encrypt(uint32_t m, uint32_t key) {
return (rotate_left(m, key&31) + key)^key;
}
typedef struct ELE {
int i[MAXL], j[MAXL];
uint32_t v[MAXL];
} ELE;
ELE LA, LB, LT;
int D[MAXL];
__global__ void SpMM(ELE *LA, ELE *LB, ELE *LC, int aoff[], int boff[], int mb) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
uint32_t hash = 0;
for (int j = 0; j < mb; j++) {
int idx1 = aoff[i], idx2 = boff[j];
int top1 = aoff[i+1], top2 = boff[j+1];
uint32_t sum = 0;
int r = LA->i[idx1], c = LB->i[idx2];
while (idx1 < top1 && idx2 < top2) {
if (LA->j[idx1] < LB->j[idx2])
idx1++;
else if (LA->j[idx1] > LB->j[idx2])
idx2++;
else
sum += LA->v[idx1] * LB->v[idx2], idx1++, idx2++;
}
if (sum) {
hash += encrypt((r+1)*(c+1), sum);
}
}
LC->v[i] = hash;
}
void SpMV(int N, int M, int R, ELE &LA, int na, ELE &LB, int nb) {
int ma = 0, mb = 0;
static int aoff[MAXN], boff[MAXN];
for (int i = 0, p = -1; i < na; i++) {
if (LA.i[i] != p)
aoff[ma++] = i, p = LA.i[i];
}
for (int i = 0, p = -1; i < nb; i++) {
if (LB.i[i] != p)
boff[mb++] = i, p = LB.i[i];
}
aoff[ma] = na, boff[mb] = nb;
ELE *cuMA, *cuMB, *cuMC;
int *cuAoff, *cuBoff;
cudaMalloc((void **) &cuMA, sizeof(ELE));
cudaMalloc((void **) &cuMB, sizeof(ELE));
cudaMalloc((void **) &cuMC, sizeof(ELE));
cudaMalloc((void **) &cuAoff, (ma+1)*sizeof(int));
cudaMalloc((void **) &cuBoff, (mb+1)*sizeof(int));
cudaMemcpy(cuMA, &LA, sizeof(ELE), cudaMemcpyHostToDevice);
cudaMemcpy(cuMB, &LB, sizeof(ELE), cudaMemcpyHostToDevice);
cudaMemcpy(cuAoff, aoff, sizeof(int)*(ma+1), cudaMemcpyHostToDevice);
cudaMemcpy(cuBoff, boff, sizeof(int)*(mb+1), cudaMemcpyHostToDevice);
int localSz = 1;
for (int i = 1; i <= 1024; i++) {
if (ma%i == 0) {
localSz = i;
}
}
dim3 cuBlock(localSz);
dim3 cuGrid(ma/localSz);
SpMM<<<cuGrid, cuBlock>>>(cuMA, cuMB, cuMC, cuAoff, cuBoff, mb);
CheckErr(cudaGetLastError());
cudaMemcpy(&LA, cuMC, sizeof(ELE), cudaMemcpyDeviceToHost);
uint32_t hash = 0;
#pragma omp parallel for reduction(+: hash)
for (int i = 0; i < ma; i++)
hash += LA.v[i];
printf("%u\n", hash);
cudaFree(cuMA);
cudaFree(cuMB);
cudaFree(cuMC);
cudaFree(cuAoff);
cudaFree(cuBoff);
}
bool cmp(int a, int b) {
if (LT.i[a] != LT.i[b])
return LT.i[a] < LT.i[b];
return LT.j[a] < LT.j[b];
}
int main() {
int N, M, R, nA, nB;
while (scanf("%d %d %d", &N, &M, &R) == 3) {
scanf("%d %d", &nA, &nB);
for (int i = 0; i < nA; i++)
scanf("%d %d %d", &LA.i[i], &LA.j[i], &LA.v[i]);
for (int i = 0; i < nB; i++)
scanf("%d %d %d", &LT.j[i], &LT.i[i], &LT.v[i]);
for (int i = 0; i < nB; i++)
D[i] = i;
sort(D, D+nB, cmp);
for (int i = 0; i < nB; i++)
LB.i[i] = LT.i[D[i]], LB.j[i] = LT.j[D[i]], LB.v[i] = LT.v[D[i]];
SpMV(N, M, R, LA, nA, LB, nB);
}
return 0;
}

Extra scratch memory

Each thread computes $C_{i, [L, R]}$, using a small extra array for the mapping, declared in private memory, i.e. registers, for speed. This approach runs three to four times faster than the one above.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <algorithm>
#include <omp.h>
using namespace std;
#define MAXN 1000005
#define MAXINT 256
#define MAXBLK 1024
#define MAXL (MAXN)
#define CheckErr(status) { gpuAssert((status), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, int abort=true) {
if (code != cudaSuccess) {
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
typedef struct ELE {
int i[MAXL], j[MAXL];
uint32_t v[MAXL];
} ELE;
typedef struct AUX {
int aoff[MAXL], boff[MAXL], bmap[MAXL];
int ma, mb, na, nb;
} AUX;
ELE LA, LB;
AUX AU;
uint32_t H[MAXL];
__device__ inline uint32_t rotate_left(uint32_t x, uint32_t n) {
return (x << n) | (x >> (32-n));
}
__device__ inline uint32_t encrypt(uint32_t m, uint32_t key) {
return (rotate_left(m, key&31) + key)^key;
}
__global__ void SpMM(ELE *LA, ELE *LB, uint32_t *H, AUX *AU, int T) {
#define INTSZ MAXINT
__shared__ uint32_t buff[MAXBLK];
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int x = idx / T, y = idx % T;
int L = y * INTSZ;
int R = L + INTSZ;
uint32_t tmp[INTSZ];
for (int i = 0; i < INTSZ; i++)
tmp[i] = 0;
int lx = AU->aoff[x], rx = AU->aoff[x+1];
for (int i = lx; i < rx; i++) {
if (AU->bmap[LA->j[i]] != -1) {
int k = AU->bmap[LA->j[i]];
uint32_t val = LA->v[i];
int ly = AU->boff[k], ry = AU->boff[k+1];
for (int j = ly; j < ry; j++) {
if (L <= LB->j[j] && LB->j[j] < R)
tmp[LB->j[j] - L] += val * LB->v[j];
}
}
}
uint32_t hash = 0;
uint32_t X = LA->i[AU->aoff[x]];
for (int i = 0; i < INTSZ; i++) {
if (tmp[i])
hash += encrypt((X+1)*(i+L+1), tmp[i]);
}
buff[threadIdx.x] = hash;
__syncthreads();
for (int i = blockDim.x>>1; i; i >>= 1) {
if (threadIdx.x < i)
buff[threadIdx.x] += buff[threadIdx.x + i];
__syncthreads();
}
if (threadIdx.x == 0)
H[blockIdx.x] = buff[0];
#undef INTSZ
}
void SpMV(int N, int M, int R, ELE &LA, int na, ELE &LB, int nb) {
AU.ma = 0, AU.mb = 0;
AU.na = na, AU.nb = nb;
memset(AU.bmap, -1, sizeof(AU.bmap));
#pragma omp parallel sections
{
#pragma omp section
{
for (int i = 0, p = -1; i < na; i++) {
if (LA.i[i] != p)
AU.aoff[AU.ma++] = i, p = LA.i[i];
}
AU.aoff[AU.ma] = na;
}
#pragma omp section
{
for (int i = 0, p = -1; i < nb; i++) {
if (LB.i[i] != p) {
AU.bmap[LB.i[i]] = AU.mb;
AU.boff[AU.mb++] = i, p = LB.i[i];
}
}
AU.boff[AU.mb] = nb;
}
}
uint32_t *cuHH;
ELE *cuMA, *cuMB;
AUX *cuAU;
cudaMalloc((void **) &cuHH, sizeof(H));
cudaMalloc((void **) &cuMA, sizeof(ELE));
cudaMalloc((void **) &cuMB, sizeof(ELE));
cudaMalloc((void **) &cuAU, sizeof(AUX));
cudaMemcpy(cuHH, &H, sizeof(H), cudaMemcpyHostToDevice);
cudaMemcpy(cuMA, &LA, sizeof(ELE), cudaMemcpyHostToDevice);
cudaMemcpy(cuMB, &LB, sizeof(ELE), cudaMemcpyHostToDevice);
cudaMemcpy(cuAU, &AU, sizeof(AUX), cudaMemcpyHostToDevice);
int roundR = (R + MAXINT-1) / MAXINT * MAXINT;
int TOT = N * roundR;
while (TOT / MAXINT % MAXBLK)
TOT ++;
dim3 cuBlock(MAXBLK); // MAXTHREAD
dim3 cuGrid(TOT / MAXINT / MAXBLK);
// printf("%d\n", TOT/MAXINT);
SpMM<<<cuGrid, cuBlock>>>(cuMA, cuMB, cuHH, cuAU, roundR / MAXINT);
CheckErr(cudaGetLastError());
cudaMemcpy(H, cuHH, sizeof(H), cudaMemcpyDeviceToHost);
uint32_t hash = 0;
#ifdef _OPENMP
omp_set_num_threads(4);
#endif
#pragma omp parallel for reduction(+: hash)
for (int i = 0; i < TOT/MAXINT/MAXBLK; i++)
hash += H[i];
printf("%u\n", hash);
cudaFree(cuMA);
cudaFree(cuMB);
cudaFree(cuHH);
cudaFree(cuAU);
}
inline int readchar() {
const int N = 1048576;
static char buf[N];
static char *p = buf, *end = buf;
if(p == end) {
if((end = buf + fread(buf, 1, N, stdin)) == buf) return EOF;
p = buf;
}
return *p++;
}
inline int ReadInt(int *x) {
static char c, neg;
while((c = readchar()) < '-') {if(c == EOF) return 0;}
neg = (c == '-') ? -1 : 1;
*x = (neg == 1) ? c-'0' : 0;
while((c = readchar()) >= '0')
*x = (*x << 3) + (*x << 1) + c-'0';
*x *= neg;
return 1;
}
inline uint32_t ReadUint(uint32_t *x) {
static char c, neg;
while((c = readchar()) < '-') {if(c == EOF) return 0;}
neg = (c == '-') ? -1 : 1;
*x = (neg == 1) ? c-'0' : 0;
while((c = readchar()) >= '0')
*x = (*x << 3) + (*x << 1) + c-'0';
*x *= neg;
return 1;
}
int main() {
int N, M, R, nA, nB;
// scanf("%d %d %d", &N, &M, &R);
// scanf("%d %d", &nA, &nB);
ReadInt(&N), ReadInt(&M), ReadInt(&R);
ReadInt(&nA), ReadInt(&nB);
for (int i = 0; i < nA; i++)
ReadInt(&LA.i[i]), ReadInt(&LA.j[i]), ReadUint(&LA.v[i]);
// scanf("%d%d%d", &LA.i[i], &LA.j[i], &LA.v[i]);
for (int i = 0; i < nB; i++)
ReadInt(&LB.i[i]), ReadInt(&LB.j[i]), ReadUint(&LB.v[i]);
// scanf("%d%d%d", &LB.i[i], &LB.j[i], &LB.v[i]);
SpMV(N, M, R, LA, nA, LB, nB);
// }
return 0;
}

批改娘 10106. Multiple Device (CUDA)

Problem Description

Xiao-Ming's math homework is about computing square matrices; please help him out!

The problem gives several $N \times N$ matrices and the following $2$ subproblems.

  • $X = AB+CD$
  • $Y = ABE+CDF$

sequence.c

#include <stdio.h>
#include <stdint.h>
// #define DEBUG
#define UINT uint32_t
#define MAXN 1024
void multiply(int N, UINT A[][MAXN], UINT B[][MAXN], UINT C[][MAXN]) {
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
UINT sum = 0; // overflow, let it go.
for (int k = 0; k < N; k++)
sum += A[i][k] * B[k][j];
C[i][j] = sum;
}
}
}
void add(int N, UINT A[][MAXN], UINT B[][MAXN], UINT C[][MAXN]) {
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++)
C[i][j] = A[i][j] + B[i][j];
}
}
void rand_gen(UINT c, int N, UINT A[][MAXN]) {
UINT x = 2, n = N*N;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
x = (x * x + c + i + j)%n;
A[i][j] = x;
}
}
}
void print_matrix(int N, UINT A[][MAXN]) {
for (int i = 0; i < N; i++) {
fprintf(stderr, "[");
for (int j = 0; j < N; j++)
fprintf(stderr, " %u", A[i][j]);
fprintf(stderr, " ]\n");
}
}
UINT signature(int N, UINT A[][MAXN]) {
UINT h = 0;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++)
h = (h + A[i][j]) * 2654435761LU;
}
return h;
}
UINT IN[6][MAXN][MAXN], TMP[6][MAXN][MAXN];
int main() {
int N, S[6];
scanf("%d", &N);
for (int i = 0; i < 6; i++) {
scanf("%d", &S[i]);
rand_gen(S[i], N, IN[i]);
}
// AB
multiply(N, IN[0], IN[1], TMP[0]);
// CD
multiply(N, IN[2], IN[3], TMP[1]);
// AB+CD
add(N, TMP[0], TMP[1], TMP[2]);
printf("%u\n", signature(N, TMP[2]));
// ABE
multiply(N, TMP[0], IN[4], TMP[3]);
// CDF
multiply(N, TMP[1], IN[5], TMP[4]);
// ABE+CDF
add(N, TMP[3], TMP[4], TMP[5]);
printf("%u\n", signature(N, TMP[5]));
return 0;
}

Input Format

There are multiple test cases. The first line of each contains an integer $N$, the size of the given $N \times N$ matrices; the second line contains $6$ integers, the generation seeds of matrices $A, B, C, D, E, F$.

  • $1 \le N \le 1024$
  • $0 \le S_i \le 2^{31}$

Output Format

Output two lines with the hash values of $X$ and $Y$; see sequence.c for the reference flow.

Compilation

$ nvcc -Xcompiler "-O2 -fopenmp" main.cu -o main

Sample Input

2
0 1 2 3 4 5
10
0 1 2 3 4 5

Sample Output

2385860290
1374821695
617438354
1897844131

Solution

Designed together with the OpenMP version, modeled after the OpenCL solution.

#include <stdio.h>
#include <assert.h>
#include <inttypes.h>
#include <string.h>
#include <cuda.h>
#define MAXGPU 3
#define MAXN 1024
#define GPULOCAL 64
#define UNLOOP 8
#define CheckErr(status) { gpuAssert((status), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, int abort=true) {
if (code != cudaSuccess) {
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
uint32_t hostMtx[MAXGPU][6][MAXN*MAXN];
uint32_t hostMid[MAXGPU][2][MAXN*MAXN];
__global__ void matrixMul(uint32_t A[], uint32_t B[], uint32_t C[], int N) {
int r = blockIdx.x * blockDim.x + threadIdx.x;
int x = r / N, y = r % N;
uint32_t sum = 0;
for (int i = 0; i < N; i++)
sum += A[x*N + i] * B[i*N + y];
C[x * N + y] = sum;
}
__global__ void matrixAdd(uint32_t A[], uint32_t B[], uint32_t C[]) {
int r = blockIdx.x * blockDim.x + threadIdx.x;
C[r] = A[r] + B[r];
}
int readIn(uint32_t S[], int *n, int devId) {
int N, M;
if (scanf("%d", &N) != 1)
return 0;
M = 6;
for (int i = 0; i < M; i++)
assert(scanf("%d", &S[i]) == 1);
for (int p = 0; p < M; p++)
#pragma omp task
{
uint32_t x = 2, n = N*N, c = S[p];
x = 2;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
x = (x * x + c + i + j)%n;
hostMtx[devId][p][i*N+j] = x;
}
}
}
#pragma omp taskwait
*n = N;
return 1;
}
uint32_t writeOut(uint32_t *hostC, int N) {
uint32_t h = 0;
uint32_t *Cend = hostC + N*N, *C = hostC;
for (; C != Cend; C++)
h = (h + *C) * 2654435761LU;
return h;
}
void matrix_mul(int N, uint32_t *cuMtxA, uint32_t *cuMtxB, uint32_t *cuMtxC) {
int localSz = 1;
for (int i = 1; i <= 1024; i++) {
if (N*N % i == 0)
localSz = i;
}
dim3 cuBlock(localSz);
dim3 cuGrid(N*N/localSz);
matrixMul<<<cuGrid, cuBlock>>>(cuMtxA, cuMtxB, cuMtxC, N);
CheckErr(cudaGetLastError());
}
void matrix_add(int N, uint32_t *cuMtxA, uint32_t *cuMtxB, uint32_t *cuMtxC) {
int localSz = 1;
for (int i = 1; i <= 1024; i++) {
if (N*N % i == 0)
localSz = i;
}
dim3 cuBlock(localSz);
dim3 cuGrid(N*N/localSz);
matrixAdd<<<cuGrid, cuBlock>>>(cuMtxA, cuMtxB, cuMtxC);
CheckErr(cudaGetLastError());
}
void solver(int devId, int N, uint32_t *cuMtx[], uint32_t *cuMtxTmp[], uint32_t ret[]) {
uint32_t memSz = N*N*sizeof(uint32_t);
for (int i = 0; i < 6; i++) {
cudaMemcpy(cuMtx[i], hostMtx[devId][i], memSz, cudaMemcpyHostToDevice);
CheckErr(cudaGetLastError());
}
// cuMtxTmp[0] = AB
matrix_mul(N, cuMtx[0], cuMtx[1], cuMtxTmp[0]);
// cuMtxTmp[1] = CD
matrix_mul(N, cuMtx[2], cuMtx[3], cuMtxTmp[1]);
// cuMtxTmp[2] = ABE
matrix_mul(N, cuMtxTmp[0], cuMtx[4], cuMtxTmp[2]);
// cuMtxTmp[3] = CDF
matrix_mul(N, cuMtxTmp[1], cuMtx[5], cuMtxTmp[3]);
// cuMtxTmp[4] = AB + CD
matrix_add(N, cuMtxTmp[0], cuMtxTmp[1], cuMtxTmp[4]);
// cuMtxTmp[5] = ABE+CDF
matrix_add(N, cuMtxTmp[2], cuMtxTmp[3], cuMtxTmp[5]);
cudaMemcpy(hostMid[devId][0], cuMtxTmp[4], memSz, cudaMemcpyDeviceToHost);
cudaMemcpy(hostMid[devId][1], cuMtxTmp[5], memSz, cudaMemcpyDeviceToHost);
for (int i = 0; i < 2; i++)
#pragma omp task
{
ret[i] = writeOut(hostMid[devId][i], N);
}
#pragma omp taskwait
}
int main(int argc, char *argv[]) {
uint32_t *cuMtx[MAXGPU][6], *cuMtxTmp[MAXGPU][6];
uint32_t memSz = MAXN*MAXN*sizeof(uint32_t);
for (int p = 0; p < MAXGPU; p++) {
cudaSetDevice(p);
for (int i = 0; i < 6; i++) {
cudaMalloc((void **) &cuMtx[p][i], memSz);
CheckErr(cudaGetLastError());
}
for (int i = 0; i < 6; i++)
cudaMalloc((void **) &cuMtxTmp[p][i], memSz);
}
int inN = 0;
static uint32_t ansQue[32767][2];
#pragma omp parallel sections
{
#pragma omp section
{
cudaSetDevice(0);
while (1) {
int f = 0, N, pid = 0;
uint32_t S[32];
#pragma omp critical
{
f = readIn(S, &N, 0);
pid = inN;
inN += f;
}
if (f == 0)
break;
solver(0, N, cuMtx[0], cuMtxTmp[0], ansQue[pid]);
}
}
#pragma omp section
{
cudaSetDevice(1);
while (1) {
int f = 0, N, pid = 0;
uint32_t S[32];
#pragma omp critical
{
f = readIn(S, &N, 1);
pid = inN;
inN += f;
}
if (f == 0)
break;
solver(1, N, cuMtx[1], cuMtxTmp[1], ansQue[pid]);
}
}
#pragma omp section
{
cudaSetDevice(2);
while (1) {
int f = 0, N, pid = 0;
uint32_t S[32];
#pragma omp critical
{
f = readIn(S, &N, 2);
pid = inN;
inN += f;
}
if (f == 0)
break;
solver(2, N, cuMtx[2], cuMtxTmp[2], ansQue[pid]);
}
}
}
for (int i = 0; i < inN; i++)
printf("%u\n%u\n", ansQue[i][0], ansQue[i][1]);
for (int p = 0; p < MAXGPU; p++) {
cudaSetDevice(p);
for (int i = 0; i < 6; i++) {
cudaFree(cuMtx[p][i]); // free per-device buffers; the original indexed cuMtx[i], which is not a device pointer
cudaFree(cuMtxTmp[p][i]);
}
}
return 0;
}

批改娘 10104. Streams and Concurrency (CUDA)

Problem Description

Based on Nvidia - Streams and Concurrency: overlap data transfer with kernel execution to obtain a speedup.

  • Any program is acceptable
  • No input or output is specified

Testing Procedure

profiler script

Download nvprof.sh and nvvp.log

Compile and run

$ chmod +x nvprof.sh
$ nvcc -Xcompiler "-O2 -fopenmp" main.cu -o main
$ ./nvprof.sh ./main
$ cat nvvp.log

Accepted criterion: whether concurrency occurs on a single device during the run.

Accepted

Wrong Answer

Solution

This problem was set to exercise software-pipeline design: speeding up batch processing by using several streams so that data transfer and computation overlap.

Designing the judge for this was quite frustrating. In the end, the log produced via Nvidia's debug environment variables is enough to analyze whether execution intervals overlap, and the problem was solved. The topic is outside the course scope; I proposed that the professor teach it and, as expected, got shot down.

/* Copyright (c) 1993-2015, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <stdio.h>
// Convenience function for checking CUDA runtime API results
// can be wrapped around any runtime API call. No-op in release builds.
inline
cudaError_t checkCuda(cudaError_t result)
{
#if defined(DEBUG) || defined(_DEBUG)
if (result != cudaSuccess) {
fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
assert(result == cudaSuccess);
}
#endif
return result;
}
__global__ void kernel(float *a, int offset)
{
int i = offset + threadIdx.x + blockIdx.x*blockDim.x;
float x = (float)i;
float s = sinf(x);
float c = cosf(x);
a[i] = a[i] + sqrtf(s*s+c*c);
}
float maxError(float *a, int n)
{
float maxE = 0;
for (int i = 0; i < n; i++) {
float error = fabs(a[i]-1.0f);
if (error > maxE) maxE = error;
}
return maxE;
}
int main(int argc, char **argv)
{
const int blockSize = 256, nStreams = 4;
const int n = 4 * 1024 * blockSize * nStreams;
const int streamSize = n / nStreams;
const int streamBytes = streamSize * sizeof(float);
const int bytes = n * sizeof(float);
int devId = 0;
if (argc > 1) devId = atoi(argv[1]);
cudaDeviceProp prop;
checkCuda( cudaGetDeviceProperties(&prop, devId));
printf("Device : %s\n", prop.name);
checkCuda( cudaSetDevice(devId) );
// allocate pinned host memory and device memory
float *a, *d_a;
checkCuda( cudaMallocHost((void**)&a, bytes) ); // host pinned
checkCuda( cudaMalloc((void**)&d_a, bytes) ); // device
float ms; // elapsed time in milliseconds
// create events and streams
cudaEvent_t startEvent, stopEvent, dummyEvent;
cudaStream_t stream[nStreams];
checkCuda( cudaEventCreate(&startEvent) );
checkCuda( cudaEventCreate(&stopEvent) );
checkCuda( cudaEventCreate(&dummyEvent) );
for (int i = 0; i < nStreams; ++i)
checkCuda( cudaStreamCreate(&stream[i]) );
// baseline case - sequential transfer and execute
memset(a, 0, bytes);
checkCuda( cudaEventRecord(startEvent,0) );
checkCuda( cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice) );
kernel<<<n/blockSize, blockSize>>>(d_a, 0);
checkCuda( cudaMemcpy(a, d_a, bytes, cudaMemcpyDeviceToHost) );
checkCuda( cudaEventRecord(stopEvent, 0) );
checkCuda( cudaEventSynchronize(stopEvent) );
checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );
printf("Time for sequential transfer and execute (ms): %f\n", ms);
printf(" max error: %e\n", maxError(a, n));
// asynchronous version 1: loop over {copy, kernel, copy}
memset(a, 0, bytes);
checkCuda( cudaEventRecord(startEvent,0) );
for (int i = 0; i < nStreams; ++i) {
int offset = i * streamSize;
checkCuda( cudaMemcpyAsync(&d_a[offset], &a[offset],
streamBytes, cudaMemcpyHostToDevice,
stream[i]) );
kernel<<<streamSize/blockSize, blockSize, 0, stream[i]>>>(d_a, offset);
checkCuda( cudaMemcpyAsync(&a[offset], &d_a[offset],
streamBytes, cudaMemcpyDeviceToHost,
stream[i]) );
}
checkCuda( cudaEventRecord(stopEvent, 0) );
checkCuda( cudaEventSynchronize(stopEvent) );
checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );
printf("Time for asynchronous V1 transfer and execute (ms): %f\n", ms);
printf(" max error: %e\n", maxError(a, n));
// asynchronous version 2:
// loop over copy, loop over kernel, loop over copy
memset(a, 0, bytes);
checkCuda( cudaEventRecord(startEvent,0) );
for (int i = 0; i < nStreams; ++i)
{
int offset = i * streamSize;
checkCuda( cudaMemcpyAsync(&d_a[offset], &a[offset],
streamBytes, cudaMemcpyHostToDevice,
stream[i]) );
}
for (int i = 0; i < nStreams; ++i)
{
int offset = i * streamSize;
kernel<<<streamSize/blockSize, blockSize, 0, stream[i]>>>(d_a, offset);
}
for (int i = 0; i < nStreams; ++i)
{
int offset = i * streamSize;
checkCuda( cudaMemcpyAsync(&a[offset], &d_a[offset],
streamBytes, cudaMemcpyDeviceToHost,
stream[i]) );
}
checkCuda( cudaEventRecord(stopEvent, 0) );
checkCuda( cudaEventSynchronize(stopEvent) );
checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );
printf("Time for asynchronous V2 transfer and execute (ms): %f\n", ms);
printf(" max error: %e\n", maxError(a, n));
// cleanup
checkCuda( cudaEventDestroy(startEvent) );
checkCuda( cudaEventDestroy(stopEvent) );
checkCuda( cudaEventDestroy(dummyEvent) );
for (int i = 0; i < nStreams; ++i)
checkCuda( cudaStreamDestroy(stream[i]) );
cudaFree(d_a);
cudaFreeHost(a);
return 0;
}

批改娘 10103. Advanced Matrix Calculator (CUDA)

Problem Description

Xiao Ming's math homework involves computing square matrices. Please help him out!

You are given several $N \times N$ matrices and $Q$ subproblems, each consisting only of additions and multiplications.

sequence.c

#include <stdio.h>
#include <stdint.h>
// #define DEBUG
#define UINT uint32_t
#define MAXN 1024
void multiply(int N, UINT A[][MAXN], UINT B[][MAXN], UINT C[][MAXN]) {
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
UINT sum = 0; // overflow, let it go.
for (int k = 0; k < N; k++)
sum += A[i][k] * B[k][j];
C[i][j] = sum;
}
}
}
void add(int N, UINT A[][MAXN], UINT B[][MAXN], UINT C[][MAXN]) {
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++)
C[i][j] = A[i][j] + B[i][j];
}
}
void rand_gen(UINT c, int N, UINT A[][MAXN]) {
UINT x = 2, n = N*N;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
x = (x * x + c + i + j)%n;
A[i][j] = x;
}
}
}
void print_matrix(int N, UINT A[][MAXN]) {
for (int i = 0; i < N; i++) {
fprintf(stderr, "[");
for (int j = 0; j < N; j++)
fprintf(stderr, " %u", A[i][j]);
fprintf(stderr, " ]\n");
}
}
UINT signature(int N, UINT A[][MAXN]) {
UINT h = 0;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++)
h = (h + A[i][j]) * 2654435761LU;
}
return h;
}
UINT IN[6][MAXN][MAXN], TMP[6][MAXN][MAXN];
int main() {
int N, S[6];
scanf("%d", &N);
for (int i = 0; i < 6; i++) {
scanf("%d", &S[i]);
rand_gen(S[i], N, IN[i]);
}
// AB
multiply(N, IN[0], IN[1], TMP[0]);
// CD
multiply(N, IN[2], IN[3], TMP[1]);
// AB+CD
add(N, TMP[0], TMP[1], TMP[2]);
printf("%u\n", signature(N, TMP[2]));
// ABE
multiply(N, TMP[0], IN[4], TMP[3]);
// CDF
multiply(N, TMP[1], IN[5], TMP[4]);
// ABE+CDF
add(N, TMP[3], TMP[4], TMP[5]);
printf("%u\n", signature(N, TMP[5]));
return 0;
}

Input Format

There is a single test case. The first line contains two integers $M, N$, meaning the problem provides $M$ matrices of size $N \times N$. The second line contains $M$ integers $S_i$, where $S_i$ is the generation seed of the $i$-th matrix. Finally, a line with an integer $Q$ is followed by $Q$ query lines; each query line holds a string $E$, the matrix expression to evaluate, consisting only of the letters A-Z and +.

  • $1 \le M \le 26$
  • $1 \le N \le 1024$
  • $0 \le S_i \le 2^{31}$
  • $1 \le Q \le 100$
  • $|E| \le 26$

Output Format

Output one line for each query expression.

Sample Input 1

6 2
0 1 2 3 4 5
2
AB+CD
ABE+CDF

Sample Output 1

2385860290
1374821695

Compilation

$ nvcc -Xcompiler "-O2 -fopenmp" main.cu -o main
$ ./main

Solution

The approach mirrors the OpenCL version: three devices work together, and each expression is handed to exactly one device, so the goal is to distribute the expressions so that the longest per-device computation time is minimized. The rest of the handling is much the same, and after tuning it comes out somewhat faster than the OpenCL version. The scheduling idea is sketched right after this paragraph.
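The distribution is plain greedy list scheduling. A minimal sketch, where cost[], weight[], owner[], and schedule() are names invented here for illustration (the full solution below uses procBuff[i]->h and gpuSpeed[] for the same roles, and sorts by cost beforehand via qsort):

// Greedy list scheduling: assume cost[] is already sorted in descending
// order; always hand the next expression to the device whose weighted
// accumulated load is currently smallest.
void schedule(int q, const long long cost[], int devs, const float weight[],
              int owner[]) {
    long long load[8] = {0};          // accumulated work per device (devs <= 8)
    for (int i = 0; i < q; i++) {
        int mn = 0;
        for (int j = 1; j < devs; j++)
            if (load[j] * weight[j] < load[mn] * weight[mn])
                mn = j;
        owner[i] = mn;                // expression i runs on device mn
        load[mn] += cost[i];
    }
}

Scheduling the heaviest expressions first keeps a large job from landing on an already-busy device late in the process.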

CUDA selects the compute device through cudaSetDevice(p);, and the runtime presumably keeps that selection in per-thread state (as if stored in a variable declared with the __thread keyword). So we spawn three threads with OpenMP and let each one set its own device: the selections do not interfere, because a __thread variable is independent in each thread. The pattern is sketched below, followed by the full solution.
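A minimal sketch of that per-thread binding, assuming three visible devices (save as a .cu file and build with the nvcc line above):

#include <cstdio>
#include <omp.h>

int main() {
    // One OpenMP thread per GPU. cudaSetDevice() affects only the calling
    // host thread, so the three bindings are independent of one another.
    #pragma omp parallel num_threads(3)
    {
        int dev = omp_get_thread_num();
        cudaSetDevice(dev);
        // ... allocate device memory, copy inputs, launch kernels on `dev` ...
        printf("host thread %d bound to GPU %d\n", dev, dev);
    }
    return 0;
}

The full solution follows.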

#include <stdio.h>
#include <assert.h>
#include <inttypes.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <stdlib.h> // calloc, qsort, abs, exit
#include <omp.h>
#define MAXGPU 3
#define MAXN 1024
#define MAXM 32
#define MAXMID 32
#define CheckErr(status) { gpuAssert((status), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, int abort=true) {
if (code != cudaSuccess) {
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
uint32_t hostMtx[MAXM][MAXN*MAXN];
uint32_t hostMid[MAXGPU][MAXN*MAXN];
int N = MAXN, M, Q;
int clNeedDevCnt = 3;
__global__ void matrixMul(uint32_t A[], uint32_t B[], uint32_t C[], int N) {
int r = blockIdx.x * blockDim.x + threadIdx.x;
int x = r / N, y = r % N;
uint32_t sum = 0;
for (int i = 0; i < N; i++)
sum += A[x*N + i] * B[i*N + y];
C[x * N + y] = sum;
}
__global__ void matrixAdd(uint32_t A[], uint32_t B[], uint32_t C[]) {
int r = blockIdx.x * blockDim.x + threadIdx.x;
C[r] = A[r] + B[r];
}
void matrix_multiply(uint32_t *cuMtxA, uint32_t *cuMtxB, uint32_t *cuMtxC) {
int localSz = 1;
for (int i = 1; i <= 1024; i++) {
if (N*N % i == 0)
localSz = i;
}
dim3 cuBlock(localSz);
dim3 cuGrid(N*N/localSz);
matrixMul<<<cuGrid, cuBlock>>>(cuMtxA, cuMtxB, cuMtxC, N);
CheckErr(cudaGetLastError());
}
void matrix_add(uint32_t *cuMtxA, uint32_t *cuMtxB, uint32_t *cuMtxC) {
int localSz = 1;
for (int i = 1; i <= 1024; i++) {
if (N*N % i == 0)
localSz = i;
}
dim3 cuBlock(localSz);
dim3 cuGrid(N*N/localSz);
matrixAdd<<<cuGrid, cuBlock>>>(cuMtxA, cuMtxB, cuMtxC);
CheckErr(cudaGetLastError());
}
char expr[1024];
typedef struct Node {
struct Node *l, *r;
int opcode;
uint32_t *hostV, *cuV;
int regNeed, regId;
int pid, mid;
long long h;
} Node;
void replaceReg(Node *u, int a, int b) {
if (u->l == NULL) return ;
if (u->regId == a)
u->regId = b;
replaceReg(u->l, a, b);
replaceReg(u->r, a, b);
}
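// Sethi-Ullman-style numbering: regNeed is the number of intermediate device
// buffers this subtree needs, regId is the buffer that will hold its result,
// and h is an estimated cost used later for load balancing across devices.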
void updateNode(Node *u, Node *l, Node *r, int opcode) {
u->l = l, u->r = r, u->opcode = opcode;
if (opcode == '+') {
u->h = u->l->h + u->r->h + N;
// -- register allocation
if (u->l->regNeed == u->r->regNeed) {
u->regNeed = u->l->regNeed + 1;
u->regId = u->regNeed;
replaceReg(u->r, u->r->regId, u->regId);
} else {
u->regNeed = u->l->regNeed > u->r->regNeed ? u->l->regNeed : u->r->regNeed;
u->regId = u->regNeed;
}
} else if (opcode == '*') {
u->h = u->l->h + u->r->h + N*N;
// -- register allocation
if (abs(u->l->regNeed - u->r->regNeed) == 1) {
u->regNeed = u->l->regNeed + 1;
u->regId = u->regNeed;
} else if (u->l->regNeed == u->r->regNeed) {
u->regNeed = u->l->regNeed + 2;
u->regId = u->regNeed;
replaceReg(u->r, u->r->regId, u->regId-1);
} else {
u->regNeed = u->l->regNeed > u->r->regNeed ? u->l->regNeed : u->r->regNeed;
u->regId = u->regNeed;
int a, b;
if (u->l->regId == u->regId) {
a = u->l->regId, b = u->l->regId-1;
replaceReg(u->l, a, -1);
replaceReg(u->l, b, a);
replaceReg(u->l, -1, b);
} else {
a = u->r->regId, b = u->r->regId-1;
replaceReg(u->r, a, -1);
replaceReg(u->r, b, a);
replaceReg(u->r, -1, b);
}
}
}
assert(u->regId < MAXMID);
}
Node* parseExpr(int l, int r, char expr[], int procId) {
Node *u = (Node *) calloc(1, sizeof(Node));
u->pid = procId;
if (l == r) {
int idx = expr[l] - 'A';
u->mid = idx;
u->h = 0;
return u;
}
int cnt = 0;
for (int i = l; i <= r; i++) {
if (expr[i] == '(') {
cnt++;
} else if (expr[i] == ')') {
cnt--;
} else if (expr[i] == '+' && cnt == 0) {
updateNode(u, parseExpr(l, i-1, expr, procId), parseExpr(i+1, r, expr, procId), '+');
return u;
}
}
for (int i = l; i <= r; i++) {
if (expr[i] == '(') {
if (cnt == 0 && i != l) {
updateNode(u, parseExpr(l, i-1, expr, procId), parseExpr(i, r, expr, procId), '*');
return u;
}
cnt++;
} else if (expr[i] == ')') {
cnt--;
} else if (expr[i] >= 'A' && expr[i] <= 'Z' && cnt == 0 && i != l) {
updateNode(u, parseExpr(l, i-1, expr, procId), parseExpr(i, r, expr, procId), '*');
return u;
}
}
free(u);
return parseExpr(l+1, r-1, expr, procId);
}
uint32_t writeOut(uint32_t *hostC) {
uint32_t h = 0;
uint32_t *Cend = hostC + N*N, *C = hostC;
for (; C != Cend; C++)
h = (h + *C) * 2654435761LU;
return h;
}
uint32_t *cuMemMid[MAXGPU][MAXMID];
uint32_t *cuMemIn[MAXGPU][MAXM];
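// Flatten the expression tree into postorder, descending into the subtree
// with the larger register need first so intermediate buffers can be reused;
// each inner node is pointed at the device buffer assigned to its regId.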
void memRelocation(Node *u, int did, Node *nodes[], int *offset) {
if (u->l == NULL) {
nodes[*offset] = u, (*offset)++;
return ;
}
u->cuV = cuMemMid[did][u->regId];
if (u->l->regNeed > u->r->regNeed) {
memRelocation(u->l, did, nodes, offset);
memRelocation(u->r, did, nodes, offset);
} else {
memRelocation(u->r, did, nodes, offset);
memRelocation(u->l, did, nodes, offset);
}
fprintf(stderr, "reg%d = %s ", u->regId, u->opcode == '+' ? "add" : "mul");
if (u->l->l == NULL) fprintf(stderr, "%c ", u->l->mid + 'A');
else fprintf(stderr, "reg%d ", u->l->regId);
if (u->r->l == NULL) fprintf(stderr, "%c\n", u->r->mid + 'A');
else fprintf(stderr, "reg%d\n", u->r->regId);
nodes[*offset] = u, (*offset)++;
return ;
}
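// One OpenMP thread per device: each thread binds its GPU with cudaSetDevice()
// and drains its own work queue, so the devices compute concurrently without
// sharing any CUDA state.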
int executeGPU(Node *workQue[][128], int workQueSz[], uint32_t resultBuff[]) {
Node* nodes[MAXGPU][128];
int offset[MAXGPU] = {};
uint32_t memSz = N*N*sizeof(uint32_t);
int memDeploy[MAXGPU][MAXM] = {};
int regDeploy[MAXGPU][MAXMID] = {};
// -- execute multi-device
#pragma omp parallel for
for (int p = 0; p < clNeedDevCnt; p++) {
cudaSetDevice(p);
for (int q = 0; q < workQueSz[p]; q++) {
// -- flatten binary tree
offset[p] = 0;
memRelocation(workQue[p][q], p, nodes[p], &offset[p]);
// -- execute in order
for (int i = 0; i < offset[p]; i++) {
Node *u = nodes[p][i];
// -- is leaf, deploy memory copy
if (u->l == NULL) {
if (!memDeploy[p][u->mid]) {
cudaMalloc((void **) &cuMemIn[p][u->mid], memSz);
cudaMemcpy(cuMemIn[p][u->mid], hostMtx[u->mid], memSz, cudaMemcpyHostToDevice);
memDeploy[p][u->mid] = 1;
}
u->cuV = cuMemIn[p][u->mid];
continue;
}
// -- inner node using minimum #buffer
if (!regDeploy[p][u->regId]) {
cudaMalloc((void **) &cuMemMid[p][u->regId], memSz);
regDeploy[p][u->regId] = 1; // mark allocated so later queries reuse the buffer
}
if (u->cuV == NULL)
u->cuV = cuMemMid[p][u->regId];
if (u->opcode == '*')
matrix_multiply(u->l->cuV, u->r->cuV, u->cuV);
else
matrix_add(u->l->cuV, u->r->cuV, u->cuV);
}
// -- read back and store answer
Node *root = workQue[p][q];
fprintf(stderr, "register need %d\n", root->regNeed);
cudaMemcpy(hostMid[p], root->cuV, memSz, cudaMemcpyDeviceToHost);
uint32_t ret = writeOut(hostMid[p]);
resultBuff[root->pid] = ret;
// -- free inner node buffer
for (int i = 0; i < offset[p]; i++) {
Node *u = nodes[p][i];
if (u->l != NULL && u->hostV)
free(u->hostV);
free(u);
}
}
// -- free buffer
cudaSetDevice(p);
for (int i = 0; i < MAXMID; i++) {
cudaFree(cuMemMid[p][i]);
}
for (int i = 0; i < M; i++) {
cudaFree(cuMemIn[p][i]);
}
}
return 1;
}
int readIn() {
if (scanf("%s", expr) != 1)
return 0;
return 1;
}
int balance_cmp(const void *a, const void *b) {
Node *x = *(Node **) a;
Node *y = *(Node **) b;
if (x->h == y->h) return 0;
if (x->h < y->h) return 1;
return -1;
}
void onStart() {
int S[64];
assert(scanf("%d %d", &M, &N) == 2);
for (int i = 0; i < M; i++)
assert(scanf("%d", &S[i]) == 1);
#pragma omp parallel for
for (int p = 0; p < M; p++) {
uint32_t x = 2, n = N*N;
uint32_t c = S[p];
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
x = (x * x + c + i + j)%n;
hostMtx[p][i*N+j] = x;
}
}
}
Node *procBuff[128];
if (scanf("%d", &Q) != 1)
return ;
for (int i = 0; i < Q; i++) {
readIn();
int expr_len = strlen(expr);
procBuff[i] = parseExpr(0, expr_len-1, expr, i);
}
qsort(procBuff, Q, sizeof(Node*), balance_cmp);
float gpuSpeed[MAXGPU] = {1.f, 1.8f, 2.0f};
long long workload[MAXGPU] = {};
int workQueSz[MAXGPU] = {};
uint32_t resultBuff[128] = {}; // indexed by query id (Q <= 100), not by GPU
Node *workQue[MAXGPU][128];
for (int i = 0; i < Q; i++) {
int mn = 0;
for (int j = 0; j < clNeedDevCnt; j++) {
if (workload[j]*gpuSpeed[j] < workload[mn]*gpuSpeed[mn])
mn = j;
}
workload[mn] += procBuff[i]->h;
workQue[mn][workQueSz[mn]++] = procBuff[i];
}
executeGPU(workQue, workQueSz, resultBuff);
for (int i = 0; i < Q; i++)
printf("%u\n", resultBuff[i]);
}
int main(int argc, char *argv[]) {
onStart();
return 0;
}