批改娘 10113. Longest Common Subsequence II (CUDA)

contents

  1. 1. 題目描述
  2. 2. 輸入格式
  3. 3. 輸出格式
  4. 4. 範例輸入
  5. 5. 範例輸出
  6. 6. 編譯參數
  7. 7. Solution

題目描述

給兩個字串 $X, \; Y$,在兩個字串中都有出現且最長的子序列 (subsequence),就是最長共同子字串

輸入格式

有多組測資,每組測資有兩行字串 $X, \; Y$,$X, \; Y$ 只由 A T C G 四個字母構成。

  • $1 \le |X|, |Y| \le 60000$

輸出格式

針對每一組測資,輸出一行 $X, \; Y$ 的最長共同子字串長度以及任何一組最長共同子字串。

範例輸入

1
2
3
4
TCA
GTA
TGGAC
TATCT

範例輸出

1
2
3
4
2
TA
3
TAC

編譯參數

1
$ nvcc -Xcompiler "-O2 -fopenmp" main.cu -o main

Solution

附錄為舊版設計,可以針對 OpenMP 的設計再重新設計 CUDA 版本。

舊版設計根據 CPU 平行與不平行以及 GPU 三種狀態,根據閥值決定要選擇運行哪一種版本。若加上足夠小的情況下,直接用 $O(N^2)$ 表格直接回溯,和 OpenMP 判斷閥值的公式,效能還能在加速個兩到三倍,但整體而言未必比純 OpenMP 還要快,因為有 GPU 啟動的 overhead,要跑夠多的測資才看得出來。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
#include <cstdio>
#include <cstdlib>
#include <algorithm>
#include <omp.h>
using namespace std;
#define MAXN 60005
#define CHARSET 4
#define max(x, y) (x) > (y) ? (x) : (y)
typedef unsigned short uint16;
static char A[MAXN], B[MAXN];
static int c2i[128];
static char i2c[CHARSET+1] = "ACGT";
static int *cuP;
static char *cuB;
static uint16 *cuDP;
__global__ void prepare(int *P, char *B, int nb) {
int i = threadIdx.x;
int *p = P + i*MAXN;
char c = "ACGT"[i];
p[0] = 0;
for (int j = 1; j <= nb; j++)
p[j] = (B[j] == c) ? j : p[j-1];
}
__global__ void run(int nb, uint16 *dpIn, uint16 *dpOut, int *P) {
int j = blockDim.x*blockIdx.x+threadIdx.x+1;
if (j > nb) return ;
int pos = P[j];
uint16 t1 = pos ? dpIn[pos-1]+1 : 0;
uint16 t2 = dpIn[j];
dpOut[j] = t1 > t2 ? t1 : t2;
}
int lcs_len_seq(const char *A, int na, const char *B, int nb, uint16 dpf[]) {
static uint16 dp[2][MAXN];
memset(dp[0], 0, sizeof(uint16)*(nb+1));
dp[1][0] = 0, dpf[0] = 0;
for (int i = 1; i <= na; i++) {
for (int j = 1; j <= nb; j++) {
if (A[i-1] == B[j-1])
dp[1][j] = dp[0][j-1]+1;
else
dp[1][j] = max(dp[1][j-1], dp[0][j]);
}
memcpy(dp[0], dp[1], sizeof(uint16)*(nb+1));
}
for (int i = 0; i <= nb; i++)
dpf[i] = dp[0][i];
return dpf[nb];
}
int lcs_len_omp(const char *A, int na, const char *B, int nb, uint16 dpf[]) {
static int P[CHARSET][MAXN];
static uint16 dp[2][MAXN];
A--, B--;
#pragma omp parallel for
for (int i = 0; i < CHARSET; i++) {
memset(P[i], 0, sizeof(int)*(nb+1));
for (int j = 1; j <= nb; j++)
P[i][j] = (B[j] == i2c[i])? j : P[i][j-1];
}
for (int i = 0; i < 2; i++)
memset(dp[i], 0, sizeof(uint16)*(nb+1));
#pragma omp parallel
for (int i = 1; i <= na; i++) {
int *Pv = P[c2i[A[i]]];
#pragma omp for
for (int j = 1; j <= nb; j++) {
int last_match = Pv[j];
uint16 tmp = last_match ? dp[i&1^1][last_match-1]+1 : 0;
dp[i&1][j] = max(dp[i&1^1][j], tmp);
}
}
for (int i = 0; i <= nb; i++)
dpf[i] = dp[na&1][i];
return dpf[nb];
}
int lcs_len(const char *A, int na, const char *B, int nb, uint16 dpf[]) {
if (max(na, nb) < 2048)
return lcs_len_seq(A, na, B, nb, dpf);
if (nb < 10000)
return lcs_len_omp(A, na, B, nb, dpf);
B--;
cudaMemcpy(cuB, B, sizeof(char)*(nb+1), cudaMemcpyHostToDevice);
cudaMemset(cuDP, 0, sizeof(uint16)*(nb+1));
cudaMemset(cuDP+MAXN, 0, sizeof(uint16)*(nb+1));

int bsz = 1024;
dim3 bb(bsz);
dim3 gg((nb+bsz-1)/bsz);
prepare<<<1, 4>>>(cuP, cuB, nb);
for (int i = 0; i < na; i++) {
int v = c2i[A[i]];
run<<<gg, bb>>>(nb, cuDP+(i&1)*MAXN, cuDP+((i&1)^1)*MAXN, cuP+v*MAXN);
}
cudaMemcpy(dpf, cuDP+(na&1)*MAXN, sizeof(uint16)*(nb+1), cudaMemcpyDeviceToHost);
return dpf[nb];
}
char* alloc_str(int sz) {
return (char *) calloc(sz, sizeof(char));
}
char* substr(const char *s, int pos, int len) {
char *t = alloc_str(len+1);
memcpy(t, s+pos, len);
return t;
}
char* cat(const char *sa, const char *sb) {
int na = strlen(sa), nb = strlen(sb);
char *t = alloc_str(na + nb + 1);
memcpy(t, sa, na);
memcpy(t+na, sb, nb);
return t;
}
char* reverse(const char *s, int len) {
char *t = substr(s, 0, len);
char *l = t, *r = t + len - 1;
char tmp;
while (l < r) {
tmp = *l, *l = *r, *r = tmp;
l++, r--;
}
return t;
}
char* find_lcs(const char *a, int na, const char *b, int nb) {
if (na > nb) {
const char *c; int t;
c = a, a = b, b = c;
t = na, na = nb, nb = t;
}

if (na == 0)
return alloc_str(1);

if (na == 1) {
for (int i = 0; i < nb; i++) {
if (a[0] == b[i])
return substr(a, 0, 1);
}
return alloc_str(1);
}

static uint16 t1[MAXN];
static uint16 t2[MAXN];
int len = lcs_len(a, na, b, nb, t1);
if (len == 0)
return alloc_str(1);
int half_len = na / 2;
char *la = substr(a, 0, half_len);
char *ra = substr(a, half_len, na - half_len);
char *tb = reverse(b, nb);
char *ta = reverse(ra, na - half_len);
lcs_len(la, half_len, b, nb, t1);
lcs_len(ta, na - half_len, tb, nb, t2);

int split = -1;
for (int i = 0; i <= nb; i++) {
if (t1[i] + t2[nb-i] == len) {
split = i;
break;
}
}

char *lb = substr(b, 0, split);
char *rb = substr(b, split, nb - split);
char *sl = find_lcs(la, half_len, lb, split);
char *sr = find_lcs(ra, na - half_len, rb, nb - split);
char *ret = cat(sl, sr);
free(la), free(ra), free(ta);
free(lb), free(rb), free(tb);
free(sl), free(sr);
return ret;
}
int main() {
for (int i = 0; i < CHARSET; i++)
c2i[i2c[i]] = i;
cudaMalloc(&cuB, sizeof(char)*MAXN);
cudaMalloc(&cuP, sizeof(int)*MAXN*4);
cudaMalloc(&cuDP, sizeof(uint16)*MAXN*2);
static uint16 dpf[MAXN];
while (scanf("%s %s", A, B) == 2) {
int na = strlen(A);
int nb = strlen(B);
int len = lcs_len(A, na, B, nb, dpf);
char *str = find_lcs(A, na, B, nb);
printf("%d\n", len);
printf("%s\n", str);
free(str);
}
cudaFree(cuB);
cudaFree(cuP);
cudaFree(cuDP);
return 0;
}