UVa 12415 - Digit Patterns

contents

  1. 1. Problem
  2. 2. Sample Input
  3. 3. Sample Output
  4. 4. Solution

Problem

讀入一個 regex,一個主字串 $s$,請問有不同的 $i$ 滿足 $s$ 的子字串 s[j...i] 匹配 regex。

正規表達式長度最多 500,主字串 $s$ 長度最多 $10^7$

Sample Input

1
2
3
4
6 1(2+3)*4
012345
2 00*(10+100)*
00100

Sample Output

1
2
5
1 2 4 5

Solution

隔了一年才解決它,這一題很明顯地在編譯器的範疇,要將一個 regex 轉換成 NFA 甚至 DFA。一開始設想轉換成 DFA,但長度 500 轉換成 DFA,用狀態壓縮的方式狀態增長非常大,不知道是不是指數成長,上傳就得到 Runtime Error。

將 regex 轉換成 NFA (nondeterministic finite automata) 並不難,但節點會很多且存在很多 $a \overset{\varepsilon}{\rightarrow} b$ 這種類型的邊。在解析 regular expression 時,用線性 $\mathcal{O}(n)$ 或者是 $\mathcal{O}(n^2)$ 都沒關係。因為在計算答案至少 $\mathcal{O}(10^7)$

得到最原生的 NFA 後,接下來要思考怎麼找到匹配的子字串 $s_{j, i}$,維護可轉移的狀態集合 $S$,若當前 $S$ 中存在 acceptable 狀態,則找到一個 $i$ 解。依序加入字符 $s_i$,維護 $S$ 的做法如下:

  • 將起始狀態丟入 $S$,計算閉包 (closure),也就是所有 $a \overset{\varepsilon}{\rightarrow} b$ 能連到的狀態都應該被丟入 $S$
  • 定義下一個狀態集合 $S'$,在 $S$ 中的每一個狀態 $q$,都嘗試藉由 $s_i$ 轉移到 $q'$,將所有 $q'$ 丟入 $S'$ 中。
  • $\text{closure}(S')$ 存在 acceptable 狀態,則 $i$ 是一組解。
  • $S = \text{closure}(S' + \mathit{q}_{start})$

上面的算法並沒有錯,但原生的 NFA 產生方式在計算閉包時很慢,過多的邊和重複狀態導致。因此需要重建邊,預處理每一個字符轉移,移除所有 $a \overset{\varepsilon}{\rightarrow} b$,這麼一來就能在時限內通過。時間複雜度 $\mathcal{O}(|s| \times (\text{state} + \text{edge}))$

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
#include <bits/stdc++.h>
using namespace std;
const int MAXSTATE = 32767;
const int epsilon = 0;
int toIndex(char c) {
return c - '0' + 1;
}
struct State {
vector<State*> trans[11];
int ac, label;
State() {
ac = label = 0;
for (int i = 0; i < 11; i++)
trans[i].clear();
}
} _mem[MAXSTATE];
int nodesize = 0;
State *newState() {
assert(nodesize < MAXSTATE);
State *p = &_mem[nodesize++];
*p = State();
return p;
}
struct NFA {
State *st, *ac;
NFA() {
st = ac = NULL;
}
NFA(char c) {
st = newState(), ac = newState();
ac->ac = 1;
st->trans[toIndex(c)].push_back(ac);
}
NFA(NFA L, char c) { // (A)*
st = ac = NULL;
if (c != '*')
return ;
st = newState(), ac = newState();
ac->ac = 1;
st->trans[epsilon].push_back(L.st);
L.st->trans[epsilon].push_back(L.ac);
L.ac->trans[epsilon].push_back(L.st);
L.ac->trans[epsilon].push_back(ac);
L.ac->ac = 0;
}
NFA(NFA L, NFA R, char c) {
if (R.st == NULL) {
st = L.st, ac = L.ac;
return;
}
if (c == '|') {
st = newState(), ac = newState();
ac->ac = 1;
st->trans[epsilon].push_back(L.st);
st->trans[epsilon].push_back(R.st);
L.ac->trans[epsilon].push_back(ac);
R.ac->trans[epsilon].push_back(ac);
L.ac->ac = R.ac->ac = 0;
} else if (c == '&') {
st = newState(), ac = newState();
ac->ac = 1;
st->trans[epsilon].push_back(L.st);
L.ac->trans[epsilon].push_back(R.st);
R.ac->trans[epsilon].push_back(ac);
L.ac->ac = R.ac->ac = 0;
}
}
};
NFA parser(string exp) {
if (exp.length() == 0)
return NFA();
if (exp.length() == 1)
return NFA(exp[0]);
int l = 0, pos = -1;
for (int i = 0; i < exp.length(); i++) {
if (exp[i] == '(') {
l++;
} else if (exp[i] == ')') {
l--;
} else if (exp[i] == '+') {
if (l == 0) {
pos = i;
break;
}
}
}
if (pos != -1) {
NFA L = parser(exp.substr(0, pos));
NFA R = parser(exp.substr(pos+1));
return NFA(L, R, '|');
}
int hasStar = 0;
string ls, rs;
if (exp[0] == '(') {
for (int i = 0; i < exp.length(); i++) {
if (exp[i] == '(') {
l++;
} else if (exp[i] == ')') {
l--;
if (l == 0) { // (...)...
if (i+1 < exp.length() && exp[i+1] == '*') { // (...)*...
hasStar = 1;
ls = exp.substr(1, i-1), rs = exp.substr(i+2);
} else { // (...)...
ls = exp.substr(1, i-1), rs = exp.substr(i+1);
}
break;
}
}
}
} else { // ...(...) or ...*... or ......
for (int i = 0; i < exp.length(); i++) {
if (exp[i] == '(') {
l++;
} else if (exp[i] == ')') {
l--;
}
if (l == 0) {
if (i+1 < exp.length() && exp[i+1] == '*') {
hasStar = 1;
ls = exp.substr(0, i+1), rs = exp.substr(i+2);
} else {
ls = exp.substr(0, i+1), rs = exp.substr(i+1);
}
break;
}
}
}
for (int i = 0; rs.length() > 0; ) {
while (i < rs.length() && rs[i] == '*')
i++;
rs = rs.substr(i);
break;
}
NFA L = parser(ls);
NFA R = parser(rs);
if (hasStar)
L = NFA(L, '*');
return NFA(L, R, '&');
}
State *gmap[MAXSTATE];
int relabel(NFA A) {
int size = 0;
State *u, *v;
queue<State*> Q;
Q.push(A.st), A.st->label = ++size;
while (!Q.empty()) {
u = Q.front(), Q.pop();
gmap[u->label] = u;
for (int it = 0; it < 11; it++) {
for (int i = 0; i < u->trans[it].size(); i++) {
v = u->trans[it][i];
if (v->label == 0) {
v->label = ++size;
Q.push(v);
}
}
}
}
return size;
}
char s[16777216];
int used[MAXSTATE], ACable[MAXSTATE];
int closure(vector<int> &A, int x, int cases) {
queue<int> Q;
State *u;
int accept = 0;
if (used[x] != cases)
A.push_back(x), used[x] = cases;
Q.push(x);
while (!Q.empty()) {
x = Q.front(), Q.pop();
u = gmap[x];
accept |= u->ac;
if (u->trans[epsilon].size() == 0)
continue;
for (auto &y : u->trans[epsilon]) {
if (used[y->label] != cases) {
Q.push(y->label), used[y->label] = cases;
A.push_back(y->label);
}
}
}
return accept;
}
void rebuild(int n, int ctype) {
memset(ACable, 0, sizeof(ACable));
memset(used, 0, sizeof(used));
int cases = 0;
for (int i = 1; i <= n; i++) {
cases++;
vector<int> cc;
closure(cc, i, cases);
for (int j = 1; j <= ctype; j++) {
set<int> S;
for (int k = 0; k < cc.size(); k++) {
State *u = gmap[cc[k]];
for (auto &p : u->trans[j])
S.insert(p->label);
ACable[i] |= u->ac;
}
State *u = gmap[i];
u->trans[j].clear();
for (auto &x : S)
u->trans[j].push_back(gmap[x]);
}
}
}
int main() {
int n, m;
char regex[512];
while (scanf("%d %s", &n, regex) == 2) {
nodesize = 0; // global
NFA nfa = parser(regex);
m = relabel(nfa);
rebuild(m, n);
scanf("%s", s);
memset(used, 0, sizeof(used));
int cases = 0, flag = 0;
vector<int> A;
cases++;
A.push_back(1);
for (int i = 0; s[i]; i++) {
vector<int> next;
cases++;
int accept = 0;
for (int j = 0; j < A.size(); j++) {
int x = A[j];
State *u = gmap[x];
if (u->trans[toIndex(s[i])].size() == 0)
continue;
for (auto &y : u->trans[toIndex(s[i])]) {
if (used[y->label] != cases) {
used[y->label] = cases, next.push_back(y->label);
accept |= ACable[y->label];
}
}
}
if (used[1] != cases)
used[1] = cases, next.push_back(1);
A = next;
if (accept) {
if (flag) putchar(' ');
flag = 1;
printf("%d", i+1);
}
}
puts("");
}
return 0;
}
/*
6 1(2+3)*4
012345
2 00*(10+100)*
00100
*/