UVa 13009 - Fence the vegetables fail

Problem

給一個平行兩軸的簡單多邊形花圃以及一堆蔬菜的座標,請問有多少蔬菜在柵欄的外部,把那些在外部的蔬菜編號加總後輸出。

Sample Input

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
4 8
1 2
1 0
5 3
3 4
0 1
6 1
6 4
4 4
4 3
2 3
2 5
0 5
6 12
6 5
1 9
3 6
3 4
2 0
4 4
5 8
5 3
2 3
2 5
4 5
4 7
0 7
0 1
7 1
7 10
0 10
0 8
1 4
1 1
2 0
2 2
0 2
0 0

Sample Output

1
2
3
6
15
0

Solution

判斷一個點是否在簡單多邊形內部可以使用射線法,然而有很多點詢問,單筆詢問複雜度 $\mathcal{O}(N)$,這樣很明顯地會 TLE。

為了加速判斷,採用掃描線算法,依序放入平行 x 軸的線段,掃描過程中離線詢問某點往 y 軸負方向的射線交點個數為何,維護求交點個數可以利用 binary indexed tree 維護前綴和,前綴和相當於射線交點個數。就能在 $\mathcal{O}(N \log N)$ 完成所有詢問。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <algorithm>
#include <vector>
#include <map>
#include <set>
#include <math.h>
using namespace std;
struct Pt {
int x, y, v;
Pt(int a = 0, int b = 0):
x(a), y(b) {}
Pt operator-(const Pt &a) const {
return Pt(x - a.x, y - a.y);
}
Pt operator+(const Pt &a) const {
return Pt(x + a.x, y + a.y);
}
Pt operator*(const double a) const {
return Pt(x * a, y * a);
}
bool operator<(const Pt &a) const {
if (x != a.x)
return x < a.x;
if (y != a.y)
return y < a.y;
return false;
}
};
struct Seg {
int xl, xr, y;
Seg(int a = 0, int b = 0, int c = 0):
xl(a), xr(b), y(c) {}
bool operator<(const Seg &u) const {
return xl < u.xl;
}
};
Pt P[131072], D[131072];
int BIT[262144];
void modify(int x, int val, int L) {
while (x <= L)
BIT[x] += val, x += x&(-x);
}
int query(int x) {
int sum = 0;
while (x)
sum += BIT[x], x -= x&(-x);
return sum;
}
void solve(int N, int M) {
map<int, int> RX, RY;
for (int i = 0; i < N; i++)
RX[P[i].x] = RY[P[i].y] = 0;
for (int i = 0; i < M; i++)
RX[D[i].x] = RY[D[i].y] = 0;
int label_x = 0, label_y = 0;
for (auto &x : RX)
x.second = ++label_x;
for (auto &y : RY)
y.second = ++label_y;
memset(BIT, 0, sizeof(BIT));
for (int i = 0; i < N; i++)
P[i].x = RX[P[i].x], P[i].y = RY[P[i].y];
for (int i = 0; i < M; i++)
D[i].x = RX[D[i].x], D[i].y = RY[D[i].y];
vector<Seg> segs;
for (int i = 0; i < M; i++) {
if (D[i].y == D[(i+1)%M].y) {
int xl = min(D[i].x, D[(i+1)%M].x);
int xr = max(D[i].x, D[(i+1)%M].x);
segs.push_back(Seg(xl, xr, D[i].y));
}
}
long long outer_val = 0;
sort(P, P+N);
sort(segs.begin(), segs.end());
int Pidx = 0;
set<std::pair<int, int>> PQ;
for (int i = 0, line_x = 1; line_x <= label_x; line_x++) {
while (Pidx < N && P[Pidx].x <= line_x) {
int intersect = query(P[Pidx].y);
if (intersect%2 == 0)
outer_val += P[Pidx].v;
Pidx++;
}
while (!PQ.empty() && PQ.begin()->first <= line_x)
modify(PQ.begin()->second, -1, label_y), PQ.erase(PQ.begin());
while (i < segs.size() && segs[i].xl == line_x) {
modify(segs[i].y, 1, label_y);
PQ.insert(make_pair(segs[i].xr, segs[i].y));
i++;
}
}
printf("%lld\n", outer_val);
}
int main() {
int N, M;
while (scanf("%d %d", &N, &M) == 2) {
for (int i = 0; i < N; i++) {
scanf("%d %d", &P[i].x, &P[i].y);
P[i].v = i+1;
}
for (int i = 0; i < M; i++)
scanf("%d %d", &D[i].x, &D[i].y);
solve(N, M);
}
return 0;
}
Read More +

UVa 13046 - Bus Collisions

Problem

給予兩個公車行駛路線,每台公車的速度都相同,分別從起始位置出發,請問第一個碰撞位置為何?

Sample Input

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
2
4
0 4
2 4
2 5
0 5
4
1 7
1 2
3 2
3 7
4
5 0
6 0
6 1
5 1
4
7 0
8 0
8 1
7 1

Sample Output

1
2
Case 1: 1 5
Case 2: No Collision

Solution

抓路線長度的最小公倍數,一定能在這段時間內找到碰撞位置,否則不存在碰撞地點。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#include <bits/stdc++.h>
using namespace std;
struct Pt {
int x, y;
Pt(int x = 0, int y = 0): x(x), y(y) {
}
int dist(Pt u) {
return abs(x - u.x) + abs(y - u.y);
}
bool operator==(const Pt &u) const {
return x == u.x && y == u.y;
}
Pt operator-(const Pt &u) const {
return Pt(x - u.x, y - u.y);
}
Pt operator+(const Pt &u) const {
return Pt(x + u.x, y + u.y);
}
};
int main() {
int testcase, cases = 0;
int N[2], x, y;
Pt R[2][64];
scanf("%d", &testcase);
while (testcase--) {
int Len[2] = {};
for (int i = 0; i < 2; i++) {
scanf("%d", &N[i]);
for (int j = 0; j < N[i]; j++) {
scanf("%d %d", &x, &y);
R[i][j] = Pt(x, y);
}
for (int j = 0; j < N[i]; j++)
Len[i] += R[i][j].dist(R[i][(j+1)%N[i]]);
}
printf("Case %d: ", ++cases);
int simT = Len[0]/__gcd(Len[0], Len[1])*Len[1];
int posIdx[2], has = 0;
Pt posXY[2];
for (int i = 0; i < 2; i++)
posIdx[i] = 0, posXY[i] = R[i][0];
for (int it = 0; it < simT; it++) {
for (int i = 0; i < 2; i++) {
// move
if (posXY[i] == R[i][posIdx[i]])
posIdx[i] = (posIdx[i]+1)%N[i];
Pt dv = R[i][posIdx[i]] - posXY[i];
int lenDv = abs(dv.x) + abs(dv.y);
dv.x /= lenDv, dv.y /= lenDv;
posXY[i] = posXY[i] + dv;
}
if (posXY[0] == posXY[1]) {
printf("%d %d\n", posXY[0].x, posXY[0].y);
has = 1;
break;
}
}
if (!has)
puts("No Collision");
}
return 0;
}
Read More +

UVa 1718 - Tile Cutting

Problem

請問在網格中,若頂點設置在網格邊上,構成平行四邊形的面積介於 $[l, r]$,最多的方法數和面積分別為多少。

Sample Input

1
2
3
2
4 4
2 6

Sample Output

1
2
48
620

Solution

由於是平行四邊形,假設矩形被劃分的兩個對稱三角形的底高 $a, b, c, d$,且 $a, b, c, d > 0$。

得到平行四邊形面積為

$$\begin{align*} \text{Area} &= (a + c)(b + d) - 2 \times 0.5 \times a b - 2 \times 0.5 \times c d \\ &= ad + bc \end{align*}$$

最後找到構成方法數為 ways of area = #(a, b, c, d) sat. ad + bc = Area。分別統計兩數相乘為 $x$ 的方法數 $W_x$,可以構成多項式 $W_1 x + W_2 x^2 + \cdots W_n x^n$,多項式相乘後,得到的係數就是所有的方法數。套用 FFT 的卷積計算即可。

FFT

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <vector>
#include <assert.h>
#include <algorithm>
#include <complex>
using namespace std;
struct Complex {
double x, y;
Complex(double x = 0, double y = 0): x(x), y(y) {}
Complex operator-(const Complex &v) const {
return Complex(x-v.x,y-v.y);
}
Complex operator+(const Complex &v) const {
return Complex(x+v.x,y+v.y);
}
Complex operator*(const Complex &v) const {
return Complex(x*v.x-y*v.y,x*v.y+y*v.x);
}
Complex operator/(const int &v) const {
return Complex(x/v,y/v);
}
double real() {
return x;
}
};
template<typename T> class TOOL_FFT {
public:
typedef unsigned int UINT32;
#define MAXN (1048576<<1)
Complex p[2][MAXN];
int pre_n;
T PI;
TOOL_FFT() {
pre_n = 0;
PI = acos(-1);
}
int NumberOfBitsNeeded(int PowerOfTwo) {
for (int i = 0; ; ++i) {
if (PowerOfTwo & (1 << i)) {
return i;
}
}
}
inline UINT32 FastReverseBits(UINT32 a, int NumBits) {
a = ( ( a & 0x55555555U ) << 1 ) | ( ( a & 0xAAAAAAAAU ) >> 1 ) ;
a = ( ( a & 0x33333333U ) << 2 ) | ( ( a & 0xCCCCCCCCU ) >> 2 ) ;
a = ( ( a & 0x0F0F0F0FU ) << 4 ) | ( ( a & 0xF0F0F0F0U ) >> 4 ) ;
a = ( ( a & 0x00FF00FFU ) << 8 ) | ( ( a & 0xFF00FF00U ) >> 8 ) ;
a = ( ( a & 0x0000FFFFU ) << 16 ) | ( ( a & 0xFFFF0000U ) >> 16 ) ;
return a >> (32 - NumBits);
}
void FFT(bool InverseTransform, vector<Complex>& In, vector<Complex>& Out) {
// simultaneous data copy and bit-reversal ordering into outputs
int NumSamples = In.size();
int NumBits = NumberOfBitsNeeded(NumSamples);
for (int i = 0; i < NumSamples; ++i) {
Out[FastReverseBits(i, NumBits)] = In[i];
}
// the FFT process
for (register int i = 1; i <= NumBits; i++) {
int BlockSize = 1<<i, BlockEnd = BlockSize>>1, BlockCnt = NumSamples/BlockSize;
for (register int j = 0; j < NumSamples; j += BlockSize) {
Complex *t = p[InverseTransform];
for (register int k = 0; k < BlockEnd; k++, t += BlockCnt) {
Complex a = (*t) * Out[k+j+BlockEnd];
Out[k+j+BlockEnd] = Out[k+j] - a;
Out[k+j] = Out[k+j] + a;
}
}
}
// normalize if inverse transform
if (InverseTransform) {
for (int i = 0; i < NumSamples; ++i) {
Out[i] = Out[i] / NumSamples;
}
}
}
void prework(int n) {
if (pre_n == n)
return ;
pre_n = n;
p[0][0] = Complex(1, 0);
p[1][0] = Complex(1, 0);
for (register int i = 1; i < n; i++) {
p[0][i] = Complex(cos(2*i*PI / n ) , sin(2*i*PI / n ));
p[1][i] = Complex(cos(2*i*PI / n ) , -sin(2*i*PI / n ));
}
}
vector<T> convolution(Complex *a, Complex *b, int n) {
prework(n);
vector<Complex> s(a, a+n), d1(n), d2(n), y(n);
vector<T> ret(n);
FFT(false, s, d1);
s[0] = b[0];
for (int i = 1, j = n-1; i < n; ++i, --j)
s[i] = b[j];
FFT(false, s, d2);
for (int i = 0; i < n; ++i) {
y[i] = d1[i] * d2[i];
}
FFT(true, y, s);
for (int i = 0; i < n; ++i) {
ret[i] = s[i].real();
}
return ret;
}
};
TOOL_FFT<double> tool;
Complex a[MAXN], b[MAXN];
vector<double> c;
long long ret[1048576];
long long pr[1048576] = {};
int main() {
int m;
const int N = 500000;
for (m = 1; m < (N<<1); m <<= 1);
/*
(a + c)(b + d) - 2 * 0.5 * (a b) - 2 * 0.5 * (c * d)
= ad + bc = area
a, b, c, d > 0,
ways of area = #(a, b, c, d) sat. ad + bc = area
*/
for (int i = 1; i <= N; i++) {
for (int j = i; j <= N; j += i)
pr[j]++;
}
memset(a, 0, sizeof(a[0]) * m);
memset(b, 0, sizeof(b[0]) * m);
for (int i = 1; i <= N; i++) {
a[i] = Complex(pr[i], 0);
b[m-i] = Complex(pr[i], 0);
}
c = tool.convolution(a, b, m);
for (int i = 1; i <= N; i++)
ret[i] = (long long) (c[i] + 0.5);
int testcase, L, R;
while (scanf("%d", &testcase) == 1) {
while (testcase--) {
scanf("%d %d", &L, &R);
assert(L <= R);
assert(L >= 1 && R <= 500000);
int mxA = L;
for (int i = L; i <= R; i++) {
if (ret[i] > ret[mxA])
mxA = i;
}
printf("%d %lld\n", mxA, ret[mxA]);
}
}
return 0;
}

NTT

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <vector>
#include <assert.h>
#include <algorithm>
#include <complex>
using namespace std;
typedef unsigned int UINT32;
typedef long long INT64;
class TOOL_NTT {
public:
#define MAXN (1048576<<1)
const INT64 P = (479 << 21) + 1LL; // prime m = kn+1
const INT64 G = 3;
INT64 wn[22];
INT64 s[MAXN], d1[MAXN], d2[MAXN], y[MAXN];
TOOL_NTT() {
for (int i = 0; i < 22; i++)
wn[i] = mod_pow(G, (P-1) / (1<<i), P);
}
INT64 mod_mul(INT64 a, INT64 b, INT64 mod) {
return (a*b - (long long)(a/(long double)mod*b+1e-3)*mod+mod)%mod;
// INT64 ret = 0;
// for (a = a >= mod ? a%mod : a, b = b >= mod ? b%mod : b; b != 0; b>>=1, a <<= 1, a = a >= mod ? a - mod : a) {
// if (b&1) {
// ret += a;
// if (ret >= mod)
// ret -= mod;
// }
// }
// return ret;
}
INT64 mod_pow(INT64 n, INT64 e, INT64 m) {
INT64 x = 1;
for (n = n >= m ? n%m : n; e; e >>= 1) {
if (e&1)
x = mod_mul(x, n, m);
n = mod_mul(n, n, m);
}
return x;
}
int NumberOfBitsNeeded(int PowerOfTwo) {
for (int i = 0;; ++i) {
if (PowerOfTwo & (1 << i)) {
return i;
}
}
}
inline UINT32 FastReverseBits(UINT32 a, int NumBits) {
a = ( ( a & 0x55555555U ) << 1 ) | ( ( a & 0xAAAAAAAAU ) >> 1 ) ;
a = ( ( a & 0x33333333U ) << 2 ) | ( ( a & 0xCCCCCCCCU ) >> 2 ) ;
a = ( ( a & 0x0F0F0F0FU ) << 4 ) | ( ( a & 0xF0F0F0F0U ) >> 4 ) ;
a = ( ( a & 0x00FF00FFU ) << 8 ) | ( ( a & 0xFF00FF00U ) >> 8 ) ;
a = ( ( a & 0x0000FFFFU ) << 16 ) | ( ( a & 0xFFFF0000U ) >> 16 ) ;
return a >> (32 - NumBits);
}
void NTT(int on, INT64 *In, INT64 *Out, int n) {
int NumBits = NumberOfBitsNeeded(n);
for (int i = 0; i < n; ++i)
Out[FastReverseBits(i, NumBits)] = In[i];
for(int h = 2, id = 1; h <= n; h <<= 1, id++) {
for(int j = 0; j < n; j += h) {
INT64 w = 1, u, t;
int block = h/2, blockEnd = j + h/2;
for(int k = j; k < blockEnd; k++) {
u = Out[k], t = mod_mul(w, Out[k+block], P);
Out[k] = u + t;
Out[k + block] = u - t + P;
if (Out[k] >= P) Out[k] -= P;
if (Out[k+block] >= P) Out[k+block] -= P;
w = mod_mul(w, wn[id], P);
}
}
}
if (on == 1) {
for (int i = 1; i < n/2; i++)
swap(Out[i], Out[n-i]);
INT64 invn = mod_pow(n, P-2, P);
for (int i = 0; i < n; i++)
Out[i] = mod_mul(Out[i], invn, P);
}
}
void convolution(INT64 *a, INT64 *b, int n, INT64 *c) {
NTT(0, a, d1, n);
s[0] = b[0];
for (int i = 1; i < n; ++i)
s[i] = b[n-i];
NTT(0, s, d2, n);
for (int i = 0; i < n; i++)
s[i] = mod_mul(d1[i], d2[i], P);
NTT(1, s, c, n);
}
} tool;
INT64 a[MAXN], b[MAXN], c[MAXN];
long long ret[1048576];
long long pr[1048576] = {};
int main() {
int m;
const int N = 500000;
for (m = 1; m < (N<<1); m <<= 1);
/*
(a + c)(b + d) - 2 * 0.5 * (a b) - 2 * 0.5 * (c * d)
= ad + bc = area
a, b, c, d > 0,
ways of area = #(a, b, c, d) sat. ad + bc = area
*/
for (int i = 1; i <= N; i++) {
for (int j = i; j <= N; j += i)
pr[j]++;
}
memset(a, 0, sizeof(a[0]) * m);
memset(b, 0, sizeof(b[0]) * m);
for (int i = 1; i <= N; i++) {
a[i] = pr[i];
b[m-i] = pr[i];
}
tool.convolution(a, b, m, c);
for (int i = 1; i <= N; i++)
ret[i] = c[i];
int testcase, L, R;
while (scanf("%d", &testcase) == 1) {
while (testcase--) {
scanf("%d %d", &L, &R);
assert(L <= R);
assert(L >= 1 && R <= 500000);
int mxA = L;
for (int i = L; i <= R; i++) {
if (ret[i] > ret[mxA])
mxA = i;
}
printf("%d %lld\n", mxA, ret[mxA]);
}
}
return 0;
}
Read More +

UVa 801 - Flight Planning

Problem

給予飛機在平流層飛行資訊,在每一種高度下維持飛行的耗油量成線性關係,以及每一段飛行旅程上的平流層的風速為何,藉由線性內插得到在某高度下的飛行速度,忽略在轉換高度時的耗油量,問從起點到終點的最少耗油量為何 (無條件進位)。

Sample Input

1
2
3
4
5
6
7
8
2
2
1500 -50 50
1000 0 0
3
1000 50 0
2000 0 20
1800 50 100

Sample Output

1
2
Flight 1: 35 30 13986
Flight 2: 20 30 30 23502

Solution

單源最短路徑,只是每一點要使用 $20$ 個狀態維護抵達那一層時在哪一種高度下飛行。必須窮舉轉移到下一層的所有高度,因為有可能飛行慢而增加耗油時間。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <algorithm>
#include <vector>
using namespace std;
struct Leg {
double mile, f20, f40;
} legs[1024];
const double VCRUISE = 400;
const double AOPT = 30000;
const double GPHOPT = 2000;
const double GPHEXTRA = 10;
const double CLIMBCOST = 50;
const int MAXN = 1024; // maximum #legs
const int MAXH = 45; // between 20,000 and 40,000 feet
const double DINF = 1e+20;
double inter(Leg a, double h) {
return a.f20 + (a.f40 - a.f20) * (h - 20000) / (40000 - 20000);
}
void solve(int n) {
double dp[MAXN][MAXH];
int from[MAXN][MAXH];
for (int i = 0; i <= n; i++) {
for (int j = 0; j <= 40; j++)
dp[i][j] = DINF;
}
dp[0][0] = 0; // leg 0: altitude 0
for (int i = 0; i < n; i++) {
for (int j = 0; j <= 40; j++) {
if (dp[i][j] >= DINF)
continue;
double alti_a = j * 1000.f;
for (int k = 20; k <= 40; k++) {
double cc = 0, alti_b;
alti_b = k * 1000.f;
if (alti_b > alti_a)
cc += CLIMBCOST * (alti_b - alti_a) / 1000.f;
double v = VCRUISE + inter(legs[i+1], alti_b);
if (v <= 0)
continue;
double c = fabs(alti_b - AOPT) * GPHEXTRA / 1000.f + GPHOPT;
cc += c * legs[i+1].mile / v;
if (dp[i][j] + cc < dp[i+1][k])
dp[i+1][k] = dp[i][j] + cc, from[i+1][k] = j;
}
}
}
double ret = DINF;
int last = -1;
vector<int> solution;
for (int i = 20; i <= 40; i++) {
if (dp[n][i] < ret)
ret = dp[n][i], last = i;
}
for (int i = n; i > 0; i--) {
solution.push_back(last);
last = from[i][last];
}
for (int i = n-1; i >= 0; i--)
printf(" %d", solution[i]);
// tricky
printf(" %.0lf\n", ceil(ret));
}
int main() {
int testcase, cases = 0;
int n;
scanf("%d", &testcase);
while (testcase--) {
scanf("%d", &n);
for (int i = 1; i <= n; i++) {
scanf("%lf %lf %lf",
&legs[i].mile, &legs[i].f20, &legs[i].f40);
}
printf("Flight %d:", ++cases);
solve(n);
}
return 0;
}
Read More +

批改娘 10082. Fast Find Prime Number (OpenMP)

題目描述

找出 $[ L, \; R ]$ 區間內有多少個質數。

輸入格式

有多組測資,每組測資一行兩個整數 $L$, $R$,表示左右區間。

  • $1 \le L \le R \le 2^{31}-1$

輸出格式

對於每一組詢問輸出一行一個整數。

輸入範例

1
2
1000000 2000000
1 1000000000

輸出範例

1
2
70435
50847534

提示

Sieve of Eratosthenes

Solution

這一題沒有出得很好,後來想到可以作弊打表,目前仍沒有同學挑戰成功,略感傷心。

線性篩法

  • 線性篩法 Accepted (1724 ms, 2816 KB)

由於只要統計 $2^{31}$ 以內的質數個數,只需要把 50000 以內的所有質數找到,利用這不到 5000 的質數進行區塊篩法即可。

由於 $[1, 2^{31}]$ 也是非常龐大的區間,可以利用 bitmask 的技術,將記憶體壓縮 32 倍,但這樣子平行度還是不高,因此若詢問區間 $[l, r]$ 有多少個質數,將區間切割成數個足夠大的 chunk,這裡決定 chunk 大約在 100000 到 1000000 都有不錯的效果。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <assert.h>
#include <omp.h>
#define MAXL (50000>>5)+1
#define BLOCK (1<<17)
#define MAXN (BLOCK>>5)+1
#define GET1(x) (mark1[(x)>>5]>>((x)&31)&1)
#define SET1(x) (mark1[(x)>>5] |= 1<<((x)&31))
#define GET2(x) (mark2[(x)>>5]>>((x)&31)&1)
#define SET2(x) (mark2[(x)>>5] |= 1<<((x)&31))
int mark1[MAXL];
int P[5500], Pt = 0;
void sieve() {
register int i, j, k;
SET1(1);
int n = 50000;
for (i = 2; i <= n; i++) {
if (!GET1(i)) {
for (k = n/i, j = i*k; k >= i; k--, j -= i)
SET1(j);
P[Pt++] = i;
}
}
}
int local_sieve(int a, int b) {
int sqr = (int) sqrt(b), gap = b - a;
int mark2[MAXN] = {};
for (int i = 0; i < Pt && P[i] <= sqr; i++) {
unsigned p = P[i], base = a / p * p;
while (base < a) base += p;
if (base == p) base += p;
for (unsigned j = base; j <= b; j += p)
SET2(j - a);
}
if (a == 1) SET2(0);
int ret = gap + 1;
for (int i = gap>>5; i >= 0; i--)
ret -= __builtin_popcount(mark2[i]);
return ret;
}
long long min(long long x, long long y) {
return x < y ? x : y;
}
int main() {
sieve();
int l, r;
while (scanf("%d %d", &l, &r) == 2) {
int ret = 0;
#pragma omp parallel for schedule(dynamic) reduction(+: ret)
for (long long i = l; i <= r; i += BLOCK)
ret += local_sieve(i, min(i+BLOCK-1, r));
printf("%d\n", ret);
}
return 0;
}

作弊打表

由於問題設計只針對個數,還可以偷偷本地建表完成每一個區間的值,針對非完全包含的區間單獨計算即可。打表部分可以藉由上述的程序做到。

  • 作弊打表 Accepted (622 ms, 384 KB)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <assert.h>
#include <omp.h>
#define MAXL (50000>>5)+1
#define BLOCK (1<<20)
#define MAXN (BLOCK>>5)+1
#define GET1(x) (mark1[(x)>>5]>>((x)&31)&1)
#define SET1(x) (mark1[(x)>>5] |= 1<<((x)&31))
#define GET2(x) (mark2[(x)>>5]>>((x)&31)&1)
#define SET2(x) (mark2[(x)>>5] |= 1<<((x)&31))
int preprocess[] = {82025,73586,70938,69398,68248,67307, <...>};
int mark1[MAXL];
int P[5500], Pt = 0;
void sieve() {
register int i, j, k;
SET1(1);
int n = 50000;
for (i = 2; i <= n; i++) {
if (!GET1(i)) {
for (k = n/i, j = i*k; k >= i; k--, j -= i)
SET1(j);
P[Pt++] = i;
}
}
}
int local_sieve(int a, int b) {
int sqr = (int) sqrt(b), gap = b - a;
int mark2[MAXN] = {};
for (int i = 0; i < Pt && P[i] <= sqr; i++) {
unsigned p = P[i], base = a / p * p;
while (base < a) base += p;
if (base == p) base += p;
for (unsigned j = base; j <= b; j += p)
SET2(j - a);
}
if (a == 1) SET2(0);
int ret = gap + 1;
for (int i = gap>>5; i >= 0; i--)
ret -= __builtin_popcount(mark2[i]);
return ret;
}
long long min(long long x, long long y) {
return x < y ? x : y;
}
long long max(long long x, long long y) {
return x > y ? x : y;
}
int main() {
sieve();
int prepro_n = sizeof(preprocess) / sizeof(int);
int l, r;
while (scanf("%d %d", &l, &r) == 2) {
int ret = 0;
for (int i = 0; i < prepro_n; i++) {
long long tl = (long long) i * BLOCK + 1;
long long tr = (long long) (i+1) * BLOCK;
if (l <= tl && tr <= r) {
ret += preprocess[i];
} else {
if (tl <= l && l <= tr) {
ret += local_sieve(l, min(r, tr));
} else if (tl <= r && r <= tr) {
ret += local_sieve(max(tl, l), r);
}
}
}
printf("%d\n", ret);
}
return 0;
}
Read More +

批改娘 10081. Fast Game of Life

題目描述

生命遊戲中,對於任意細胞,規則如下:
每個細胞有兩種狀態-存活或死亡,每個細胞與以自身為中心的周圍八格細胞產生互動。

  • 當前細胞為存活狀態時,當周圍低於 2 個 (不包含 2 個) 存活細胞時,該細胞變成死亡狀態。
  • 當前細胞為存活狀態時,當周圍有 2 個或 3 個存活細胞時, 該細胞保持原樣。
  • 當前細胞為存活狀態時,當周圍有 3 個以上的存活細胞時,該細胞變成死亡狀態。
  • 當前細胞為死亡狀態時,當周圍有 3 個存活細胞時,該細胞變成存活狀態。

可以把最初的細胞結構定義為種子,當所有在種子中的細胞同時被以上規則處理後,可以得到第一代細胞圖。按規則繼續處理當前的細胞圖,可以得到下一代的細胞圖,周而復始。

輸入格式

輸入第一行有兩個整數 $N$, $M$,表示盤面大小為 $N \times N$,模擬週期次數 $M$。接下來會有 $N$ 行,每一行上會有 $N$ 個字符,以 0 表示 $(i, j)$ 格子上的細胞屬於死亡狀態,反之 1 為存活狀態。

  • $1 \le N \le 2000$
  • $0 \le M \le 1000$

輸出格式

對於每一組測資輸出 $N$ 行,每一行上有 $N$ 個字元表示模擬 $M$ 次的最終盤面結果。

範例輸入 1

1
2
3
4
5
6
5 1
10001
00100
01110
00100
01010

範例輸出 1

1
2
3
4
5
00000
00100
01010
00000
00100

範例輸入 2

1
2
3
4
5
6
5 3
10001
00100
01110
00100
01010

範例輸出 2

1
2
3
4
5
00000
00000
01110
00000
00000

Solution

在還沒有做任何平行前,生命遊戲的模擬非常簡單,只需要統計鄰近八格的狀態即可,然而由於是常數範圍,直接打八個加法比起迴圈去累計八格的結果快上非常多,別忽視 for loop 在做 branch 的 cycle 數量。

  • 樸素平行 Accepted (378 ms, 24 MB)
  • 循環展開 Accepted (289 ms, 39 MB)

平行採用 big chunk 的方式,考慮到 cache miss 的關係,最好是每一個處理器都把相鄰的列放置在各自的 L1-L2-L3 cache,防止使用時不在自己的 cache,而產生嚴重的 cache miss。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <assert.h>
#include <omp.h>
#define UINT unsigned long
#define MAXN 2048
char g[2][MAXN][MAXN];
int n, m;
int main() {
while (scanf("%d %d", &n, &m) == 2) {
while (getchar() != '\n');
// read input
for (int i = 1; i <= n; i++) {
fgets(g[0][i]+1, MAXN, stdin);
for (int j = 1; j <= n; j++)
g[0][i][j] -= '0';
g[0][i][n+1] = 0;
}
// game of life
int flag = 0;
int chunk = 64;
for (int it = 0; it < m; it++) {
#pragma omp parallel for schedule(static, chunk)
for (int i = 1; i <= n; i++) {
for (int j = 1; j <= n; j++) {
int adj = g[flag][i-1][j-1] + g[flag][i-1][j] + g[flag][i-1][j+1] +
g[flag][i ][j-1] + g[flag][i ][j+1] +
g[flag][i+1][j-1] + g[flag][i+1][j] + g[flag][i+1][j+1];
g[!flag][i][j] = (g[flag][i][j] == 0 && adj == 3) ||
(g[flag][i][j] == 1 && (adj == 2 || adj == 3));
}
}
flag = !flag;
}
// print result
for (int i = 1; i <= n; i++) {
for (int j = 1; j <= n; j++)
g[flag][i][j] += '0';
g[flag][i][n+1] = '\0';
puts(g[flag][i]+1);
}
}
return 0;
}

循環展開

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <assert.h>
#include <omp.h>
#define UINT unsigned long
#define MAXN 2048
char g[2][MAXN][MAXN];
int n, m;
int main() {
while (scanf("%d %d", &n, &m) == 2) {
while (getchar() != '\n');
// read input
for (int i = 1; i <= n; i++) {
fgets(g[0][i]+1, MAXN, stdin);
for (int j = 1; j <= n; j++)
g[0][i][j] -= '0';
g[0][i][n+1] = 0;
}
// game of life
#ifdef _OPENMP
int chunk = n / omp_get_max_threads();
#endif
for (int it = 0; it < m; it++) {
#pragma omp parallel for schedule(static, chunk)
for (int i = 1; i <= n; i++) {
int adj, j = 1;
#define UNLOOP { \
adj = g[0][i-1][j-1] + g[0][i-1][j] + g[0][i-1][j+1] + \
g[0][i ][j-1] + g[0][i ][j+1] + \
g[0][i+1][j-1] + g[0][i+1][j] + g[0][i+1][j+1]; \
g[1][i][j] = (g[0][i][j] == 0 && adj == 3) || \
(g[0][i][j] == 1 && (adj == 2 || adj == 3)); j++; \
}
#define UNLOOP4 {UNLOOP UNLOOP UNLOOP UNLOOP}
#define UNLOOP8 {UNLOOP4 UNLOOP4}
for (; j + 8 <= n; )
UNLOOP8;
for (; j <= n; )
UNLOOP;
}
#pragma omp parallel for schedule(static, chunk)
for (int i = 1; i <= n; i++)
memcpy(g[0][i], g[1][i], sizeof(g[0][0][0]) * (n+1));
}
// print result
for (int i = 1; i <= n; i++) {
for (int j = 1; j <= n; j++)
g[0][i][j] += '0';
g[0][i][n+1] = '\0';
puts(g[0][i]+1);
}
}
return 0;
}
Read More +

批改娘 10026. Fast N-Queen

題目描述

相信 n-Queen 問題對每個研究 backtracking 的人來講都不陌生,這個問題是要在一個 $n \times n$ 大小的棋盤上擺n個皇后,讓她們不會互相攻擊到。為了讓這個問題更難一點,我們設計了一些障礙物在棋盤上,在這些點上不能放皇后。請留意這些障礙物並不會防止皇后被攻擊。

在傳統的 8-Queen 問題中,旋轉與鏡射被視為不同解法,因此我們有 92 種可能的方式來放置皇后。

輸入格式

輸入的資料最多包含 10 筆測試個案,每個測試個案的第一行會有一個整數 $n$ ($3 < n < 20$),接下來的 $n$ 行代表棋盤資料,點號 '.' 代表空的盤格,星號 '*' 代表放有障礙物的盤格。

輸出格式

對每筆測試個案,輸出這是第幾個 case 以及這個 case 有幾種可能的放置方式。

範例輸入

1
2
3
4
5
6
7
8
9
10
11
12
13
14
8
........
........
........
........
........
........
........
........
4
.*..
....
....
....

範例輸出

1
2
Case 1: 92
Case 2: 1

Solution

首先,必須先認識最快的 N 皇后問題可以利用 bitmask 的技術,把過多的 memory access 的指令捨去而加速。

  • 樸素平行 Accepted (2766 ms, 512 KB)
  • 負載平衡 Accepted (1771 ms, 5248 KB)

樸素平行

在大多數的教科書上,平行只針對第一個列的位置平行,因此平行度最多 $N$,針對第一列每一個位置都分別開一個 thread 找解,這樣運算時間相當於跑最慢的那一個 thread。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#include <stdio.h>
int y_valid[32] = {};
int dfs(int lv, int row, int dia1, int dia2, int mask) {
if (row == mask)
return 1;
int pos = mask & (~(row | dia1 | dia2)) & y_valid[lv], p;
int ret = 0;
while (pos) {
p = pos & (-pos);
pos -= p;
ret += dfs(lv+1, row|p, (dia1|p)<<1, (dia2|p)>>1, mask);
}
return ret;
}
int totalNQueens(int n) {
int sum = 0;
int chunk = 1;
int row = 0, dia1 = 0, dia2 = 0, mask = (1<<n)-1, lv = 0;
int pos = mask & (~(row | dia1 | dia2)) & y_valid[0], p;
#pragma omp parallel for schedule(dynamic, chunk) reduction(+:sum)
for (int i = 0; i < n; i++) {
if ((pos>>i)&1) {
p = 1<<i;
int t = dfs(lv+1, row|p, (dia1|p)<<1, (dia2|p)>>1, mask);
sum += t;
}
}
return sum;
}
int main() {
int cases = 0, n;
char s[32] = {};
while (scanf("%d", &n) == 1 && n) {
for (int i = 0; i < n; i++) {
scanf("%s", s);
y_valid[i] = (1<<n)-1;
for (int j = 0; j < n; j++) {
if (s[j] == '*')
y_valid[i] ^= (1<<j);
}
}
int ret = totalNQueens(n);
printf("Case %d: %d\n", ++cases, ret);
}
return 0;
}

負載平衡

為了達到每一個 thread 盡可能分到相同的工作量,避免最慘的那一條有太多的解而很慢,做好細粒度 (fine-grain) 的分配,如此一來工作量就很高的機率會相同。

因此,一開始先進行廣度搜索,把前幾列的解展開,也許會獲得 $N^x$ 種盤面,這時候平均切給 $M$ 條 thread,分配的工作量平衡上就有比較好的展現。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#include <stdio.h>
#define MAXQ 16384
#define MAXN 32
int y_valid[32] = {};
typedef struct Board {
int row, dia1, dia2;
} Board;
Board R[MAXQ*MAXN];
int dfs(int lv, int row, int dia1, int dia2, int mask) {
if (row == mask)
return 1;
int pos = mask & (~(row | dia1 | dia2)) & y_valid[lv], p;
int ret = 0;
while (pos) {
p = pos & (-pos);
pos -= p;
ret += dfs(lv+1, row|p, (dia1|p)<<1, (dia2|p)>>1, mask);
}
return ret;
}
int ida_dep, ida_cnt;
void IDA(int lv, int row, int dia1, int dia2, int mask) {
if (lv == ida_dep) {
Board b = {.row = row, .dia1 = dia1, .dia2 = dia2};
R[ida_cnt++] = b;
return ;
}
int pos = mask & (~(row | dia1 | dia2)) & y_valid[lv], p;
while (pos) {
p = pos & (-pos);
pos -= p;
IDA(lv+1, row|p, (dia1|p)<<1, (dia2|p)>>1, mask);
}
}
int totalNQueens(int n) {
int it = 0;
int row = 0, dia1 = 0, dia2 = 0, mask = (1<<n)-1, lv = 0;
for (it = 1; it <= n; it++) {
ida_dep = it, ida_cnt = 0;
IDA(0, row, dia1, dia2, mask);
if (ida_cnt >= MAXQ) {
int sum = 0;
int chunk = 1024;
#pragma omp parallel for schedule(static, chunk) reduction(+:sum)
for (int i = 0; i < ida_cnt; i++)
sum += dfs(it, R[i].row, R[i].dia1, R[i].dia2, mask);
return sum;
}
}
return ida_cnt;
}
int main() {
int cases = 0, n;
char s[32] = {};
while (scanf("%d", &n) == 1 && n) {
for (int i = 0; i < n; i++) {
scanf("%s", s);
y_valid[i] = (1<<n)-1;
for (int j = 0; j < n; j++) {
if (s[j] == '*')
y_valid[i] ^= (1<<j);
}
}
int ret = totalNQueens(n);
printf("Case %d: %d\n", ++cases, ret);
}
return 0;
}
Read More +

批改娘 10025. Fast Image Match (OpenMP)

背景

圖片匹配和字串匹配有一點不同,字串匹配通常要求其子字串與搜尋字串完全相符,而圖片匹配則用相似度為依據,當圖片大、複雜且具有干擾,或者需要匹配數量非常多,更先進的領域會利用特徵擷取,用機率統計的方式來篩選可能的匹配數量,篩選過後才進行圖片的細節匹配。

題目描述

給予兩個圖片 $A, B$,圖片格式為灰階影像,每個像素 $\mathit{pixel}(x, y)$ 採用 8-bits 表示,範圍為 $\mathit{pixel}(x, y) \in [0, 255]$。

舉一個例子,有一個 $3 \times 3$ 的圖片 $A$ 和一個 $2 \times 2$ 的圖片 $B$,用矩陣表示如下:

$$A := \begin{bmatrix} a1 & a2 & a3 \\ a4 & a5 & a6 \\ a7 & a8 & a9 \end{bmatrix} ,\; B := \begin{bmatrix} b1 & b2\\ b3 & b4\\ \end{bmatrix}$$

假設左上角座標 $(1, 1)$ 即 $a1$ 的位置、$(1, 2)$ 即 $a2$ 的位置。

  • 若把影像 $B$ 左上角對齊 $A$ 的 $(1, 1)$ 位置,其差異程度 $\mathit{diff}(A, B) = (a1 - b1)^2 + (a2 - b2)^2 + (a4 - b3)^2 + (a5 - b4)^2$
  • 相同地,對齊 $(2, 1)$ 位置,其差異程度 $\mathit{diff}(A, B) = (a4 - b1)^2 + (a5 - b2)^2 + (a7 - b3)^2 + (a8 - b4)^2$

比較時,整張 $B$ 都要落在 $A$ 中。現在要找到一個對齊位置 $(x, y)$,使得 $\mathit{diff}(A, B)$ 最小。

輸入格式

多組測資,每一組第一行有四個整數 $A_H, A_W, B_H, B_W$,分別表示圖片 $A$ 的高寬、$B$ 的高寬。

接下來會有 $A_H$ 行,每一行上會有 $A_W$ 個整數,第 $i$ 行上的第 $j$ 個整數 $x$ 表示 $A(i, j)$ 的灰階像素值。同樣地,接下來會有 $B_H$ 行,每一行上會有 $B_W$ 個整數,第 $i$ 行上的第 $j$ 個整數 $x$ 表示 $B(i, j)$ 的灰階像素值。

  • $ 1\le A_H, A_W, B_H, B_W \le 500$
  • $ B_H \le A_H, B_W \le A_W$
  • $ 0 \le x \le 255$

輸出格式

對於每一組測資輸出一行,兩個整數 $x, y$,滿足 $\mathit{diff}(A, B)$ 最小。若有相同情況滿足,則優先挑選 $x$ 最小,若 $x$ 仍相同,則挑選 $y$ 最小。

範例輸入

1
2
3
4
5
6
7
8
9
10
11
12
4 4 2 2
0 3 3 0
0 3 3 0
0 0 5 5
0 0 5 5
5 5
5 5
3 5 1 2
2 3 0 4 5
0 0 7 0 0
0 0 7 0 0
3 4

範例輸出

1
2
3 3
1 1

Solution

樸素平行

若圖片是 $N \times N$,樸素算法運作要 $\mathcal{O}(N^4)$,傅立葉算法要 $\mathcal{O}(N^2 \log N)$,這裡針對樸素算法平行。從最高層次的找解平行,每一個匹配位置都要耗費 $\mathcal{O}(M^2)$ 計算相似性,又由於平行限制跟核心數有關,那麼針對每一個 thread 要求找到好幾個 row 的解,分別存在各自的答案陣列區中,之後再線性跑一次取最小值,利用額外的空間儲存,避免在運算過程中遇到 critical section。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
#pragma omp parallel for schedule(dynamic, chunk) shared(A, B)
for (int i = 0; i <= Ch; i++) {
for (int j = 0; j <= Cw; j++) {
long long d = 0;
for (int p = 0; p < Bh; p++) {
for (int q = 0; q < Bw; q++) {
d += (long long) (B[p][q] - A[i+p][j+q])*(B[p][q] - A[i+p][j+q]);
}
}
if (d < diff)
{
#pragma omp critical
if (d < diff)
diff = d, bx = i, by = j;
}
}
}

就是上列的寫法會卡在 critical section,倒不如存在另一個陣列中,效能還差距個幾百毫秒。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
#include <stdio.h>
#include <limits.h>
#define MAXN 512
int A[MAXN][MAXN], B[MAXN][MAXN];
int C[MAXN][MAXN];
int main() {
int Ah, Aw, Bh, Bw, x;
int cases = 0;
while(scanf("%d %d %d %d", &Ah, &Aw, &Bh, &Bw) == 4) {
for (int i = 0; i < Ah; i++)
for (int j = 0; j < Aw; j++)
scanf("%d", &x), A[i][j] = x;
for (int i = 0; i < Bh; i++)
for (int j = 0; j < Bw; j++)
scanf("%d", &x), B[i][j] = x;
int Ch = Ah - Bh, Cw = Aw - Bw;
int bx = -1, by = -1;
int diff = INT_MAX;
#pragma omp parallel for
for (int i = 0; i <= Ch; i++) {
for (int j = 0; j <= Cw; j++) {
int d = 0;
for (int p = 0; p < Bh; p++) {
for (int q = 0; q < Bw; q++) {
d += (B[p][q] - A[i+p][j+q])*(B[p][q] - A[i+p][j+q]);
}
}
C[i][j] = d;
}
}
for (int i = 0; i <= Ch; i++) {
for (int j = 0; j <= Cw; j++)
if (C[i][j] < diff)
diff = C[i][j], bx = i, by = j;
}
printf("%d %d\n", bx + 1, by + 1);
}
return 0;
}

FFT

儘管使用快速傅立葉 (FFT) 已經達到 $\mathcal{O}(N \log N)$ 的時間複雜度的極致,只比平行樸素算法快個兩到三倍,加入平行之後,效能可以達到五到六倍加速。

FFT 用到大量的三角函數計算,一般而言若使用序列化的乘法代替三角函數計算 (疊加角度) 會比重新計算快上很多,但這樣會損失一大部分的精準度 (但速度快很多),而且會產生很多相互依賴的指令,這也導致不容易對迴圈平行。

如果需要平行,每一個三角函數可以預先存在陣列中,每一個 thread 直接存取陣列中的元素即可,但麻煩的地方在於最好使用多核處理器,然而跨越多個處理器有可能會因為快取 … 等機制,導致速度下降,因此這裡採用單一處理器六核心 (只有三個是 physical core),因此 thread 限定在五個以內。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#include <stdio.h>
#include <limits.h>
#include <stdbool.h>
#include <math.h>
#include <string.h>
#include <omp.h>
// FFT header
#define MAXN 262144
typedef struct {
float x, y;
} complex;
typedef unsigned int UINT32;
typedef struct {
int (* const NumberOfBitsNeeded) (int);
UINT32 (* const FastReverseBits) (UINT32, int);
void (* const FFT) (bool, complex[], complex[], int);
void (* const convolution) (double *, double *, int , double *);
} FFT_namespace;
static inline complex init_complex(double a, double b) {
complex t = {a, b};
return t;
}
static inline complex add_complex(complex *a, complex *b) {
return init_complex(a->x + b->x, a->y + b->y);
}
static inline complex sub_complex(complex *a, complex *b) {
return init_complex(a->x - b->x, a->y - b->y);
}
static inline complex mul_complex(complex *a, complex *b) {
return init_complex(a->x * b->x - a->y * b->y, a->x * b->y+ a->y * b->x);
}
static complex pre_theta[2][MAXN];
void prework(int n) {
static int pre_n = 0;
static const double PI = acos(-1);
if (pre_n == n)
return ;
pre_n = n;
pre_theta[0][0] = init_complex(1, 0);
pre_theta[1][0] = init_complex(1, 0);
#pragma omp parallel for
for (register int i = 1; i < n; i++) {
pre_theta[0][i] = init_complex(cos(2*i*PI / n ) , sin(2*i*PI / n ));
pre_theta[1][i] = init_complex(cos(2*i*PI / n ) , -sin(2*i*PI / n ));
}
}
// FFT body
int NumberOfBitsNeeded(int PowerOfTwo) {
for (int i = 0;; ++i) {
if (PowerOfTwo & (1 << i)) {
return i;
}
}
}
UINT32 FastReverseBits(UINT32 a, int NumBits) {
a = ( ( a & 0x55555555U ) << 1 ) | ( ( a & 0xAAAAAAAAU ) >> 1 ) ;
a = ( ( a & 0x33333333U ) << 2 ) | ( ( a & 0xCCCCCCCCU ) >> 2 ) ;
a = ( ( a & 0x0F0F0F0FU ) << 4 ) | ( ( a & 0xF0F0F0F0U ) >> 4 ) ;
a = ( ( a & 0x00FF00FFU ) << 8 ) | ( ( a & 0xFF00FF00U ) >> 8 ) ;
a = ( ( a & 0x0000FFFFU ) << 16 ) | ( ( a & 0xFFFF0000U ) >> 16 ) ;
return a >> (32 - NumBits);
}
void _FFT(bool InverseTransform, complex In[], complex Out[], int n) {
// simultaneous data copy and bit-reversal ordering into outputs
int NumSamples = n;
int NumBits = NumberOfBitsNeeded(NumSamples);
for (int i = 0; i < NumSamples; ++i)
Out[FastReverseBits(i, NumBits)] = In[i];
// the FFT process
#pragma omp parallel
for (int i = 1; i <= NumBits; i++) {
int BlockSize = 1<<i, BlockEnd = BlockSize>>1, BlockCnt = NumSamples/BlockSize;
#pragma omp for
for (int j = 0; j < NumSamples; j += BlockSize) {
complex *t = pre_theta[InverseTransform];
for (int k = 0; k < BlockEnd; k++, t += BlockCnt) {
complex a = mul_complex(&(*t), &Out[k+j+BlockEnd]);
Out[k+j+BlockEnd] = sub_complex(&Out[k+j], &a);
Out[k+j] = add_complex(&Out[k+j], &a);
}
}
}
// normalize if inverse transform
if (InverseTransform) {
#pragma omp parallel for
for (int i = 0; i < NumSamples; ++i) {
Out[i].x = Out[i].x / NumSamples;
}
}
}
void convolution(double *a, double *b, int n, double *c) {
static complex s[MAXN], d1[MAXN], d2[MAXN], y[MAXN];
prework(n);
#pragma omp parallel for
for (int i = 0; i < n; ++i)
s[i] = init_complex(a[i], 0);
_FFT(false, s, d1, n);
s[0] = init_complex(b[0], 0);
#pragma omp parallel for
for (int i = 1; i < n; ++i)
s[i] = init_complex(b[n - i], 0);
_FFT(false, s, d2, n);
#pragma omp parallel for
for (int i = 0; i < n; ++i)
y[i] = mul_complex(&d1[i], &d2[i]);
_FFT(true, y, s, n);
#pragma omp parallel for
for (int i = 0; i < n; ++i)
c[i] = s[i].x;
}
FFT_namespace const FFT = {NumberOfBitsNeeded, FastReverseBits, _FFT, convolution};
// FFT end
double a[MAXN], b[MAXN], c[MAXN];
long long sum[512][512];
long long getArea(int lx, int ly, int rx, int ry) {
long long ret = sum[rx][ry];
if (lx) ret -= sum[lx-1][ry];
if (ly) ret -= sum[rx][ly-1];
if (lx && ly) ret += sum[lx-1][ly-1];
return ret;
}
int main() {
omp_set_num_threads(5);
int m, n, p, q, x, N, M, S;
while (scanf("%d %d %d %d", &m, &n, &p, &q) == 4) {
N = m > p ? m : p, M = n > q ? n : q;
S = 1;
for (; S < N*M; S <<= 1);
memset(a, 0, sizeof(a[0]) * S);
memset(b, 0, sizeof(b[0]) * S);
for (int i = 0; i < m; i++) {
long long s = 0;
for (int j = 0; j < n; j++) {
scanf("%d", &x);
a[i*M+j] = x;
s += x*x;
sum[i][j] = (i > 0 ? sum[i-1][j] : 0) + s;
}
}
for (int i = 0; i < p; i++) {
for (int j = 0; j < q; j++) {
scanf("%d", &x);
b[i*M+j] = x;
}
}
FFT.convolution(a, b, S, c);
int qx = m - p, qy = n - q, bX = INT_MAX, bY = INT_MAX;
long long diff = LONG_MAX;
#pragma omp parallel
{
long long t_diff = LONG_MAX;
int t_bX = INT_MAX, t_bY = INT_MAX;
#pragma omp for
for (int i = 0; i <= qx; i++) {
for (int j = 0; j <= qy; j++) {
long long v = getArea(i, j, i+p-1, j+q-1) - 2*c[i*M + j] + 0.5;
if (v < t_diff)
t_diff = v, t_bX = i, t_bY = j;
}
}
#pragma omp critical
{
if (t_diff < diff)
diff = t_diff, bX = t_bX, bY = t_bY;
if (t_diff == diff && (t_bX < bX || (t_bX == bX && t_bY < bY))) {
bX = t_bX, bY = t_bY;
}
}
}
printf("%d %d\n", bX+1, bY+1);
}
return 0;
}
Read More +

批改娘 10022. Fast Matrix Multiplication

背景

飼料

題目描述

相信不少人都已經實作所謂的矩陣乘法,計算兩個方陣大小為 $N \times N$ 的矩陣 $A, B$。為了方便起見,提供一個偽隨機數的生成,減少在輸入處理浪費的時間,同時也減少上傳測資的辛苦。

根據種子 $c = S1$ 生成矩陣 $A$,種子 $c = S2$ 生成矩陣 $B$,計算矩陣相乘 $A \times B$,為了方便起見,使用 hash 函數進行簽章,最後輸出一個值。由於會牽涉到 overflow 問題,此題作為快取實驗就不考慮這個,overflow 問題都會相同。

matrix.h

1
void multiply(int N, unsigned long A[][2048], unsigned long B[][2048], unsigned long C[][2048]);

matrix.c

1
2
3
4
5
6
7
8
9
10
11
12
#include "matrix.h"
void multiply(int N, unsigned long A[][2048], unsigned long B[][2048], unsigned long C[][2048]) {
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
unsigned long sum = 0; // overflow, let it go.
for (int k = 0; k < N; k++)
sum += A[i][k] * B[k][j];
C[i][j] = sum;
}
}
}

main.c

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#include <stdio.h>
#include "matrix.h"
// #define DEBUG
#define UINT unsigned long
#define MAXN 2048
void rand_gen(UINT c, int N, UINT A[][MAXN]) {
UINT x = 2, n = N*N;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
x = (x * x + c + i + j)%n;
A[i][j] = x;
}
}
}
void print_matrix(int N, UINT A[][MAXN]) {
for (int i = 0; i < N; i++) {
fprintf(stderr, "[");
for (int j = 0; j < N; j++)
fprintf(stderr, " %u", A[i][j]);
fprintf(stderr, " ]\n");
}
}
UINT hash(UINT x) {
return (x * 2654435761LU);
}
UINT signature(int N, UINT A[][MAXN]) {
UINT h = 0;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++)
h = hash(h + A[i][j]);
}
return h;
}
UINT A[MAXN][MAXN], B[MAXN][MAXN], C[MAXN][MAXN];
int main() {
int N, S1, S2;
while (scanf("%d %d %d", &N, &S1, &S2) == 3) {
rand_gen(S1, N, A);
rand_gen(S2, N, B);
multiply(N, A, B, C);
#ifdef DEBUG
print_matrix(N, A);
print_matrix(N, B);
print_matrix(N, C);
#endif
printf("%u\n", signature(N, C));
}
return 0;
}

輸入格式

多組測資,每組測資一行三個整數 $N, S1, S2$

  • $1 \le N \le 1000$
  • $0 \le S1, S2 \le 65536$

輸出格式

每組測資輸出一行,一個整數 signature 的結果。

範例輸入

1
2
2 2 5
2 2 5

範例輸出

1
2
573770929
573770929

解釋

範例輸入產生 $2 \times 2$ 的矩陣。

$$A = \begin{bmatrix} 2 & 3\\ 0 & 0 \end{bmatrix} , B = \begin{bmatrix} 1 & 3\\ 3 & 0 \end{bmatrix} , A \times B = \begin{bmatrix} 11 & 6\\ 0 & 0 \end{bmatrix}$$

備註

  • 2016/02/17 加入平行程式設計 OpenMP 部份,並增加時間限制!

Solution

解法跟 thread 寫法一樣,用 OpenMP 的好處在於不用處理那些 thread 的設定,用 OpenMP 提供的前處理完成執行緒的建造和移除操作。

在這裡特別提供達夫裝置 (Duff’s device) 對於迴圈展開 loop unrolling 的撰寫方式,巧妙地運用 C 語言中的 switch,相比一般寫法需要兩次迴圈處理,最後一個迴圈必須處理剩餘不整除的部分。就實作看起來,在現在版本 4.8 gcc 編譯結果下,效能沒有明顯的差別。

在高等編譯器課程中,聽說迴圈展開的數目最好不是 $2^n$,某些情況會造成 alignment 的 cache miss 的嚴重影響,實際情況還是要看怎麼運用,在這裡就不多做嘗試。

matrix.h

1
2
3
#define UINT unsigned long
void multiply(int N, unsigned long A[][2048], unsigned long B[][2048], unsigned long C[][2048]);

matrix.c (達夫裝置)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
#include "matrix.h"
#define LOOP_UNROLL 8
void transpose(int N, UINT A[][2048]) {
UINT t;
for (int i = 0; i < N; i++)
for (int j = i+1; j < N; j++)
t = A[i][j], A[i][j] = A[j][i], A[j][i] = t;
}
void multiply(int N, UINT A[][2048], UINT B[][2048], UINT C[][2048]) {
transpose(N, B);
#pragma omp parallel for
for (int i = N-1; i >= 0; i--) {
for (int j = N-1; j >= 0; j--) {
register UINT sum = 0;
UINT *a = &A[i][0], *b = &B[j][0];
int k = N;
switch (k % LOOP_UNROLL) {
case 0: do { sum += *a * *b, a++, b++;
case 7: sum += *a * *b, a++, b++;
case 6: sum += *a * *b, a++, b++;
case 5: sum += *a * *b, a++, b++;
case 4: sum += *a * *b, a++, b++;
case 3: sum += *a * *b, a++, b++;
case 2: sum += *a * *b, a++, b++;
case 1: sum += *a * *b, a++, b++;
} while ((k -= LOOP_UNROLL) > 0);
}
C[i][j] = sum;
}
}
}

matrix.c (迴圈展開)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
#include "matrix.h"
#define LOOP_UNROLL 8
void transpose(int N, UINT A[][2048]) {
int chunk = 8;
#pragma omp parallel for schedule(dynamic, chunk)
for (int i = 0; i < N; i++) {
for (int j = i+1; j < N; j++) {
UINT t;
t = A[i][j], A[i][j] = A[j][i], A[j][i] = t;
}
}
}
void multiply(int N, UINT A[][2048], UINT B[][2048], UINT C[][2048]) {
transpose(N, B);
int chunk = 8;
#pragma omp parallel for schedule(dynamic, chunk) shared(C)
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
register UINT sum = 0;
UINT *a = &A[i][0], *b = &B[j][0];
int k;
for (k = 0; k+LOOP_UNROLL < N; k += LOOP_UNROLL) {
sum += *a * *b, a++, b++;
sum += *a * *b, a++, b++;
sum += *a * *b, a++, b++;
sum += *a * *b, a++, b++;
sum += *a * *b, a++, b++;
sum += *a * *b, a++, b++;
sum += *a * *b, a++, b++;
sum += *a * *b, a++, b++;
}
for (; k < N; k++)
sum += *a * *b, a++, b++;
C[i][j] = sum;
}
}
}
Read More +

批改娘 10080. Fast Matrix Multiplication (pthread)

題目描述

相信不少人都已經實作所謂的矩陣乘法,計算兩個方陣大小為 $N \times N$ 的矩陣 $A, B$。為了方便起見,提供一個偽隨機數的生成,減少在輸入處理浪費的時間,同時也減少上傳測資的辛苦。

根據種子 $c = S1$ 生成矩陣 $A$,種子 $c = S2$ 生成矩陣 $B$,計算矩陣相乘 $A \times B$,為了方便起見,使用 hash 函數進行簽章,最後輸出一個值。由於會牽涉到 overflow 問題,此題作為快取實驗就不考慮這個,overflow 問題都會相同。

matrix.h

1
void multiply(int N, unsigned long A[][2048], unsigned long B[][2048], unsigned long C[][2048]);

matrix.c

1
2
3
4
5
6
7
8
9
10
11
12
#include "matrix.h"
void multiply(int N, unsigned long A[][2048], unsigned long B[][2048], unsigned long C[][2048]) {
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
unsigned long sum = 0; // overflow, let it go.
for (int k = 0; k < N; k++)
sum += A[i][k] * B[k][j];
C[i][j] = sum;
}
}
}

main.c

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#include <stdio.h>
#include "matrix.h"
// #define DEBUG
#define UINT unsigned long
#define MAXN 2048
void rand_gen(UINT c, int N, UINT A[][MAXN]) {
UINT x = 2, n = N*N;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
x = (x * x + c + i + j)%n;
A[i][j] = x;
}
}
}
void print_matrix(int N, UINT A[][MAXN]) {
for (int i = 0; i < N; i++) {
fprintf(stderr, "[");
for (int j = 0; j < N; j++)
fprintf(stderr, " %u", A[i][j]);
fprintf(stderr, " ]\n");
}
}
UINT hash(UINT x) {
return (x * 2654435761LU);
}
UINT signature(int N, UINT A[][MAXN]) {
UINT h = 0;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++)
h = hash(h + A[i][j]);
}
return h;
}
static UINT A[MAXN][MAXN], B[MAXN][MAXN], C[MAXN][MAXN];
int main() {
int N, S1, S2;
while (scanf("%d %d %d", &N, &S1, &S2) == 3) {
rand_gen(S1, N, A);
rand_gen(S2, N, B);
multiply(N, A, B, C);
#ifdef DEBUG
print_matrix(N, A);
print_matrix(N, B);
print_matrix(N, C);
#endif
printf("%u\n", signature(N, C));
}
return 0;
}

輸入格式

多組測資,每組測資一行三個整數 $N, S1, S2$

  • $1 \le N \le 1000$
  • $0 \le S1, S2 \le 65536$

輸出格式

每組測資輸出一行,一個整數 signature 的結果。

範例輸入

1
2
2 2 5
2 2 5

範例輸出

1
2
573770929
573770929

解釋

範例輸入產生 $2 \times 2$ 的矩陣。

$$A = \begin{bmatrix} 2 & 3\\ 0 & 0 \end{bmatrix} , B = \begin{bmatrix} 1 & 3\\ 3 & 0 \end{bmatrix} , A \times B = \begin{bmatrix} 11 & 6\\ 0 & 0 \end{bmatrix}$$

Makefile

1
2
matrix.out: matrix.h matrix.c main.c
gcc -std=c99 -O2 matrix.h matrix.c main.c -o matrix -lpthread -lm

Solution

針對矩陣乘法平行很簡單,在 CPU 上由於需要搬運資料,不一定開的 thread 數量接近 core 數量就一定最好。而在平行的部分要把握資料局部性的加速,以下實作根據最終答案進行 row-based 策略切割,每一個 thread 計算那一個 row 的所有答案。

matrix.h

1
2
3
#define UINT unsigned long
void multiply(int N, unsigned long A[][2048], unsigned long B[][2048], unsigned long C[][2048]);

matrix.c

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#include "matrix.h"
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define LOOP_UNROLL 8
#define MAXTHREAD 8
#define MAXN 2048
typedef struct Argu {
int l, r, N;
UINT *A, *B, *C;
} Argu;
static void transpose(int N, UINT A[][2048]) {
UINT t;
for (int i = 0; i < N; i++) {
for (int j = i+1; j < N; j++)
t = A[i][j], A[i][j] = A[j][i], A[j][i] = t;
}
}
void* thread_multiply(void *arg) {
Argu data = * ((Argu *) arg);
int l = data.l, r = data.r;
int N = data.N;
UINT *A = data.A, *B = data.B, *C = data.C;
UINT stkA[MAXN];
for (int i = l; i <= r; i++) {
memcpy(stkA, A + (i * MAXN), sizeof(UINT) * N);
for (int j = 0; j < N; j++) {
UINT sum = 0;
// UINT *a = A + (i * MAXN), *b = B + (j * MAXN);
UINT *a = stkA, *b = B + (j * MAXN);
int k;
for (k = 0; k+LOOP_UNROLL < N; k += LOOP_UNROLL) {
sum += *a * *b, a++, b++;
sum += *a * *b, a++, b++;
sum += *a * *b, a++, b++;
sum += *a * *b, a++, b++;
sum += *a * *b, a++, b++;
sum += *a * *b, a++, b++;
sum += *a * *b, a++, b++;
sum += *a * *b, a++, b++;
}
for (; k < N; k++)
sum += *a * *b, a++, b++;
*(C + (i * MAXN + j)) = sum;
}
}
return NULL;
}
void multiply(int N, UINT A[][2048], UINT B[][2048], UINT C[][2048]) {
transpose(N, B);
pthread_t *threads;
threads = (pthread_t *) malloc(MAXTHREAD * sizeof(pthread_t));
for (int i = 0; i < MAXTHREAD; i++) {
Argu *data = (Argu *) malloc(sizeof(Argu));
data->l = i * N / MAXTHREAD;
data->r = (i+1) * N / MAXTHREAD - 1;
data->N = N;
data->A = &A[0][0], data->B = &B[0][0], data->C = &C[0][0];
pthread_create(&threads[i], NULL, thread_multiply, (void *) (data));
}
for (int i = 0; i < MAXTHREAD; i++)
pthread_join(threads[i], NULL);
return ;
}
Read More +