b451. 圖片匹配

contents

  1. 1. Problem
    1. 1.1. 背景
    2. 1.2. 題目描述
  2. 2. Sample Input
  3. 3. Sample Output
  4. 4. Solution
    1. 4.1. 參考資料
    2. 4.2. 小記
    3. 4.3. FFT
    4. 4.4. FFT 編譯優化
    5. 4.5. NTT/FNT
    6. 4.6. NTT/FNT CRT 加速

Problem

背景

圖片匹配和字串匹配有一點不同,字串匹配通常要求其子字串與搜尋字串完全相符,而圖片匹配則用相似度為依據,當圖片大、複雜且具有干擾,或者需要匹配數量非常多,更先進的領域會利用特徵擷取,用機率統計的方式來篩選可能的匹配數量,篩選過後才進行圖片的細節匹配。

題目描述

給予兩個圖片 $A, B$,圖片格式為灰階影像,每個像素 $\mathit{pixel}(x, y)$ 採用 8-bits 表示,範圍為 $\mathit{pixel}(x, y) \in [0, 255]$

舉一個例子,有一個 $3 \times 3$ 的圖片 $A$ 和一個 $2 \times 2$ 的圖片 $B$,用矩陣表示如下:

$$A := \begin{bmatrix} a1 & a2 & a3 \\ a4 & a5 & a6 \\ a7 & a8 & a9 \end{bmatrix} ,\; B := \begin{bmatrix} b1 & b2\\ b3 & b4\\ \end{bmatrix}$$

假設左上角座標 $(1, 1)$$a1$ 的位置、$(1, 2)$$a2$ 的位置。

  • 把影像 $B$ 左上角對齊 $A$$(1, 1)$ 位置,其差異程度 $\mathit{diff}(A, B) = (a1 - b1)^2 + (a2 - b2)^2 + (a4 - b3)^2 + (a5 - b4)^2$
  • 相同地,對齊 $(2, 1)$ 位置,其差異程度 $\mathit{diff}(A, B) = (a4 - b1)^2 + (a5 - b2)^2 + (a7 - b3)^2 + (a8 - b4)^2$

比較時,整張 $B$ 都要落在 $A$ 中。現在要找到一個對齊位置 $(x, y)$,使得 $\mathit{diff}(A, B)$ 最小。

Sample Input

1
2
3
4
5
6
7
8
9
10
11
12
4 4 2 2
0 3 3 0
0 3 3 0
0 0 5 5
0 0 5 5
5 5
5 5
3 5 1 2
2 3 0 4 5
0 0 7 0 0
0 0 7 0 0
3 4

Sample Output

1
2
3 3
1 1

Solution

參考資料

相同樸素 FFT 題目 ZOJ 1637 - Fast Image Match

小記

雖然不是很明白 FFT,參考上面的資料,了解 FFT 的旋積 (convolution) 可以在 $O(n \log n)$ 完成就好,接著套模板。

從差異公式中得到$\mathit{diff}(A, B) = \sum (a_i - b_i)^2 = \sum a_{i}^{2} - \sum 2 a_i b_i + \sum b_{i}^{2}$$\sum a_{i}^{2}$$\sum b_{i}^{2}$ 都是獨立的,麻煩的是在於$\sum 2 a_i b_i$ 向量內積,若樸素去計算會在 $O(H^4)$ 完成,套用 FFT 旋積計算,得到 $O(H^2 \log H)$。FFT 有一個缺點,在浮點數的複數域下運行,計算時會失去精準度,要四捨五入到整數。儘管如此,由於不用像 NTT 那樣有很多模運算,速度是最快的。

數論變換 (NTT) / 快速數論變換 (FNT),採用費馬數數論變換,取代複數根的疊加,利用原根的性質來完成。NTT/FNT 處理整數域內積時不存在誤差,所有計算皆在整數,但是要取模變得非常慢。

為了加速計算,丟 CRT 下去降一半的 bits,乘法速度只能提升兩倍,但要做兩次計算,速度稍微快一點點而已。在實作時,特別要注意到,CRT 運作時,挑選兩個質數 $P1, \; P2$ 分別計算 FNT,最後 CRT 逆推回去。

其他實作細節

  1. Fast Reverse Bit 使用位元運算取代建表、迴圈方案。除非多測資。
  2. std::complex<double>struct complex 取代,加速 method interface 拷貝。
  3. sin() cos() 不考慮建表,採用乘法和加法疊加,這會損失一點點精度。若固定測資考慮內存池去搞。

在 O1 下編輯跟 O3 一樣快,由於第二點的貢獻,速度直接從快兩倍。0.3s -> 90ms。

FFT

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#include <bits/stdc++.h>
using namespace std;

template<typename T> class TOOL_FFT {
public:
typedef unsigned int UINT32;
int NumberOfBitsNeeded(int PowerOfTwo) {
for (int i = 0;; ++i) {
if (PowerOfTwo & (1 << i)) {
return i;
}
}
}
inline UINT32 FastReverseBits(UINT32 a, int NumBits) {
a = ( ( a & 0x55555555U ) << 1 ) | ( ( a & 0xAAAAAAAAU ) >> 1 ) ;
a = ( ( a & 0x33333333U ) << 2 ) | ( ( a & 0xCCCCCCCCU ) >> 2 ) ;
a = ( ( a & 0x0F0F0F0FU ) << 4 ) | ( ( a & 0xF0F0F0F0U ) >> 4 ) ;
a = ( ( a & 0x00FF00FFU ) << 8 ) | ( ( a & 0xFF00FF00U ) >> 8 ) ;
a = ( ( a & 0x0000FFFFU ) << 16 ) | ( ( a & 0xFFFF0000U ) >> 16 ) ;
return a >> (32 - NumBits);
}

void FFT(bool InverseTransform, vector<complex<T> >& In, vector<complex<T> >& Out) {
// simultaneous data copy and bit-reversal ordering into outputs
int NumSamples = In.size();
int NumBits = NumberOfBitsNeeded(NumSamples);
for (int i = 0; i < NumSamples; ++i) {
Out[FastReverseBits(i, NumBits)] = In[i];
}
// the FFT process
T angle_numerator = acos(-1.) * (InverseTransform ? -2 : 2);
for (int BlockEnd = 1, BlockSize = 2; BlockSize <= NumSamples; BlockSize <<= 1) {
T delta_angle = angle_numerator / BlockSize;
T sin1 = sin(-delta_angle);
T cos1 = cos(-delta_angle);
T sin2 = sin(-delta_angle * 2);
T cos2 = cos(-delta_angle * 2);
for (int i = 0; i < NumSamples; i += BlockSize) {
complex<T> a1(cos1, sin1), a2(cos2, sin2), a0;
for (int j = i, n = 0; n < BlockEnd; ++j, ++n) {
a0 = complex<T>(2 * cos1 * a1.real() - a2.real(), 2 * cos1 * a1.imag() - a2.imag());
a2 = a1;
a1 = a0;
complex<T> a = a0 * Out[j + BlockEnd];
Out[j + BlockEnd] = Out[j] - a;
Out[j] += a;
}
}
BlockEnd = BlockSize;
}
// normalize if inverse transform
if (InverseTransform) {
for (int i = 0; i < NumSamples; ++i) {
Out[i] /= NumSamples;
}
}
}

vector<T> convolution(T *a, T *b, int n) {
vector<std::complex<T>> s(n), d1(n), d2(n), y(n);
vector<T> ret(n);
for (int i = 0; i < n; ++i) {
s[i] = complex<T>(a[i], 0);
}
FFT(false, s, d1);
s[0] = complex<T>(b[0], 0);
for (int i = 1; i < n; ++i) {
s[i] = complex<T>(b[n - i], 0);
}
FFT(false, s, d2);
for (int i = 0; i < n; ++i) {
y[i] = d1[i] * d2[i];
}
FFT(true, y, s);
for (int i = 0; i < n; ++i) {
ret[i] = s[i].real();
}
return ret;
}
};
TOOL_FFT<double> tool;

double a[262144], b[262144];
long long sum[512][512];
long long getArea(int lx, int ly, int rx, int ry) {
long long ret = sum[rx][ry];
if(lx-1 >= 0)
ret -= sum[lx-1][ry];
if(ly-1 >= 0)
ret -= sum[rx][ly-1];
if(lx-1 >= 0 && ly-1 >= 0)
ret += sum[lx-1][ly-1];
return ret;
}
inline int readchar() {
const int N = 1048576;
static char buf[N];
static char *p = buf, *end = buf;
if(p == end) {
if((end = buf + fread(buf, 1, N, stdin)) == buf) return EOF;
p = buf;
}
return *p++;
}
inline int ReadInt(int *x) {
static char c, neg;
while((c = readchar()) < '-') {if(c == EOF) return 0;}
neg = (c == '-') ? -1 : 1;
*x = (neg == 1) ? c-'0' : 0;
while((c = readchar()) >= '0')
*x = (*x << 3) + (*x << 1) + c-'0';
*x *= neg;
return 1;
}
int main() {
int m, n, p, q, x, N, M, S;
while (ReadInt(&m)) {
ReadInt(&n), ReadInt(&p), ReadInt(&q);
N = max(m, p), M = max(n, q);
S = 1;
for (; S < N*M; S <<= 1);
memset(a, 0, sizeof(a[0]) * S);
memset(b, 0, sizeof(b[0]) * S);
for (int i = 0; i < m; i++) {
for (int j = 0; j < n; j++) {
ReadInt(&x);
a[i*M+j] = x;
}
}
for (int i = 0; i < p; i++) {
for (int j = 0; j < q; j++) {
ReadInt(&x);
b[i*M+j] = x;
}
}

for (int i = 0; i < m; i++) {
long long s = 0;
for (int j = 0; j < n; j++) {
x = a[i*M+j];
s += x*x;
sum[i][j] = (i > 0 ? sum[i-1][j] : 0) + s;
}
}
vector<double> r = tool.convolution(a, b, S);
int qx = m - p, qy = n - q, bX = 0, bY = 0;
long long diff = LONG_MAX;
for (int i = 0; i <= qx; i++) {
for (int j = 0; j <= qy; j++) {
long long v = round(getArea(i, j, i+p-1, j+q-1) - 2*r[i*M + j]);
if (v < diff) {
diff = v, bX = i, bY = j;
}
}
}
printf("%d %d\n", bX+1, bY+1);
}
return 0;
}

FFT 編譯優化

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#include <bits/stdc++.h>
using namespace std;

template<typename T> class TOOL_FFT {
public:
struct complex {
T x, y;
complex(T x = 0, T y = 0):
x(x), y(y) {}
complex operator+(const complex &A) {
return complex(x+A.x,y+A.y);
}
complex operator-(const complex &A) {
return complex(x-A.x,y-A.y);
}
complex operator*(const complex &A) {
return complex(x*A.x-y*A.y,x*A.y+y*A.x);
}
};
typedef unsigned int UINT32;
int NumberOfBitsNeeded(int PowerOfTwo) {
for (int i = 0;; ++i) {
if (PowerOfTwo & (1 << i)) {
return i;
}
}
}
inline UINT32 FastReverseBits(UINT32 a, int NumBits) {
a = ( ( a & 0x55555555U ) << 1 ) | ( ( a & 0xAAAAAAAAU ) >> 1 ) ;
a = ( ( a & 0x33333333U ) << 2 ) | ( ( a & 0xCCCCCCCCU ) >> 2 ) ;
a = ( ( a & 0x0F0F0F0FU ) << 4 ) | ( ( a & 0xF0F0F0F0U ) >> 4 ) ;
a = ( ( a & 0x00FF00FFU ) << 8 ) | ( ( a & 0xFF00FF00U ) >> 8 ) ;
a = ( ( a & 0x0000FFFFU ) << 16 ) | ( ( a & 0xFFFF0000U ) >> 16 ) ;
return a >> (32 - NumBits);
}

void FFT(bool InverseTransform, vector<complex>& In, vector<complex>& Out) {
// simultaneous data copy and bit-reversal ordering into outputs
int NumSamples = In.size();
int NumBits = NumberOfBitsNeeded(NumSamples);
for (int i = 0; i < NumSamples; ++i) {
Out[FastReverseBits(i, NumBits)] = In[i];
}
// the FFT process
T angle_numerator = acos(-1.) * (InverseTransform ? -2 : 2);
for (int BlockEnd = 1, BlockSize = 2; BlockSize <= NumSamples; BlockSize <<= 1) {
T delta_angle = angle_numerator / BlockSize;
T sin1 = sin(-delta_angle);
T cos1 = cos(-delta_angle);
T sin2 = sin(-delta_angle * 2);
T cos2 = cos(-delta_angle * 2);
for (int i = 0; i < NumSamples; i += BlockSize) {
complex a1(cos1, sin1), a2(cos2, sin2), a0, a;
int j, n;
for (j = i, n = 0; n+8 < BlockEnd; ) {
#define UNLOOP {\
a0 = complex(2 * cos1 * a1.x - a2.x, 2 * cos1 * a1.y - a2.y); \
a2 = a1, a1 = a0; \
a = a0 * Out[j + BlockEnd]; \
Out[j + BlockEnd] = Out[j] - a; \
Out[j] = Out[j] + a; \
++j, ++n; }
#define UNLOOP8 {UNLOOP UNLOOP UNLOOP UNLOOP UNLOOP UNLOOP UNLOOP UNLOOP}
UNLOOP8;
}
for (; n < BlockEnd; )
UNLOOP;
}
BlockEnd = BlockSize;
}
// normalize if inverse transform
if (InverseTransform) {
for (int i = 0; i < NumSamples; ++i) {
Out[i] = Out[i].x / NumSamples;
}
}
}

void convolution(T *a, T *b, int n, T *c) {
vector<complex> s(n), d1(n), d2(n), y(n);
for (int i = 0; i < n; ++i)
s[i] = complex(a[i], 0);
FFT(false, s, d1);
s[0] = complex(b[0], 0);
for (int i = 1; i < n; ++i)
s[i] = complex(b[n - i], 0);
FFT(false, s, d2);
for (int i = 0; i < n; ++i)
y[i] = d1[i] * d2[i];
FFT(true, y, s);
for (int i = 0; i < n; ++i)
c[i] = s[i].x;
}
};
TOOL_FFT<double> tool;

double a[262144], b[262144], c[262144];
long long sum[512][512];
inline long long getArea(int lx, int ly, int rx, int ry) {
long long ret = sum[rx][ry];
if(lx) ret -= sum[lx-1][ry];
if(ly) ret -= sum[rx][ly-1];
if(lx && ly) ret += sum[lx-1][ly-1];
return ret;
}
inline int readchar() {
const int N = 1048576;
static char buf[N];
static char *p = buf, *end = buf;
if(p == end) {
if((end = buf + fread(buf, 1, N, stdin)) == buf) return EOF;
p = buf;
}
return *p++;
}
inline int ReadInt(int *x) {
static char c, neg;
while((c = readchar()) < '-') {if(c == EOF) return 0;}
neg = (c == '-') ? -1 : 1;
*x = (neg == 1) ? c-'0' : 0;
while((c = readchar()) >= '0')
*x = (*x << 3) + (*x << 1) + c-'0';
*x *= neg;
return 1;
}
int main() {
int m, n, p, q, x, N, M, S;
while (ReadInt(&m)) {
ReadInt(&n), ReadInt(&p), ReadInt(&q);
N = max(m, p), M = max(n, q);
S = 1;
for (; S < N*M; S <<= 1);
memset(a, 0, sizeof(a[0]) * S);
memset(b, 0, sizeof(b[0]) * S);
for (int i = 0; i < m; i++) {
long long s = 0;
for (int j = 0; j < n; j++) {
ReadInt(&x);
a[i*M+j] = x;
s += x*x;
sum[i][j] = (i > 0 ? sum[i-1][j] : 0) + s;
}
}
for (int i = 0; i < p; i++) {
for (int j = 0; j < q; j++) {
ReadInt(&x);
b[i*M+j] = x;
}
}
tool.convolution(a, b, S, c);
int qx = m - p, qy = n - q, bX = 0, bY = 0;
long long diff = LONG_MAX;
for (int i = 0; i <= qx; i++) {
for (int j = 0; j <= qy; j++) {
long long v = getArea(i, j, i+p-1, j+q-1) - 2*c[i*M + j] + 0.5;
if (v < diff)
diff = v, bX = i, bY = j;
}
}
fprintf(stderr, "%lld\n", diff);
printf("%d %d\n", bX+1, bY+1);
}
return 0;
}

NTT/FNT

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#include <bits/stdc++.h>
using namespace std;

typedef unsigned int UINT32;
typedef long long INT64;
class TOOL_NTT {
public:
#define MAXN 262144
const INT64 P = 50000000001507329LL; // prime m = kn+1
const INT64 G = 3;
INT64 wn[20];
INT64 s[MAXN], d1[MAXN], d2[MAXN], y[MAXN];
TOOL_NTT() {
for (int i = 0; i < 20; i++)
wn[i] = mod_pow(G, (P-1) / (1<<i), P);
}
INT64 mod_mul(INT64 a, INT64 b, INT64 mod) {
long long y = (long long)((double)a*b/mod+0.5); // fast for P < 2^58
long long r = (a*b-y*mod)%mod;
return r < 0 ? r + mod : r;
// INT64 ret = 0;
// for (a = a >= mod ? a%mod : a, b = b >= mod ? b%mod : b; b != 0; b>>=1, a <<= 1, a = a >= mod ? a - mod : a) {
// if (b&1) {
// ret += a;
// if (ret >= mod)
// ret -= mod;
// }
// }
// return ret;
}
INT64 mod_pow(INT64 n, INT64 e, INT64 m) {
INT64 x = 1;
for (n = n >= m ? n%m : n; e; e >>= 1) {
if (e&1)
x = mod_mul(x, n, m);
n = mod_mul(n, n, m);
}
return x;
}
int NumberOfBitsNeeded(int PowerOfTwo) {
for (int i = 0;; ++i) {
if (PowerOfTwo & (1 << i)) {
return i;
}
}
}
inline UINT32 FastReverseBits(UINT32 a, int NumBits) {
a = ( ( a & 0x55555555U ) << 1 ) | ( ( a & 0xAAAAAAAAU ) >> 1 ) ;
a = ( ( a & 0x33333333U ) << 2 ) | ( ( a & 0xCCCCCCCCU ) >> 2 ) ;
a = ( ( a & 0x0F0F0F0FU ) << 4 ) | ( ( a & 0xF0F0F0F0U ) >> 4 ) ;
a = ( ( a & 0x00FF00FFU ) << 8 ) | ( ( a & 0xFF00FF00U ) >> 8 ) ;
a = ( ( a & 0x0000FFFFU ) << 16 ) | ( ( a & 0xFFFF0000U ) >> 16 ) ;
return a >> (32 - NumBits);
}
void NTT(int on, INT64 *In, INT64 *Out, int n) {
int NumBits = NumberOfBitsNeeded(n);
for (int i = 0; i < n; ++i)
Out[FastReverseBits(i, NumBits)] = In[i];
for(int h = 2, id = 1; h <= n; h <<= 1, id++) {
for(int j = 0; j < n; j += h) {
INT64 w = 1, u, t;
int block = h/2, blockEnd = j + h/2;
for(int k = j; k < blockEnd; k++) {
u = Out[k], t = mod_mul(w, Out[k+block], P);
Out[k] = u + t;
Out[k+block] = u - t + P;
if (Out[k] >= P) Out[k] -= P;
if (Out[k+block] >= P) Out[k+block] -= P;
w = mod_mul(w, wn[id], P);
}
}
}
if (on == 1) {
for (int i = 1; i < n/2; i++)
swap(Out[i], Out[n-i]);
INT64 invn = mod_pow(n, P-2, P);
for (int i = 0; i < n; i++)
Out[i] = mod_mul(Out[i], invn, P);
}
}
void convolution(INT64 *a, INT64 *b, int n, INT64 *c) {
NTT(0, a, d1, n);
s[0] = b[0];
for (int i = 1; i < n; ++i)
s[i] = b[n-i];
NTT(0, s, d2, n);
for (int i = 0; i < n; i++)
s[i] = mod_mul(d1[i], d2[i], P);
NTT(1, s, c, n);
}
} tool;
INT64 a[262144], b[262144], c[262144];
long long sum[512][512];
inline long long getArea(int lx, int ly, int rx, int ry) {
long long ret = sum[rx][ry];
if(lx) ret -= sum[lx-1][ry];
if(ly) ret -= sum[rx][ly-1];
if(lx && ly) ret += sum[lx-1][ly-1];
return ret;
}
inline int readchar() {
const int N = 1048576;
static char buf[N];
static char *p = buf, *end = buf;
if(p == end) {
if((end = buf + fread(buf, 1, N, stdin)) == buf) return EOF;
p = buf;
}
return *p++;
}
inline int ReadInt(int *x) {
static char c, neg;
while((c = readchar()) < '-') {if(c == EOF) return 0;}
neg = (c == '-') ? -1 : 1;
*x = (neg == 1) ? c-'0' : 0;
while((c = readchar()) >= '0')
*x = (*x << 3) + (*x << 1) + c-'0';
*x *= neg;
return 1;
}
int main() {
int m, n, p, q, x, N, M, S;
while (ReadInt(&m)) {
ReadInt(&n), ReadInt(&p), ReadInt(&q);
N = max(m, p), M = max(n, q);
S = 1;
for (; S < N*M; S <<= 1);
memset(a, 0, sizeof(a[0]) * S);
memset(b, 0, sizeof(b[0]) * S);
for (int i = 0; i < m; i++) {
long long s = 0;
for (int j = 0; j < n; j++) {
ReadInt(&x);
a[i*M+j] = x;
s += x*x;
sum[i][j] = (i > 0 ? sum[i-1][j] : 0) + s;
}
}
for (int i = 0; i < p; i++) {
for (int j = 0; j < q; j++) {
ReadInt(&x);
b[i*M+j] = x;
}
}
tool.convolution(a, b, S, c);
int qx = m - p, qy = n - q, bX = 0, bY = 0;
long long diff = LONG_MAX;
for (int i = 0; i <= qx; i++) {
for (int j = 0; j <= qy; j++) {
long long v = getArea(i, j, i+p-1, j+q-1) - 2*c[i*M + j];
if (v < diff)
diff = v, bX = i, bY = j;
}
}
fprintf(stderr, "diff = %lld\n", diff);
printf("%d %d\n", bX+1, bY+1);
}
return 0;
}

NTT/FNT CRT 加速

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
#include <bits/stdc++.h>
using namespace std;

typedef uint_fast32_t UINT32;
typedef long long INT64;
typedef uint_fast32_t INT32;
class TOOL_NTT {
public:
#define MAXN 262144
// INT64 P = 50000000001507329LL; // prime m = kn+1
// INT64 G = 3;
INT32 P = 3, G = 2;
INT32 wn[20];
INT32 s[MAXN], d1[MAXN], d2[MAXN], c1[MAXN], c2[MAXN];
const INT32 P1 = 998244353; // P1 = 2^23 * 7 * 17 + 1
const INT32 G1 = 3;
const INT32 P2 = 995622913; // P2 = 2^19 *3*3*211 + 1
const INT32 G2 = 5;
const INT64 M1 = 397550359381069386LL;
const INT64 M2 = 596324591238590904LL;
const INT64 MM = 993874950619660289LL; // MM = P1*P2
TOOL_NTT() {
for (int i = 0; i < 20; i++)
wn[i] = mod_pow(G, (P-1) / (1<<i), P);
}
void reset(INT32 p, INT32 g) {
P = p, G = g;
for (int i = 0; i < 20; i++)
wn[i] = mod_pow(G, (P-1) / (1<<i), P);
}
INT64 mod_mul(INT64 a, INT64 b, INT64 mod) {
long long y = (long long)((double)a*b/mod+0.5); // fast for P < 2^58
long long r = (a*b-y*mod)%mod;
return r < 0 ? r + mod : r;
// INT64 ret = 0;
// for (a = a >= mod ? a%mod : a, b = b >= mod ? b%mod : b; b != 0; b>>=1, a <<= 1, a = a >= mod ? a - mod : a) {
// if (b&1) {
// ret += a;
// if (ret >= mod)
// ret -= mod;
// }
// }
// return ret;
}
INT64 mod_pow(INT64 n, INT64 e, INT64 m) {
INT64 x = 1;
for (n = n >= m ? n%m : n; e; e >>= 1) {
if (e&1)
x = mod_mul(x, n, m);
n = mod_mul(n, n, m);
}
return x;
}
int NumberOfBitsNeeded(int PowerOfTwo) {
for (int i = 0;; ++i) {
if (PowerOfTwo & (1 << i)) {
return i;
}
}
}
inline UINT32 FastReverseBits(UINT32 a, int NumBits) {
a = ( ( a & 0x55555555U ) << 1 ) | ( ( a & 0xAAAAAAAAU ) >> 1 ) ;
a = ( ( a & 0x33333333U ) << 2 ) | ( ( a & 0xCCCCCCCCU ) >> 2 ) ;
a = ( ( a & 0x0F0F0F0FU ) << 4 ) | ( ( a & 0xF0F0F0F0U ) >> 4 ) ;
a = ( ( a & 0x00FF00FFU ) << 8 ) | ( ( a & 0xFF00FF00U ) >> 8 ) ;
a = ( ( a & 0x0000FFFFU ) << 16 ) | ( ( a & 0xFFFF0000U ) >> 16 ) ;
return a >> (32 - NumBits);
}
void NTT(int on, INT32 *In, INT32 *Out, int n) {
int NumBits = NumberOfBitsNeeded(n);
for (int i = 0; i < n; ++i)
Out[FastReverseBits(i, NumBits)] = In[i];
for (int h = 2, id = 1; h <= n; h <<= 1, id++) {
for (int j = 0; j < n; j += h) {
INT32 w = 1, u, t;
int block = h/2, blockEnd = j + h/2;
for (int k = j; k < blockEnd; k++) {
u = Out[k], t = (INT64)w*Out[k+block]%P;
Out[k] = (u + t)%P;
Out[k+block] = (u - t + P)%P;
w = (INT64)w * wn[id]%P;
}
}
}
if (on == 1) {
for (int i = 1; i < n/2; i++)
swap(Out[i], Out[n-i]);
INT32 invn = mod_pow(n, P-2, P);
for (int i = 0; i < n; i++)
Out[i] = (INT64)Out[i]*invn%P;
}
}
INT64 crt(INT32 a, INT32 b) {
return (mod_mul(a, M1, MM) + mod_mul(b, M2, MM))%MM;
}
void convolution(INT32 *a, INT32 *b, int n, INT64 *c) {
reset(P1, G1);
NTT(0, a, d1, n);
s[0] = b[0]; for (int i = 1; i < n; ++i) s[i] = b[n-i];
NTT(0, s, d2, n);
for (int i = 0; i < n; i++) s[i] = (INT64)d1[i] * d2[i]%P;
NTT(1, s, c1, n);
reset(P2, G2);
NTT(0, a, d1, n);
s[0] = b[0]; for (int i = 1; i < n; ++i) s[i] = b[n-i];
NTT(0, s, d2, n);
for (int i = 0; i < n; i++) s[i] = (INT64)d1[i] * d2[i]%P;
NTT(1, s, c2, n);
for (int i = 0; i < n; i++)
c[i] = crt(c1[i], c2[i]);
}
} tool;
INT32 a[262144], b[262144];
INT64 c[262144];
long long sum[512][512];
inline long long getArea(int lx, int ly, int rx, int ry) {
long long ret = sum[rx][ry];
if(lx) ret -= sum[lx-1][ry];
if(ly) ret -= sum[rx][ly-1];
if(lx && ly) ret += sum[lx-1][ly-1];
return ret;
}
inline int readchar() {
const int N = 1048576;
static char buf[N];
static char *p = buf, *end = buf;
if(p == end) {
if((end = buf + fread(buf, 1, N, stdin)) == buf) return EOF;
p = buf;
}
return *p++;
}
inline int ReadInt(int *x) {
static char c, neg;
while((c = readchar()) < '-') {if(c == EOF) return 0;}
neg = (c == '-') ? -1 : 1;
*x = (neg == 1) ? c-'0' : 0;
while((c = readchar()) >= '0')
*x = (*x << 3) + (*x << 1) + c-'0';
*x *= neg;
return 1;
}
int main() {
int m, n, p, q, x, N, M, S;
while (ReadInt(&m)) {
ReadInt(&n), ReadInt(&p), ReadInt(&q);
N = max(m, p), M = max(n, q);
S = 1;
for (; S < N*M; S <<= 1);
memset(a, 0, sizeof(a[0]) * S);
memset(b, 0, sizeof(b[0]) * S);
for (int i = 0; i < m; i++) {
long long s = 0;
for (int j = 0; j < n; j++) {
ReadInt(&x);
a[i*M+j] = x;
s += x*x;
sum[i][j] = (i > 0 ? sum[i-1][j] : 0) + s;
}
}
for (int i = 0; i < p; i++) {
for (int j = 0; j < q; j++) {
ReadInt(&x);
b[i*M+j] = x;
}
}
tool.convolution(a, b, S, c);
int qx = m - p, qy = n - q, bX = 0, bY = 0;
long long diff = LONG_MAX;
for (int i = 0; i <= qx; i++) {
for (int j = 0; j <= qy; j++) {
long long v = getArea(i, j, i+p-1, j+q-1) - 2*c[i*M + j];
if (v < diff)
diff = v, bX = i, bY = j;
}
}
fprintf(stderr, "diff = %lld\n", diff);
printf("%d %d\n", bX+1, bY+1);
}
return 0;
}