2015-07-21

b451. 圖片匹配

1. Problem
1. 1.1. 背景
2. 1.2. 題目描述
2. Sample Input
3. Sample Output
4. Solution

Problem

背景

圖片匹配和字串匹配有一點不同，字串匹配通常要求其子字串與搜尋字串完全相符，而圖片匹配則用相似度為依據，當圖片大、複雜且具有干擾，或者需要匹配數量非常多，更先進的領域會利用特徵擷取，用機率統計的方式來篩選可能的匹配數量，篩選過後才進行圖片的細節匹配。

題目描述

給予兩個圖片 $A, B$，圖片格式為灰階影像，每個像素 $\mathit{pixel}(x, y)$ 採用 8-bits 表示，範圍為 $\mathit{pixel}(x, y) \in [0, 255]$。

舉一個例子，有一個 $3 \times 3$ 的圖片 $A$ 和一個 $2 \times 2$ 的圖片 $B$，用矩陣表示如下：

$$A := \begin{bmatrix} a1 & a2 & a3 \\ a4 & a5 & a6 \\ a7 & a8 & a9 \end{bmatrix} ,\; B := \begin{bmatrix} b1 & b2\\ b3 & b4\\ \end{bmatrix}$$

假設左上角座標 $(1, 1)$ 即 $a1$ 的位置、$(1, 2)$ 即 $a2$ 的位置。

把影像 $B$ 左上角對齊 $A$ 的 $(1, 1)$ 位置，其差異程度 $\mathit{diff}(A, B) = (a1 - b1)^2 + (a2 - b2)^2 + (a4 - b3)^2 + (a5 - b4)^2$
相同地，對齊 $(2, 1)$ 位置，其差異程度 $\mathit{diff}(A, B) = (a4 - b1)^2 + (a5 - b2)^2 + (a7 - b3)^2 + (a8 - b4)^2$

比較時，整張 $B$ 都要落在 $A$ 中。現在要找到一個對齊位置 $(x, y)$，使得 $\mathit{diff}(A, B)$ 最小。

Sample Input

Sample Output

1
2

3 3
1 1

Solution

參考資料

相同樸素 FFT 題目 ZOJ 1637 - Fast Image Match

小記

雖然不是很明白 FFT，參考上面的資料，了解 FFT 的旋積 (convolution) 可以在 $O(n \log n)$ 完成就好，接著套模板。

從差異公式中得到 $\mathit{diff}(A, B) = \sum (a_i - b_i)^2 = \sum a_{i}^{2} - \sum 2 a_i b_i + \sum b_{i}^{2}$。其中 $\sum a_{i}^{2}$ 和 $\sum b_{i}^{2}$ 都是獨立的，麻煩的是在於 $\sum 2 a_i b_i$ 向量內積，若樸素去計算會在 $O(H^4)$ 完成，套用 FFT 旋積計算，得到 $O(H^2 \log H)$。FFT 有一個缺點，在浮點數的複數域下運行，計算時會失去精準度，要四捨五入到整數。儘管如此，由於不用像 NTT 那樣有很多模運算，速度是最快的。

數論變換 (NTT) / 快速數論變換 (FNT)，採用費馬數數論變換，取代複數根的疊加，利用原根的性質來完成。NTT/FNT 處理整數域內積時不存在誤差，所有計算皆在整數，但是要取模變得非常慢。

為了加速計算，丟 CRT 下去降一半的 bits，乘法速度只能提升兩倍，但要做兩次計算，速度稍微快一點點而已。在實作時，特別要注意到，CRT 運作時，挑選兩個質數 $P1, \; P2$ 分別計算 FNT，最後 CRT 逆推回去。

其他實作細節

Fast Reverse Bit 使用位元運算取代建表、迴圈方案。除非多測資。
std::complex<double> 改用自訂的 struct complex 取代，加速 method interface 的拷貝。
sin() cos() 不考慮建表，採用乘法和加法疊加，這會損失一點點精度。若固定測資考慮內存池去搞。

在 O1 下編譯跟 O3 一樣快；由於第二點的貢獻，速度直接快了兩倍以上（0.3s -> 90ms）。

FFT

#include <bits/stdc++.h>
using namespace std;
/**
 * Iterative radix-2 FFT on std::complex<T> (bit-reversal permutation followed
 * by butterfly passes), plus a helper that uses it to compute the circular
 * cross-correlation of two real sequences.
 *
 * Every transform length must be a power of two.
 */
template<typename T> class TOOL_FFT {
public:
    typedef unsigned int UINT32;

    /**
     * Returns the index of the lowest set bit of PowerOfTwo, i.e. log2 of a
     * power of two. Non-positive input yields 0 instead of looping forever.
     */
    int NumberOfBitsNeeded(int PowerOfTwo) {
        if (PowerOfTwo <= 0)
            return 0;
        int bits = 0;
        while (!(PowerOfTwo & (1 << bits)))
            ++bits;
        return bits;
    }

    /**
     * Reverses the lowest NumBits bits of a.
     *
     * The whole 32-bit word is mirrored with five swap stages and the result
     * is shifted back down. NumBits == 0 (a one-sample transform) returns 0
     * directly, because shifting a 32-bit value by 32 is undefined behaviour.
     */
    inline UINT32 FastReverseBits(UINT32 a, int NumBits) {
        if (NumBits <= 0)
            return 0;
        a = ((a & 0x55555555U) << 1) | ((a & 0xAAAAAAAAU) >> 1);
        a = ((a & 0x33333333U) << 2) | ((a & 0xCCCCCCCCU) >> 2);
        a = ((a & 0x0F0F0F0FU) << 4) | ((a & 0xF0F0F0F0U) >> 4);
        a = ((a & 0x00FF00FFU) << 8) | ((a & 0xFF00FF00U) >> 8);
        a = ((a & 0x0000FFFFU) << 16) | ((a & 0xFFFF0000U) >> 16);
        return a >> (32 - NumBits);
    }

    /**
     * Transforms In into Out (out of place; In is left untouched).
     *
     * @param InverseTransform  true for the inverse transform, which is also
     *                          divided by the number of samples.
     * @param In                input samples; the size must be a power of two.
     * @param Out               receives the result; grown if it is too small.
     */
    void FFT(bool InverseTransform, std::vector<std::complex<T> >& In, std::vector<std::complex<T> >& Out) {
        const int NumSamples = static_cast<int>(In.size());
        if (NumSamples == 0)
            return;
        if (Out.size() < In.size())
            Out.resize(In.size());

        // simultaneous data copy and bit-reversal ordering into outputs
        const int NumBits = NumberOfBitsNeeded(NumSamples);
        for (int i = 0; i < NumSamples; ++i) {
            Out[FastReverseBits(i, NumBits)] = In[i];
        }

        // the FFT process
        const T angle_numerator = static_cast<T>(std::acos(-1.) * (InverseTransform ? -2 : 2));
        for (int BlockEnd = 1, BlockSize = 2; BlockSize <= NumSamples; BlockSize <<= 1) {
            const T delta_angle = angle_numerator / BlockSize;
            const T sin1 = std::sin(-delta_angle);
            const T cos1 = std::cos(-delta_angle);
            const T sin2 = std::sin(-delta_angle * 2);
            const T cos2 = std::cos(-delta_angle * 2);
            for (int i = 0; i < NumSamples; i += BlockSize) {
                // Twiddle factors come from the recurrence
                // w[k] = 2*cos(delta)*w[k-1] - w[k-2], so no sin/cos call is
                // needed per element (at the price of a little accuracy).
                std::complex<T> a1(cos1, sin1), a2(cos2, sin2), a0;
                for (int j = i, n = 0; n < BlockEnd; ++j, ++n) {
                    a0 = std::complex<T>(2 * cos1 * a1.real() - a2.real(), 2 * cos1 * a1.imag() - a2.imag());
                    a2 = a1;
                    a1 = a0;
                    const std::complex<T> a = a0 * Out[j + BlockEnd];
                    Out[j + BlockEnd] = Out[j] - a;
                    Out[j] += a;
                }
            }
            BlockEnd = BlockSize;
        }

        // normalize if inverse transform
        if (InverseTransform) {
            for (int i = 0; i < NumSamples; ++i) {
                Out[i] /= static_cast<T>(NumSamples);
            }
        }
    }

    /**
     * Circular cross-correlation of two real sequences of length n:
     * ret[k] = sum over j of a[(k + j) mod n] * b[j].
     *
     * b is index-reversed before its transform, which turns the cyclic
     * convolution produced by the FFT product into a correlation.
     *
     * @param a,b  input arrays with at least n elements each.
     * @param n    transform length, a power of two; n <= 0 yields an empty result.
     * @return     the real parts of the inverse transform (not rounded).
     */
    std::vector<T> convolution(T *a, T *b, int n) {
        if (n <= 0)
            return std::vector<T>();
        std::vector<std::complex<T> > s(n), d1(n), d2(n), y(n);
        std::vector<T> ret(n);
        for (int i = 0; i < n; ++i) {
            s[i] = std::complex<T>(a[i], 0);
        }
        FFT(false, s, d1);
        s[0] = std::complex<T>(b[0], 0);
        for (int i = 1; i < n; ++i) {
            s[i] = std::complex<T>(b[n - i], 0);
        }
        FFT(false, s, d2);
        for (int i = 0; i < n; ++i) {
            y[i] = d1[i] * d2[i];
        }
        FFT(true, y, s);
        for (int i = 0; i < n; ++i) {
            ret[i] = s[i].real();
        }
        return ret;
    }
};
// FFT helper instance used by main() for the correlation step.
TOOL_FFT<double> tool;
// Flattened, zero-padded copies of picture A (a) and picture B (b), both
// stored with the same row stride; 262144 = 512 * 512 samples each.
double a[262144], b[262144];
// sum[i][j] is the inclusive 2-D prefix sum over rows 0..i and columns 0..j
// (main() fills it with the squared pixels of picture A).
long long sum[512][512];
/**
 * Sum of the prefix-summed quantity over the inclusive rectangle with
 * top-left corner (lx, ly) and bottom-right corner (rx, ry), obtained from
 * sum[][] by inclusion-exclusion.
 */
long long getArea(int lx, int ly, int rx, int ry) {
    const bool rowsAbove = lx >= 1;
    const bool colsLeft = ly >= 1;
    long long total = sum[rx][ry];
    if (rowsAbove)
        total -= sum[lx - 1][ry];
    if (colsLeft)
        total -= sum[rx][ly - 1];
    // the block above and to the left was subtracted twice
    if (rowsAbove && colsLeft)
        total += sum[lx - 1][ly - 1];
    return total;
}
/**
 * Buffered replacement for getchar(): returns the next byte of stdin, or EOF
 * once the stream is exhausted.
 *
 * Input is pulled in 1 MiB chunks with fread. EOF is sticky: every call made
 * after the end of input returns EOF again.
 */
inline int readchar() {
    const int N = 1048576;
    static char buf[N];
    static char *p = buf, *end = buf;
    if (p == end) {
        // Rewind both cursors before inspecting the byte count. Leaving p at
        // the old end while end moves back to buf would make "p == end" false
        // forever after EOF, and later calls would walk past the valid data.
        p = buf;
        end = buf + fread(buf, 1, N, stdin);
        if (end == buf)
            return EOF;
    }
    return *p++;
}
/**
 * Reads the next decimal integer, optionally preceded by '-', via readchar().
 *
 * Characters that are neither '-' nor a digit are skipped before the number.
 * The character that ends the digit run is consumed.
 *
 * @param x  receives the parsed value; untouched when no number is found.
 * @return   1 when a value was stored in *x, 0 when input ended first.
 */
inline int ReadInt(int *x) {
    // Keep the character in an int: EOF stored in a plain char cannot be
    // recognised on targets where char is unsigned.
    int c = readchar();
    while (c != EOF && c != '-' && (c < '0' || c > '9'))
        c = readchar();
    if (c == EOF)
        return 0;

    const bool negative = (c == '-');
    int value = negative ? 0 : c - '0';
    // Stop at the first non-digit; a bare ">= '0'" test would also accept
    // letters and any other byte above '9'.
    for (c = readchar(); c >= '0' && c <= '9'; c = readchar())
        value = value * 10 + (c - '0');
    *x = negative ? -value : value;
    return 1;
}
int main() {
    int m, n, p, q, x, N, M, S;
    while (ReadInt(&m)) {
    	ReadInt(&n), ReadInt(&p), ReadInt(&q);
        N = max(m, p), M = max(n, q);
        S = 1;
        for (; S < N*M; S <<= 1);
        memset(a, 0, sizeof(a[0]) * S);
        memset(b, 0, sizeof(b[0]) * S);
        for (int i = 0; i < m; i++) {
            for (int j = 0; j < n; j++) {
            	ReadInt(&x);
            	a[i*M+j] = x;                
            }
        }
        for (int i = 0; i < p; i++) {
            for (int j = 0; j < q; j++) {
            	ReadInt(&x);
            	b[i*M+j] = x;
            }
        }
        
        for (int i = 0; i < m; i++) {
        	long long s = 0;
        	for (int j = 0; j < n; j++) {
        		x = a[i*M+j];
        		s += x*x;
        		sum[i][j] = (i > 0 ? sum[i-1][j] : 0) + s;
        	}
        }
        vector<double> r = tool.convolution(a, b, S);
        int qx = m - p, qy = n - q, bX = 0, bY = 0;
        long long diff = LONG_MAX;
        for (int i = 0; i <= qx; i++)    {
            for (int j = 0; j <= qy; j++) {
                long long v = round(getArea(i, j, i+p-1, j+q-1) - 2*r[i*M + j]);
                if (v < diff) {
                    diff = v, bX = i, bY = j;
                }
            }
        }
        printf("%d %d\n", bX+1, bY+1);
    }
    return 0;
}

FFT 編譯優化

#include <bits/stdc++.h>
using namespace std;
/**
 * Iterative radix-2 FFT on a minimal hand-written complex type, with the
 * butterfly loop unrolled eight times, plus a circular cross-correlation
 * helper built on top of it.
 *
 * Every transform length must be a power of two.
 */
template<typename T> class TOOL_FFT {
public:
    /** Plain complex number: x is the real part, y the imaginary part. */
    struct complex {
        T x, y;
        complex(T x = 0, T y = 0):
            x(x), y(y) {}
        complex operator+(const complex &A) const {
            return complex(x+A.x, y+A.y);
        }
        complex operator-(const complex &A) const {
            return complex(x-A.x, y-A.y);
        }
        complex operator*(const complex &A) const {
            return complex(x*A.x - y*A.y, x*A.y + y*A.x);
        }
    };
    typedef unsigned int UINT32;

    /**
     * Returns the index of the lowest set bit of PowerOfTwo, i.e. log2 of a
     * power of two. Non-positive input yields 0 instead of looping forever.
     */
    int NumberOfBitsNeeded(int PowerOfTwo) {
        if (PowerOfTwo <= 0)
            return 0;
        int bits = 0;
        while (!(PowerOfTwo & (1 << bits)))
            ++bits;
        return bits;
    }

    /**
     * Reverses the lowest NumBits bits of a. NumBits == 0 (a one-sample
     * transform) returns 0 directly, because shifting a 32-bit value by 32
     * is undefined behaviour.
     */
    inline UINT32 FastReverseBits(UINT32 a, int NumBits) {
        if (NumBits <= 0)
            return 0;
        a = ((a & 0x55555555U) << 1) | ((a & 0xAAAAAAAAU) >> 1);
        a = ((a & 0x33333333U) << 2) | ((a & 0xCCCCCCCCU) >> 2);
        a = ((a & 0x0F0F0F0FU) << 4) | ((a & 0xF0F0F0F0U) >> 4);
        a = ((a & 0x00FF00FFU) << 8) | ((a & 0xFF00FF00U) >> 8);
        a = ((a & 0x0000FFFFU) << 16) | ((a & 0xFFFF0000U) >> 16);
        return a >> (32 - NumBits);
    }

    // One butterfly step. It is a macro so that it can be pasted eight times
    // into the hot loop; it expects a0, a1, a2, a, j, n, cos1, BlockEnd and
    // Out to be in scope. Both macros are undefined again right after FFT().
#define UNLOOP {\
    a0 = complex(2 * cos1 * a1.x - a2.x, 2 * cos1 * a1.y - a2.y); \
    a2 = a1, a1 = a0; \
    a = a0 * Out[j + BlockEnd]; \
    Out[j + BlockEnd] = Out[j] - a; \
    Out[j] = Out[j] + a; \
    ++j, ++n; }
#define UNLOOP8	{UNLOOP UNLOOP UNLOOP UNLOOP UNLOOP UNLOOP UNLOOP UNLOOP}

    /**
     * Transforms In into Out (out of place; In is left untouched).
     *
     * @param InverseTransform  true for the inverse transform, which is also
     *                          divided by the number of samples.
     * @param In                input samples; the size must be a power of two.
     * @param Out               receives the result; grown if it is too small.
     */
    void FFT(bool InverseTransform, std::vector<complex>& In, std::vector<complex>& Out) {
        const int NumSamples = static_cast<int>(In.size());
        if (NumSamples == 0)
            return;
        if (Out.size() < In.size())
            Out.resize(In.size());

        // simultaneous data copy and bit-reversal ordering into outputs
        const int NumBits = NumberOfBitsNeeded(NumSamples);
        for (int i = 0; i < NumSamples; ++i) {
            Out[FastReverseBits(i, NumBits)] = In[i];
        }

        // the FFT process
        const T angle_numerator = static_cast<T>(std::acos(-1.) * (InverseTransform ? -2 : 2));
        for (int BlockEnd = 1, BlockSize = 2; BlockSize <= NumSamples; BlockSize <<= 1) {
            const T delta_angle = angle_numerator / BlockSize;
            const T sin1 = std::sin(-delta_angle);
            const T cos1 = std::cos(-delta_angle);
            const T sin2 = std::sin(-delta_angle * 2);
            const T cos2 = std::cos(-delta_angle * 2);
            for (int i = 0; i < NumSamples; i += BlockSize) {
                // Twiddle factors follow w[k] = 2*cos(delta)*w[k-1] - w[k-2].
                complex a1(cos1, sin1), a2(cos2, sin2), a0, a;
                int j = i, n = 0;
                // eight butterflies per iteration, then the remainder
                while (n + 8 < BlockEnd) {
                    UNLOOP8;
                }
                while (n < BlockEnd) {
                    UNLOOP;
                }
            }
            BlockEnd = BlockSize;
        }

        // Normalize if inverse transform. Both components are scaled;
        // assigning only x / NumSamples would zero the imaginary part.
        if (InverseTransform) {
            for (int i = 0; i < NumSamples; ++i) {
                Out[i] = complex(Out[i].x / NumSamples, Out[i].y / NumSamples);
            }
        }
    }
#undef UNLOOP8
#undef UNLOOP

    /**
     * Circular cross-correlation of two real sequences of length n:
     * c[k] = sum over j of a[(k + j) mod n] * b[j].
     *
     * @param a,b  input arrays with at least n elements each.
     * @param n    transform length, a power of two; nothing is done for n <= 0.
     * @param c    receives n results (real parts, not rounded).
     */
    void convolution(T *a, T *b, int n, T *c) {
        if (n <= 0)
            return;
        std::vector<complex> s(n), d1(n), d2(n), y(n);
        for (int i = 0; i < n; ++i)
            s[i] = complex(a[i], 0);
        FFT(false, s, d1);
        // index-reversing b turns the cyclic convolution into a correlation
        s[0] = complex(b[0], 0);
        for (int i = 1; i < n; ++i)
            s[i] = complex(b[n - i], 0);
        FFT(false, s, d2);
        for (int i = 0; i < n; ++i)
            y[i] = d1[i] * d2[i];
        FFT(true, y, s);
        for (int i = 0; i < n; ++i)
            c[i] = s[i].x;
    }
};
// FFT helper instance used by main() for the correlation step.
TOOL_FFT<double> tool;
// Flattened, zero-padded copies of picture A (a) and picture B (b), and the
// correlation output (c); 262144 = 512 * 512 samples each.
double a[262144], b[262144], c[262144];
// sum[i][j] is the inclusive 2-D prefix sum over rows 0..i and columns 0..j
// (main() fills it with the squared pixels of picture A).
long long sum[512][512];
/**
 * Rectangle query on sum[][]: total over rows lx..rx and columns ly..ry
 * (all bounds inclusive), by inclusion-exclusion of the four corner prefixes.
 */
inline long long getArea(int lx, int ly, int rx, int ry) {
    const long long above = (lx != 0) ? sum[lx-1][ry] : 0;
    const long long left = (ly != 0) ? sum[rx][ly-1] : 0;
    const long long corner = (lx != 0 && ly != 0) ? sum[lx-1][ly-1] : 0;
    return sum[rx][ry] - above - left + corner;
}
/**
 * Buffered replacement for getchar(): returns the next byte of stdin, or EOF
 * once the stream is exhausted.
 *
 * Input is pulled in 1 MiB chunks with fread. EOF is sticky: every call made
 * after the end of input returns EOF again.
 */
inline int readchar() {
    const int N = 1048576;
    static char buf[N];
    static char *p = buf, *end = buf;
    if (p == end) {
        // Rewind both cursors before inspecting the byte count. Leaving p at
        // the old end while end moves back to buf would make "p == end" false
        // forever after EOF, and later calls would walk past the valid data.
        p = buf;
        end = buf + fread(buf, 1, N, stdin);
        if (end == buf)
            return EOF;
    }
    return *p++;
}
/**
 * Reads the next decimal integer, optionally preceded by '-', via readchar().
 *
 * Characters that are neither '-' nor a digit are skipped before the number.
 * The character that ends the digit run is consumed.
 *
 * @param x  receives the parsed value; untouched when no number is found.
 * @return   1 when a value was stored in *x, 0 when input ended first.
 */
inline int ReadInt(int *x) {
    // Keep the character in an int: EOF stored in a plain char cannot be
    // recognised on targets where char is unsigned.
    int c = readchar();
    while (c != EOF && c != '-' && (c < '0' || c > '9'))
        c = readchar();
    if (c == EOF)
        return 0;

    const bool negative = (c == '-');
    int value = negative ? 0 : c - '0';
    // Stop at the first non-digit; a bare ">= '0'" test would also accept
    // letters and any other byte above '9'.
    for (c = readchar(); c >= '0' && c <= '9'; c = readchar())
        value = value * 10 + (c - '0');
    *x = negative ? -value : value;
    return 1;
}
/**
 * For every test case on stdin (m n p q, then the m x n picture A and the
 * p x q picture B) prints the 1-based alignment (row, column) of B inside A
 * that minimises the sum of squared differences. The best score is also
 * written to stderr.
 *
 * The score used is sum(a^2 over the window) - 2 * sum(a*b); the sum(b^2)
 * term is the same for every alignment and is left out, so the score can be
 * negative. Ties keep the first position in row-major order.
 */
int main() {
    int m, n, p, q, x, N, M, S;
    while (ReadInt(&m)) {
        // A truncated header would leave the sizes unset; stop instead.
        if (!ReadInt(&n) || !ReadInt(&p) || !ReadInt(&q))
            break;
        N = std::max(m, p), M = std::max(n, q);
        // sum[][] is 512 x 512 and a[], b[], c[] hold 512 * 512 samples:
        // refuse sizes that would index outside those buffers.
        if (std::min(std::min(m, n), std::min(p, q)) < 1 || N > 512 || M > 512)
            break;
        // transform length: the smallest power of two covering N * M samples
        S = 1;
        while (S < N*M)
            S <<= 1;
        memset(a, 0, sizeof(a[0]) * S);
        memset(b, 0, sizeof(b[0]) * S);
        // Picture A, stored with row stride M, together with the 2-D prefix
        // sums of its squared pixels.
        for (int i = 0; i < m; i++) {
            long long rowSquares = 0;
            for (int j = 0; j < n; j++) {
                ReadInt(&x);
                a[i*M+j] = x;
                rowSquares += (long long)x * x;
                sum[i][j] = (i > 0 ? sum[i-1][j] : 0) + rowSquares;
            }
        }
        // Picture B uses the same row stride so the offsets line up with A.
        for (int i = 0; i < p; i++) {
            for (int j = 0; j < q; j++) {
                ReadInt(&x);
                b[i*M+j] = x;
            }
        }
        tool.convolution(a, b, S, c);
        int qx = m - p, qy = n - q, bX = 0, bY = 0;
        // LLONG_MAX rather than LONG_MAX: long is only 32 bits on LLP64
        // targets, which is smaller than the scores compared below.
        long long diff = LLONG_MAX;
        for (int i = 0; i <= qx; i++) {
            for (int j = 0; j <= qy; j++) {
                // Round to nearest. "+ 0.5" followed by truncation rounds
                // negative scores the wrong way (-1 would become 0).
                long long v = std::llround(getArea(i, j, i+p-1, j+q-1) - 2*c[i*M + j]);
                if (v < diff)
                    diff = v, bX = i, bY = j;
            }
        }
        fprintf(stderr, "%lld\n", diff);
        printf("%d %d\n", bX+1, bY+1);
    }
    return 0;
}

NTT/FNT

#include <bits/stdc++.h>
using namespace std;
typedef unsigned int UINT32;
typedef long long INT64;
/**
 * Number-theoretic transform modulo the 56-bit prime P, plus a circular
 * cross-correlation helper. All results are exact integers modulo P.
 *
 * P - 1 is divisible by 2^18 but not by 2^19, so transform lengths must be
 * powers of two no larger than MAXN (2^18).
 */
class TOOL_NTT {
public:
#define MAXN 262144
    const INT64 P = 50000000001507329LL; // prime m = kn+1
    const INT64 G = 3;
    // wn[i] = G^((P-1) / 2^i) mod P, used as the root for the pass with
    // block size 2^i. Only indices up to 18 are meaningful (see above).
    INT64 wn[20];
    // scratch buffers used by convolution()
    INT64 s[MAXN], d1[MAXN], d2[MAXN], y[MAXN];

    TOOL_NTT() {
        for (int i = 0; i < 20; i++)
            wn[i] = mod_pow(G, (P-1) / (1<<i), P);
    }

    /**
     * Returns (a * b) mod `mod` for operands in [0, mod).
     *
     * The quotient is estimated in double precision; the remaining error is
     * a small multiple of mod, which the final % removes (the original note
     * says this is fast for mod < 2^58). The products are formed in unsigned
     * arithmetic so that the intended wrap-around modulo 2^64 is well
     * defined instead of being signed overflow.
     */
    INT64 mod_mul(INT64 a, INT64 b, INT64 mod) {
        typedef unsigned long long U64;
        const long long quotient = (long long)((double)a*b/mod+0.5); // fast for P < 2^58
        const long long r = (long long)((U64)a * (U64)b - (U64)quotient * (U64)mod) % mod;
        return r < 0 ? r + mod : r;
    }

    /** Returns n^e mod m by binary exponentiation. */
    INT64 mod_pow(INT64 n, INT64 e, INT64 m) {
        INT64 x = 1;
        for (n = n >= m ? n%m : n; e; e >>= 1) {
            if (e&1)
                x = mod_mul(x, n, m);
            n = mod_mul(n, n, m);
        }
        return x;
    }

    /**
     * Returns the index of the lowest set bit of PowerOfTwo, i.e. log2 of a
     * power of two. Non-positive input yields 0 instead of looping forever.
     */
    int NumberOfBitsNeeded(int PowerOfTwo) {
        if (PowerOfTwo <= 0)
            return 0;
        int bits = 0;
        while (!(PowerOfTwo & (1 << bits)))
            ++bits;
        return bits;
    }

    /**
     * Reverses the lowest NumBits bits of a. NumBits == 0 (a one-sample
     * transform) returns 0 directly, because shifting a 32-bit value by 32
     * is undefined behaviour.
     */
    inline UINT32 FastReverseBits(UINT32 a, int NumBits) {
        if (NumBits <= 0)
            return 0;
        a = ((a & 0x55555555U) << 1) | ((a & 0xAAAAAAAAU) >> 1);
        a = ((a & 0x33333333U) << 2) | ((a & 0xCCCCCCCCU) >> 2);
        a = ((a & 0x0F0F0F0FU) << 4) | ((a & 0xF0F0F0F0U) >> 4);
        a = ((a & 0x00FF00FFU) << 8) | ((a & 0xFF00FF00U) >> 8);
        a = ((a & 0x0000FFFFU) << 16) | ((a & 0xFFFF0000U) >> 16);
        return a >> (32 - NumBits);
    }

    /**
     * Transforms In into Out (out of place), both of length n.
     *
     * @param on  1 for the inverse transform, anything else for the forward one.
     * @param n   transform length, a power of two not larger than MAXN.
     */
    void NTT(int on, INT64 *In, INT64 *Out, int n) {
        const int NumBits = NumberOfBitsNeeded(n);
        for (int i = 0; i < n; ++i)
            Out[FastReverseBits(i, NumBits)] = In[i];
        for (int h = 2, id = 1; h <= n; h <<= 1, id++) {
            const int block = h/2;
            for (int j = 0; j < n; j += h) {
                INT64 w = 1;
                const int blockEnd = j + block;
                for (int k = j; k < blockEnd; k++) {
                    const INT64 u = Out[k];
                    const INT64 t = mod_mul(w, Out[k+block], P);
                    // keep both halves in [0, P) with one conditional subtraction each
                    Out[k] = u + t;
                    Out[k+block] = u - t + P;
                    if (Out[k] >= P)        Out[k] -= P;
                    if (Out[k+block] >= P)  Out[k+block] -= P;
                    w = mod_mul(w, wn[id], P);
                }
            }
        }
        if (on == 1) {
            // The inverse is the forward transform with indices 1..n-1
            // mirrored, scaled by n^-1 (computed as n^(P-2) mod P).
            for (int i = 1; i < n/2; i++)
                std::swap(Out[i], Out[n-i]);
            const INT64 invn = mod_pow(n, P-2, P);
            for (int i = 0; i < n; i++)
                Out[i] = mod_mul(Out[i], invn, P);
        }
    }

    /**
     * Circular cross-correlation modulo P:
     * c[k] = sum over j of a[(k + j) mod n] * b[j].
     *
     * @param a,b  inputs of length n with values in [0, P).
     * @param n    transform length, a power of two not larger than MAXN;
     *             nothing is done for n <= 0.
     * @param c    receives n results in [0, P).
     */
    void convolution(INT64 *a, INT64 *b, int n, INT64 *c) {
        if (n <= 0)
            return;
        NTT(0, a, d1, n);
        // index-reversing b turns the cyclic convolution into a correlation
        s[0] = b[0];
        for (int i = 1; i < n; ++i)
            s[i] = b[n-i];
        NTT(0, s, d2, n);
        for (int i = 0; i < n; i++)
            s[i] = mod_mul(d1[i], d2[i], P);
        NTT(1, s, c, n);
    }
} tool;
// Flattened, zero-padded copies of picture A (a) and picture B (b), and the
// correlation output (c); 262144 = 512 * 512 samples each.
INT64 a[262144], b[262144], c[262144];
// sum[i][j] is the inclusive 2-D prefix sum over rows 0..i and columns 0..j
// (main() fills it with the squared pixels of picture A).
long long sum[512][512];
/**
 * Total of the prefix-summed quantity inside the inclusive rectangle
 * (lx, ly)-(rx, ry), looked up from sum[][].
 */
inline long long getArea(int lx, int ly, int rx, int ry) {
    // rectangles touching the top and/or left edge need fewer corner terms
    if (lx == 0 && ly == 0)
        return sum[rx][ry];
    if (lx == 0)
        return sum[rx][ry] - sum[rx][ly-1];
    if (ly == 0)
        return sum[rx][ry] - sum[lx-1][ry];
    return sum[rx][ry] - sum[lx-1][ry] - sum[rx][ly-1] + sum[lx-1][ly-1];
}
/**
 * Buffered replacement for getchar(): returns the next byte of stdin, or EOF
 * once the stream is exhausted.
 *
 * Input is pulled in 1 MiB chunks with fread. EOF is sticky: every call made
 * after the end of input returns EOF again.
 */
inline int readchar() {
    const int N = 1048576;
    static char buf[N];
    static char *p = buf, *end = buf;
    if (p == end) {
        // Rewind both cursors before inspecting the byte count. Leaving p at
        // the old end while end moves back to buf would make "p == end" false
        // forever after EOF, and later calls would walk past the valid data.
        p = buf;
        end = buf + fread(buf, 1, N, stdin);
        if (end == buf)
            return EOF;
    }
    return *p++;
}
/**
 * Reads the next decimal integer, optionally preceded by '-', via readchar().
 *
 * Characters that are neither '-' nor a digit are skipped before the number.
 * The character that ends the digit run is consumed.
 *
 * @param x  receives the parsed value; untouched when no number is found.
 * @return   1 when a value was stored in *x, 0 when input ended first.
 */
inline int ReadInt(int *x) {
    // Keep the character in an int: EOF stored in a plain char cannot be
    // recognised on targets where char is unsigned.
    int c = readchar();
    while (c != EOF && c != '-' && (c < '0' || c > '9'))
        c = readchar();
    if (c == EOF)
        return 0;

    const bool negative = (c == '-');
    int value = negative ? 0 : c - '0';
    // Stop at the first non-digit; a bare ">= '0'" test would also accept
    // letters and any other byte above '9'.
    for (c = readchar(); c >= '0' && c <= '9'; c = readchar())
        value = value * 10 + (c - '0');
    *x = negative ? -value : value;
    return 1;
}
/**
 * For every test case on stdin (m n p q, then the m x n picture A and the
 * p x q picture B) prints the 1-based alignment (row, column) of B inside A
 * that minimises the sum of squared differences. The best score is also
 * written to stderr.
 *
 * The score used is sum(a^2 over the window) - 2 * sum(a*b); the sum(b^2)
 * term is the same for every alignment and is left out. Ties keep the first
 * position in row-major order.
 */
int main() {
    int m, n, p, q, x, N, M, S;
    while (ReadInt(&m)) {
        // A truncated header would leave the sizes unset; stop instead.
        if (!ReadInt(&n) || !ReadInt(&p) || !ReadInt(&q))
            break;
        N = std::max(m, p), M = std::max(n, q);
        // sum[][] is 512 x 512 and a[], b[], c[] hold 512 * 512 samples:
        // refuse sizes that would index outside those buffers.
        if (std::min(std::min(m, n), std::min(p, q)) < 1 || N > 512 || M > 512)
            break;
        // transform length: the smallest power of two covering N * M samples
        S = 1;
        while (S < N*M)
            S <<= 1;
        memset(a, 0, sizeof(a[0]) * S);
        memset(b, 0, sizeof(b[0]) * S);
        // Picture A, stored with row stride M, together with the 2-D prefix
        // sums of its squared pixels.
        for (int i = 0; i < m; i++) {
            long long rowSquares = 0;
            for (int j = 0; j < n; j++) {
                ReadInt(&x);
                a[i*M+j] = x;
                rowSquares += (long long)x * x;
                sum[i][j] = (i > 0 ? sum[i-1][j] : 0) + rowSquares;
            }
        }
        // Picture B uses the same row stride so the offsets line up with A.
        for (int i = 0; i < p; i++) {
            for (int j = 0; j < q; j++) {
                ReadInt(&x);
                b[i*M+j] = x;
            }
        }
        tool.convolution(a, b, S, c);
        int qx = m - p, qy = n - q, bX = 0, bY = 0;
        // LLONG_MAX rather than LONG_MAX: long is only 32 bits on LLP64
        // targets, which is smaller than the scores compared below.
        long long diff = LLONG_MAX;
        for (int i = 0; i <= qx; i++) {
            for (int j = 0; j <= qy; j++) {
                // the NTT result is exact, so no rounding is needed here
                long long v = getArea(i, j, i+p-1, j+q-1) - 2*c[i*M + j];
                if (v < diff)
                    diff = v, bX = i, bY = j;
            }
        }
        fprintf(stderr, "diff = %lld\n", diff);
        printf("%d %d\n", bX+1, bY+1);
    }
    return 0;
}

NTT/FNT CRT 加速

#include <bits/stdc++.h>
using namespace std;
typedef uint_fast32_t UINT32;
typedef long long INT64;
typedef uint_fast32_t INT32;
/**
 * Number-theoretic transform run modulo two 30-bit primes, with the two
 * residues merged by the Chinese Remainder Theorem. This keeps the inner
 * loops on plain 64-bit products instead of a 64-bit modular multiply.
 *
 * Transform lengths must be powers of two no larger than MAXN. Results are
 * exact as long as the true values are below MM = P1 * P2.
 */
class TOOL_NTT {
public:
#define MAXN 262144
    // Modulus and generator currently in use; switched by reset().
    INT32 P = 3, G = 2;
    // wn[i] = G^((P-1) / 2^i) mod P, the root for the pass with block size 2^i
    INT32 wn[20];
    // scratch buffers: s/d1/d2 for one prime, c1/c2 for the two residues
    INT32 s[MAXN], d1[MAXN], d2[MAXN], c1[MAXN], c2[MAXN];
    const INT32 P1 = 998244353;	// P1 = 2^23 * 7 * 17 + 1
    const INT32 G1 = 3;
    const INT32 P2 = 995622913; // P2 = 2^19 *3*3*211 + 1
    const INT32 G2 = 5;
    // CRT coefficients: M1 + M2 == MM + 1, which is consistent with
    // M1 == 1 (mod P1), M1 == 0 (mod P2) and the mirror image for M2.
    const INT64 M1 = 397550359381069386LL;
    const INT64 M2 = 596324591238590904LL;
    const INT64 MM = 993874950619660289LL; // MM = P1*P2

    TOOL_NTT() {
        reset(P, G);
    }

    /** Switches to modulus p with generator g and rebuilds the root table. */
    void reset(INT32 p, INT32 g) {
        P = p, G = g;
        for (int i = 0; i < 20; i++)
            wn[i] = mod_pow(G, (P-1) / (1<<i), P);
    }

    /**
     * Returns (a * b) mod `mod` for non-negative operands.
     *
     * The quotient is estimated in double precision and the remaining error
     * is removed by the final %. The products are formed in unsigned
     * arithmetic so that the intended wrap-around modulo 2^64 is well
     * defined instead of being signed overflow.
     */
    INT64 mod_mul(INT64 a, INT64 b, INT64 mod) {
        typedef unsigned long long U64;
        const long long quotient = (long long)((double)a*b/mod+0.5); // fast for P < 2^58
        const long long r = (long long)((U64)a * (U64)b - (U64)quotient * (U64)mod) % mod;
        return r < 0 ? r + mod : r;
    }

    /** Returns n^e mod m by binary exponentiation. */
    INT64 mod_pow(INT64 n, INT64 e, INT64 m) {
        INT64 x = 1;
        for (n = n >= m ? n%m : n; e; e >>= 1) {
            if (e&1)
                x = mod_mul(x, n, m);
            n = mod_mul(n, n, m);
        }
        return x;
    }

    /**
     * Returns the index of the lowest set bit of PowerOfTwo, i.e. log2 of a
     * power of two. Non-positive input yields 0 instead of looping forever.
     */
    int NumberOfBitsNeeded(int PowerOfTwo) {
        if (PowerOfTwo <= 0)
            return 0;
        int bits = 0;
        while (!(PowerOfTwo & (1 << bits)))
            ++bits;
        return bits;
    }

    /**
     * Reverses the lowest NumBits bits of a (a must fit in 32 bits).
     * NumBits == 0 returns 0 directly: where UINT32 is exactly 32 bits wide
     * the final shift by 32 would be undefined behaviour.
     */
    inline UINT32 FastReverseBits(UINT32 a, int NumBits) {
        if (NumBits <= 0)
            return 0;
        a = ((a & 0x55555555U) << 1) | ((a & 0xAAAAAAAAU) >> 1);
        a = ((a & 0x33333333U) << 2) | ((a & 0xCCCCCCCCU) >> 2);
        a = ((a & 0x0F0F0F0FU) << 4) | ((a & 0xF0F0F0F0U) >> 4);
        a = ((a & 0x00FF00FFU) << 8) | ((a & 0xFF00FF00U) >> 8);
        a = ((a & 0x0000FFFFU) << 16) | ((a & 0xFFFF0000U) >> 16);
        return a >> (32 - NumBits);
    }

    /**
     * Transforms In into Out (out of place) modulo the current P.
     *
     * @param on  1 for the inverse transform, anything else for the forward one.
     * @param n   transform length, a power of two not larger than MAXN.
     */
    void NTT(int on, INT32 *In, INT32 *Out, int n) {
        const int NumBits = NumberOfBitsNeeded(n);
        for (int i = 0; i < n; ++i)
            Out[FastReverseBits(i, NumBits)] = In[i];
        for (int h = 2, id = 1; h <= n; h <<= 1, id++) {
            const int block = h/2;
            for (int j = 0; j < n; j += h) {
                INT32 w = 1, u, t;
                const int blockEnd = j + block;
                for (int k = j; k < blockEnd; k++) {
                    u = Out[k], t = (INT64)w*Out[k+block]%P;
                    Out[k] = (u + t)%P;
                    Out[k+block] = (u - t + P)%P;
                    w = (INT64)w * wn[id]%P;
                }
            }
        }
        if (on == 1) {
            // The inverse is the forward transform with indices 1..n-1
            // mirrored, scaled by n^-1 (computed as n^(P-2) mod P).
            for (int i = 1; i < n/2; i++)
                std::swap(Out[i], Out[n-i]);
            const INT32 invn = mod_pow(n, P-2, P);
            for (int i = 0; i < n; i++)
                Out[i] = (INT64)Out[i]*invn%P;
        }
    }

    /** Combines residue a (mod P1) and residue b (mod P2) into the value mod MM. */
    INT64 crt(INT32 a, INT32 b) {
        return (mod_mul(a, M1, MM) + mod_mul(b, M2, MM))%MM;
    }

    /**
     * Circular cross-correlation, exact for results below MM:
     * c[k] = sum over j of a[(k + j) mod n] * b[j].
     *
     * The correlation is computed once modulo P1 and once modulo P2 and the
     * two residues are merged with crt(). P and G are left set to P2 / G2.
     *
     * @param a,b  inputs of length n.
     * @param n    transform length, a power of two not larger than MAXN;
     *             nothing is done for n <= 0.
     * @param c    receives n results.
     */
    void convolution(INT32 *a, INT32 *b, int n, INT64 *c) {
        if (n <= 0)
            return;
        correlate_mod(P1, G1, a, b, n, c1);
        correlate_mod(P2, G2, a, b, n, c2);
        for (int i = 0; i < n; i++)
            c[i] = crt(c1[i], c2[i]);
    }

private:
    /** Circular cross-correlation of a and b modulo p (generator g), written to out. */
    void correlate_mod(INT32 p, INT32 g, INT32 *a, INT32 *b, int n, INT32 *out) {
        reset(p, g);
        NTT(0, a, d1, n);
        // index-reversing b turns the cyclic convolution into a correlation
        s[0] = b[0];
        for (int i = 1; i < n; ++i)
            s[i] = b[n-i];
        NTT(0, s, d2, n);
        for (int i = 0; i < n; i++)
            s[i] = (INT64)d1[i] * d2[i]%P;
        NTT(1, s, out, n);
    }
} tool;
// Flattened, zero-padded copies of picture A (a) and picture B (b);
// 262144 = 512 * 512 samples each.
INT32 a[262144], b[262144];
// Correlation output written by tool.convolution().
INT64 c[262144];
// sum[i][j] is the inclusive 2-D prefix sum over rows 0..i and columns 0..j
// (main() fills it with the squared pixels of picture A).
long long sum[512][512];
/**
 * Inclusive rectangle sum over rows lx..rx and columns ly..ry, derived from
 * the prefix table sum[][].
 */
inline long long getArea(int lx, int ly, int rx, int ry) {
    long long acc = sum[rx][ry];
    if (lx != 0) {
        // drop the rows above the rectangle ...
        acc -= sum[lx-1][ry];
        // ... and give back the corner that the column step removes again
        if (ly != 0)
            acc += sum[lx-1][ly-1];
    }
    if (ly != 0)
        acc -= sum[rx][ly-1];
    return acc;
}
/**
 * Buffered replacement for getchar(): returns the next byte of stdin, or EOF
 * once the stream is exhausted.
 *
 * Input is pulled in 1 MiB chunks with fread. EOF is sticky: every call made
 * after the end of input returns EOF again.
 */
inline int readchar() {
    const int N = 1048576;
    static char buf[N];
    static char *p = buf, *end = buf;
    if (p == end) {
        // Rewind both cursors before inspecting the byte count. Leaving p at
        // the old end while end moves back to buf would make "p == end" false
        // forever after EOF, and later calls would walk past the valid data.
        p = buf;
        end = buf + fread(buf, 1, N, stdin);
        if (end == buf)
            return EOF;
    }
    return *p++;
}
/**
 * Reads the next decimal integer, optionally preceded by '-', via readchar().
 *
 * Characters that are neither '-' nor a digit are skipped before the number.
 * The character that ends the digit run is consumed.
 *
 * @param x  receives the parsed value; untouched when no number is found.
 * @return   1 when a value was stored in *x, 0 when input ended first.
 */
inline int ReadInt(int *x) {
    // Keep the character in an int: EOF stored in a plain char cannot be
    // recognised on targets where char is unsigned.
    int c = readchar();
    while (c != EOF && c != '-' && (c < '0' || c > '9'))
        c = readchar();
    if (c == EOF)
        return 0;

    const bool negative = (c == '-');
    int value = negative ? 0 : c - '0';
    // Stop at the first non-digit; a bare ">= '0'" test would also accept
    // letters and any other byte above '9'.
    for (c = readchar(); c >= '0' && c <= '9'; c = readchar())
        value = value * 10 + (c - '0');
    *x = negative ? -value : value;
    return 1;
}
/**
 * For every test case on stdin (m n p q, then the m x n picture A and the
 * p x q picture B) prints the 1-based alignment (row, column) of B inside A
 * that minimises the sum of squared differences. The best score is also
 * written to stderr.
 *
 * The score used is sum(a^2 over the window) - 2 * sum(a*b); the sum(b^2)
 * term is the same for every alignment and is left out. Ties keep the first
 * position in row-major order.
 */
int main() {
    int m, n, p, q, x, N, M, S;
    while (ReadInt(&m)) {
        // A truncated header would leave the sizes unset; stop instead.
        if (!ReadInt(&n) || !ReadInt(&p) || !ReadInt(&q))
            break;
        N = std::max(m, p), M = std::max(n, q);
        // sum[][] is 512 x 512 and a[], b[], c[] hold 512 * 512 samples:
        // refuse sizes that would index outside those buffers.
        if (std::min(std::min(m, n), std::min(p, q)) < 1 || N > 512 || M > 512)
            break;
        // transform length: the smallest power of two covering N * M samples
        S = 1;
        while (S < N*M)
            S <<= 1;
        memset(a, 0, sizeof(a[0]) * S);
        memset(b, 0, sizeof(b[0]) * S);
        // Picture A, stored with row stride M, together with the 2-D prefix
        // sums of its squared pixels.
        for (int i = 0; i < m; i++) {
            long long rowSquares = 0;
            for (int j = 0; j < n; j++) {
                ReadInt(&x);
                a[i*M+j] = x;
                rowSquares += (long long)x * x;
                sum[i][j] = (i > 0 ? sum[i-1][j] : 0) + rowSquares;
            }
        }
        // Picture B uses the same row stride so the offsets line up with A.
        for (int i = 0; i < p; i++) {
            for (int j = 0; j < q; j++) {
                ReadInt(&x);
                b[i*M+j] = x;
            }
        }
        tool.convolution(a, b, S, c);
        int qx = m - p, qy = n - q, bX = 0, bY = 0;
        // LLONG_MAX rather than LONG_MAX: long is only 32 bits on LLP64
        // targets, which is smaller than the scores compared below.
        long long diff = LLONG_MAX;
        for (int i = 0; i <= qx; i++) {
            for (int j = 0; j <= qy; j++) {
                // the CRT-combined result is exact, so no rounding is needed
                long long v = getArea(i, j, i+p-1, j+q-1) - 2*c[i*M + j];
                if (v < diff)
                    diff = v, bX = i, bY = j;
            }
        }
        fprintf(stderr, "diff = %lld\n", diff);
        printf("%d %d\n", bX+1, bY+1);
    }
    return 0;
}

Morris' Blog