#include <stdio.h>
#include <limits.h>
#include <stdbool.h>
#include <math.h>
#include <string.h>
#include <omp.h>
#define MAXN 262144
typedef struct {
float x, y;
} complex;
typedef unsigned int UINT32;
typedef struct {
int (* const NumberOfBitsNeeded) (int);
UINT32 (* const FastReverseBits) (UINT32, int);
void (* const FFT) (bool, complex[], complex[], int);
void (* const convolution) (double *, double *, int , double *);
} FFT_namespace;
static inline complex init_complex(double a, double b) {
complex t = {a, b};
return t;
}
static inline complex add_complex(complex *a, complex *b) {
return init_complex(a->x + b->x, a->y + b->y);
}
static inline complex sub_complex(complex *a, complex *b) {
return init_complex(a->x - b->x, a->y - b->y);
}
static inline complex mul_complex(complex *a, complex *b) {
return init_complex(a->x * b->x - a->y * b->y, a->x * b->y+ a->y * b->x);
}
static complex pre_theta[2][MAXN];
void prework(int n) {
static int pre_n = 0;
static const double PI = acos(-1);
if (pre_n == n)
return ;
pre_n = n;
pre_theta[0][0] = init_complex(1, 0);
pre_theta[1][0] = init_complex(1, 0);
#pragma omp parallel for
for (register int i = 1; i < n; i++) {
pre_theta[0][i] = init_complex(cos(2*i*PI / n ) , sin(2*i*PI / n ));
pre_theta[1][i] = init_complex(cos(2*i*PI / n ) , -sin(2*i*PI / n ));
}
}
int NumberOfBitsNeeded(int PowerOfTwo) {
for (int i = 0;; ++i) {
if (PowerOfTwo & (1 << i)) {
return i;
}
}
}
UINT32 FastReverseBits(UINT32 a, int NumBits) {
a = ( ( a & 0x55555555U ) << 1 ) | ( ( a & 0xAAAAAAAAU ) >> 1 ) ;
a = ( ( a & 0x33333333U ) << 2 ) | ( ( a & 0xCCCCCCCCU ) >> 2 ) ;
a = ( ( a & 0x0F0F0F0FU ) << 4 ) | ( ( a & 0xF0F0F0F0U ) >> 4 ) ;
a = ( ( a & 0x00FF00FFU ) << 8 ) | ( ( a & 0xFF00FF00U ) >> 8 ) ;
a = ( ( a & 0x0000FFFFU ) << 16 ) | ( ( a & 0xFFFF0000U ) >> 16 ) ;
return a >> (32 - NumBits);
}
void _FFT(bool InverseTransform, complex In[], complex Out[], int n) {
int NumSamples = n;
int NumBits = NumberOfBitsNeeded(NumSamples);
for (int i = 0; i < NumSamples; ++i)
Out[FastReverseBits(i, NumBits)] = In[i];
#pragma omp parallel
for (int i = 1; i <= NumBits; i++) {
int BlockSize = 1<<i, BlockEnd = BlockSize>>1, BlockCnt = NumSamples/BlockSize;
#pragma omp for
for (int j = 0; j < NumSamples; j += BlockSize) {
complex *t = pre_theta[InverseTransform];
for (int k = 0; k < BlockEnd; k++, t += BlockCnt) {
complex a = mul_complex(&(*t), &Out[k+j+BlockEnd]);
Out[k+j+BlockEnd] = sub_complex(&Out[k+j], &a);
Out[k+j] = add_complex(&Out[k+j], &a);
}
}
}
if (InverseTransform) {
#pragma omp parallel for
for (int i = 0; i < NumSamples; ++i) {
Out[i].x = Out[i].x / NumSamples;
}
}
}
void convolution(double *a, double *b, int n, double *c) {
static complex s[MAXN], d1[MAXN], d2[MAXN], y[MAXN];
prework(n);
#pragma omp parallel for
for (int i = 0; i < n; ++i)
s[i] = init_complex(a[i], 0);
_FFT(false, s, d1, n);
s[0] = init_complex(b[0], 0);
#pragma omp parallel for
for (int i = 1; i < n; ++i)
s[i] = init_complex(b[n - i], 0);
_FFT(false, s, d2, n);
#pragma omp parallel for
for (int i = 0; i < n; ++i)
y[i] = mul_complex(&d1[i], &d2[i]);
_FFT(true, y, s, n);
#pragma omp parallel for
for (int i = 0; i < n; ++i)
c[i] = s[i].x;
}
FFT_namespace const FFT = {NumberOfBitsNeeded, FastReverseBits, _FFT, convolution};
double a[MAXN], b[MAXN], c[MAXN];
long long sum[512][512];
long long getArea(int lx, int ly, int rx, int ry) {
long long ret = sum[rx][ry];
if (lx) ret -= sum[lx-1][ry];
if (ly) ret -= sum[rx][ly-1];
if (lx && ly) ret += sum[lx-1][ly-1];
return ret;
}
int main() {
omp_set_num_threads(5);
int m, n, p, q, x, N, M, S;
while (scanf("%d %d %d %d", &m, &n, &p, &q) == 4) {
N = m > p ? m : p, M = n > q ? n : q;
S = 1;
for (; S < N*M; S <<= 1);
memset(a, 0, sizeof(a[0]) * S);
memset(b, 0, sizeof(b[0]) * S);
for (int i = 0; i < m; i++) {
long long s = 0;
for (int j = 0; j < n; j++) {
scanf("%d", &x);
a[i*M+j] = x;
s += x*x;
sum[i][j] = (i > 0 ? sum[i-1][j] : 0) + s;
}
}
for (int i = 0; i < p; i++) {
for (int j = 0; j < q; j++) {
scanf("%d", &x);
b[i*M+j] = x;
}
}
FFT.convolution(a, b, S, c);
int qx = m - p, qy = n - q, bX = INT_MAX, bY = INT_MAX;
long long diff = LONG_MAX;
#pragma omp parallel
{
long long t_diff = LONG_MAX;
int t_bX = INT_MAX, t_bY = INT_MAX;
#pragma omp for
for (int i = 0; i <= qx; i++) {
for (int j = 0; j <= qy; j++) {
long long v = getArea(i, j, i+p-1, j+q-1) - 2*c[i*M + j] + 0.5;
if (v < t_diff)
t_diff = v, t_bX = i, t_bY = j;
}
}
#pragma omp critical
{
if (t_diff < diff)
diff = t_diff, bX = t_bX, bY = t_bY;
if (t_diff == diff && (t_bX < bX || (t_bX == bX && t_bY < bY))) {
bX = t_bX, bY = t_bY;
}
}
}
printf("%d %d\n", bX+1, bY+1);
}
return 0;
}