2016-04-30

批改娘 10025. Fast Image Match (OpenMP)

1. 背景
2. 題目描述
3. 輸入格式
4. 輸出格式
5. 範例輸入
6. 範例輸出
7. Solution
1. 7.1. 樸素平行
2. 7.2. FFT

背景

圖片匹配和字串匹配有一點不同，字串匹配通常要求其子字串與搜尋字串完全相符，而圖片匹配則用相似度為依據，當圖片大、複雜且具有干擾，或者需要匹配數量非常多，更先進的領域會利用特徵擷取，用機率統計的方式來篩選可能的匹配數量，篩選過後才進行圖片的細節匹配。

題目描述

給予兩個圖片 $A, B$，圖片格式為灰階影像，每個像素 $\mathit{pixel}(x, y)$ 採用 8-bits 表示，範圍為 $\mathit{pixel}(x, y) \in [0, 255]$。

舉一個例子，有一個 $3 \times 3$ 的圖片 $A$ 和一個 $2 \times 2$ 的圖片 $B$，用矩陣表示如下：

$$A := \begin{bmatrix} a1 & a2 & a3 \\ a4 & a5 & a6 \\ a7 & a8 & a9 \end{bmatrix} ,\; B := \begin{bmatrix} b1 & b2\\ b3 & b4\\ \end{bmatrix}$$

假設左上角座標 $(1, 1)$ 即 $a1$ 的位置、$(1, 2)$ 即 $a2$ 的位置。

若把影像 $B$ 左上角對齊 $A$ 的 $(1, 1)$ 位置，其差異程度 $\mathit{diff}(A, B) = (a1 - b1)^2 + (a2 - b2)^2 + (a4 - b3)^2 + (a5 - b4)^2$
相同地，對齊 $(2, 1)$ 位置，其差異程度 $\mathit{diff}(A, B) = (a4 - b1)^2 + (a5 - b2)^2 + (a7 - b3)^2 + (a8 - b4)^2$

比較時，整張 $B$ 都要落在 $A$ 中。現在要找到一個對齊位置 $(x, y)$，使得 $\mathit{diff}(A, B)$ 最小。

輸入格式

多組測資，每一組第一行有四個整數 $A_H, A_W, B_H, B_W$，分別表示圖片 $A$ 的高寬、$B$ 的高寬。

接下來會有 $A_H$ 行，每一行上會有 $A_W$ 個整數，第 $i$ 行上的第 $j$ 個整數 $x$ 表示 $A(i, j)$ 的灰階像素值。同樣地，接下來會有 $B_H$ 行，每一行上會有 $B_W$ 個整數，第 $i$ 行上的第 $j$ 個整數 $x$ 表示 $B(i, j)$ 的灰階像素值。

$ 1\le A_H, A_W, B_H, B_W \le 500$
$ B_H \le A_H, B_W \le A_W$
$ 0 \le x \le 255$

輸出格式

對於每一組測資輸出一行，兩個整數 $x, y$，滿足 $\mathit{diff}(A, B)$ 最小。若有相同情況滿足，則優先挑選 $x$ 最小，若 $x$ 仍相同，則挑選 $y$ 最小。

範例輸入

範例輸出

1
2

3 3
1 1

Solution

樸素平行

若圖片是 $N \times N$，樸素算法運作要 $\mathcal{O}(N^4)$，傅立葉算法要 $\mathcal{O}(N^2 \log N)$，這裡針對樸素算法平行。從最高層次的找解平行，每一個匹配位置都要耗費 $\mathcal{O}(M^2)$ 計算相似性，又由於平行限制跟核心數有關，那麼針對每一個 thread 要求找到好幾個 row 的解，分別存在各自的答案陣列區中，之後再線性跑一次取最小值，利用額外的空間儲存，避免在運算過程中遇到 critical section。

#pragma omp parallel for schedule(dynamic, chunk) shared(A, B)
for (int i = 0; i <= Ch; i++) {
    for (int j = 0; j <= Cw; j++) {
        long long d = 0;
        for (int p = 0; p < Bh; p++) {
            for (int q = 0; q < Bw; q++) {
                d += (long long) (B[p][q] - A[i+p][j+q])*(B[p][q] - A[i+p][j+q]);
            }                   
        }
        if (d < diff)
        {
            #pragma omp critical
            if (d < diff)
                diff = d, bx = i, by = j;
        }
    }
}

就是上列的寫法會卡在 critical section，倒不如存在另一個陣列中，效能還差距個幾百毫秒。

#include <stdio.h>
#include <limits.h>
#define MAXN 512
int A[MAXN][MAXN], B[MAXN][MAXN];
int C[MAXN][MAXN];
int main() {
    int Ah, Aw, Bh, Bw, x;
    int cases = 0;
    while(scanf("%d %d %d %d", &Ah, &Aw, &Bh, &Bw) == 4) {
        for (int i = 0; i < Ah; i++)
            for (int j = 0; j < Aw; j++)
                scanf("%d", &x), A[i][j] = x;
        for (int i = 0; i < Bh; i++)
            for (int j = 0; j < Bw; j++)
                scanf("%d", &x), B[i][j] = x;
        int Ch = Ah - Bh, Cw = Aw - Bw;
        int bx = -1, by = -1;
        int diff = INT_MAX;
        #pragma omp parallel for
        for (int i = 0; i <= Ch; i++) {
            for (int j = 0; j <= Cw; j++) {
                int d = 0;
                for (int p = 0; p < Bh; p++) {
                    for (int q = 0; q < Bw; q++) {
                        d += (B[p][q] - A[i+p][j+q])*(B[p][q] - A[i+p][j+q]);
                    }                    
                }
                C[i][j] = d;
            }
        }
        for (int i = 0; i <= Ch; i++) {
            for (int j = 0; j <= Cw; j++)
                if (C[i][j] < diff)
                    diff = C[i][j], bx = i, by = j;
        }
 
        printf("%d %d\n", bx + 1, by + 1);
    }
    return 0;
}

FFT

儘管使用快速傅立葉 (FFT) 已經達到 $\mathcal{O}(N \log N)$ 的時間複雜度的極致，只比平行樸素算法快個兩到三倍，加入平行之後，效能可以達到五到六倍加速。

FFT 用到大量的三角函數計算，一般而言若使用序列化的乘法代替三角函數計算 (疊加角度) 會比重新計算快上很多，但這樣會損失一大部分的精準度 (但速度快很多)，而且會產生很多相互依賴的指令，這也導致不容易對迴圈平行。

如果需要平行，每一個三角函數可以預先存在陣列中，每一個 thread 直接存取陣列中的元素即可，但麻煩的地方在於最好使用多核處理器，然而跨越多個處理器有可能會因為快取 … 等機制，導致速度下降，因此這裡採用單一處理器六核心 (只有三個是 physical core)，因此 thread 限定在五個以內。

#include <stdio.h>
#include <limits.h>
#include <stdbool.h>
#include <math.h>
#include <string.h>
#include <omp.h>
// FFT header
#define MAXN 262144
typedef struct {
    float x, y;
} complex;
typedef unsigned int UINT32;
typedef struct {
    int (* const NumberOfBitsNeeded) (int);
    UINT32 (* const FastReverseBits) (UINT32, int);
    void (* const FFT) (bool, complex[], complex[], int);
    void (* const convolution) (double *, double *, int , double *);
} FFT_namespace;
static inline complex init_complex(double a, double b) {
    complex t = {a, b};
    return t;
}
static inline complex add_complex(complex *a, complex *b) {
    return init_complex(a->x + b->x, a->y + b->y);
}
static inline complex sub_complex(complex *a, complex *b) {
    return init_complex(a->x - b->x, a->y - b->y);
}
static inline complex mul_complex(complex *a, complex *b) {
    return init_complex(a->x * b->x - a->y * b->y, a->x * b->y+ a->y * b->x);
}
static complex pre_theta[2][MAXN];
void prework(int n) {
    static int pre_n = 0;
    static const double PI = acos(-1);
    if (pre_n == n)
        return ;
    pre_n = n;
    pre_theta[0][0] = init_complex(1, 0);
    pre_theta[1][0] = init_complex(1, 0);
    #pragma omp parallel for
    for (register int i = 1; i < n; i++) {
        pre_theta[0][i] = init_complex(cos(2*i*PI / n ) , sin(2*i*PI / n ));
        pre_theta[1][i] = init_complex(cos(2*i*PI / n ) , -sin(2*i*PI / n ));
    }
}
// FFT body
int NumberOfBitsNeeded(int PowerOfTwo) {
    for (int i = 0;; ++i) {
        if (PowerOfTwo & (1 << i)) {
            return i;
        }
    }
}
UINT32 FastReverseBits(UINT32 a, int NumBits) {
    a = ( ( a & 0x55555555U ) << 1 ) | ( ( a & 0xAAAAAAAAU ) >> 1 ) ;
    a = ( ( a & 0x33333333U ) << 2 ) | ( ( a & 0xCCCCCCCCU ) >> 2 ) ;
    a = ( ( a & 0x0F0F0F0FU ) << 4 ) | ( ( a & 0xF0F0F0F0U ) >> 4 ) ;
    a = ( ( a & 0x00FF00FFU ) << 8 ) | ( ( a & 0xFF00FF00U ) >> 8 ) ;
    a = ( ( a & 0x0000FFFFU ) << 16 ) | ( ( a & 0xFFFF0000U ) >> 16 ) ;
    return a >> (32 - NumBits);
}
void _FFT(bool InverseTransform, complex In[], complex Out[], int n) {
    // simultaneous data copy and bit-reversal ordering into outputs
    int NumSamples = n;
    int NumBits = NumberOfBitsNeeded(NumSamples);
    for (int i = 0; i < NumSamples; ++i)
        Out[FastReverseBits(i, NumBits)] = In[i];
    // the FFT process
    #pragma omp parallel
    for (int i = 1; i <= NumBits; i++) {
        int BlockSize = 1<<i, BlockEnd = BlockSize>>1, BlockCnt = NumSamples/BlockSize;
        #pragma omp for
        for (int j = 0; j < NumSamples; j += BlockSize) {
            complex *t = pre_theta[InverseTransform];
            for (int k = 0; k < BlockEnd; k++, t += BlockCnt) {
                complex a = mul_complex(&(*t), &Out[k+j+BlockEnd]);
                Out[k+j+BlockEnd] = sub_complex(&Out[k+j], &a);
                Out[k+j] = add_complex(&Out[k+j], &a);
            }
        }
    }
    
    // normalize if inverse transform
    if (InverseTransform) {
        #pragma omp parallel for
        for (int i = 0; i < NumSamples; ++i) {
            Out[i].x = Out[i].x / NumSamples;
        }
    }
}
void convolution(double *a, double *b, int n, double *c) {
    static complex s[MAXN], d1[MAXN], d2[MAXN], y[MAXN];
    prework(n);
    #pragma omp parallel for
    for (int i = 0; i < n; ++i)
        s[i] = init_complex(a[i], 0);
    _FFT(false, s, d1, n);
    s[0] = init_complex(b[0], 0);
    #pragma omp parallel for
    for (int i = 1; i < n; ++i)
        s[i] = init_complex(b[n - i], 0);
    _FFT(false, s, d2, n);
    #pragma omp parallel for
    for (int i = 0; i < n; ++i)
        y[i] = mul_complex(&d1[i], &d2[i]);
    _FFT(true, y, s, n);
    #pragma omp parallel for
    for (int i = 0; i < n; ++i)
        c[i] = s[i].x;
}
FFT_namespace const FFT = {NumberOfBitsNeeded, FastReverseBits, _FFT, convolution};
// FFT end
double a[MAXN], b[MAXN], c[MAXN];
long long sum[512][512];
long long getArea(int lx, int ly, int rx, int ry) {
    long long ret = sum[rx][ry];
    if (lx) ret -= sum[lx-1][ry];
    if (ly) ret -= sum[rx][ly-1];
    if (lx && ly)   ret += sum[lx-1][ly-1];
    return ret;
}
int main() {
    omp_set_num_threads(5);
    int m, n, p, q, x, N, M, S;
    while (scanf("%d %d %d %d", &m, &n, &p, &q) == 4) {
        N = m > p ? m : p, M = n > q ? n : q;
        S = 1;
        for (; S < N*M; S <<= 1);
        memset(a, 0, sizeof(a[0]) * S);
        memset(b, 0, sizeof(b[0]) * S);
        for (int i = 0; i < m; i++) {
            long long s = 0;
            for (int j = 0; j < n; j++) {
                scanf("%d", &x);
                a[i*M+j] = x;
                s += x*x;
                sum[i][j] = (i > 0 ? sum[i-1][j] : 0) + s;          
            }
        }
        for (int i = 0; i < p; i++) {
            for (int j = 0; j < q; j++) {
                scanf("%d", &x);
                b[i*M+j] = x;
            }
        }        
        FFT.convolution(a, b, S, c);
        int qx = m - p, qy = n - q, bX = INT_MAX, bY = INT_MAX;
        long long diff = LONG_MAX;
        #pragma omp parallel
        {
            long long t_diff = LONG_MAX;
            int t_bX = INT_MAX, t_bY = INT_MAX;
            #pragma omp for
            for (int i = 0; i <= qx; i++)    {
                for (int j = 0; j <= qy; j++) {
                    long long v = getArea(i, j, i+p-1, j+q-1) - 2*c[i*M + j] + 0.5;
                    if (v < t_diff)
                        t_diff = v, t_bX = i, t_bY = j;
                }
            }
            #pragma omp critical
            {
                if (t_diff < diff)
                    diff = t_diff, bX = t_bX, bY = t_bY;
                if (t_diff == diff && (t_bX < bX || (t_bX == bX && t_bY < bY))) {
                    bX = t_bX, bY = t_bY;
                }
            }
        }
        printf("%d %d\n", bX+1, bY+1);
    }
    return 0;
}

Morris' Blog