2016-04-30

UVa 13009 - Fence the vegetables fail

Problem

給一個平行兩軸的簡單多邊形花圃以及一堆蔬菜的座標，請問有多少蔬菜在柵欄的外部，把那些在外部的蔬菜編號加總後輸出。

Sample Input

Sample Output

1
2
3

6
15
0

Solution

判斷一個點是否在簡單多邊形內部可以使用射線法，然而有很多點詢問，單筆詢問複雜度 $\mathcal{O}(N)$，這樣很明顯地會 TLE。

為了加速判斷，採用掃描線算法，依序放入平行 x 軸的線段，掃描過程中離線詢問某點往 y 軸負方向的射線交點個數為何，維護求交點個數可以利用 binary indexed tree 維護前綴和，前綴和相當於射線交點個數。就能在 $\mathcal{O}(N \log N)$ 完成所有詢問。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <algorithm>
#include <vector>
#include <map>
#include <set>
#include <math.h>
using namespace std;
struct Pt {
    int x, y, v;
    Pt(int a = 0, int b = 0):
    x(a), y(b) {}
    Pt operator-(const Pt &a) const {
        return Pt(x - a.x, y - a.y);
    }
    Pt operator+(const Pt &a) const {
        return Pt(x + a.x, y + a.y);
    }
    Pt operator*(const double a) const {
        return Pt(x * a, y * a);
    }
    bool operator<(const Pt &a) const {
        if (x != a.x)
            return x < a.x;
        if (y != a.y)
            return y < a.y;
        return false;
    }
};
struct Seg {
    int xl, xr, y;
    Seg(int a = 0, int b = 0, int c = 0):
    xl(a), xr(b), y(c) {}
    bool operator<(const Seg &u) const {
        return xl < u.xl;
    }
};
Pt P[131072], D[131072];
int BIT[262144];
void modify(int x, int val, int L) {
    while (x <= L)
        BIT[x] += val, x += x&(-x);
}
int query(int x) {
    int sum = 0;
    while (x)
        sum += BIT[x], x -= x&(-x);
    return sum;
}
void solve(int N, int M) {
    map<int, int> RX, RY;
    for (int i = 0; i < N; i++)
        RX[P[i].x] = RY[P[i].y] = 0;
    for (int i = 0; i < M; i++)
        RX[D[i].x] = RY[D[i].y] = 0;
    
    int label_x = 0, label_y = 0;
    
    for (auto &x : RX)
        x.second = ++label_x;
    for (auto &y : RY)
        y.second = ++label_y;
    
    memset(BIT, 0, sizeof(BIT));
    
    for (int i = 0; i < N; i++)
        P[i].x = RX[P[i].x], P[i].y = RY[P[i].y];
    for (int i = 0; i < M; i++)
        D[i].x = RX[D[i].x], D[i].y = RY[D[i].y];
    
    vector<Seg> segs;
    for (int i = 0; i < M; i++) {
        if (D[i].y == D[(i+1)%M].y) {
            int xl = min(D[i].x, D[(i+1)%M].x);
            int xr = max(D[i].x, D[(i+1)%M].x);
            segs.push_back(Seg(xl, xr, D[i].y));
        }
    }
    
    long long outer_val = 0;
    sort(P, P+N);
    sort(segs.begin(), segs.end());
    int Pidx = 0;
    set<std::pair<int, int>> PQ;
    for (int i = 0, line_x = 1; line_x <= label_x; line_x++) {
        while (Pidx < N && P[Pidx].x <= line_x) {
            int intersect = query(P[Pidx].y);
            if (intersect%2 == 0)
                outer_val += P[Pidx].v;
            Pidx++;
        }
        while (!PQ.empty() && PQ.begin()->first <= line_x)
            modify(PQ.begin()->second, -1, label_y), PQ.erase(PQ.begin());
        while (i < segs.size() && segs[i].xl == line_x) {
            modify(segs[i].y, 1, label_y);
            PQ.insert(make_pair(segs[i].xr, segs[i].y));
            i++;
        }
    }
    printf("%lld\n", outer_val);
}
int main() {
    int N, M;
    while (scanf("%d %d", &N, &M) == 2) {
        for (int i = 0; i < N; i++) {
            scanf("%d %d", &P[i].x, &P[i].y);
            P[i].v = i+1;
        }
        
        for (int i = 0; i < M; i++)
            scanf("%d %d", &D[i].x, &D[i].y);
        
        solve(N, M);
    }
    return 0;
}

Read More +

2016-04-30

解題區/解題區 - UVa

UVa 13046 - Bus Collisions

Problem

給予兩個公車行駛路線，每台公車的速度都相同，分別從起始位置出發，請問第一個碰撞位置為何？

Sample Input

Sample Output

1 2	Case 1: 1 5 Case 2: No Collision

Solution

抓路線長度的最小公倍數，一定能在這段時間內找到碰撞位置，否則不存在碰撞地點。

#include <bits/stdc++.h>
using namespace std;
struct Pt {
    int x, y;
    Pt(int x = 0, int y = 0): x(x), y(y) {
    }
    int dist(Pt u) {
        return abs(x - u.x) + abs(y - u.y);
    }
    bool operator==(const Pt &u) const {
        return x == u.x && y == u.y;
    }
    Pt operator-(const Pt &u) const {
        return Pt(x - u.x, y - u.y);
    }
    Pt operator+(const Pt &u) const {
        return Pt(x + u.x, y + u.y);
    }
};
int main() {
    int testcase, cases = 0;
    int N[2], x, y;
    Pt R[2][64];
    scanf("%d", &testcase);
    while (testcase--) {
        int Len[2] = {};
        for (int i = 0; i < 2; i++) {
            scanf("%d", &N[i]);
            for (int j = 0; j < N[i]; j++) {
                scanf("%d %d", &x, &y);
                R[i][j] = Pt(x, y);
            }
            for (int j = 0; j < N[i]; j++)
                Len[i] += R[i][j].dist(R[i][(j+1)%N[i]]);
        }
        
        printf("Case %d: ", ++cases);
        
        int simT = Len[0]/__gcd(Len[0], Len[1])*Len[1];
        int posIdx[2], has = 0;
        Pt posXY[2];
        for (int i = 0; i < 2; i++)
            posIdx[i] = 0, posXY[i] = R[i][0];
            
        for (int it = 0; it < simT; it++) {
            for (int i = 0; i < 2; i++) {
                // move
                if (posXY[i] == R[i][posIdx[i]])
                    posIdx[i] = (posIdx[i]+1)%N[i];
                Pt dv = R[i][posIdx[i]] - posXY[i];
                int lenDv = abs(dv.x) + abs(dv.y);
                dv.x /= lenDv, dv.y /= lenDv;
                posXY[i] = posXY[i] + dv;
            }
            if (posXY[0] == posXY[1]) {
                printf("%d %d\n", posXY[0].x, posXY[0].y);
                has = 1;
                break;
            }
        }
        if (!has)
            puts("No Collision");
    }
    return 0;
}

Read More +

2016-04-30

解題區/解題區 - UVa

UVa 1718 - Tile Cutting

Problem

請問在網格中，若頂點設置在網格邊上，構成平行四邊形的面積介於 $[l, r]$，最多的方法數和面積分別為多少。

Sample Input

1
2
3

2
4 4
2 6

Sample Output

1
2

48
620

Solution

由於是平行四邊形，假設矩形被劃分的兩個對稱三角形的底高 $a, b, c, d$，且 $a, b, c, d > 0$。

得到平行四邊形面積為

$$\begin{align*} \text{Area} &= (a + c)(b + d) - 2 \times 0.5 \times a b - 2 \times 0.5 \times c d \\ &= ad + bc \end{align*}$$

最後找到構成方法數為 ways of area = #(a, b, c, d) sat. ad + bc = Area。分別統計兩數相乘為 $x$ 的方法數 $W_x$，可以構成多項式 $W_1 x + W_2 x^2 + \cdots W_n x^n$，多項式相乘後，得到的係數就是所有的方法數。套用 FFT 的卷積計算即可。

FFT

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <vector>
#include <assert.h>
#include <algorithm>
#include <complex>
using namespace std;
struct Complex {
    double x, y;
    Complex(double x = 0, double y = 0): x(x), y(y) {}
    Complex operator-(const Complex &v) const {
        return Complex(x-v.x,y-v.y);
    }
    Complex operator+(const Complex &v) const {
        return Complex(x+v.x,y+v.y);
    }
    Complex operator*(const Complex &v) const {
        return Complex(x*v.x-y*v.y,x*v.y+y*v.x);
    }
    Complex operator/(const int &v) const {
        return Complex(x/v,y/v);
    }
    double real() {
        return x;
    }
};
template<typename T> class TOOL_FFT {
public:
typedef unsigned int UINT32;
#define MAXN (1048576<<1)
    Complex p[2][MAXN];
    int pre_n;
    T PI;
    TOOL_FFT() {
        pre_n = 0;
        PI = acos(-1);
    }
    int NumberOfBitsNeeded(int PowerOfTwo) {
        for (int i = 0; ; ++i) {
            if (PowerOfTwo & (1 << i)) {
                return i;
            }
        }
    }
    inline UINT32 FastReverseBits(UINT32 a, int NumBits) {
        a = ( ( a & 0x55555555U ) << 1 ) | ( ( a & 0xAAAAAAAAU ) >> 1 ) ;
        a = ( ( a & 0x33333333U ) << 2 ) | ( ( a & 0xCCCCCCCCU ) >> 2 ) ;
        a = ( ( a & 0x0F0F0F0FU ) << 4 ) | ( ( a & 0xF0F0F0F0U ) >> 4 ) ;
        a = ( ( a & 0x00FF00FFU ) << 8 ) | ( ( a & 0xFF00FF00U ) >> 8 ) ;
        a = ( ( a & 0x0000FFFFU ) << 16 ) | ( ( a & 0xFFFF0000U ) >> 16 ) ;
        return a >> (32 - NumBits);
    }
    
    void FFT(bool InverseTransform, vector<Complex>& In, vector<Complex>& Out) {
        // simultaneous data copy and bit-reversal ordering into outputs
        int NumSamples = In.size();
        int NumBits = NumberOfBitsNeeded(NumSamples);
        for (int i = 0; i < NumSamples; ++i) {
            Out[FastReverseBits(i, NumBits)] = In[i];
        }
        // the FFT process
        for (register int i = 1; i <= NumBits; i++) {
            int BlockSize = 1<<i, BlockEnd = BlockSize>>1, BlockCnt = NumSamples/BlockSize;
            for (register int j = 0; j < NumSamples; j += BlockSize) {
                Complex *t = p[InverseTransform];
                for (register int k = 0; k < BlockEnd; k++, t += BlockCnt) {
                    Complex a = (*t) * Out[k+j+BlockEnd];
                    Out[k+j+BlockEnd] = Out[k+j] - a;
                    Out[k+j] = Out[k+j] + a;
                }
            }
        }
        // normalize if inverse transform
        if (InverseTransform) {
            for (int i = 0; i < NumSamples; ++i) {
                Out[i] = Out[i] / NumSamples;
            }
        }
    }
    void prework(int n) {
        if (pre_n == n)
            return ;
        pre_n = n;
        p[0][0] = Complex(1, 0);
        p[1][0] = Complex(1, 0);
        for (register int i = 1; i < n; i++) {
            p[0][i] = Complex(cos(2*i*PI / n ) , sin(2*i*PI / n ));
            p[1][i] = Complex(cos(2*i*PI / n ) , -sin(2*i*PI / n ));
        }
    }
    vector<T> convolution(Complex *a, Complex *b, int n) {
        prework(n);
        vector<Complex> s(a, a+n), d1(n), d2(n), y(n);
        vector<T> ret(n);
        FFT(false, s, d1);
        s[0] = b[0];
        for (int i = 1, j = n-1; i < n; ++i, --j)
            s[i] = b[j];
        FFT(false, s, d2);
        for (int i = 0; i < n; ++i) {
            y[i] = d1[i] * d2[i];
        }
        FFT(true, y, s);
        for (int i = 0; i < n; ++i) {
            ret[i] = s[i].real();
        }
        return ret;
    }
};
TOOL_FFT<double> tool;
Complex a[MAXN], b[MAXN];
vector<double> c;
long long ret[1048576];
long long pr[1048576] = {};
int main() {
    int m;
    const int N = 500000;
    for (m = 1; m < (N<<1); m <<= 1);
    /*
		(a + c)(b + d) - 2 * 0.5 * (a b) - 2 * 0.5 * (c * d)
		= ad + bc = area
		a, b, c, d > 0,
		ways of area = #(a, b, c, d) sat. ad + bc = area
	*/
    for (int i = 1; i <= N; i++) {
    	for (int j = i; j <= N; j += i)
    		pr[j]++;
    }	
    
    memset(a, 0, sizeof(a[0]) * m);
    memset(b, 0, sizeof(b[0]) * m);
    
    for (int i = 1; i <= N; i++) {
        a[i] = Complex(pr[i], 0);
        b[m-i] = Complex(pr[i], 0);
    }	
    
    c = tool.convolution(a, b, m);
    
    for (int i = 1; i <= N; i++)
        ret[i] = (long long) (c[i] + 0.5);
        
    int testcase, L, R;
    while (scanf("%d", &testcase) == 1) {
        while (testcase--)	 {
            scanf("%d %d", &L, &R);
            assert(L <= R);
            assert(L >= 1 && R <= 500000);
            int mxA = L;
            for (int i = L; i <= R; i++) {
                if (ret[i] > ret[mxA])
                    mxA = i;
            }
            printf("%d %lld\n", mxA, ret[mxA]);
        }
    }
    return 0;
}

NTT

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <vector>
#include <assert.h>
#include <algorithm>
#include <complex>
using namespace std;
typedef unsigned int UINT32;
typedef long long INT64;
class TOOL_NTT {
public:
#define MAXN (1048576<<1)
    const INT64 P = (479 << 21) + 1LL; // prime m = kn+1
    const INT64 G = 3;
    INT64 wn[22];
    INT64 s[MAXN], d1[MAXN], d2[MAXN], y[MAXN];
    TOOL_NTT() {
        for (int i = 0; i < 22; i++)
            wn[i] = mod_pow(G, (P-1) / (1<<i), P);
    }
    INT64 mod_mul(INT64 a, INT64 b, INT64 mod) {
        return (a*b - (long long)(a/(long double)mod*b+1e-3)*mod+mod)%mod;
//              INT64 ret = 0;
//              for (a = a >= mod ? a%mod : a, b = b >= mod ? b%mod : b; b != 0; b>>=1, a <<= 1, a = a >= mod ? a - mod : a) {
//                  if (b&1) {
//                      ret += a;
//                      if (ret >= mod)
//                          ret -= mod;
//                  }
//              }
//              return ret;
    }
    INT64 mod_pow(INT64 n, INT64 e, INT64 m) {
        INT64 x = 1;
        for (n = n >= m ? n%m : n; e; e >>= 1) {
            if (e&1)
                x = mod_mul(x, n, m);
            n = mod_mul(n, n, m);
        }
        return x;
    }
    int NumberOfBitsNeeded(int PowerOfTwo) {
        for (int i = 0;; ++i) {
            if (PowerOfTwo & (1 << i)) {
                return i;
            }
        }
    }
    inline UINT32 FastReverseBits(UINT32 a, int NumBits) {
        a = ( ( a & 0x55555555U ) << 1 ) | ( ( a & 0xAAAAAAAAU ) >> 1 ) ;
        a = ( ( a & 0x33333333U ) << 2 ) | ( ( a & 0xCCCCCCCCU ) >> 2 ) ;
        a = ( ( a & 0x0F0F0F0FU ) << 4 ) | ( ( a & 0xF0F0F0F0U ) >> 4 ) ;
        a = ( ( a & 0x00FF00FFU ) << 8 ) | ( ( a & 0xFF00FF00U ) >> 8 ) ;
        a = ( ( a & 0x0000FFFFU ) << 16 ) | ( ( a & 0xFFFF0000U ) >> 16 ) ;
        return a >> (32 - NumBits);
    }
    void NTT(int on, INT64 *In, INT64 *Out, int n) {
        int NumBits = NumberOfBitsNeeded(n);
        for (int i = 0; i < n; ++i)
            Out[FastReverseBits(i, NumBits)] = In[i];
        for(int h = 2, id = 1; h <= n; h <<= 1, id++) {
            for(int j = 0; j < n; j += h) {
                INT64 w = 1, u, t;
                int block = h/2, blockEnd = j + h/2;
                for(int k = j; k < blockEnd; k++) {
                    u = Out[k], t = mod_mul(w, Out[k+block], P);
                    Out[k] = u + t;
                    Out[k + block] = u - t + P;
                    if (Out[k] >= P)        Out[k] -= P;
                    if (Out[k+block] >= P)  Out[k+block] -= P;
                    w = mod_mul(w, wn[id], P);
                }
            }
        }
        if (on == 1) {
            for (int i = 1; i < n/2; i++)
                swap(Out[i], Out[n-i]);
            INT64 invn = mod_pow(n, P-2, P);
            for (int i = 0; i < n; i++)
                Out[i] = mod_mul(Out[i], invn, P);
        }
    }
    void convolution(INT64 *a, INT64 *b, int n, INT64 *c) {
        NTT(0, a, d1, n);
        s[0] = b[0];
        for (int i = 1; i < n; ++i)
            s[i] = b[n-i];
        NTT(0, s, d2, n);
        for (int i = 0; i < n; i++)
            s[i] = mod_mul(d1[i], d2[i], P);
        NTT(1, s, c, n);
    }
} tool;
INT64 a[MAXN], b[MAXN], c[MAXN];
long long ret[1048576];
long long pr[1048576] = {};
int main() {
    int m;
    const int N = 500000;
    for (m = 1; m < (N<<1); m <<= 1);
    /*
		(a + c)(b + d) - 2 * 0.5 * (a b) - 2 * 0.5 * (c * d)
		= ad + bc = area
		a, b, c, d > 0,
		ways of area = #(a, b, c, d) sat. ad + bc = area
	*/
    for (int i = 1; i <= N; i++) {
    	for (int j = i; j <= N; j += i)
    		pr[j]++;
    }	
    
    memset(a, 0, sizeof(a[0]) * m);
    memset(b, 0, sizeof(b[0]) * m);
    
    for (int i = 1; i <= N; i++) {
        a[i] = pr[i];
        b[m-i] = pr[i];
    }	
    
    tool.convolution(a, b, m, c);
    
    for (int i = 1; i <= N; i++)
        ret[i] = c[i];
        
    int testcase, L, R;
    while (scanf("%d", &testcase) == 1) {
        while (testcase--)	 {
            scanf("%d %d", &L, &R);
            assert(L <= R);
            assert(L >= 1 && R <= 500000);
            int mxA = L;
            for (int i = L; i <= R; i++) {
                if (ret[i] > ret[mxA])
                    mxA = i;
            }
            printf("%d %lld\n", mxA, ret[mxA]);
        }
    }
    return 0;
}

Read More +

2016-04-30

解題區/解題區 - UVa

UVa 801 - Flight Planning

Problem

給予飛機在平流層飛行資訊，在每一種高度下維持飛行的耗油量成線性關係，以及每一段飛行旅程上的平流層的風速為何，藉由線性內插得到在某高度下的飛行速度，忽略在轉換高度時的耗油量，問從起點到終點的最少耗油量為何 (無條件進位)。

Sample Input

2
2
1500 -50 50
1000 0 0
3
1000 50 0
2000 0 20
1800 50 100

Sample Output

1 2	Flight 1: 35 30 13986 Flight 2: 20 30 30 23502

Solution

單源最短路徑，只是每一點要使用 $20$ 個狀態維護抵達那一層時在哪一種高度下飛行。必須窮舉轉移到下一層的所有高度，因為有可能飛行慢而增加耗油時間。

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <algorithm>
#include <vector>
using namespace std;
struct Leg {
    double mile, f20, f40;
} legs[1024];
const double VCRUISE = 400;
const double AOPT = 30000;
const double GPHOPT = 2000;
const double GPHEXTRA = 10;
const double CLIMBCOST = 50;
const int MAXN = 1024;	// maximum #legs
const int MAXH = 45;	// between 20,000 and 40,000 feet
const double DINF = 1e+20;
double inter(Leg a, double h) {
    return a.f20 + (a.f40 - a.f20) * (h - 20000) / (40000 - 20000);
}
void solve(int n) {
    double dp[MAXN][MAXH];
    int from[MAXN][MAXH];
    for (int i = 0; i <= n; i++) {
        for (int j = 0; j <= 40; j++)
            dp[i][j] = DINF;
    }
    
    dp[0][0] = 0;	// leg 0: altitude 0
    for (int i = 0; i < n; i++) {
        for (int j = 0; j <= 40; j++) {
            if (dp[i][j] >= DINF)
                continue;
            double alti_a = j * 1000.f;
            for (int k = 20; k <= 40; k++) {
                double cc = 0, alti_b;
                alti_b = k * 1000.f;
                
                if (alti_b > alti_a)
                    cc += CLIMBCOST * (alti_b - alti_a) / 1000.f;
                double v = VCRUISE + inter(legs[i+1], alti_b);
                if (v <= 0)
                	continue;
                double c = fabs(alti_b - AOPT) * GPHEXTRA / 1000.f + GPHOPT;
                cc += c * legs[i+1].mile / v;
                if (dp[i][j] + cc < dp[i+1][k])
                    dp[i+1][k] = dp[i][j] + cc, from[i+1][k] = j;
            }
        }
    }
    
    double ret = DINF;
    int last = -1;
    vector<int> solution;
    for (int i = 20; i <= 40; i++) {
        if (dp[n][i] < ret)
            ret = dp[n][i], last = i;
    }
    for (int i = n; i > 0; i--) {
        solution.push_back(last);
        last = from[i][last];
    }
    for (int i = n-1; i >= 0; i--)
        printf(" %d", solution[i]);
    // tricky 
    printf(" %.0lf\n", ceil(ret));
}
int main() {
    int testcase, cases = 0;
    int n;
    scanf("%d", &testcase);
    while (testcase--) {
        scanf("%d", &n);
        for (int i = 1; i <= n; i++) {
            scanf("%lf %lf %lf",
                  &legs[i].mile, &legs[i].f20, &legs[i].f40);
        }
        printf("Flight %d:", ++cases);
        solve(n);
    }
    return 0;
}

Read More +

2016-04-30

學校課程/平行程式

批改娘 10082. Fast Find Prime Number (OpenMP)

題目描述

找出 $[ L, \; R ]$ 區間內有多少個質數。

輸入格式

有多組測資，每組測資一行兩個整數 $L$, $R$，表示左右區間。

$1 \le L \le R \le 2^{31}-1$

輸出格式

對於每一組詢問輸出一行一個整數。

輸入範例

1 2	1000000 2000000 1 1000000000

輸出範例

1 2	70435 50847534

提示

Sieve of Eratosthenes

Solution

這一題沒有出得很好，後來想到可以作弊打表，目前仍沒有同學挑戰成功，略感傷心。

線性篩法

線性篩法 Accepted (1724 ms, 2816 KB)

由於只要統計 $2^{31}$ 以內的質數個數，只需要把 50000 以內的所有質數找到，利用這不到 5000 的質數進行區塊篩法即可。

由於 $[1, 2^{31}]$ 也是非常龐大的區間，可以利用 bitmask 的技術，將記憶體壓縮 32 倍，但這樣子平行度還是不高，因此若詢問區間 $[l, r]$ 有多少個質數，將區間切割成數個足夠大的 chunk，這裡決定 chunk 大約在 100000 到 1000000 都有不錯的效果。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <assert.h>
#include <omp.h>
 
#define MAXL (50000>>5)+1
#define BLOCK (1<<17)
#define MAXN (BLOCK>>5)+1
#define GET1(x) (mark1[(x)>>5]>>((x)&31)&1)
#define SET1(x) (mark1[(x)>>5] |= 1<<((x)&31))
#define GET2(x) (mark2[(x)>>5]>>((x)&31)&1)
#define SET2(x) (mark2[(x)>>5] |= 1<<((x)&31))
int mark1[MAXL];
int P[5500], Pt = 0;
void sieve() {
    register int i, j, k;
    SET1(1);
    int n = 50000;
    for (i = 2; i <= n; i++) {
        if (!GET1(i)) {
            for (k = n/i, j = i*k; k >= i; k--, j -= i)
                SET1(j);
            P[Pt++] = i;
        }
    }
}
 
int local_sieve(int a, int b) {
    int sqr = (int) sqrt(b), gap = b - a;
    int mark2[MAXN] = {};
 
    for (int i = 0; i < Pt && P[i] <= sqr; i++) {
        unsigned p = P[i], base = a / p * p;
        while (base < a)    base += p;
        if (base == p)    base += p;
        for (unsigned j = base; j <= b; j += p)
            SET2(j - a);
    }
    if (a == 1)    SET2(0);
    int ret = gap + 1;
    for (int i = gap>>5; i >= 0; i--)
        ret -= __builtin_popcount(mark2[i]);
    return ret;
}
long long min(long long x, long long y) {
    return x < y ? x : y;
}
int main() {
    sieve();
    int l, r;
    while (scanf("%d %d", &l, &r) == 2) {
        int ret = 0;
        #pragma omp parallel for schedule(dynamic) reduction(+: ret)
        for (long long i = l; i <= r; i += BLOCK)
            ret += local_sieve(i, min(i+BLOCK-1, r));
        printf("%d\n", ret);
    }
    return 0;
}

作弊打表

由於問題設計只針對個數，還可以偷偷本地建表完成每一個區間的值，針對非完全包含的區間單獨計算即可。打表部分可以藉由上述的程序做到。

作弊打表 Accepted (622 ms, 384 KB)

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <assert.h>
#include <omp.h>
 
#define MAXL (50000>>5)+1
#define BLOCK (1<<20)
#define MAXN (BLOCK>>5)+1
#define GET1(x) (mark1[(x)>>5]>>((x)&31)&1)
#define SET1(x) (mark1[(x)>>5] |= 1<<((x)&31))
#define GET2(x) (mark2[(x)>>5]>>((x)&31)&1)
#define SET2(x) (mark2[(x)>>5] |= 1<<((x)&31))
int preprocess[] = {82025,73586,70938,69398,68248,67307, <...>};
int mark1[MAXL];
int P[5500], Pt = 0;
void sieve() {
    register int i, j, k;
    SET1(1);
    int n = 50000;
    for (i = 2; i <= n; i++) {
        if (!GET1(i)) {
            for (k = n/i, j = i*k; k >= i; k--, j -= i)
                SET1(j);
            P[Pt++] = i;
        }
    }
}
 
int local_sieve(int a, int b) {
    int sqr = (int) sqrt(b), gap = b - a;
    int mark2[MAXN] = {};
 
    for (int i = 0; i < Pt && P[i] <= sqr; i++) {
        unsigned p = P[i], base = a / p * p;
        while (base < a)    base += p;
        if (base == p)    base += p;
        for (unsigned j = base; j <= b; j += p)
            SET2(j - a);
    }
    if (a == 1)    SET2(0);
    int ret = gap + 1;
    for (int i = gap>>5; i >= 0; i--)
        ret -= __builtin_popcount(mark2[i]);
    return ret;
}
 
long long min(long long x, long long y) {
    return x < y ? x : y;
}
long long max(long long x, long long y) {
    return x > y ? x : y;
}
int main() {
    sieve();
 
    int prepro_n = sizeof(preprocess) / sizeof(int);
    int l, r;
    while (scanf("%d %d", &l, &r) == 2) {
        int ret = 0; 
        for (int i = 0; i < prepro_n; i++) {
            long long tl = (long long) i * BLOCK + 1;
            long long tr = (long long) (i+1) * BLOCK;
            if (l <= tl && tr <= r) {
                ret += preprocess[i];
            } else {
                if (tl <= l && l <= tr) {
                    ret += local_sieve(l, min(r, tr));
                } else if (tl <= r && r <= tr) {
                    ret += local_sieve(max(tl, l), r);
                }
            }
        }
 
        printf("%d\n", ret);
    }
    return 0;
}

Read More +

2016-04-30

學校課程/平行程式

批改娘 10081. Fast Game of Life

題目描述

生命遊戲中，對於任意細胞，規則如下：
每個細胞有兩種狀態-存活或死亡，每個細胞與以自身為中心的周圍八格細胞產生互動。

當前細胞為存活狀態時，當周圍低於 2 個 (不包含 2 個) 存活細胞時，該細胞變成死亡狀態。
當前細胞為存活狀態時，當周圍有 2 個或 3 個存活細胞時，該細胞保持原樣。
當前細胞為存活狀態時，當周圍有 3 個以上的存活細胞時，該細胞變成死亡狀態。
當前細胞為死亡狀態時，當周圍有 3 個存活細胞時，該細胞變成存活狀態。

可以把最初的細胞結構定義為種子，當所有在種子中的細胞同時被以上規則處理後，可以得到第一代細胞圖。按規則繼續處理當前的細胞圖，可以得到下一代的細胞圖，周而復始。

輸入格式

輸入第一行有兩個整數 $N$, $M$，表示盤面大小為 $N \times N$，模擬週期次數 $M$。接下來會有 $N$ 行，每一行上會有 $N$ 個字符，以 0 表示 $(i, j)$ 格子上的細胞屬於死亡狀態，反之 1 為存活狀態。

$1 \le N \le 2000$
$0 \le M \le 1000$

輸出格式

對於每一組測資輸出 $N$ 行，每一行上有 $N$ 個字元表示模擬 $M$ 次的最終盤面結果。

範例輸入 1

範例輸出 1

範例輸入 2

範例輸出 2

Solution

在還沒有做任何平行前，生命遊戲的模擬非常簡單，只需要統計鄰近八格的狀態即可，然而由於是常數範圍，直接打八個加法比起迴圈去累計八格的結果快上非常多，別忽視 for loop 在做 branch 的 cycle 數量。

樸素平行 Accepted (378 ms, 24 MB)
循環展開 Accepted (289 ms, 39 MB)

平行採用 big chunk 的方式，考慮到 cache miss 的關係，最好是每一個處理器都把相鄰的列放置在各自的 L1-L2-L3 cache，防止使用時不在自己的 cache，而產生嚴重的 cache miss。

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <assert.h>
#include <omp.h>
 
#define UINT unsigned long
#define MAXN 2048
char g[2][MAXN][MAXN];
int n, m;
int main() {
    while (scanf("%d %d", &n, &m) == 2) {
        while (getchar() != '\n');
 
        // read input
        for (int i = 1; i <= n; i++) {
            fgets(g[0][i]+1, MAXN, stdin);
            for (int j = 1; j <= n; j++)
                g[0][i][j] -= '0';
            g[0][i][n+1] = 0;
        }
 
        // game of life
        int flag = 0;
        int chunk = 64;
        for (int it = 0; it < m; it++) {
            #pragma omp parallel for schedule(static, chunk)
            for (int i = 1; i <= n; i++) {
                for (int j = 1; j <= n; j++) {
                    int adj = g[flag][i-1][j-1] + g[flag][i-1][j] + g[flag][i-1][j+1] +
                        g[flag][i  ][j-1] + g[flag][i  ][j+1] +
                        g[flag][i+1][j-1] + g[flag][i+1][j] + g[flag][i+1][j+1];
                    g[!flag][i][j] = (g[flag][i][j] == 0 && adj == 3) || 
                                        (g[flag][i][j] == 1 && (adj == 2 || adj == 3));
                }
            }
            flag = !flag;
        }
 
        // print result
        for (int i = 1; i <= n; i++) {
            for (int j = 1; j <= n; j++)
                g[flag][i][j] += '0';
            g[flag][i][n+1] = '\0';
            puts(g[flag][i]+1);
        }
    }
    return 0;
}

循環展開

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <assert.h>
#include <omp.h>
 
#define UINT unsigned long
#define MAXN 2048
char g[2][MAXN][MAXN];
int n, m;
int main() {
    while (scanf("%d %d", &n, &m) == 2) {
        while (getchar() != '\n');
 
        // read input
        for (int i = 1; i <= n; i++) {
            fgets(g[0][i]+1, MAXN, stdin);
            for (int j = 1; j <= n; j++)
                g[0][i][j] -= '0';
            g[0][i][n+1] = 0;
        }
 
        // game of life
#ifdef _OPENMP
        int chunk = n / omp_get_max_threads();
#endif
        for (int it = 0; it < m; it++) {
            #pragma omp parallel for schedule(static, chunk)
            for (int i = 1; i <= n; i++) {
                int adj, j = 1;
#define UNLOOP { \
    adj = g[0][i-1][j-1] + g[0][i-1][j] + g[0][i-1][j+1] + \
        g[0][i  ][j-1] + g[0][i  ][j+1] + \
        g[0][i+1][j-1] + g[0][i+1][j] + g[0][i+1][j+1]; \
    g[1][i][j] = (g[0][i][j] == 0 && adj == 3) || \
                    (g[0][i][j] == 1 && (adj == 2 || adj == 3)); j++; \
    }
#define UNLOOP4 {UNLOOP UNLOOP UNLOOP UNLOOP}
#define UNLOOP8 {UNLOOP4 UNLOOP4}
                for (; j + 8 <= n; )
                    UNLOOP8;
                for (; j <= n; )
                    UNLOOP;
            }
            #pragma omp parallel for schedule(static, chunk)
            for (int i = 1; i <= n; i++)
                memcpy(g[0][i], g[1][i], sizeof(g[0][0][0]) * (n+1));
        }
 
        // print result
        for (int i = 1; i <= n; i++) {
            for (int j = 1; j <= n; j++)
                g[0][i][j] += '0';
            g[0][i][n+1] = '\0';
            puts(g[0][i]+1);
        }
    }
    return 0;
}

Read More +

2016-04-30

學校課程/平行程式

批改娘 10026. Fast N-Queen

題目描述

相信 n-Queen 問題對每個研究 backtracking 的人來講都不陌生，這個問題是要在一個 $n \times n$ 大小的棋盤上擺n個皇后，讓她們不會互相攻擊到。為了讓這個問題更難一點，我們設計了一些障礙物在棋盤上，在這些點上不能放皇后。請留意這些障礙物並不會防止皇后被攻擊。

在傳統的 8-Queen 問題中，旋轉與鏡射被視為不同解法，因此我們有 92 種可能的方式來放置皇后。

輸入格式

輸入的資料最多包含 10 筆測試個案，每個測試個案的第一行會有一個整數 $n$ ($3 < n < 20$)，接下來的 $n$ 行代表棋盤資料，點號 '.' 代表空的盤格，星號 '*' 代表放有障礙物的盤格。

輸出格式

對每筆測試個案，輸出這是第幾個 case 以及這個 case 有幾種可能的放置方式。

範例輸入

8
........
........
........
........
........
........
........
........
4
.*..
....
....
....

範例輸出

1 2	Case 1: 92 Case 2: 1

Solution

首先，必須先認識最快的 N 皇后問題可以利用 bitmask 的技術，把過多的 memory access 的指令捨去而加速。

樸素平行 Accepted (2766 ms, 512 KB)
負載平衡 Accepted (1771 ms, 5248 KB)

樸素平行

在大多數的教科書上，平行只針對第一個列的位置平行，因此平行度最多 $N$，針對第一列每一個位置都分別開一個 thread 找解，這樣運算時間相當於跑最慢的那一個 thread。

#include <stdio.h>
int y_valid[32] = {};
int dfs(int lv, int row, int dia1, int dia2, int mask) {
    if (row == mask)
        return 1;
    int pos = mask & (~(row | dia1 | dia2)) & y_valid[lv], p;
    int ret = 0;
    while (pos) {
        p = pos & (-pos);
        pos -= p;
        ret += dfs(lv+1, row|p, (dia1|p)<<1, (dia2|p)>>1, mask);
    }
    return ret;
}
int totalNQueens(int n) {
    int sum = 0;
    int chunk = 1;
    int row = 0, dia1 = 0, dia2 = 0, mask = (1<<n)-1, lv = 0;
    int pos = mask & (~(row | dia1 | dia2)) & y_valid[0], p;
    #pragma omp parallel for schedule(dynamic, chunk) reduction(+:sum)
    for (int i = 0; i < n; i++) {
        if ((pos>>i)&1) {
            p = 1<<i;
            int t = dfs(lv+1, row|p, (dia1|p)<<1, (dia2|p)>>1, mask);
            sum += t;
        }
    }
    return sum;
}
int main() {
    int cases = 0, n;
    char s[32] = {};
    while (scanf("%d", &n) == 1 && n) {
        for (int i = 0; i < n; i++) {
            scanf("%s", s);
            y_valid[i] = (1<<n)-1;
            for (int j = 0; j < n; j++) {
                if (s[j] == '*')
                    y_valid[i] ^= (1<<j);
            }
        }
        int ret = totalNQueens(n);
        printf("Case %d: %d\n", ++cases, ret);
    }
    return 0;
}

負載平衡

為了達到每一個 thread 盡可能分到相同的工作量，避免最慘的那一條有太多的解而很慢，做好細粒度 (fine-grain) 的分配，如此一來工作量就很高的機率會相同。

因此，一開始先進行廣度搜索，把前幾列的解展開，也許會獲得 $N^x$ 種盤面，這時候平均切給 $M$ 條 thread，分配的工作量平衡上就有比較好的展現。

#include <stdio.h>
#define MAXQ 16384
#define MAXN 32
int y_valid[32] = {};
typedef struct Board {
    int row, dia1, dia2;
} Board;
Board R[MAXQ*MAXN];
int dfs(int lv, int row, int dia1, int dia2, int mask) {
    if (row == mask)
        return 1;
    int pos = mask & (~(row | dia1 | dia2)) & y_valid[lv], p;
    int ret = 0;
    while (pos) {
        p = pos & (-pos);
        pos -= p;
        ret += dfs(lv+1, row|p, (dia1|p)<<1, (dia2|p)>>1, mask);
    }
    return ret;
}
int ida_dep, ida_cnt;
void IDA(int lv, int row, int dia1, int dia2, int mask) {
    if (lv == ida_dep) {
        Board b = {.row = row, .dia1 = dia1, .dia2 = dia2};
        R[ida_cnt++] = b;
        return ;
    }
    int pos = mask & (~(row | dia1 | dia2)) & y_valid[lv], p;
    while (pos) {
        p = pos & (-pos);
        pos -= p;
        IDA(lv+1, row|p, (dia1|p)<<1, (dia2|p)>>1, mask);
    }
}
int totalNQueens(int n) {
    int it = 0;
    int row = 0, dia1 = 0, dia2 = 0, mask = (1<<n)-1, lv = 0;
    for (it = 1; it <= n; it++) {
        ida_dep = it, ida_cnt = 0;
        IDA(0, row, dia1, dia2, mask);
        if (ida_cnt >= MAXQ) {
            int sum = 0;
            int chunk = 1024;
            #pragma omp parallel for schedule(static, chunk) reduction(+:sum)
            for (int i = 0; i < ida_cnt; i++)
                sum += dfs(it, R[i].row, R[i].dia1, R[i].dia2, mask);
            return sum;
        }
    }
    return ida_cnt;
}
int main() {
    int cases = 0, n;
    char s[32] = {};
    while (scanf("%d", &n) == 1 && n) {
        for (int i = 0; i < n; i++) {
            scanf("%s", s);
            y_valid[i] = (1<<n)-1;
            for (int j = 0; j < n; j++) {
                if (s[j] == '*')
                    y_valid[i] ^= (1<<j);
            }
        }
        int ret = totalNQueens(n);
        printf("Case %d: %d\n", ++cases, ret);
    }
    return 0;
}

Read More +

2016-04-30

學校課程/平行程式

批改娘 10025. Fast Image Match (OpenMP)

背景

圖片匹配和字串匹配有一點不同，字串匹配通常要求其子字串與搜尋字串完全相符，而圖片匹配則用相似度為依據，當圖片大、複雜且具有干擾，或者需要匹配數量非常多，更先進的領域會利用特徵擷取，用機率統計的方式來篩選可能的匹配數量，篩選過後才進行圖片的細節匹配。

題目描述

給予兩個圖片 $A, B$，圖片格式為灰階影像，每個像素 $\mathit{pixel}(x, y)$ 採用 8-bits 表示，範圍為 $\mathit{pixel}(x, y) \in [0, 255]$。

舉一個例子，有一個 $3 \times 3$ 的圖片 $A$ 和一個 $2 \times 2$ 的圖片 $B$，用矩陣表示如下：

$$A := \begin{bmatrix} a1 & a2 & a3 \\ a4 & a5 & a6 \\ a7 & a8 & a9 \end{bmatrix} ,\; B := \begin{bmatrix} b1 & b2\\ b3 & b4\\ \end{bmatrix}$$

假設左上角座標 $(1, 1)$ 即 $a1$ 的位置、$(1, 2)$ 即 $a2$ 的位置。

若把影像 $B$ 左上角對齊 $A$ 的 $(1, 1)$ 位置，其差異程度 $\mathit{diff}(A, B) = (a1 - b1)^2 + (a2 - b2)^2 + (a4 - b3)^2 + (a5 - b4)^2$
相同地，對齊 $(2, 1)$ 位置，其差異程度 $\mathit{diff}(A, B) = (a4 - b1)^2 + (a5 - b2)^2 + (a7 - b3)^2 + (a8 - b4)^2$

比較時，整張 $B$ 都要落在 $A$ 中。現在要找到一個對齊位置 $(x, y)$，使得 $\mathit{diff}(A, B)$ 最小。

輸入格式

多組測資，每一組第一行有四個整數 $A_H, A_W, B_H, B_W$，分別表示圖片 $A$ 的高寬、$B$ 的高寬。

接下來會有 $A_H$ 行，每一行上會有 $A_W$ 個整數，第 $i$ 行上的第 $j$ 個整數 $x$ 表示 $A(i, j)$ 的灰階像素值。同樣地，接下來會有 $B_H$ 行，每一行上會有 $B_W$ 個整數，第 $i$ 行上的第 $j$ 個整數 $x$ 表示 $B(i, j)$ 的灰階像素值。

$ 1\le A_H, A_W, B_H, B_W \le 500$
$ B_H \le A_H, B_W \le A_W$
$ 0 \le x \le 255$

輸出格式

對於每一組測資輸出一行，兩個整數 $x, y$，滿足 $\mathit{diff}(A, B)$ 最小。若有相同情況滿足，則優先挑選 $x$ 最小，若 $x$ 仍相同，則挑選 $y$ 最小。

範例輸入

範例輸出

1
2

3 3
1 1

Solution

樸素平行

若圖片是 $N \times N$，樸素算法運作要 $\mathcal{O}(N^4)$，傅立葉算法要 $\mathcal{O}(N^2 \log N)$，這裡針對樸素算法平行。從最高層次的找解平行，每一個匹配位置都要耗費 $\mathcal{O}(M^2)$ 計算相似性，又由於平行限制跟核心數有關，那麼針對每一個 thread 要求找到好幾個 row 的解，分別存在各自的答案陣列區中，之後再線性跑一次取最小值，利用額外的空間儲存，避免在運算過程中遇到 critical section。

#pragma omp parallel for schedule(dynamic, chunk) shared(A, B)
for (int i = 0; i <= Ch; i++) {
    for (int j = 0; j <= Cw; j++) {
        long long d = 0;
        for (int p = 0; p < Bh; p++) {
            for (int q = 0; q < Bw; q++) {
                d += (long long) (B[p][q] - A[i+p][j+q])*(B[p][q] - A[i+p][j+q]);
            }                   
        }
        if (d < diff)
        {
            #pragma omp critical
            if (d < diff)
                diff = d, bx = i, by = j;
        }
    }
}

就是上列的寫法會卡在 critical section，倒不如存在另一個陣列中，效能還差距個幾百毫秒。

#include <stdio.h>
#include <limits.h>
#define MAXN 512
int A[MAXN][MAXN], B[MAXN][MAXN];
int C[MAXN][MAXN];
int main() {
    int Ah, Aw, Bh, Bw, x;
    int cases = 0;
    while(scanf("%d %d %d %d", &Ah, &Aw, &Bh, &Bw) == 4) {
        for (int i = 0; i < Ah; i++)
            for (int j = 0; j < Aw; j++)
                scanf("%d", &x), A[i][j] = x;
        for (int i = 0; i < Bh; i++)
            for (int j = 0; j < Bw; j++)
                scanf("%d", &x), B[i][j] = x;
        int Ch = Ah - Bh, Cw = Aw - Bw;
        int bx = -1, by = -1;
        int diff = INT_MAX;
        #pragma omp parallel for
        for (int i = 0; i <= Ch; i++) {
            for (int j = 0; j <= Cw; j++) {
                int d = 0;
                for (int p = 0; p < Bh; p++) {
                    for (int q = 0; q < Bw; q++) {
                        d += (B[p][q] - A[i+p][j+q])*(B[p][q] - A[i+p][j+q]);
                    }                    
                }
                C[i][j] = d;
            }
        }
        for (int i = 0; i <= Ch; i++) {
            for (int j = 0; j <= Cw; j++)
                if (C[i][j] < diff)
                    diff = C[i][j], bx = i, by = j;
        }
 
        printf("%d %d\n", bx + 1, by + 1);
    }
    return 0;
}

FFT

儘管使用快速傅立葉 (FFT) 已經達到 $\mathcal{O}(N \log N)$ 的時間複雜度的極致，只比平行樸素算法快個兩到三倍，加入平行之後，效能可以達到五到六倍加速。

FFT 用到大量的三角函數計算，一般而言若使用序列化的乘法代替三角函數計算 (疊加角度) 會比重新計算快上很多，但這樣會損失一大部分的精準度 (但速度快很多)，而且會產生很多相互依賴的指令，這也導致不容易對迴圈平行。

如果需要平行，每一個三角函數可以預先存在陣列中，每一個 thread 直接存取陣列中的元素即可，但麻煩的地方在於最好使用多核處理器，然而跨越多個處理器有可能會因為快取 … 等機制，導致速度下降，因此這裡採用單一處理器六核心 (只有三個是 physical core)，因此 thread 限定在五個以內。

#include <stdio.h>
#include <limits.h>
#include <stdbool.h>
#include <math.h>
#include <string.h>
#include <omp.h>
// FFT header
#define MAXN 262144
typedef struct {
    float x, y;
} complex;
typedef unsigned int UINT32;
typedef struct {
    int (* const NumberOfBitsNeeded) (int);
    UINT32 (* const FastReverseBits) (UINT32, int);
    void (* const FFT) (bool, complex[], complex[], int);
    void (* const convolution) (double *, double *, int , double *);
} FFT_namespace;
static inline complex init_complex(double a, double b) {
    complex t = {a, b};
    return t;
}
static inline complex add_complex(complex *a, complex *b) {
    return init_complex(a->x + b->x, a->y + b->y);
}
static inline complex sub_complex(complex *a, complex *b) {
    return init_complex(a->x - b->x, a->y - b->y);
}
static inline complex mul_complex(complex *a, complex *b) {
    return init_complex(a->x * b->x - a->y * b->y, a->x * b->y+ a->y * b->x);
}
static complex pre_theta[2][MAXN];
void prework(int n) {
    static int pre_n = 0;
    static const double PI = acos(-1);
    if (pre_n == n)
        return ;
    pre_n = n;
    pre_theta[0][0] = init_complex(1, 0);
    pre_theta[1][0] = init_complex(1, 0);
    #pragma omp parallel for
    for (register int i = 1; i < n; i++) {
        pre_theta[0][i] = init_complex(cos(2*i*PI / n ) , sin(2*i*PI / n ));
        pre_theta[1][i] = init_complex(cos(2*i*PI / n ) , -sin(2*i*PI / n ));
    }
}
// FFT body
int NumberOfBitsNeeded(int PowerOfTwo) {
    for (int i = 0;; ++i) {
        if (PowerOfTwo & (1 << i)) {
            return i;
        }
    }
}
UINT32 FastReverseBits(UINT32 a, int NumBits) {
    a = ( ( a & 0x55555555U ) << 1 ) | ( ( a & 0xAAAAAAAAU ) >> 1 ) ;
    a = ( ( a & 0x33333333U ) << 2 ) | ( ( a & 0xCCCCCCCCU ) >> 2 ) ;
    a = ( ( a & 0x0F0F0F0FU ) << 4 ) | ( ( a & 0xF0F0F0F0U ) >> 4 ) ;
    a = ( ( a & 0x00FF00FFU ) << 8 ) | ( ( a & 0xFF00FF00U ) >> 8 ) ;
    a = ( ( a & 0x0000FFFFU ) << 16 ) | ( ( a & 0xFFFF0000U ) >> 16 ) ;
    return a >> (32 - NumBits);
}
void _FFT(bool InverseTransform, complex In[], complex Out[], int n) {
    // simultaneous data copy and bit-reversal ordering into outputs
    int NumSamples = n;
    int NumBits = NumberOfBitsNeeded(NumSamples);
    for (int i = 0; i < NumSamples; ++i)
        Out[FastReverseBits(i, NumBits)] = In[i];
    // the FFT process
    #pragma omp parallel
    for (int i = 1; i <= NumBits; i++) {
        int BlockSize = 1<<i, BlockEnd = BlockSize>>1, BlockCnt = NumSamples/BlockSize;
        #pragma omp for
        for (int j = 0; j < NumSamples; j += BlockSize) {
            complex *t = pre_theta[InverseTransform];
            for (int k = 0; k < BlockEnd; k++, t += BlockCnt) {
                complex a = mul_complex(&(*t), &Out[k+j+BlockEnd]);
                Out[k+j+BlockEnd] = sub_complex(&Out[k+j], &a);
                Out[k+j] = add_complex(&Out[k+j], &a);
            }
        }
    }
    
    // normalize if inverse transform
    if (InverseTransform) {
        #pragma omp parallel for
        for (int i = 0; i < NumSamples; ++i) {
            Out[i].x = Out[i].x / NumSamples;
        }
    }
}
void convolution(double *a, double *b, int n, double *c) {
    static complex s[MAXN], d1[MAXN], d2[MAXN], y[MAXN];
    prework(n);
    #pragma omp parallel for
    for (int i = 0; i < n; ++i)
        s[i] = init_complex(a[i], 0);
    _FFT(false, s, d1, n);
    s[0] = init_complex(b[0], 0);
    #pragma omp parallel for
    for (int i = 1; i < n; ++i)
        s[i] = init_complex(b[n - i], 0);
    _FFT(false, s, d2, n);
    #pragma omp parallel for
    for (int i = 0; i < n; ++i)
        y[i] = mul_complex(&d1[i], &d2[i]);
    _FFT(true, y, s, n);
    #pragma omp parallel for
    for (int i = 0; i < n; ++i)
        c[i] = s[i].x;
}
FFT_namespace const FFT = {NumberOfBitsNeeded, FastReverseBits, _FFT, convolution};
// FFT end
double a[MAXN], b[MAXN], c[MAXN];
long long sum[512][512];
long long getArea(int lx, int ly, int rx, int ry) {
    long long ret = sum[rx][ry];
    if (lx) ret -= sum[lx-1][ry];
    if (ly) ret -= sum[rx][ly-1];
    if (lx && ly)   ret += sum[lx-1][ly-1];
    return ret;
}
int main() {
    omp_set_num_threads(5);
    int m, n, p, q, x, N, M, S;
    while (scanf("%d %d %d %d", &m, &n, &p, &q) == 4) {
        N = m > p ? m : p, M = n > q ? n : q;
        S = 1;
        for (; S < N*M; S <<= 1);
        memset(a, 0, sizeof(a[0]) * S);
        memset(b, 0, sizeof(b[0]) * S);
        for (int i = 0; i < m; i++) {
            long long s = 0;
            for (int j = 0; j < n; j++) {
                scanf("%d", &x);
                a[i*M+j] = x;
                s += x*x;
                sum[i][j] = (i > 0 ? sum[i-1][j] : 0) + s;          
            }
        }
        for (int i = 0; i < p; i++) {
            for (int j = 0; j < q; j++) {
                scanf("%d", &x);
                b[i*M+j] = x;
            }
        }        
        FFT.convolution(a, b, S, c);
        int qx = m - p, qy = n - q, bX = INT_MAX, bY = INT_MAX;
        long long diff = LONG_MAX;
        #pragma omp parallel
        {
            long long t_diff = LONG_MAX;
            int t_bX = INT_MAX, t_bY = INT_MAX;
            #pragma omp for
            for (int i = 0; i <= qx; i++)    {
                for (int j = 0; j <= qy; j++) {
                    long long v = getArea(i, j, i+p-1, j+q-1) - 2*c[i*M + j] + 0.5;
                    if (v < t_diff)
                        t_diff = v, t_bX = i, t_bY = j;
                }
            }
            #pragma omp critical
            {
                if (t_diff < diff)
                    diff = t_diff, bX = t_bX, bY = t_bY;
                if (t_diff == diff && (t_bX < bX || (t_bX == bX && t_bY < bY))) {
                    bX = t_bX, bY = t_bY;
                }
            }
        }
        printf("%d %d\n", bX+1, bY+1);
    }
    return 0;
}

Read More +

2016-04-20

學校課程/平行程式

批改娘 10022. Fast Matrix Multiplication

背景

題目描述

相信不少人都已經實作所謂的矩陣乘法，計算兩個方陣大小為 $N \times N$ 的矩陣 $A, B$。為了方便起見，提供一個偽隨機數的生成，減少在輸入處理浪費的時間，同時也減少上傳測資的辛苦。

根據種子 $c = S1$ 生成矩陣 $A$，種子 $c = S2$ 生成矩陣 $B$，計算矩陣相乘 $A \times B$，為了方便起見，使用 hash 函數進行簽章，最後輸出一個值。由於會牽涉到 overflow 問題，此題作為快取實驗就不考慮這個，overflow 問題都會相同。

matrix.h

1	void multiply(int N, unsigned long A[][2048], unsigned long B[][2048], unsigned long C[][2048]);

matrix.c

#include "matrix.h"
 
void multiply(int N, unsigned long A[][2048], unsigned long B[][2048], unsigned long C[][2048]) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            unsigned long sum = 0;    // overflow, let it go.
            for (int k = 0; k < N; k++)
                sum += A[i][k] * B[k][j];
            C[i][j] = sum;
        }
    }
}

main.c

#include <stdio.h>
#include "matrix.h"
// #define DEBUG
#define UINT unsigned long 
#define MAXN 2048
void rand_gen(UINT c, int N, UINT A[][MAXN]) {
    UINT x = 2, n = N*N;
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            x = (x * x + c + i + j)%n;
            A[i][j] = x;
        }
    }
}
void print_matrix(int N, UINT A[][MAXN]) {
    for (int i = 0; i < N; i++) {
        fprintf(stderr, "[");
        for (int j = 0; j < N; j++)
            fprintf(stderr, " %u", A[i][j]);
        fprintf(stderr, " ]\n");
    }
}
UINT hash(UINT x) {
    return (x * 2654435761LU);
}
UINT signature(int N, UINT A[][MAXN]) {
    UINT h = 0;
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++)
            h = hash(h + A[i][j]);
    }
    return h;
}
UINT A[MAXN][MAXN], B[MAXN][MAXN], C[MAXN][MAXN];
int main() {
    int N, S1, S2;
    while (scanf("%d %d %d", &N, &S1, &S2) == 3) {
        rand_gen(S1, N, A);
        rand_gen(S2, N, B);
        multiply(N, A, B, C);
#ifdef DEBUG
        print_matrix(N, A);
        print_matrix(N, B);
        print_matrix(N, C);
#endif
        printf("%u\n", signature(N, C));
    }
    return 0;
}

輸入格式

多組測資，每組測資一行三個整數 $N, S1, S2$。

$1 \le N \le 1000$
$0 \le S1, S2 \le 65536$

輸出格式

每組測資輸出一行，一個整數 signature 的結果。

範例輸入

1 2	2 2 5 2 2 5

範例輸出

1 2	573770929 573770929

解釋

範例輸入產生 $2 \times 2$ 的矩陣。

$$A = \begin{bmatrix} 2 & 3\\ 0 & 0 \end{bmatrix} , B = \begin{bmatrix} 1 & 3\\ 3 & 0 \end{bmatrix} , A \times B = \begin{bmatrix} 11 & 6\\ 0 & 0 \end{bmatrix}$$

備註

2016/02/17 加入平行程式設計 OpenMP 部份，並增加時間限制！

Solution

解法跟 thread 寫法一樣，用 OpenMP 的好處在於不用處理那些 thread 的設定，用 OpenMP 提供的前處理完成執行緒的建造和移除操作。

在這裡特別提供達夫裝置 (Duff’s device) 對於迴圈展開 loop unrolling 的撰寫方式，巧妙地運用 C 語言中的 switch，相比一般寫法需要兩次迴圈處理，最後一個迴圈必須處理剩餘不整除的部分。就實作看起來，在現在版本 4.8 gcc 編譯結果下，效能沒有明顯的差別。

在高等編譯器課程中，聽說迴圈展開的數目最好不是 $2^n$，某些情況會造成 alignment 的 cache miss 的嚴重影響，實際情況還是要看怎麼運用，在這裡就不多做嘗試。

matrix.h

1
2
3

#define UINT unsigned long 
 
void multiply(int N, unsigned long A[][2048], unsigned long B[][2048], unsigned long C[][2048]);

matrix.c (達夫裝置)

#include "matrix.h"
#define LOOP_UNROLL 8
void transpose(int N, UINT A[][2048]) {
    UINT t;
    for (int i = 0; i < N; i++)
        for (int j = i+1; j < N; j++)
            t = A[i][j], A[i][j] = A[j][i], A[j][i] = t;
}
void multiply(int N, UINT A[][2048], UINT B[][2048], UINT C[][2048]) {
    transpose(N, B);
    #pragma omp parallel for
    for (int i = N-1; i >= 0; i--) {
        for (int j = N-1; j >= 0; j--) {
            register UINT sum = 0;
            UINT *a = &A[i][0], *b = &B[j][0];
            int k = N;
            switch (k % LOOP_UNROLL) {
                case 0: do { sum += *a * *b, a++, b++;
                case 7: sum += *a * *b, a++, b++;
                case 6: sum += *a * *b, a++, b++;
                case 5: sum += *a * *b, a++, b++;
                case 4: sum += *a * *b, a++, b++;
                case 3: sum += *a * *b, a++, b++;
                case 2: sum += *a * *b, a++, b++;
                case 1: sum += *a * *b, a++, b++;
                } while ((k -= LOOP_UNROLL) > 0);
            }
            C[i][j] = sum;
        }
    }
}

matrix.c (迴圈展開)

#include "matrix.h"
#define LOOP_UNROLL 8
void transpose(int N, UINT A[][2048]) {
    int chunk = 8;
    #pragma omp parallel for schedule(dynamic, chunk)
    for (int i = 0; i < N; i++) {
        for (int j = i+1; j < N; j++) {
            UINT t;    
            t = A[i][j], A[i][j] = A[j][i], A[j][i] = t;
        }
    }
}
void multiply(int N, UINT A[][2048], UINT B[][2048], UINT C[][2048]) {
    transpose(N, B);
    int chunk = 8;
    #pragma omp parallel for schedule(dynamic, chunk) shared(C)
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            register UINT sum = 0;
            UINT *a = &A[i][0], *b = &B[j][0];
            int k;
            for (k = 0; k+LOOP_UNROLL < N; k += LOOP_UNROLL) {
                sum += *a * *b, a++, b++;
                sum += *a * *b, a++, b++;
                sum += *a * *b, a++, b++;
                sum += *a * *b, a++, b++;
                sum += *a * *b, a++, b++;
                sum += *a * *b, a++, b++;
                sum += *a * *b, a++, b++;
                sum += *a * *b, a++, b++;
            }
            for (; k < N; k++)
                sum += *a * *b, a++, b++;
            C[i][j] = sum;
        }
    }
}

Read More +

2016-04-20

學校課程/平行程式

批改娘 10080. Fast Matrix Multiplication (pthread)

題目描述

matrix.h

1	void multiply(int N, unsigned long A[][2048], unsigned long B[][2048], unsigned long C[][2048]);

matrix.c

#include "matrix.h"
 
void multiply(int N, unsigned long A[][2048], unsigned long B[][2048], unsigned long C[][2048]) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            unsigned long sum = 0;    // overflow, let it go.
            for (int k = 0; k < N; k++)
                sum += A[i][k] * B[k][j];
            C[i][j] = sum;
        }
    }
}

main.c

#include <stdio.h>
#include "matrix.h"
// #define DEBUG
#define UINT unsigned long 
#define MAXN 2048
void rand_gen(UINT c, int N, UINT A[][MAXN]) {
    UINT x = 2, n = N*N;
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            x = (x * x + c + i + j)%n;
            A[i][j] = x;
        }
    }
}
void print_matrix(int N, UINT A[][MAXN]) {
    for (int i = 0; i < N; i++) {
        fprintf(stderr, "[");
        for (int j = 0; j < N; j++)
            fprintf(stderr, " %u", A[i][j]);
        fprintf(stderr, " ]\n");
    }
}
UINT hash(UINT x) {
    return (x * 2654435761LU);
}
UINT signature(int N, UINT A[][MAXN]) {
    UINT h = 0;
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++)
            h = hash(h + A[i][j]);
    }
    return h;
}
static UINT A[MAXN][MAXN], B[MAXN][MAXN], C[MAXN][MAXN];
int main() {
    int N, S1, S2;
    while (scanf("%d %d %d", &N, &S1, &S2) == 3) {
        rand_gen(S1, N, A);
        rand_gen(S2, N, B);
        multiply(N, A, B, C);
#ifdef DEBUG
        print_matrix(N, A);
        print_matrix(N, B);
        print_matrix(N, C);
#endif
        printf("%u\n", signature(N, C));
    }
    return 0;
}

輸入格式

多組測資，每組測資一行三個整數 $N, S1, S2$。

$1 \le N \le 1000$
$0 \le S1, S2 \le 65536$

輸出格式

每組測資輸出一行，一個整數 signature 的結果。

範例輸入

1 2	2 2 5 2 2 5

範例輸出

1 2	573770929 573770929

解釋

範例輸入產生 $2 \times 2$ 的矩陣。

$$A = \begin{bmatrix} 2 & 3\\ 0 & 0 \end{bmatrix} , B = \begin{bmatrix} 1 & 3\\ 3 & 0 \end{bmatrix} , A \times B = \begin{bmatrix} 11 & 6\\ 0 & 0 \end{bmatrix}$$

Makefile

1 2	matrix.out: matrix.h matrix.c main.c gcc -std=c99 -O2 matrix.h matrix.c main.c -o matrix -lpthread -lm

Solution

針對矩陣乘法平行很簡單，在 CPU 上由於需要搬運資料，不一定開的 thread 數量接近 core 數量就一定最好。而在平行的部分要把握資料局部性的加速，以下實作根據最終答案進行 row-based 策略切割，每一個 thread 計算那一個 row 的所有答案。

matrix.h

1
2
3

#define UINT unsigned long 
 
void multiply(int N, unsigned long A[][2048], unsigned long B[][2048], unsigned long C[][2048]);

matrix.c

#include "matrix.h"
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define LOOP_UNROLL 8
#define MAXTHREAD 8
#define MAXN 2048
typedef struct Argu {
    int l, r, N;
    UINT *A, *B, *C;
} Argu;
static void transpose(int N, UINT A[][2048]) {
    UINT t;
    for (int i = 0; i < N; i++) {
        for (int j = i+1; j < N; j++)
                t = A[i][j], A[i][j] = A[j][i], A[j][i] = t;
    }
}
void* thread_multiply(void *arg) {
    Argu data = * ((Argu *) arg);
    int l = data.l, r = data.r;
    int N = data.N;
    UINT *A = data.A, *B = data.B, *C = data.C;
    UINT stkA[MAXN];
    for (int i = l; i <= r; i++) {
        memcpy(stkA, A + (i * MAXN), sizeof(UINT) * N);
        for (int j = 0; j < N; j++) {
            UINT sum = 0;
//            UINT *a = A + (i * MAXN), *b = B + (j *  MAXN);
            UINT *a = stkA, *b = B + (j * MAXN);
            int k;
            for (k = 0; k+LOOP_UNROLL < N; k += LOOP_UNROLL) {
                sum += *a * *b, a++, b++;
                sum += *a * *b, a++, b++;
                sum += *a * *b, a++, b++;
                sum += *a * *b, a++, b++;
                sum += *a * *b, a++, b++;
                sum += *a * *b, a++, b++;
                sum += *a * *b, a++, b++;
                sum += *a * *b, a++, b++;
            }
            for (; k < N; k++)
                sum += *a * *b, a++, b++;
            *(C + (i * MAXN + j)) = sum;
        }
    }
    return NULL;
}
void multiply(int N, UINT A[][2048], UINT B[][2048], UINT C[][2048]) {
    transpose(N, B);
 
    pthread_t *threads;
    threads = (pthread_t *) malloc(MAXTHREAD * sizeof(pthread_t));
 
    for (int i = 0; i < MAXTHREAD; i++) {
        Argu *data = (Argu *) malloc(sizeof(Argu));
        data->l = i * N / MAXTHREAD;
        data->r = (i+1) * N / MAXTHREAD - 1;
        data->N = N;
        data->A = &A[0][0], data->B = &B[0][0], data->C = &C[0][0];
        pthread_create(&threads[i], NULL, thread_multiply, (void *) (data));
    }
 
    for (int i = 0; i < MAXTHREAD; i++) 
        pthread_join(threads[i], NULL);
    return ;
}

Read More +