2016-04-30

學校課程/平行程式

批改娘 10094. Fast 0/1 Knapsack Problem

題目描述

有 $N$ 個重量和價值分別為 $W_i, V_i$ 的物品，現在要從這些物品中選出總重量不超過 $M$ 的物品，求所有挑選方案中的總價值最大值為何。

輸入格式

輸入只有一組測資，第一行會有兩個整數 $N, M$ 分別表示物品數量和背包能容大的最大重量 $M$。接下來會有 $N$ 行，第 $i$ 行上會有兩個整數 $W_i, V_i$，分別為物品 $i$ 的重量和價值。

$1 \le N \le 10000$
$1 \le M \le 1000000$
$1 \le W_i, \; V_i \le 100000$

輸出格式

輸出一行整數，挑選方案中的最大價值 $B$。

範例輸入

範例輸出

Solution

遞迴公式

$$\begin{align*} f(i, j) = \max(f(i-1, j-w)+v, f(i-1, j)) \end{align*}$$

最單純採用滾動數組完成，也就是所謂的 double buffer 的技巧，可以大幅度節省記憶體，但先決條件是他們的相依關係必須獨立。

在平行處理上的另一種解法，就是分成前 $N/2$ 個物品和後 $N/2$ 物品分開計算，之後再藉由 $\mathcal{O}(M)$ 合併答案，由於算法的限制，最多拆分兩組。而不像滾動數組的平行於最內層的迴圈，平行度有限，但在較少核心的處理上非常快速，其一原因是空間使用比較有效率。

Server (32 cores)
- 分組合併: Accepted (768 ms, 8320 KB)
- 滾動數組: Accepted (380 ms, 8 MB)
PC (4 cores)
- 分組合併: 600ms
- 滾動數組: 900ms

滾動數組

#include <stdio.h>
#include <stdlib.h>
 
#define MAXM 1000005
#define MAXN 10005
int dp[2][MAXM];
int W[MAXN], V[MAXN];
#define max(a, b) (a > b ? a : b)
int main() {
    int N, M;
    while (scanf("%d %d", &N, &M) == 2) {
        for (int i = 0; i < N; i++)
            scanf("%d %d", &W[i], &V[i]);
        memset(dp, 0, sizeof(dp));
 
        int p = 0;
        for (int i = 0; i < N; i++) {
            int q = 1 - p;
            #pragma omp parallel
            {
                int pp = p, qq = q;
                int v = V[i], w = W[i];
                #pragma omp for
                for (int i = w; i <= M; i++)
                    dp[qq][i] = max(dp[pp][i-w]+v, dp[pp][i]);
                #pragma omp for
                for (int i = 0; i < w; i++)
                    dp[qq][i] = dp[pp][i];
            }
            p = 1 - p;
        }
 
        printf("%d\n", dp[p][M]);
    }
    return 0;
}

分組合併

#include <stdio.h>
#include <stdlib.h>
 
static inline int max(int a, int b) {
    return a > b ? a : b;
}
void subKnapsack(int n, int m, int w[], int v[], int dp[]) {
    memset(dp, 0, sizeof(dp[0]) * (m+1));
    for (int i = 0; i < n; i++) {
        int vv = v[i], ww = w[i];
        for (int j = m; j >= ww; j--)
            dp[j] = max(dp[j], dp[j-ww]+vv);
    }
}
#define MAXM 1000005
#define MAXN 100005
int W[MAXN], V[MAXN];
int dp[2][MAXM];
int main() {
    int N, M;
    while (scanf("%d %d", &N, &M) == 2) {
        for (int i = 0; i < N; i++)
            scanf("%d %d", &W[i], &V[i]);
        memset(dp, 0, sizeof(dp));
 
        #pragma omp parallel sections
        {
            #pragma omp section
            subKnapsack(N/2, M, W, V, dp[0]);
            #pragma omp section
            subKnapsack(N-N/2, M, W+N/2, V+N/2, dp[1]);
        }
 
        int ret = 0;
        for (int i = M, j = 0, mx = 0; i >= 0; i--) {
            mx = max(mx, dp[1][j]), j++;
            ret = max(ret, dp[0][i] + mx);
        }
        printf("%d\n", ret);
    }
    return 0;
}

Read More +

2016-04-30

學校課程/平行程式

批改娘 10027. Fast Sudoku

背景

不過，還想做得更好

但這個世界卻要求神通廣大、無所不知，那不聰明的我又該何去何從 …

題目描述

給一張 $9 \times 9$ 數獨，必須滿足每行每列的 1 到 9 的數字不重複，同時在九個 $3 \times 3$ 方格內的 1 到 9 數字也不重複。請計算可填入的方法數。

輸入格式

測資只有一筆，共有九行，每一行上有九個整數，第 $i$ 行上的第 $j$ 個整數表示 $\text{grid}[i][j]$ 填入的數字，若 $\text{grid}[i][j] = 0$ 表示尚未填入。

輸出格式

對於每一組測資輸出一行一個整數，表示數獨共有幾種填法。

範例輸入

0 6 0 0 0 4 0 5 0
0 0 8 3 0 5 6 0 0
2 0 0 0 0 0 0 0 1
8 0 0 4 0 7 0 0 6
0 0 6 0 0 0 3 0 0
7 0 0 9 0 1 0 0 4
5 0 0 0 0 0 0 0 2
0 0 7 2 0 6 9 0 0
0 4 0 5 0 8 0 7 0

範例輸出

範例輸入 2

0 0 0 0 0 0 0 0 0
0 0 3 6 0 0 0 0 0
0 7 0 0 9 0 2 0 0
0 5 0 0 0 7 0 0 0
0 0 0 0 4 5 7 0 0
0 0 0 1 0 0 0 3 0
0 0 1 0 0 0 0 6 8
0 0 8 5 0 0 0 1 0
0 9 0 0 0 0 4 0 0

範例輸出 2

Solution

牽涉到 load balance 問題，大部分都要先廣度搜尋一次，把狀態展開後，再利用 dynamic scheduling 進行分配工作，效果會好上許多。為了解決狀態展開而後不浪費記憶體空間，IDA* 會是一種好的選擇。當然在 OpenMP 3.0 以上有提供 task 來幫忙做到類似的事情，但搜索展開會變成寫死，估計上會比較困難。

IDA: Accepted (946 ms, 10752 KB)
OpenMP task: Accepted (1321 ms, 2560 KB)
DLX: Accepted (1208 ms, 56940 KB)

當然 DLX 在建表處理會變得很討厭，overhead 相較於其他的算法都高出許多，因此狀態展開不可以太多，因為他本身就搜得非常快，撰寫這一類搜索問題時，特別小心 #pragma omp private() 的使用，複製操作原則上都是根據 sizeof() 決定複製空間大小。因此若複製目標為函數參數，很容易只有複製到指標。

IDA

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <limits.h>
#include <assert.h>
#define MAXN 65536
 
int C[81][2], N;
int R[9][3] = {}, P[MAXN*9][9][3], Pidx;
int ida_dep;
void IDA(int idx, int state[][3]) {
    if (idx == ida_dep) {
        memcpy(P[Pidx], state, sizeof(P[0])), Pidx++;
        return ;
    }
    int x = C[idx][0], y = C[idx][1];
    int c = x/3*3 + y/3;
    for (int i = 0; i < 9; i++) {
        int mask = 1<<i;
        if ((state[x][0]&mask) || (state[y][1]&mask) ||
                (state[c][2]&mask))
            continue;
        state[x][0] |= mask;
        state[y][1] |= mask;
        state[c][2] |= mask;
        IDA(idx+1, state);
        state[x][0] ^= mask;
        state[y][1] ^= mask;
        state[c][2] ^= mask;
    }
}
int dfs(int idx, int state[][3]) {
    if (idx == N)
        return 1;
    int x = C[idx][0], y = C[idx][1];
    int c = x/3*3 + y/3, sum = 0;
    for (int i = 0; i < 9; i++) {
        int mask = 1<<i;
        if ((state[x][0]&mask) || (state[y][1]&mask) ||
                (state[c][2]&mask))
            continue;
        state[x][0] |= mask;
        state[y][1] |= mask;
        state[c][2] |= mask;
        sum += dfs(idx+1, state);
        state[c][2] ^= mask;
        state[y][1] ^= mask;
        state[x][0] ^= mask;
    }
    return sum;
}
int incomplete_bfs() {
    for (ida_dep = 1; ida_dep <= N; ida_dep++) {
        Pidx = 0;
        IDA(0, R);
        if (Pidx >= MAXN)
            break;
    }
    if (ida_dep == N+1)
        return Pidx;
    int ret = 0;
    #pragma omp parallel for schedule(guided) reduction(+: ret) 
    for (int i = 0; i < Pidx; i++)
        ret += dfs(ida_dep, P[i]);
    return ret;
}
int main() {
    int g[9][9], n = 9;
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            scanf("%d", &g[i][j]);
    N = 0;
    memset(R, 0, sizeof(R));
     for (int i = 0; i < n; i++) {
         for (int j = 0; j < n; j++) {
             if (g[i][j]) {
                 g[i][j]--;
                 R[i][0] |= 1<<g[i][j];
                 R[j][1] |= 1<<g[i][j];
                 R[i/3*3+j/3][2] |= 1<<g[i][j];
            } else {
                 C[N][0] = i, C[N][1] = j;
                 N++;
             }
         }
     }
    int ret = incomplete_bfs();
    printf("%d\n", ret);
    return 0;
}

Task

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <limits.h>
#include <assert.h>
#define MAXN 65536
 
int C[81][2], N;
int R[9][3] = {};
int threadcount = 0;
#pragma omp threadprivate(threadcount)
void dfs_serial(int idx, int state[][3]) {
    if (idx == N) {
        threadcount++;
        return ;
    }
    int x = C[idx][0], y = C[idx][1];
    int c = x/3*3 + y/3;
    for (int i = 0; i < 9; i++) 
    {
        int mask = 1<<i;
        if ((state[x][0]&mask) || (state[y][1]&mask) ||
                (state[c][2]&mask)) 
            continue;
        state[x][0] |= mask;
        state[y][1] |= mask;
        state[c][2] |= mask;
        dfs_serial(idx+1, state);
        state[c][2] ^= mask;
        state[y][1] ^= mask;
        state[x][0] ^= mask;
    }
}
void dfs(int idx, int state[][3]) {
    if (idx == N) {
        threadcount++;
        return ;
    }
    int x = C[idx][0], y = C[idx][1];
    int c = x/3*3 + y/3;
    for (int i = 0; i < 9; i++) 
    #pragma omp task untied
    {
        int mask = 1<<i;
        if ((state[x][0]&mask) || (state[y][1]&mask) ||
                (state[c][2]&mask)) {
 
        } else {
            int S[9][3] = {};
            memcpy(S, state, sizeof(S));
//            int (*S)[3] = malloc(sizeof(int)*9*3);
//            memcpy(S, state, sizeof(int)*9*3);
            S[x][0] |= mask;
            S[y][1] |= mask;
            S[c][2] |= mask;
            if (idx >= 6) {
                dfs_serial(idx+1, S);
            } else {
                dfs(idx+1, S);
            }
        }
    }
    #pragma omp taskwait
}
int incomplete_bfs() {
    int ret = 0;
    #pragma omp parallel
    {
        #pragma omp single
        {
            dfs(0, R);
        }
        #pragma omp critical
            ret += threadcount;
    }
    return ret;
}
int main() {
    int g[9][9], n = 9;
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            scanf("%d", &g[i][j]);
    N = 0;
    memset(R, 0, sizeof(R));
     for (int i = 0; i < n; i++) {
         for (int j = 0; j < n; j++) {
             if (g[i][j]) {
                 g[i][j]--;
                 R[i][0] |= 1<<g[i][j];
                 R[j][1] |= 1<<g[i][j];
                 R[i/3*3+j/3][2] |= 1<<g[i][j];
            } else {
                 C[N][0] = i, C[N][1] = j;
                 N++;
             }
         }
     }
    int ret = incomplete_bfs();
    printf("%d\n", ret);
    return 0;
}

DLX

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <limits.h>
#include <assert.h>
#define MAXN 65536
#define MAXR 1024
typedef unsigned char byte;
typedef struct {
    int left, right, up, down;
    int ch;
} Node;
// DLX header
typedef struct {
    int (* const sudoku) (byte [][9]);
} DLX_namespace;
// DLX body begin
void DLX_remove(int c, Node DL[], int s[]) {
    DL[DL[c].right].left = DL[c].left;
    DL[DL[c].left].right = DL[c].right;
    for (int i = DL[c].down; i != c; i = DL[i].down) {
        for (int j = DL[i].right; j != i; j = DL[j].right) {
            DL[DL[j].down].up = DL[j].up;
            DL[DL[j].up].down = DL[j].down;
            s[DL[j].ch]--;
        }
    }
}
void DLX_resume(int c, Node DL[], int s[]) {
    for (int i = DL[c].down; i != c; i = DL[i].down) {
        for (int j = DL[i].left; j != i; j = DL[j].left) {
            DL[DL[j].down].up = j;
            DL[DL[j].up].down = j;
            s[DL[j].ch]++;
        }
    }
    DL[DL[c].right].left = c;
    DL[DL[c].left].right = c;
}
int DLX_dfs(int k, Node DL[], int s[], int head) {
    if (DL[head].right == head)
        return 1;
    int t = INT_MAX, c;
    for (int i = DL[head].right; i != head; i = DL[i].right) {
        if (s[i] < t) {
            t = s[i], c = i;
        }
    }
    int ans = 0;
    DLX_remove(c, DL, s);
    for (int i = DL[c].down; i != c; i = DL[i].down) {
        for (int j = DL[i].right; j != i; j = DL[j].right)
            DLX_remove(DL[j].ch, DL, s);
        ans += DLX_dfs(k+1, DL, s, head);
        for (int j = DL[i].left; j != i; j = DL[j].left)
            DLX_resume(DL[j].ch, DL, s);
    }
    DLX_resume(c, DL, s);
    return ans;
}
int DLX_newnode(int up, int down, int left, int right, Node DL[], int *size) {
    DL[*size].up = up, DL[*size].down = down;
    DL[*size].left = left, DL[*size].right = right;
    DL[up].down = DL[down].up = DL[left].right = DL[right].left = *size;
    assert(*size < MAXN);
    return (*size)++;
}
void DLX_newrow(int n, int Row[], Node DL[], int s[], int *size) {
    int a, r, row = -1, k;
    for (a = 0; a < n; a++) {
        r = Row[a];
        DL[*size].ch = r, s[r]++;
        if (row == -1) {
            row = DLX_newnode(DL[DL[r].ch].up, DL[r].ch, *size, *size, DL, size);
        } else {
            k = DLX_newnode(DL[DL[r].ch].up, DL[r].ch, DL[row].left, row, DL, size);
        }
    }
}
void DLX_init(int m, Node DL[], int s[], int *size, int *head) {
    *size = 0;
    *head = DLX_newnode(0, 0, 0, 0, DL, size);
    for (int i = 1; i <= m; i++) {
        DLX_newnode(i, i, DL[*head].left, *head, DL, size);
        DL[i].ch = i, s[i] = 0;
    }
}
int DLX_sudoku(byte g[][9]) {
    Node DL[MAXN + MAXR];
    int s[MAXR], head, size;
    int row[10];
    int used[4][10][10] = {}, isValid = 1;
    int n = 9, tn = 3;
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            if (g[i][j]) {
                int x = g[i][j];
                int y = i/3*3 + j/3;
                if (used[1][i][x])    isValid = 0;
                if (used[2][j][x])    isValid = 0;
                if (used[3][y][x])    isValid = 0;
                used[0][i][j] = 1;
                used[1][i][x] = used[2][j][x] = used[3][y][x] = 1;
            }
        }
    }
    if (!isValid)
        return 0;
 
    int OFF[4] = {};
    int label[4][10][10] = {};
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            if (used[0][i][j] == 0)
                label[0][i][j] = ++OFF[0];
        }
    }
    for (int k = 1; k < 4; k++) {
        OFF[k] = OFF[k-1];
        for (int i = 0; i < n; i++) {
            for (int j = 1; j <= n; j++) {
                if (used[k][i][j] == 0) {
                    label[k][i][j] = ++OFF[k];
                }
            }
        }
    }
    DLX_init(OFF[3], DL, s, &size, &head);
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            if (g[i][j])    continue;
            for (int k = 1; k <= n; k++) {
                int x = k;
                int y = i/3*3 + j/3;
                if (used[1][i][x])    continue;
                if (used[2][j][x])    continue;
                if (used[3][y][x])    continue;
 
                row[0] = label[0][i][j];
                row[1] = label[1][i][k];
                row[2] = label[2][j][k];
                row[3] = label[3][y][k];
                DLX_newrow(4, row, DL, s, &size);
            }
        }
    }
    return DLX_dfs(0, DL, s, head);
}
DLX_namespace const DLX = {DLX_sudoku};
// DLX body end
 
// parallel utils
typedef struct {
    byte b[9][9];    // board
    bool r[9][9], c[9][9], g[9][9];    // row, column, grid
} Board;
static inline int gridID(int x, int y) {
    return x / 3 * 3 + y / 3;
}
void board_init(Board *b) {
    memset(b, 0, sizeof(Board));
}
bool board_test(const Board *b, int x, int y, int v) {
    return b->b[x][y] == 0 && b->r[x][v-1] == 0 && b->c[y][v-1] == 0 && b->g[gridID(x, y)][v-1] == 0;
}
bool board_fill(Board *b, int x, int y, int v) {
    b->b[x][y] = v;
    b->r[x][v-1] = b->c[y][v-1] = b->g[gridID(x, y)][v-1] = 1;
}
 
int incomplete_bfs(int g[][9]) {
    Board origin;
    int n = 9;
    int A[81][2], m = 0;
    board_init(&origin);
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            if (g[i][j]) {
                board_fill(&origin, i, j, g[i][j]);
            } else {
                A[m][0] = i, A[m][1] = j, m++;
            }
        }
    }
#define MAXQMEM 8192
    static Board Q[2][MAXQMEM*9];
    int Qcnt[2] = {}, Qflag = 0;
    Q[Qflag][Qcnt[Qflag]] = origin, Qcnt[Qflag]++;
    for (int it = 0; it < m; it++) {
        const int x = A[it][0], y = A[it][1];
        Qcnt[!Qflag] = 0;
        #pragma omp parallel for
        for (int i = 0; i < Qcnt[Qflag]; i++) {
            for (int j = 1; j <= 9; j++) {
                if (!board_test(&Q[Qflag][i], x, y, j))
                    continue;
                int pIdx;
                #pragma omp critical
                {
                    pIdx = Qcnt[!Qflag]++;
                }
                Q[!Qflag][pIdx] = Q[Qflag][i];
                board_fill(&Q[!Qflag][pIdx], x, y, j);
            }
        }
        if (Qcnt[!Qflag] >= MAXQMEM) {
            int ret = 0;
            #pragma omp parallel for reduction(+:ret)
            for (int i = 0; i < Qcnt[Qflag]; i++) {
                ret += DLX.sudoku(Q[Qflag][i].b);
            }
            return ret;
        }
        Qflag = !Qflag;
    }
    return Qcnt[Qflag];
}
int main() {
    int g[9][9], n = 9;
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            scanf("%d", &g[i][j]);
 
    int ret = incomplete_bfs(g);
    printf("%d\n", ret);
    return 0;
}

Read More +

2016-04-30

學校課程/平行程式

批改娘 10087. Sparse Matrix Multiplication (OpenMP)

題目描述

稀疏矩陣為大部份元素皆為零的矩陣，在科學與工程領域中求解線性模型時經常出現大型的稀疏矩陣。現在給予最常見的 Coordinate Format (簡稱 COO 格式)，請問兩個矩陣相乘結果為何。

給予矩陣 $A_{n, m}$ 和 $B_{m, r}$，請計算稀疏矩陣相乘。

$$A_{4,4} = \begin{bmatrix} 0 & 0 & 0 & 0 \\ 5 & 8 & 0 & 0 \\ 0 & 0 & 3 & 0 \\ 0 & 6 & 0 & 0 \\ \end{bmatrix}, \qquad B_{4,4} = \begin{bmatrix} 0 & 0 & 1 & 3 \\ 0 & 5 & 2 & 0 \\ 3 & 5 & 0 & 0 \\ 0 & 2 & 0 & 0 \\ \end{bmatrix}$$

根據 COO 格式，分別轉換矩陣 $A$ 和 $B$ 的所有非零元素，如下表所示

COO of Matrix $A$

`row_index`	`col_index`	`value`
1	0	5
1	1	8
2	2	3
3	1	6

COO of Matrix $B$

`row_index`	`col_index`	`value`
0	2	1
0	3	3
1	1	5
1	2	2
2	0	3
2	1	5
3	1	2

輸入格式

測資只有一組，第一行會有三個整數 $N, \; M, \; R$，分別表示矩陣 $A, \; B$ 的大小。下一行會有兩個整數 $N_A, \; N_B$，接下來會有 $N_A$ 行，每一行表示矩陣 $A$ 的 COO 格式，隨後會有 $N_B$ 行，每一行表示矩陣 $B$ 的 COO 格式。

$0 < N, \; M, \; R \le 10^6$
$0 < N_A, \; N_B \le 10^6$

給予 COO 格式時，先按照 row_index 由小到大，相同情況由 col_index 由小到大的方式給定，保證不會有重複，每一個元素值 $v$ 介於 $1$ 到 $2^{31}-1$ 之間。

輸出格式

輸出 $C_{n, r} = A_{n, m} \times B_{m, r}$ 的雜湊值。

定義 $\mathit{hash}(C_{n, r}) = \sum\nolimits_{e_{i, j} \in C \text{ and } e_{i, j} \neq 0} \mathit{encrypt}((i+1)*(j+1), e_{i, j})$，實際運作的流程可參考以下作法，當然你沒辦法宣告 $N \times M$ 的空間使用：

static inline uint32_t rotate_left(uint32_t x, uint32_t n) {
    return  (x << n) | (x >> (32-n));
}
static inline uint32_t encrypt(uint32_t m, uint32_t key) {
    return (rotate_left(m, key&31) + key)^key;
}
#define MAXN 1024
uint32_t A[MAXN][MAXN], B[MAXN][MAXN];
int main() {
    int x, y, v;
    int N, M, R, nA, nB;
    scanf("%d %d %d", &N, &M, &R);
    scanf("%d %d", &nA, &nB);
    for (int i = 0; i < nA; i++)
        scanf("%d %d %d", &x, &y, &v), A[x][y] = v;
    for (int i = 0; i < nB; i++)
        scanf("%d %d %d", &x, &y, &v), B[x][y] = v;
    
    uint32_t hash = 0;
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < R; j++) {
            uint32_t sum = 0;
            for (int k = 0; k < M; k++)
                sum += A[i][k] * B[k][j];
            if (sum)
                hash += encrypt((i+1)*(j+1), sum);
        }
    }
    printf("%u\n", hash);
    return 0;
}

範例輸入

範例輸出

13093438

範例解釋

$$A_{n, m} \times B_{m, r} = C_{4,4}=\begin{bmatrix} 0 & 0 & 0 & 0 \\ 0 & 40 & 21 & 15 \\ 9 & 15 & 0 & 0 \\ 0 & 30 & 12 & 0 \\ \end{bmatrix}$$

Solution

傳統處理 Array of Structure: Accepted (14964 ms, 29104 KB)
傳統處理 Structure of Array: Accepted (12537 ms, 45568 KB)
額外空間 Array of Structure: Accepted (5711 ms, 29152 KB)
額外空間 Structure of Array: Accepted (5202 ms, 29056 KB)

Array of Structure 簡稱 AOS，而 Structure of Array 簡稱 SOA，這兩種寫法的差異在於運用 structure 時的寫法，根據 cache line 的設計，有時候一個 struct 下會有很多變數，然而我們只需要使用部分的變數，這時候沒有使用的變數仍會跟著一起進入 cache，很容易造成 cache 效率偏低，因此回歸原始寫出 SOA 的版本。

傳統處理

如果是兩個稀疏方陣，用 COO 格式需要先對後面那個矩陣轉置，接著在三欄 $(i, j, v)$ 的表格下，按照字典順序排序。此時會有兩個三欄 $A(i, j, v), \; B(i, j, v)$ (別忘記 $B$ 矩陣已經轉置過)，要計算 $A \times B = C$，為得到 $C_{i, j}$，$C_{i, j} = \sum A(i, k, v) B(j, k, v)$，由於已經排序好，那麼找到共同的 $k$ 不是難事，按照合併排序的手法，能在 $\mathcal{O}(N)$ 完成。

因此在最稠密的矩陣下，仍需要 $\mathcal{O}(N^3)$ 才能完成。

AOS

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <algorithm>
using namespace std;
#define MAXN 1048576
#define MAXL (MAXN<<2)
 
static inline uint32_t rotate_left(uint32_t x, uint32_t n) {
    return  (x << n) | (x >> (32-n));
}
static inline uint32_t encrypt(uint32_t m, uint32_t key) {
    return (rotate_left(m, key&31) + key)^key;
}
typedef struct ELE {
    int i, j;
    uint32_t v;
} ELE;
ELE LA[MAXL], LB[MAXL];
void SpMV(int N, int M, int R, ELE LA[], int na, ELE LB[], int nb) {
    int ma = 0, mb = 0;
    static int aoff[MAXN], boff[MAXN];
 
    for (int i = 0, p = -1; i < na; i++) {
        if (LA[i].i != p)
            aoff[ma++] = i, p = LA[i].i;
    }
    for (int i = 0, p = -1; i < nb; i++) {
        if (LB[i].i != p)
            boff[mb++] = i, p = LB[i].i;
    }
 
    uint32_t hash = 0;
    aoff[ma] = na, boff[mb] = nb;
    #pragma omp parallel for reduction(+: hash)
    for (int i = 0; i < ma; i++) {
        for (int j = 0; j < mb; j++) {
            int idx1 = aoff[i], idx2 = boff[j];
            int top1 = aoff[i+1], top2 = boff[j+1];
            uint32_t sum = 0;
            int r = LA[idx1].i, c = LB[idx2].i;
            while (idx1 < top1 && idx2 < top2) {
                if (LA[idx1].j == LB[idx2].j)
                    sum += LA[idx1].v * LB[idx2].v, idx1++, idx2++;
                else if (LA[idx1].j < LB[idx2].j)
                    idx1++;
                else
                    idx2++;                    
            }
 
            if (sum)
                hash += encrypt((r+1)*(c+1), sum);
        }
    }
    printf("%u\n", hash);
}
bool cmp(const ELE &a, const ELE &b) {
    if (a.i != b.i)
        return a.i < b.i;
    return a.j < b.j;
}
int main() {
    int N, M, R, nA, nB;
    while (scanf("%d %d %d", &N, &M, &R) == 3) {
        scanf("%d %d", &nA, &nB);
        for (int i = 0; i < nA; i++)
            scanf("%d %d %d", &LA[i].i, &LA[i].j, &LA[i].v);
        for (int i = 0; i < nB; i++)
            scanf("%d %d %d", &LB[i].j, &LB[i].i, &LB[i].v);
        sort(LB, LB+nB, cmp);
        SpMV(N, M, R, LA, nA, LB, nB);
    }
    return 0;
}

SOA

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <algorithm>
using namespace std;
#define MAXN 1048576
#define MAXL (MAXN<<2)
 
static inline uint32_t rotate_left(uint32_t x, uint32_t n) {
    return  (x << n) | (x >> (32-n));
}
static inline uint32_t encrypt(uint32_t m, uint32_t key) {
    return (rotate_left(m, key&31) + key)^key;
}
typedef struct ELE {
    int i[MAXL], j[MAXL];
    uint32_t v[MAXL];
} ELE;
ELE LA, LB, LT;
int D[MAXL];
void SpMV(int N, int M, int R, ELE &LA, int na, ELE &LB, int nb) {
    int ma = 0, mb = 0;
    static int aoff[MAXN], boff[MAXN];
 
    for (int i = 0, p = -1; i < na; i++) {
        if (LA.i[i] != p)
            aoff[ma++] = i, p = LA.i[i];
    }
    for (int i = 0, p = -1; i < nb; i++) {
        if (LB.i[i] != p)
            boff[mb++] = i, p = LB.i[i];
    }
 
    uint32_t hash = 0;
    aoff[ma] = na, boff[mb] = nb;
    #pragma omp parallel for reduction(+: hash)
    for (int i = 0; i < ma; i++) {
        for (int j = 0; j < mb; j++) {
            int idx1 = aoff[i], idx2 = boff[j];
            int top1 = aoff[i+1], top2 = boff[j+1];
            uint32_t sum = 0;
            int r = LA.i[idx1], c = LB.i[idx2];
            while (idx1 < top1 && idx2 < top2) {
                if (LA.j[idx1] < LB.j[idx2])
                    idx1++;
                else if (LA.j[idx1] > LB.j[idx2])
                    idx2++;                    
                else
                    sum += LA.v[idx1] * LB.v[idx2], idx1++, idx2++;
            }
 
            if (sum)
                hash += encrypt((r+1)*(c+1), sum);
        }
    }
    printf("%u\n", hash);
}
bool cmp(int a, int b) {
    if (LT.i[a] != LT.i[b])
        return LT.i[a] < LT.i[b];
    return LT.j[a] < LT.j[b];
}
int main() {
    int N, M, R, nA, nB;
    while (scanf("%d %d %d", &N, &M, &R) == 3) {
        scanf("%d %d", &nA, &nB);
        for (int i = 0; i < nA; i++)
            scanf("%d %d %d", &LA.i[i], &LA.j[i], &LA.v[i]);
        for (int i = 0; i < nB; i++)
            scanf("%d %d %d", <.j[i], <.i[i], <.v[i]);
        for (int i = 0; i < nB; i++)
            D[i] = i;
        sort(D, D+nB, cmp);
        for (int i = 0; i < nB; i++)
            LB.i[i] = LT.i[D[i]], LB.j[i] = LT.j[D[i]], LB.v[i] = LT.v[D[i]];
        SpMV(N, M, R, LA, nA, LB, nB);
    }
    return 0;
}

額外空間

若採用額外空間處理，一開始就不用對 $B$ 轉置，$C_{i, j} = \sum A(i, k, v) B(k, j, v)$。每一次解決 $C$ 的第 $i$ 列上的所有元素值，此時 $A(i, k, v)$ 按照 $k$ 遞增排列，$B(k, j, v)$ 要配對 $A(i, k, v)$，會恰好掃描完整的 $B$，累加元素值會不按照順序累加到 $C_{i, j}$。

因此在最稠密的矩陣下，仍需要 $\mathcal{O}(N^3)$ 才能完成。

AOS

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
 
static inline uint32_t rotate_left(uint32_t x, uint32_t n) {
    return  (x << n) | (x >> (32-n));
}
static inline uint32_t encrypt(uint32_t m, uint32_t key) {
    return (rotate_left(m, key&31) + key)^key;
}
 
#define MAXN 1048576
#define MAXL 1048576
typedef struct ELE {
    int i, j;
    uint32_t v;
} ELE;
ELE LA[MAXL], LB[MAXL];
 
void SpMV(int N, int M, int R, ELE LA[], int na, ELE LB[], int nb) {
    int ma = 0;
    static int aoff[MAXN];
    for (int i = 0, p = -1; i < na; i++) {
        if (LA[i].i != p)
            aoff[ma++] = i, p = LA[i].i;
    }
    aoff[ma] = na;
 
    uint32_t hash = 0;
    int chunk = 8;
    #pragma omp parallel for schedule(dynamic, chunk) reduction(+: hash)
    for (int i = 0; i < ma; i++) {
        int idx1 = aoff[i], top1 = aoff[i+1];
        int r = LA[idx1].i;
        uint32_t *C = (uint32_t *) calloc(R, sizeof(uint32_t));
        for (int idx2 = 0; idx2 < nb && idx1 < top1; ) {
            if (LA[idx1].j == LB[idx2].i) {
                C[LB[idx2].j] += LA[idx1].v * LB[idx2].v;
                idx2++;
            }
            if (idx2 < nb && LA[idx1].j != LB[idx2].i) {
                if (LA[idx1].j < LB[idx2].i)
                    idx1++;
                else
                    idx2++;
            }
        }
        for (int j = 0; j < R; j++) {
            if (C[j])
                hash += encrypt((r+1)*(j+1), C[j]);
        }
        free(C);
    }
 
    printf("%u\n", hash);
}
int main() {
    uint32_t N, M, R, nA, nB;
    while (scanf("%d %d %d", &N, &M, &R) == 3) {
        scanf("%d %d", &nA, &nB);
        for (int i = 0; i < nA; i++)
            scanf("%d %d %d", &LA[i].i, &LA[i].j, &LA[i].v);
        for (int i = 0; i < nB; i++)
            scanf("%d %d %d", &LB[i].i, &LB[i].j, &LB[i].v);
        SpMV(N, M, R, LA, nA, LB, nB);
    }
    return 0;
}

SOA

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
 
static inline uint32_t rotate_left(uint32_t x, uint32_t n) {
    return  (x << n) | (x >> (32-n));
}
static inline uint32_t encrypt(uint32_t m, uint32_t key) {
    return (rotate_left(m, key&31) + key)^key;
}
 
#define MAXN 1048576
#define MAXL 1048576
typedef struct ELE {
    int i[MAXL], j[MAXL];
    uint32_t v[MAXL];
} ELE;
ELE LA, LB;
 
void SpMV(int N, int M, int R, ELE &LA, int na, ELE &LB, int nb) {
    int ma = 0;
    static int aoff[MAXN];
    for (int i = 0, p = -1; i < na; i++) {
        if (LA.i[i] != p)
            aoff[ma++] = i, p = LA.i[i];
    }
    aoff[ma] = na;
 
    uint32_t hash = 0;
    int chunk = 8;
    #pragma omp parallel for schedule(dynamic, chunk) reduction(+: hash)
    for (int i = 0; i < ma; i++) {
        int idx1 = aoff[i], top1 = aoff[i+1];
        int r = LA.i[idx1];
        uint32_t *C = (uint32_t *) calloc(R, sizeof(uint32_t));
        for (int idx2 = 0; idx2 < nb && idx1 < top1; ) {
            if (LA.j[idx1] == LB.i[idx2]) {
                C[LB.j[idx2]] += LA.v[idx1] * LB.v[idx2];
                idx2++;
            }
            if (idx2 < nb && LA.j[idx1] != LB.i[idx2]) {
                if (LA.j[idx1] < LB.i[idx2])
                    idx1++;
                else
                    idx2++;
            }
        }
        for (int j = 0; j < R; j++) {
            if (C[j])
                hash += encrypt((r+1)*(j+1), C[j]);
        }
        free(C);
    }
 
    printf("%u\n", hash);
}
int main() {
    uint32_t N, M, R, nA, nB;
    while (scanf("%d %d %d", &N, &M, &R) == 3) {
        scanf("%d %d", &nA, &nB);
        for (int i = 0; i < nA; i++)
            scanf("%d %d %d", &LA.i[i], &LA.j[i], &LA.v[i]);
        for (int i = 0; i < nB; i++)
            scanf("%d %d %d", &LB.i[i], &LB.j[i], &LB.v[i]);
        SpMV(N, M, R, LA, nA, LB, nB);
    }
    return 0;
}

Read More +

2016-04-30

學校課程/平行程式

批改娘 10082. Fast Find Prime Number (OpenMP)

題目描述

找出 $[ L, \; R ]$ 區間內有多少個質數。

輸入格式

有多組測資，每組測資一行兩個整數 $L$, $R$，表示左右區間。

$1 \le L \le R \le 2^{31}-1$

輸出格式

對於每一組詢問輸出一行一個整數。

輸入範例

1 2	1000000 2000000 1 1000000000

輸出範例

1 2	70435 50847534

提示

Sieve of Eratosthenes

Solution

這一題沒有出得很好，後來想到可以作弊打表，目前仍沒有同學挑戰成功，略感傷心。

線性篩法

線性篩法 Accepted (1724 ms, 2816 KB)

由於只要統計 $2^{31}$ 以內的質數個數，只需要把 50000 以內的所有質數找到，利用這不到 5000 的質數進行區塊篩法即可。

由於 $[1, 2^{31}]$ 也是非常龐大的區間，可以利用 bitmask 的技術，將記憶體壓縮 32 倍，但這樣子平行度還是不高，因此若詢問區間 $[l, r]$ 有多少個質數，將區間切割成數個足夠大的 chunk，這裡決定 chunk 大約在 100000 到 1000000 都有不錯的效果。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <assert.h>
#include <omp.h>
 
#define MAXL (50000>>5)+1
#define BLOCK (1<<17)
#define MAXN (BLOCK>>5)+1
#define GET1(x) (mark1[(x)>>5]>>((x)&31)&1)
#define SET1(x) (mark1[(x)>>5] |= 1<<((x)&31))
#define GET2(x) (mark2[(x)>>5]>>((x)&31)&1)
#define SET2(x) (mark2[(x)>>5] |= 1<<((x)&31))
int mark1[MAXL];
int P[5500], Pt = 0;
void sieve() {
    register int i, j, k;
    SET1(1);
    int n = 50000;
    for (i = 2; i <= n; i++) {
        if (!GET1(i)) {
            for (k = n/i, j = i*k; k >= i; k--, j -= i)
                SET1(j);
            P[Pt++] = i;
        }
    }
}
 
int local_sieve(int a, int b) {
    int sqr = (int) sqrt(b), gap = b - a;
    int mark2[MAXN] = {};
 
    for (int i = 0; i < Pt && P[i] <= sqr; i++) {
        unsigned p = P[i], base = a / p * p;
        while (base < a)    base += p;
        if (base == p)    base += p;
        for (unsigned j = base; j <= b; j += p)
            SET2(j - a);
    }
    if (a == 1)    SET2(0);
    int ret = gap + 1;
    for (int i = gap>>5; i >= 0; i--)
        ret -= __builtin_popcount(mark2[i]);
    return ret;
}
long long min(long long x, long long y) {
    return x < y ? x : y;
}
int main() {
    sieve();
    int l, r;
    while (scanf("%d %d", &l, &r) == 2) {
        int ret = 0;
        #pragma omp parallel for schedule(dynamic) reduction(+: ret)
        for (long long i = l; i <= r; i += BLOCK)
            ret += local_sieve(i, min(i+BLOCK-1, r));
        printf("%d\n", ret);
    }
    return 0;
}

作弊打表

由於問題設計只針對個數，還可以偷偷本地建表完成每一個區間的值，針對非完全包含的區間單獨計算即可。打表部分可以藉由上述的程序做到。

作弊打表 Accepted (622 ms, 384 KB)

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <assert.h>
#include <omp.h>
 
#define MAXL (50000>>5)+1
#define BLOCK (1<<20)
#define MAXN (BLOCK>>5)+1
#define GET1(x) (mark1[(x)>>5]>>((x)&31)&1)
#define SET1(x) (mark1[(x)>>5] |= 1<<((x)&31))
#define GET2(x) (mark2[(x)>>5]>>((x)&31)&1)
#define SET2(x) (mark2[(x)>>5] |= 1<<((x)&31))
int preprocess[] = {82025,73586,70938,69398,68248,67307, <...>};
int mark1[MAXL];
int P[5500], Pt = 0;
void sieve() {
    register int i, j, k;
    SET1(1);
    int n = 50000;
    for (i = 2; i <= n; i++) {
        if (!GET1(i)) {
            for (k = n/i, j = i*k; k >= i; k--, j -= i)
                SET1(j);
            P[Pt++] = i;
        }
    }
}
 
int local_sieve(int a, int b) {
    int sqr = (int) sqrt(b), gap = b - a;
    int mark2[MAXN] = {};
 
    for (int i = 0; i < Pt && P[i] <= sqr; i++) {
        unsigned p = P[i], base = a / p * p;
        while (base < a)    base += p;
        if (base == p)    base += p;
        for (unsigned j = base; j <= b; j += p)
            SET2(j - a);
    }
    if (a == 1)    SET2(0);
    int ret = gap + 1;
    for (int i = gap>>5; i >= 0; i--)
        ret -= __builtin_popcount(mark2[i]);
    return ret;
}
 
long long min(long long x, long long y) {
    return x < y ? x : y;
}
long long max(long long x, long long y) {
    return x > y ? x : y;
}
int main() {
    sieve();
 
    int prepro_n = sizeof(preprocess) / sizeof(int);
    int l, r;
    while (scanf("%d %d", &l, &r) == 2) {
        int ret = 0; 
        for (int i = 0; i < prepro_n; i++) {
            long long tl = (long long) i * BLOCK + 1;
            long long tr = (long long) (i+1) * BLOCK;
            if (l <= tl && tr <= r) {
                ret += preprocess[i];
            } else {
                if (tl <= l && l <= tr) {
                    ret += local_sieve(l, min(r, tr));
                } else if (tl <= r && r <= tr) {
                    ret += local_sieve(max(tl, l), r);
                }
            }
        }
 
        printf("%d\n", ret);
    }
    return 0;
}

Read More +

2016-04-30

學校課程/平行程式

批改娘 10081. Fast Game of Life

題目描述

生命遊戲中，對於任意細胞，規則如下：
每個細胞有兩種狀態-存活或死亡，每個細胞與以自身為中心的周圍八格細胞產生互動。

當前細胞為存活狀態時，當周圍低於 2 個 (不包含 2 個) 存活細胞時，該細胞變成死亡狀態。
當前細胞為存活狀態時，當周圍有 2 個或 3 個存活細胞時，該細胞保持原樣。
當前細胞為存活狀態時，當周圍有 3 個以上的存活細胞時，該細胞變成死亡狀態。
當前細胞為死亡狀態時，當周圍有 3 個存活細胞時，該細胞變成存活狀態。

可以把最初的細胞結構定義為種子，當所有在種子中的細胞同時被以上規則處理後，可以得到第一代細胞圖。按規則繼續處理當前的細胞圖，可以得到下一代的細胞圖，周而復始。

輸入格式

輸入第一行有兩個整數 $N$, $M$，表示盤面大小為 $N \times N$，模擬週期次數 $M$。接下來會有 $N$ 行，每一行上會有 $N$ 個字符，以 0 表示 $(i, j)$ 格子上的細胞屬於死亡狀態，反之 1 為存活狀態。

$1 \le N \le 2000$
$0 \le M \le 1000$

輸出格式

對於每一組測資輸出 $N$ 行，每一行上有 $N$ 個字元表示模擬 $M$ 次的最終盤面結果。

範例輸入 1

範例輸出 1

範例輸入 2

範例輸出 2

Solution

在還沒有做任何平行前，生命遊戲的模擬非常簡單，只需要統計鄰近八格的狀態即可，然而由於是常數範圍，直接打八個加法比起迴圈去累計八格的結果快上非常多，別忽視 for loop 在做 branch 的 cycle 數量。

樸素平行 Accepted (378 ms, 24 MB)
循環展開 Accepted (289 ms, 39 MB)

平行採用 big chunk 的方式，考慮到 cache miss 的關係，最好是每一個處理器都把相鄰的列放置在各自的 L1-L2-L3 cache，防止使用時不在自己的 cache，而產生嚴重的 cache miss。

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <assert.h>
#include <omp.h>
 
#define UINT unsigned long
#define MAXN 2048
char g[2][MAXN][MAXN];
int n, m;
int main() {
    while (scanf("%d %d", &n, &m) == 2) {
        while (getchar() != '\n');
 
        // read input
        for (int i = 1; i <= n; i++) {
            fgets(g[0][i]+1, MAXN, stdin);
            for (int j = 1; j <= n; j++)
                g[0][i][j] -= '0';
            g[0][i][n+1] = 0;
        }
 
        // game of life
        int flag = 0;
        int chunk = 64;
        for (int it = 0; it < m; it++) {
            #pragma omp parallel for schedule(static, chunk)
            for (int i = 1; i <= n; i++) {
                for (int j = 1; j <= n; j++) {
                    int adj = g[flag][i-1][j-1] + g[flag][i-1][j] + g[flag][i-1][j+1] +
                        g[flag][i  ][j-1] + g[flag][i  ][j+1] +
                        g[flag][i+1][j-1] + g[flag][i+1][j] + g[flag][i+1][j+1];
                    g[!flag][i][j] = (g[flag][i][j] == 0 && adj == 3) || 
                                        (g[flag][i][j] == 1 && (adj == 2 || adj == 3));
                }
            }
            flag = !flag;
        }
 
        // print result
        for (int i = 1; i <= n; i++) {
            for (int j = 1; j <= n; j++)
                g[flag][i][j] += '0';
            g[flag][i][n+1] = '\0';
            puts(g[flag][i]+1);
        }
    }
    return 0;
}

循環展開

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <assert.h>
#include <omp.h>
 
#define UINT unsigned long
#define MAXN 2048
char g[2][MAXN][MAXN];
int n, m;
int main() {
    while (scanf("%d %d", &n, &m) == 2) {
        while (getchar() != '\n');
 
        // read input
        for (int i = 1; i <= n; i++) {
            fgets(g[0][i]+1, MAXN, stdin);
            for (int j = 1; j <= n; j++)
                g[0][i][j] -= '0';
            g[0][i][n+1] = 0;
        }
 
        // game of life
#ifdef _OPENMP
        int chunk = n / omp_get_max_threads();
#endif
        for (int it = 0; it < m; it++) {
            #pragma omp parallel for schedule(static, chunk)
            for (int i = 1; i <= n; i++) {
                int adj, j = 1;
#define UNLOOP { \
    adj = g[0][i-1][j-1] + g[0][i-1][j] + g[0][i-1][j+1] + \
        g[0][i  ][j-1] + g[0][i  ][j+1] + \
        g[0][i+1][j-1] + g[0][i+1][j] + g[0][i+1][j+1]; \
    g[1][i][j] = (g[0][i][j] == 0 && adj == 3) || \
                    (g[0][i][j] == 1 && (adj == 2 || adj == 3)); j++; \
    }
#define UNLOOP4 {UNLOOP UNLOOP UNLOOP UNLOOP}
#define UNLOOP8 {UNLOOP4 UNLOOP4}
                for (; j + 8 <= n; )
                    UNLOOP8;
                for (; j <= n; )
                    UNLOOP;
            }
            #pragma omp parallel for schedule(static, chunk)
            for (int i = 1; i <= n; i++)
                memcpy(g[0][i], g[1][i], sizeof(g[0][0][0]) * (n+1));
        }
 
        // print result
        for (int i = 1; i <= n; i++) {
            for (int j = 1; j <= n; j++)
                g[0][i][j] += '0';
            g[0][i][n+1] = '\0';
            puts(g[0][i]+1);
        }
    }
    return 0;
}

Read More +

2016-04-30

學校課程/平行程式

批改娘 10026. Fast N-Queen

題目描述

相信 n-Queen 問題對每個研究 backtracking 的人來講都不陌生，這個問題是要在一個 $n \times n$ 大小的棋盤上擺n個皇后，讓她們不會互相攻擊到。為了讓這個問題更難一點，我們設計了一些障礙物在棋盤上，在這些點上不能放皇后。請留意這些障礙物並不會防止皇后被攻擊。

在傳統的 8-Queen 問題中，旋轉與鏡射被視為不同解法，因此我們有 92 種可能的方式來放置皇后。

輸入格式

輸入的資料最多包含 10 筆測試個案，每個測試個案的第一行會有一個整數 $n$ ($3 < n < 20$)，接下來的 $n$ 行代表棋盤資料，點號 '.' 代表空的盤格，星號 '*' 代表放有障礙物的盤格。

輸出格式

對每筆測試個案，輸出這是第幾個 case 以及這個 case 有幾種可能的放置方式。

範例輸入

8
........
........
........
........
........
........
........
........
4
.*..
....
....
....

範例輸出

1 2	Case 1: 92 Case 2: 1

Solution

首先，必須先認識最快的 N 皇后問題可以利用 bitmask 的技術，把過多的 memory access 的指令捨去而加速。

樸素平行 Accepted (2766 ms, 512 KB)
負載平衡 Accepted (1771 ms, 5248 KB)

樸素平行

在大多數的教科書上，平行只針對第一個列的位置平行，因此平行度最多 $N$，針對第一列每一個位置都分別開一個 thread 找解，這樣運算時間相當於跑最慢的那一個 thread。

#include <stdio.h>
int y_valid[32] = {};
int dfs(int lv, int row, int dia1, int dia2, int mask) {
    if (row == mask)
        return 1;
    int pos = mask & (~(row | dia1 | dia2)) & y_valid[lv], p;
    int ret = 0;
    while (pos) {
        p = pos & (-pos);
        pos -= p;
        ret += dfs(lv+1, row|p, (dia1|p)<<1, (dia2|p)>>1, mask);
    }
    return ret;
}
int totalNQueens(int n) {
    int sum = 0;
    int chunk = 1;
    int row = 0, dia1 = 0, dia2 = 0, mask = (1<<n)-1, lv = 0;
    int pos = mask & (~(row | dia1 | dia2)) & y_valid[0], p;
    #pragma omp parallel for schedule(dynamic, chunk) reduction(+:sum)
    for (int i = 0; i < n; i++) {
        if ((pos>>i)&1) {
            p = 1<<i;
            int t = dfs(lv+1, row|p, (dia1|p)<<1, (dia2|p)>>1, mask);
            sum += t;
        }
    }
    return sum;
}
int main() {
    int cases = 0, n;
    char s[32] = {};
    while (scanf("%d", &n) == 1 && n) {
        for (int i = 0; i < n; i++) {
            scanf("%s", s);
            y_valid[i] = (1<<n)-1;
            for (int j = 0; j < n; j++) {
                if (s[j] == '*')
                    y_valid[i] ^= (1<<j);
            }
        }
        int ret = totalNQueens(n);
        printf("Case %d: %d\n", ++cases, ret);
    }
    return 0;
}

負載平衡

為了達到每一個 thread 盡可能分到相同的工作量，避免最慘的那一條有太多的解而很慢，做好細粒度 (fine-grain) 的分配，如此一來工作量就很高的機率會相同。

因此，一開始先進行廣度搜索，把前幾列的解展開，也許會獲得 $N^x$ 種盤面，這時候平均切給 $M$ 條 thread，分配的工作量平衡上就有比較好的展現。

#include <stdio.h>
#define MAXQ 16384
#define MAXN 32
int y_valid[32] = {};
typedef struct Board {
    int row, dia1, dia2;
} Board;
Board R[MAXQ*MAXN];
int dfs(int lv, int row, int dia1, int dia2, int mask) {
    if (row == mask)
        return 1;
    int pos = mask & (~(row | dia1 | dia2)) & y_valid[lv], p;
    int ret = 0;
    while (pos) {
        p = pos & (-pos);
        pos -= p;
        ret += dfs(lv+1, row|p, (dia1|p)<<1, (dia2|p)>>1, mask);
    }
    return ret;
}
int ida_dep, ida_cnt;
void IDA(int lv, int row, int dia1, int dia2, int mask) {
    if (lv == ida_dep) {
        Board b = {.row = row, .dia1 = dia1, .dia2 = dia2};
        R[ida_cnt++] = b;
        return ;
    }
    int pos = mask & (~(row | dia1 | dia2)) & y_valid[lv], p;
    while (pos) {
        p = pos & (-pos);
        pos -= p;
        IDA(lv+1, row|p, (dia1|p)<<1, (dia2|p)>>1, mask);
    }
}
int totalNQueens(int n) {
    int it = 0;
    int row = 0, dia1 = 0, dia2 = 0, mask = (1<<n)-1, lv = 0;
    for (it = 1; it <= n; it++) {
        ida_dep = it, ida_cnt = 0;
        IDA(0, row, dia1, dia2, mask);
        if (ida_cnt >= MAXQ) {
            int sum = 0;
            int chunk = 1024;
            #pragma omp parallel for schedule(static, chunk) reduction(+:sum)
            for (int i = 0; i < ida_cnt; i++)
                sum += dfs(it, R[i].row, R[i].dia1, R[i].dia2, mask);
            return sum;
        }
    }
    return ida_cnt;
}
int main() {
    int cases = 0, n;
    char s[32] = {};
    while (scanf("%d", &n) == 1 && n) {
        for (int i = 0; i < n; i++) {
            scanf("%s", s);
            y_valid[i] = (1<<n)-1;
            for (int j = 0; j < n; j++) {
                if (s[j] == '*')
                    y_valid[i] ^= (1<<j);
            }
        }
        int ret = totalNQueens(n);
        printf("Case %d: %d\n", ++cases, ret);
    }
    return 0;
}

Read More +

2016-04-30

學校課程/平行程式

批改娘 10025. Fast Image Match (OpenMP)

背景

圖片匹配和字串匹配有一點不同，字串匹配通常要求其子字串與搜尋字串完全相符，而圖片匹配則用相似度為依據，當圖片大、複雜且具有干擾，或者需要匹配數量非常多，更先進的領域會利用特徵擷取，用機率統計的方式來篩選可能的匹配數量，篩選過後才進行圖片的細節匹配。

題目描述

給予兩個圖片 $A, B$，圖片格式為灰階影像，每個像素 $\mathit{pixel}(x, y)$ 採用 8-bits 表示，範圍為 $\mathit{pixel}(x, y) \in [0, 255]$。

舉一個例子，有一個 $3 \times 3$ 的圖片 $A$ 和一個 $2 \times 2$ 的圖片 $B$，用矩陣表示如下：

$$A := \begin{bmatrix} a1 & a2 & a3 \\ a4 & a5 & a6 \\ a7 & a8 & a9 \end{bmatrix} ,\; B := \begin{bmatrix} b1 & b2\\ b3 & b4\\ \end{bmatrix}$$

假設左上角座標 $(1, 1)$ 即 $a1$ 的位置、$(1, 2)$ 即 $a2$ 的位置。

若把影像 $B$ 左上角對齊 $A$ 的 $(1, 1)$ 位置，其差異程度 $\mathit{diff}(A, B) = (a1 - b1)^2 + (a2 - b2)^2 + (a4 - b3)^2 + (a5 - b4)^2$
相同地，對齊 $(2, 1)$ 位置，其差異程度 $\mathit{diff}(A, B) = (a4 - b1)^2 + (a5 - b2)^2 + (a7 - b3)^2 + (a8 - b4)^2$

比較時，整張 $B$ 都要落在 $A$ 中。現在要找到一個對齊位置 $(x, y)$，使得 $\mathit{diff}(A, B)$ 最小。

輸入格式

多組測資，每一組第一行有四個整數 $A_H, A_W, B_H, B_W$，分別表示圖片 $A$ 的高寬、$B$ 的高寬。

接下來會有 $A_H$ 行，每一行上會有 $A_W$ 個整數，第 $i$ 行上的第 $j$ 個整數 $x$ 表示 $A(i, j)$ 的灰階像素值。同樣地，接下來會有 $B_H$ 行，每一行上會有 $B_W$ 個整數，第 $i$ 行上的第 $j$ 個整數 $x$ 表示 $B(i, j)$ 的灰階像素值。

$ 1\le A_H, A_W, B_H, B_W \le 500$
$ B_H \le A_H, B_W \le A_W$
$ 0 \le x \le 255$

輸出格式

對於每一組測資輸出一行，兩個整數 $x, y$，滿足 $\mathit{diff}(A, B)$ 最小。若有相同情況滿足，則優先挑選 $x$ 最小，若 $x$ 仍相同，則挑選 $y$ 最小。

範例輸入

範例輸出

1
2

3 3
1 1

Solution

樸素平行

若圖片是 $N \times N$，樸素算法運作要 $\mathcal{O}(N^4)$，傅立葉算法要 $\mathcal{O}(N^2 \log N)$，這裡針對樸素算法平行。從最高層次的找解平行，每一個匹配位置都要耗費 $\mathcal{O}(M^2)$ 計算相似性，又由於平行限制跟核心數有關，那麼針對每一個 thread 要求找到好幾個 row 的解，分別存在各自的答案陣列區中，之後再線性跑一次取最小值，利用額外的空間儲存，避免在運算過程中遇到 critical section。

#pragma omp parallel for schedule(dynamic, chunk) shared(A, B)
for (int i = 0; i <= Ch; i++) {
    for (int j = 0; j <= Cw; j++) {
        long long d = 0;
        for (int p = 0; p < Bh; p++) {
            for (int q = 0; q < Bw; q++) {
                d += (long long) (B[p][q] - A[i+p][j+q])*(B[p][q] - A[i+p][j+q]);
            }                   
        }
        if (d < diff)
        {
            #pragma omp critical
            if (d < diff)
                diff = d, bx = i, by = j;
        }
    }
}

就是上列的寫法會卡在 critical section，倒不如存在另一個陣列中，效能還差距個幾百毫秒。

#include <stdio.h>
#include <limits.h>
#define MAXN 512
int A[MAXN][MAXN], B[MAXN][MAXN];
int C[MAXN][MAXN];
int main() {
    int Ah, Aw, Bh, Bw, x;
    int cases = 0;
    while(scanf("%d %d %d %d", &Ah, &Aw, &Bh, &Bw) == 4) {
        for (int i = 0; i < Ah; i++)
            for (int j = 0; j < Aw; j++)
                scanf("%d", &x), A[i][j] = x;
        for (int i = 0; i < Bh; i++)
            for (int j = 0; j < Bw; j++)
                scanf("%d", &x), B[i][j] = x;
        int Ch = Ah - Bh, Cw = Aw - Bw;
        int bx = -1, by = -1;
        int diff = INT_MAX;
        #pragma omp parallel for
        for (int i = 0; i <= Ch; i++) {
            for (int j = 0; j <= Cw; j++) {
                int d = 0;
                for (int p = 0; p < Bh; p++) {
                    for (int q = 0; q < Bw; q++) {
                        d += (B[p][q] - A[i+p][j+q])*(B[p][q] - A[i+p][j+q]);
                    }                    
                }
                C[i][j] = d;
            }
        }
        for (int i = 0; i <= Ch; i++) {
            for (int j = 0; j <= Cw; j++)
                if (C[i][j] < diff)
                    diff = C[i][j], bx = i, by = j;
        }
 
        printf("%d %d\n", bx + 1, by + 1);
    }
    return 0;
}

FFT

儘管使用快速傅立葉 (FFT) 已經達到 $\mathcal{O}(N \log N)$ 的時間複雜度的極致，只比平行樸素算法快個兩到三倍，加入平行之後，效能可以達到五到六倍加速。

FFT 用到大量的三角函數計算，一般而言若使用序列化的乘法代替三角函數計算 (疊加角度) 會比重新計算快上很多，但這樣會損失一大部分的精準度 (但速度快很多)，而且會產生很多相互依賴的指令，這也導致不容易對迴圈平行。

如果需要平行，每一個三角函數可以預先存在陣列中，每一個 thread 直接存取陣列中的元素即可，但麻煩的地方在於最好使用多核處理器，然而跨越多個處理器有可能會因為快取 … 等機制，導致速度下降，因此這裡採用單一處理器六核心 (只有三個是 physical core)，因此 thread 限定在五個以內。

#include <stdio.h>
#include <limits.h>
#include <stdbool.h>
#include <math.h>
#include <string.h>
#include <omp.h>
// FFT header
#define MAXN 262144
typedef struct {
    float x, y;
} complex;
typedef unsigned int UINT32;
typedef struct {
    int (* const NumberOfBitsNeeded) (int);
    UINT32 (* const FastReverseBits) (UINT32, int);
    void (* const FFT) (bool, complex[], complex[], int);
    void (* const convolution) (double *, double *, int , double *);
} FFT_namespace;
static inline complex init_complex(double a, double b) {
    complex t = {a, b};
    return t;
}
static inline complex add_complex(complex *a, complex *b) {
    return init_complex(a->x + b->x, a->y + b->y);
}
static inline complex sub_complex(complex *a, complex *b) {
    return init_complex(a->x - b->x, a->y - b->y);
}
static inline complex mul_complex(complex *a, complex *b) {
    return init_complex(a->x * b->x - a->y * b->y, a->x * b->y+ a->y * b->x);
}
static complex pre_theta[2][MAXN];
void prework(int n) {
    static int pre_n = 0;
    static const double PI = acos(-1);
    if (pre_n == n)
        return ;
    pre_n = n;
    pre_theta[0][0] = init_complex(1, 0);
    pre_theta[1][0] = init_complex(1, 0);
    #pragma omp parallel for
    for (register int i = 1; i < n; i++) {
        pre_theta[0][i] = init_complex(cos(2*i*PI / n ) , sin(2*i*PI / n ));
        pre_theta[1][i] = init_complex(cos(2*i*PI / n ) , -sin(2*i*PI / n ));
    }
}
// FFT body
int NumberOfBitsNeeded(int PowerOfTwo) {
    for (int i = 0;; ++i) {
        if (PowerOfTwo & (1 << i)) {
            return i;
        }
    }
}
UINT32 FastReverseBits(UINT32 a, int NumBits) {
    a = ( ( a & 0x55555555U ) << 1 ) | ( ( a & 0xAAAAAAAAU ) >> 1 ) ;
    a = ( ( a & 0x33333333U ) << 2 ) | ( ( a & 0xCCCCCCCCU ) >> 2 ) ;
    a = ( ( a & 0x0F0F0F0FU ) << 4 ) | ( ( a & 0xF0F0F0F0U ) >> 4 ) ;
    a = ( ( a & 0x00FF00FFU ) << 8 ) | ( ( a & 0xFF00FF00U ) >> 8 ) ;
    a = ( ( a & 0x0000FFFFU ) << 16 ) | ( ( a & 0xFFFF0000U ) >> 16 ) ;
    return a >> (32 - NumBits);
}
void _FFT(bool InverseTransform, complex In[], complex Out[], int n) {
    // simultaneous data copy and bit-reversal ordering into outputs
    int NumSamples = n;
    int NumBits = NumberOfBitsNeeded(NumSamples);
    for (int i = 0; i < NumSamples; ++i)
        Out[FastReverseBits(i, NumBits)] = In[i];
    // the FFT process
    #pragma omp parallel
    for (int i = 1; i <= NumBits; i++) {
        int BlockSize = 1<<i, BlockEnd = BlockSize>>1, BlockCnt = NumSamples/BlockSize;
        #pragma omp for
        for (int j = 0; j < NumSamples; j += BlockSize) {
            complex *t = pre_theta[InverseTransform];
            for (int k = 0; k < BlockEnd; k++, t += BlockCnt) {
                complex a = mul_complex(&(*t), &Out[k+j+BlockEnd]);
                Out[k+j+BlockEnd] = sub_complex(&Out[k+j], &a);
                Out[k+j] = add_complex(&Out[k+j], &a);
            }
        }
    }
    
    // normalize if inverse transform
    if (InverseTransform) {
        #pragma omp parallel for
        for (int i = 0; i < NumSamples; ++i) {
            Out[i].x = Out[i].x / NumSamples;
        }
    }
}
void convolution(double *a, double *b, int n, double *c) {
    static complex s[MAXN], d1[MAXN], d2[MAXN], y[MAXN];
    prework(n);
    #pragma omp parallel for
    for (int i = 0; i < n; ++i)
        s[i] = init_complex(a[i], 0);
    _FFT(false, s, d1, n);
    s[0] = init_complex(b[0], 0);
    #pragma omp parallel for
    for (int i = 1; i < n; ++i)
        s[i] = init_complex(b[n - i], 0);
    _FFT(false, s, d2, n);
    #pragma omp parallel for
    for (int i = 0; i < n; ++i)
        y[i] = mul_complex(&d1[i], &d2[i]);
    _FFT(true, y, s, n);
    #pragma omp parallel for
    for (int i = 0; i < n; ++i)
        c[i] = s[i].x;
}
FFT_namespace const FFT = {NumberOfBitsNeeded, FastReverseBits, _FFT, convolution};
// FFT end
double a[MAXN], b[MAXN], c[MAXN];
long long sum[512][512];
long long getArea(int lx, int ly, int rx, int ry) {
    long long ret = sum[rx][ry];
    if (lx) ret -= sum[lx-1][ry];
    if (ly) ret -= sum[rx][ly-1];
    if (lx && ly)   ret += sum[lx-1][ly-1];
    return ret;
}
int main() {
    omp_set_num_threads(5);
    int m, n, p, q, x, N, M, S;
    while (scanf("%d %d %d %d", &m, &n, &p, &q) == 4) {
        N = m > p ? m : p, M = n > q ? n : q;
        S = 1;
        for (; S < N*M; S <<= 1);
        memset(a, 0, sizeof(a[0]) * S);
        memset(b, 0, sizeof(b[0]) * S);
        for (int i = 0; i < m; i++) {
            long long s = 0;
            for (int j = 0; j < n; j++) {
                scanf("%d", &x);
                a[i*M+j] = x;
                s += x*x;
                sum[i][j] = (i > 0 ? sum[i-1][j] : 0) + s;          
            }
        }
        for (int i = 0; i < p; i++) {
            for (int j = 0; j < q; j++) {
                scanf("%d", &x);
                b[i*M+j] = x;
            }
        }        
        FFT.convolution(a, b, S, c);
        int qx = m - p, qy = n - q, bX = INT_MAX, bY = INT_MAX;
        long long diff = LONG_MAX;
        #pragma omp parallel
        {
            long long t_diff = LONG_MAX;
            int t_bX = INT_MAX, t_bY = INT_MAX;
            #pragma omp for
            for (int i = 0; i <= qx; i++)    {
                for (int j = 0; j <= qy; j++) {
                    long long v = getArea(i, j, i+p-1, j+q-1) - 2*c[i*M + j] + 0.5;
                    if (v < t_diff)
                        t_diff = v, t_bX = i, t_bY = j;
                }
            }
            #pragma omp critical
            {
                if (t_diff < diff)
                    diff = t_diff, bX = t_bX, bY = t_bY;
                if (t_diff == diff && (t_bX < bX || (t_bX == bX && t_bY < bY))) {
                    bX = t_bX, bY = t_bY;
                }
            }
        }
        printf("%d %d\n", bX+1, bY+1);
    }
    return 0;
}

Read More +

2016-04-20

學校課程/平行程式

批改娘 10022. Fast Matrix Multiplication

背景

題目描述

相信不少人都已經實作所謂的矩陣乘法，計算兩個方陣大小為 $N \times N$ 的矩陣 $A, B$。為了方便起見，提供一個偽隨機數的生成，減少在輸入處理浪費的時間，同時也減少上傳測資的辛苦。

根據種子 $c = S1$ 生成矩陣 $A$，種子 $c = S2$ 生成矩陣 $B$，計算矩陣相乘 $A \times B$，為了方便起見，使用 hash 函數進行簽章，最後輸出一個值。由於會牽涉到 overflow 問題，此題作為快取實驗就不考慮這個，overflow 問題都會相同。

matrix.h

1	void multiply(int N, unsigned long A[][2048], unsigned long B[][2048], unsigned long C[][2048]);

matrix.c

#include "matrix.h"
 
void multiply(int N, unsigned long A[][2048], unsigned long B[][2048], unsigned long C[][2048]) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            unsigned long sum = 0;    // overflow, let it go.
            for (int k = 0; k < N; k++)
                sum += A[i][k] * B[k][j];
            C[i][j] = sum;
        }
    }
}

main.c

#include <stdio.h>
#include "matrix.h"
// #define DEBUG
#define UINT unsigned long 
#define MAXN 2048
void rand_gen(UINT c, int N, UINT A[][MAXN]) {
    UINT x = 2, n = N*N;
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            x = (x * x + c + i + j)%n;
            A[i][j] = x;
        }
    }
}
void print_matrix(int N, UINT A[][MAXN]) {
    for (int i = 0; i < N; i++) {
        fprintf(stderr, "[");
        for (int j = 0; j < N; j++)
            fprintf(stderr, " %u", A[i][j]);
        fprintf(stderr, " ]\n");
    }
}
UINT hash(UINT x) {
    return (x * 2654435761LU);
}
UINT signature(int N, UINT A[][MAXN]) {
    UINT h = 0;
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++)
            h = hash(h + A[i][j]);
    }
    return h;
}
UINT A[MAXN][MAXN], B[MAXN][MAXN], C[MAXN][MAXN];
int main() {
    int N, S1, S2;
    while (scanf("%d %d %d", &N, &S1, &S2) == 3) {
        rand_gen(S1, N, A);
        rand_gen(S2, N, B);
        multiply(N, A, B, C);
#ifdef DEBUG
        print_matrix(N, A);
        print_matrix(N, B);
        print_matrix(N, C);
#endif
        printf("%u\n", signature(N, C));
    }
    return 0;
}

輸入格式

多組測資，每組測資一行三個整數 $N, S1, S2$。

$1 \le N \le 1000$
$0 \le S1, S2 \le 65536$

輸出格式

每組測資輸出一行，一個整數 signature 的結果。

範例輸入

1 2	2 2 5 2 2 5

範例輸出

1 2	573770929 573770929

解釋

範例輸入產生 $2 \times 2$ 的矩陣。

$$A = \begin{bmatrix} 2 & 3\\ 0 & 0 \end{bmatrix} , B = \begin{bmatrix} 1 & 3\\ 3 & 0 \end{bmatrix} , A \times B = \begin{bmatrix} 11 & 6\\ 0 & 0 \end{bmatrix}$$

備註

2016/02/17 加入平行程式設計 OpenMP 部份，並增加時間限制！

Solution

解法跟 thread 寫法一樣，用 OpenMP 的好處在於不用處理那些 thread 的設定，用 OpenMP 提供的前處理完成執行緒的建造和移除操作。

在這裡特別提供達夫裝置 (Duff’s device) 對於迴圈展開 loop unrolling 的撰寫方式，巧妙地運用 C 語言中的 switch，相比一般寫法需要兩次迴圈處理，最後一個迴圈必須處理剩餘不整除的部分。就實作看起來，在現在版本 4.8 gcc 編譯結果下，效能沒有明顯的差別。

在高等編譯器課程中，聽說迴圈展開的數目最好不是 $2^n$，某些情況會造成 alignment 的 cache miss 的嚴重影響，實際情況還是要看怎麼運用，在這裡就不多做嘗試。

matrix.h

1
2
3

#define UINT unsigned long 
 
void multiply(int N, unsigned long A[][2048], unsigned long B[][2048], unsigned long C[][2048]);

matrix.c (達夫裝置)

#include "matrix.h"
#define LOOP_UNROLL 8
void transpose(int N, UINT A[][2048]) {
    UINT t;
    for (int i = 0; i < N; i++)
        for (int j = i+1; j < N; j++)
            t = A[i][j], A[i][j] = A[j][i], A[j][i] = t;
}
void multiply(int N, UINT A[][2048], UINT B[][2048], UINT C[][2048]) {
    transpose(N, B);
    #pragma omp parallel for
    for (int i = N-1; i >= 0; i--) {
        for (int j = N-1; j >= 0; j--) {
            register UINT sum = 0;
            UINT *a = &A[i][0], *b = &B[j][0];
            int k = N;
            switch (k % LOOP_UNROLL) {
                case 0: do { sum += *a * *b, a++, b++;
                case 7: sum += *a * *b, a++, b++;
                case 6: sum += *a * *b, a++, b++;
                case 5: sum += *a * *b, a++, b++;
                case 4: sum += *a * *b, a++, b++;
                case 3: sum += *a * *b, a++, b++;
                case 2: sum += *a * *b, a++, b++;
                case 1: sum += *a * *b, a++, b++;
                } while ((k -= LOOP_UNROLL) > 0);
            }
            C[i][j] = sum;
        }
    }
}

matrix.c (迴圈展開)

#include "matrix.h"
#define LOOP_UNROLL 8
void transpose(int N, UINT A[][2048]) {
    int chunk = 8;
    #pragma omp parallel for schedule(dynamic, chunk)
    for (int i = 0; i < N; i++) {
        for (int j = i+1; j < N; j++) {
            UINT t;    
            t = A[i][j], A[i][j] = A[j][i], A[j][i] = t;
        }
    }
}
void multiply(int N, UINT A[][2048], UINT B[][2048], UINT C[][2048]) {
    transpose(N, B);
    int chunk = 8;
    #pragma omp parallel for schedule(dynamic, chunk) shared(C)
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            register UINT sum = 0;
            UINT *a = &A[i][0], *b = &B[j][0];
            int k;
            for (k = 0; k+LOOP_UNROLL < N; k += LOOP_UNROLL) {
                sum += *a * *b, a++, b++;
                sum += *a * *b, a++, b++;
                sum += *a * *b, a++, b++;
                sum += *a * *b, a++, b++;
                sum += *a * *b, a++, b++;
                sum += *a * *b, a++, b++;
                sum += *a * *b, a++, b++;
                sum += *a * *b, a++, b++;
            }
            for (; k < N; k++)
                sum += *a * *b, a++, b++;
            C[i][j] = sum;
        }
    }
}

Read More +

2016-04-20

學校課程/平行程式

批改娘 10080. Fast Matrix Multiplication (pthread)

題目描述

matrix.h

1	void multiply(int N, unsigned long A[][2048], unsigned long B[][2048], unsigned long C[][2048]);

matrix.c

#include "matrix.h"
 
void multiply(int N, unsigned long A[][2048], unsigned long B[][2048], unsigned long C[][2048]) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            unsigned long sum = 0;    // overflow, let it go.
            for (int k = 0; k < N; k++)
                sum += A[i][k] * B[k][j];
            C[i][j] = sum;
        }
    }
}

main.c

#include <stdio.h>
#include "matrix.h"
// #define DEBUG
#define UINT unsigned long 
#define MAXN 2048
void rand_gen(UINT c, int N, UINT A[][MAXN]) {
    UINT x = 2, n = N*N;
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            x = (x * x + c + i + j)%n;
            A[i][j] = x;
        }
    }
}
void print_matrix(int N, UINT A[][MAXN]) {
    for (int i = 0; i < N; i++) {
        fprintf(stderr, "[");
        for (int j = 0; j < N; j++)
            fprintf(stderr, " %u", A[i][j]);
        fprintf(stderr, " ]\n");
    }
}
UINT hash(UINT x) {
    return (x * 2654435761LU);
}
UINT signature(int N, UINT A[][MAXN]) {
    UINT h = 0;
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++)
            h = hash(h + A[i][j]);
    }
    return h;
}
static UINT A[MAXN][MAXN], B[MAXN][MAXN], C[MAXN][MAXN];
int main() {
    int N, S1, S2;
    while (scanf("%d %d %d", &N, &S1, &S2) == 3) {
        rand_gen(S1, N, A);
        rand_gen(S2, N, B);
        multiply(N, A, B, C);
#ifdef DEBUG
        print_matrix(N, A);
        print_matrix(N, B);
        print_matrix(N, C);
#endif
        printf("%u\n", signature(N, C));
    }
    return 0;
}

輸入格式

多組測資，每組測資一行三個整數 $N, S1, S2$。

$1 \le N \le 1000$
$0 \le S1, S2 \le 65536$

輸出格式

每組測資輸出一行，一個整數 signature 的結果。

範例輸入

1 2	2 2 5 2 2 5

範例輸出

1 2	573770929 573770929

解釋

範例輸入產生 $2 \times 2$ 的矩陣。

$$A = \begin{bmatrix} 2 & 3\\ 0 & 0 \end{bmatrix} , B = \begin{bmatrix} 1 & 3\\ 3 & 0 \end{bmatrix} , A \times B = \begin{bmatrix} 11 & 6\\ 0 & 0 \end{bmatrix}$$

Makefile

1 2	matrix.out: matrix.h matrix.c main.c gcc -std=c99 -O2 matrix.h matrix.c main.c -o matrix -lpthread -lm

Solution

針對矩陣乘法平行很簡單，在 CPU 上由於需要搬運資料，不一定開的 thread 數量接近 core 數量就一定最好。而在平行的部分要把握資料局部性的加速，以下實作根據最終答案進行 row-based 策略切割，每一個 thread 計算那一個 row 的所有答案。

matrix.h

1
2
3

#define UINT unsigned long 
 
void multiply(int N, unsigned long A[][2048], unsigned long B[][2048], unsigned long C[][2048]);

matrix.c

#include "matrix.h"
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define LOOP_UNROLL 8
#define MAXTHREAD 8
#define MAXN 2048
typedef struct Argu {
    int l, r, N;
    UINT *A, *B, *C;
} Argu;
static void transpose(int N, UINT A[][2048]) {
    UINT t;
    for (int i = 0; i < N; i++) {
        for (int j = i+1; j < N; j++)
                t = A[i][j], A[i][j] = A[j][i], A[j][i] = t;
    }
}
void* thread_multiply(void *arg) {
    Argu data = * ((Argu *) arg);
    int l = data.l, r = data.r;
    int N = data.N;
    UINT *A = data.A, *B = data.B, *C = data.C;
    UINT stkA[MAXN];
    for (int i = l; i <= r; i++) {
        memcpy(stkA, A + (i * MAXN), sizeof(UINT) * N);
        for (int j = 0; j < N; j++) {
            UINT sum = 0;
//            UINT *a = A + (i * MAXN), *b = B + (j *  MAXN);
            UINT *a = stkA, *b = B + (j * MAXN);
            int k;
            for (k = 0; k+LOOP_UNROLL < N; k += LOOP_UNROLL) {
                sum += *a * *b, a++, b++;
                sum += *a * *b, a++, b++;
                sum += *a * *b, a++, b++;
                sum += *a * *b, a++, b++;
                sum += *a * *b, a++, b++;
                sum += *a * *b, a++, b++;
                sum += *a * *b, a++, b++;
                sum += *a * *b, a++, b++;
            }
            for (; k < N; k++)
                sum += *a * *b, a++, b++;
            *(C + (i * MAXN + j)) = sum;
        }
    }
    return NULL;
}
void multiply(int N, UINT A[][2048], UINT B[][2048], UINT C[][2048]) {
    transpose(N, B);
 
    pthread_t *threads;
    threads = (pthread_t *) malloc(MAXTHREAD * sizeof(pthread_t));
 
    for (int i = 0; i < MAXTHREAD; i++) {
        Argu *data = (Argu *) malloc(sizeof(Argu));
        data->l = i * N / MAXTHREAD;
        data->r = (i+1) * N / MAXTHREAD - 1;
        data->N = N;
        data->A = &A[0][0], data->B = &B[0][0], data->C = &C[0][0];
        pthread_create(&threads[i], NULL, thread_multiply, (void *) (data));
    }
 
    for (int i = 0; i < MAXTHREAD; i++) 
        pthread_join(threads[i], NULL);
    return ;
}

Read More +

2016-04-20

學校課程/平行程式

批改娘 10086. Red/Blue Computation

題目描述

模擬工作流程，在一個 $N \times N$ 的網格 (左邊界可以通到右邊界，上邊界可以通到下邊界) 上有三種狀態紅 $R$, 藍 $B$, 空格 $W$，每次模擬分成兩個步驟

第一步，只有紅 $R$ 可移動，紅 $R$ 只能往右移動一格到空白 $W$ 的位置，否則仍在原處。
第二步，只有藍 $B$ 可移動，藍 $B$ 只能往下移動一格到空白 $W$ 的位置，否則仍在原處。

請問模擬 $M$ 次後盤面為何？

輸入格式

輸入只有一組測資，第一行有兩個整數 $N, \; M$，分別為盤面大小以及模擬次數。接下來會有 $N$ 行，每一行上會有 $N$ 個字元。

$1 \le N, M \le 1000$

輸出格式

輸出 $N \times N$ 盤面。

範例輸入

4 1
RRWR
WWBW
BWRW
WWWW

範例輸出

RWRR
WWWW
WWBR
BWWW

備註

Solution

模擬工作流程，平行計算下一個狀態，利用兩個滾動數組的方式交替使用。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MAXN 1024
#define C_WHITE 0
#define C_RED   1
#define C_BLUE  2
 
char g1[MAXN][MAXN], g2[MAXN][MAXN];
 
void moveRed(int n) {
    #pragma omp parallel for schedule(static)
    for (int i = 0; i < n; i++)
        memset(g2[i], C_WHITE, sizeof(char) * n);
    #pragma omp parallel for schedule(static)
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            if (g1[i][j] == C_BLUE) {
                g2[i][j] = C_BLUE;
            } else if (g1[i][j] == C_RED) {
                int next = j+1 == n ? 0 : j+1;
                if (g1[i][next] == C_WHITE)
                    g2[i][next] = C_RED;
                else
                    g2[i][j] = C_RED;
            }
        }
    }
}
void moveBlue(int n) {
    #pragma omp parallel for schedule(static)
    for (int i = 0; i < n; i++)
        memset(g1[i], C_WHITE, sizeof(char) * n);
    #pragma omp parallel for schedule(static)
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            if (g2[i][j] == C_RED) {
                g1[i][j] = C_RED;
            } else if (g2[i][j] == C_BLUE) {
                int next = i+1 == n ? 0 : i+1;
                if (g2[next][j] == C_WHITE)
                    g1[next][j] = C_BLUE;
                else
                    g1[i][j] = C_BLUE;
            }
        }
    }
}
static inline int toIndex(char c) {
    if (c == 'W')    return C_WHITE;
    if (c == 'R')    return C_RED;
    if (c == 'B')    return C_BLUE;
    fprintf(stderr, "[WARNING] %s %d\n", __FILE__, __LINE__);
}
static void printBoard(int n) {
    char color[4] = "WRB";
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++)
            putchar(color[g1[i][j]]);
        putchar('\n');
    }
}
int main() {
    int n, m;
    while (scanf("%d %d", &n, &m) == 2) {
        for (int i = 0; i < n; i++) {
            scanf("%s", &g1[i]);
            for (int j = 0; j < n; j++)
                g1[i][j] = toIndex(g1[i][j]);
        }
 
        for (int it = 0; it < m; it++) {
            moveRed(n);
            moveBlue(n);
        }
        printBoard(n);
    }
    return 0;
}
/*
4 1
RRWR
WWBW
BWRW
WWWW
*/

Read More +