2016-06-24

UVa 13015 - Promotions

Problem

有 $N$ 名員工，他們彼此之間會推薦、相互提拔對方，因此站在公司的角度來看，提拔順序不能違反這張圖的上下關係，亦即若要提拔某人，其推薦他的人全部都要被提拔。請問若要提拔 $A$ 個人，有哪些人一定會被提拔，同樣的，回答提拔 $B$ 個人的情況，最後回答，若提拔 $B$ 個人，誰一定不會被提拔？

Sample Input

Sample Output

1
2
3

2
4
3

Solution

一定是張 DAG，否則不能處理。對於輸入多存一張反向圖，接著每一次都去找其下屬節點有多少個，藉此構造出任何提拔方案中，最好和最差排名為多少。處理全部排名計算 $\mathcal{O}(V^2 E)$

#include <bits/stdc++.h>
using namespace std;
// O(VE)
const int MAXN = 5005;
int A, B, E, P;
vector<int> G[MAXN], invG[MAXN];
int used[MAXN] = {}, cases = 0;
int dfs(int u, vector<int> g[]) {
    if (used[u] == cases)	return 0;
    used[u] = cases;
    int ret = 1;
    for (auto v : g[u])
        ret += dfs(v, g);
    return ret;
}
int main() {
    int x, y;
    while (scanf("%d %d %d %d", &A, &B, &E, &P) == 4) {
        for (int i = 0; i < E; i++)
            G[i].clear(), invG[i].clear();
        // Must DAG
        for (int i = 0; i < P; i++) {
            scanf("%d %d", &x, &y);
            G[x].push_back(y), invG[y].push_back(x);
        }
        
        int retA = 0, retB = 0, retN = 0;
        for (int i = 0; i < E; i++) {
            int worst, best;
            cases++;
            worst = E - dfs(i, G) + 1;
            cases++;
            best = dfs(i, invG);
            if (worst <= A)	retA++;
            if (worst <= B)	retB++;
            if (best > B)	retN++;
        }
        printf("%d\n%d\n%d\n", retA, retB, retN);
    }
    return 0;
}

Read More +

2016-06-24

解題區/解題區 - UVa

UVa 13024 - Saint John Festival

Problem

給予 $N$ 個大天燈的所在位置，隨後有 $Q$ 個小天燈位置，施放天燈後，請問有多少小天燈處於任三個天燈構成的三角形內部？

Sample Input

Sample Output

Solution

由於只詢問任意三個點構成的三角形內部，貪心就能想到一定是挑選凸包上的三點，只需要判定某點是不是在凸包內部，由於所有天燈已經給定，只需要跑一次 $\mathcal{O}(N \log N)$ 凸包算法，接著詢問一點是否在凸包內，只需要 $\mathcal{O}(\log N)$。

#include <stdio.h>
#include <stdio.h>
#include <math.h>
#include <vector>
#include <assert.h>
#include <algorithm>
using namespace std;
#define eps 1e-10
struct Pt {
    double x, y;
    Pt(double a = 0, double b = 0):
    x(a), y(b) {}
    bool operator<(const Pt &a) const {
        if (fabs(x-a.x) > eps)	return x < a.x;
        return y < a.y;
    }
    bool operator==(const Pt &a) const {
        return fabs(x-a.x) < eps && fabs(y-a.y) < eps;
    }
    Pt operator+(const Pt &a) const {
        return Pt(x + a.x, y + a.y);
    }
    Pt operator-(const Pt &a) const {
        return Pt(x - a.x, y - a.y);
    }
    Pt operator/(const double val) const {
        return Pt(x / val, y / val);
    }
    Pt operator*(const double val) const {
        return Pt(x * val, y * val);
    }
};
double cross(Pt o, Pt a, Pt b) {
    return (a.x-o.x)*(b.y-o.y)-(a.y-o.y)*(b.x-o.x);
}
int monotone(int n, Pt p[], Pt ch[]) {
    sort(p, p+n);
    int i, m = 0, t;
    for (i = 0; i < n; i++) {
        while(m >= 2 && cross(ch[m-2], ch[m-1], p[i]) <= 0)
            m--;
        ch[m++] = p[i];
    }
    for (i = n-1, t = m+1; i >= 0; i--) {
        while(m >= t && cross(ch[m-2], ch[m-1], p[i]) <= 0)
            m--;
        ch[m++] = p[i];
    }
    return m-1;
}
double g(Pt a, Pt b, double x) {
    Pt vab = b - a;
    return a.y + vab.y * (x - a.x) / vab.x;
}
int inside_convex(const Pt &p, Pt ch[], int n) {
    if (n < 3)
        return false;
    if (cross(ch[0], p, ch[1]) > eps)
        return false;
    if (cross(ch[0], p, ch[n-1]) < -eps)
        return false;
    
    int l = 2, r = n-1;
    int line = -1;
    while (l <= r) {
        int mid = (l + r)>>1;
        if (cross(ch[0],p, ch[mid]) > -eps) {
            line = mid;
            r = mid - 1;
        } else l = mid + 1;
    }
    return cross(ch[line-1], p, ch[line]) < eps;
}
Pt D[131072], ch[262144];
int main() {
    int testcase, n, m;
    double x, y;
    while (scanf("%d", &n) == 1) {
        for (int i = 0; i < n; i++) {
            scanf("%lf %lf", &x, &y);
            D[i] = Pt(x, y);
        }
        n = monotone(n, D, ch);
        scanf("%d", &m);
        int ret = 0;
        for (int i = 0; i < m; i++) {
            scanf("%lf %lf", &x, &y);
            int f = inside_convex(Pt(x, y), ch, n);
            ret += f;
        }        
        printf("%d\n", ret);
    }
    return 0;
}

Read More +

2016-06-24

解題區/解題區 - UVa

UVa 13029 - Emoticons

Problem

給一個由 ^, _ 構成的字串，請湊出最多的 ^_^ 子字串。

Sample Input

5
_^^_^^_
^__^__^
______
^^__^^
^_^^_^

Sample Output

Case 1: 1
Case 2: 1
Case 3: 0
Case 4: 2
Case 5: 2

Solution

從左掃瞄到右邊，貪心著手容易得到兩種狀態

$h1$: ^ 的個數。
$h2$: 湊成 ^???_ 的個數，可以從 $h1$ 合併 _ 構成。

然而有一些特殊案例 ^_^_^^，需要重新排列，他們之間有巢狀關係 (^_(^_^)^)，因此建立狀態 $h3$ 為 ^_^???_，若遇到下一個 ^ 貪心拆解成 (^_(^_^)???)。

#include <bits/stdc++.h>
using namespace std;
char s[131072];
int main() {
    int testcase, cases = 0;
    scanf("%d", &testcase);
    while (testcase--) {
        scanf("%s", s);
        int n = strlen(s);
        int ret = 0;
        int h1 = 0, h2 = 0, h3 = 0;
        for (int i = 0; i < n; i++) {
            if (s[i] == '^' && h2) {
                h2--, ret++;
            } else if (s[i] == '^') {
                if (h3 && ret)
                    h2++, h3--;
                else
                    h1++;
            } else if (s[i] == '_' && h1) {
                h1--, h2++;
            } else if (s[i] == '_') {
                if (ret > h3)
                    h3++;
            }
        }
        printf("Case %d: %d\n", ++cases, ret);
    }
    return 0;
}

Read More +

2016-06-24

解題區/解題區 - UVa

UVa 13017 - Canvas Painting

Problem

有 $N$ 塊畫布排成一列，並且目標將每一塊畫布變成不同顏色，每一次可以將一個區間塗成相同顏色，花費便是區間 $C_i$ 總和，請問最少花費為何。

Sample Input

Sample Output

1
2

29
40

Solution

每一次最多完成一個不同顏色的畫布，那麼著色解法看來就是二元樹，每一個畫布處於葉節點，因此最少花費就相當於編碼長度乘上葉權重最少。

#include <bits/stdc++.h>
using namespace std;
// Huffman Coding, Greedy
// input A[1...n]
// minimum \sum A[i]*code[i]
int main() {
    int testcase;
    scanf("%d", &testcase);
    while (testcase--) {
        int n, x;
        long long t;
        multiset<long long> S;
        scanf("%d", &n);
        for (int i = 0; i < n; i++)
            scanf("%d", &x), S.insert(x);
            
        long long ret = 0;
        for (int i = n-2; i >= 0; i--) {
            long long a, b;
            a = *S.begin();
            S.erase(S.begin());
            b = *S.begin();
            S.erase(S.begin());
            ret += a+b;
            S.insert(a+b);
        }
        printf("%lld\n", ret);
    }
    return 0;
}

Read More +

2016-06-24

解題區/解題區 - UVa

UVa 13079 - On the beach

Problem

在炎炎夏日的海邊，需要建立地下通道，把所有海岸飯店聯通在一起，請問最少地下通道個數為何？

Sample Input

Sample Output

1
2
3

2
2
1

Solution

把這些區間排序，根據左端點由小到大開始貪心，可行解若看成一個區間，隨著區間加入，右端點會逐漸地靠近左端點，若產生無解，勢必要建立一個新通道。不斷地貪心，最後可得到最少通道個數。

#include <bits/stdc++.h>
using namespace std;
int main() {
    int N;
    while (scanf("%d", &N) == 1 && N) {
        vector<pair<int, int>> A;
        int L, R;
        for (int i = 0; i < N; i++) {
            scanf("%d %d", &L, &R);
            A.push_back(make_pair(L, R));
        }
        sort(A.begin(), A.end());
        
        int ret = 0;
        int PQ = INT_MAX;
        for (int i = 0; i < N; i++) {
            int line_x = A[i].first;
            if (PQ != INT_MAX && PQ <= line_x) {
                ret++;
                PQ = INT_MAX;
            }
            PQ = min(PQ, A[i].second);
        }
        if (PQ != INT_MAX)
            ret++;
        printf("%d\n", ret);
    }
    return 0;
}

Read More +

2016-06-24

解題區/解題區 - 其他題目

批改娘 10038. Fast Covering Problem

題目背景

終於把所有練習題都放上 Judge Girl，測資都已經確認過一遍，某 M 打算離開一陣子。「反正是個令人唾棄的助教吧 …」

題目描述

考試出題總很難讓所有人滿意，老師決定給予學生們選擇考試出題方向，但每一個人只能提出兩種意見，接著老師會出一套方案滿足每一位學生的其中一種意見。由於出一套題步驟繁瑣，把數個意見出在同一題非常困難，最後每一種意見將單獨被出成一道題。

由於助教們要負責出測資、檢驗測資正確與可行性，希望題目數量越少越好，否則助教會忙翻天。現在要找到最少題目來滿足所有學生的需求。

例如 :

現在有 4 名學生的意見
四名學生分別提案 $(0, 1), (100, 1), (100, 0), (100, 200)$
如果選擇 ${ 0, 100 }$ 或者是 ${ 1, 100 }$ 都是最好的選擇，助教只需要完成 2 題的檢驗。相反地，如果 ${0, 100, 200 }$ 雖然可以滿足所有學生，但需要出成 3 道題目。

輸入格式

只有一組測資，每組第一行會有一個整數 $N$，表示有多少位學生，接下來會有 $N$ 行，每 $i$ 行上會有兩個整數 $A_i, B_i$ 表示第 $i$ 個學生的出題意見。

$0 < N \le 1000$
$0 \le A_i, \; B_i \le 10000$
意見類型總數不超過 100 種

輸出格式

對於每一組測資輸出一行，表示最少要準備的方案數量能滿足所有學生。

範例輸入

範例輸出

提示

DLX

Solution

當年在 NCPC 搞不出來的 Problem I Christmas Gifts (NP-hard)，在賽後用 DLX 運行效果不錯，在啟發函數加上延遲標記更是屹立排名前數位已久。最近又因為平行把題目挖回來討論，在去年釣到大一學弟來解，便以飛快的速度擊破測資，最後達到加速 20x。再把當初需要跑 30 秒的測資來運行，現在只需要短短的 50ms。

原本要拿來出平行題目，看到這麼驚人的速度，想必就不要出題。

通用解法 DLX 加上啟發式函數就能解決最少重複覆蓋，然而在圖論的最少點集覆蓋問題中，性質又變得更加強烈。當不選某一個點時，必然與其相連的邊為了要覆蓋，另一端必然成為必選點。這時候搜索空間大幅度地下降。若在一般 DLX 算法中提及的最少可能的列中，窮舉某行來覆蓋一些列，那麼就很難看到搜索空間下降的性質。

因此，步驟簡單分成下列步驟：

將圖轉換成某行可以覆蓋哪幾列，轉換成 Dancing Links 的格式
找到可以覆蓋最多尚未覆蓋列最多的那一行
1. 選擇這一行，並且移除這一行所有覆蓋列，遞迴窮舉。
2. 移除這一行，選擇必選行，並嘗試移除等價行。

附錄：和交通大學謝旻錚(Min-Zheng Shieh) 教練的討論

在想確實能用 dancing links 來搞，還要搭配維護 degree order，不過這算法吳邦一老師是說 worst case 為 3 regular graph。
另外先前找過日本人 iwi 的論文，他也搞了個高級 VC solver 並做了測試，詳見論文 Branch-and-reduce exponential/FPT algorithms in practice: A case study of vertex cover, Takuya Akiba, Yoichi Iwata

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <limits.h>
#define MAXNODE 100000
#define MAXCOL 5005
#define MAXN 5005
struct DancingNode {
    int left, right, up, down;
    int ch, rh;
} DL[MAXNODE];
struct HelperNode {
    int head, size, next, prev;
} HN[MAXNODE];
int help_head;
int s[MAXCOL];
int head, size, Ans, Dep;
int markStk[MAXNODE], markIdx = -1;
void Remove(int c) {
    for (int i = DL[c].down; i != c; i = DL[i].down) {
    	if (HN[DL[i].rh].head == i)
    		HN[DL[i].rh].head = DL[i].left;
    	HN[DL[i].rh].size--;
        DL[DL[i].right].left = DL[i].left;
        DL[DL[i].left].right = DL[i].right;
        s[DL[i].ch]--;
    }
}
void Resume(int c) {
    for (int i = DL[c].down; i != c; i = DL[i].down) {
    	if (HN[DL[i].rh].head == i)
    		HN[DL[i].rh].head = DL[i].right;
    	HN[DL[i].rh].size++;
        DL[DL[i].right].left = i;
        DL[DL[i].left].right = i;
        s[DL[i].ch]++;
    }
}
void Reduce(int i) {
    int t = DL[i].rh;
    HN[HN[t].next].prev = HN[t].prev;
    HN[HN[t].prev].next = HN[t].next;
    s[DL[i].ch]--;
    DL[DL[i].down].up = DL[i].up;
    DL[DL[i].up].down = DL[i].down;
    for (int k = DL[i].right; k != i; k = DL[k].right) {
        DL[DL[k].down].up = DL[k].up;
        DL[DL[k].up].down = DL[k].down;
        s[DL[k].ch]--;
    }
}
void Recover(int i) {
    int t = DL[i].rh;
    HN[HN[t].next].prev = t;
    HN[HN[t].prev].next = t;
    s[DL[i].ch]++;
    DL[DL[i].down].up = i;
    DL[DL[i].up].down = i;
    for (int k = DL[i].right; k != i; k = DL[k].right) {
        DL[DL[k].down].up = k;
        DL[DL[k].up].down = k;
        s[DL[k].ch]++;
    }
}
void Select(int i) {
    int s = DL[i].rh;
    HN[HN[s].next].prev = HN[s].prev;
    HN[HN[s].prev].next = HN[s].next;
    Remove(i);
    for (int j = DL[i].right; j != i; j = DL[j].right)
        Remove(j);
    Dep++;
}
void Cancel(int i) {
    int s = DL[i].rh;
    for (int j = DL[i].right; j != i; j = DL[j].right)
        Resume(j);
    Resume(i);    
    HN[HN[s].next].prev = s;
    HN[HN[s].prev].next = s;
    Dep--;
}
int Decision() {
    int has = 0;
    for (int i = DL[head].right; i != head; i = DL[i].right) {
        if (s[i] == 1) {
            Select(DL[i].down);
            markStk[++markIdx] = DL[i].down;
            has = 1;
        }
    }
    return has;
}
int Subset(int x, int y) {	// 0: x in y, 1: y in x 
    assert(DL[x].ch == DL[y].ch);
    int a = 0, b = 0;
    int i, j;
    for (i = DL[x].right, j = DL[y].right; i != x && j != y; ) {
        if (DL[i].ch == DL[j].ch)
            i = DL[i].right, j = DL[j].right;
        else if (DL[i].ch < DL[j].ch)
            i = DL[i].right, a = 1;
        else
            j = DL[j].right, b = 1;
        if (a && b)
            break;
    }
    if (i != x)    a = 1;
    if (j != y)    b = 1;
    return a || b ? (a - b) : 1;
}
int Duplicate() {
    int has = 0;
    for (int i = DL[head].right; i != head; i = DL[i].right) {
        for (int j = DL[i].down; j != i; j = DL[j].down) {
            for (int k = DL[j].down; k != j && k != i; k = DL[k].down) {
                int cmp = Subset(j, k);
                if (cmp == 0)
                    continue;
                if (cmp == 1) {
                	markStk[++markIdx] = j;
                	Select(j);
            	} else {
            		markStk[++markIdx] = k;
                	Select(k);
                }
                has = 1;
            }
        }
    }
    return has;
}
int H(int limit) {
    static int c, ret, i, j, time = 0;
    static int used[MAXCOL] = {};
    for (c = DL[head].right, ++time, ret = 0; c != head; c = DL[c].right) {
        if (used[c] == time)
        	continue;
    	ret ++, used[c] = time;
    	if (ret >= limit)	return ret;
        for (i = DL[c].down; i != c; i = DL[i].down) {
            for (j = DL[i].right; j != i; j = DL[j].right)
            	used[DL[j].ch] = time;
        }
    }
    return ret;
}
void DFS() {
    int hval = H(Ans - Dep);
    if (DL[head].right == head && Dep < Ans)
    	Ans = Dep;
    if (Dep + hval >= Ans)
        return ;
    int cover_max = -1, cover_idx = -1;
    for (int i = HN[help_head].next; i != help_head; i = HN[i].next) {
        if (HN[i].size > cover_max) {
            cover_max = HN[i].size;
            cover_idx = HN[i].head;
        }
    }
    
    Select(cover_idx);
    DFS();
    Cancel(cover_idx);
    
    Reduce(cover_idx);
    int markEsp = markIdx;
    while (!Decision() && !Duplicate());
    DFS();
    while (markIdx > markEsp)
        Cancel(markStk[markIdx--]);
    Recover(cover_idx);
}
int new_node(int up, int down, int left, int right) {
    DL[size].up = up, DL[size].down = down;
    DL[size].left = left, DL[size].right = right;
    DL[up].down = DL[down].up = DL[left].right = DL[right].left = size;
    return size++;
}
void new_row(int n, int Row[], int rh) {
    int r, row = -1, k;
    int h = size;
    for (int i = 0; i < n; i++) {
        r = Row[i];
        DL[size].ch = r, s[r]++;
        DL[size].rh = rh;
        if (row == -1) {
            row = new_node(DL[DL[r].ch].up, DL[r].ch, size, size);
        } else {
            k = new_node(DL[DL[r].ch].up, DL[r].ch, DL[row].left, row);
        }
    }
    HN[rh].size = n;
    HN[rh].head = h;
    HN[rh].next = HN[help_head].next;
    HN[rh].prev = help_head;
    HN[HN[help_head].next].prev = rh;
    HN[help_head].next = rh;
}
void init(int m) {
    size = 0;
    help_head = 0, HN[help_head].next = HN[help_head].prev = help_head;
    head = new_node(0, 0, 0, 0);
    int i;
    for (i = 1; i <= m; i++) {
        new_node(i, i, DL[head].left, head);
        DL[i].ch = i, s[i] = 0;
        DL[i].rh = 0;	// important pointer (fail pointer)
    }
}
int main() {
    int n;
    while (scanf("%d", &n) == 1 && n) {
    	int A[MAXN], B[MAXN];
        int toy[10005] = {}, R[MAXCOL];
        for (int i = 1; i <= n; i++) {
            scanf("%d %d", &A[i], &B[i]);
            toy[A[i]]++, toy[B[i]]++;
        }
        
        init(n);
        int run_id = 0;
        for (int i = 0; i <= 10000; i++) {
            int nt = 0;
            for (int j = 1; j <= n; j++) {
                if (A[j] == i || B[j] == i)
                    R[nt++] = j;
            }
            if (nt) {
                run_id++;
                new_row(nt, R, run_id);
            }
        }
        Ans = n;
        Dep = 0, markIdx = -1;
        Decision();
        DFS();
        printf("%d\n", Ans);
        fflush(stdout);
    }
    return 0;
}

Read More +

2016-04-30

學校課程/平行程式

批改娘 10093. Fast Matrix Chain Multiplication (OpenMP)

題目描述

計算矩陣鏈乘積 $A_{r_1, c_1} B_{r_2, c_2} \cdots$ 的值。

sample.c

// generate matrix, row-major
uint32_t* rand_gen(uint32_t seed, int R, int C) {
    uint32_t *m = (uint32_t *) malloc(sizeof(uint32_t) * R*C);
    uint32_t x = 2, n = R*C;
    for (int i = 0; i < R; i++) {
        for (int j = 0; j < C; j++) {
            x = (x * x + seed + i + j)%n;
            m[i*C + j] = x;
        }
    }
    return m;
}
uint32_t hash(uint32_t x) {
    return (x * 2654435761LU);
}
// output
uint32_t signature(uint32_t *A, int r, int c) {
    uint32_t h = 0;
    for (int i = 0; i < r; i++) {
        for (int j = 0; j < c; j++)
            h = hash(h + A[i*c + j]);
    }
    return h;
}

輸入格式

有多組測資，每組第一行會有一個整數 $N$ 表示矩陣鏈上有 $N$ 個矩陣，第二行上會有 $N+1$ 個整數 $Z_i$，表示矩陣鏈的每一個行列大小，例如當 $N = 3$ 時，輸入 10 30 5 60 表示矩陣 $A_{10, 30} B_{30, 5} C_{5, 60}$ 相乘。第三行會有 $N$ 個整數，第 $i$ 個整數 $S_i$ 為第 $i$ 個矩陣生成種子。

$1 \le N \le 100$
$1 \le Z_i \le 1000$
$0 \le S_i \le 32767$

輸出格式

對於每組測資輸出一行，將最後的矩陣結果輸出雜湊值。

範例輸入

2
2 2 2
2 5
3
10 30 5 60
0 0 0
3
1 5 20 1
0 0 0
3
5 10 20 35
0 0 0 
6
30 35 15 5 10 20 25
0 0 0 0 0 0

範例輸出

備註

輸出請用 printf("%u", answer);，計算 Dynamic Programming 時，請使用 64-bit 型態紀錄，因為最慘情況下會超過 32-bit 所能容納的範圍。

Solution

充分地運用當初在演算法學到的，計算矩陣鍊乘積的最少乘法數，接著再針對優化後的乘法順序進行平行。平行可以從單純矩陣乘法，又或者針對可以同時進行矩陣乘法操作開始。甚至可以套用編譯器學到的最少暫存器算法，想辦法從少量的空間換取好的快取效果。

下述程式只針對矩陣乘法計算平行，而非兩個乘法同時進行，其一原因在於很難保證 load balance。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <assert.h>
#define MAXN 128
#define LOOP_UNROLL 8
#define INF (1LL<<60)
int N, SZ[MAXN], SEED[MAXN];
long long dp[MAXN][MAXN] = {};
int argdp[MAXN][MAXN];
uint32_t* rand_gen(uint32_t c, int R, int C) {
    uint32_t *m = (uint32_t *) malloc(sizeof(uint32_t) * R*C);
    assert(m != NULL);
    uint32_t x = 2, n = R*C;
    for (int i = 0; i < R; i++) {
        for (int j = 0; j < C; j++) {
            x = (x * x + c + i + j)%n;
            m[i*C + j] = x;
        }
    }
    return m;
}
uint32_t* multiplyAndDel(uint32_t *A, uint32_t *B, int r, int rc, int c) {
    uint32_t *C = (uint32_t *) malloc(sizeof(uint32_t) * r * c);
    uint32_t *tB = (uint32_t *) malloc(sizeof(uint32_t) * rc * c);
    assert(C != NULL);
    assert(tB != NULL);
    for (int i = 0; i < rc; i++) {
        for (int j = 0; j < c; j++)
            tB[j*rc + i] = B[i*c + j];
    }
    free(B);
    #pragma omp parallel for
    for (int i = r-1; i >= 0; i--) {
        for (int j = c-1; j >= 0; j--) {
            register uint32_t sum = 0;
            uint32_t *a = &A[i*rc], *b = &tB[j*rc];
            int k = rc;
            switch (k % LOOP_UNROLL) {
                case 0: do { sum += *a * *b, a++, b++;
                case 7: sum += *a * *b, a++, b++;
                case 6: sum += *a * *b, a++, b++;
                case 5: sum += *a * *b, a++, b++;
                case 4: sum += *a * *b, a++, b++;
                case 3: sum += *a * *b, a++, b++;
                case 2: sum += *a * *b, a++, b++;
                case 1: sum += *a * *b, a++, b++;
                } while ((k -= LOOP_UNROLL) > 0);
            }
            C[i*c + j] = sum;
        }
    }
    free(A), free(tB);
    return C;
}
uint32_t hash(uint32_t x) {
    return (x * 2654435761LU);
}
uint32_t signatureAndDel(uint32_t *A, int r, int c) {
    uint32_t h = 0;
    for (int i = 0; i < r; i++) {
        for (int j = 0; j < c; j++)
            h = hash(h + A[i*c + j]);
    }
    free(A);
    return h;
}
uint32_t* dfs(int l, int r, int *mR, int *mC) {
    if (l == r) {
        *mR = SZ[l], *mC = SZ[l+1];
        return rand_gen(SEED[l], *mR, *mC);
    }
    int split = argdp[l][r];
    int r1, r2, c1, c2;
    uint32_t *A, *B;
    A = dfs(l, split, &r1, &c1);
    B = dfs(split+1, r, &r2, &c2);
    assert(c1 == r2);
    *mR = r1, *mC = c2;
    return multiplyAndDel(A, B, r1, c1, c2);
}
int main() {
    while (scanf("%d", &N) == 1) {
        for (int i = 0; i <= N; i++)
            scanf("%d", &SZ[i]);
        for (int i = 0; i < N; i++)
            scanf("%d", &SEED[i]);
        memset(dp, 0, sizeof(dp));
        for (int i = 1; i <= N; i++) {
            for (int j = 0; j+i < N; j++) {
                int l = j, r = j+i;
                dp[l][r] = INF;
                for (int k = l; k < r; k++) {
                    long long t = dp[l][k] + dp[k+1][r] + (long long) SZ[l] * SZ[k+1] * SZ[r+1];
                    if (t < dp[l][r])
                        dp[l][r] = t, argdp[l][r] = k;
                }
            }
        }
        int retR, retC;
        uint32_t *retM;
        uint32_t hval;
        retM = dfs(0, N-1, &retR, &retC);
        hval = signatureAndDel(retM, retR, retC);
        printf("%u\n", hval);
 
        long long test = 0;
        for (int i = 1; i < N; i++) {
            test += SZ[0] * SZ[i] * SZ[i+1];
        }
        fprintf(stderr, "best %lld, origin %lld, %lf\n", dp[0][N-1], test, dp[0][N-1]*1.f / test);
    }
    return 0;
}

Read More +

2016-04-30

學校課程/平行程式

批改娘 10094. Fast 0/1 Knapsack Problem

題目描述

有 $N$ 個重量和價值分別為 $W_i, V_i$ 的物品，現在要從這些物品中選出總重量不超過 $M$ 的物品，求所有挑選方案中的總價值最大值為何。

輸入格式

輸入只有一組測資，第一行會有兩個整數 $N, M$ 分別表示物品數量和背包能容大的最大重量 $M$。接下來會有 $N$ 行，第 $i$ 行上會有兩個整數 $W_i, V_i$，分別為物品 $i$ 的重量和價值。

$1 \le N \le 10000$
$1 \le M \le 1000000$
$1 \le W_i, \; V_i \le 100000$

輸出格式

輸出一行整數，挑選方案中的最大價值 $B$。

範例輸入

範例輸出

Solution

遞迴公式

$$\begin{align*} f(i, j) = \max(f(i-1, j-w)+v, f(i-1, j)) \end{align*}$$

最單純採用滾動數組完成，也就是所謂的 double buffer 的技巧，可以大幅度節省記憶體，但先決條件是他們的相依關係必須獨立。

在平行處理上的另一種解法，就是分成前 $N/2$ 個物品和後 $N/2$ 物品分開計算，之後再藉由 $\mathcal{O}(M)$ 合併答案，由於算法的限制，最多拆分兩組。而不像滾動數組的平行於最內層的迴圈，平行度有限，但在較少核心的處理上非常快速，其一原因是空間使用比較有效率。

Server (32 cores)
- 分組合併: Accepted (768 ms, 8320 KB)
- 滾動數組: Accepted (380 ms, 8 MB)
PC (4 cores)
- 分組合併: 600ms
- 滾動數組: 900ms

滾動數組

#include <stdio.h>
#include <stdlib.h>
 
#define MAXM 1000005
#define MAXN 10005
int dp[2][MAXM];
int W[MAXN], V[MAXN];
#define max(a, b) (a > b ? a : b)
int main() {
    int N, M;
    while (scanf("%d %d", &N, &M) == 2) {
        for (int i = 0; i < N; i++)
            scanf("%d %d", &W[i], &V[i]);
        memset(dp, 0, sizeof(dp));
 
        int p = 0;
        for (int i = 0; i < N; i++) {
            int q = 1 - p;
            #pragma omp parallel
            {
                int pp = p, qq = q;
                int v = V[i], w = W[i];
                #pragma omp for
                for (int i = w; i <= M; i++)
                    dp[qq][i] = max(dp[pp][i-w]+v, dp[pp][i]);
                #pragma omp for
                for (int i = 0; i < w; i++)
                    dp[qq][i] = dp[pp][i];
            }
            p = 1 - p;
        }
 
        printf("%d\n", dp[p][M]);
    }
    return 0;
}

分組合併

#include <stdio.h>
#include <stdlib.h>
 
static inline int max(int a, int b) {
    return a > b ? a : b;
}
void subKnapsack(int n, int m, int w[], int v[], int dp[]) {
    memset(dp, 0, sizeof(dp[0]) * (m+1));
    for (int i = 0; i < n; i++) {
        int vv = v[i], ww = w[i];
        for (int j = m; j >= ww; j--)
            dp[j] = max(dp[j], dp[j-ww]+vv);
    }
}
#define MAXM 1000005
#define MAXN 100005
int W[MAXN], V[MAXN];
int dp[2][MAXM];
int main() {
    int N, M;
    while (scanf("%d %d", &N, &M) == 2) {
        for (int i = 0; i < N; i++)
            scanf("%d %d", &W[i], &V[i]);
        memset(dp, 0, sizeof(dp));
 
        #pragma omp parallel sections
        {
            #pragma omp section
            subKnapsack(N/2, M, W, V, dp[0]);
            #pragma omp section
            subKnapsack(N-N/2, M, W+N/2, V+N/2, dp[1]);
        }
 
        int ret = 0;
        for (int i = M, j = 0, mx = 0; i >= 0; i--) {
            mx = max(mx, dp[1][j]), j++;
            ret = max(ret, dp[0][i] + mx);
        }
        printf("%d\n", ret);
    }
    return 0;
}

Read More +

2016-04-30

學校課程/平行程式

批改娘 10027. Fast Sudoku

背景

不過，還想做得更好

但這個世界卻要求神通廣大、無所不知，那不聰明的我又該何去何從 …

題目描述

給一張 $9 \times 9$ 數獨，必須滿足每行每列的 1 到 9 的數字不重複，同時在九個 $3 \times 3$ 方格內的 1 到 9 數字也不重複。請計算可填入的方法數。

輸入格式

測資只有一筆，共有九行，每一行上有九個整數，第 $i$ 行上的第 $j$ 個整數表示 $\text{grid}[i][j]$ 填入的數字，若 $\text{grid}[i][j] = 0$ 表示尚未填入。

輸出格式

對於每一組測資輸出一行一個整數，表示數獨共有幾種填法。

範例輸入

0 6 0 0 0 4 0 5 0
0 0 8 3 0 5 6 0 0
2 0 0 0 0 0 0 0 1
8 0 0 4 0 7 0 0 6
0 0 6 0 0 0 3 0 0
7 0 0 9 0 1 0 0 4
5 0 0 0 0 0 0 0 2
0 0 7 2 0 6 9 0 0
0 4 0 5 0 8 0 7 0

範例輸出

範例輸入 2

0 0 0 0 0 0 0 0 0
0 0 3 6 0 0 0 0 0
0 7 0 0 9 0 2 0 0
0 5 0 0 0 7 0 0 0
0 0 0 0 4 5 7 0 0
0 0 0 1 0 0 0 3 0
0 0 1 0 0 0 0 6 8
0 0 8 5 0 0 0 1 0
0 9 0 0 0 0 4 0 0

範例輸出 2

Solution

牽涉到 load balance 問題，大部分都要先廣度搜尋一次，把狀態展開後，再利用 dynamic scheduling 進行分配工作，效果會好上許多。為了解決狀態展開而後不浪費記憶體空間，IDA* 會是一種好的選擇。當然在 OpenMP 3.0 以上有提供 task 來幫忙做到類似的事情，但搜索展開會變成寫死，估計上會比較困難。

IDA: Accepted (946 ms, 10752 KB)
OpenMP task: Accepted (1321 ms, 2560 KB)
DLX: Accepted (1208 ms, 56940 KB)

當然 DLX 在建表處理會變得很討厭，overhead 相較於其他的算法都高出許多，因此狀態展開不可以太多，因為他本身就搜得非常快，撰寫這一類搜索問題時，特別小心 #pragma omp private() 的使用，複製操作原則上都是根據 sizeof() 決定複製空間大小。因此若複製目標為函數參數，很容易只有複製到指標。

IDA

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <limits.h>
#include <assert.h>
#define MAXN 65536
 
int C[81][2], N;
int R[9][3] = {}, P[MAXN*9][9][3], Pidx;
int ida_dep;
void IDA(int idx, int state[][3]) {
    if (idx == ida_dep) {
        memcpy(P[Pidx], state, sizeof(P[0])), Pidx++;
        return ;
    }
    int x = C[idx][0], y = C[idx][1];
    int c = x/3*3 + y/3;
    for (int i = 0; i < 9; i++) {
        int mask = 1<<i;
        if ((state[x][0]&mask) || (state[y][1]&mask) ||
                (state[c][2]&mask))
            continue;
        state[x][0] |= mask;
        state[y][1] |= mask;
        state[c][2] |= mask;
        IDA(idx+1, state);
        state[x][0] ^= mask;
        state[y][1] ^= mask;
        state[c][2] ^= mask;
    }
}
int dfs(int idx, int state[][3]) {
    if (idx == N)
        return 1;
    int x = C[idx][0], y = C[idx][1];
    int c = x/3*3 + y/3, sum = 0;
    for (int i = 0; i < 9; i++) {
        int mask = 1<<i;
        if ((state[x][0]&mask) || (state[y][1]&mask) ||
                (state[c][2]&mask))
            continue;
        state[x][0] |= mask;
        state[y][1] |= mask;
        state[c][2] |= mask;
        sum += dfs(idx+1, state);
        state[c][2] ^= mask;
        state[y][1] ^= mask;
        state[x][0] ^= mask;
    }
    return sum;
}
int incomplete_bfs() {
    for (ida_dep = 1; ida_dep <= N; ida_dep++) {
        Pidx = 0;
        IDA(0, R);
        if (Pidx >= MAXN)
            break;
    }
    if (ida_dep == N+1)
        return Pidx;
    int ret = 0;
    #pragma omp parallel for schedule(guided) reduction(+: ret) 
    for (int i = 0; i < Pidx; i++)
        ret += dfs(ida_dep, P[i]);
    return ret;
}
int main() {
    int g[9][9], n = 9;
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            scanf("%d", &g[i][j]);
    N = 0;
    memset(R, 0, sizeof(R));
     for (int i = 0; i < n; i++) {
         for (int j = 0; j < n; j++) {
             if (g[i][j]) {
                 g[i][j]--;
                 R[i][0] |= 1<<g[i][j];
                 R[j][1] |= 1<<g[i][j];
                 R[i/3*3+j/3][2] |= 1<<g[i][j];
            } else {
                 C[N][0] = i, C[N][1] = j;
                 N++;
             }
         }
     }
    int ret = incomplete_bfs();
    printf("%d\n", ret);
    return 0;
}

Task

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <limits.h>
#include <assert.h>
#define MAXN 65536
 
int C[81][2], N;
int R[9][3] = {};
int threadcount = 0;
#pragma omp threadprivate(threadcount)
void dfs_serial(int idx, int state[][3]) {
    if (idx == N) {
        threadcount++;
        return ;
    }
    int x = C[idx][0], y = C[idx][1];
    int c = x/3*3 + y/3;
    for (int i = 0; i < 9; i++) 
    {
        int mask = 1<<i;
        if ((state[x][0]&mask) || (state[y][1]&mask) ||
                (state[c][2]&mask)) 
            continue;
        state[x][0] |= mask;
        state[y][1] |= mask;
        state[c][2] |= mask;
        dfs_serial(idx+1, state);
        state[c][2] ^= mask;
        state[y][1] ^= mask;
        state[x][0] ^= mask;
    }
}
void dfs(int idx, int state[][3]) {
    if (idx == N) {
        threadcount++;
        return ;
    }
    int x = C[idx][0], y = C[idx][1];
    int c = x/3*3 + y/3;
    for (int i = 0; i < 9; i++) 
    #pragma omp task untied
    {
        int mask = 1<<i;
        if ((state[x][0]&mask) || (state[y][1]&mask) ||
                (state[c][2]&mask)) {
 
        } else {
            int S[9][3] = {};
            memcpy(S, state, sizeof(S));
//            int (*S)[3] = malloc(sizeof(int)*9*3);
//            memcpy(S, state, sizeof(int)*9*3);
            S[x][0] |= mask;
            S[y][1] |= mask;
            S[c][2] |= mask;
            if (idx >= 6) {
                dfs_serial(idx+1, S);
            } else {
                dfs(idx+1, S);
            }
        }
    }
    #pragma omp taskwait
}
int incomplete_bfs() {
    int ret = 0;
    #pragma omp parallel
    {
        #pragma omp single
        {
            dfs(0, R);
        }
        #pragma omp critical
            ret += threadcount;
    }
    return ret;
}
int main() {
    int g[9][9], n = 9;
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            scanf("%d", &g[i][j]);
    N = 0;
    memset(R, 0, sizeof(R));
     for (int i = 0; i < n; i++) {
         for (int j = 0; j < n; j++) {
             if (g[i][j]) {
                 g[i][j]--;
                 R[i][0] |= 1<<g[i][j];
                 R[j][1] |= 1<<g[i][j];
                 R[i/3*3+j/3][2] |= 1<<g[i][j];
            } else {
                 C[N][0] = i, C[N][1] = j;
                 N++;
             }
         }
     }
    int ret = incomplete_bfs();
    printf("%d\n", ret);
    return 0;
}

DLX

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <limits.h>
#include <assert.h>
#define MAXN 65536
#define MAXR 1024
typedef unsigned char byte;
typedef struct {
    int left, right, up, down;
    int ch;
} Node;
// DLX header
typedef struct {
    int (* const sudoku) (byte [][9]);
} DLX_namespace;
// DLX body begin
void DLX_remove(int c, Node DL[], int s[]) {
    DL[DL[c].right].left = DL[c].left;
    DL[DL[c].left].right = DL[c].right;
    for (int i = DL[c].down; i != c; i = DL[i].down) {
        for (int j = DL[i].right; j != i; j = DL[j].right) {
            DL[DL[j].down].up = DL[j].up;
            DL[DL[j].up].down = DL[j].down;
            s[DL[j].ch]--;
        }
    }
}
void DLX_resume(int c, Node DL[], int s[]) {
    for (int i = DL[c].down; i != c; i = DL[i].down) {
        for (int j = DL[i].left; j != i; j = DL[j].left) {
            DL[DL[j].down].up = j;
            DL[DL[j].up].down = j;
            s[DL[j].ch]++;
        }
    }
    DL[DL[c].right].left = c;
    DL[DL[c].left].right = c;
}
int DLX_dfs(int k, Node DL[], int s[], int head) {
    if (DL[head].right == head)
        return 1;
    int t = INT_MAX, c;
    for (int i = DL[head].right; i != head; i = DL[i].right) {
        if (s[i] < t) {
            t = s[i], c = i;
        }
    }
    int ans = 0;
    DLX_remove(c, DL, s);
    for (int i = DL[c].down; i != c; i = DL[i].down) {
        for (int j = DL[i].right; j != i; j = DL[j].right)
            DLX_remove(DL[j].ch, DL, s);
        ans += DLX_dfs(k+1, DL, s, head);
        for (int j = DL[i].left; j != i; j = DL[j].left)
            DLX_resume(DL[j].ch, DL, s);
    }
    DLX_resume(c, DL, s);
    return ans;
}
int DLX_newnode(int up, int down, int left, int right, Node DL[], int *size) {
    DL[*size].up = up, DL[*size].down = down;
    DL[*size].left = left, DL[*size].right = right;
    DL[up].down = DL[down].up = DL[left].right = DL[right].left = *size;
    assert(*size < MAXN);
    return (*size)++;
}
void DLX_newrow(int n, int Row[], Node DL[], int s[], int *size) {
    int a, r, row = -1, k;
    for (a = 0; a < n; a++) {
        r = Row[a];
        DL[*size].ch = r, s[r]++;
        if (row == -1) {
            row = DLX_newnode(DL[DL[r].ch].up, DL[r].ch, *size, *size, DL, size);
        } else {
            k = DLX_newnode(DL[DL[r].ch].up, DL[r].ch, DL[row].left, row, DL, size);
        }
    }
}
void DLX_init(int m, Node DL[], int s[], int *size, int *head) {
    *size = 0;
    *head = DLX_newnode(0, 0, 0, 0, DL, size);
    for (int i = 1; i <= m; i++) {
        DLX_newnode(i, i, DL[*head].left, *head, DL, size);
        DL[i].ch = i, s[i] = 0;
    }
}
int DLX_sudoku(byte g[][9]) {
    Node DL[MAXN + MAXR];
    int s[MAXR], head, size;
    int row[10];
    int used[4][10][10] = {}, isValid = 1;
    int n = 9, tn = 3;
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            if (g[i][j]) {
                int x = g[i][j];
                int y = i/3*3 + j/3;
                if (used[1][i][x])    isValid = 0;
                if (used[2][j][x])    isValid = 0;
                if (used[3][y][x])    isValid = 0;
                used[0][i][j] = 1;
                used[1][i][x] = used[2][j][x] = used[3][y][x] = 1;
            }
        }
    }
    if (!isValid)
        return 0;
 
    int OFF[4] = {};
    int label[4][10][10] = {};
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            if (used[0][i][j] == 0)
                label[0][i][j] = ++OFF[0];
        }
    }
    for (int k = 1; k < 4; k++) {
        OFF[k] = OFF[k-1];
        for (int i = 0; i < n; i++) {
            for (int j = 1; j <= n; j++) {
                if (used[k][i][j] == 0) {
                    label[k][i][j] = ++OFF[k];
                }
            }
        }
    }
    DLX_init(OFF[3], DL, s, &size, &head);
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            if (g[i][j])    continue;
            for (int k = 1; k <= n; k++) {
                int x = k;
                int y = i/3*3 + j/3;
                if (used[1][i][x])    continue;
                if (used[2][j][x])    continue;
                if (used[3][y][x])    continue;
 
                row[0] = label[0][i][j];
                row[1] = label[1][i][k];
                row[2] = label[2][j][k];
                row[3] = label[3][y][k];
                DLX_newrow(4, row, DL, s, &size);
            }
        }
    }
    return DLX_dfs(0, DL, s, head);
}
DLX_namespace const DLX = {DLX_sudoku};
// DLX body end
 
// parallel utils
typedef struct {
    byte b[9][9];    // board
    bool r[9][9], c[9][9], g[9][9];    // row, column, grid
} Board;
static inline int gridID(int x, int y) {
    return x / 3 * 3 + y / 3;
}
void board_init(Board *b) {
    memset(b, 0, sizeof(Board));
}
bool board_test(const Board *b, int x, int y, int v) {
    return b->b[x][y] == 0 && b->r[x][v-1] == 0 && b->c[y][v-1] == 0 && b->g[gridID(x, y)][v-1] == 0;
}
bool board_fill(Board *b, int x, int y, int v) {
    b->b[x][y] = v;
    b->r[x][v-1] = b->c[y][v-1] = b->g[gridID(x, y)][v-1] = 1;
}
 
int incomplete_bfs(int g[][9]) {
    Board origin;
    int n = 9;
    int A[81][2], m = 0;
    board_init(&origin);
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            if (g[i][j]) {
                board_fill(&origin, i, j, g[i][j]);
            } else {
                A[m][0] = i, A[m][1] = j, m++;
            }
        }
    }
#define MAXQMEM 8192
    static Board Q[2][MAXQMEM*9];
    int Qcnt[2] = {}, Qflag = 0;
    Q[Qflag][Qcnt[Qflag]] = origin, Qcnt[Qflag]++;
    for (int it = 0; it < m; it++) {
        const int x = A[it][0], y = A[it][1];
        Qcnt[!Qflag] = 0;
        #pragma omp parallel for
        for (int i = 0; i < Qcnt[Qflag]; i++) {
            for (int j = 1; j <= 9; j++) {
                if (!board_test(&Q[Qflag][i], x, y, j))
                    continue;
                int pIdx;
                #pragma omp critical
                {
                    pIdx = Qcnt[!Qflag]++;
                }
                Q[!Qflag][pIdx] = Q[Qflag][i];
                board_fill(&Q[!Qflag][pIdx], x, y, j);
            }
        }
        if (Qcnt[!Qflag] >= MAXQMEM) {
            int ret = 0;
            #pragma omp parallel for reduction(+:ret)
            for (int i = 0; i < Qcnt[Qflag]; i++) {
                ret += DLX.sudoku(Q[Qflag][i].b);
            }
            return ret;
        }
        Qflag = !Qflag;
    }
    return Qcnt[Qflag];
}
int main() {
    int g[9][9], n = 9;
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            scanf("%d", &g[i][j]);
 
    int ret = incomplete_bfs(g);
    printf("%d\n", ret);
    return 0;
}

Read More +

2016-04-30

學校課程/平行程式

批改娘 10087. Sparse Matrix Multiplication (OpenMP)

題目描述

稀疏矩陣為大部份元素皆為零的矩陣，在科學與工程領域中求解線性模型時經常出現大型的稀疏矩陣。現在給予最常見的 Coordinate Format (簡稱 COO 格式)，請問兩個矩陣相乘結果為何。

給予矩陣 $A_{n, m}$ 和 $B_{m, r}$，請計算稀疏矩陣相乘。

$$A_{4,4} = \begin{bmatrix} 0 & 0 & 0 & 0 \\ 5 & 8 & 0 & 0 \\ 0 & 0 & 3 & 0 \\ 0 & 6 & 0 & 0 \\ \end{bmatrix}, \qquad B_{4,4} = \begin{bmatrix} 0 & 0 & 1 & 3 \\ 0 & 5 & 2 & 0 \\ 3 & 5 & 0 & 0 \\ 0 & 2 & 0 & 0 \\ \end{bmatrix}$$

根據 COO 格式，分別轉換矩陣 $A$ 和 $B$ 的所有非零元素，如下表所示

COO of Matrix $A$

`row_index`	`col_index`	`value`
1	0	5
1	1	8
2	2	3
3	1	6

COO of Matrix $B$

`row_index`	`col_index`	`value`
0	2	1
0	3	3
1	1	5
1	2	2
2	0	3
2	1	5
3	1	2

輸入格式

測資只有一組，第一行會有三個整數 $N, \; M, \; R$，分別表示矩陣 $A, \; B$ 的大小。下一行會有兩個整數 $N_A, \; N_B$，接下來會有 $N_A$ 行，每一行表示矩陣 $A$ 的 COO 格式，隨後會有 $N_B$ 行，每一行表示矩陣 $B$ 的 COO 格式。

$0 < N, \; M, \; R \le 10^6$
$0 < N_A, \; N_B \le 10^6$

給予 COO 格式時，先按照 row_index 由小到大，相同情況由 col_index 由小到大的方式給定，保證不會有重複，每一個元素值 $v$ 介於 $1$ 到 $2^{31}-1$ 之間。

輸出格式

輸出 $C_{n, r} = A_{n, m} \times B_{m, r}$ 的雜湊值。

定義 $\mathit{hash}(C_{n, r}) = \sum\nolimits_{e_{i, j} \in C \text{ and } e_{i, j} \neq 0} \mathit{encrypt}((i+1)*(j+1), e_{i, j})$，實際運作的流程可參考以下作法，當然你沒辦法宣告 $N \times M$ 的空間使用：

static inline uint32_t rotate_left(uint32_t x, uint32_t n) {
    return  (x << n) | (x >> (32-n));
}
static inline uint32_t encrypt(uint32_t m, uint32_t key) {
    return (rotate_left(m, key&31) + key)^key;
}
#define MAXN 1024
uint32_t A[MAXN][MAXN], B[MAXN][MAXN];
int main() {
    int x, y, v;
    int N, M, R, nA, nB;
    scanf("%d %d %d", &N, &M, &R);
    scanf("%d %d", &nA, &nB);
    for (int i = 0; i < nA; i++)
        scanf("%d %d %d", &x, &y, &v), A[x][y] = v;
    for (int i = 0; i < nB; i++)
        scanf("%d %d %d", &x, &y, &v), B[x][y] = v;
    
    uint32_t hash = 0;
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < R; j++) {
            uint32_t sum = 0;
            for (int k = 0; k < M; k++)
                sum += A[i][k] * B[k][j];
            if (sum)
                hash += encrypt((i+1)*(j+1), sum);
        }
    }
    printf("%u\n", hash);
    return 0;
}

範例輸入

範例輸出

13093438

範例解釋

$$A_{n, m} \times B_{m, r} = C_{4,4}=\begin{bmatrix} 0 & 0 & 0 & 0 \\ 0 & 40 & 21 & 15 \\ 9 & 15 & 0 & 0 \\ 0 & 30 & 12 & 0 \\ \end{bmatrix}$$

Solution

傳統處理 Array of Structure: Accepted (14964 ms, 29104 KB)
傳統處理 Structure of Array: Accepted (12537 ms, 45568 KB)
額外空間 Array of Structure: Accepted (5711 ms, 29152 KB)
額外空間 Structure of Array: Accepted (5202 ms, 29056 KB)

Array of Structure 簡稱 AOS，而 Structure of Array 簡稱 SOA，這兩種寫法的差異在於運用 structure 時的寫法，根據 cache line 的設計，有時候一個 struct 下會有很多變數，然而我們只需要使用部分的變數，這時候沒有使用的變數仍會跟著一起進入 cache，很容易造成 cache 效率偏低，因此回歸原始寫出 SOA 的版本。

傳統處理

如果是兩個稀疏方陣，用 COO 格式需要先對後面那個矩陣轉置，接著在三欄 $(i, j, v)$ 的表格下，按照字典順序排序。此時會有兩個三欄 $A(i, j, v), \; B(i, j, v)$ (別忘記 $B$ 矩陣已經轉置過)，要計算 $A \times B = C$，為得到 $C_{i, j}$，$C_{i, j} = \sum A(i, k, v) B(j, k, v)$，由於已經排序好，那麼找到共同的 $k$ 不是難事，按照合併排序的手法，能在 $\mathcal{O}(N)$ 完成。

因此在最稠密的矩陣下，仍需要 $\mathcal{O}(N^3)$ 才能完成。

AOS

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <algorithm>
using namespace std;
#define MAXN 1048576
#define MAXL (MAXN<<2)
 
static inline uint32_t rotate_left(uint32_t x, uint32_t n) {
    return  (x << n) | (x >> (32-n));
}
static inline uint32_t encrypt(uint32_t m, uint32_t key) {
    return (rotate_left(m, key&31) + key)^key;
}
typedef struct ELE {
    int i, j;
    uint32_t v;
} ELE;
ELE LA[MAXL], LB[MAXL];
void SpMV(int N, int M, int R, ELE LA[], int na, ELE LB[], int nb) {
    int ma = 0, mb = 0;
    static int aoff[MAXN], boff[MAXN];
 
    for (int i = 0, p = -1; i < na; i++) {
        if (LA[i].i != p)
            aoff[ma++] = i, p = LA[i].i;
    }
    for (int i = 0, p = -1; i < nb; i++) {
        if (LB[i].i != p)
            boff[mb++] = i, p = LB[i].i;
    }
 
    uint32_t hash = 0;
    aoff[ma] = na, boff[mb] = nb;
    #pragma omp parallel for reduction(+: hash)
    for (int i = 0; i < ma; i++) {
        for (int j = 0; j < mb; j++) {
            int idx1 = aoff[i], idx2 = boff[j];
            int top1 = aoff[i+1], top2 = boff[j+1];
            uint32_t sum = 0;
            int r = LA[idx1].i, c = LB[idx2].i;
            while (idx1 < top1 && idx2 < top2) {
                if (LA[idx1].j == LB[idx2].j)
                    sum += LA[idx1].v * LB[idx2].v, idx1++, idx2++;
                else if (LA[idx1].j < LB[idx2].j)
                    idx1++;
                else
                    idx2++;                    
            }
 
            if (sum)
                hash += encrypt((r+1)*(c+1), sum);
        }
    }
    printf("%u\n", hash);
}
bool cmp(const ELE &a, const ELE &b) {
    if (a.i != b.i)
        return a.i < b.i;
    return a.j < b.j;
}
int main() {
    int N, M, R, nA, nB;
    while (scanf("%d %d %d", &N, &M, &R) == 3) {
        scanf("%d %d", &nA, &nB);
        for (int i = 0; i < nA; i++)
            scanf("%d %d %d", &LA[i].i, &LA[i].j, &LA[i].v);
        for (int i = 0; i < nB; i++)
            scanf("%d %d %d", &LB[i].j, &LB[i].i, &LB[i].v);
        sort(LB, LB+nB, cmp);
        SpMV(N, M, R, LA, nA, LB, nB);
    }
    return 0;
}

SOA

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <algorithm>
using namespace std;
#define MAXN 1048576
#define MAXL (MAXN<<2)
 
static inline uint32_t rotate_left(uint32_t x, uint32_t n) {
    return  (x << n) | (x >> (32-n));
}
static inline uint32_t encrypt(uint32_t m, uint32_t key) {
    return (rotate_left(m, key&31) + key)^key;
}
typedef struct ELE {
    int i[MAXL], j[MAXL];
    uint32_t v[MAXL];
} ELE;
ELE LA, LB, LT;
int D[MAXL];
void SpMV(int N, int M, int R, ELE &LA, int na, ELE &LB, int nb) {
    int ma = 0, mb = 0;
    static int aoff[MAXN], boff[MAXN];
 
    for (int i = 0, p = -1; i < na; i++) {
        if (LA.i[i] != p)
            aoff[ma++] = i, p = LA.i[i];
    }
    for (int i = 0, p = -1; i < nb; i++) {
        if (LB.i[i] != p)
            boff[mb++] = i, p = LB.i[i];
    }
 
    uint32_t hash = 0;
    aoff[ma] = na, boff[mb] = nb;
    #pragma omp parallel for reduction(+: hash)
    for (int i = 0; i < ma; i++) {
        for (int j = 0; j < mb; j++) {
            int idx1 = aoff[i], idx2 = boff[j];
            int top1 = aoff[i+1], top2 = boff[j+1];
            uint32_t sum = 0;
            int r = LA.i[idx1], c = LB.i[idx2];
            while (idx1 < top1 && idx2 < top2) {
                if (LA.j[idx1] < LB.j[idx2])
                    idx1++;
                else if (LA.j[idx1] > LB.j[idx2])
                    idx2++;                    
                else
                    sum += LA.v[idx1] * LB.v[idx2], idx1++, idx2++;
            }
 
            if (sum)
                hash += encrypt((r+1)*(c+1), sum);
        }
    }
    printf("%u\n", hash);
}
bool cmp(int a, int b) {
    if (LT.i[a] != LT.i[b])
        return LT.i[a] < LT.i[b];
    return LT.j[a] < LT.j[b];
}
int main() {
    int N, M, R, nA, nB;
    while (scanf("%d %d %d", &N, &M, &R) == 3) {
        scanf("%d %d", &nA, &nB);
        for (int i = 0; i < nA; i++)
            scanf("%d %d %d", &LA.i[i], &LA.j[i], &LA.v[i]);
        for (int i = 0; i < nB; i++)
            scanf("%d %d %d", <.j[i], <.i[i], <.v[i]);
        for (int i = 0; i < nB; i++)
            D[i] = i;
        sort(D, D+nB, cmp);
        for (int i = 0; i < nB; i++)
            LB.i[i] = LT.i[D[i]], LB.j[i] = LT.j[D[i]], LB.v[i] = LT.v[D[i]];
        SpMV(N, M, R, LA, nA, LB, nB);
    }
    return 0;
}

額外空間

若採用額外空間處理，一開始就不用對 $B$ 轉置，$C_{i, j} = \sum A(i, k, v) B(k, j, v)$。每一次解決 $C$ 的第 $i$ 列上的所有元素值，此時 $A(i, k, v)$ 按照 $k$ 遞增排列，$B(k, j, v)$ 要配對 $A(i, k, v)$，會恰好掃描完整的 $B$，累加元素值會不按照順序累加到 $C_{i, j}$。

因此在最稠密的矩陣下，仍需要 $\mathcal{O}(N^3)$ 才能完成。

AOS

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
 
static inline uint32_t rotate_left(uint32_t x, uint32_t n) {
    return  (x << n) | (x >> (32-n));
}
static inline uint32_t encrypt(uint32_t m, uint32_t key) {
    return (rotate_left(m, key&31) + key)^key;
}
 
#define MAXN 1048576
#define MAXL 1048576
typedef struct ELE {
    int i, j;
    uint32_t v;
} ELE;
ELE LA[MAXL], LB[MAXL];
 
void SpMV(int N, int M, int R, ELE LA[], int na, ELE LB[], int nb) {
    int ma = 0;
    static int aoff[MAXN];
    for (int i = 0, p = -1; i < na; i++) {
        if (LA[i].i != p)
            aoff[ma++] = i, p = LA[i].i;
    }
    aoff[ma] = na;
 
    uint32_t hash = 0;
    int chunk = 8;
    #pragma omp parallel for schedule(dynamic, chunk) reduction(+: hash)
    for (int i = 0; i < ma; i++) {
        int idx1 = aoff[i], top1 = aoff[i+1];
        int r = LA[idx1].i;
        uint32_t *C = (uint32_t *) calloc(R, sizeof(uint32_t));
        for (int idx2 = 0; idx2 < nb && idx1 < top1; ) {
            if (LA[idx1].j == LB[idx2].i) {
                C[LB[idx2].j] += LA[idx1].v * LB[idx2].v;
                idx2++;
            }
            if (idx2 < nb && LA[idx1].j != LB[idx2].i) {
                if (LA[idx1].j < LB[idx2].i)
                    idx1++;
                else
                    idx2++;
            }
        }
        for (int j = 0; j < R; j++) {
            if (C[j])
                hash += encrypt((r+1)*(j+1), C[j]);
        }
        free(C);
    }
 
    printf("%u\n", hash);
}
int main() {
    uint32_t N, M, R, nA, nB;
    while (scanf("%d %d %d", &N, &M, &R) == 3) {
        scanf("%d %d", &nA, &nB);
        for (int i = 0; i < nA; i++)
            scanf("%d %d %d", &LA[i].i, &LA[i].j, &LA[i].v);
        for (int i = 0; i < nB; i++)
            scanf("%d %d %d", &LB[i].i, &LB[i].j, &LB[i].v);
        SpMV(N, M, R, LA, nA, LB, nB);
    }
    return 0;
}

SOA

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
 
static inline uint32_t rotate_left(uint32_t x, uint32_t n) {
    return  (x << n) | (x >> (32-n));
}
static inline uint32_t encrypt(uint32_t m, uint32_t key) {
    return (rotate_left(m, key&31) + key)^key;
}
 
#define MAXN 1048576
#define MAXL 1048576
typedef struct ELE {
    int i[MAXL], j[MAXL];
    uint32_t v[MAXL];
} ELE;
ELE LA, LB;
 
void SpMV(int N, int M, int R, ELE &LA, int na, ELE &LB, int nb) {
    int ma = 0;
    static int aoff[MAXN];
    for (int i = 0, p = -1; i < na; i++) {
        if (LA.i[i] != p)
            aoff[ma++] = i, p = LA.i[i];
    }
    aoff[ma] = na;
 
    uint32_t hash = 0;
    int chunk = 8;
    #pragma omp parallel for schedule(dynamic, chunk) reduction(+: hash)
    for (int i = 0; i < ma; i++) {
        int idx1 = aoff[i], top1 = aoff[i+1];
        int r = LA.i[idx1];
        uint32_t *C = (uint32_t *) calloc(R, sizeof(uint32_t));
        for (int idx2 = 0; idx2 < nb && idx1 < top1; ) {
            if (LA.j[idx1] == LB.i[idx2]) {
                C[LB.j[idx2]] += LA.v[idx1] * LB.v[idx2];
                idx2++;
            }
            if (idx2 < nb && LA.j[idx1] != LB.i[idx2]) {
                if (LA.j[idx1] < LB.i[idx2])
                    idx1++;
                else
                    idx2++;
            }
        }
        for (int j = 0; j < R; j++) {
            if (C[j])
                hash += encrypt((r+1)*(j+1), C[j]);
        }
        free(C);
    }
 
    printf("%u\n", hash);
}
int main() {
    uint32_t N, M, R, nA, nB;
    while (scanf("%d %d %d", &N, &M, &R) == 3) {
        scanf("%d %d", &nA, &nB);
        for (int i = 0; i < nA; i++)
            scanf("%d %d %d", &LA.i[i], &LA.j[i], &LA.v[i]);
        for (int i = 0; i < nB; i++)
            scanf("%d %d %d", &LB.i[i], &LB.j[i], &LB.v[i]);
        SpMV(N, M, R, LA, nA, LB, nB);
    }
    return 0;
}

Read More +