2015-07-11

b442. 快取實驗矩陣乘法

1. Problem
1. 1.1. 背景
2. 1.2. 題目描述
2. Sample Input
3. Sample Output
4. Solution
1. 4.1. 轉置解
2. 4.2. LOOP UNROLL 解

Problem

背景

記得 b439: 快取置換機制提到的快取置換機制嗎？現在來一場實驗吧！

題目描述

相信不少人都已經實作所謂的矩陣乘法，計算兩個方陣大小為 $N \times N$ 的矩陣 $A, B$。為了方便起見，提供一個偽隨機數的生成，減少在輸入處理浪費的時間，同時也減少上傳測資的辛苦。

根據種子 $c = S1$ 生成矩陣 $A$，種子 $c = S2$ 生成矩陣 $B$，計算矩陣相乘 $A \times B$，為了方便起見，使用 hash 函數進行簽章，最後輸出一個值。由於會牽涉到 overflow 問題，此題作為快取實驗就不考慮這個，overflow 問題大家都會相同。

請利用快取優勢修改代碼如下：

#include <bits/stdc++.h>
using namespace std;
class Matrix {
public:
    vector< vector<int> > data;
    int row, col;
    Matrix(int n = 1, int m = 1) {
        data = vector< vector<int> >(n, vector<int>(m, 0));
        row = n, col = m;
    }
    void rand_gen(int c) {
        int x = 2, n = row * col;
        for (int i = 0; i < row; i++) {
            for (int j = 0; j < col; j++) {
                x = ((long long) x * x + c + i + j)%n;
                data[i][j] = x;
            }
        }
    }
    Matrix multiply(Matrix x) {
        Matrix ret(row, x.col);
        for (int i = 0; i < row; i++) {
            for (int j = 0; j < x.col; j++) {
                int sum = 0;	// overflow
                for (int k = 0; k < col; k++)
                    sum += data[i][k] * x.data[k][j];
                ret.data[i][j] = sum;
            }
        }
        return ret;
    }
    void print() {
        for (int i = 0; i < row; i++) {
            printf("[");
            for (int j = 0; j < col; j++) {
                printf(" %d", data[i][j]);
            }
            printf(" ]\n");
        }
    }
    unsigned long signature() {
        unsigned long h = 0;
        for (int i = 0; i < row; i++) {
            for (int j = 0; j < col; j++) {
                h = hash(h + data[i][j]);
            }
        }
        return h;
    }
private:
    unsigned long hash(unsigned long x) {
        return (x * 2654435761LU);
    }
};
int main() {
    int N, S1, S2;
    while (scanf("%d %d %d", &N, &S1, &S2) == 3) {
        Matrix A(N, N), B(N, N), C;
        A.rand_gen(S1);
        B.rand_gen(S2);
        C = A.multiply(B);		
//		A.print();
//		B.print();
//		C.print();
        printf("%lu\n", C.signature());
    }
    return 0;
}

Sample Input

1 2	2 2 5 2 2 5

Sample Output

1 2	573770929 573770929

Solution

在越大的矩陣中，快取記憶體置換是很慢的，即便用 L1, L2, main memory 分層快取，速度差異很明顯，盡可能在線性 $O(n)$ 運算上都不造成記憶體置換 (搬移)，雖然複雜度都是 $O(n^3)$，相信沒人會去實作 $O(n^{2.807})$ 的 Strassen 算法，又或者是現今更快的 Coppersmith-Winograd $O(n^{2.3727})$。

作業系統題目第二彈，考驗快取應用。在 ZJ 主機上速度可以快二十倍，在本機上只快了十倍。

蔡神 asas 修改輸入，而我選擇了轉置，速度落後。
柳州 liouzhou_101 同步簽章，而我選擇分開函數，速度落後。
廖氏如神 pcsh710742 下刀計組，等等，我看見了什麼。

現在已經快了四十倍，就只是因為編譯器的優化參數變成手動。詳細實作和效能分析，需要的知識不只是作業系統，同時涵蓋計算機組織的 CPU stall 問題，參考 Optimizing Large Matrix-Vector Multiplications

先來個一般解，在計算矩陣 $A \times B$ 前，先將 $B$ 轉置，接著修改計算方式

1 2	for (int k = 0; k < col; k++) sum += data[i][k] * x.data[j][k];

在這一個迴圈中，陣列採用 row-major 的方式儲存，所以第二維度的區域基本上都在快取中，miss 的機會大幅度降低。

接下來要提案的做法是結合上述三位的做法，代碼快，但不能當 Matrix 模板使用，僅僅做為一個效能體現。共用記憶體，採用指針的方式減少複製，一樣進行轉置，但這會破壞原有的 $B$ 矩陣配置。同步進行簽章，捨棄掉 $C$ 矩陣的儲存，使用 LOOP UNROLLING，由於 Zerojudge 的優化沒有開到 -O2、-O3、-Ofast，關於這一部分的優化由使用者自己來。

LOOP UNROLLING 的優化在於 branch 指令，由於 pipeline 會讓效能提升，但遇到 branch 時必須捨棄掉偷偷載入的指令、算到一半的結果，效能會下降，因此使用 LOOP UNROLLING 的好處在於減少 branch 次數。

關於 data prefetch 可以用 __builtin_prefetch() 來完成，根據廖氏如神所言，這個概念可以預先載入防止 stall (pipeline hazard) 的拖延，速度並不會提升太多，有可能是硬體已經完成這一部分，甚至用別的架構去克服。

轉置解

#include <bits/stdc++.h>
using namespace std;
class Matrix {
public:
    vector< vector<int> > data;
    int row, col;
    Matrix(int n = 1, int m = 1) {
        data = vector< vector<int> >(n, vector<int>(m, 0));
        row = n, col = m;
    }
    void rand_gen(int c) {
        int x = 2, n = row * col;
        for (int i = 0; i < row; i++) {
            for (int j = 0; j < col; j++) {
                x = ((long long) x * x + c + i + j)%n;
                data[i][j] = x;
            }
        }
    }
    Matrix multiply(Matrix &x) {
        Matrix ret(row, x.col);
        x.transpose();
        for (int i = 0; i < row; i++) {
            for (int j = 0; j < x.col; j++) {
                int sum = 0;	// overflow
                int *a = &data[i][0], *b = &x.data[j][0];
                for (int k = 0; k < col; k++)
                    sum += *a * *b, a++, b++;
                ret.data[i][j] = sum;
            }
        }
        x.transpose();
        return ret;
    }
    void transpose() {
        for (int i = 0; i < row; i++)
        	for (int j = i+1; j < col; j++)
            	swap(data[i][j], data[j][i]);
    }
    void print() {
        for (int i = 0; i < row; i++) {
            printf("[");
            for (int j = 0; j < col; j++)
                printf(" %d", data[i][j]);
            printf(" ]\n");
        }
    }
    unsigned long signature() {
        unsigned long h = 0;
        for (int i = 0; i < row; i++)
            for (int j = 0; j < col; j++)
                h = hash(h + data[i][j]);
        return h;
    }
private:
    inline unsigned long hash(unsigned long x) {
        return (x * 2654435761LU);
    }
};
int main() {
    int N, S1, S2;
    while (scanf("%d %d %d", &N, &S1, &S2) == 3) {
        Matrix A(N, N), B(N, N), C;
        A.rand_gen(S1);
        B.rand_gen(S2);
        C = A.multiply(B);
        printf("%lu\n", C.signature());
    }
    return 0;
}

LOOP UNROLL 解

#include <bits/stdc++.h>
using namespace std;
#define LOOP_UNROLL 8
class Matrix {
public:
    vector< vector<int> > data;
    int row, col;
    Matrix(int n = 1, int m = 1) {
        row = n, col = m;
        data = vector< vector<int> >(n, vector<int>(m, 0));
    }
    void rand_gen(int c) {
        int x = 2, n = row * col;
        for (int i = 0; i < row; i++) {
            for (int j = 0; j < col; j++) {
                x = ((long long) x * x + c + i + j)%n;
                data[i][j] = x;
            }
        }
    }
    unsigned long multiply(Matrix &x) {
        x.transpose();
        unsigned long h = 0;
        for (int i = 0; i < row; i++) {
            for (int j = 0; j < x.col; j++) {
                register int sum = 0;
                int *a = &data[i][0], *b = &x.data[j][0], k;
                for (k = 0; k+LOOP_UNROLL < col; k += LOOP_UNROLL) {
                    sum += *a * *b, a++, b++;
                    sum += *a * *b, a++, b++;
                    sum += *a * *b, a++, b++;
                    sum += *a * *b, a++, b++;
                    sum += *a * *b, a++, b++;
                    sum += *a * *b, a++, b++;
                    sum += *a * *b, a++, b++;
                    sum += *a * *b, a++, b++;
                }
                for (; k < col; k++)
                    sum += *a * *b, a++, b++;
                h = hash(h + sum);
            }
        }
        return h;
    }
    void transpose() {
        for (int i = 0; i < row; i++)
        	for (int j = i+1; j < col; j++)
            	swap(data[i][j], data[j][i]);
    }
    void print() {
        for (int i = 0; i < row; i++) {
            printf("[");
            for (int j = 0; j < col; j++)
                printf(" %d", data[i][j]);
            printf(" ]\n");
        }
    }
private:
    unsigned long hash(unsigned long x) {
        return (x * 2654435761LU);
    }
} A(1000, 1000), B(1000, 1000);
int main() {
    int N, S1, S2;
    while (scanf("%d %d %d", &N, &S1, &S2) == 3) {
        A.row = A.col = B.row = B.col = N;
        A.rand_gen(S1);
        B.rand_gen(S2);
        printf("%lu\n", A.multiply(B));
    }
    return 0;
}

Morris' Blog