2016-06-28

批改娘 10107. Sparse Matrix Multiplication (CUDA)

1. 題目描述
1. 1.1. COO of Matrix $A$
2. 1.2. COO of Matrix $B$
2. 輸入格式
3. 輸出格式
4. 範例輸入
5. 範例輸出
6. 範例解釋
7. Solution
1. 7.1. 轉置版本
2. 7.2. 額外空間

題目描述

稀疏矩陣為大部份元素皆為零的矩陣，在科學與工程領域中求解線性模型時經常出現大型的稀疏矩陣。現在給予最常見的 Coordinate Format (簡稱 COO 格式)，請問兩個矩陣相乘結果為何。

給予矩陣 $A{n, m}$ 和 $B{m, r}$，請計算稀疏矩陣相乘。

$$A_{4,4} = \begin{bmatrix} 0 & 0 & 0 & 0 \\ 5 & 8 & 0 & 0 \\ 0 & 0 & 3 & 0 \\ 0 & 6 & 0 & 0 \\ \end{bmatrix}, \qquad B_{4,4} = \begin{bmatrix} 0 & 0 & 1 & 3 \\ 0 & 5 & 2 & 0 \\ 3 & 5 & 0 & 0 \\ 0 & 2 & 0 & 0 \\ \end{bmatrix}$$

根據 COO 格式，分別轉換矩陣 $A$ 和 $B$ 的所有非零元素，如下表所示

COO of Matrix $A$

`row_index`	`col_index`	`value`
1	0	5
1	1	8
2	2	3
3	1	6

COO of Matrix $B$

`row_index`	`col_index`	`value`
0	2	1
0	3	3
1	1	5
1	2	2
2	0	3
2	1	5
3	1	2

輸入格式

測資只有一組，第一行會有三個整數 $N, \; M, \; R$，分別表示矩陣 $A, \; B$ 的大小。下一行會有兩個整數 $N_A, \; N_B$，接下來會有 $N_A$ 行，每一行表示矩陣 $A$ 的 COO 格式，隨後會有 $N_B$ 行，每一行表示矩陣 $B$ 的 COO 格式。

$0 < N, \; M, \; R \le 10^6$
$0 < N_A, \; N_B \le 10^6$

給予 COO 格式時，先按照 row_index 由小到大，相同情況由 col_index 由小到大的方式給定，保證不會有重複，每一個元素值 $v$ 介於 $1$ 到 $2^{31}-1$ 之間。

輸出格式

輸出 $C_{n, r} = A_{n, m} \times B_{m, r}$ 的雜湊值。

定義 $\mathit{hash}(C_{n, r}) = \sum\nolimits_{e_{i, j} \in C \text{ and } e_{i, j} \neq 0} \mathit{encrypt}((i+1)*(j+1), e_{i, j})$，實際運作的流程可參考以下作法，當然你沒辦法宣告 $N \times M$ 的空間使用：

static inline uint32_t rotate_left(uint32_t x, uint32_t n) {
    return  (x << n) | (x >> (32-n));
}
static inline uint32_t encrypt(uint32_t m, uint32_t key) {
    return (rotate_left(m, key&31) + key)^key;
}
#define MAXN 1024
uint32_t A[MAXN][MAXN], B[MAXN][MAXN];
int main() {
    int x, y, v;
    int N, M, R, nA, nB;
    scanf("%d %d %d", &N, &M, &R);
    scanf("%d %d", &nA, &nB);
    for (int i = 0; i < nA; i++)
        scanf("%d %d %d", &x, &y, &v), A[x][y] = v;
    for (int i = 0; i < nB; i++)
        scanf("%d %d %d", &x, &y, &v), B[x][y] = v;
    
    uint32_t hash = 0;
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < R; j++) {
            uint32_t sum = 0;
            for (int k = 0; k < M; k++)
                sum += A[i][k] * B[k][j];
            if (sum)
                hash += encrypt((i+1)*(j+1), sum);
        }
    }
    printf("%u\n", hash);
    return 0;
}

範例輸入

範例輸出

13093438

範例解釋

$$A_{n, m} \times B_{m, r} = C_{4,4}=\begin{bmatrix} 0 & 0 & 0 & 0 \\ 0 & 40 & 21 & 15 \\ 9 & 15 & 0 & 0 \\ 0 & 30 & 12 & 0 \\ \end{bmatrix}$$

Solution

轉置版本

需要將矩陣 $B$ 轉置後排序，整理成 COO 格式，宣告足夠多的 thread，每一個 thread 分配一個列所有的值，接著使用類似合併排序的方式將答案統計出來。這個版本的效率不算高，因為太多的 branch 以及資料部分上導致 load balance 非常不好，再者是一堆 global memory access。

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <algorithm>
using namespace std;
#define MAXN 1048576
#define MAXL (MAXN<<2)
#define CheckErr(status) { gpuAssert((status), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, int abort=true) {
    if (code != cudaSuccess) {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
__device__ uint32_t rotate_left(uint32_t x, uint32_t n) {
    return  (x << n) | (x >> (32-n));
}
__device__ uint32_t encrypt(uint32_t m, uint32_t key) {
    return (rotate_left(m, key&31) + key)^key;
}
typedef struct ELE {
    int i[MAXL], j[MAXL];
    uint32_t v[MAXL];
} ELE;
ELE LA, LB, LT;
int D[MAXL];
__global__ void SpMM(ELE *LA, ELE *LB, ELE *LC, int aoff[], int boff[], int mb) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    uint32_t hash = 0;
    for (int j = 0; j < mb; j++) {
        int idx1 = aoff[i], idx2 = boff[j];
        int top1 = aoff[i+1], top2 = boff[j+1];
        uint32_t sum = 0;
        int r = LA->i[idx1], c = LB->i[idx2];
        while (idx1 < top1 && idx2 < top2) {
            if (LA->j[idx1] < LB->j[idx2])
                idx1++;
            else if (LA->j[idx1] > LB->j[idx2])
                idx2++;					
            else
                sum += LA->v[idx1] * LB->v[idx2], idx1++, idx2++;
        }
        if (sum) {
            hash += encrypt((r+1)*(c+1), sum);
        }
    }	
    LC->v[i] = hash;
}
void SpMV(int N, int M, int R, ELE &LA, int na, ELE &LB, int nb) {
    int ma = 0, mb = 0;
    static int aoff[MAXN], boff[MAXN];
    for (int i = 0, p = -1; i < na; i++) {
        if (LA.i[i] != p)
            aoff[ma++] = i, p = LA.i[i];
    }
    for (int i = 0, p = -1; i < nb; i++) {
        if (LB.i[i] != p)
            boff[mb++] = i, p = LB.i[i];
    }
    aoff[ma] = na, boff[mb] = nb;
    ELE *cuMA, *cuMB, *cuMC;
    int *cuAoff, *cuBoff;
    cudaMalloc((void **) &cuMA, sizeof(ELE));
    cudaMalloc((void **) &cuMB, sizeof(ELE));
    cudaMalloc((void **) &cuMC, sizeof(ELE));
    cudaMalloc((void **) &cuAoff, (ma+1)*sizeof(int));
    cudaMalloc((void **) &cuBoff, (mb+1)*sizeof(int));
    cudaMemcpy(cuMA, &LA, sizeof(ELE), cudaMemcpyHostToDevice);
    cudaMemcpy(cuMB, &LB, sizeof(ELE), cudaMemcpyHostToDevice);
    cudaMemcpy(cuAoff, aoff, sizeof(int)*(ma+1), cudaMemcpyHostToDevice);
    cudaMemcpy(cuBoff, boff, sizeof(int)*(mb+1), cudaMemcpyHostToDevice);
    int localSz = 1;
    for (int i = 1; i <= 1024; i++)	{
        if (ma%i == 0) {
            localSz = i;
        }
    }	
    dim3 cuBlock(localSz);
    dim3 cuGrid(ma/localSz);
    SpMM<<<cuGrid, cuBlock>>>(cuMA, cuMB, cuMC, cuAoff, cuBoff, mb);
    CheckErr(cudaGetLastError());
    cudaMemcpy(&LA, cuMC, sizeof(ELE), cudaMemcpyDeviceToHost);
    uint32_t hash = 0;
#pragma omp parallel for reduction(+: hash)
    for (int i = 0; i < ma; i++)
        hash += LA.v[i];
    printf("%u\n", hash);
    cudaFree(cuMA);
    cudaFree(cuMB);
    cudaFree(cuMC);
    cudaFree(cuAoff);
    cudaFree(cuBoff);
}
bool cmp(int a, int b) {
    if (LT.i[a] != LT.i[b])
        return LT.i[a] < LT.i[b];
    return LT.j[a] < LT.j[b];
}
int main() {
    int N, M, R, nA, nB;
    while (scanf("%d %d %d", &N, &M, &R) == 3) {
        scanf("%d %d", &nA, &nB);
        for (int i = 0; i < nA; i++)
            scanf("%d %d %d", &LA.i[i], &LA.j[i], &LA.v[i]);
        for (int i = 0; i < nB; i++)
            scanf("%d %d %d", &LT.j[i], &LT.i[i], &LT.v[i]);
        for (int i = 0; i < nB; i++)
            D[i] = i;
        sort(D, D+nB, cmp);
        for (int i = 0; i < nB; i++)
            LB.i[i] = LT.i[D[i]], LB.j[i] = LT.j[D[i]], LB.v[i] = LT.v[D[i]];
        SpMV(N, M, R, LA, nA, LB, nB);
    }
    return 0;
}

額外空間

每一個 thread 計算 $C_{i, [L, R]}$，然後用額外的空間進行 mapping，並且把這個空間宣告在 private memory，也就是 register 上來加速。這一個做法比上述作法還快個三到四倍。

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <algorithm>
#include <omp.h>
using namespace std;
#define MAXN 1000005
#define MAXINT 256
#define MAXBLK 1024
#define MAXL (MAXN)
#define CheckErr(status) { gpuAssert((status), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, int abort=true) {
    if (code != cudaSuccess) {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
typedef struct ELE {
    int i[MAXL], j[MAXL];
    uint32_t v[MAXL];
} ELE;
typedef struct AUX {
    int aoff[MAXL], boff[MAXL], bmap[MAXL];
    int ma, mb, na, nb;
} AUX;
ELE LA, LB;
AUX AU;
uint32_t H[MAXL];
__device__ inline uint32_t rotate_left(uint32_t x, uint32_t n) {
    return  (x << n) | (x >> (32-n));
}
__device__ inline uint32_t encrypt(uint32_t m, uint32_t key) {
    return (rotate_left(m, key&31) + key)^key;
}
__global__ void SpMM(ELE *LA, ELE *LB, uint32_t *H, AUX *AU, int T) {
#define INTSZ MAXINT
    __shared__ uint32_t buff[MAXBLK];
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int x = idx / T, y = idx % T;
    int L = y * INTSZ;
    int R = L + INTSZ;
    uint32_t tmp[INTSZ];
    for (int i = 0; i < INTSZ; i++)
        tmp[i] = 0;
    int lx = AU->aoff[x], rx = AU->aoff[x+1];
    for (int i = lx; i < rx; i++) {
        if (AU->bmap[LA->j[i]] != -1) {
            int k = AU->bmap[LA->j[i]];
            uint32_t val = LA->v[i];
            int ly = AU->boff[k], ry = AU->boff[k+1];
            for (int j = ly; j < ry; j++) {
                if (L <= LB->j[j] && LB->j[j] < R)
                    tmp[LB->j[j] - L] += val * LB->v[j];
            }
        }
    }
    uint32_t hash = 0;
    uint32_t X = LA->i[AU->aoff[x]];
    for (int i = 0; i < INTSZ; i++) {
        if (tmp[i])
            hash += encrypt((X+1)*(i+L+1), tmp[i]);
    }
    buff[threadIdx.x] = hash;
    __syncthreads();
    for (int i = blockDim.x>>1; i; i >>= 1) {
        if (threadIdx.x < i)
            buff[threadIdx.x] += buff[threadIdx.x + i];
        __syncthreads();
    }
    if (threadIdx.x == 0)
        H[blockIdx.x] = buff[0];
#undef INTSZ
}
void SpMV(int N, int M, int R, ELE &LA, int na, ELE &LB, int nb) {
    AU.ma = 0, AU.mb = 0;
    AU.na = na, AU.nb = nb;
    memset(AU.bmap, -1, sizeof(AU.bmap));
#pragma omp parallel sections
    {
#pragma omp section 
        {
            for (int i = 0, p = -1; i < na; i++) {
                if (LA.i[i] != p)
                    AU.aoff[AU.ma++] = i, p = LA.i[i];
            }
            AU.aoff[AU.ma] = na;
        }
#pragma omp section
        {
            for (int i = 0, p = -1; i < nb; i++) {
                if (LB.i[i] != p) {
                    AU.bmap[LB.i[i]] = AU.mb;
                    AU.boff[AU.mb++] = i, p = LB.i[i];
                }
            }
            AU.boff[AU.mb] = nb;
        }
    }
    uint32_t *cuHH;
    ELE *cuMA, *cuMB;
    AUX *cuAU;
    cudaMalloc((void **) &cuHH, sizeof(H));
    cudaMalloc((void **) &cuMA, sizeof(ELE));
    cudaMalloc((void **) &cuMB, sizeof(ELE));
    cudaMalloc((void **) &cuAU, sizeof(AUX));
    cudaMemcpy(cuHH, &H, sizeof(H), cudaMemcpyHostToDevice);
    cudaMemcpy(cuMA, &LA, sizeof(ELE), cudaMemcpyHostToDevice);
    cudaMemcpy(cuMB, &LB, sizeof(ELE), cudaMemcpyHostToDevice);
    cudaMemcpy(cuAU, &AU, sizeof(AUX), cudaMemcpyHostToDevice);
    int roundR = (R + MAXINT-1) / MAXINT * MAXINT;
    int TOT = N * roundR;
    while (TOT / MAXINT % MAXBLK)
        TOT ++;
    dim3 cuBlock(MAXBLK);	// MAXTHREAD
    dim3 cuGrid(TOT / MAXINT / MAXBLK);
    // printf("%d\n", TOT/MAXINT);
    SpMM<<<cuGrid, cuBlock>>>(cuMA, cuMB, cuHH, cuAU, roundR / MAXINT);
    CheckErr(cudaGetLastError());
    cudaMemcpy(H, cuHH, sizeof(H), cudaMemcpyDeviceToHost);
    uint32_t hash = 0;
#ifdef _OPENMP
    omp_set_num_threads(4);
#endif
#pragma omp parallel for reduction(+: hash)
    for (int i = 0; i < TOT/MAXINT/MAXBLK; i++)
        hash += H[i];
    printf("%u\n", hash);
    cudaFree(cuMA);
    cudaFree(cuMB);
    cudaFree(cuHH);
    cudaFree(cuAU);
}
inline int readchar() {
    const int N = 1048576;
    static char buf[N];
    static char *p = buf, *end = buf;
    if(p == end) {
        if((end = buf + fread(buf, 1, N, stdin)) == buf) return EOF;
        p = buf;
    }
    return *p++;
}
inline int ReadInt(int *x) {
    static char c, neg;
    while((c = readchar()) < '-')    {if(c == EOF) return 0;}
    neg = (c == '-') ? -1 : 1;
    *x = (neg == 1) ? c-'0' : 0;
    while((c = readchar()) >= '0')
        *x = (*x << 3) + (*x << 1) + c-'0';
    *x *= neg;
    return 1;
}
inline uint32_t ReadUint(uint32_t *x) {
    static char c, neg;
    while((c = readchar()) < '-')    {if(c == EOF) return 0;}
    neg = (c == '-') ? -1 : 1;
    *x = (neg == 1) ? c-'0' : 0;
    while((c = readchar()) >= '0')
        *x = (*x << 3) + (*x << 1) + c-'0';
    *x *= neg;
    return 1;
}
int main() {
    int N, M, R, nA, nB;
//	scanf("%d %d %d", &N, &M, &R);
//	scanf("%d %d", &nA, &nB);
    ReadInt(&N), ReadInt(&M), ReadInt(&R);
    ReadInt(&nA), ReadInt(&nB);
        for (int i = 0; i < nA; i++)
            ReadInt(&LA.i[i]), ReadInt(&LA.j[i]), ReadUint(&LA.v[i]);
//			scanf("%d%d%d", &LA.i[i], &LA.j[i], &LA.v[i]);
        for (int i = 0; i < nB; i++)
            ReadInt(&LB.i[i]), ReadInt(&LB.j[i]), ReadUint(&LB.v[i]);
//			scanf("%d%d%d", &LB.i[i], &LB.j[i], &LB.v[i]);
        SpMV(N, M, R, LA, nA, LB, nB);
//	}
    return 0;
}

Morris' Blog