批改娘 10107. Sparse Matrix Multiplication (CUDA)

contents

  1. 1. 題目描述
    1. 1.1. COO of Matrix $A$
    2. 1.2. COO of Matrix $B$
  2. 2. 輸入格式
  3. 3. 輸出格式
  4. 4. 範例輸入
  5. 5. 範例輸出
  6. 6. 範例解釋
  7. 7. Solution
    1. 7.1. 轉置版本
    2. 7.2. 額外空間

題目描述

稀疏矩陣為大部份元素皆為零的矩陣,在科學與工程領域中求解線性模型時經常出現大型的稀疏矩陣。現在給予最常見的 Coordinate Format (簡稱 COO 格式),請問兩個矩陣相乘結果為何。

給予矩陣 $A{n, m}$ 和 $B{m, r}$,請計算稀疏矩陣相乘。

$$A_{4,4} = \begin{bmatrix} 0 & 0 & 0 & 0 \\ 5 & 8 & 0 & 0 \\ 0 & 0 & 3 & 0 \\ 0 & 6 & 0 & 0 \\ \end{bmatrix}, \qquad B_{4,4} = \begin{bmatrix} 0 & 0 & 1 & 3 \\ 0 & 5 & 2 & 0 \\ 3 & 5 & 0 & 0 \\ 0 & 2 & 0 & 0 \\ \end{bmatrix}$$

根據 COO 格式,分別轉換矩陣 $A$ 和 $B$ 的所有非零元素,如下表所示

COO of Matrix $A$

row_index col_index value
1 0 5
1 1 8
2 2 3
3 1 6

COO of Matrix $B$

row_index col_index value
0 2 1
0 3 3
1 1 5
1 2 2
2 0 3
2 1 5
3 1 2

輸入格式

測資只有一組,第一行會有三個整數 $N, \; M, \; R$,分別表示矩陣 $A, \; B$ 的大小。下一行會有兩個整數 $N_A, \; N_B$,接下來會有 $N_A$ 行,每一行表示矩陣 $A$ 的 COO 格式,隨後會有 $N_B$ 行,每一行表示矩陣 $B$ 的 COO 格式。

  • $0 < N, \; M, \; R \le 10^6$
  • $0 < N_A, \; N_B \le 10^6$

給予 COO 格式時,先按照 row_index 由小到大,相同情況由 col_index 由小到大的方式給定,保證不會有重複,每一個元素值 $v$ 介於 $1$ 到 $2^{31}-1$ 之間。

輸出格式

輸出 $C_{n, r} = A_{n, m} \times B_{m, r}$ 的雜湊值。

定義 $\mathit{hash}(C_{n, r}) = \sum\nolimits_{e_{i, j} \in C \text{ and } e_{i, j} \neq 0} \mathit{encrypt}((i+1)*(j+1), e_{i, j})$,實際運作的 流程 可參考以下作法,當然你沒辦法宣告 $N \times M$ 的空間使用:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
static inline uint32_t rotate_left(uint32_t x, uint32_t n) {
return (x << n) | (x >> (32-n));
}
static inline uint32_t encrypt(uint32_t m, uint32_t key) {
return (rotate_left(m, key&31) + key)^key;
}
#define MAXN 1024
uint32_t A[MAXN][MAXN], B[MAXN][MAXN];
int main() {
int x, y, v;
int N, M, R, nA, nB;
scanf("%d %d %d", &N, &M, &R);
scanf("%d %d", &nA, &nB);
for (int i = 0; i < nA; i++)
scanf("%d %d %d", &x, &y, &v), A[x][y] = v;
for (int i = 0; i < nB; i++)
scanf("%d %d %d", &x, &y, &v), B[x][y] = v;

uint32_t hash = 0;
for (int i = 0; i < N; i++) {
for (int j = 0; j < R; j++) {
uint32_t sum = 0;
for (int k = 0; k < M; k++)
sum += A[i][k] * B[k][j];
if (sum)
hash += encrypt((i+1)*(j+1), sum);
}
}
printf("%u\n", hash);
return 0;
}

範例輸入

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
4 4 4
4 7

1 0 5
1 1 8
2 2 3
3 1 6

0 2 1
0 3 3
1 1 5
1 2 2
2 0 3
2 1 5
3 1 2

範例輸出

1
13093438

範例解釋

$$A_{n, m} \times B_{m, r} = C_{4,4}=\begin{bmatrix} 0 & 0 & 0 & 0 \\ 0 & 40 & 21 & 15 \\ 9 & 15 & 0 & 0 \\ 0 & 30 & 12 & 0 \\ \end{bmatrix}$$

Solution

轉置版本

需要將矩陣 $B$ 轉置後排序,整理成 COO 格式,宣告足夠多的 thread,每一個 thread 分配一個列所有的值,接著使用類似合併排序的方式將答案統計出來。這個版本的效率不算高,因為太多的 branch 以及資料部分上導致 load balance 非常不好,再者是一堆 global memory access。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <algorithm>
using namespace std;
#define MAXN 1048576
#define MAXL (MAXN<<2)
#define CheckErr(status) { gpuAssert((status), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, int abort=true) {
if (code != cudaSuccess) {
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}

__device__ uint32_t rotate_left(uint32_t x, uint32_t n) {
return (x << n) | (x >> (32-n));
}
__device__ uint32_t encrypt(uint32_t m, uint32_t key) {
return (rotate_left(m, key&31) + key)^key;
}
typedef struct ELE {
int i[MAXL], j[MAXL];
uint32_t v[MAXL];
} ELE;
ELE LA, LB, LT;
int D[MAXL];
__global__ void SpMM(ELE *LA, ELE *LB, ELE *LC, int aoff[], int boff[], int mb) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
uint32_t hash = 0;
for (int j = 0; j < mb; j++) {
int idx1 = aoff[i], idx2 = boff[j];
int top1 = aoff[i+1], top2 = boff[j+1];
uint32_t sum = 0;
int r = LA->i[idx1], c = LB->i[idx2];
while (idx1 < top1 && idx2 < top2) {
if (LA->j[idx1] < LB->j[idx2])
idx1++;
else if (LA->j[idx1] > LB->j[idx2])
idx2++;
else
sum += LA->v[idx1] * LB->v[idx2], idx1++, idx2++;
}

if (sum) {
hash += encrypt((r+1)*(c+1), sum);
}
}
LC->v[i] = hash;
}
void SpMV(int N, int M, int R, ELE &LA, int na, ELE &LB, int nb) {
int ma = 0, mb = 0;
static int aoff[MAXN], boff[MAXN];

for (int i = 0, p = -1; i < na; i++) {
if (LA.i[i] != p)
aoff[ma++] = i, p = LA.i[i];
}
for (int i = 0, p = -1; i < nb; i++) {
if (LB.i[i] != p)
boff[mb++] = i, p = LB.i[i];
}
aoff[ma] = na, boff[mb] = nb;

ELE *cuMA, *cuMB, *cuMC;
int *cuAoff, *cuBoff;
cudaMalloc((void **) &cuMA, sizeof(ELE));
cudaMalloc((void **) &cuMB, sizeof(ELE));
cudaMalloc((void **) &cuMC, sizeof(ELE));
cudaMalloc((void **) &cuAoff, (ma+1)*sizeof(int));
cudaMalloc((void **) &cuBoff, (mb+1)*sizeof(int));

cudaMemcpy(cuMA, &LA, sizeof(ELE), cudaMemcpyHostToDevice);
cudaMemcpy(cuMB, &LB, sizeof(ELE), cudaMemcpyHostToDevice);
cudaMemcpy(cuAoff, aoff, sizeof(int)*(ma+1), cudaMemcpyHostToDevice);
cudaMemcpy(cuBoff, boff, sizeof(int)*(mb+1), cudaMemcpyHostToDevice);

int localSz = 1;
for (int i = 1; i <= 1024; i++) {
if (ma%i == 0) {
localSz = i;
}
}
dim3 cuBlock(localSz);
dim3 cuGrid(ma/localSz);
SpMM<<<cuGrid, cuBlock>>>(cuMA, cuMB, cuMC, cuAoff, cuBoff, mb);
CheckErr(cudaGetLastError());
cudaMemcpy(&LA, cuMC, sizeof(ELE), cudaMemcpyDeviceToHost);
uint32_t hash = 0;
#pragma omp parallel for reduction(+: hash)
for (int i = 0; i < ma; i++)
hash += LA.v[i];
printf("%u\n", hash);
cudaFree(cuMA);
cudaFree(cuMB);
cudaFree(cuMC);

cudaFree(cuAoff);
cudaFree(cuBoff);
}
bool cmp(int a, int b) {
if (LT.i[a] != LT.i[b])
return LT.i[a] < LT.i[b];
return LT.j[a] < LT.j[b];
}
int main() {
int N, M, R, nA, nB;
while (scanf("%d %d %d", &N, &M, &R) == 3) {
scanf("%d %d", &nA, &nB);
for (int i = 0; i < nA; i++)
scanf("%d %d %d", &LA.i[i], &LA.j[i], &LA.v[i]);
for (int i = 0; i < nB; i++)
scanf("%d %d %d", &LT.j[i], &LT.i[i], &LT.v[i]);
for (int i = 0; i < nB; i++)
D[i] = i;
sort(D, D+nB, cmp);
for (int i = 0; i < nB; i++)
LB.i[i] = LT.i[D[i]], LB.j[i] = LT.j[D[i]], LB.v[i] = LT.v[D[i]];
SpMV(N, M, R, LA, nA, LB, nB);
}
return 0;
}

額外空間

每一個 thread 計算 $C_{i, [L, R]}$,然後用額外的空間進行 mapping,並且把這個空間宣告在 private memory,也就是 register 上來加速。這一個做法比上述作法還快個三到四倍。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <algorithm>
#include <omp.h>
using namespace std;
#define MAXN 1000005
#define MAXINT 256
#define MAXBLK 1024
#define MAXL (MAXN)
#define CheckErr(status) { gpuAssert((status), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, int abort=true) {
if (code != cudaSuccess) {
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
typedef struct ELE {
int i[MAXL], j[MAXL];
uint32_t v[MAXL];
} ELE;
typedef struct AUX {
int aoff[MAXL], boff[MAXL], bmap[MAXL];
int ma, mb, na, nb;
} AUX;
ELE LA, LB;
AUX AU;
uint32_t H[MAXL];
__device__ inline uint32_t rotate_left(uint32_t x, uint32_t n) {
return (x << n) | (x >> (32-n));
}
__device__ inline uint32_t encrypt(uint32_t m, uint32_t key) {
return (rotate_left(m, key&31) + key)^key;
}
__global__ void SpMM(ELE *LA, ELE *LB, uint32_t *H, AUX *AU, int T) {
#define INTSZ MAXINT
__shared__ uint32_t buff[MAXBLK];
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int x = idx / T, y = idx % T;
int L = y * INTSZ;
int R = L + INTSZ;
uint32_t tmp[INTSZ];
for (int i = 0; i < INTSZ; i++)
tmp[i] = 0;
int lx = AU->aoff[x], rx = AU->aoff[x+1];
for (int i = lx; i < rx; i++) {
if (AU->bmap[LA->j[i]] != -1) {
int k = AU->bmap[LA->j[i]];
uint32_t val = LA->v[i];
int ly = AU->boff[k], ry = AU->boff[k+1];
for (int j = ly; j < ry; j++) {
if (L <= LB->j[j] && LB->j[j] < R)
tmp[LB->j[j] - L] += val * LB->v[j];
}
}
}
uint32_t hash = 0;
uint32_t X = LA->i[AU->aoff[x]];
for (int i = 0; i < INTSZ; i++) {
if (tmp[i])
hash += encrypt((X+1)*(i+L+1), tmp[i]);
}
buff[threadIdx.x] = hash;
__syncthreads();
for (int i = blockDim.x>>1; i; i >>= 1) {
if (threadIdx.x < i)
buff[threadIdx.x] += buff[threadIdx.x + i];
__syncthreads();
}
if (threadIdx.x == 0)
H[blockIdx.x] = buff[0];
#undef INTSZ
}
void SpMV(int N, int M, int R, ELE &LA, int na, ELE &LB, int nb) {
AU.ma = 0, AU.mb = 0;
AU.na = na, AU.nb = nb;
memset(AU.bmap, -1, sizeof(AU.bmap));
#pragma omp parallel sections
{
#pragma omp section
{
for (int i = 0, p = -1; i < na; i++) {
if (LA.i[i] != p)
AU.aoff[AU.ma++] = i, p = LA.i[i];
}
AU.aoff[AU.ma] = na;
}
#pragma omp section
{
for (int i = 0, p = -1; i < nb; i++) {
if (LB.i[i] != p) {
AU.bmap[LB.i[i]] = AU.mb;
AU.boff[AU.mb++] = i, p = LB.i[i];
}
}
AU.boff[AU.mb] = nb;
}
}
uint32_t *cuHH;
ELE *cuMA, *cuMB;
AUX *cuAU;

cudaMalloc((void **) &cuHH, sizeof(H));
cudaMalloc((void **) &cuMA, sizeof(ELE));
cudaMalloc((void **) &cuMB, sizeof(ELE));
cudaMalloc((void **) &cuAU, sizeof(AUX));

cudaMemcpy(cuHH, &H, sizeof(H), cudaMemcpyHostToDevice);
cudaMemcpy(cuMA, &LA, sizeof(ELE), cudaMemcpyHostToDevice);
cudaMemcpy(cuMB, &LB, sizeof(ELE), cudaMemcpyHostToDevice);
cudaMemcpy(cuAU, &AU, sizeof(AUX), cudaMemcpyHostToDevice);

int roundR = (R + MAXINT-1) / MAXINT * MAXINT;
int TOT = N * roundR;
while (TOT / MAXINT % MAXBLK)
TOT ++;
dim3 cuBlock(MAXBLK); // MAXTHREAD
dim3 cuGrid(TOT / MAXINT / MAXBLK);
// printf("%d\n", TOT/MAXINT);
SpMM<<<cuGrid, cuBlock>>>(cuMA, cuMB, cuHH, cuAU, roundR / MAXINT);
CheckErr(cudaGetLastError());
cudaMemcpy(H, cuHH, sizeof(H), cudaMemcpyDeviceToHost);
uint32_t hash = 0;
#ifdef _OPENMP
omp_set_num_threads(4);
#endif
#pragma omp parallel for reduction(+: hash)
for (int i = 0; i < TOT/MAXINT/MAXBLK; i++)
hash += H[i];
printf("%u\n", hash);

cudaFree(cuMA);
cudaFree(cuMB);
cudaFree(cuHH);
cudaFree(cuAU);
}
inline int readchar() {
const int N = 1048576;
static char buf[N];
static char *p = buf, *end = buf;
if(p == end) {
if((end = buf + fread(buf, 1, N, stdin)) == buf) return EOF;
p = buf;
}
return *p++;
}
inline int ReadInt(int *x) {
static char c, neg;
while((c = readchar()) < '-') {if(c == EOF) return 0;}
neg = (c == '-') ? -1 : 1;
*x = (neg == 1) ? c-'0' : 0;
while((c = readchar()) >= '0')
*x = (*x << 3) + (*x << 1) + c-'0';
*x *= neg;
return 1;
}
inline uint32_t ReadUint(uint32_t *x) {
static char c, neg;
while((c = readchar()) < '-') {if(c == EOF) return 0;}
neg = (c == '-') ? -1 : 1;
*x = (neg == 1) ? c-'0' : 0;
while((c = readchar()) >= '0')
*x = (*x << 3) + (*x << 1) + c-'0';
*x *= neg;
return 1;
}

int main() {
int N, M, R, nA, nB;
// scanf("%d %d %d", &N, &M, &R);
// scanf("%d %d", &nA, &nB);
ReadInt(&N), ReadInt(&M), ReadInt(&R);
ReadInt(&nA), ReadInt(&nB);
for (int i = 0; i < nA; i++)
ReadInt(&LA.i[i]), ReadInt(&LA.j[i]), ReadUint(&LA.v[i]);
// scanf("%d%d%d", &LA.i[i], &LA.j[i], &LA.v[i]);
for (int i = 0; i < nB; i++)
ReadInt(&LB.i[i]), ReadInt(&LB.j[i]), ReadUint(&LB.v[i]);
// scanf("%d%d%d", &LB.i[i], &LB.j[i], &LB.v[i]);
SpMV(N, M, R, LA, nA, LB, nB);
// }
return 0;
}