批改娘 10100. Fast Matrix Multiplication (CUDA)

contents

  1. 1. 題目描述
    1. 1.1. sequence.c
  2. 2. 輸入格式
  3. 3. 輸出格式
  4. 4. 範例輸入
  5. 5. 範例輸出
  6. 6. 編譯參數
  7. 7. Solution

題目描述

計算兩個大小為 $N \times N$ 方陣 $A, \; B$ 相乘結果 $C = A \times B$。為了節省輸入輸出時間,採用亂數產生,可以參考下述程式碼,並改寫成 CUDA 的版本進行加速。

sequence.c

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#include <stdio.h>
#include <stdint.h>
// #define DEBUG
#define UINT uint32_t
#define MAXN 1024
/* Compute C = A * B over the N x N leading submatrices of MAXN-wide
 * row-major arrays.  uint32_t wrap-around in the products/sums is
 * intentional: the judge's checksum is defined over wrapped values. */
void multiply(int N, UINT A[][MAXN], UINT B[][MAXN], UINT C[][MAXN]) {
    for (int row = 0; row < N; row++) {
        for (int col = 0; col < N; col++) {
            UINT acc = 0; /* overflow, let it go. */
            for (int k = 0; k < N; k++)
                acc += A[row][k] * B[k][col];
            C[row][col] = acc;
        }
    }
}
/* Fill the N x N leading submatrix of A with the problem's PRNG:
 * x_{next} = (x*x + c + i + j) mod (N*N), starting from x = 2.
 * uint32_t overflow in x*x is part of the defined sequence. */
void rand_gen(UINT c, int N, UINT A[][MAXN]) {
    UINT n = (UINT)(N * N);
    UINT x = 2;
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            x = (x * x + c + i + j) % n;
            A[i][j] = x;
        }
    }
}
/* Debug aid: dump the N x N leading submatrix of A to stderr,
 * one bracketed row per line (enabled via the DEBUG macro). */
void print_matrix(int N, UINT A[][MAXN]) {
    for (int row = 0; row < N; row++) {
        fprintf(stderr, "[");
        for (int col = 0; col < N; col++)
            fprintf(stderr, " %u", A[row][col]);
        fprintf(stderr, " ]\n");
    }
}
/* Fold the N x N leading submatrix of A into a 32-bit hash,
 * row-major order: h = (h + a_ij) * 2654435761.  The product is
 * truncated to 32 bits by the UINT assignment, which is the
 * defined behavior of the checksum. */
UINT signature(int N, UINT A[][MAXN]) {
    UINT h = 0;
    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++)
            h = (h + A[i][j]) * 2654435761LU;
    return h;
}
/* File scope so the three 4 MiB matrices live in static storage
 * instead of overflowing the stack. */
UINT A[MAXN][MAXN], B[MAXN][MAXN], C[MAXN][MAXN];

/* Read N and the two seeds, generate A and B, multiply, and print
 * the 32-bit signature of the product. */
int main() {
    int n;
    uint32_t seedA, seedB;
    scanf("%d %u %u", &n, &seedA, &seedB);
    rand_gen(seedA, n, A);
    rand_gen(seedB, n, B);
    multiply(n, A, B, C);
#ifdef DEBUG
    print_matrix(n, A);
    print_matrix(n, B);
    print_matrix(n, C);
#endif
    printf("%u\n", signature(n, C));
    return 0;
}

輸入格式

測資只有一組,包含三個整數 $N, S_1, S_2$,分別為方陣大小 $N \times N$,產生矩陣 $A$、$B$ 的亂數種子。

  • $64 \le N \le 1024$,保證 $N \mod 64 \equiv 0$
  • $0 \le S_1, \; S_2 < 2^{31}$

輸出格式

輸出一行雜湊值 $H$,可參考 sequence.c 的流程。

範例輸入

1
64 1 2

範例輸出

1
3376147904

編譯參數

1
$ nvcc -Xcompiler "-O2 -fopenmp" main.cu -o main

Solution

與 OpenCL 版本相比,是否乾淨許多呢?完全不用管到底 Context 和 Kernel Program 如何建造,但是這些方便性都是預設值帶來的結果。

CUDA 預設會在 device 0 上運作,也就是 PCI-E 順位上的第一個顯卡,若要更動藉由函數 cudaSetDevice(deviceId); 完成,而 OpenCL 的 CommandQueue 則對應到 CUDA 的 Stream,在稍後的題目中,我會提供些許的範例使用 CUDA 這些函數,協助設計的計算流程可以加快。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#include <stdio.h>
#include <assert.h>
#include <inttypes.h>
#include <string.h>
#include <cuda.h>

#define MAXN 1024
#define GPULOCAL 64
#define UNLOOP 8
/* Wrap any cudaError_t-returning call; reports file/line on failure. */
#define CheckErr(status) { gpuAssert((status), __FILE__, __LINE__); }
/* Print a readable CUDA error and (by default) abort the process. */
inline void gpuAssert(cudaError_t code, const char *file, int line, int abort=true) {
    if (code == cudaSuccess)
        return;
    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
        exit(code);
}
uint32_t hostA[MAXN*MAXN], hostB[MAXN*MAXN], hostC[MAXN*MAXN];

int N = MAXN;

/* C = A * B, N x N row-major, one thread per output ROW.
 * Launch layout: 1-D grid of N/GPULOCAL blocks, GPULOCAL threads each.
 * Preconditions: N <= MAXN and N % UNLOOP == 0 (the judge guarantees
 * N % 64 == 0, so the unrolled loop covers every k).
 *
 * Each thread caches its whole row of A in rbuf; for each output
 * column, the block cooperatively stages that column of B into shared
 * memory (cbuf).  uint32_t wrap-around is intentional — the checksum
 * is defined over wrapped arithmetic. */
__global__ void matrixMul(uint32_t A[], uint32_t B[], uint32_t C[], int N) {
    __shared__ uint32_t cbuf[MAXN+1];  /* one column of B, shared by the block */
    uint32_t rbuf[MAXN];               /* this thread's row of A */
    int r = blockIdx.x * blockDim.x + threadIdx.x;
    int localId = threadIdx.x;
    int localSz = blockDim.x;
    for (int i = 0; i < N; i++)
        rbuf[i] = A[r * N + i];
    for (int c = 0; c < N; c++) {
        /* Stage column c of B, strided across the block's threads. */
        for (int cr = localId; cr < N; cr += localSz)
            cbuf[cr] = B[cr * N + c];
        __syncthreads();  /* cbuf fully written before any thread reads it */
        uint32_t sum = 0;
        for (int k = 0; k+UNLOOP-1 < N; k += UNLOOP) {
            sum += rbuf[k+0] * cbuf[k+0];
            sum += rbuf[k+1] * cbuf[k+1];
            sum += rbuf[k+2] * cbuf[k+2];
            sum += rbuf[k+3] * cbuf[k+3];
            sum += rbuf[k+4] * cbuf[k+4];
            sum += rbuf[k+5] * cbuf[k+5];
            sum += rbuf[k+6] * cbuf[k+6];
            sum += rbuf[k+7] * cbuf[k+7];
        }
        C[r * N + c] = sum;
        /* BUGFIX: barrier before the next iteration overwrites cbuf.
         * Without it, threads that finish early clobber shared values
         * that slower threads are still reading — a data race that
         * yields nondeterministic results. */
        __syncthreads();
    }
}
/* Fill an N x N row-major matrix with the problem's PRNG:
 * x_{next} = (x*x + seed + i + j) mod (N*N), x starting at 2.
 * uint32_t overflow in x*x is part of the defined sequence. */
static void genMatrix(uint32_t seed, int N, uint32_t M[]) {
    uint32_t x = 2, n = (uint32_t)(N * N);
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            /* BUGFIX: the reference generator uses "% n".  The old
             * "& (n-1)" shortcut is equivalent only when n is a power
             * of two, but N may be any multiple of 64 (e.g. N = 192
             * gives n = 36864), so the masking produced a different
             * matrix and therefore a wrong signature. */
            x = (x * x + seed + i + j) % n;
            M[i*N + j] = x;
        }
    }
}

/* Read N and the two seeds from stdin, then generate hostA and hostB. */
void readIn() {
    uint32_t c1, c2;
    assert(scanf("%d %u %u", &N, &c1, &c2) == 3);
    genMatrix(c1, N, hostA);
    genMatrix(c2, N, hostB);
}
/* Hash the N*N elements of hostC in row-major order with the same
 * multiplicative fold as the reference sequence.c, and print the
 * resulting 32-bit value. */
void writeOut() {
    uint32_t h = 0;
    int total = N * N;
    for (int i = 0; i < total; i++)
        h = (h + hostC[i]) * 2654435761LU;
    printf("%u\n", h);
}
/* Read input, run the GPU multiply, print the signature.
 * Uses the default device and default stream; blocking cudaMemcpy
 * calls provide the host/device synchronization. */
int main(int argc, char *argv[]) {
    readIn();
    size_t bytes = (size_t)N * N * sizeof(uint32_t);
    uint32_t *cuMtxC, *cuMtxA, *cuMtxB;
    /* BUGFIX: check every runtime call — the original ignored the
     * return codes of cudaMalloc/cudaMemcpy, so an allocation failure
     * would surface only as a mysterious later error. */
    CheckErr(cudaMalloc((void **) &cuMtxC, bytes));
    CheckErr(cudaMalloc((void **) &cuMtxA, bytes));
    CheckErr(cudaMalloc((void **) &cuMtxB, bytes));
    CheckErr(cudaMemcpy(cuMtxA, hostA, bytes, cudaMemcpyHostToDevice));
    CheckErr(cudaMemcpy(cuMtxB, hostB, bytes, cudaMemcpyHostToDevice));
    dim3 cuBlock(GPULOCAL);
    dim3 cuGrid(N/GPULOCAL);
    matrixMul<<<cuGrid, cuBlock>>>(cuMtxA, cuMtxB, cuMtxC, N);
    CheckErr(cudaGetLastError());  /* catches bad launch configuration */
    /* Blocking copy also surfaces any asynchronous kernel error. */
    CheckErr(cudaMemcpy(hostC, cuMtxC, bytes, cudaMemcpyDeviceToHost));
    writeOut();
    /* BUGFIX: free all three device buffers (cuMtxA/cuMtxB leaked). */
    CheckErr(cudaFree(cuMtxA));
    CheckErr(cudaFree(cuMtxB));
    CheckErr(cudaFree(cuMtxC));
    return 0;
}