批改娘 10102. Matrix Calculator (CUDA)

contents

  1. 1. 題目描述
    1. 1.1. sequence.c
  2. 2. 輸入格式
  3. 3. 輸出格式
  4. 4. 範例輸入 1
  5. 5. 範例輸出 1
  6. 6. 範例輸入 2
  7. 7. 範例輸出 2
  8. 8. 編譯參數
  9. 9. Solution

題目描述

小明的數學作業要計算方陣,現在請你幫幫他!

題目給定數個 $N \times N$ 的矩陣和 $2$ 小題。

  • $X = AB+CD$
  • $Y = ABE+CDF$

sequence.c

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#include <stdio.h>
#include <stdint.h>
// #define DEBUG
#define UINT uint32_t
#define MAXN 1024
void multiply(int N, UINT A[][MAXN], UINT B[][MAXN], UINT C[][MAXN]) {
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
UINT sum = 0; // overflow, let it go.
for (int k = 0; k < N; k++)
sum += A[i][k] * B[k][j];
C[i][j] = sum;
}
}
}
void add(int N, UINT A[][MAXN], UINT B[][MAXN], UINT C[][MAXN]) {
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++)
C[i][j] = A[i][j] + B[i][j];
}
}
void rand_gen(UINT c, int N, UINT A[][MAXN]) {
UINT x = 2, n = N*N;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
x = (x * x + c + i + j)%n;
A[i][j] = x;
}
}
}
void print_matrix(int N, UINT A[][MAXN]) {
for (int i = 0; i < N; i++) {
fprintf(stderr, "[");
for (int j = 0; j < N; j++)
fprintf(stderr, " %u", A[i][j]);
fprintf(stderr, " ]\n");
}
}
UINT signature(int N, UINT A[][MAXN]) {
UINT h = 0;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++)
h = (h + A[i][j]) * 2654435761LU;
}
return h;
}
UINT IN[6][MAXN][MAXN], TMP[6][MAXN][MAXN];
int main() {
int N, S[6];
scanf("%d", &N);
for (int i = 0; i < 6; i++) {
scanf("%d", &S[i]);
rand_gen(S[i], N, IN[i]);
}
// AB
multiply(N, IN[0], IN[1], TMP[0]);
// CD
multiply(N, IN[2], IN[3], TMP[1]);
// AB+CD
add(N, TMP[0], TMP[1], TMP[2]);
printf("%u\n", signature(N, TMP[2]));

// ABE
multiply(N, TMP[0], IN[4], TMP[3]);
// CDF
multiply(N, TMP[1], IN[5], TMP[4]);
// ABE+CDF
add(N, TMP[3], TMP[4], TMP[5]);
printf("%u\n", signature(N, TMP[5]));
return 0;
}

輸入格式

測資只有一組,第一行會有一個整數 $N$,表示題目給定 $N \times N$ 矩陣,第二行上會有 $6$ 個整數,分別為矩陣 $A, B, C, D, E, F$ 的生成種子。

  • $1 \le N \le 1024$
  • $0 \le S_i \le 2^{31}$

輸出格式

輸出兩行 $X$ 和 $Y$ 的雜湊值,可參考 sequence.c 的流程。

範例輸入 1

1
2
2
0 1 2 3 4 5
$$A = \begin{bmatrix} 0 & 1\\ 2 & 2 \end{bmatrix}, B = \begin{bmatrix} 1 & 3\\ 3 & 0 \end{bmatrix}, C = \begin{bmatrix} 2 & 3\\ 0 & 0 \end{bmatrix}, D = \begin{bmatrix} 3 & 1\\ 1 & 2 \end{bmatrix}, E = \begin{bmatrix} 0 & 1\\ 2 & 2 \end{bmatrix}, F = \begin{bmatrix} 1 & 3\\ 3 & 0 \end{bmatrix}$$ $$AB = \begin{bmatrix} 3 & 0\\ 8 & 6 \end{bmatrix}, CD = \begin{bmatrix} 9 & 8\\ 0 & 0 \end{bmatrix}, AB+CD = \begin{bmatrix} 12 & 8\\ 8 & 6 \end{bmatrix}\\ ABE = \begin{bmatrix} 0 & 3\\ 12 & 20 \end{bmatrix}, CDF = \begin{bmatrix} 33 & 27\\ 0 & 0 \end{bmatrix}, ABE+CDF = \begin{bmatrix} 33 & 30\\ 12 & 20 \end{bmatrix}$$

範例輸出 1

1
2
2385860290
1374821695

範例輸入 2

1
2
10
0 1 2 3 4 5

範例輸出 2

1
2
617438354
1897844131

編譯參數

1
$ nvcc -Xcompiler "-O2 -fopenmp" main.cu -o main

Solution

為了加快計算,一個 block 最多有 1024 個 thread 運行,因為牽涉到 warp scheduling/size,又要充分利用每一個 core 的計算能力,根據實驗結果 block size 盡可能大,且又不超過 register 個數為佳,這時候效能就會是最好的。這一點與 OpenCL 不同,當 OpenCL 偵測到填入的 block size 為 NULL 時,他會自動調整到最好的大小,而在 CUDA 就要使用者自己設定才行,這導致有些人反而不會去管大小。

根據上述所講,當然直接貪心找最大值填入即可。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#include <stdio.h>
#include <assert.h>
#include <inttypes.h>
#include <string.h>
#include <cuda.h>

#define MAXN 1024
#define GPULOCAL 64
#define UNLOOP 8
#define CheckErr(status) { gpuAssert((status), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, int abort=true) {
if (code != cudaSuccess) {
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
uint32_t hostMtx[6][MAXN*MAXN];
uint32_t hostMid[2][MAXN*MAXN];
int N = MAXN, M;

__global__ void matrixMul(uint32_t A[], uint32_t B[], uint32_t C[], int N) {
int r = blockIdx.x * blockDim.x + threadIdx.x;
int x = r / N, y = r % N;
uint32_t sum = 0;
for (int i = 0; i < N; i++)
sum += A[x*N + i] * B[i*N + y];
C[x * N + y] = sum;
}
__global__ void matrixAdd(uint32_t A[], uint32_t B[], uint32_t C[]) {
int r = blockIdx.x * blockDim.x + threadIdx.x;
C[r] = A[r] + B[r];
}
void readIn() {
uint32_t S[64];
assert(scanf("%d", &N) == 1);
M = 6;
for (int i = 0; i < M; i++)
assert(scanf("%d", &S[i]) == 1);
#pragma omp parallel for
for (int p = 0; p < M; p++) {
uint32_t x = 2, n = N*N, c = S[p];
x = 2;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
x = (x * x + c + i + j)%n;
hostMtx[p][i*N+j] = x;
}
}
}
}
uint32_t writeOut(uint32_t *hostC) {
uint32_t h = 0;
uint32_t *Cend = hostC + N*N, *C = hostC;
for (; C != Cend; C++)
h = (h + *C) * 2654435761LU;
return h;
}
void matrix_multiply(uint32_t *cuMtxA, uint32_t *cuMtxB, uint32_t *cuMtxC) {
int localSz = 1;
for (int i = 1; i <= 1024; i++) {
if (N*N % i == 0)
localSz = i;
}
dim3 cuBlock(localSz);
dim3 cuGrid(N*N/localSz);
matrixMul<<<cuGrid, cuBlock>>>(cuMtxA, cuMtxB, cuMtxC, N);
CheckErr(cudaGetLastError());
}
void matrix_add(uint32_t *cuMtxA, uint32_t *cuMtxB, uint32_t *cuMtxC) {
int localSz = 1;
for (int i = 1; i <= 1024; i++) {
if (N*N % i == 0)
localSz = i;
}
dim3 cuBlock(localSz);
dim3 cuGrid(N*N/localSz);
matrixAdd<<<cuGrid, cuBlock>>>(cuMtxA, cuMtxB, cuMtxC);
CheckErr(cudaGetLastError());

}
int main(int argc, char *argv[]) {
readIn();
uint32_t *cuMtx[6], *cuMtxTmp[6];
uint32_t memSz = N*N*sizeof(uint32_t);
for (int i = 0; i < 6; i++) {
cudaMalloc((void **) &cuMtx[i], memSz);
cudaMemcpy(cuMtx[i], hostMtx[i], memSz, cudaMemcpyHostToDevice);
CheckErr(cudaGetLastError());
}
for (int i = 0; i < 6; i++)
cudaMalloc((void **) &cuMtxTmp[i], memSz);
// cuMtxTmp[0] = AB
matrix_multiply(cuMtx[0], cuMtx[1], cuMtxTmp[0]);
// cuMtxTmp[1] = CD
matrix_multiply(cuMtx[2], cuMtx[3], cuMtxTmp[1]);
// cuMtxTmp[2] = ABE
matrix_multiply(cuMtxTmp[0], cuMtx[4], cuMtxTmp[2]);
// cuMtxTmp[3] = CDF
matrix_multiply(cuMtxTmp[1], cuMtx[5], cuMtxTmp[3]);
// cuMtxTmp[4] = AB + CD
matrix_add(cuMtxTmp[0], cuMtxTmp[1], cuMtxTmp[4]);
// cuMtxTmp[5] = ABE+CDF
matrix_add(cuMtxTmp[2], cuMtxTmp[3], cuMtxTmp[5]);

cudaMemcpy(hostMid[0], cuMtxTmp[4], memSz, cudaMemcpyDeviceToHost);
cudaMemcpy(hostMid[1], cuMtxTmp[5], memSz, cudaMemcpyDeviceToHost);

uint32_t ret[2];
#pragma omp parallel for
for (int i = 0; i < 2; i++) {
ret[i] = writeOut(hostMid[i]);
}
for (int i = 0; i < 2; i++)
printf("%u\n", ret[i]);
for (int i = 0; i < 6; i++)
cudaFree(cuMtx[i]);
for (int i = 0; i < 6; i++)
cudaFree(cuMtxTmp[i]);
return 0;
}