批改娘 10109. Sorting (CUDA)

Problem Description

Speed up the following program.

#include <bits/stdc++.h>
using namespace std;
#define MAXN 16777216
uint32_t A[MAXN];
inline uint32_t encrypt(uint32_t m, uint32_t key) {
return (m*m + key)%key;
}
int main() {
int N, K;
while (scanf("%d %d", &N, &K) == 2) {
assert((N & -N) == N); // note: == binds tighter than &, so N&(-N) == N would misparse
for (int i = 0; i < N; i++)
A[i] = encrypt(i, K);
sort(A, A+N);
uint32_t sum = 0;
#pragma omp parallel for reduction(+:sum)
for (int i = 0; i < N; i++)
sum += A[i] * i;
printf("%u\n", sum);
}
return 0;
}

Input Format

There are multiple test cases. The first line of each has two integers $N, \; K$: sort the $N$ integers generated with random seed $K$.

  • $N = 2^i, \; 0 \le i \le 24$
  • $1 \le K \le N$

Output Format

Output one line containing $\sum i \cdot A_i$.

Sample Input

8 8
8 7

Sample Output

66
75
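
As a quick sanity check on the first sample ($N = 8$, $K = 8$): $\mathit{encrypt}(i, 8) = (i^2 + 8) \bmod 8$ yields $0, 1, 4, 1, 0, 1, 4, 1$; after sorting, $A = (0, 0, 1, 1, 1, 1, 4, 4)$, so $\sum i \cdot A_i = 0 + 0 + 2 + 3 + 4 + 5 + 24 + 28 = 66$, matching the first output line.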

Solution

bitonic sort

Implement bitonic sort. The simplest version does everything in global memory, but in the recursive formula, once $n$ is small enough the remaining substages can be done in shared memory, and then the performance is very good.
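
For reference, here is a compact CPU sketch of the same $(p, q)$ compare-exchange schedule that both kernels below implement (power-of-two $n$ only; my own illustration, mirroring the kernels' indexing):

// Stage p, substage q, partner distance d = 2^(p-q);
// bit (p+1) of the index decides the sort direction of its block.
void bitonic_sort_cpu(uint32_t *A, int n) {
    int logn = 0;
    while ((1 << logn) < n) logn++;
    for (int p = 0; p < logn; p++) {
        for (int q = 0; q <= p; q++) {
            int d = 1 << (p - q);
            for (int i = 0; i < n; i++) {
                int up = ((i >> p) & 2) == 0;
                if ((i & d) == 0 && (A[i] > A[i | d]) == up) {
                    uint32_t t = A[i]; A[i] = A[i | d]; A[i | d] = t;
                }
            }
        }
    }
}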

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <algorithm>
#include <assert.h>
#include <omp.h>
using namespace std;
#define MAXN 16777216
#define MAXBLK 1024
#define CheckErr(status) { gpuAssert((status), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, int abort=true) {
if (code != cudaSuccess) {
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
uint32_t A[MAXN];
__device__ inline uint32_t encrypt(uint32_t m, uint32_t key) {
return (m*m + key)%key;
}
__device__ inline void swap(uint32_t &x, uint32_t &y) {
uint32_t t = x; x = y; y = t;
}
__global__ void bitonic_step(uint32_t *A, int p, int q) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
int up = ((i >> p)&2) == 0;
int d = 1 << (p - q);
if ((i & d) == 0 && (A[i] > A[i|d]) == up)
swap(A[i], A[i|d]);
}
__global__ void bitonic_step_fit(uint32_t *A, int p, int q) {
extern __shared__ uint32_t buff[];
int i = blockIdx.x * blockDim.x + threadIdx.x;
int up = ((i >> p)&2) == 0;
buff[threadIdx.x] = A[i];
__syncthreads();
for (; q <= p; q++) {
int d = 1 << (p - q);
if ((i & d) == 0 && (buff[threadIdx.x] > buff[threadIdx.x|d]) == up)
swap(buff[threadIdx.x], buff[threadIdx.x|d]);
__syncthreads();
}
A[i] = buff[threadIdx.x];
}
__global__ void sum_arr(uint32_t *A, uint32_t *B, int N) {
extern __shared__ uint32_t buff[];
int i = blockIdx.x * blockDim.x + threadIdx.x;
buff[threadIdx.x] = A[i] * i;
__syncthreads();
for (int i = blockDim.x>>1; i; i >>= 1) {
if (threadIdx.x < i)
buff[threadIdx.x] += buff[threadIdx.x + i];
__syncthreads();
}
if (threadIdx.x == 0)
B[blockIdx.x] = buff[0];
}
__global__ void rand_gen(uint32_t *A, int N, int K) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
A[i] = encrypt(i, K);
}
void output(int N, uint32_t *cuA, uint32_t *cuB) {
dim3 cuBlock(min(MAXBLK, N));
dim3 cuGrid(N / min(MAXBLK, N));
CheckErr(cudaGetLastError());
sum_arr<<<cuGrid, cuBlock, sizeof(uint32_t)*min(MAXBLK, N)>>>(cuA, cuB, N);
CheckErr(cudaGetLastError());
cudaMemcpy(A, cuB, sizeof(uint32_t) * N / min(MAXBLK, N), cudaMemcpyDeviceToHost);
CheckErr(cudaGetLastError());
uint32_t sum = 0;
for (int i = 0; i < N / min(MAXBLK, N); i++)
sum += A[i];
printf("%u\n", sum);
}
void sort_test(int N, int K, uint32_t *cuA, uint32_t *cuB) {
assert((N&-N) == N);
dim3 cuBlock(min(MAXBLK, N));
dim3 cuGrid(N / min(MAXBLK, N));
rand_gen<<<cuGrid, cuBlock>>>(cuA, N, K);
CheckErr(cudaGetLastError());
int logN = 1;
while ((1 << logN) < N) logN++;
for (int i = 0; i < logN; i++) {
for (int j = 0; j <= i; j++) {
if ((1 << (i - j) >= MAXBLK)) {
bitonic_step<<<cuGrid, cuBlock>>>(cuA, i, j);
CheckErr(cudaGetLastError());
} else {
bitonic_step_fit<<<cuGrid, cuBlock, sizeof(uint32_t)*(MAXBLK)>>>(cuA, i, j);
CheckErr(cudaGetLastError());
break;
}
}
}
output(N, cuA, cuB);
}
int main() {
uint32_t *cuA, *cuB;
cudaMalloc((void **) &cuA, sizeof(uint32_t) * MAXN);
cudaMalloc((void **) &cuB, sizeof(uint32_t) * MAXN / MAXBLK);
CheckErr(cudaGetLastError());
int N, K;
while(scanf("%d %d", &N, &K) == 2) {
sort_test(N, K, cuA, cuB);
}
cudaFree(cuA);
cudaFree(cuB);
return 0;
}

Built-in thrust

Of course, such a common routine has built-in support: the thrust library provides the relevant functions. The very first call is noticeably slow, though, possibly because the library has to be loaded into cache.
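
One possible workaround sketch (my own assumption about the cause, not part of the judge code): pay that one-time initialization cost up front with a tiny dummy sort before handling real queries.

// Hypothetical warm-up: sorting a single element triggers thrust's lazy setup.
uint32_t *cuTmp;
cudaMalloc((void **) &cuTmp, sizeof(uint32_t));
thrust::device_ptr<uint32_t> tmpPtr(cuTmp);
thrust::sort(tmpPtr, tmpPtr + 1);
cudaFree(cuTmp);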

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <algorithm>
#include <vector>
#include <assert.h>
#include <omp.h>
#include <thrust/sort.h>
#include <thrust/device_ptr.h>
using namespace std;
#define MAXN 16777216
#define MAXBLK 1024
#define CheckErr(status) { gpuAssert((status), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, int abort=true) {
if (code != cudaSuccess) {
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
uint32_t A[MAXN];
__device__ inline uint32_t encrypt(uint32_t m, uint32_t key) {
return (m*m + key)%key;
}
__device__ inline void swap(uint32_t &x, uint32_t &y) {
uint32_t t = x; x = y; y = t;
}
__global__ void sum_arr(uint32_t *A, uint32_t *B, int N) {
extern __shared__ uint32_t buff[];
int i = blockIdx.x * blockDim.x + threadIdx.x;
buff[threadIdx.x] = A[i] * i;
__syncthreads();
for (int i = blockDim.x>>1; i; i >>= 1) {
if (threadIdx.x < i)
buff[threadIdx.x] += buff[threadIdx.x + i];
__syncthreads();
}
if (threadIdx.x == 0)
B[blockIdx.x] = buff[0];
}
__global__ void rand_gen(uint32_t *A, int N, int K) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
A[i] = encrypt(i, K);
}
void output(int N, uint32_t *cuA, uint32_t *cuB) {
dim3 cuBlock(min(MAXBLK, N));
dim3 cuGrid(N / min(MAXBLK, N));
CheckErr(cudaGetLastError());
sum_arr<<<cuGrid, cuBlock, sizeof(uint32_t)*min(MAXBLK, N)>>>(cuA, cuB, N);
CheckErr(cudaGetLastError());
cudaMemcpy(A, cuB, sizeof(uint32_t) * N / min(MAXBLK, N), cudaMemcpyDeviceToHost);
CheckErr(cudaGetLastError());
uint32_t sum = 0;
for (int i = 0; i < N / min(MAXBLK, N); i++)
sum += A[i];
printf("%u\n", sum);
}
void cpu_compute(int N, int K) {
vector<int> A;
for (int i = 0; i < N; i++)
A.push_back((i*i + K)%K);
sort(A.begin(), A.end());
uint32_t sum = 0;
for (int i = 0; i < A.size(); i++)
sum += A[i] * i;
printf("%u\n", sum);
return ;
}
void sort_test(int N, int K, uint32_t *cuA, uint32_t *cuB) {
assert((N&-N) == N);
if (N < MAXBLK) {
cpu_compute(N, K);
return ;
}
dim3 cuBlock(min(MAXBLK, N));
dim3 cuGrid(N / min(MAXBLK, N));
rand_gen<<<cuGrid, cuBlock>>>(cuA, N, K);
CheckErr(cudaGetLastError());
thrust::device_ptr<uint32_t> cuAptr(cuA);
thrust::sort(cuAptr, cuAptr+N);
output(N, cuA, cuB);
}
int main() {
uint32_t *cuA, *cuB;
cudaMalloc((void **) &cuA, sizeof(uint32_t) * MAXN);
cudaMalloc((void **) &cuB, sizeof(uint32_t) * MAXN / MAXBLK);
CheckErr(cudaGetLastError());
int N, K;
while(scanf("%d %d", &N, &K) == 2)
sort_test(N, K, cuA, cuB);
cudaFree(cuA);
cudaFree(cuB);
return 0;
}

批改娘 10108. Streams and Concurrency II (CUDA)

Problem Description

Demo

Specification

  • Accepted criterion: whether a single Device executes two or more kernels at the same time.
  • Currently only an older GPU is available for judging; make sure your program runs on the third GPU, i.e.
int device = 2;
cudaSetDevice(device);

Solution

This problem asks you to make several kernel functions run concurrently. The cores in use are not always all busy at once; when spare cores remain, the next kernel function can be brought in to run alongside, and performance can improve substantially.

Any quick test will do. While designing this problem, however, I found that the newer GTX 980Ti is not supported: even the CUDA environment variables reveal nothing. In the end the analysis could only be done on an older GPU with the old nvprof; even the newest nvvp still cannot profile runs on the GTX 980Ti device.

#include <stdio.h>
#include <assert.h>
#include <stdint.h>
#include <cuda.h>
#include <omp.h>
#define MAXN (64)
#define nStreams 4
__global__ void test_global1(uint32_t IN[], int m, uint32_t OUT[]) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
uint32_t sum = 0;
int LOCALIT = m * 500000;
for (int i = 0; i < LOCALIT; i++) {
sum += IN[x];
}
OUT[x] = sum;
}
uint32_t hostIn[MAXN], hostOut[MAXN];
#define CheckCuda(status) { gpuAssert((status), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, int abort=true) {
if (code != cudaSuccess) {
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
cudaStream_t stream[nStreams];
void pipelTest(uint32_t *cuIn[], uint32_t *cuOut[], int n, int m[]) {
dim3 cuBlock(1);
dim3 cuGrid(n / 1);
test_global1<<<cuGrid, cuBlock, 0, stream[0]>>>(cuIn[0], m[0], cuOut[0]);
test_global1<<<cuGrid, cuBlock, 0, stream[1]>>>(cuIn[1], m[1], cuOut[1]);
test_global1<<<cuGrid, cuBlock, 0, stream[2]>>>(cuIn[2], m[2], cuOut[2]);
test_global1<<<cuGrid, cuBlock, 0, stream[3]>>>(cuIn[3], m[3], cuOut[3]);
}
int main() {
int device = 2;
cudaSetDevice(device);
// Find device clock rate to calculate number of cycles (for 10ms)
cudaDeviceProp deviceProp;
cudaGetDevice(&device);
cudaGetDeviceProperties(&deviceProp, device);
int clockRate = deviceProp.clockRate;
printf("Device clock rate: %.3f GHz\n", (float)clockRate/1000000);
// Check card supports concurrency
if (deviceProp.concurrentKernels == 0) {
printf("GPU does not support concurrent kernel execution\n");
printf("CUDA kernel runs will be serialised\n");
}
//
srand(time(NULL));
uint32_t *cuIn[nStreams];
uint32_t *cuOut[nStreams];
for (int i = 0; i < nStreams; i++) {
CheckCuda(cudaStreamCreate(&stream[i]));
CheckCuda(cudaMalloc((void **)&cuIn[i], MAXN*sizeof(uint32_t)));
CheckCuda(cudaMalloc((void **)&cuOut[i], MAXN*sizeof(uint32_t)));
for (int j = 0; j < MAXN; j++)
hostIn[j] = rand();
cudaMemcpy(cuIn[i], hostIn, MAXN*sizeof(uint32_t), cudaMemcpyHostToDevice);
}
int m[] = {1, 2, 4, 8};
for (int i = 0; i < 5; i++) {
pipelTest(cuIn, cuOut, MAXN, m);
CheckCuda(cudaThreadSynchronize());
}
CheckCuda(cudaDeviceSynchronize());
for (int i = 0; i < nStreams; i++) {
cudaMemcpy(hostOut, cuOut[i], MAXN*sizeof(uint32_t), cudaMemcpyDeviceToHost);
uint32_t sum = 0;
for (int j = 0; j < MAXN; j++)
sum += hostOut[j];
printf("%u\n", sum);
}
for (int i = 0; i < nStreams; i++) {
cudaFree(cuIn[i]);
cudaFree(cuOut[i]); // also release the output buffers and streams
cudaStreamDestroy(stream[i]);
}
return 0;
}

批改娘 10107. Sparse Matrix Multiplication (CUDA)

Problem Description

A sparse matrix is one in which most elements are zero; large sparse matrices arise frequently when solving linear models in science and engineering. Given two matrices in the most common Coordinate Format (COO for short), compute their product.

Given matrices $A_{n, m}$ and $B_{m, r}$, compute the sparse matrix product.

$$A_{4,4} = \begin{bmatrix} 0 & 0 & 0 & 0 \\ 5 & 8 & 0 & 0 \\ 0 & 0 & 3 & 0 \\ 0 & 6 & 0 & 0 \\ \end{bmatrix}, \qquad B_{4,4} = \begin{bmatrix} 0 & 0 & 1 & 3 \\ 0 & 5 & 2 & 0 \\ 3 & 5 & 0 & 0 \\ 0 & 2 & 0 & 0 \\ \end{bmatrix}$$

Converting every nonzero element of matrices $A$ and $B$ to COO format gives the tables below.

COO of Matrix $A$

row_index col_index value
1 0 5
1 1 8
2 2 3
3 1 6

COO of Matrix $B$

row_index col_index value
0 2 1
0 3 3
1 1 5
1 2 2
2 0 3
2 1 5
3 1 2

Input Format

There is a single test case. The first line has three integers $N, \; M, \; R$, giving the dimensions of matrices $A$ and $B$. The next line has two integers $N_A, \; N_B$. Then follow $N_A$ lines, the COO entries of matrix $A$, and after them $N_B$ lines, the COO entries of matrix $B$.

  • $0 < N, \; M, \; R \le 10^6$
  • $0 < N_A, \; N_B \le 10^6$

COO entries are given sorted by row_index ascending, ties broken by col_index ascending; no duplicates occur, and every element value $v$ lies between $1$ and $2^{31}-1$.

Output Format

Output the hash value of $C_{n, r} = A_{n, m} \times B_{m, r}$.

Define $\mathit{hash}(C_{n, r}) = \sum\nolimits_{e_{i, j} \in C \text{ and } e_{i, j} \neq 0} \mathit{encrypt}((i+1)*(j+1), e_{i, j})$. The actual procedure can be seen in the code below; of course, you cannot actually allocate $N \times M$ space yourself:

static inline uint32_t rotate_left(uint32_t x, uint32_t n) {
return (x << n) | (x >> (32-n));
}
static inline uint32_t encrypt(uint32_t m, uint32_t key) {
return (rotate_left(m, key&31) + key)^key;
}
#define MAXN 1024
uint32_t A[MAXN][MAXN], B[MAXN][MAXN];
int main() {
int x, y, v;
int N, M, R, nA, nB;
scanf("%d %d %d", &N, &M, &R);
scanf("%d %d", &nA, &nB);
for (int i = 0; i < nA; i++)
scanf("%d %d %d", &x, &y, &v), A[x][y] = v;
for (int i = 0; i < nB; i++)
scanf("%d %d %d", &x, &y, &v), B[x][y] = v;
uint32_t hash = 0;
for (int i = 0; i < N; i++) {
for (int j = 0; j < R; j++) {
uint32_t sum = 0;
for (int k = 0; k < M; k++)
sum += A[i][k] * B[k][j];
if (sum)
hash += encrypt((i+1)*(j+1), sum);
}
}
printf("%u\n", hash);
return 0;
}

Sample Input

4 4 4
4 7
1 0 5
1 1 8
2 2 3
3 1 6
0 2 1
0 3 3
1 1 5
1 2 2
2 0 3
2 1 5
3 1 2

Sample Output

13093438

Sample Explanation

$$A_{n, m} \times B_{m, r} = C_{4,4}=\begin{bmatrix} 0 & 0 & 0 & 0 \\ 0 & 40 & 21 & 15 \\ 9 & 15 & 0 & 0 \\ 0 & 30 & 12 & 0 \\ \end{bmatrix}$$

Solution

Transposed version

Transpose matrix $B$, sort it, and arrange it back into COO format; then launch enough threads that each thread handles all the values of one row, accumulating the answer with a merge-sort-like pass. This version is not very efficient: too many branches and an uneven data distribution make the load balance very poor, and on top of that there is a pile of global memory accesses.
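
The core of each thread's work is a merge-style dot product over two column-sorted index lists, the same loop that appears inside the kernel below (a stripped-down sketch with illustrative names):

// One row of A and one row of B^T, both sorted by column index:
// advance the smaller side; multiply-accumulate on a match.
uint32_t sum = 0;
while (ia < enda && ib < endb) {
    if (Aj[ia] < Bj[ib]) ia++;
    else if (Aj[ia] > Bj[ib]) ib++;
    else sum += Av[ia++] * Bv[ib++];
}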

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <algorithm>
using namespace std;
#define MAXN 1048576
#define MAXL (MAXN<<2)
#define CheckErr(status) { gpuAssert((status), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, int abort=true) {
if (code != cudaSuccess) {
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__device__ uint32_t rotate_left(uint32_t x, uint32_t n) {
return (x << n) | (x >> (32-n));
}
__device__ uint32_t encrypt(uint32_t m, uint32_t key) {
return (rotate_left(m, key&31) + key)^key;
}
typedef struct ELE {
int i[MAXL], j[MAXL];
uint32_t v[MAXL];
} ELE;
ELE LA, LB, LT;
int D[MAXL];
__global__ void SpMM(ELE *LA, ELE *LB, ELE *LC, int aoff[], int boff[], int mb) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
uint32_t hash = 0;
for (int j = 0; j < mb; j++) {
int idx1 = aoff[i], idx2 = boff[j];
int top1 = aoff[i+1], top2 = boff[j+1];
uint32_t sum = 0;
int r = LA->i[idx1], c = LB->i[idx2];
while (idx1 < top1 && idx2 < top2) {
if (LA->j[idx1] < LB->j[idx2])
idx1++;
else if (LA->j[idx1] > LB->j[idx2])
idx2++;
else
sum += LA->v[idx1] * LB->v[idx2], idx1++, idx2++;
}
if (sum) {
hash += encrypt((r+1)*(c+1), sum);
}
}
LC->v[i] = hash;
}
void SpMV(int N, int M, int R, ELE &LA, int na, ELE &LB, int nb) {
int ma = 0, mb = 0;
static int aoff[MAXN], boff[MAXN];
for (int i = 0, p = -1; i < na; i++) {
if (LA.i[i] != p)
aoff[ma++] = i, p = LA.i[i];
}
for (int i = 0, p = -1; i < nb; i++) {
if (LB.i[i] != p)
boff[mb++] = i, p = LB.i[i];
}
aoff[ma] = na, boff[mb] = nb;
ELE *cuMA, *cuMB, *cuMC;
int *cuAoff, *cuBoff;
cudaMalloc((void **) &cuMA, sizeof(ELE));
cudaMalloc((void **) &cuMB, sizeof(ELE));
cudaMalloc((void **) &cuMC, sizeof(ELE));
cudaMalloc((void **) &cuAoff, (ma+1)*sizeof(int));
cudaMalloc((void **) &cuBoff, (mb+1)*sizeof(int));
cudaMemcpy(cuMA, &LA, sizeof(ELE), cudaMemcpyHostToDevice);
cudaMemcpy(cuMB, &LB, sizeof(ELE), cudaMemcpyHostToDevice);
cudaMemcpy(cuAoff, aoff, sizeof(int)*(ma+1), cudaMemcpyHostToDevice);
cudaMemcpy(cuBoff, boff, sizeof(int)*(mb+1), cudaMemcpyHostToDevice);
int localSz = 1;
for (int i = 1; i <= 1024; i++) {
if (ma%i == 0) {
localSz = i;
}
}
dim3 cuBlock(localSz);
dim3 cuGrid(ma/localSz);
SpMM<<<cuGrid, cuBlock>>>(cuMA, cuMB, cuMC, cuAoff, cuBoff, mb);
CheckErr(cudaGetLastError());
cudaMemcpy(&LA, cuMC, sizeof(ELE), cudaMemcpyDeviceToHost);
uint32_t hash = 0;
#pragma omp parallel for reduction(+: hash)
for (int i = 0; i < ma; i++)
hash += LA.v[i];
printf("%u\n", hash);
cudaFree(cuMA);
cudaFree(cuMB);
cudaFree(cuMC);
cudaFree(cuAoff);
cudaFree(cuBoff);
}
bool cmp(int a, int b) {
if (LT.i[a] != LT.i[b])
return LT.i[a] < LT.i[b];
return LT.j[a] < LT.j[b];
}
int main() {
int N, M, R, nA, nB;
while (scanf("%d %d %d", &N, &M, &R) == 3) {
scanf("%d %d", &nA, &nB);
for (int i = 0; i < nA; i++)
scanf("%d %d %d", &LA.i[i], &LA.j[i], &LA.v[i]);
for (int i = 0; i < nB; i++)
scanf("%d %d %d", &LT.j[i], &LT.i[i], &LT.v[i]);
for (int i = 0; i < nB; i++)
D[i] = i;
sort(D, D+nB, cmp);
for (int i = 0; i < nB; i++)
LB.i[i] = LT.i[D[i]], LB.j[i] = LT.j[D[i]], LB.v[i] = LT.v[D[i]];
SpMV(N, M, R, LA, nA, LB, nB);
}
return 0;
}

Extra space

Each thread computes $C_{i, [L, R]}$, using extra space for the mapping; declaring that space in private memory, i.e. in registers, speeds things up. This approach is three to four times faster than the one above.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <algorithm>
#include <omp.h>
using namespace std;
#define MAXN 1000005
#define MAXINT 256
#define MAXBLK 1024
#define MAXL (MAXN)
#define CheckErr(status) { gpuAssert((status), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, int abort=true) {
if (code != cudaSuccess) {
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
typedef struct ELE {
int i[MAXL], j[MAXL];
uint32_t v[MAXL];
} ELE;
typedef struct AUX {
int aoff[MAXL], boff[MAXL], bmap[MAXL];
int ma, mb, na, nb;
} AUX;
ELE LA, LB;
AUX AU;
uint32_t H[MAXL];
__device__ inline uint32_t rotate_left(uint32_t x, uint32_t n) {
return (x << n) | (x >> (32-n));
}
__device__ inline uint32_t encrypt(uint32_t m, uint32_t key) {
return (rotate_left(m, key&31) + key)^key;
}
__global__ void SpMM(ELE *LA, ELE *LB, uint32_t *H, AUX *AU, int T) {
#define INTSZ MAXINT
__shared__ uint32_t buff[MAXBLK];
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int x = idx / T, y = idx % T;
int L = y * INTSZ;
int R = L + INTSZ;
uint32_t tmp[INTSZ];
for (int i = 0; i < INTSZ; i++)
tmp[i] = 0;
int lx = AU->aoff[x], rx = AU->aoff[x+1];
for (int i = lx; i < rx; i++) {
if (AU->bmap[LA->j[i]] != -1) {
int k = AU->bmap[LA->j[i]];
uint32_t val = LA->v[i];
int ly = AU->boff[k], ry = AU->boff[k+1];
for (int j = ly; j < ry; j++) {
if (L <= LB->j[j] && LB->j[j] < R)
tmp[LB->j[j] - L] += val * LB->v[j];
}
}
}
uint32_t hash = 0;
uint32_t X = LA->i[AU->aoff[x]];
for (int i = 0; i < INTSZ; i++) {
if (tmp[i])
hash += encrypt((X+1)*(i+L+1), tmp[i]);
}
buff[threadIdx.x] = hash;
__syncthreads();
for (int i = blockDim.x>>1; i; i >>= 1) {
if (threadIdx.x < i)
buff[threadIdx.x] += buff[threadIdx.x + i];
__syncthreads();
}
if (threadIdx.x == 0)
H[blockIdx.x] = buff[0];
#undef INTSZ
}
void SpMV(int N, int M, int R, ELE &LA, int na, ELE &LB, int nb) {
AU.ma = 0, AU.mb = 0;
AU.na = na, AU.nb = nb;
memset(AU.bmap, -1, sizeof(AU.bmap));
#pragma omp parallel sections
{
#pragma omp section
{
for (int i = 0, p = -1; i < na; i++) {
if (LA.i[i] != p)
AU.aoff[AU.ma++] = i, p = LA.i[i];
}
AU.aoff[AU.ma] = na;
}
#pragma omp section
{
for (int i = 0, p = -1; i < nb; i++) {
if (LB.i[i] != p) {
AU.bmap[LB.i[i]] = AU.mb;
AU.boff[AU.mb++] = i, p = LB.i[i];
}
}
AU.boff[AU.mb] = nb;
}
}
uint32_t *cuHH;
ELE *cuMA, *cuMB;
AUX *cuAU;
cudaMalloc((void **) &cuHH, sizeof(H));
cudaMalloc((void **) &cuMA, sizeof(ELE));
cudaMalloc((void **) &cuMB, sizeof(ELE));
cudaMalloc((void **) &cuAU, sizeof(AUX));
cudaMemcpy(cuHH, &H, sizeof(H), cudaMemcpyHostToDevice);
cudaMemcpy(cuMA, &LA, sizeof(ELE), cudaMemcpyHostToDevice);
cudaMemcpy(cuMB, &LB, sizeof(ELE), cudaMemcpyHostToDevice);
cudaMemcpy(cuAU, &AU, sizeof(AUX), cudaMemcpyHostToDevice);
int roundR = (R + MAXINT-1) / MAXINT * MAXINT;
int TOT = N * roundR;
while (TOT / MAXINT % MAXBLK)
TOT ++;
dim3 cuBlock(MAXBLK); // MAXTHREAD
dim3 cuGrid(TOT / MAXINT / MAXBLK);
// printf("%d\n", TOT/MAXINT);
SpMM<<<cuGrid, cuBlock>>>(cuMA, cuMB, cuHH, cuAU, roundR / MAXINT);
CheckErr(cudaGetLastError());
cudaMemcpy(H, cuHH, sizeof(H), cudaMemcpyDeviceToHost);
uint32_t hash = 0;
#ifdef _OPENMP
omp_set_num_threads(4);
#endif
#pragma omp parallel for reduction(+: hash)
for (int i = 0; i < TOT/MAXINT/MAXBLK; i++)
hash += H[i];
printf("%u\n", hash);
cudaFree(cuMA);
cudaFree(cuMB);
cudaFree(cuHH);
cudaFree(cuAU);
}
inline int readchar() {
const int N = 1048576;
static char buf[N];
static char *p = buf, *end = buf;
if(p == end) {
if((end = buf + fread(buf, 1, N, stdin)) == buf) return EOF;
p = buf;
}
return *p++;
}
inline int ReadInt(int *x) {
static char c, neg;
while((c = readchar()) < '-') {if(c == EOF) return 0;}
neg = (c == '-') ? -1 : 1;
*x = (neg == 1) ? c-'0' : 0;
while((c = readchar()) >= '0')
*x = (*x << 3) + (*x << 1) + c-'0';
*x *= neg;
return 1;
}
inline uint32_t ReadUint(uint32_t *x) {
static char c, neg;
while((c = readchar()) < '-') {if(c == EOF) return 0;}
neg = (c == '-') ? -1 : 1;
*x = (neg == 1) ? c-'0' : 0;
while((c = readchar()) >= '0')
*x = (*x << 3) + (*x << 1) + c-'0';
*x *= neg;
return 1;
}
int main() {
int N, M, R, nA, nB;
// scanf("%d %d %d", &N, &M, &R);
// scanf("%d %d", &nA, &nB);
ReadInt(&N), ReadInt(&M), ReadInt(&R);
ReadInt(&nA), ReadInt(&nB);
for (int i = 0; i < nA; i++)
ReadInt(&LA.i[i]), ReadInt(&LA.j[i]), ReadUint(&LA.v[i]);
// scanf("%d%d%d", &LA.i[i], &LA.j[i], &LA.v[i]);
for (int i = 0; i < nB; i++)
ReadInt(&LB.i[i]), ReadInt(&LB.j[i]), ReadUint(&LB.v[i]);
// scanf("%d%d%d", &LB.i[i], &LB.j[i], &LB.v[i]);
SpMV(N, M, R, LA, nA, LB, nB);
// }
return 0;
}

批改娘 10106. Multiple Device (CUDA)

Problem Description

Xiao Ming's math homework is to compute square matrices. Please help him!

The problem gives several $N \times N$ matrices and $2$ subproblems.

  • $X = AB+CD$
  • $Y = ABE+CDF$

sequence.c

#include <stdio.h>
#include <stdint.h>
// #define DEBUG
#define UINT uint32_t
#define MAXN 1024
void multiply(int N, UINT A[][MAXN], UINT B[][MAXN], UINT C[][MAXN]) {
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
UINT sum = 0; // overflow, let it go.
for (int k = 0; k < N; k++)
sum += A[i][k] * B[k][j];
C[i][j] = sum;
}
}
}
void add(int N, UINT A[][MAXN], UINT B[][MAXN], UINT C[][MAXN]) {
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++)
C[i][j] = A[i][j] + B[i][j];
}
}
void rand_gen(UINT c, int N, UINT A[][MAXN]) {
UINT x = 2, n = N*N;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
x = (x * x + c + i + j)%n;
A[i][j] = x;
}
}
}
void print_matrix(int N, UINT A[][MAXN]) {
for (int i = 0; i < N; i++) {
fprintf(stderr, "[");
for (int j = 0; j < N; j++)
fprintf(stderr, " %u", A[i][j]);
fprintf(stderr, " ]\n");
}
}
UINT signature(int N, UINT A[][MAXN]) {
UINT h = 0;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++)
h = (h + A[i][j]) * 2654435761LU;
}
return h;
}
UINT IN[6][MAXN][MAXN], TMP[6][MAXN][MAXN];
int main() {
int N, S[6];
scanf("%d", &N);
for (int i = 0; i < 6; i++) {
scanf("%d", &S[i]);
rand_gen(S[i], N, IN[i]);
}
// AB
multiply(N, IN[0], IN[1], TMP[0]);
// CD
multiply(N, IN[2], IN[3], TMP[1]);
// AB+CD
add(N, TMP[0], TMP[1], TMP[2]);
printf("%u\n", signature(N, TMP[2]));
// ABE
multiply(N, TMP[0], IN[4], TMP[3]);
// CDF
multiply(N, TMP[1], IN[5], TMP[4]);
// ABE+CDF
add(N, TMP[3], TMP[4], TMP[5]);
printf("%u\n", signature(N, TMP[5]));
return 0;
}

Input Format

There are multiple test cases. The first line of each has one integer $N$: the problem gives $N \times N$ matrices. The second line has $6$ integers, the generation seeds of matrices $A, B, C, D, E, F$.

  • $1 \le N \le 1024$
  • $0 \le S_i \le 2^{31}$

Output Format

Output two lines, the hash values of $X$ and $Y$; see sequence.c for the procedure.

Compilation

$ nvcc -Xcompiler "-O2 -fopenmp" main.cu -o main

Sample Input

2
0 1 2 3 4 5
10
0 1 2 3 4 5

Sample Output

2385860290
1374821695
617438354
1897844131

Solution

Designed together with OpenMP, modeled on the OpenCL version.

#include <stdio.h>
#include <assert.h>
#include <inttypes.h>
#include <string.h>
#include <cuda.h>
#define MAXGPU 3
#define MAXN 1024
#define GPULOCAL 64
#define UNLOOP 8
#define CheckErr(status) { gpuAssert((status), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, int abort=true) {
if (code != cudaSuccess) {
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
uint32_t hostMtx[MAXGPU][6][MAXN*MAXN];
uint32_t hostMid[MAXGPU][2][MAXN*MAXN];
__global__ void matrixMul(uint32_t A[], uint32_t B[], uint32_t C[], int N) {
int r = blockIdx.x * blockDim.x + threadIdx.x;
int x = r / N, y = r % N;
uint32_t sum = 0;
for (int i = 0; i < N; i++)
sum += A[x*N + i] * B[i*N + y];
C[x * N + y] = sum;
}
__global__ void matrixAdd(uint32_t A[], uint32_t B[], uint32_t C[]) {
int r = blockIdx.x * blockDim.x + threadIdx.x;
C[r] = A[r] + B[r];
}
int readIn(uint32_t S[], int *n, int devId) {
int N, M;
if (scanf("%d", &N) != 1)
return 0;
M = 6;
for (int i = 0; i < M; i++)
assert(scanf("%d", &S[i]) == 1);
for (int p = 0; p < M; p++)
#pragma omp task
{
uint32_t x = 2, n = N*N, c = S[p];
x = 2;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
x = (x * x + c + i + j)%n;
hostMtx[devId][p][i*N+j] = x;
}
}
}
#pragma omp taskwait
*n = N;
return 1;
}
uint32_t writeOut(uint32_t *hostC, int N) {
uint32_t h = 0;
uint32_t *Cend = hostC + N*N, *C = hostC;
for (; C != Cend; C++)
h = (h + *C) * 2654435761LU;
return h;
}
void matrix_mul(int N, uint32_t *cuMtxA, uint32_t *cuMtxB, uint32_t *cuMtxC) {
int localSz = 1;
for (int i = 1; i <= 1024; i++) {
if (N*N % i == 0)
localSz = i;
}
dim3 cuBlock(localSz);
dim3 cuGrid(N*N/localSz);
matrixMul<<<cuGrid, cuBlock>>>(cuMtxA, cuMtxB, cuMtxC, N);
CheckErr(cudaGetLastError());
}
void matrix_add(int N, uint32_t *cuMtxA, uint32_t *cuMtxB, uint32_t *cuMtxC) {
int localSz = 1;
for (int i = 1; i <= 1024; i++) {
if (N*N % i == 0)
localSz = i;
}
dim3 cuBlock(localSz);
dim3 cuGrid(N*N/localSz);
matrixAdd<<<cuGrid, cuBlock>>>(cuMtxA, cuMtxB, cuMtxC);
CheckErr(cudaGetLastError());
}
void solver(int devId, int N, uint32_t *cuMtx[], uint32_t *cuMtxTmp[], uint32_t ret[]) {
uint32_t memSz = N*N*sizeof(uint32_t);
for (int i = 0; i < 6; i++) {
cudaMemcpy(cuMtx[i], hostMtx[devId][i], memSz, cudaMemcpyHostToDevice);
CheckErr(cudaGetLastError());
}
// cuMtxTmp[0] = AB
matrix_mul(N, cuMtx[0], cuMtx[1], cuMtxTmp[0]);
// cuMtxTmp[1] = CD
matrix_mul(N, cuMtx[2], cuMtx[3], cuMtxTmp[1]);
// cuMtxTmp[2] = ABE
matrix_mul(N, cuMtxTmp[0], cuMtx[4], cuMtxTmp[2]);
// cuMtxTmp[3] = CDF
matrix_mul(N, cuMtxTmp[1], cuMtx[5], cuMtxTmp[3]);
// cuMtxTmp[4] = AB + CD
matrix_add(N, cuMtxTmp[0], cuMtxTmp[1], cuMtxTmp[4]);
// cuMtxTmp[5] = ABE+CDF
matrix_add(N, cuMtxTmp[2], cuMtxTmp[3], cuMtxTmp[5]);
cudaMemcpy(hostMid[devId][0], cuMtxTmp[4], memSz, cudaMemcpyDeviceToHost);
cudaMemcpy(hostMid[devId][1], cuMtxTmp[5], memSz, cudaMemcpyDeviceToHost);
for (int i = 0; i < 2; i++)
#pragma omp task
{
ret[i] = writeOut(hostMid[devId][i], N);
}
#pragma omp taskwait
}
int main(int argc, char *argv[]) {
uint32_t *cuMtx[MAXGPU][6], *cuMtxTmp[MAXGPU][6];
uint32_t memSz = MAXN*MAXN*sizeof(uint32_t);
for (int p = 0; p < MAXGPU; p++) {
cudaSetDevice(p);
for (int i = 0; i < 6; i++) {
cudaMalloc((void **) &cuMtx[p][i], memSz);
CheckErr(cudaGetLastError());
}
for (int i = 0; i < 6; i++)
cudaMalloc((void **) &cuMtxTmp[p][i], memSz);
}
int inN = 0;
static uint32_t ansQue[32767][2];
#pragma omp parallel sections
{
#pragma omp section
{
cudaSetDevice(0);
while (1) {
int f = 0, N, pid = 0;
uint32_t S[32];
#pragma omp critical
{
f = readIn(S, &N, 0);
pid = inN;
inN += f;
}
if (f == 0)
break;
solver(0, N, cuMtx[0], cuMtxTmp[0], ansQue[pid]);
}
}
#pragma omp section
{
cudaSetDevice(1);
while (1) {
int f = 0, N, pid = 0;
uint32_t S[32];
#pragma omp critical
{
f = readIn(S, &N, 1);
pid = inN;
inN += f;
}
if (f == 0)
break;
solver(1, N, cuMtx[1], cuMtxTmp[1], ansQue[pid]);
}
}
#pragma omp section
{
cudaSetDevice(2);
while (1) {
int f = 0, N, pid = 0;
uint32_t S[32];
#pragma omp critical
{
f = readIn(S, &N, 2);
pid = inN;
inN += f;
}
if (f == 0)
break;
solver(2, N, cuMtx[2], cuMtxTmp[2], ansQue[pid]);
}
}
}
for (int i = 0; i < inN; i++)
printf("%u\n%u\n", ansQue[i][0], ansQue[i][1]);
for (int p = 0; p < MAXGPU; p++) {
cudaSetDevice(p); // free each device's buffers on that device
for (int i = 0; i < 6; i++) {
cudaFree(cuMtx[p][i]);
cudaFree(cuMtxTmp[p][i]);
}
}
return 0;
}

批改娘 10104. Streams and Concurrency (CUDA)

Problem Description

Following Nvidia - Streams and Concurrency, overlap data transfer with kernel execution to achieve a speedup.

  • Any program will do
  • No input or output is specified

Testing Procedure

profiler script

Download nvprof.sh and nvvp.log

Compile and Run

$ chmod +x nvprof.sh
$ nvcc -Xcompiler "-O2 -fopenmp" main.cu -o main
$ ./nvprof.sh ./main
$ cat nvvp.log

Accepted criterion: whether concurrency occurs on a single Device during the run.

Accepted

Wrong Answer

Solution

This problem was set to exercise software-pipeline design: speeding up batch processing by using several streams so that data transfer and computation overlap.

Designing the judge for this problem was quite vexing. It turned out that the log file produced through Nvidia's debug environment variables is enough to analyze whether execution intervals overlap, and the problem was solved. This topic is outside the course scope, of course; I proposed to the professor that we teach it and, as expected, was shot down.

/* Copyright (c) 1993-2015, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <stdio.h>
// Convenience function for checking CUDA runtime API results
// can be wrapped around any runtime API call. No-op in release builds.
inline
cudaError_t checkCuda(cudaError_t result)
{
#if defined(DEBUG) || defined(_DEBUG)
if (result != cudaSuccess) {
fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
assert(result == cudaSuccess);
}
#endif
return result;
}
__global__ void kernel(float *a, int offset)
{
int i = offset + threadIdx.x + blockIdx.x*blockDim.x;
float x = (float)i;
float s = sinf(x);
float c = cosf(x);
a[i] = a[i] + sqrtf(s*s+c*c);
}
float maxError(float *a, int n)
{
float maxE = 0;
for (int i = 0; i < n; i++) {
float error = fabs(a[i]-1.0f);
if (error > maxE) maxE = error;
}
return maxE;
}
int main(int argc, char **argv)
{
const int blockSize = 256, nStreams = 4;
const int n = 4 * 1024 * blockSize * nStreams;
const int streamSize = n / nStreams;
const int streamBytes = streamSize * sizeof(float);
const int bytes = n * sizeof(float);
int devId = 0;
if (argc > 1) devId = atoi(argv[1]);
cudaDeviceProp prop;
checkCuda( cudaGetDeviceProperties(&prop, devId));
printf("Device : %s\n", prop.name);
checkCuda( cudaSetDevice(devId) );
// allocate pinned host memory and device memory
float *a, *d_a;
checkCuda( cudaMallocHost((void**)&a, bytes) ); // host pinned
checkCuda( cudaMalloc((void**)&d_a, bytes) ); // device
float ms; // elapsed time in milliseconds
// create events and streams
cudaEvent_t startEvent, stopEvent, dummyEvent;
cudaStream_t stream[nStreams];
checkCuda( cudaEventCreate(&startEvent) );
checkCuda( cudaEventCreate(&stopEvent) );
checkCuda( cudaEventCreate(&dummyEvent) );
for (int i = 0; i < nStreams; ++i)
checkCuda( cudaStreamCreate(&stream[i]) );
// baseline case - sequential transfer and execute
memset(a, 0, bytes);
checkCuda( cudaEventRecord(startEvent,0) );
checkCuda( cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice) );
kernel<<<n/blockSize, blockSize>>>(d_a, 0);
checkCuda( cudaMemcpy(a, d_a, bytes, cudaMemcpyDeviceToHost) );
checkCuda( cudaEventRecord(stopEvent, 0) );
checkCuda( cudaEventSynchronize(stopEvent) );
checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );
printf("Time for sequential transfer and execute (ms): %f\n", ms);
printf(" max error: %e\n", maxError(a, n));
// asynchronous version 1: loop over {copy, kernel, copy}
memset(a, 0, bytes);
checkCuda( cudaEventRecord(startEvent,0) );
for (int i = 0; i < nStreams; ++i) {
int offset = i * streamSize;
checkCuda( cudaMemcpyAsync(&d_a[offset], &a[offset],
streamBytes, cudaMemcpyHostToDevice,
stream[i]) );
kernel<<<streamSize/blockSize, blockSize, 0, stream[i]>>>(d_a, offset);
checkCuda( cudaMemcpyAsync(&a[offset], &d_a[offset],
streamBytes, cudaMemcpyDeviceToHost,
stream[i]) );
}
checkCuda( cudaEventRecord(stopEvent, 0) );
checkCuda( cudaEventSynchronize(stopEvent) );
checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );
printf("Time for asynchronous V1 transfer and execute (ms): %f\n", ms);
printf(" max error: %e\n", maxError(a, n));
// asynchronous version 2:
// loop over copy, loop over kernel, loop over copy
memset(a, 0, bytes);
checkCuda( cudaEventRecord(startEvent,0) );
for (int i = 0; i < nStreams; ++i)
{
int offset = i * streamSize;
checkCuda( cudaMemcpyAsync(&d_a[offset], &a[offset],
streamBytes, cudaMemcpyHostToDevice,
stream[i]) );
}
for (int i = 0; i < nStreams; ++i)
{
int offset = i * streamSize;
kernel<<<streamSize/blockSize, blockSize, 0, stream[i]>>>(d_a, offset);
}
for (int i = 0; i < nStreams; ++i)
{
int offset = i * streamSize;
checkCuda( cudaMemcpyAsync(&a[offset], &d_a[offset],
streamBytes, cudaMemcpyDeviceToHost,
stream[i]) );
}
checkCuda( cudaEventRecord(stopEvent, 0) );
checkCuda( cudaEventSynchronize(stopEvent) );
checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );
printf("Time for asynchronous V2 transfer and execute (ms): %f\n", ms);
printf(" max error: %e\n", maxError(a, n));
// cleanup
checkCuda( cudaEventDestroy(startEvent) );
checkCuda( cudaEventDestroy(stopEvent) );
checkCuda( cudaEventDestroy(dummyEvent) );
for (int i = 0; i < nStreams; ++i)
checkCuda( cudaStreamDestroy(stream[i]) );
cudaFree(d_a);
cudaFreeHost(a);
return 0;
}

批改娘 10103. Advanced Matrix Calculator (CUDA)

Problem Description

Xiao Ming's math homework is to compute square matrices. Please help him!

The problem gives several $N \times N$ matrices and $Q$ subproblems, each consisting only of additions and multiplications.

sequence.c

#include <stdio.h>
#include <stdint.h>
// #define DEBUG
#define UINT uint32_t
#define MAXN 1024
void multiply(int N, UINT A[][MAXN], UINT B[][MAXN], UINT C[][MAXN]) {
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
UINT sum = 0; // overflow, let it go.
for (int k = 0; k < N; k++)
sum += A[i][k] * B[k][j];
C[i][j] = sum;
}
}
}
void add(int N, UINT A[][MAXN], UINT B[][MAXN], UINT C[][MAXN]) {
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++)
C[i][j] = A[i][j] + B[i][j];
}
}
void rand_gen(UINT c, int N, UINT A[][MAXN]) {
UINT x = 2, n = N*N;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
x = (x * x + c + i + j)%n;
A[i][j] = x;
}
}
}
void print_matrix(int N, UINT A[][MAXN]) {
for (int i = 0; i < N; i++) {
fprintf(stderr, "[");
for (int j = 0; j < N; j++)
fprintf(stderr, " %u", A[i][j]);
fprintf(stderr, " ]\n");
}
}
UINT signature(int N, UINT A[][MAXN]) {
UINT h = 0;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++)
h = (h + A[i][j]) * 2654435761LU;
}
return h;
}
UINT IN[6][MAXN][MAXN], TMP[6][MAXN][MAXN];
int main() {
int N, S[6];
scanf("%d", &N);
for (int i = 0; i < 6; i++) {
scanf("%d", &S[i]);
rand_gen(S[i], N, IN[i]);
}
// AB
multiply(N, IN[0], IN[1], TMP[0]);
// CD
multiply(N, IN[2], IN[3], TMP[1]);
// AB+CD
add(N, TMP[0], TMP[1], TMP[2]);
printf("%u\n", signature(N, TMP[2]));
// ABE
multiply(N, TMP[0], IN[4], TMP[3]);
// CDF
multiply(N, TMP[1], IN[5], TMP[4]);
// ABE+CDF
add(N, TMP[3], TMP[4], TMP[5]);
printf("%u\n", signature(N, TMP[5]));
return 0;
}

Input Format

There is a single test case. The first line has two integers $M, N$: the problem gives $M$ matrices of size $N \times N$. The second line has $M$ integers, where $S_i$ is the generation seed of the $i$-th matrix. Finally there is a line with a single integer $Q$, followed by $Q$ query lines, each containing a string $E$, the matrix expression to evaluate; $E$ contains only A-Z and +.

  • $1 \le M \le 26$
  • $1 \le N \le 1024$
  • $0 \le S_i \le 2^{31}$
  • $1 \le Q \le 100$
  • $|E| \le 26$

Output Format

Output one line for each query.

Sample Input 1

6 2
0 1 2 3 4 5
2
AB+CD
ABE+CDF

Sample Output 1

2385860290
1374821695

Compilation

$ nvcc -Xcompiler "-O2 -fopenmp" main.cu -o main
$ ./main

Solution

Similar to the OpenCL approach: three devices cooperate, and each expression is handed to one device, so the goal is to distribute the expressions so that the longest completion time is minimized. The handling is much the same; after tuning, it runs somewhat faster than the OpenCL version.

Since CUDA selects its compute device through cudaSetDevice(p);, this global function presumably stores its setting in a variable declared with the __thread keyword. Creating three threads with OpenMP and having each set its own device therefore causes no interference: a __thread variable is independent in each thread.
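
A minimal sketch of that pattern (assuming three visible devices, as on the judge):

#include <cuda.h>
#include <omp.h>
int main() {
    #pragma omp parallel num_threads(3)
    {
        // cudaSetDevice binds per host thread, so each OpenMP thread
        // can own one device without disturbing the others.
        cudaSetDevice(omp_get_thread_num());
        // ... launch this thread's share of the expressions here ...
    }
    return 0;
}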

#include <stdio.h>
#include <assert.h>
#include <inttypes.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <cuda.h>
#include <omp.h>
#define MAXGPU 3
#define MAXN 1024
#define MAXM 32
#define MAXMID 32
#define CheckErr(status) { gpuAssert((status), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, int abort=true) {
if (code != cudaSuccess) {
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
uint32_t hostMtx[MAXM][MAXN*MAXN];
uint32_t hostMid[MAXGPU][MAXN*MAXN];
int N = MAXN, M, Q;
int clNeedDevCnt = 3;
__global__ void matrixMul(uint32_t A[], uint32_t B[], uint32_t C[], int N) {
int r = blockIdx.x * blockDim.x + threadIdx.x;
int x = r / N, y = r % N;
uint32_t sum = 0;
for (int i = 0; i < N; i++)
sum += A[x*N + i] * B[i*N + y];
C[x * N + y] = sum;
}
__global__ void matrixAdd(uint32_t A[], uint32_t B[], uint32_t C[]) {
int r = blockIdx.x * blockDim.x + threadIdx.x;
C[r] = A[r] + B[r];
}
void matrix_multiply(uint32_t *cuMtxA, uint32_t *cuMtxB, uint32_t *cuMtxC) {
int localSz = 1;
for (int i = 1; i <= 1024; i++) {
if (N*N % i == 0)
localSz = i;
}
dim3 cuBlock(localSz);
dim3 cuGrid(N*N/localSz);
matrixMul<<<cuGrid, cuBlock>>>(cuMtxA, cuMtxB, cuMtxC, N);
CheckErr(cudaGetLastError());
}
void matrix_add(uint32_t *cuMtxA, uint32_t *cuMtxB, uint32_t *cuMtxC) {
int localSz = 1;
for (int i = 1; i <= 1024; i++) {
if (N*N % i == 0)
localSz = i;
}
dim3 cuBlock(localSz);
dim3 cuGrid(N*N/localSz);
matrixAdd<<<cuGrid, cuBlock>>>(cuMtxA, cuMtxB, cuMtxC);
CheckErr(cudaGetLastError());
}
char expr[1024];
typedef struct Node {
struct Node *l, *r;
int opcode;
uint32_t *hostV, *cuV;
int regNeed, regId;
int pid, mid;
long long h;
} Node;
void replaceReg(Node *u, int a, int b) {
if (u->l == NULL) return ;
if (u->regId == a)
u->regId = b;
replaceReg(u->l, a, b);
replaceReg(u->r, a, b);
}
void updateNode(Node *u, Node *l, Node *r, int opcode) {
u->l = l, u->r = r, u->opcode = opcode;
if (opcode == '+') {
u->h = u->l->h + u->r->h + N;
// -- register allocation
if (u->l->regNeed == u->r->regNeed) {
u->regNeed = u->l->regNeed + 1;
u->regId = u->regNeed;
replaceReg(u->r, u->r->regId, u->regId);
} else {
u->regNeed = u->l->regNeed > u->r->regNeed ? u->l->regNeed : u->r->regNeed;
u->regId = u->regNeed;
}
} else if (opcode == '*') {
u->h = u->l->h + u->r->h + N*N;
// -- register allocation
if (abs(u->l->regNeed - u->r->regNeed) == 1) {
u->regNeed = u->l->regNeed + 1;
u->regId = u->regNeed;
} else if (u->l->regNeed == u->r->regNeed) {
u->regNeed = u->l->regNeed + 2;
u->regId = u->regNeed;
replaceReg(u->r, u->r->regId, u->regId-1);
} else {
u->regNeed = u->l->regNeed > u->r->regNeed ? u->l->regNeed : u->r->regNeed;
u->regId = u->regNeed;
int a, b;
if (u->l->regId == u->regId) {
a = u->l->regId, b = u->l->regId-1;
replaceReg(u->l, a, -1);
replaceReg(u->l, b, a);
replaceReg(u->l, -1, b);
} else {
a = u->r->regId, b = u->r->regId-1;
replaceReg(u->r, a, -1);
replaceReg(u->r, b, a);
replaceReg(u->r, -1, b);
}
}
}
assert(u->regId < MAXMID);
}
Node* parseExpr(int l, int r, char expr[], int procId) {
Node *u = (Node *) calloc(1, sizeof(Node));
u->pid = procId;
if (l == r) {
int idx = expr[l] - 'A';
u->mid = idx;
u->h = 0;
return u;
}
int cnt = 0;
for (int i = l; i <= r; i++) {
if (expr[i] == '(') {
cnt++;
} else if (expr[i] == ')') {
cnt--;
} else if (expr[i] == '+' && cnt == 0) {
updateNode(u, parseExpr(l, i-1, expr, procId), parseExpr(i+1, r, expr, procId), '+');
return u;
}
}
for (int i = l; i <= r; i++) {
if (expr[i] == '(') {
if (cnt == 0 && i != l) {
updateNode(u, parseExpr(l, i-1, expr, procId), parseExpr(i, r, expr, procId), '*');
return u;
}
cnt++;
} else if (expr[i] == ')') {
cnt--;
} else if (expr[i] >= 'A' && expr[i] <= 'Z' && cnt == 0 && i != l) {
updateNode(u, parseExpr(l, i-1, expr, procId), parseExpr(i, r, expr, procId), '*');
return u;
}
}
free(u);
return parseExpr(l+1, r-1, expr, procId);
}
uint32_t writeOut(uint32_t *hostC) {
uint32_t h = 0;
uint32_t *Cend = hostC + N*N, *C = hostC;
for (; C != Cend; C++)
h = (h + *C) * 2654435761LU;
return h;
}
uint32_t *cuMemMid[MAXGPU][MAXMID];
uint32_t *cuMemIn[MAXGPU][MAXM];
void memRelocation(Node *u, int did, Node *nodes[], int *offset) {
if (u->l == NULL) {
nodes[*offset] = u, (*offset)++;
return ;
}
u->cuV = cuMemMid[did][u->regId];
if (u->l->regNeed > u->r->regNeed) {
memRelocation(u->l, did, nodes, offset);
memRelocation(u->r, did, nodes, offset);
} else {
memRelocation(u->r, did, nodes, offset);
memRelocation(u->l, did, nodes, offset);
}
fprintf(stderr, "reg%d = %s ", u->regId, u->opcode == '+' ? "add" : "mul");
if (u->l->l == NULL) fprintf(stderr, "%c ", u->l->mid + 'A');
else fprintf(stderr, "reg%d ", u->l->regId);
if (u->r->l == NULL) fprintf(stderr, "%c\n", u->r->mid + 'A');
else fprintf(stderr, "reg%d\n", u->r->regId);
nodes[*offset] = u, (*offset)++;
return ;
}
int executeGPU(Node *workQue[][128], int workQueSz[], uint32_t resultBuff[]) {
Node* nodes[MAXGPU][128];
int offset[MAXGPU] = {};
uint32_t memSz = N*N*sizeof(uint32_t);
int memDeploy[MAXGPU][MAXM] = {};
int regDeploy[MAXGPU][MAXMID] = {};
// -- execute multi-device
#pragma omp parallel for
for (int p = 0; p < clNeedDevCnt; p++) {
cudaSetDevice(p);
for (int q = 0; q < workQueSz[p]; q++) {
// -- flatten binary tree
offset[p] = 0;
memRelocation(workQue[p][q], p, nodes[p], &offset[p]);
// -- execute in order
for (int i = 0; i < offset[p]; i++) {
Node *u = nodes[p][i];
// -- is leaf, deploy memory copy
if (u->l == NULL) {
if (!memDeploy[p][u->mid]) {
cudaMalloc((void **) &cuMemIn[p][u->mid], memSz);
cudaMemcpy(cuMemIn[p][u->mid], hostMtx[u->mid], memSz, cudaMemcpyHostToDevice);
memDeploy[p][u->mid] = 1;
}
u->cuV = cuMemIn[p][u->mid];
continue;
}
// -- inner node using minimum #buffer
if (!regDeploy[p][u->regId])
cudaMalloc((void **) &cuMemMid[p][u->regId], memSz);
if (u->cuV == NULL)
u->cuV = cuMemMid[p][u->regId];
if (u->opcode == '*')
matrix_multiply(u->l->cuV, u->r->cuV, u->cuV);
else
matrix_add(u->l->cuV, u->r->cuV, u->cuV);
}
// -- read back and store answer
Node *root = workQue[p][q];
fprintf(stderr, "register need %d\n", root->regNeed);
cudaMemcpy(hostMid[p], root->cuV, memSz, cudaMemcpyDeviceToHost);
uint32_t ret = writeOut(hostMid[p]);
resultBuff[root->pid] = ret;
// -- free inner node buffer
for (int i = 0; i < offset[p]; i++) {
Node *u = nodes[p][i];
if (u->l != NULL && u->hostV)
free(u->hostV);
free(u);
}
}
// -- free buffer
cudaSetDevice(p);
for (int i = 0; i < MAXMID; i++) {
cudaFree(cuMemMid[p][i]);
}
for (int i = 0; i < M; i++) {
cudaFree(cuMemIn[p][i]);
}
}
return 1;
}
int readIn() {
if (scanf("%s", expr) != 1)
return 0;
return 1;
}
int balance_cmp(const void *a, const void *b) {
Node *x = *(Node **) a;
Node *y = *(Node **) b;
if (x->h == y->h) return 0;
if (x->h < y->h) return 1;
return -1;
}
void onStart() {
int S[64];
assert(scanf("%d %d", &M, &N) == 2);
for (int i = 0; i < M; i++)
assert(scanf("%d", &S[i]) == 1);
#pragma omp parallel for
for (int p = 0; p < M; p++) {
uint32_t x = 2, n = N*N;
uint32_t c = S[p];
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
x = (x * x + c + i + j)%n;
hostMtx[p][i*N+j] = x;
}
}
}
Node *procBuff[128];
if (scanf("%d", &Q) != 1)
return ;
for (int i = 0; i < Q; i++) {
readIn();
int expr_len = strlen(expr);
procBuff[i] = parseExpr(0, expr_len-1, expr, i);
}
qsort(procBuff, Q, sizeof(Node*), balance_cmp);
float gpuSpeed[MAXGPU] = {1.f, 1.8f, 2.0f};
long long workload[MAXGPU] = {};
int workQueSz[MAXGPU] = {};
uint32_t resultBuff[128] = {}; // indexed by query id (Q <= 100), not by device
Node *workQue[MAXGPU][128];
for (int i = 0; i < Q; i++) {
int mn = 0;
for (int j = 0; j < clNeedDevCnt; j++) {
if (workload[j]*gpuSpeed[j] < workload[mn]*gpuSpeed[mn])
mn = j;
}
workload[mn] += procBuff[i]->h;
workQue[mn][workQueSz[mn]++] = procBuff[i];
}
executeGPU(workQue, workQueSz, resultBuff);
for (int i = 0; i < Q; i++)
printf("%u\n", resultBuff[i]);
}
int main(int argc, char *argv[]) {
onStart();
return 0;
}

批改娘 10102. Matrix Calculator (CUDA)

Problem Description

Xiao Ming's math homework is to compute square matrices. Please help him!

The problem gives several $N \times N$ matrices and $2$ subproblems.

  • $X = AB+CD$
  • $Y = ABE+CDF$

sequence.c

#include <stdio.h>
#include <stdint.h>
// #define DEBUG
#define UINT uint32_t
#define MAXN 1024
void multiply(int N, UINT A[][MAXN], UINT B[][MAXN], UINT C[][MAXN]) {
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
UINT sum = 0; // overflow, let it go.
for (int k = 0; k < N; k++)
sum += A[i][k] * B[k][j];
C[i][j] = sum;
}
}
}
void add(int N, UINT A[][MAXN], UINT B[][MAXN], UINT C[][MAXN]) {
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++)
C[i][j] = A[i][j] + B[i][j];
}
}
void rand_gen(UINT c, int N, UINT A[][MAXN]) {
UINT x = 2, n = N*N;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
x = (x * x + c + i + j)%n;
A[i][j] = x;
}
}
}
void print_matrix(int N, UINT A[][MAXN]) {
for (int i = 0; i < N; i++) {
fprintf(stderr, "[");
for (int j = 0; j < N; j++)
fprintf(stderr, " %u", A[i][j]);
fprintf(stderr, " ]\n");
}
}
UINT signature(int N, UINT A[][MAXN]) {
UINT h = 0;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++)
h = (h + A[i][j]) * 2654435761LU;
}
return h;
}
UINT IN[6][MAXN][MAXN], TMP[6][MAXN][MAXN];
int main() {
int N, S[6];
scanf("%d", &N);
for (int i = 0; i < 6; i++) {
scanf("%d", &S[i]);
rand_gen(S[i], N, IN[i]);
}
// AB
multiply(N, IN[0], IN[1], TMP[0]);
// CD
multiply(N, IN[2], IN[3], TMP[1]);
// AB+CD
add(N, TMP[0], TMP[1], TMP[2]);
printf("%u\n", signature(N, TMP[2]));
// ABE
multiply(N, TMP[0], IN[4], TMP[3]);
// CDF
multiply(N, TMP[1], IN[5], TMP[4]);
// ABE+CDF
add(N, TMP[3], TMP[4], TMP[5]);
printf("%u\n", signature(N, TMP[5]));
return 0;
}

Input Format

There is a single test case. The first line has one integer $N$: the problem gives $N \times N$ matrices. The second line has $6$ integers, the generation seeds of matrices $A, B, C, D, E, F$.

  • $1 \le N \le 1024$
  • $0 \le S_i \le 2^{31}$

Output Format

Output two lines, the hash values of $X$ and $Y$; see sequence.c for the procedure.

Sample Input 1

2
0 1 2 3 4 5
$$A = \begin{bmatrix} 0 & 1\\ 2 & 2 \end{bmatrix}, B = \begin{bmatrix} 1 & 3\\ 3 & 0 \end{bmatrix}, C = \begin{bmatrix} 2 & 3\\ 0 & 0 \end{bmatrix}, D = \begin{bmatrix} 3 & 1\\ 1 & 2 \end{bmatrix}, E = \begin{bmatrix} 0 & 1\\ 2 & 2 \end{bmatrix}, F = \begin{bmatrix} 1 & 3\\ 3 & 0 \end{bmatrix}$$ $$AB = \begin{bmatrix} 3 & 0\\ 8 & 6 \end{bmatrix}, CD = \begin{bmatrix} 9 & 8\\ 0 & 0 \end{bmatrix}, AB+CD = \begin{bmatrix} 12 & 8\\ 8 & 6 \end{bmatrix}\\ ABE = \begin{bmatrix} 0 & 3\\ 12 & 20 \end{bmatrix}, CDF = \begin{bmatrix} 33 & 27\\ 0 & 0 \end{bmatrix}, ABE+CDF = \begin{bmatrix} 33 & 30\\ 12 & 20 \end{bmatrix}$$

Sample Output 1

2385860290
1374821695

Sample Input 2

10
0 1 2 3 4 5

Sample Output 2

617438354
1897844131

Compilation

$ nvcc -Xcompiler "-O2 -fopenmp" main.cu -o main

Solution

To speed up the computation, a block can have at most 1024 threads running. Because warp scheduling and warp size are involved, and every core's compute capacity should be fully used, experiments show the block size should be as large as possible without exceeding the register budget; that gives the best performance. This differs from OpenCL: when OpenCL detects a NULL block size it automatically tunes it to the best value, whereas in CUDA the user has to set it, so some people never think about the size at all.

Given the above, simply pick the largest valid value greedily, as sketched below.
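
A sketch of that greedy pick (the same divisor loop appears in the solution below; the helper name is mine):

int pick_block_size(int total) {
    int best = 1;
    for (int i = 1; i <= 1024; i++)
        if (total % i == 0) // keep the largest divisor <= 1024
            best = i;
    return best;
}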

#include <stdio.h>
#include <assert.h>
#include <inttypes.h>
#include <string.h>
#include <cuda.h>
#define MAXN 1024
#define GPULOCAL 64
#define UNLOOP 8
#define CheckErr(status) { gpuAssert((status), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, int abort=true) {
if (code != cudaSuccess) {
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
uint32_t hostMtx[6][MAXN*MAXN];
uint32_t hostMid[2][MAXN*MAXN];
int N = MAXN, M;
__global__ void matrixMul(uint32_t A[], uint32_t B[], uint32_t C[], int N) {
int r = blockIdx.x * blockDim.x + threadIdx.x;
int x = r / N, y = r % N;
uint32_t sum = 0;
for (int i = 0; i < N; i++)
sum += A[x*N + i] * B[i*N + y];
C[x * N + y] = sum;
}
__global__ void matrixAdd(uint32_t A[], uint32_t B[], uint32_t C[]) {
int r = blockIdx.x * blockDim.x + threadIdx.x;
C[r] = A[r] + B[r];
}
void readIn() {
uint32_t S[64];
assert(scanf("%d", &N) == 1);
M = 6;
for (int i = 0; i < M; i++)
assert(scanf("%d", &S[i]) == 1);
#pragma omp parallel for
for (int p = 0; p < M; p++) {
uint32_t x = 2, n = N*N, c = S[p];
x = 2;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
x = (x * x + c + i + j)%n;
hostMtx[p][i*N+j] = x;
}
}
}
}
uint32_t writeOut(uint32_t *hostC) {
uint32_t h = 0;
uint32_t *Cend = hostC + N*N, *C = hostC;
for (; C != Cend; C++)
h = (h + *C) * 2654435761LU;
return h;
}
void matrix_multiply(uint32_t *cuMtxA, uint32_t *cuMtxB, uint32_t *cuMtxC) {
int localSz = 1;
for (int i = 1; i <= 1024; i++) {
if (N*N % i == 0)
localSz = i;
}
dim3 cuBlock(localSz);
dim3 cuGrid(N*N/localSz);
matrixMul<<<cuGrid, cuBlock>>>(cuMtxA, cuMtxB, cuMtxC, N);
CheckErr(cudaGetLastError());
}
void matrix_add(uint32_t *cuMtxA, uint32_t *cuMtxB, uint32_t *cuMtxC) {
int localSz = 1;
for (int i = 1; i <= 1024; i++) {
if (N*N % i == 0)
localSz = i;
}
dim3 cuBlock(localSz);
dim3 cuGrid(N*N/localSz);
matrixAdd<<<cuGrid, cuBlock>>>(cuMtxA, cuMtxB, cuMtxC);
CheckErr(cudaGetLastError());
}
int main(int argc, char *argv[]) {
readIn();
uint32_t *cuMtx[6], *cuMtxTmp[6];
uint32_t memSz = N*N*sizeof(uint32_t);
for (int i = 0; i < 6; i++) {
cudaMalloc((void **) &cuMtx[i], memSz);
cudaMemcpy(cuMtx[i], hostMtx[i], memSz, cudaMemcpyHostToDevice);
CheckErr(cudaGetLastError());
}
for (int i = 0; i < 6; i++)
cudaMalloc((void **) &cuMtxTmp[i], memSz);
// cuMtxTmp[0] = AB
matrix_multiply(cuMtx[0], cuMtx[1], cuMtxTmp[0]);
// cuMtxTmp[1] = CD
matrix_multiply(cuMtx[2], cuMtx[3], cuMtxTmp[1]);
// cuMtxTmp[2] = ABE
matrix_multiply(cuMtxTmp[0], cuMtx[4], cuMtxTmp[2]);
// cuMtxTmp[3] = CDF
matrix_multiply(cuMtxTmp[1], cuMtx[5], cuMtxTmp[3]);
// cuMtxTmp[4] = AB + CD
matrix_add(cuMtxTmp[0], cuMtxTmp[1], cuMtxTmp[4]);
// cuMtxTmp[5] = ABE+CDF
matrix_add(cuMtxTmp[2], cuMtxTmp[3], cuMtxTmp[5]);
cudaMemcpy(hostMid[0], cuMtxTmp[4], memSz, cudaMemcpyDeviceToHost);
cudaMemcpy(hostMid[1], cuMtxTmp[5], memSz, cudaMemcpyDeviceToHost);
uint32_t ret[2];
#pragma omp parallel for
for (int i = 0; i < 2; i++) {
ret[i] = writeOut(hostMid[i]);
}
for (int i = 0; i < 2; i++)
printf("%u\n", ret[i]);
for (int i = 0; i < 6; i++)
cudaFree(cuMtx[i]);
for (int i = 0; i < 6; i++)
cudaFree(cuMtxTmp[i]);
return 0;
}

批改娘 10101. Fast Game of Life (CUDA)

Problem Description

In the Game of Life, the rules for any cell are as follows:
each cell has two states, alive or dead, and interacts with the eight cells surrounding it.

  • A live cell with fewer than 2 live neighbours becomes dead.
  • A live cell with 2 or 3 live neighbours keeps its state.
  • A live cell with more than 3 live neighbours becomes dead.
  • A dead cell with exactly 3 live neighbours becomes alive.

The initial cell configuration is called the seed. Processing every cell of the seed with the rules above simultaneously yields the first generation; applying the rules to the current board again yields the next generation, and so on.
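
As a minimal sketch, the four rules collapse into one boolean update (adj is the live-neighbour count and t the current state, exactly as in the kernel further down):

// Alive next generation iff exactly 3 live neighbours,
// or alive now with exactly 2.
char next = (adj == 3) || (t == 1 && adj == 2);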

Input Format

The first line has two integers $N$, $M$: the board is $N \times N$ and the simulation runs for $M$ generations. The next $N$ lines each contain $N$ characters, where 0 means the cell at $(i, j)$ is dead and 1 means it is alive.

  • $1 \le N \le 2000$
  • $0 \le M \le 5000$

Output Format

For each test case output $N$ lines of $N$ characters each: the final board after $M$ generations.

Sample Input 1

5 1
10001
00100
01110
00100
01010

Sample Output 1

00000
00100
01010
00000
00100

Sample Input 2

5 3
10001
00100
01110
00100
01010

Sample Output 2

00000
00000
01110
00000
00000

Compilation

$ nvcc -Xcompiler "-O2 -fopenmp" main.cu -o main
$ ./main

Solution

CUDA functions are either blocking or non-blocking: since this is heterogeneous computing, the host does not have to wait for the GPU to finish before executing the next statement. A kernel function call is non-blocking; cudaDeviceSynchronize can be used to wait until every task on the device completes before moving to the next block of work.

But don't forget that cudaMemcpy and kernel function calls all go through something like OpenCL's Command Queue: unless configured otherwise, they are processed in-order (as opposed to arbitrary order, they must execute in the order they entered the queue). A memory copy is just another command, and cudaMemcpy is a blocking function, so the design does not necessarily need an extra cudaDeviceSynchronize.
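
A minimal sketch of the resulting pattern on the default stream (names are illustrative):

kernel<<<grid, block>>>(devBuf);                            // non-blocking: queued, host continues
cudaMemcpy(hostBuf, devBuf, size, cudaMemcpyDeviceToHost);  // blocking, runs in order after the kernel
// no extra cudaDeviceSynchronize is needed for correctness here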

#include <stdio.h>
#include <assert.h>
#include <inttypes.h>
#include <string.h>
#include <cuda.h>
#define GPULOCAL 1024
#define CheckErr(status) { gpuAssert((status), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, int abort=true) {
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
#define MAXN 2048
static int N, M, BN;
static char str[MAXN][MAXN] = {};
static char hostMtx[2][MAXN*MAXN] = {};
// One thread per interior cell; the board carries a one-cell border of
// zeros (pitch BN = N+2), so no boundary checks are needed.
__global__ void simulate(char IN[], char OUT[], int N, int BN) {
    int globalId = blockIdx.x * blockDim.x + threadIdx.x;
    int x = globalId/N + 1, y = globalId%N + 1;
#define G(x, y) IN[(x) * BN + (y)]
    char t = G(x, y);
    char adj = G(x-1, y-1) + G(x-1, y) + G(x-1, y+1)
             + G(x, y-1) + G(x, y+1) + G(x+1, y-1) + G(x+1, y) + G(x+1, y+1);
    OUT[x * BN + y] = (t == 0 && adj == 3) || (t == 1 && (adj == 2 || adj == 3));
#undef G
}
void runCuda() {
    int cudaDeviceCnt = 0;
    cudaGetDeviceCount(&cudaDeviceCnt);
    if (cudaDeviceCnt == 0) {
        printf("No supported GPU\n");
        return;
    }
    char *cuMtx[2];
    int memSz = BN*BN*sizeof(char);
    // Pick the largest block size (at most 1024) that divides N*N evenly.
    int localSz = 1;
    for (int i = 1; i <= 1024; i++) {
        if (N*N%i == 0)
            localSz = i;
    }
    dim3 cuBlock(localSz);
    dim3 cuGrid(N*N/localSz);
    cudaMalloc((void **) &cuMtx[0], memSz);
    cudaMalloc((void **) &cuMtx[1], memSz);
    cudaMemcpy(cuMtx[0], hostMtx[0], memSz, cudaMemcpyHostToDevice);
    cudaMemcpy(cuMtx[1], hostMtx[1], memSz, cudaMemcpyHostToDevice);
    // Ping-pong between the two buffers, one kernel launch per generation.
    for (int i = 0; i < M; i++) {
        simulate<<<cuGrid, cuBlock>>>(cuMtx[i%2], cuMtx[(i+1)%2], N, BN);
        CheckErr(cudaGetLastError());
    }
    cudaDeviceSynchronize();
    int f = M%2;
    cudaMemcpy(hostMtx[f], cuMtx[f], memSz, cudaMemcpyDeviceToHost);
    for (int i = 1; i <= N; i++) {
        for (int j = 1; j <= N; j++)
            hostMtx[f][i*BN + j] += '0';
        puts(hostMtx[f] + i*BN + 1); // border byte stays 0, terminating the row
    }
    cudaFree(cuMtx[0]);
    cudaFree(cuMtx[1]);
}
int main() {
    assert(scanf("%d %d", &N, &M) == 2);
    while (getchar() != '\n');
    for (int i = 1; i <= N; i++)
        assert(fgets(str[i]+1, MAXN, stdin) != NULL);
    BN = N+2;
    for (int i = 1; i <= N; i++) {
        for (int j = 1; j <= N; j++)
            hostMtx[0][i*BN + j] = str[i][j] - '0';
    }
    runCuda();
    return 0;
}

批改娘 10100. Fast Matrix Multiplication (CUDA)

Problem Description

Compute the product $C = A \times B$ of two $N \times N$ matrices $A$ and $B$. To save input/output time, the matrices are generated pseudo-randomly; refer to the code below and rewrite it as a CUDA version for speedup.

sequence.c

#include <stdio.h>
#include <stdint.h>
// #define DEBUG
#define UINT uint32_t
#define MAXN 1024
void multiply(int N, UINT A[][MAXN], UINT B[][MAXN], UINT C[][MAXN]) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            UINT sum = 0; // overflow, let it go.
            for (int k = 0; k < N; k++)
                sum += A[i][k] * B[k][j];
            C[i][j] = sum;
        }
    }
}
void rand_gen(UINT c, int N, UINT A[][MAXN]) {
    UINT x = 2, n = N*N;
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            x = (x * x + c + i + j)%n;
            A[i][j] = x;
        }
    }
}
void print_matrix(int N, UINT A[][MAXN]) {
    for (int i = 0; i < N; i++) {
        fprintf(stderr, "[");
        for (int j = 0; j < N; j++)
            fprintf(stderr, " %u", A[i][j]);
        fprintf(stderr, " ]\n");
    }
}
UINT signature(int N, UINT A[][MAXN]) {
    UINT h = 0;
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++)
            h = (h + A[i][j]) * 2654435761LU;
    }
    return h;
}
UINT A[MAXN][MAXN], B[MAXN][MAXN], C[MAXN][MAXN];
int main() {
    int N;
    uint32_t S1, S2;
    scanf("%d %u %u", &N, &S1, &S2);
    rand_gen(S1, N, A);
    rand_gen(S2, N, B);
    multiply(N, A, B, C);
#ifdef DEBUG
    print_matrix(N, A);
    print_matrix(N, B);
    print_matrix(N, C);
#endif
    printf("%u\n", signature(N, C));
    return 0;
}

Input Format

There is a single test case consisting of three integers $N, S_1, S_2$: the matrix size $N \times N$, and the random seeds used to generate matrices $A$ and $B$.

  • $64 \le N \le 1024$, with $N \mod 64 \equiv 0$ guaranteed
  • $0 \le S_1, \; S_2 < 2^{31}$

Output Format

Print one line with the hash value $H$; see sequence.c for the procedure.

Sample Input

64 1 2

Sample Output

3376147904

Compilation

$ nvcc -Xcompiler "-O2 -fopenmp" main.cu -o main

Solution

Compared with the OpenCL version, isn't this much cleaner? You never have to care how the Context and the Kernel Program get built, but all of that convenience is the result of defaults.

By default CUDA runs on device 0, the first GPU in PCI-E enumeration order; to change it, call cudaSetDevice(deviceId). OpenCL's CommandQueue corresponds to CUDA's Stream. In later problems I will give a few examples of these CUDA functions to help speed up the designed computation pipeline; a first sketch follows below.
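
A minimal sketch (assuming two independent buffers) that selects a device explicitly and overlaps two host-to-device copies on separate streams:

#include <stdio.h>
#include <cuda_runtime.h>

int main() {
    int count = 0;
    cudaGetDeviceCount(&count);
    printf("%d CUDA device(s)\n", count);
    cudaSetDevice(0); // explicit, although device 0 is already the default

    // Streams are CUDA's counterpart of OpenCL command queues: commands in
    // one stream run in-order, but two streams may overlap with each other.
    cudaStream_t s[2];
    for (int i = 0; i < 2; i++)
        cudaStreamCreate(&s[i]);

    const int N = 1 << 20;
    int *h[2], *d[2];
    for (int i = 0; i < 2; i++) {
        cudaMallocHost((void **) &h[i], N * sizeof(int)); // pinned memory is
        cudaMalloc((void **) &d[i], N * sizeof(int));     // required for truly
        for (int j = 0; j < N; j++)                       // asynchronous copies
            h[i][j] = j;
        cudaMemcpyAsync(d[i], h[i], N * sizeof(int),
                        cudaMemcpyHostToDevice, s[i]);
    }
    cudaDeviceSynchronize(); // wait until both streams have drained
    for (int i = 0; i < 2; i++) {
        cudaStreamDestroy(s[i]);
        cudaFreeHost(h[i]);
        cudaFree(d[i]);
    }
    return 0;
}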

#include <stdio.h>
#include <assert.h>
#include <inttypes.h>
#include <string.h>
#include <cuda.h>
#define MAXN 1024
#define GPULOCAL 64
#define UNLOOP 8
#define CheckErr(status) { gpuAssert((status), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, int abort=true) {
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
uint32_t hostA[MAXN*MAXN], hostB[MAXN*MAXN], hostC[MAXN*MAXN];
int N = MAXN;
// One thread per output row: the thread's row of A lives in per-thread
// (local) memory, the current column of B is staged cooperatively into
// shared memory, and the inner product is unrolled by UNLOOP.
__global__ void matrixMul(uint32_t A[], uint32_t B[], uint32_t C[], int N) {
    __shared__ uint32_t cbuf[MAXN+1];
    uint32_t rbuf[MAXN];
    int r = blockIdx.x * blockDim.x + threadIdx.x;
    int localId = threadIdx.x;
    int localSz = blockDim.x;
    for (int i = 0; i < N; i++)
        rbuf[i] = A[r * N + i];
    for (int c = 0; c < N; c++) {
        for (int cr = localId; cr < N; cr += localSz)
            cbuf[cr] = B[cr * N + c];
        __syncthreads();
        uint32_t sum = 0;
        for (int k = 0; k+UNLOOP-1 < N; k += UNLOOP) {
            sum += rbuf[k+0] * cbuf[k+0];
            sum += rbuf[k+1] * cbuf[k+1];
            sum += rbuf[k+2] * cbuf[k+2];
            sum += rbuf[k+3] * cbuf[k+3];
            sum += rbuf[k+4] * cbuf[k+4];
            sum += rbuf[k+5] * cbuf[k+5];
            sum += rbuf[k+6] * cbuf[k+6];
            sum += rbuf[k+7] * cbuf[k+7];
        }
        C[r * N + c] = sum;
        __syncthreads(); // don't refill cbuf while other threads still read it
    }
}
void readIn() {
    uint32_t c1, c2;
    assert(scanf("%d %u %u", &N, &c1, &c2) == 3);
    uint32_t x, n = N*N;
    x = 2;
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            x = (x * x + c1 + i + j)%n; // %n as in sequence.c; n need not be a power of two
            hostA[i*N+j] = x;
        }
    }
    x = 2;
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            x = (x * x + c2 + i + j)%n;
            hostB[i*N+j] = x;
        }
    }
}
void writeOut() {
    uint32_t h = 0;
    uint32_t *Cend = hostC + N*N, *C = hostC;
    for (; C != Cend; C++)
        h = (h + *C) * 2654435761LU;
    printf("%u\n", h);
}
int main(int argc, char *argv[]) {
    readIn();
    uint32_t *cuMtxC, *cuMtxA, *cuMtxB;
    cudaMalloc((void **) &cuMtxC, N*N*sizeof(uint32_t));
    cudaMalloc((void **) &cuMtxA, N*N*sizeof(uint32_t));
    cudaMalloc((void **) &cuMtxB, N*N*sizeof(uint32_t));
    cudaMemcpy(cuMtxA, hostA, sizeof(uint32_t)*N*N, cudaMemcpyHostToDevice);
    cudaMemcpy(cuMtxB, hostB, sizeof(uint32_t)*N*N, cudaMemcpyHostToDevice);
    CheckErr(cudaGetLastError());
    dim3 cuBlock(GPULOCAL);
    dim3 cuGrid(N/GPULOCAL);
    matrixMul<<<cuGrid, cuBlock>>>(cuMtxA, cuMtxB, cuMtxC, N);
    CheckErr(cudaGetLastError());
    cudaMemcpy(hostC, cuMtxC, sizeof(uint32_t)*N*N, cudaMemcpyDeviceToHost);
    CheckErr(cudaGetLastError());
    writeOut();
    cudaFree(cuMtxA);
    cudaFree(cuMtxB);
    cudaFree(cuMtxC);
    return 0;
}

批改娘 10099. Dot Product (CUDA)

Problem Description

Rewrite the computation below in CUDA:

main.c

#include <stdio.h>
#include <assert.h>
#include <omp.h>
#include <inttypes.h>
#include <stdint.h>
#include "utils.h"
#define MAXGPU 8
#define MAXCODESZ 32767
#define MAXN 16777216
uint32_t A[MAXN], B[MAXN], C[MAXN];
int main(int argc, char *argv[]) {
    omp_set_num_threads(4);
    int N;
    uint32_t key1, key2;
    while (scanf("%d %" PRIu32 " %" PRIu32, &N, &key1, &key2) == 3) {
        int chunk = N / 4;
        for (int i = 0; i < N; i++) {
            A[i] = encrypt(i, key1);
            B[i] = encrypt(i, key2);
        }
        for (int i = 0; i < N; i++)
            C[i] = A[i] * B[i];
        uint32_t sum = 0;
        for (int i = 0; i < N; i++)
            sum += C[i];
        printf("%" PRIu32 "\n", sum);
    }
    return 0;
}

utils.h

#ifndef _UTILS_H
#define _UTILS_H
#include <stdint.h>
static inline uint32_t rotate_left(uint32_t x, uint32_t n) {
    return (x << n) | (x >> (32-n));
}
static inline uint32_t encrypt(uint32_t m, uint32_t key) {
    return (rotate_left(m, key&31) + key)^key;
}
#endif

Sample Input

16777216 1 2
16777216 3 5

Sample Output

2885681152
2147483648

Compilation

$ nvcc -Xcompiler "-O2 -fopenmp" main.cu -o main
$ ./main

Solution

Same trick as in our OpenCL implementation: both the test-data generation and the computation are pushed onto the GPU. Note, however, that CUDA only runs on Nvidia cards, and the compute capability differs from card to card and version to version (see the wiki). The minimum is 1.0, selected by adding nvcc -arch=compute_10 to the compile flags; if the card supports 2.0, pass nvcc -arch=compute_20, and so on. The compiler defaults to compute capability 1.0, so to print from inside a kernel function (i.e. printf()), you must compile for at least 2.0; see the sketch below.

Writing CUDA does not require the OpenCL ritual of finding a Platform, enumerating Devices, creating a Context from the device IDs, and then building a Program from the Context. CUDA provides special syntax where OpenCL wraps everything in special functions, which makes the programming complexity differ enormously. In terms of flexibility, OpenCL exposes more tunable and dynamic options, but CUDA, being Nvidia's own product, tends to run slightly faster than the same version of OpenCL, partly because the compilers differ.
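
A minimal sketch of device-side printf, which requires compiling for at least compute capability 2.0 (e.g. nvcc -arch=compute_20 main.cu -o main):

#include <stdio.h>

__global__ void hello() {
    // Device-side printf exists from compute capability 2.0 onward.
    printf("hello from block %d, thread %d\n", blockIdx.x, threadIdx.x);
}

int main() {
    hello<<<2, 4>>>();
    cudaDeviceSynchronize(); // flush the device-side printf buffer
    return 0;
}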

#include <stdio.h>
#include <stdint.h>
#include <cuda.h>
#include <omp.h>
__device__ uint32_t rotate_left(uint32_t x, uint32_t n) {
    return (x << n) | (x >> (32-n));
}
__device__ uint32_t encrypt(uint32_t m, uint32_t key) {
    return (rotate_left(m, key&31) + key)^key;
}
__host__ uint32_t h_rotate_left(uint32_t x, uint32_t n) {
    return (x << n) | (x >> (32-n));
}
__host__ uint32_t h_encrypt(uint32_t m, uint32_t key) {
    return (h_rotate_left(m, key&31) + key)^key;
}
#define MAXN 16777216
#define GPULOCAL 128
#define BLOCKSZ (1024)
// Each thread generates and multiplies a chunk of BLOCKSZ elements on the
// fly (A, B and C are never materialized) and writes one partial sum.
__global__ void vecdot(uint32_t keyA, uint32_t keyB, uint32_t C[], int N) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int l = x * BLOCKSZ;
    int r = l + BLOCKSZ;
    uint32_t sum = 0;
    if (r > N) r = N;
    for (int i = l; i < r; i++)
        sum += encrypt(i, keyA) * encrypt(i, keyB);
    C[x] = sum;
}
uint32_t hostC[MAXN / GPULOCAL];
#define CheckErr(status) { gpuAssert((status), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, int abort=true) {
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
int main() {
    uint32_t N, keyA, keyB;
    uint32_t *cuArrC;
    cudaMalloc((void **)&cuArrC, MAXN/GPULOCAL*sizeof(uint32_t));
    while (scanf("%u %u %u", &N, &keyA, &keyB) == 3) {
        int M = (N + BLOCKSZ-1) / BLOCKSZ;   // number of partial sums
        int LOCAL = GPULOCAL;
        M = (M + LOCAL - 1) / LOCAL * LOCAL; // round up to a multiple of LOCAL
        dim3 cuBlock(LOCAL);
        dim3 cuGrid(M/LOCAL);
        vecdot<<<cuGrid, cuBlock>>>(keyA, keyB, cuArrC, N);
        CheckErr(cudaGetLastError());
        cudaMemcpy(hostC, cuArrC, M*sizeof(uint32_t), cudaMemcpyDeviceToHost);
        // Final reduction of the partial sums on the CPU, in parallel.
        uint32_t sum = 0;
#ifdef _OPENMP
        omp_set_num_threads(4);
#endif
#pragma omp parallel for reduction(+: sum)
        for (int i = 0; i < M; i++)
            sum += hostC[i];
        printf("%u\n", sum);
    }
    cudaFree(cuArrC);
    return 0;
}