批改娘 10109. Sorting (CUDA)

contents

  1. 1. 題目描述
  2. 2. 輸入格式
  3. 3. 輸出格式
  4. 4. 範例輸入
  5. 5. 範例輸出
  6. 6. Solution
    1. 6.1. bitonic sort
    2. 6.2. 內建 thrust

題目描述

請加速以下程序。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
#include <bits/stdc++.h>
using namespace std;
#define MAXN 16777216
uint32_t A[MAXN];
inline uint32_t encrypt(uint32_t m, uint32_t key) {
return (m*m + key)%key;
}
int main() {
int N, K;
while (scanf("%d %d", &N, &K) == 2) {
assert(N&(-N) == N);
for (int i = 0; i < N; i++)
A[i] = encrypt(i, K);
sort(A, A+N);
uint32_t sum = 0;
#pragma omp parallel for reduction(+:sum)
for (int i = 0; i < N; i++)
sum += A[i] * i;
printf("%u\n", sum);
}
return 0;
}

輸入格式

有多行測資,每組第一行會有兩個整數 $N, \; K$,表示要排序 $N$ 個整數,並且以亂數種子 $K$ 生成。

  • $N = 2^i, \; 0 \le i \le 24$
  • $1 \le K \le N$

輸出格式

輸出一行 $\sum i \cdot A_i$。

範例輸入

1
2
8 8
8 7

範例輸出

1
2
66
75

Solution

bitonic sort

實作 bitonic sort,最簡單全部使用 global memory 完成,但是在遞迴公式中,當 $n$ 已經足夠小就可以代入 shared memory 進行計算,這時候效能就非常好。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <algorithm>
#include <assert.h>
#include <omp.h>
using namespace std;
#define MAXN 16777216
#define MAXBLK 1024
#define CheckErr(status) { gpuAssert((status), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, int abort=true) {
if (code != cudaSuccess) {
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
uint32_t A[MAXN];
__device__ inline uint32_t encrypt(uint32_t m, uint32_t key) {
return (m*m + key)%key;
}
__device__ inline void swap(uint32_t &x, uint32_t &y) {
uint32_t t = x; x = y; y = t;
}
__global__ void bitonic_step(uint32_t *A, int p, int q) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
int up = ((i >> p)&2) == 0;
int d = 1 << (p - q);
if ((i & d) == 0 && (A[i] > A[i|d]) == up)
swap(A[i], A[i|d]);
}
__global__ void bitonic_step_fit(uint32_t *A, int p, int q) {
extern __shared__ uint32_t buff[];
int i = blockIdx.x * blockDim.x + threadIdx.x;
int up = ((i >> p)&2) == 0;
buff[threadIdx.x] = A[i];
__syncthreads();
for (; q <= p; q++) {
int d = 1 << (p - q);
if ((i & d) == 0 && (buff[threadIdx.x] > buff[threadIdx.x|d]) == up)
swap(buff[threadIdx.x], buff[threadIdx.x|d]);
__syncthreads();
}
A[i] = buff[threadIdx.x];
}
__global__ void sum_arr(uint32_t *A, uint32_t *B, int N) {
extern __shared__ uint32_t buff[];
int i = blockIdx.x * blockDim.x + threadIdx.x;
buff[threadIdx.x] = A[i] * i;
__syncthreads();
for (int i = blockDim.x>>1; i; i >>= 1) {
if (threadIdx.x < i)
buff[threadIdx.x] += buff[threadIdx.x + i];
__syncthreads();
}
if (threadIdx.x == 0)
B[blockIdx.x] = buff[0];
}
__global__ void rand_gen(uint32_t *A, int N, int K) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
A[i] = encrypt(i, K);
}
void output(int N, uint32_t *cuA, uint32_t *cuB) {
dim3 cuBlock(min(MAXBLK, N));
dim3 cuGrid(N / min(MAXBLK, N));
CheckErr(cudaGetLastError());

sum_arr<<<cuGrid, cuBlock, sizeof(uint32_t)*min(MAXBLK, N)>>>(cuA, cuB, N);
CheckErr(cudaGetLastError());

cudaMemcpy(A, cuB, sizeof(uint32_t) * N / min(MAXBLK, N), cudaMemcpyDeviceToHost);
CheckErr(cudaGetLastError());

uint32_t sum = 0;
for (int i = 0; i < N / min(MAXBLK, N); i++)
sum += A[i];
printf("%u\n", sum);
}
void sort_test(int N, int K, uint32_t *cuA, uint32_t *cuB) {
assert((N&-N) == N);
dim3 cuBlock(min(MAXBLK, N));
dim3 cuGrid(N / min(MAXBLK, N));
rand_gen<<<cuGrid, cuBlock>>>(cuA, N, K);
CheckErr(cudaGetLastError());

int logN = 1;

while ((1 << logN) < N) logN++;
for (int i = 0; i < logN; i++) {
for (int j = 0; j <= i; j++) {
if ((1 << (i - j) >= MAXBLK)) {
bitonic_step<<<cuGrid, cuBlock>>>(cuA, i, j);
CheckErr(cudaGetLastError());
} else {
bitonic_step_fit<<<cuGrid, cuBlock, sizeof(uint32_t)*(MAXBLK)>>>(cuA, i, j);
CheckErr(cudaGetLastError());
break;
}
}
}

output(N, cuA, cuB);
}

int main() {
uint32_t *cuA, *cuB;
cudaMalloc((void **) &cuA, sizeof(uint32_t) * MAXN);
cudaMalloc((void **) &cuB, sizeof(uint32_t) * MAXN / MAXBLK);
CheckErr(cudaGetLastError());
int N, K;
while(scanf("%d %d", &N, &K) == 2) {
sort_test(N, K, cuA, cuB);
}
cudaFree(cuA);
cudaFree(cuB);
return 0;
}

內建 thrust

當然這種最常見的函數一定有內建可以協助,如 thrust library 就有提供相關的函數,但第一次執行時會特別地慢,有可能是 library 要代入 cache 的緣故。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <algorithm>
#include <vector>
#include <assert.h>
#include <omp.h>
#include <thrust/sort.h>
#include <thrust/device_ptr.h>
using namespace std;
#define MAXN 16777216
#define MAXBLK 1024
#define CheckErr(status) { gpuAssert((status), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, int abort=true) {
if (code != cudaSuccess) {
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
uint32_t A[MAXN];
__device__ inline uint32_t encrypt(uint32_t m, uint32_t key) {
return (m*m + key)%key;
}
__device__ inline void swap(uint32_t &x, uint32_t &y) {
uint32_t t = x; x = y; y = t;
}
__global__ void sum_arr(uint32_t *A, uint32_t *B, int N) {
extern __shared__ uint32_t buff[];
int i = blockIdx.x * blockDim.x + threadIdx.x;
buff[threadIdx.x] = A[i] * i;
__syncthreads();
for (int i = blockDim.x>>1; i; i >>= 1) {
if (threadIdx.x < i)
buff[threadIdx.x] += buff[threadIdx.x + i];
__syncthreads();
}
if (threadIdx.x == 0)
B[blockIdx.x] = buff[0];
}
__global__ void rand_gen(uint32_t *A, int N, int K) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
A[i] = encrypt(i, K);
}
void output(int N, uint32_t *cuA, uint32_t *cuB) {
dim3 cuBlock(min(MAXBLK, N));
dim3 cuGrid(N / min(MAXBLK, N));
CheckErr(cudaGetLastError());

sum_arr<<<cuGrid, cuBlock, sizeof(uint32_t)*min(MAXBLK, N)>>>(cuA, cuB, N);
CheckErr(cudaGetLastError());

cudaMemcpy(A, cuB, sizeof(uint32_t) * N / min(MAXBLK, N), cudaMemcpyDeviceToHost);
CheckErr(cudaGetLastError());

uint32_t sum = 0;
for (int i = 0; i < N / min(MAXBLK, N); i++)
sum += A[i];
printf("%u\n", sum);
}
void cpu_compute(int N, int K) {
vector<int> A;
for (int i = 0; i < N; i++)
A.push_back((i*i + K)%K);
sort(A.begin(), A.end());
uint32_t sum = 0;
for (int i = 0; i < A.size(); i++)
sum += A[i] * i;
printf("%u\n", sum);
return ;
}
void sort_test(int N, int K, uint32_t *cuA, uint32_t *cuB) {
assert((N&-N) == N);
if (N < MAXBLK) {
cpu_compute(N, K);
return ;
}
dim3 cuBlock(min(MAXBLK, N));
dim3 cuGrid(N / min(MAXBLK, N));
rand_gen<<<cuGrid, cuBlock>>>(cuA, N, K);
CheckErr(cudaGetLastError());
thrust::device_ptr<uint32_t> cuAptr(cuA);
thrust::sort(cuAptr, cuAptr+N);
output(N, cuA, cuB);
}

int main() {
uint32_t *cuA, *cuB;
cudaMalloc((void **) &cuA, sizeof(uint32_t) * MAXN);
cudaMalloc((void **) &cuB, sizeof(uint32_t) * MAXN / MAXBLK);
CheckErr(cudaGetLastError());
int N, K;
while(scanf("%d %d", &N, &K) == 2)
sort_test(N, K, cuA, cuB);
cudaFree(cuA);
cudaFree(cuB);
return 0;
}