批改娘 10091. Fast Matrix Multiplication (OpenCL)

題目描述

計算兩個大小為 $N \times N$ 方陣 $A, \; B$ 相乘結果 $C = A \times B$。為了節省輸入輸出時間,採用亂數產生,可以參考下述程式碼,並改寫成 OpenCL 的版本進行加速。

使用 Profile 可以透過 NVIDIA Visual Profiler (GUI) 查看,遠端連線使用 ssh -X username@hostnvprof.sh, nvvp.cfg 下載

sequence.c

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#include <stdio.h>
#include <stdint.h>
// #define DEBUG
#define UINT uint32_t
#define MAXN 1024
void multiply(int N, UINT A[][MAXN], UINT B[][MAXN], UINT C[][MAXN]) {
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
UINT sum = 0; // overflow, let it go.
for (int k = 0; k < N; k++)
sum += A[i][k] * B[k][j];
C[i][j] = sum;
}
}
}
void rand_gen(UINT c, int N, UINT A[][MAXN]) {
UINT x = 2, n = N*N;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
x = (x * x + c + i + j)%n;
A[i][j] = x;
}
}
}
void print_matrix(int N, UINT A[][MAXN]) {
for (int i = 0; i < N; i++) {
fprintf(stderr, "[");
for (int j = 0; j < N; j++)
fprintf(stderr, " %u", A[i][j]);
fprintf(stderr, " ]\n");
}
}
UINT signature(int N, UINT A[][MAXN]) {
UINT h = 0;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++)
h = (h + A[i][j]) * 2654435761LU;
}
return h;
}
UINT A[MAXN][MAXN], B[MAXN][MAXN], C[MAXN][MAXN];
int main() {
int N;
uint32_t S1, S2;
scanf("%d %u %u", &N, &S1, &S2);
rand_gen(S1, N, A);
rand_gen(S2, N, B);
multiply(N, A, B, C);
#ifdef DEBUG
print_matrix(N, A);
print_matrix(N, B);
print_matrix(N, C);
#endif
printf("%u\n", signature(N, C));
return 0;
}

輸入格式

測資只有一組,包含三個整數 $N, S_1, S_2$,分別為方陣大小 $N \times N$,產生矩陣 $A$、$B$ 的亂數種子。

  • $64 \le N \le 1024$,保證 $N \mod 64 \equiv 0$
  • $0 \le S_1, \; S_2 < 2^{31}$

輸出格式

輸出一行雜湊值 $H$,可參考 sequence.c 的流程。

範例輸入

1
64 1 2

範例輸出

1
3376147904

編譯參數

1
gcc -std=c99 -O2 main.c -lm -lOpenCL -fopenmp -I/usr/include/CL

Solution

兩個 $N \times N$ 乘法計算,從網路上常見的作法通常直接針對最終答案進行切塊,大致上分成三種運行方式

  • 每一個 thread 只處理一個向量內積,因此會需要 $N^2$ 個 threads,彼此之間獨立。特別注意到在 GPU 程式中,每一個維度的索引值不可大於 65535,但是拆成兩個維度就沒有上限。
  • 每一個 thread 只處理一個向量內積,但數個 thread 會分配到同一個 block,並且合作將 global memory 搬到 on-chip 的 local/shared memory 來加快速度。特別注意到 share memory 是 on-chip 的,通常能儲存量都非常小,儘管他們能藉由 data-reused 加快存取速度,但因為 share memory 大小限制導致有一個加速上限。
  • 每一個 thread 處理一個或數個列上的所有值。

這些牽涉到 warp scheduling 和 memory coalesce 問題,有時候理論分析平行度看起來很高,但實際運作還是得看 warp size 和發生 branch 的情況。下述程式是按照第三個做法完成。

main.c

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
#include <stdio.h>
#include <assert.h>
#include <inttypes.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <CL/cl.h>
#define MAXGPU 1
#define MAXN 1024
uint32_t hostA[MAXN*MAXN], hostB[MAXN*MAXN], hostC[MAXN*MAXN];
int N = MAXN;
char clSrcFormat[1024];
char clSrc[1024] = "";
char clSrcMain[1024] = "matrixMul";
// -- start working with OpenCL
cl_context clCtx;
cl_program clPrg;
cl_kernel clKrn;
cl_command_queue clQue;
cl_mem clMemIn1, clMemIn2, clMemOut;
#define CheckFailAndExit(state) \
if (state != CL_SUCCESS) { \
printf("Error: Line %u in file %s\n\n", __LINE__, __FILE__), \
destroyGPU(clCtx, clPrg, clKrn, clQue, clMemIn1, clMemIn2, clMemOut); \
}
#define clFuncArgs cl_context *clCtx, cl_program *clPrg, cl_kernel *clKrn, \
cl_command_queue *clQue, cl_mem *clMemIn1, cl_mem *clMemIn2, \
cl_mem *clMemOut
#define clCallFunc &clCtx, &clPrg, &clKrn, &clQue, &clMemIn1, &clMemIn2, &clMemOut
void destroyGPU(clFuncArgs) {
fprintf(stderr, "Starting Cleanup ...\n\n");
if (*clMemOut) clReleaseMemObject(*clMemOut);
if (*clMemIn2) clReleaseMemObject(*clMemIn2);
if (*clMemIn1) clReleaseMemObject(*clMemIn1);
if (*clKrn) clReleaseKernel(*clKrn);
if (*clPrg) clReleaseProgram(*clPrg);
if (*clQue) clReleaseCommandQueue(*clQue);
if (*clCtx) clReleaseContext(*clCtx);
exit(0);
}
int initAllGPU(char fileName[], clFuncArgs) {
// -- generate kernel code
FILE *codefin = fopen(fileName, "r");
assert(codefin != NULL);
size_t clSrcLen = fread(clSrcFormat, 1, 1024, codefin);
sprintf(clSrc, clSrcFormat, N);
fclose(codefin);
cl_int clStat;
cl_uint clPlatN, clGPUN;
cl_platform_id clPlatID;
cl_device_id clGPUID[MAXGPU];
const char *clSrcPtr = clSrc;
// -- basic OpenCL setup
clGetPlatformIDs(1, &clPlatID, &clPlatN);
clGetDeviceIDs(clPlatID, CL_DEVICE_TYPE_GPU, MAXGPU, clGPUID, &clGPUN);
*clCtx = clCreateContext(NULL, 1, clGPUID, NULL, NULL, &clStat);
CheckFailAndExit(clStat);
*clQue = clCreateCommandQueue(*clCtx, clGPUID[0], 0, &clStat);
CheckFailAndExit(clStat);
*clPrg = clCreateProgramWithSource(*clCtx, 1, &clSrcPtr, &clSrcLen, &clStat);
CheckFailAndExit(clStat);
clStat = clBuildProgram(*clPrg, 1, clGPUID, NULL, NULL, NULL);
if (clStat != CL_SUCCESS) {
printf("Error in clBuildProgram, Line %u in file %s\n\n", __LINE__, __FILE__);
size_t log_size;
clGetProgramBuildInfo(*clPrg, clGPUID[0], CL_PROGRAM_BUILD_STATUS,
sizeof(cl_build_status), &clStat, NULL);
clGetProgramBuildInfo(*clPrg, clGPUID[0],
CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
char *program_log = (char *) calloc(log_size+1, sizeof(char));
clGetProgramBuildInfo(*clPrg, clGPUID[0],
CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL);
printf("%s", program_log);
free(program_log);
CheckFailAndExit(CL_BUILD_PROGRAM_FAILURE);
}
*clKrn = clCreateKernel(*clPrg, clSrcMain, &clStat);
CheckFailAndExit(clStat);
// -- create all buffers
cl_mem_flags clInBuffFlag = CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR;
cl_mem_flags clOutBuffFlag = CL_MEM_WRITE_ONLY;
*clMemIn1 = clCreateBuffer(*clCtx, clInBuffFlag, sizeof(uint32_t)*N*N,
hostA, &clStat);
CheckFailAndExit(clStat);
*clMemIn2 = clCreateBuffer(*clCtx, clInBuffFlag, sizeof(uint32_t)*N*N,
hostB, &clStat);
CheckFailAndExit(clStat);
*clMemOut = clCreateBuffer(*clCtx, clOutBuffFlag, sizeof(uint32_t)*N*N,
hostC, &clStat);
CheckFailAndExit(clStat);
// -- set argument to kernel
clStat = clSetKernelArg(*clKrn, 0, sizeof(cl_mem), (void *) clMemIn1);
CheckFailAndExit(clStat);
clStat = clSetKernelArg(*clKrn, 1, sizeof(cl_mem), (void *) clMemIn2);
CheckFailAndExit(clStat);
clStat = clSetKernelArg(*clKrn, 2, sizeof(cl_mem), (void *) clMemOut);
CheckFailAndExit(clStat);
return 1;
}
int min(int x, int y) {
return x < y ? x : y;
}
int executeGPU(clFuncArgs) {
cl_int clStat;
size_t globalOffset[] = {0, 0};
size_t globalSize[] = {N};
size_t localSize[] = {min(N, 64)};
clStat = clEnqueueNDRangeKernel(*clQue, *clKrn, 1, globalOffset,
globalSize, localSize, 0, NULL, NULL);
CheckFailAndExit(clStat);
clFinish(*clQue);
// -- read back
clEnqueueReadBuffer(*clQue, *clMemOut, CL_TRUE, 0, sizeof(uint32_t)*N*N,
hostC, 0, NULL, NULL);
return 1;
}
void readIn() {
uint32_t c1, c2;
assert(scanf("%d %u %u", &N, &c1, &c2) == 3);
uint32_t x = 2, n = N*N;
x = 2;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
x = (x * x + c1 + i + j)&(n-1);
hostA[i*N+j] = x;
}
}
x = 2;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
x = (x * x + c2 + i + j)&(n-1);
hostB[i*N+j] = x;
}
}
}
void writeOut() {
uint32_t h = 0;
uint32_t *Cend = hostC + N*N, *C = hostC;
for (; C != Cend; C++)
h = (h + *C) * 2654435761LU;
printf("%u\n", h);
}
void onStart() {
readIn();
initAllGPU("matrixmul.cl", clCallFunc);
executeGPU(clCallFunc);
writeOut();
destroyGPU(clCallFunc);
}
void sigHandler(int signo) {
printf("God Bless Me\n");
destroyGPU(clCallFunc);
exit(0);
}
int main(int argc, char *argv[]) {
const char sigErr[] = "I can't catch signal.\n";
if (signal(SIGTRAP, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
if (signal(SIGSEGV, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
if (signal(SIGILL, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
if (signal(SIGFPE, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
if (signal(SIGKILL, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
if (signal(SIGINT, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
onStart();
return 0;
}

matrixmul.cl

有些人也許會想問,為什麼合作搬運一個在 global memory 的陣列,要採用 for (int cr = localID; cr < N; cr += localSz) 的方式,這是因為 GPU 設計有 memory coalesce 問題,一個 warp 運作時,存取最好的是連續的,這麼一來 memory coalesce 帶進來的一坨連續的記憶體就能充份被利用,而不是每一個 thread 讀取一塊連續的記憶體片段,採用跳躍的方式在 warp scheduling 時,讀取順序看起來才比較連續。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
#define N %d
#define CTYPE unsigned int
#define UNLOOP 8
__kernel void matrixMul(__global CTYPE *in1,
__global CTYPE *in2,
__global CTYPE *out) {
CTYPE rbuf[N];
int r = get_global_id(0);
int localID = get_local_id(0);
int localSz = get_local_size(0);
__local CTYPE cbuf[N];
for (int i = 0; i < N; i++)
rbuf[i] = in1[r * N + i];
for (int c = 0; c < N; c++) {
for (int cr = localID; cr < N; cr += localSz)
cbuf[cr] = in2[cr * N + c];
barrier(CLK_LOCAL_MEM_FENCE);
CTYPE sum = 0;
for (int k = 0; k+UNLOOP-1 < N; k += UNLOOP) {
sum += rbuf[k+0] * cbuf[k+0];
sum += rbuf[k+1] * cbuf[k+1];
sum += rbuf[k+2] * cbuf[k+2];
sum += rbuf[k+3] * cbuf[k+3];
sum += rbuf[k+4] * cbuf[k+4];
sum += rbuf[k+5] * cbuf[k+5];
sum += rbuf[k+6] * cbuf[k+6];
sum += rbuf[k+7] * cbuf[k+7];
}
out[r * N + c] = sum;
}
}
Read More +

批改娘 10090. Dot Product (OpenCL)

題目描述

請用 OpenCL 改寫下段的計算:

main.c

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
#include <stdio.h>
#include <assert.h>
#include <omp.h>
#include <inttypes.h>
#include "utils.h"
#define MAXGPU 8
#define MAXCODESZ 32767
#define MAXN 16777216
static cl_uint A[MAXN], B[MAXN], C[MAXN];
int main(int argc, char *argv[]) {
omp_set_num_threads(4);
int N;
uint32_t key1, key2;
while (scanf("%d %" PRIu32 " %" PRIu32, &N, &key1, &key2) == 3) {
int chunk = N / 4;
for (int i = 0; i < N; i++) {
A[i] = encrypt(i, key1);
B[i] = encrypt(i, key2);
}
for (int i = 0; i < N; i++)
C[i] = A[i] * B[i];
uint32_t sum = 0;
for (int i = 0; i < N; i++)
sum += C[i];
printf("%" PRIu32 "\n", sum);
}
return 0;
}

utils.h

1
2
3
4
5
6
7
8
9
10
#ifndef _UTILS_H
#define _UTILS_H
#include <stdint.h>
static inline uint32_t rotate_left(uint32_t x, uint32_t n) {
return (x << n) | (x >> (32-n));
}
static inline uint32_t encrypt(uint32_t m, uint32_t key) {
return (rotate_left(m, key&31) + key)^key;
}
#endif

範例輸入

1
2
16777216 1 2
16777216 3 5

範例輸出

1
2
2885681152
2147483648

編譯參數

1
gcc -std=c99 -O2 main.c -lOpenCL -fopenmp

Solution

這一題藉由兩個亂數產生長度為 $N$ 的兩個向量,計算內積結果為何。

由於這是第一份計算 OpenCL 的應用,特別注意 Memory Leak 的問題,確定每一次執行都有正常釋放資源,可以透過 $ htop 或者 $ top 指令監控,若在 nvidia 平台下,可以使用 $ nvidia-smi 觀察程式佔有的記憶體量已經排隊情況,同時要小心繁重工作導致熱當機。

這一題原本預設要從 CPU 產生兩個向量,再傳送到 GPU 上面計算,同學一問就不小心將加密的檔案一起釋出,結果就能直接在 GPU 上產生,並且內積完使用 $O(\log N)$ 進行 work-group 內部進行加總,這大幅度地降低需要回到 CPU 計算總和的時間。

特別注意實驗環境最多允許一個 work-group 有 1024 個 work-item,從效率結果上來看,work-item 並不是越多越好,因為牽涉到 register 數量以及 memory access 的效率,這部分編譯器無法幫忙,全權交給程序員決定。而且 GPU 還有嚴重的 bank conflict,在計算一個 work-group 總和時,特殊的寫法減少 bank conflict 的發生。

main.c

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#include <stdio.h>
#include <assert.h>
#include <inttypes.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <CL/cl.h>
#include "utils.h"
#include <omp.h>
#define MAXGPU 1
#define MAXN 16777216
#define GPULOCAL 256
uint32_t hostC[MAXN/GPULOCAL];
int N;
uint32_t keyA, keyB;
char clSrcFormat[1024] = "";
char clSrc[1024] = "";
char clSrcMain[1024] = "vecdot";
// -- start working with OpenCL
cl_context clCtx;
cl_program clPrg;
cl_kernel clKrn;
cl_command_queue clQue;
cl_mem clMemOut;
#define CheckFailAndExit(status) \
if (status != CL_SUCCESS) { \
fprintf(stderr, "Error %d: Line %u in file %s\n\n", status, __LINE__, __FILE__), \
destroyGPU(clCtx, clPrg, clKrn, clQue, clMemOut); \
}
#define clFuncArgs cl_context *clCtx, cl_program *clPrg, cl_kernel *clKrn, \
cl_command_queue *clQue, cl_mem *clMemOut
#define clCallFunc &clCtx, &clPrg, &clKrn, &clQue, &clMemOut
void destroyGPU(clFuncArgs) {
fprintf(stderr, "Starting Cleanup ...\n\n");
if (*clMemOut) clReleaseMemObject(*clMemOut);
if (*clKrn) clReleaseKernel(*clKrn);
if (*clPrg) clReleaseProgram(*clPrg);
if (*clQue) clReleaseCommandQueue(*clQue);
if (*clCtx) clReleaseContext(*clCtx);
exit(0);
}
int initAllGPU(char fileName[], clFuncArgs) {
// -- generate kernel code
FILE *codefin = fopen(fileName, "r");
assert(codefin != NULL);
size_t clSrcLen = fread(clSrc, 1, 1024, codefin);
cl_int clStat;
cl_uint clPlatN, clGPUN;
cl_platform_id clPlatID;
cl_device_id clGPUID[MAXGPU];
const char *clSrcPtr = clSrc;
// -- basic OpenCL setup
clGetPlatformIDs(1, &clPlatID, &clPlatN);
clGetDeviceIDs(clPlatID, CL_DEVICE_TYPE_GPU, MAXGPU, clGPUID, NULL);
*clCtx = clCreateContext(NULL, 1, clGPUID, NULL, NULL, &clStat);
CheckFailAndExit(clStat);
*clQue = clCreateCommandQueue(*clCtx, clGPUID[0], 0, &clStat);
CheckFailAndExit(clStat);
*clPrg = clCreateProgramWithSource(*clCtx, 1, &clSrcPtr, &clSrcLen, &clStat);
CheckFailAndExit(clStat);
clStat = clBuildProgram(*clPrg, 1, clGPUID, NULL, NULL, NULL);
if (clStat != CL_SUCCESS) {
fprintf(stderr, "Error: Line %u in file %s\n\n", __LINE__, __FILE__);
size_t log_size;
clGetProgramBuildInfo(*clPrg, clGPUID[0],
CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
char *program_log = (char *) calloc(log_size+1, sizeof(char));
clGetProgramBuildInfo(*clPrg, clGPUID[0],
CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL);
printf("%s", program_log);
free(program_log);
CheckFailAndExit(CL_BUILD_PROGRAM_FAILURE);
}
*clKrn = clCreateKernel(*clPrg, clSrcMain, &clStat);
CheckFailAndExit(clStat);
// -- create all buffers
cl_mem_flags clOutBuffFlag = CL_MEM_WRITE_ONLY;
*clMemOut = clCreateBuffer(*clCtx, clOutBuffFlag, sizeof(uint32_t)*MAXN/GPULOCAL,
hostC, &clStat);
CheckFailAndExit(clStat);
return 1;
}
int executeGPU(clFuncArgs) {
uint32_t padding = 0;
while (N%GPULOCAL) {
padding += encrypt(N, keyA) * encrypt(N, keyB);
N++;
}
cl_int clStat;
size_t globalOffset[] = {0};
size_t globalSize[] = {N};
size_t localSize[] = {GPULOCAL};
// -- set argument to kernel
clStat = clSetKernelArg(*clKrn, 0, sizeof(cl_uint), (void *) &keyA);
CheckFailAndExit(clStat);
clStat = clSetKernelArg(*clKrn, 1, sizeof(cl_uint), (void *) &keyB);
CheckFailAndExit(clStat);
clStat = clSetKernelArg(*clKrn, 2, sizeof(cl_mem), (void *) clMemOut);
CheckFailAndExit(clStat);
// -- execute
clStat = clEnqueueNDRangeKernel(*clQue, *clKrn, 1, globalOffset,
globalSize, localSize, 0, NULL, NULL);
CheckFailAndExit(clStat);
// -- read back
clEnqueueReadBuffer(*clQue, *clMemOut, CL_TRUE, 0, sizeof(uint32_t)*N/GPULOCAL,
hostC, 0, NULL, NULL);
uint32_t sum = 0;
omp_set_num_threads(4);
#pragma omp parallel for reduction(+: sum)
for (int i = 0; i < N/GPULOCAL; i++)
sum += hostC[i];
printf("%u\n", sum - padding);
return 1;
}
int readIn() {
int has = 0;
if (scanf("%d %u %u", &N, &keyA, &keyB) != 3)
return 0;
return 1;
}
void onStart() {
initAllGPU("vecdot.cl", clCallFunc);
while (readIn())
executeGPU(clCallFunc);
destroyGPU(clCallFunc);
}
void sigHandler(int signo) {
printf("God Bless Me\n");
destroyGPU(clCallFunc);
exit(0);
}
int main(int argc, char *argv[]) {
const char sigErr[] = "I can't catch signal.\n";
if (signal(SIGTRAP, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
if (signal(SIGSEGV, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
if (signal(SIGILL, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
if (signal(SIGFPE, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
if (signal(SIGKILL, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
if (signal(SIGINT, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
onStart();
return 0;
}

vecdot.cl

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
#define uint32_t unsigned int
inline uint32_t rotate_left(uint32_t x, uint32_t n) {
return (x << n) | (x >> (32-n));
}
inline uint32_t encrypt(uint32_t m, uint32_t key) {
return (rotate_left(m, key&31) + key)^key;
}
__kernel void vecdot(uint32_t keyA, uint32_t keyB, __global int* C) {
__local int buf[256];
int globalId = get_global_id(0);
int groupId = get_group_id(0);
int localId = get_local_id(0);
int localSz = get_local_size(0);
buf[localId] = encrypt(globalId, keyA) * encrypt(globalId, keyB);
barrier(CLK_LOCAL_MEM_FENCE);
for (int i = localSz>>1; i; i >>= 1) {
if (localId < i)
buf[localId] += buf[localId + i];
barrier(CLK_LOCAL_MEM_FENCE);
}
if (localId == 0)
C[groupId] = buf[0];
}
Read More +

批改娘 10089. Print Platform Information (OpenCL)

Problem

使用 OpenCL 印出裝置訊息。請參考課程講義。

Sample Input

no input

Sample Output

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
1 platform found
Platform Name NVIDIA CUDA
Platform Vendor NVIDIA Corporation
Platform Version OpenCL 1.2 CUDA 7.5.23
Platform Profile FULL_PROFILE
3 Devices
0 CPU Devices
3 GPU Devices
Device name GeForce GTX 980 Ti
Global memory size 6442254336
Local memory size 49152
# of compute units 22
max # of work items in a work group 1024
Device name GeForce GTX 970
Global memory size 4294770688
Local memory size 49152
# of compute units 13
max # of work items in a work group 1024
<other ...>

備註

可參考上方的題解頁面

解法

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#include <stdio.h>
#include <assert.h>
#include <inttypes.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <CL/cl.h>
#define MAXDEV 8
#define MAXPLAT 8
#define MAXN 2048
#define MAXSTRBUF 1024
// -- start working with OpenCL
#define CheckFailAndExit(status) \
if (status != CL_SUCCESS) {\
fprintf(stderr, "Error: Line %u in file %s\n\n", __LINE__, __FILE__); \
}
#define clPrint(fmt, ...) fprintf(stdout, fmt, ##__VA_ARGS__)
int infoGPU() {
size_t logLen;
char logBuf[MAXSTRBUF];
cl_int clStat;
cl_uint clPlatN, clDevN;
cl_platform_id clPlatIDs[MAXPLAT], clPlatID;
cl_device_id clDevIDs[MAXDEV], clDevID;
clGetPlatformIDs(MAXPLAT, clPlatIDs, &clPlatN);
clPrint("%d platform found\n", clPlatN);
for (int i = 0; i < clPlatN; i++) {
clPlatID = clPlatIDs[i];
const cl_platform_info clPlatInfoQuery[] = {
CL_PLATFORM_NAME, CL_PLATFORM_VENDOR,
CL_PLATFORM_VERSION, CL_PLATFORM_PROFILE
};
const cl_platform_info clPlatDevInfoQuery[] = {
CL_DEVICE_TYPE_ALL, CL_DEVICE_TYPE_CPU,
CL_DEVICE_TYPE_GPU
};
const cl_platform_info clDevInfoQuery[] = {
CL_DEVICE_GLOBAL_MEM_SIZE, CL_DEVICE_LOCAL_MEM_SIZE,
CL_DEVICE_MAX_COMPUTE_UNITS, CL_DEVICE_MAX_WORK_GROUP_SIZE
};
const char const queryPlatFmt[][32] = {
"Platform Name %s\n", "Platform Vendor %s\n",
"Platform Version %s\n", "Platform Profile %s\n"
};
const char const queryPlatDevFmt[][32] = {
"%u Devices\n", "%u CPU Devices\n",
"%u GPU Devices\n"
};
const char const queryDevFmt[][64] = {
"Global memory size %lld\n", "Local memory size %lld\n",
"# of compute units %lld\n", "max # of work items in a work group %lld\n"
};
const int platQcnt = sizeof(clPlatInfoQuery) / sizeof(cl_platform_info);
const int platDevQcnt = sizeof(clPlatDevInfoQuery) / sizeof(cl_platform_info);
const int devQcnt = sizeof(clDevInfoQuery) / sizeof(cl_platform_info);
for (int j = 0; j < platQcnt; j++) {
clStat = clGetPlatformInfo(clPlatID, clPlatInfoQuery[j], MAXSTRBUF, logBuf, &logLen);
CheckFailAndExit(clStat);
clPrint(queryPlatFmt[j], logBuf);
}
for (int j = 0; j < platDevQcnt; j++) {
clGetDeviceIDs(clPlatID, clPlatDevInfoQuery[j], MAXDEV, clDevIDs, &clDevN);
clPrint(queryPlatDevFmt[j], clDevN);
if (clPlatDevInfoQuery[j] == CL_DEVICE_TYPE_ALL)
continue;
for (int k = 0; k < clDevN; k++) {
clDevID = clDevIDs[k];
clStat = clGetDeviceInfo(clDevID, CL_DEVICE_NAME, MAXSTRBUF, logBuf, &logLen);
CheckFailAndExit(clStat);
clPrint("Device name %s\n", logBuf);
for (int p = 0; p < devQcnt; p++) {
cl_ulong clVal;
clStat = clGetDeviceInfo(clDevID, clDevInfoQuery[p], sizeof(cl_ulong), &clVal, NULL);
CheckFailAndExit(clStat);
clPrint(queryDevFmt[p], clVal);
}
}
}
}
}
int main() {
infoGPU();
return 0;
}
Read More +

UVa 13100 - Painting the Wall

Problem

給一張 $N \times M$ 的圖,用最少筆畫勾勒出指定圖像,每一筆只能平行於兩軸,並且輸出任意一組解。

Sample Input

1
2
3
4
5
6
5 7
.*...*.
.*...*.
.*****.
.*...*.
.*...*.

Sample Output

1
2
3
4
3
vline 2 1 5
vline 6 1 5
hline 3 2 6

Solution

明顯地,要繪製的座標 $(x, y)$,可以視為一條邊 $x \Rightarrow y$,只要挑選某些點就可以覆蓋相鄰邊即可,在二分圖上找最少點集覆蓋是 P 多項式時間內可解的,並且最少點集大小為最大匹配數。但這一題要求要連續的筆畫,因此需要針對給定的圖形進行重新編號,將不連續的 x 和 y 座標重新定義。

這裡採用 Dinic’s algorithm 解決二分匹配,最後利用貪心回溯的方式可以找到一組解。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <algorithm>
#include <queue>
#include <stack>
#include <assert.h>
#include <set>
using namespace std;
const int MAXV = 40010;
const int MAXE = MAXV * 200 * 2;
const int INF = 1<<29;
typedef struct Edge {
int v, cap, flow;
Edge *next, *re;
} Edge;
class MaxFlow {
public:
Edge edge[MAXE], *adj[MAXV], *pre[MAXV], *arc[MAXV];
int e, n, level[MAXV], lvCnt[MAXV], Q[MAXV];
void Init(int x) {
n = x, e = 0;
assert(n < MAXV);
for (int i = 0; i < n; ++i) adj[i] = NULL;
}
void Addedge(int x, int y, int flow){
edge[e].v = y, edge[e].cap = flow, edge[e].next = adj[x];
edge[e].re = &edge[e+1], adj[x] = &edge[e++];
edge[e].v = x, edge[e].cap = 0, edge[e].next = adj[y];
edge[e].re = &edge[e-1], adj[y] = &edge[e++];
assert(e < MAXE);
}
void Bfs(int v){
int front = 0, rear = 0, r = 0, dis = 0;
for (int i = 0; i < n; ++i) level[i] = n, lvCnt[i] = 0;
level[v] = 0, ++lvCnt[0];
Q[rear++] = v;
while (front != rear){
if (front == r) ++dis, r = rear;
v = Q[front++];
for (Edge *i = adj[v]; i != NULL; i = i->next) {
int t = i->v;
if (level[t] == n) level[t] = dis, Q[rear++] = t, ++lvCnt[dis];
}
}
}
int Maxflow(int s, int t){
int ret = 0, i, j;
Bfs(t);
for (i = 0; i < n; ++i) pre[i] = NULL, arc[i] = adj[i];
for (i = 0; i < e; ++i) edge[i].flow = edge[i].cap;
i = s;
while (level[s] < n){
while (arc[i] && (level[i] != level[arc[i]->v]+1 || !arc[i]->flow))
arc[i] = arc[i]->next;
if (arc[i]){
j = arc[i]->v;
pre[j] = arc[i];
i = j;
if (i == t){
int update = INF;
for (Edge *p = pre[t]; p != NULL; p = pre[p->re->v])
if (update > p->flow) update = p->flow;
ret += update;
for (Edge *p = pre[t]; p != NULL; p = pre[p->re->v])
p->flow -= update, p->re->flow += update;
i = s;
}
}
else{
int depth = n-1;
for (Edge *p = adj[i]; p != NULL; p = p->next)
if (p->flow && depth > level[p->v]) depth = level[p->v];
if (--lvCnt[level[i]] == 0) return ret;
level[i] = depth+1;
++lvCnt[level[i]];
arc[i] = adj[i];
if (i != s) i = pre[i]->re->v;
}
}
return ret;
}
} g;
const int MAXN = 1024*1024;
int mx[MAXN], my[MAXN];
int X[MAXN], Y[MAXN];
int n, m, VX, VY;
int C[MAXN][3], R[MAXN][3];
void dfs(int u) {
if (X[u]) return ;
X[u] = 1;
for (Edge *p = g.adj[u]; p != NULL; p = p->next) {
if (p->v - VX >= 1 && p->v - VX <= VY) {
if (my[p->v - VX] != -1 && Y[p->v - VX] == 0) {
Y[p->v - VX] = 1;
dfs(my[p->v - VX]);
}
}
}
}
int main() {
static char s[1024][1024];
while (scanf("%d %d", &n, &m) == 2) {
for (int i = 1; i <= n; i++)
scanf("%s", s[i]+1);
static int xg[1024][1024] = {};
static int yg[1024][1024] = {};
memset(xg, 0, sizeof(xg));
memset(yg, 0, sizeof(yg));
VX = 0, VY = 0;
for (int i = 1; i <= n; i++) {
for (int j = 1; j <= m; j++) {
if (s[i][j] == '*') {
if (xg[i][j] == 0) {
xg[i][j] = ++VX;
int k = j;
while (k <= m && s[i][k] == '*')
xg[i][k] = VX, k++;
C[VX][0] = i, C[VX][1] = j, C[VX][2] = k-1;
}
if (yg[i][j] == 0) {
yg[i][j] = ++VY;
int k = i;
while (k <= n && s[k][j] == '*')
yg[k][j] = VY, k++;
R[VY][0] = j, R[VY][1] = i, R[VY][2] = k-1;
}
}
}
}
int source = 0, sink = VX+VY+1;
g.Init(VX+VY+2);
for (int i = 1; i <= VX; i++)
g.Addedge(source, i, 1);
for (int i = 1; i <= VY; i++)
g.Addedge(VX+i, sink, 1);
for (int i = 1; i <= n; i++) {
for (int j = 1; j <= m; j++) {
if (s[i][j] == '*') {
// printf("%d - %d\n", xg[i][j], yg[i][j]);
g.Addedge(xg[i][j], yg[i][j] + VX, 1);
}
}
}
int mxflow = g.Maxflow(source, sink);
memset(mx, -1, sizeof(mx));
memset(my, -1, sizeof(my));
memset(X, 0, sizeof(X));
memset(Y, 0, sizeof(Y));
int match = 0;
for (int i = 1; i <= VX; i++) {
for (Edge *p = g.adj[i]; p != NULL; p = p->next) {
int x = i, y = p->v, flow = p->flow;
if (flow == 0 && y - VX >= 1 && y - VX <= VY) {
// match x - (y - VX) in bipartite graph
int r = x, c = y - VX;
mx[r] = c;
my[c] = r;
match++;
}
}
}
assert(match == mxflow);
for (int i = 1; i <= VX; i++) {
if (mx[i] == -1)
dfs(i);
}
printf("%d\n", mxflow);
for (int i = 1; i <= VX; i++) {
if (!X[i] && mx[i] != -1) {
printf("hline %d %d %d\n", C[i][0], C[i][1], C[i][2]);
mxflow--;
}
}
for (int i = 1; i <= VY; i++) {
if (Y[i]) {
printf("vline %d %d %d\n", R[i][0], R[i][1], R[i][2]);
mxflow--;
}
}
assert(mxflow == 0);
}
return 0;
}
Read More +

UVa 13102 - Tobby Stones

Problem

$N$ 個石頭排成一列,一開始都是白色石頭,經過 $M$ 詢問,分別統計區間內的黑色、白色石頭個數。

操作分別有以下幾種:

1. 統計區間 [l, r] 的黑色、白色石頭個數。
2. 將區間 [l, r] 的石頭的排列反轉。
3. 將區間 [l, r] 的石頭的顏色反轉。
4. 將區間 [l, r] 的石頭的顏色全部改成 $c$

## Sample Input ##

1
2
3
4
5
6
7
8
9
10
10 7
0 0 9
3 0 4 0
0 0 4
1 0 9
0 5 9
2 5 9
0 3 9
100 1
0 0 50

Sample Output

1
2
3
4
5
0 10
5 0
5 0
0 7
0 51

Solution

用 Splay Tree 完成區間反轉,打個懶標記就可以完成,每個操作都可以在 $\mathcal{O}(\log N)$ 完成,說不定這一題用塊狀鏈表還比較容易處理,再加上快取效果還會快上很多。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
#include <bits/stdc++.h>
using namespace std;
const int MAXN = 1048576;
class SPLAY_TREE { // Splay Tree
public:
static const int MAXN = 1048576;
struct Node {
static Node *EMPTY;
Node *ch[2], *fa;
char rev, set, inv, val;
int size;
int sumw, sumb;
Node() {
ch[0] = ch[1] = fa = NULL;
rev = set = inv = 0;
val = sumw = sumb = 0, size = 1;
}
bool is_root() {
return fa->ch[0] != this && fa->ch[1] != this;
}
void pushdown() {
if (rev) {
if (ch[0] != EMPTY) ch[0]->rev ^= 1;
if (ch[1] != EMPTY) ch[1]->rev ^= 1;
swap(ch[0], ch[1]);
rev ^= 1;
}
if (set) {
if (ch[0] != EMPTY) ch[0]->set = set, ch[0]->inv = 0;
if (ch[1] != EMPTY) ch[1]->set = set, ch[1]->inv = 0;
if (set == 1) // white
val = 0, sumw = sumw + sumb, sumb = 0;
else
val = 1, sumb = sumw + sumb, sumw = 0;
set = 0, inv = 0;
}
if (inv) {
if (ch[0] != EMPTY) {
if (ch[0]->set)
ch[0]->set = 3 - ch[0]->set;
else
ch[0]->inv ^= 1;
}
if (ch[1] != EMPTY) {
if (ch[1]->set)
ch[1]->set = 3 - ch[1]->set;
else
ch[1]->inv ^= 1;
}
swap(sumw, sumb), val = !val;
inv = 0;
}
}
void pushup() {
if (ch[0] != EMPTY) ch[0]->pushdown();
if (ch[1] != EMPTY) ch[1]->pushdown();
sumw = sumb = 0;
if (val == 0) sumw++;
else sumb++;
sumw += ch[0]->sumw + ch[1]->sumw;
sumb += ch[0]->sumb + ch[1]->sumb;
size = 1 + ch[0]->size + ch[1]->size;
}
} _mem[MAXN];
int bufIdx;
SPLAY_TREE::Node *root;
SPLAY_TREE() {
Node::EMPTY = &_mem[0];
Node::EMPTY->fa = Node::EMPTY->ch[0] = Node::EMPTY->ch[1] = Node::EMPTY;
Node::EMPTY->size = Node::EMPTY->val = 0;
bufIdx = 1;
}
void init() {
bufIdx = 1;
}
Node* newNode() {
Node *u = &_mem[bufIdx++];
*u = Node();
u->fa = u->ch[0] = u->ch[1] = Node::EMPTY;
return u;
}
void rotate(Node *x) {
Node *y;
int d;
y = x->fa, d = y->ch[1] == x ? 1 : 0;
x->ch[d^1]->fa = y, y->ch[d] = x->ch[d^1];
x->ch[d^1] = y;
if (!y->is_root())
y->fa->ch[y->fa->ch[1] == y] = x;
x->fa = y->fa, y->fa = x;
y->pushup();
}
void deal(Node *x) {
if (!x->is_root()) deal(x->fa);
x->pushdown();
}
Node* find_rt(Node *x) {
for (; x->fa != Node::EMPTY; x = x->fa);
return x;
}
void splay(Node *x, Node *below) {
Node *y, *z;
deal(x);
while (!x->is_root() && x->fa != below) {
y = x->fa, z = y->fa;
if (!y->is_root() && y->fa != below) {
if (y->ch[0] == x ^ z->ch[0] == y)
rotate(x);
else
rotate(y);
}
rotate(x);
}
x->pushup();
if (x->fa == Node::EMPTY) root = x;
}
Node* build(int l, int r, Node *p, char s[]) {
if (l > r) return Node::EMPTY;
int mid = (l + r)/2;
Node *t = newNode();
t->val = s[mid], t->fa = p;
if (t->val == 0) t->sumw ++;
else t->sumb ++;
t->ch[0] = build(l, mid-1, t, s);
t->ch[1] = build(mid+1, r, t, s);
t->pushup();
if (p == Node::EMPTY) root = t;
return t;
}
void debug(Node *u){
if (u == Node::EMPTY) return;
u->pushdown();
debug(u->ch[0]);
printf("%d", u->val);
debug(u->ch[1]);
}
Node* kthNode(int pos) {
Node *u = root;
for (int t; u != Node::EMPTY;) {
u->pushdown();
t = u->ch[0]->size;
if (t+1 == pos) return u;
if (t >= pos)
u = u->ch[0];
else
pos -= t+1, u = u->ch[1];
}
return Node::EMPTY;
}
void insert(int pos, int val) {
Node *p, *q, *r;
p = kthNode(pos), q = kthNode(pos+1);
r = newNode();
splay(p, Node::EMPTY);
splay(q, root);
r->val = val, q->ch[0] = r, r->fa = q;
splay(r, Node::EMPTY);
}
void erase(int pos) {
Node *p, *q;
p = kthNode(pos-1), q = kthNode(pos+1);
splay(p, Node::EMPTY), splay(q, root);
q->ch[0] = Node::EMPTY;
splay(q, Node::EMPTY);
}
void reverse(int l, int r) {
Node *p, *q;
p = kthNode(l), q = kthNode(r+2);
splay(p, Node::EMPTY), splay(q, root);
q->ch[0]->rev ^= 1;
splay(q->ch[0], Node::EMPTY);
}
void inverse(int l, int r) {
Node *p, *q;
p = kthNode(l), q = kthNode(r+2);
splay(p, Node::EMPTY), splay(q, root);
q->ch[0]->inv ^= 1;
splay(q->ch[0], Node::EMPTY);
}
void reset(int l, int r, int c) {
Node *p, *q;
p = kthNode(l), q = kthNode(r+2);
splay(p, Node::EMPTY), splay(q, root);
if (c == 1) {
q->ch[0]->set = 1;
} else {
q->ch[0]->set = 2;
}
splay(q->ch[0], Node::EMPTY);
}
pair<int, int> stat(int l, int r) {
Node *p, *q;
p = kthNode(l), q = kthNode(r+2);
splay(p, Node::EMPTY), splay(q, root);
return make_pair(q->ch[0]->sumb, q->ch[0]->sumw);
}
} tree;
SPLAY_TREE::Node *SPLAY_TREE::Node::EMPTY;
int main() {
static char s[1048576] = {}; // white
int n, m, cmd, l, r, c;
while (scanf("%d %d", &n, &m) == 2) {
for (int i = 0; i <= n+1; i++)
s[i] = 0;
tree.init();
tree.build(0, n+1, SPLAY_TREE::Node::EMPTY, s);
pair<int, int> ret;
for (int i = 0; i < m; i++) {
scanf("%d", &cmd);
if (cmd == 0) {
scanf("%d %d", &l, &r);
ret = tree.stat(l+1, r+1);
printf("%d %d\n", ret.first, ret.second);
} else if (cmd == 1) {
scanf("%d %d", &l, &r);
tree.reverse(l+1, r+1);
} else if (cmd == 2) {
scanf("%d %d", &l, &r);
tree.inverse(l+1, r+1);
} else if (cmd == 3) {
scanf("%d %d %d", &l, &r, &c);
tree.reset(l+1, r+1, c);
}
}
}
return 0;
}
Read More +

UVa 13101 - Tobby on Tree

Problem

一場遊戲有 $N$ 個數,每一次挑兩個不同的數,將大的數字扣除小的數字,直到沒辦法挑出任何不同的數,亦即所有數皆相同,遊戲結果就是最後的數字值。

現在給予一棵樹,操作有兩種:

  • 詢問樹上兩點 $u$$v$ 的路徑上的數字,遊戲結果為何。
  • 改變樹上某個節點 $u$ 的值為 $c$

Sample Input

1
2
3
4
5
6
7
8
9
10
5
5 15 20 15 9
0 2
0 3
3 1
3 4
3
1 2 1
2 3 3
1 1 4

Sample Output

1
2
5
3

Solution

明顯地答案就是路徑上所有數字的最大公因數,充分利用 Link/Cut Tree 的特性,解決樹上詢問的詢問即可。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
#include <bits/stdc++.h>
using namespace std;
class LCT { // Link-Cut Tree
public:
static const int MAXN = 262144;
struct Node {
static Node *EMPTY;
Node *ch[2], *fa;
int rev;
int val, size;
int gcd;
Node() {
ch[0] = ch[1] = fa = NULL;
rev = 0;
val = 0;
gcd = 1, size = 1;
}
bool is_root() {
return fa->ch[0] != this && fa->ch[1] != this;
}
void pushdown() {
if (rev) {
ch[0]->rev ^= 1, ch[1]->rev ^= 1;
swap(ch[0], ch[1]);
rev ^= 1;
}
}
void pushup() {
if (this == EMPTY) return ;
gcd = this->val, size = 1;
if (ch[0] != EMPTY) {
gcd = __gcd(gcd, ch[0]->gcd);
size += ch[0]->size;
}
if (ch[1] != EMPTY) {
gcd = __gcd(gcd, ch[1]->gcd);
size += ch[1]->size;
}
}
} _mem[MAXN];
int bufIdx;
LCT() {
Node::EMPTY = &_mem[0];
Node::EMPTY->fa = Node::EMPTY->ch[0] = Node::EMPTY->ch[1] = Node::EMPTY;
Node::EMPTY->size = 0;
bufIdx = 1;
}
void init() {
bufIdx = 1;
}
Node* newNode() {
Node *u = &_mem[bufIdx++];
*u = Node();
u->fa = u->ch[0] = u->ch[1] = Node::EMPTY;
return u;
}
void rotate(Node *x) {
Node *y;
int d;
y = x->fa, d = y->ch[1] == x ? 1 : 0;
x->ch[d^1]->fa = y, y->ch[d] = x->ch[d^1];
x->ch[d^1] = y;
if (!y->is_root())
y->fa->ch[y->fa->ch[1] == y] = x;
x->fa = y->fa, y->fa = x;
y->pushup(), x->pushup();
}
void deal(Node *x) {
if (!x->is_root()) deal(x->fa);
x->pushdown();
}
void splay(Node *x) {
Node *y, *z;
deal(x);
while (!x->is_root()) {
y = x->fa, z = y->fa;
if (!y->is_root()) {
if (y->ch[0] == x ^ z->ch[0] == y)
rotate(x);
else
rotate(y);
}
rotate(x);
}
x->pushup();
}
Node* access(Node *u) {
Node *v = Node::EMPTY;
for (; u != Node::EMPTY; u = u->fa) {
splay(u);
u->ch[1] = v;
v = u;
v->pushup();
}
return v;
}
void mk_root(Node *u) {
access(u)->rev ^= 1, splay(u);
}
void cut(Node *x, Node *y) {
mk_root(x);
access(y), splay(y);
y->ch[0] = x->fa = Node::EMPTY;
}
Node* _cut(Node *rt, Node *x) {
Node *u, *v;
mk_root(rt);
access(x), splay(x);
for (v = x->ch[0]; v->ch[1] != Node::EMPTY; v = v->ch[1]);
x->ch[0]->fa = x->fa;
x->fa = x->ch[0] = Node::EMPTY;
return v;
}
void link(Node *x, Node *y) {
mk_root(x);
x->fa = y;
}
Node* find(Node *x) {
for (x = access(x); x->pushdown(), x->ch[0] != Node::EMPTY; x = x->ch[0]);
return x;
}
//
int gcdPath(Node *x, Node *y) {
mk_root(x);
access(y), splay(y);
return y->gcd;
}
//
void changeNode(Node *x, int c) {
mk_root(x);
x->val = c;
}
void debug(int n) {
}
} tree;
LCT::Node *LCT::Node::EMPTY;
LCT::Node *A[262144];
int W[131072];
int main() {
int n, m, cmd, x, y;
while (scanf("%d", &n) == 1) {
for (int i = 0; i < n; i++)
scanf("%d", &W[i]);
tree.init();
for (int i = 0; i < n; i++) {
A[i] = tree.newNode();
A[i]->val = W[i];
}
for (int i = 1; i < n; i++) {
scanf("%d %d", &x, &y);
tree.link(A[x], A[y]);
}
scanf("%d", &m);
for (int i = 0; i < m; i++) {
scanf("%d %d %d", &cmd, &x, &y);
if (cmd == 1) {
printf("%d\n", tree.gcdPath(A[x], A[y]));
} else {
tree.changeNode(A[x], y);
}
}
}
return 0;
}
/*
5
5 15 20 15 9
0 2
0 3
3 1
3 4
9999
1 2 1
1 1 3
2 3 3
1 1 4
1 1 3
*/
Read More +

UVa 13000 - VIP Treatment

Problem

$N$ 個工人和 $M$ 種工作,每一種工作分成兩種工作量 VIP 以及 Regular,有一些工作只能指派給特定工人完成。每名工人有各自的完成速度,請問最小化所有工作完成的最大時間為何?

Sample Input

1
2
3
4
5
6
7
8
9
10
11
12
13
14
3
3 3 10
2 4 8
2 3 1 1
2 3 1 2
2 4 1 3
2 1 4
2
2 3 1 1
3 2 1 1
2 2 4
1 2
2 3 2 1 2
3 2 2 1 2

Sample Output

1
2
3
Case 1: 48
Case 2: 18
Case 3: 6

Solution

二分時間,利用 maxflow 流滿的情況判定是否可以讓所有工人在限時內合作完成所有工作。

source - worker - job - sink,其中 source - worker 為工人能運作的最大工作量 time/W[i],worker - job 根據指派工人的配置,job - sink 為工作的 VIP 和 Regular 量。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#include <bits/stdc++.h>
using namespace std;
const int MAXV = 40010;
const int MAXE = MAXV * 200 * 2;
const long long LLINF = 1LL<<62;
typedef struct Edge {
int v;
long long cap, flow;
Edge *next, *re;
} Edge;
class MaxFlow {
public:
Edge edge[MAXE], *adj[MAXV], *pre[MAXV], *arc[MAXV];
int e, n, level[MAXV], lvCnt[MAXV], Q[MAXV];
void Init(int x) {
n = x, e = 0;
for (int i = 0; i < n; ++i) adj[i] = NULL;
}
void Addedge(int x, int y, long long flow){
edge[e].v = y, edge[e].cap = flow, edge[e].next = adj[x];
edge[e].re = &edge[e+1], adj[x] = &edge[e++];
edge[e].v = x, edge[e].cap = 0, edge[e].next = adj[y];
edge[e].re = &edge[e-1], adj[y] = &edge[e++];
}
void Bfs(int v){
int front = 0, rear = 0, r = 0, dis = 0;
for (int i = 0; i < n; ++i) level[i] = n, lvCnt[i] = 0;
level[v] = 0, ++lvCnt[0];
Q[rear++] = v;
while (front != rear){
if (front == r) ++dis, r = rear;
v = Q[front++];
for (Edge *i = adj[v]; i != NULL; i = i->next) {
int t = i->v;
if (level[t] == n) level[t] = dis, Q[rear++] = t, ++lvCnt[dis];
}
}
}
long long Maxflow(int s, int t){
long long ret = 0;
int i, j;
Bfs(t);
for (i = 0; i < n; ++i) pre[i] = NULL, arc[i] = adj[i];
for (i = 0; i < e; ++i) edge[i].flow = edge[i].cap;
i = s;
while (level[s] < n){
while (arc[i] && (level[i] != level[arc[i]->v]+1 || !arc[i]->flow))
arc[i] = arc[i]->next;
if (arc[i]){
j = arc[i]->v;
pre[j] = arc[i];
i = j;
if (i == t){
long long update = LLINF;
for (Edge *p = pre[t]; p != NULL; p = pre[p->re->v])
if (update > p->flow) update = p->flow;
ret += update;
for (Edge *p = pre[t]; p != NULL; p = pre[p->re->v])
p->flow -= update, p->re->flow += update;
i = s;
}
}
else{
int depth = n-1;
for (Edge *p = adj[i]; p != NULL; p = p->next)
if (p->flow && depth > level[p->v]) depth = level[p->v];
if (--lvCnt[level[i]] == 0) return ret;
level[i] = depth+1;
++lvCnt[level[i]];
arc[i] = adj[i];
if (i != s) i = pre[i]->re->v;
}
}
return ret;
}
} g;
int main() {
int testcase, cases = 0;
scanf("%d", &testcase);
while (testcase--) {
int M, N, K;
int W[64], V[64], R[64];
vector<int> WR[64];
int sumVIP = 0, maxW = 0;
scanf("%d %d %d", &M, &N, &K);
for (int i = 0; i < N; i++)
scanf("%d", &W[i]), maxW = max(maxW, W[i]);
for (int i = 0; i < M; i++) {
int n, x;
scanf("%d %d %d", &V[i], &R[i], &n);
WR[i].clear();
for (int j = 0; j < n; j++) {
scanf("%d", &x), x--;
WR[i].push_back(x);
}
sumVIP += V[i];
}
long long l = 0, r = (long long) maxW * (sumVIP + K), mid, ret = 0;
while (l <= r) {
mid = (l + r)/2;
long long time = mid;
int source = N + 2*M;
int sink1 = N + 2*M + 1; // VIP
int sink2 = N + 2*M + 2; // Regular
int sink = N + 2*M + 3;
g.Init(N + 2*M + 5);
for (int i = 0; i < N; i++) {
g.Addedge(source, i, time / W[i]);
// printf("e %d %d %d\n", source, i, time / W[i]);
}
g.Addedge(sink1, sink, sumVIP);
g.Addedge(sink2, sink, K);
// printf("e %d %d %d\n", sink1, sink, sumVIP);
// printf("e %d %d %d\n", sink2, sink, K);
for (int i = 0; i < M; i++) {
int u1 = N + 2*i, u2 = N + 2*i + 1;
g.Addedge(u1, sink1, V[i]);
g.Addedge(u2, sink2, R[i]);
// printf("e %d %d %d\n", u1, sink1, V[i]);
// printf("e %d %d %d\n", u2, sink2, R[i]);
for (int j = 0; j < WR[i].size(); j++) {
int v = WR[i][j];
// printf("e %d %d %d\n", v, u1, INF);
// printf("e %d %d %d\n", v, u2, INF);
g.Addedge(v, u1, LLINF);
g.Addedge(v, u2, LLINF);
}
}
long long flow = g.Maxflow(source, sink);
// printf("time %d: %d\n", time, flow);
if (flow == sumVIP + K)
r = mid - 1, ret = time;
else
l = mid + 1;
}
printf("Case %d: %lld\n", ++cases, ret);
}
return 0;
}
Read More +

UVa 13070 - Palm trees in the snow

Problem

給一排棕梠樹的樹高 $H_i$,經過大雪後,高度 $H_i \ge W$ 的樹都會攤倒,挑選一個區間滿足最多五棵屹立不搖的情況下,請問區間長度最長為何?

Sample Input

1
2
3
4
5
6
7
8
9
10
3
30
10
10 30 50 20 40 60 30 40 50 36
40
10
10 30 50 20 40 20 10 10 20 36
20
3
40 10 15

Sample Output

1
2
3
7
10
3

Solution

將會倒的位置記錄下來,線性掃描過去即可。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#include <bits/stdc++.h>
using namespace std;
int main() {
int testcase;
scanf("%d", &testcase);
while (testcase--) {
int W, N, x;
scanf("%d %d", &W, &N);
vector<int> A;
A.push_back(-1);
for (int i = 0; i < N; i++) {
scanf("%d", &x);
if (x >= W)
A.push_back(i);
}
A.push_back(N);
int ret = 0;
for (int i = 1; i < A.size()-1; i++) {
int x = A[min(i+5, (int)A.size()-1)];
ret = max(ret, x - A[i-1]-1);
}
if (A.size() == 2) ret = N;
printf("%d\n", ret);
}
return 0;
}
Read More +

UVa 13036 - Birthday Gift to SJ - 2

Problem

問在區間 $[a, b]$ 之中最大的 interesting number,所謂的 interesting number 就是分解成數個費波納契數的乘積。

Sample Input

1
2
3
4
3
1 7
1 10
1 1000000000000000000

Sample Output

1
2
3
6
10
1000000000000000000

Solution

由於費波納契數成長速度非常快,用不著擔心會有好幾種不同數構成,因此可以採用中間相遇法,將前幾個小的費波納契數任意組合,後半則是剩餘的組合。至於要從哪裡開始拆團,必須手動測一下。

而這一題詢問次數非常多,儘管建好表,$\mathcal{O}(N \log N)$ 的搜索相當慢,甚至會 TLE,最好使用一次二分搜尋,或者利用單調性質降到 $\mathcal{O}(N)$ 以下最為保險。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#include <bits/stdc++.h>
using namespace std;
// Algorithm: Meet in Middle
const int MAXN = 85;
const long long MAXV = 1e+18;
long long F[100] = {2, 3};
vector<long long> S1, S2;
void bfs(int begin, int end, vector<long long> &S) {
S.push_back(1);
for (int i = begin; i < end; i++) {
long long x = F[i];
vector<long long> next(S);
for (auto e : S) {
while (MAXV / e / x) {
e *= x;
next.push_back(e);
}
}
S = next;
}
sort(S.begin(), S.end());
S.resize(unique(S.begin(), S.end()) - S.begin());
}
int main() {
for (int i = 2; i < MAXN; i++) {
F[i] = F[i-1] + F[i-2];
assert(F[i] / MAXV == 0);
}
int split = 9;
bfs(0, split, S1);
bfs(split, MAXN, S2);
int testcase;
scanf("%d", &testcase);
while (testcase--) {
long long a, b;
long long ret = -1;
scanf("%lld %lld", &a, &b);
int idx1 = 0, idx2 = S2.size()-1;
for (; idx1 < S1.size(); idx1++) {
if (S1[idx1] > b) break;
long long e = S1[idx1];
while (idx2 > 0 && b / S2[idx2] / e == 0)
idx2--;
long long t = S2[idx2] * e;
if (t >= a)
ret = max(ret, t);
}
vector<long long>::iterator it = lower_bound(S1.begin(), S1.end(), b);
if (it == S1.end() || it != S1.begin() && *it > b) it--;
if (it != S1.end() && b / *it) {
long long t = *it;
if (t >= a)
ret = max(ret, t);
}
it = lower_bound(S2.begin(), S2.end(), b);
if (it == S2.end() || it != S2.begin() && *it > b) it--;
if (it != S2.end() && b / *it) {
long long t = *it;
if (t >= a)
ret = max(ret, t);
}
printf("%lld\n", ret);
}
return 0;
}
/*
1
5231711 13073137
*/
Read More +

UVa 13057 - Prove Them All

Problem

這家公司有自動推論引擎,請問至少要證明幾個基礎定理後,剩餘的都可以藉由推論引擎得到?

Sample Input

1
2
3
4
5
6
1
4 4
1 2
1 3
4 2
4 3

Sample Output

1
Case 1: 2

Solution

把這張圖的所有 SCC 元件全部縮成一點,變成 DAG 後,計算有幾個節點的 indegree 等於零,其零的個數就是答案。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#include <bits/stdc++.h>
using namespace std;
// similar as UVa 11504 - Dominos
// algorithm: SCC
const int MAXN = 32767;
class SCC {
public:
int n;
vector<int> g[MAXN], dag[MAXN];
// <SCC begin>
int vfind[MAXN], findIdx;
int stk[MAXN], stkIdx;
int in_stk[MAXN], visited[MAXN];
int contract[MAXN];
int scc_cnt;
// <SCC end>
int scc(int u) {
in_stk[u] = visited[u] = 1;
stk[++stkIdx] = u, vfind[u] = ++findIdx;
int mn = vfind[u], v;
for(int i = 0; i < g[u].size(); i++) {
v = g[u][i];
if(!visited[v])
mn = min(mn, scc(v));
if(in_stk[v])
mn = min(mn, vfind[v]);
}
if(mn == vfind[u]) {
do {
in_stk[stk[stkIdx]] = 0;
contract[stk[stkIdx]] = scc_cnt;
} while(stk[stkIdx--] != u);
scc_cnt++;
}
return mn;
}
void addEdge(int u, int v) { // u -> v
g[u].push_back(v);
}
void solve() {
for (int i = 0; i < n; i++)
visited[i] = in_stk[i] = 0;
scc_cnt = 0;
for (int i = 0; i < n; i++) {
if (visited[i]) continue;
stkIdx = findIdx = 0;
scc(i);
}
}
void make_DAG() {
int x, y;
for (int i = 0; i < n; i++) {
x = contract[i];
for (int j = 0; j < g[i].size(); j++) {
y = contract[g[i][j]];
if (x != y)
dag[x].push_back(y);
}
}
for (int i = 0; i < scc_cnt; i++) {
sort(dag[i].begin(), dag[i].end());
dag[i].resize(unique(dag[i].begin(), dag[i].end()) - dag[i].begin());
}
}
void init(int n) {
this->n = n;
for (int i = 0; i < n; i++)
g[i].clear(), dag[i].clear();
}
} g;
int main() {
int testcase, cases = 0;
scanf("%d", &testcase);
while (testcase--) {
int n, m, u, v;
scanf("%d %d", &n, &m);
g.init(n);
for (int i = 0; i < m; i++) {
scanf("%d %d", &u, &v);
u--, v--;
g.addEdge(u, v);
}
g.solve();
g.make_DAG();
int indeg[MAXN] = {}, ret = 0;
for (int i = 0; i < g.scc_cnt; i++) {
for (auto &e : g.dag[i]) {
indeg[e]++;
}
}
for (int i = 0; i < g.scc_cnt; i++)
ret += indeg[i] == 0;
printf("Case %d: %d\n", ++cases, ret);
}
return 0;
}
Read More +