#include <stdio.h>
#include <assert.h>
#include <inttypes.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <CL/cl.h>
#include <omp.h>
#define MAXGPU 3
#define MAXN 1024
uint32_t hostMtx[MAXGPU][6][MAXN*MAXN];
uint32_t hostMid[MAXGPU][2][MAXN*MAXN];
char clSrcFormat[32767] = "";
char clSrc[32767] = "";
const int clNeedDevCnt = 3;
cl_context clCtx[MAXGPU];
cl_program clPrg[MAXGPU];
cl_kernel clKrnAdd[MAXGPU], clKrnMul[MAXGPU];
cl_command_queue clQue[MAXGPU];
cl_mem clMtx[MAXGPU][6], clMtxTmp[MAXGPU][6];
#define CheckFailAndExit(status) \
if (status != CL_SUCCESS) { \
fprintf(stderr, "Error %d: Line %u in file %s\n\n", status, __LINE__, __FILE__), \
destroyGPU(clCtx, clPrg, clKrnAdd, clKrnMul, clQue, clMtx, clMtxTmp); \
}
#define clFuncArgs cl_context clCtx[], cl_program clPrg[], cl_kernel clKrnAdd[], \
cl_kernel clKrnMul[], cl_command_queue clQue[], cl_mem clMtx[][6], cl_mem clMtxTmp[][6]
#define clCallFunc clCtx, clPrg, clKrnAdd, clKrnMul, clQue, clMtx, clMtxTmp
#define clCallFuncOuter clCtx, clPrg, clKrnAdd, clKrnMul, clQue, clMtx, clMtxTmp
uint32_t writeOut(uint32_t *hostC, int N) {
uint32_t h = 0;
uint32_t *Cend = hostC + N*N, *C = hostC;
for (; C != Cend; C++)
h = (h + *C) * 2654435761LU;
return h;
}
void destroyGPU(clFuncArgs) {
fprintf(stderr, "Starting Cleanup ...\n\n");
for (int i = 0; i < clNeedDevCnt; i++) {
for (int j = 0; j < 6; j++) {
if (clMtx[i][j])
clReleaseMemObject(clMtx[i][j]);
if (clMtxTmp[i][j])
clReleaseMemObject(clMtxTmp[i][j]);
}
if (clKrnAdd[i])
clReleaseKernel(clKrnAdd[i]);
if (clKrnMul[i])
clReleaseKernel(clKrnMul[i]);
if (clPrg[i])
clReleaseProgram(clPrg[i]);
if (clQue[i])
clReleaseCommandQueue(clQue[i]);
if (clCtx[i])
clReleaseContext(clCtx[i]);
}
exit(0);
}
int initAllGPU(char fileName[], clFuncArgs) {
FILE *codefin = fopen(fileName, "r");
assert(codefin != NULL);
assert(fread(clSrcFormat, 1, 32767, codefin) < 32767);
sprintf(clSrc, clSrcFormat);
size_t clSrcLen = strlen(clSrc);
fclose(codefin);
cl_int clStat;
cl_uint clPlatN, clGPUN, clDevN;
cl_platform_id clPlatID;
cl_device_id clGPUID[MAXGPU];
const char *clSrcPtr = clSrc;
clGetPlatformIDs(1, &clPlatID, &clPlatN);
clGetDeviceIDs(clPlatID, CL_DEVICE_TYPE_GPU, MAXGPU, clGPUID, &clDevN);
assert(clDevN >= clNeedDevCnt);
for (int i = 0; i < clNeedDevCnt; i++) {
clCtx[i] = clCreateContext(NULL, 1, clGPUID+i, NULL, NULL, &clStat);
CheckFailAndExit(clStat);
clQue[i] = clCreateCommandQueue(clCtx[i], clGPUID[i], 0, &clStat);
CheckFailAndExit(clStat);
clPrg[i] = clCreateProgramWithSource(clCtx[i], 1, &clSrcPtr, &clSrcLen, &clStat);
CheckFailAndExit(clStat);
clStat = clBuildProgram(clPrg[i], 1, clGPUID+i, NULL, NULL, NULL);
if (clStat != CL_SUCCESS) {
fprintf(stderr, "Error: Line %u in file %s\n\n", __LINE__, __FILE__);
size_t log_size;
clGetProgramBuildInfo(*clPrg, clGPUID[0],
CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
char *program_log = (char *) calloc(log_size+1, sizeof(char));
clGetProgramBuildInfo(*clPrg, clGPUID[0],
CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL);
printf("%s", program_log);
free(program_log);
CheckFailAndExit(CL_BUILD_PROGRAM_FAILURE);
}
clKrnAdd[i] = clCreateKernel(clPrg[i], "matrixAdd", &clStat);
CheckFailAndExit(clStat);
clKrnMul[i] = clCreateKernel(clPrg[i], "matrixMul", &clStat);
CheckFailAndExit(clStat);
for (int j = 0; j < 6; j++) {
clMtx[i][j] = clCreateBuffer(clCtx[i], CL_MEM_READ_WRITE,
sizeof(uint32_t)*MAXN*MAXN, NULL, &clStat);
CheckFailAndExit(clStat);
clMtxTmp[i][j] = clCreateBuffer(clCtx[i], CL_MEM_READ_WRITE,
sizeof(uint32_t)*MAXN*MAXN, NULL, &clStat);
CheckFailAndExit(clStat);
}
}
return 1;
}
void matrix_mul(int N, int devIdx, cl_mem *LIN, cl_mem *RIN, cl_mem *OUT, clFuncArgs) {
cl_int clStat;
size_t globalOffset[] = {0};
size_t globalSize[] = {N*N};
size_t localSize[] = {0};
for (int i = 1; i <= N; i++) {
if (N%i == 0 && i*N <= 32768/2)
localSize[0] = i;
}
clStat = clSetKernelArg(clKrnMul[devIdx], 0, sizeof(cl_mem), LIN);
CheckFailAndExit(clStat);
clStat = clSetKernelArg(clKrnMul[devIdx], 1, sizeof(cl_mem), RIN);
CheckFailAndExit(clStat);
clStat = clSetKernelArg(clKrnMul[devIdx], 2, sizeof(cl_mem), OUT);
CheckFailAndExit(clStat);
clStat = clSetKernelArg(clKrnMul[devIdx], 3, sizeof(cl_int), &N);
CheckFailAndExit(clStat);
clStat = clEnqueueNDRangeKernel(clQue[devIdx], clKrnMul[devIdx], 1, globalOffset,
globalSize, NULL, 0, NULL, NULL);
CheckFailAndExit(clStat);
}
void matrix_add(int N, int devIdx, cl_mem *LIN, cl_mem *RIN, cl_mem *OUT, clFuncArgs) {
cl_int clStat;
size_t globalOffset[] = {0};
size_t globalSize[] = {N*N};
clStat = clSetKernelArg(clKrnAdd[devIdx], 0, sizeof(cl_mem), LIN);
CheckFailAndExit(clStat);
clStat = clSetKernelArg(clKrnAdd[devIdx], 1, sizeof(cl_mem), RIN);
CheckFailAndExit(clStat);
clStat = clSetKernelArg(clKrnAdd[devIdx], 2, sizeof(cl_mem), OUT);
CheckFailAndExit(clStat);
clStat = clEnqueueNDRangeKernel(clQue[devIdx], clKrnAdd[devIdx], 1, globalOffset,
globalSize, NULL, 0, NULL, NULL);
CheckFailAndExit(clStat);
}
int solver(int N, int devId, uint32_t ret[], clFuncArgs) {
uint32_t memSz = N*N*sizeof(uint32_t);
cl_int clStat;
for (int i = 0; i < 6; i++) {
clStat = clEnqueueWriteBuffer(clQue[devId],
clMtx[devId][i], 0, 0, memSz,
hostMtx[devId][i], 0, NULL, NULL);
CheckFailAndExit(clStat);
}
matrix_mul(N, devId, &clMtx[devId][0], &clMtx[devId][1], &clMtxTmp[devId][0], clCallFunc);
matrix_mul(N, devId, &clMtx[devId][2], &clMtx[devId][3], &clMtxTmp[devId][1], clCallFunc);
matrix_mul(N, devId, &clMtxTmp[devId][0], &clMtx[devId][4], &clMtxTmp[devId][2], clCallFunc);
matrix_mul(N, devId, &clMtxTmp[devId][1], &clMtx[devId][5], &clMtxTmp[devId][3], clCallFunc);
matrix_add(N, devId, &clMtxTmp[devId][0], &clMtxTmp[devId][1], &clMtxTmp[devId][4], clCallFunc);
matrix_add(N, devId, &clMtxTmp[devId][2], &clMtxTmp[devId][3], &clMtxTmp[devId][5], clCallFunc);
clStat = clEnqueueReadBuffer(clQue[devId], clMtxTmp[devId][4], CL_TRUE, 0,
sizeof(uint32_t)*N*N, hostMid[devId][0], 0, NULL, NULL);
CheckFailAndExit(clStat);
clStat = clEnqueueReadBuffer(clQue[devId], clMtxTmp[devId][5], CL_TRUE, 0,
sizeof(uint32_t)*N*N, hostMid[devId][1], 0, NULL, NULL);
CheckFailAndExit(clStat);
for (int i = 0; i < 2; i++)
#pragma omp task
{
ret[i] = writeOut(hostMid[devId][i], N);
}
#pragma omp taskwait
return 1;
}
int readIn(uint32_t S[], int *n, int devId) {
int N, M;
if (scanf("%d", &N) != 1)
return 0;
M = 6;
for (int i = 0; i < M; i++)
assert(scanf("%d", &S[i]) == 1);
for (int p = 0; p < M; p++)
#pragma omp task
{
uint32_t x = 2, n = N*N, c = S[p];
x = 2;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
x = (x * x + c + i + j)%n;
hostMtx[devId][p][i*N+j] = x;
}
}
}
#pragma omp taskwait
*n = N;
return 1;
}
void onStart(clFuncArgs) {
initAllGPU("matrix-lib.cl", clCallFunc);
int inN = 0;
static uint32_t ansQue[32767][2];
#pragma omp parallel sections
{
#pragma omp section
{
while (1) {
int f = 0, N, pid = 0;
uint32_t S[32];
#pragma omp critical
{
f = readIn(S, &N, 0);
pid = inN;
inN += f;
}
if (f == 0)
break;
solver(N, 0, ansQue[pid], clCallFunc);
}
}
#pragma omp section
{
while (1) {
int f = 0, N, pid = 0;
uint32_t S[32];
#pragma omp critical
{
f = readIn(S, &N, 1);
pid = inN;
inN += f;
}
if (f == 0)
break;
solver(N, 1, ansQue[pid], clCallFunc);
}
}
#pragma omp section
{
while (1) {
int f = 0, N, pid = 0;
uint32_t S[32];
#pragma omp critical
{
f = readIn(S, &N, 2);
pid = inN;
inN += f;
}
if (f == 0)
break;
solver(N, 2, ansQue[pid], clCallFunc);
}
}
}
for (int i = 0; i < inN; i++)
printf("%u\n%u\n", ansQue[i][0], ansQue[i][1]);
destroyGPU(clCallFunc);
}
void sigHandler(int signo) {
printf("God Bless Me\n");
destroyGPU(clCallFuncOuter);
exit(0);
}
int main(int argc, char *argv[]) {
const char sigErr[] = "I can't catch signal.\n";
if (signal(SIGTRAP, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
if (signal(SIGSEGV, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
if (signal(SIGILL, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
if (signal(SIGFPE, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
if (signal(SIGINT, sigHandler) == SIG_ERR)
fprintf(stderr, sigErr);
onStart(clCallFuncOuter);
return 0;
}