#include <stdio.h>
#include <assert.h>
#include <inttypes.h>
#include <string.h>
#include <cuda.h>
#define MAXGPU 3
#define MAXN 1024
#define GPULOCAL 64
#define UNLOOP 8
#define CheckErr(status) { gpuAssert((status), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, int abort=true) {
if (code != cudaSuccess) {
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
uint32_t hostMtx[MAXGPU][6][MAXN*MAXN];
uint32_t hostMid[MAXGPU][2][MAXN*MAXN];
__global__ void matrixMul(uint32_t A[], uint32_t B[], uint32_t C[], int N) {
int r = blockIdx.x * blockDim.x + threadIdx.x;
int x = r / N, y = r % N;
uint32_t sum = 0;
for (int i = 0; i < N; i++)
sum += A[x*N + i] * B[i*N + y];
C[x * N + y] = sum;
}
__global__ void matrixAdd(uint32_t A[], uint32_t B[], uint32_t C[]) {
int r = blockIdx.x * blockDim.x + threadIdx.x;
C[r] = A[r] + B[r];
}
int readIn(uint32_t S[], int *n, int devId) {
int N, M;
if (scanf("%d", &N) != 1)
return 0;
M = 6;
for (int i = 0; i < M; i++)
assert(scanf("%d", &S[i]) == 1);
for (int p = 0; p < M; p++)
#pragma omp task
{
uint32_t x = 2, n = N*N, c = S[p];
x = 2;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
x = (x * x + c + i + j)%n;
hostMtx[devId][p][i*N+j] = x;
}
}
}
#pragma omp taskwait
*n = N;
return 1;
}
uint32_t writeOut(uint32_t *hostC, int N) {
uint32_t h = 0;
uint32_t *Cend = hostC + N*N, *C = hostC;
for (; C != Cend; C++)
h = (h + *C) * 2654435761LU;
return h;
}
void matrix_mul(int N, uint32_t *cuMtxA, uint32_t *cuMtxB, uint32_t *cuMtxC) {
int localSz = 1;
for (int i = 1; i <= 1024; i++) {
if (N*N % i == 0)
localSz = i;
}
dim3 cuBlock(localSz);
dim3 cuGrid(N*N/localSz);
matrixMul<<<cuGrid, cuBlock>>>(cuMtxA, cuMtxB, cuMtxC, N);
CheckErr(cudaGetLastError());
}
void matrix_add(int N, uint32_t *cuMtxA, uint32_t *cuMtxB, uint32_t *cuMtxC) {
int localSz = 1;
for (int i = 1; i <= 1024; i++) {
if (N*N % i == 0)
localSz = i;
}
dim3 cuBlock(localSz);
dim3 cuGrid(N*N/localSz);
matrixAdd<<<cuGrid, cuBlock>>>(cuMtxA, cuMtxB, cuMtxC);
CheckErr(cudaGetLastError());
}
void solver(int devId, int N, uint32_t *cuMtx[], uint32_t *cuMtxTmp[], uint32_t ret[]) {
uint32_t memSz = N*N*sizeof(uint32_t);
for (int i = 0; i < 6; i++) {
cudaMemcpy(cuMtx[i], hostMtx[devId][i], memSz, cudaMemcpyHostToDevice);
CheckErr(cudaGetLastError());
}
matrix_mul(N, cuMtx[0], cuMtx[1], cuMtxTmp[0]);
matrix_mul(N, cuMtx[2], cuMtx[3], cuMtxTmp[1]);
matrix_mul(N, cuMtxTmp[0], cuMtx[4], cuMtxTmp[2]);
matrix_mul(N, cuMtxTmp[1], cuMtx[5], cuMtxTmp[3]);
matrix_add(N, cuMtxTmp[0], cuMtxTmp[1], cuMtxTmp[4]);
matrix_add(N, cuMtxTmp[2], cuMtxTmp[3], cuMtxTmp[5]);
cudaMemcpy(hostMid[devId][0], cuMtxTmp[4], memSz, cudaMemcpyDeviceToHost);
cudaMemcpy(hostMid[devId][1], cuMtxTmp[5], memSz, cudaMemcpyDeviceToHost);
for (int i = 0; i < 2; i++)
#pragma omp task
{
ret[i] = writeOut(hostMid[devId][i], N);
}
#pragma omp taskwait
}
int main(int argc, char *argv[]) {
uint32_t *cuMtx[MAXGPU][6], *cuMtxTmp[MAXGPU][6];
uint32_t memSz = MAXN*MAXN*sizeof(uint32_t);
for (int p = 0; p < MAXGPU; p++) {
cudaSetDevice(p);
for (int i = 0; i < 6; i++) {
cudaMalloc((void **) &cuMtx[p][i], memSz);
CheckErr(cudaGetLastError());
}
for (int i = 0; i < 6; i++)
cudaMalloc((void **) &cuMtxTmp[p][i], memSz);
}
int inN = 0;
static uint32_t ansQue[32767][2];
#pragma omp parallel sections
{
#pragma omp section
{
cudaSetDevice(0);
while (1) {
int f = 0, N, pid = 0;
uint32_t S[32];
#pragma omp critical
{
f = readIn(S, &N, 0);
pid = inN;
inN += f;
}
if (f == 0)
break;
solver(0, N, cuMtx[0], cuMtxTmp[0], ansQue[pid]);
}
}
#pragma omp section
{
cudaSetDevice(1);
while (1) {
int f = 0, N, pid = 0;
uint32_t S[32];
#pragma omp critical
{
f = readIn(S, &N, 1);
pid = inN;
inN += f;
}
if (f == 0)
break;
solver(1, N, cuMtx[1], cuMtxTmp[1], ansQue[pid]);
}
}
#pragma omp section
{
cudaSetDevice(2);
while (1) {
int f = 0, N, pid = 0;
uint32_t S[32];
#pragma omp critical
{
f = readIn(S, &N, 2);
pid = inN;
inN += f;
}
if (f == 0)
break;
solver(2, N, cuMtx[2], cuMtxTmp[2], ansQue[pid]);
}
}
}
for (int i = 0; i < inN; i++)
printf("%u\n%u\n", ansQue[i][0], ansQue[i][1]);
for (int i = 0; i < 6; i++)
cudaFree(cuMtx[i]);
for (int i = 0; i < 6; i++)
cudaFree(cuMtxTmp[i]);
return 0;
}