#include <stdio.h>
#include <assert.h>
#include <inttypes.h>
#include <string.h>
#include <cuda.h>
#define MAXN 1024
#define GPULOCAL 64
#define UNLOOP 8
#define CheckErr(status) { gpuAssert((status), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, int abort=true) {
if (code != cudaSuccess) {
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
uint32_t hostMtx[6][MAXN*MAXN];
uint32_t hostMid[2][MAXN*MAXN];
int N = MAXN, M;
__global__ void matrixMul(uint32_t A[], uint32_t B[], uint32_t C[], int N) {
int r = blockIdx.x * blockDim.x + threadIdx.x;
int x = r / N, y = r % N;
uint32_t sum = 0;
for (int i = 0; i < N; i++)
sum += A[x*N + i] * B[i*N + y];
C[x * N + y] = sum;
}
__global__ void matrixAdd(uint32_t A[], uint32_t B[], uint32_t C[]) {
int r = blockIdx.x * blockDim.x + threadIdx.x;
C[r] = A[r] + B[r];
}
void readIn() {
uint32_t S[64];
assert(scanf("%d", &N) == 1);
M = 6;
for (int i = 0; i < M; i++)
assert(scanf("%d", &S[i]) == 1);
#pragma omp parallel for
for (int p = 0; p < M; p++) {
uint32_t x = 2, n = N*N, c = S[p];
x = 2;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
x = (x * x + c + i + j)%n;
hostMtx[p][i*N+j] = x;
}
}
}
}
uint32_t writeOut(uint32_t *hostC) {
uint32_t h = 0;
uint32_t *Cend = hostC + N*N, *C = hostC;
for (; C != Cend; C++)
h = (h + *C) * 2654435761LU;
return h;
}
void matrix_multiply(uint32_t *cuMtxA, uint32_t *cuMtxB, uint32_t *cuMtxC) {
int localSz = 1;
for (int i = 1; i <= 1024; i++) {
if (N*N % i == 0)
localSz = i;
}
dim3 cuBlock(localSz);
dim3 cuGrid(N*N/localSz);
matrixMul<<<cuGrid, cuBlock>>>(cuMtxA, cuMtxB, cuMtxC, N);
CheckErr(cudaGetLastError());
}
void matrix_add(uint32_t *cuMtxA, uint32_t *cuMtxB, uint32_t *cuMtxC) {
int localSz = 1;
for (int i = 1; i <= 1024; i++) {
if (N*N % i == 0)
localSz = i;
}
dim3 cuBlock(localSz);
dim3 cuGrid(N*N/localSz);
matrixAdd<<<cuGrid, cuBlock>>>(cuMtxA, cuMtxB, cuMtxC);
CheckErr(cudaGetLastError());
}
int main(int argc, char *argv[]) {
readIn();
uint32_t *cuMtx[6], *cuMtxTmp[6];
uint32_t memSz = N*N*sizeof(uint32_t);
for (int i = 0; i < 6; i++) {
cudaMalloc((void **) &cuMtx[i], memSz);
cudaMemcpy(cuMtx[i], hostMtx[i], memSz, cudaMemcpyHostToDevice);
CheckErr(cudaGetLastError());
}
for (int i = 0; i < 6; i++)
cudaMalloc((void **) &cuMtxTmp[i], memSz);
matrix_multiply(cuMtx[0], cuMtx[1], cuMtxTmp[0]);
matrix_multiply(cuMtx[2], cuMtx[3], cuMtxTmp[1]);
matrix_multiply(cuMtxTmp[0], cuMtx[4], cuMtxTmp[2]);
matrix_multiply(cuMtxTmp[1], cuMtx[5], cuMtxTmp[3]);
matrix_add(cuMtxTmp[0], cuMtxTmp[1], cuMtxTmp[4]);
matrix_add(cuMtxTmp[2], cuMtxTmp[3], cuMtxTmp[5]);
cudaMemcpy(hostMid[0], cuMtxTmp[4], memSz, cudaMemcpyDeviceToHost);
cudaMemcpy(hostMid[1], cuMtxTmp[5], memSz, cudaMemcpyDeviceToHost);
uint32_t ret[2];
#pragma omp parallel for
for (int i = 0; i < 2; i++) {
ret[i] = writeOut(hostMid[i]);
}
for (int i = 0; i < 2; i++)
printf("%u\n", ret[i]);
for (int i = 0; i < 6; i++)
cudaFree(cuMtx[i]);
for (int i = 0; i < 6; i++)
cudaFree(cuMtxTmp[i]);
return 0;
}