#include <stdio.h>
#include <assert.h>
#include <inttypes.h>
#include <string.h>
#include <cuda.h>
#define MAXN 1024
#define GPULOCAL 64
#define UNLOOP 8
#define CheckErr(status) { gpuAssert((status), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, int abort=true) {
if (code != cudaSuccess) {
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
uint32_t hostA[MAXN*MAXN], hostB[MAXN*MAXN], hostC[MAXN*MAXN];
int N = MAXN;
__global__ void matrixMul(uint32_t A[], uint32_t B[], uint32_t C[], int N) {
__shared__ uint32_t cbuf[MAXN+1];
uint32_t rbuf[MAXN];
int r = blockIdx.x * blockDim.x + threadIdx.x;
int localId = threadIdx.x;
int localSz = blockDim.x;
for (int i = 0; i < N; i++)
rbuf[i] = A[r * N + i];
for (int c = 0; c < N; c++) {
for (int cr = localId; cr < N; cr += localSz)
cbuf[cr] = B[cr * N + c];
__syncthreads();
uint32_t sum = 0;
for (int k = 0; k+UNLOOP-1 < N; k += UNLOOP) {
sum += rbuf[k+0] * cbuf[k+0];
sum += rbuf[k+1] * cbuf[k+1];
sum += rbuf[k+2] * cbuf[k+2];
sum += rbuf[k+3] * cbuf[k+3];
sum += rbuf[k+4] * cbuf[k+4];
sum += rbuf[k+5] * cbuf[k+5];
sum += rbuf[k+6] * cbuf[k+6];
sum += rbuf[k+7] * cbuf[k+7];
}
C[r * N + c] = sum;
}
}
void readIn() {
uint32_t c1, c2;
assert(scanf("%d %u %u", &N, &c1, &c2) == 3);
uint32_t x = 2, n = N*N;
x = 2;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
x = (x * x + c1 + i + j)&(n-1);
hostA[i*N+j] = x;
}
}
x = 2;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
x = (x * x + c2 + i + j)&(n-1);
hostB[i*N+j] = x;
}
}
}
void writeOut() {
uint32_t h = 0;
uint32_t *Cend = hostC + N*N, *C = hostC;
for (; C != Cend; C++)
h = (h + *C) * 2654435761LU;
printf("%u\n", h);
}
int main(int argc, char *argv[]) {
readIn();
uint32_t *cuMtxC, *cuMtxA, *cuMtxB;
cudaMalloc((void **) &cuMtxC, N*N*sizeof(uint32_t));
cudaMalloc((void **) &cuMtxA, N*N*sizeof(uint32_t));
cudaMalloc((void **) &cuMtxB, N*N*sizeof(uint32_t));
cudaMemcpy(cuMtxA, hostA, sizeof(uint32_t)*N*N, cudaMemcpyHostToDevice);
cudaMemcpy(cuMtxB, hostB, sizeof(uint32_t)*N*N, cudaMemcpyHostToDevice);
CheckErr(cudaGetLastError());
dim3 cuBlock(GPULOCAL);
dim3 cuGrid(N/GPULOCAL);
matrixMul<<<cuGrid, cuBlock>>>(cuMtxA, cuMtxB, cuMtxC, N);
CheckErr(cudaGetLastError());
cudaMemcpy(hostC, cuMtxC, sizeof(uint32_t)*N*N, cudaMemcpyDeviceToHost);
CheckErr(cudaGetLastError());
writeOut();
cudaFree(cuMtxC);
return 0;
}