#include <stdio.h>
#include <assert.h>
#include <inttypes.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <CL/cl.h>
#include <omp.h>
#define MAXGPU 3
#define MAXN 1024
#define MAXM 32
#define MAXMID 32
#define CheckErr(status) { gpuAssert((status), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, int abort=true) {
if (code != cudaSuccess) {
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
uint32_t hostMtx[MAXM][MAXN*MAXN];
uint32_t hostMid[MAXGPU][MAXN*MAXN];
int N = MAXN, M, Q;
int clNeedDevCnt = 3;
__global__ void matrixMul(uint32_t A[], uint32_t B[], uint32_t C[], int N) {
int r = blockIdx.x * blockDim.x + threadIdx.x;
int x = r / N, y = r % N;
uint32_t sum = 0;
for (int i = 0; i < N; i++)
sum += A[x*N + i] * B[i*N + y];
C[x * N + y] = sum;
}
__global__ void matrixAdd(uint32_t A[], uint32_t B[], uint32_t C[]) {
int r = blockIdx.x * blockDim.x + threadIdx.x;
C[r] = A[r] + B[r];
}
void matrix_multiply(uint32_t *cuMtxA, uint32_t *cuMtxB, uint32_t *cuMtxC) {
int localSz = 1;
for (int i = 1; i <= 1024; i++) {
if (N*N % i == 0)
localSz = i;
}
dim3 cuBlock(localSz);
dim3 cuGrid(N*N/localSz);
matrixMul<<<cuGrid, cuBlock>>>(cuMtxA, cuMtxB, cuMtxC, N);
CheckErr(cudaGetLastError());
}
void matrix_add(uint32_t *cuMtxA, uint32_t *cuMtxB, uint32_t *cuMtxC) {
int localSz = 1;
for (int i = 1; i <= 1024; i++) {
if (N*N % i == 0)
localSz = i;
}
dim3 cuBlock(localSz);
dim3 cuGrid(N*N/localSz);
matrixAdd<<<cuGrid, cuBlock>>>(cuMtxA, cuMtxB, cuMtxC);
CheckErr(cudaGetLastError());
}
char expr[1024];
typedef struct Node {
struct Node *l, *r;
int opcode;
uint32_t *hostV, *cuV;
int regNeed, regId;
int pid, mid;
long long h;
} Node;
void replaceReg(Node *u, int a, int b) {
if (u->l == NULL) return ;
if (u->regId == a)
u->regId = b;
replaceReg(u->l, a, b);
replaceReg(u->r, a, b);
}
void updateNode(Node *u, Node *l, Node *r, int opcode) {
u->l = l, u->r = r, u->opcode = opcode;
if (opcode == '+') {
u->h = u->l->h + u->r->h + N;
if (u->l->regNeed == u->r->regNeed) {
u->regNeed = u->l->regNeed + 1;
u->regId = u->regNeed;
replaceReg(u->r, u->r->regId, u->regId);
} else {
u->regNeed = u->l->regNeed > u->r->regNeed ? u->l->regNeed : u->r->regNeed;
u->regId = u->regNeed;
}
} else if (opcode == '*') {
u->h = u->l->h + u->r->h + N*N;
if (abs(u->l->regNeed - u->r->regNeed) == 1) {
u->regNeed = u->l->regNeed + 1;
u->regId = u->regNeed;
} else if (u->l->regNeed == u->r->regNeed) {
u->regNeed = u->l->regNeed + 2;
u->regId = u->regNeed;
replaceReg(u->r, u->r->regId, u->regId-1);
} else {
u->regNeed = u->l->regNeed > u->r->regNeed ? u->l->regNeed : u->r->regNeed;
u->regId = u->regNeed;
int a, b;
if (u->l->regId == u->regId) {
a = u->l->regId, b = u->l->regId-1;
replaceReg(u->l, a, -1);
replaceReg(u->l, b, a);
replaceReg(u->l, -1, b);
} else {
a = u->r->regId, b = u->r->regId-1;
replaceReg(u->r, a, -1);
replaceReg(u->r, b, a);
replaceReg(u->r, -1, b);
}
}
}
assert(u->regId < MAXMID);
}
Node* parseExpr(int l, int r, char expr[], int procId) {
Node *u = (Node *) calloc(1, sizeof(Node));
u->pid = procId;
if (l == r) {
int idx = expr[l] - 'A';
u->mid = idx;
u->h = 0;
return u;
}
int cnt = 0;
for (int i = l; i <= r; i++) {
if (expr[i] == '(') {
cnt++;
} else if (expr[i] == ')') {
cnt--;
} else if (expr[i] == '+' && cnt == 0) {
updateNode(u, parseExpr(l, i-1, expr, procId), parseExpr(i+1, r, expr, procId), '+');
return u;
}
}
for (int i = l; i <= r; i++) {
if (expr[i] == '(') {
if (cnt == 0 && i != l) {
updateNode(u, parseExpr(l, i-1, expr, procId), parseExpr(i, r, expr, procId), '*');
return u;
}
cnt++;
} else if (expr[i] == ')') {
cnt--;
} else if (expr[i] >= 'A' && expr[i] <= 'Z' && cnt == 0 && i != l) {
updateNode(u, parseExpr(l, i-1, expr, procId), parseExpr(i, r, expr, procId), '*');
return u;
}
}
free(u);
return parseExpr(l+1, r-1, expr, procId);
}
uint32_t writeOut(uint32_t *hostC) {
uint32_t h = 0;
uint32_t *Cend = hostC + N*N, *C = hostC;
for (; C != Cend; C++)
h = (h + *C) * 2654435761LU;
return h;
}
uint32_t *cuMemMid[MAXGPU][MAXMID];
uint32_t *cuMemIn[MAXGPU][MAXM];
void memRelocation(Node *u, int did, Node *nodes[], int *offset) {
if (u->l == NULL) {
nodes[*offset] = u, (*offset)++;
return ;
}
u->cuV = cuMemMid[did][u->regId];
if (u->l->regNeed > u->r->regNeed) {
memRelocation(u->l, did, nodes, offset);
memRelocation(u->r, did, nodes, offset);
} else {
memRelocation(u->r, did, nodes, offset);
memRelocation(u->l, did, nodes, offset);
}
fprintf(stderr, "reg%d = %s ", u->regId, u->opcode == '+' ? "add" : "mul");
if (u->l->l == NULL) fprintf(stderr, "%c ", u->l->mid + 'A');
else fprintf(stderr, "reg%d ", u->l->regId);
if (u->r->l == NULL) fprintf(stderr, "%c\n", u->r->mid + 'A');
else fprintf(stderr, "reg%d\n", u->r->regId);
nodes[*offset] = u, (*offset)++;
return ;
}
int executeGPU(Node *workQue[][128], int workQueSz[], uint32_t resultBuff[]) {
Node* nodes[MAXGPU][128];
int offset[MAXGPU] = {};
uint32_t memSz = N*N*sizeof(uint32_t);
int memDeploy[MAXGPU][MAXM] = {};
int regDeploy[MAXGPU][MAXMID] = {};
#pragma omp parallel for
for (int p = 0; p < clNeedDevCnt; p++) {
cudaSetDevice(p);
for (int q = 0; q < workQueSz[p]; q++) {
offset[p] = 0;
memRelocation(workQue[p][q], p, nodes[p], &offset[p]);
for (int i = 0; i < offset[p]; i++) {
Node *u = nodes[p][i];
if (u->l == NULL) {
if (!memDeploy[p][u->mid]) {
cudaMalloc((void **) &cuMemIn[p][u->mid], memSz);
cudaMemcpy(cuMemIn[p][u->mid], hostMtx[u->mid], memSz, cudaMemcpyHostToDevice);
memDeploy[p][u->mid] = 1;
}
u->cuV = cuMemIn[p][u->mid];
continue;
}
if (!regDeploy[p][u->regId])
cudaMalloc((void **) &cuMemMid[p][u->regId], memSz);
if (u->cuV == NULL)
u->cuV = cuMemMid[p][u->regId];
if (u->opcode == '*')
matrix_multiply(u->l->cuV, u->r->cuV, u->cuV);
else
matrix_add(u->l->cuV, u->r->cuV, u->cuV);
}
Node *root = workQue[p][q];
fprintf(stderr, "register need %d\n", root->regNeed);
cudaMemcpy(hostMid[p], root->cuV, memSz, cudaMemcpyDeviceToHost);
uint32_t ret = writeOut(hostMid[p]);
resultBuff[root->pid] = ret;
for (int i = 0; i < offset[p]; i++) {
Node *u = nodes[p][i];
if (u->l != NULL && u->hostV)
free(u->hostV);
free(u);
}
}
cudaSetDevice(p);
for (int i = 0; i < MAXMID; i++) {
cudaFree(cuMemMid[p][i]);
}
for (int i = 0; i < M; i++) {
cudaFree(cuMemIn[p][i]);
}
}
return 1;
}
int readIn() {
if (scanf("%s", expr) != 1)
return 0;
return 1;
}
int balance_cmp(const void *a, const void *b) {
Node *x = *(Node **) a;
Node *y = *(Node **) b;
if (x->h == y->h) return 0;
if (x->h < y->h) return 1;
return -1;
}
void onStart() {
int S[64];
assert(scanf("%d %d", &M, &N) == 2);
for (int i = 0; i < M; i++)
assert(scanf("%d", &S[i]) == 1);
#pragma omp parallel for
for (int p = 0; p < M; p++) {
uint32_t x = 2, n = N*N;
uint32_t c = S[p];
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
x = (x * x + c + i + j)%n;
hostMtx[p][i*N+j] = x;
}
}
}
Node *procBuff[128];
if (scanf("%d", &Q) != 1)
return ;
for (int i = 0; i < Q; i++) {
readIn();
int expr_len = strlen(expr);
procBuff[i] = parseExpr(0, expr_len-1, expr, i);
}
qsort(procBuff, Q, sizeof(Node*), balance_cmp);
float gpuSpeed[MAXGPU] = {1.f, 1.8f, 2.0f};
long long workload[MAXGPU] = {};
int workQueSz[MAXGPU] = {};
uint32_t resultBuff[MAXGPU] = {};
Node *workQue[MAXGPU][128];
for (int i = 0; i < Q; i++) {
int mn = 0;
for (int j = 0; j < clNeedDevCnt; j++) {
if (workload[j]*gpuSpeed[j] < workload[mn]*gpuSpeed[mn])
mn = j;
}
workload[mn] += procBuff[i]->h;
workQue[mn][workQueSz[mn]++] = procBuff[i];
}
executeGPU(workQue, workQueSz, resultBuff);
for (int i = 0; i < Q; i++)
printf("%u\n", resultBuff[i]);
}
int main(int argc, char *argv[]) {
onStart();
return 0;
}