#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <algorithm>
#include <omp.h>
using namespace std;
#define MAXN 1000005
#define MAXINT 256
#define MAXBLK 1024
#define MAXL (MAXN)
#define CheckErr(status) { gpuAssert((status), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, int abort=true) {
if (code != cudaSuccess) {
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
typedef struct ELE {
int i[MAXL], j[MAXL];
uint32_t v[MAXL];
} ELE;
typedef struct AUX {
int aoff[MAXL], boff[MAXL], bmap[MAXL];
int ma, mb, na, nb;
} AUX;
ELE LA, LB;
AUX AU;
uint32_t H[MAXL];
__device__ inline uint32_t rotate_left(uint32_t x, uint32_t n) {
return (x << n) | (x >> (32-n));
}
__device__ inline uint32_t encrypt(uint32_t m, uint32_t key) {
return (rotate_left(m, key&31) + key)^key;
}
__global__ void SpMM(ELE *LA, ELE *LB, uint32_t *H, AUX *AU, int T) {
#define INTSZ MAXINT
__shared__ uint32_t buff[MAXBLK];
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int x = idx / T, y = idx % T;
int L = y * INTSZ;
int R = L + INTSZ;
uint32_t tmp[INTSZ];
for (int i = 0; i < INTSZ; i++)
tmp[i] = 0;
int lx = AU->aoff[x], rx = AU->aoff[x+1];
for (int i = lx; i < rx; i++) {
if (AU->bmap[LA->j[i]] != -1) {
int k = AU->bmap[LA->j[i]];
uint32_t val = LA->v[i];
int ly = AU->boff[k], ry = AU->boff[k+1];
for (int j = ly; j < ry; j++) {
if (L <= LB->j[j] && LB->j[j] < R)
tmp[LB->j[j] - L] += val * LB->v[j];
}
}
}
uint32_t hash = 0;
uint32_t X = LA->i[AU->aoff[x]];
for (int i = 0; i < INTSZ; i++) {
if (tmp[i])
hash += encrypt((X+1)*(i+L+1), tmp[i]);
}
buff[threadIdx.x] = hash;
__syncthreads();
for (int i = blockDim.x>>1; i; i >>= 1) {
if (threadIdx.x < i)
buff[threadIdx.x] += buff[threadIdx.x + i];
__syncthreads();
}
if (threadIdx.x == 0)
H[blockIdx.x] = buff[0];
#undef INTSZ
}
void SpMV(int N, int M, int R, ELE &LA, int na, ELE &LB, int nb) {
AU.ma = 0, AU.mb = 0;
AU.na = na, AU.nb = nb;
memset(AU.bmap, -1, sizeof(AU.bmap));
#pragma omp parallel sections
{
#pragma omp section
{
for (int i = 0, p = -1; i < na; i++) {
if (LA.i[i] != p)
AU.aoff[AU.ma++] = i, p = LA.i[i];
}
AU.aoff[AU.ma] = na;
}
#pragma omp section
{
for (int i = 0, p = -1; i < nb; i++) {
if (LB.i[i] != p) {
AU.bmap[LB.i[i]] = AU.mb;
AU.boff[AU.mb++] = i, p = LB.i[i];
}
}
AU.boff[AU.mb] = nb;
}
}
uint32_t *cuHH;
ELE *cuMA, *cuMB;
AUX *cuAU;
cudaMalloc((void **) &cuHH, sizeof(H));
cudaMalloc((void **) &cuMA, sizeof(ELE));
cudaMalloc((void **) &cuMB, sizeof(ELE));
cudaMalloc((void **) &cuAU, sizeof(AUX));
cudaMemcpy(cuHH, &H, sizeof(H), cudaMemcpyHostToDevice);
cudaMemcpy(cuMA, &LA, sizeof(ELE), cudaMemcpyHostToDevice);
cudaMemcpy(cuMB, &LB, sizeof(ELE), cudaMemcpyHostToDevice);
cudaMemcpy(cuAU, &AU, sizeof(AUX), cudaMemcpyHostToDevice);
int roundR = (R + MAXINT-1) / MAXINT * MAXINT;
int TOT = N * roundR;
while (TOT / MAXINT % MAXBLK)
TOT ++;
dim3 cuBlock(MAXBLK);
dim3 cuGrid(TOT / MAXINT / MAXBLK);
SpMM<<<cuGrid, cuBlock>>>(cuMA, cuMB, cuHH, cuAU, roundR / MAXINT);
CheckErr(cudaGetLastError());
cudaMemcpy(H, cuHH, sizeof(H), cudaMemcpyDeviceToHost);
uint32_t hash = 0;
#ifdef _OPENMP
omp_set_num_threads(4);
#endif
#pragma omp parallel for reduction(+: hash)
for (int i = 0; i < TOT/MAXINT/MAXBLK; i++)
hash += H[i];
printf("%u\n", hash);
cudaFree(cuMA);
cudaFree(cuMB);
cudaFree(cuHH);
cudaFree(cuAU);
}
inline int readchar() {
const int N = 1048576;
static char buf[N];
static char *p = buf, *end = buf;
if(p == end) {
if((end = buf + fread(buf, 1, N, stdin)) == buf) return EOF;
p = buf;
}
return *p++;
}
inline int ReadInt(int *x) {
static char c, neg;
while((c = readchar()) < '-') {if(c == EOF) return 0;}
neg = (c == '-') ? -1 : 1;
*x = (neg == 1) ? c-'0' : 0;
while((c = readchar()) >= '0')
*x = (*x << 3) + (*x << 1) + c-'0';
*x *= neg;
return 1;
}
inline uint32_t ReadUint(uint32_t *x) {
static char c, neg;
while((c = readchar()) < '-') {if(c == EOF) return 0;}
neg = (c == '-') ? -1 : 1;
*x = (neg == 1) ? c-'0' : 0;
while((c = readchar()) >= '0')
*x = (*x << 3) + (*x << 1) + c-'0';
*x *= neg;
return 1;
}
int main() {
int N, M, R, nA, nB;
ReadInt(&N), ReadInt(&M), ReadInt(&R);
ReadInt(&nA), ReadInt(&nB);
for (int i = 0; i < nA; i++)
ReadInt(&LA.i[i]), ReadInt(&LA.j[i]), ReadUint(&LA.v[i]);
for (int i = 0; i < nB; i++)
ReadInt(&LB.i[i]), ReadInt(&LB.j[i]), ReadUint(&LB.v[i]);
SpMV(N, M, R, LA, nA, LB, nB);
return 0;
}