#include <stdio.h>
#include <assert.h>
#include <stdint.h>
#include <cuda.h>
#include <omp.h>
// Number of uint32_t elements in every host and device buffer; main() also
// passes it to pipelTest() as the element count used to size the grid.
#define MAXN (64)
// Number of CUDA streams, and of per-stream input/output device buffers.
#define nStreams 4
/*
 * Busy-work kernel: each thread repeatedly adds its own input element to a
 * running total and stores the total in the matching output slot.
 *
 *   IN  - device array read at the thread's global 1D index
 *   m   - work multiplier; the element is added m * 500000 times
 *   OUT - device array written at the thread's global 1D index
 *
 * The accumulator is a uint32_t, so the total wraps modulo 2^32.
 * There is no bounds check: the launch must not create more threads than
 * IN and OUT have elements.
 */
__global__ void test_global1(uint32_t IN[], int m, uint32_t OUT[]) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    const int iterations = m * 500000;
    uint32_t acc = 0;
    for (int left = iterations; left > 0; --left) {
        acc += IN[idx];
    }
    OUT[idx] = acc;
}
// Host staging buffers. main() fills hostIn with random values before copying
// it to each device input buffer, and copies each device output buffer back
// into hostOut to sum and print it.
uint32_t hostIn[MAXN], hostOut[MAXN];
// Wraps a CUDA runtime call and reports a failure with the call site's file
// and line. The do/while(0) form makes the macro behave as a single statement,
// so `if (c) CheckCuda(x); else ...` compiles and binds as written.
#define CheckCuda(status) do { gpuAssert((status), __FILE__, __LINE__); } while (0)

/*
 * Reports a failed CUDA runtime call.
 *
 *   code  - status returned by the CUDA call
 *   file  - source file of the call site (normally __FILE__)
 *   line  - source line of the call site (normally __LINE__)
 *   abort - when non-zero (the default), terminate the process with the
 *           CUDA error code as the exit status after printing
 *
 * Does nothing when code is cudaSuccess.
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, int abort=true) {
    if (code == cudaSuccess)
        return;
    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
        exit(code);
}
// One stream per concurrently launched kernel; the streams are created in main().
cudaStream_t stream[nStreams];

/*
 * Launches test_global1 once on every stream so the kernels may overlap.
 *
 *   cuIn  - nStreams device input buffers, one per stream
 *   cuOut - nStreams device output buffers, one per stream
 *   n     - number of elements per buffer; one single-thread block is
 *           launched per element
 *   m     - nStreams work multipliers, m[s] is passed to the kernel on stream s
 *
 * The launches are asynchronous: the function returns without waiting for the
 * kernels, so the caller must synchronize before reading cuOut. Launch
 * configuration errors are reported through CheckCuda.
 */
void pipelTest(uint32_t *cuIn[], uint32_t *cuOut[], int n, int m[]) {
    // A zero-sized grid is an invalid launch configuration, and a negative n
    // would turn into a huge grid after conversion to unsigned.
    if (n <= 0)
        return;

    const dim3 cuBlock(1);
    const dim3 cuGrid(n);
    for (int s = 0; s < nStreams; s++) {
        test_global1<<<cuGrid, cuBlock, 0, stream[s]>>>(cuIn[s], m[s], cuOut[s]);
        // Kernel launches return no status; a bad configuration only shows up here.
        CheckCuda(cudaGetLastError());
    }
}
/*
 * Test driver: selects a GPU, creates nStreams streams with one random input
 * buffer and one output buffer each, runs pipelTest() five times, then prints
 * the sum of every output buffer and releases all device resources.
 *
 * Every CUDA runtime call is checked through CheckCuda, which terminates the
 * program on failure. Returns 0 on success.
 */
int main() {
    // Device 2 is the preferred ordinal; fall back to device 0 when the
    // machine has fewer GPUs instead of failing silently in cudaSetDevice.
    int device = 2;
    int deviceCount = 0;
    CheckCuda(cudaGetDeviceCount(&deviceCount));
    if (device >= deviceCount) {
        fprintf(stderr, "Device %d not available (%d device(s) found), using device 0\n",
                device, deviceCount);
        device = 0;
    }
    CheckCuda(cudaSetDevice(device));

    cudaDeviceProp deviceProp;
    CheckCuda(cudaGetDevice(&device));
    CheckCuda(cudaGetDeviceProperties(&deviceProp, device));
    int clockRate = deviceProp.clockRate;   // reported in kHz
    printf("Device clock rate: %.3f GHz\n", (float)clockRate/1000000);

    if (deviceProp.concurrentKernels == 0) {
        printf("GPU does not support concurrent kernel execution\n");
        printf("CUDA kernel runs will be serialised\n");
    }

    srand(time(NULL));
    const size_t bytes = MAXN * sizeof(uint32_t);
    uint32_t *cuIn[nStreams];
    uint32_t *cuOut[nStreams];
    for (int i = 0; i < nStreams; i++) {
        CheckCuda(cudaStreamCreate(&stream[i]));
        CheckCuda(cudaMalloc((void **)&cuIn[i], bytes));
        CheckCuda(cudaMalloc((void **)&cuOut[i], bytes));
        // Each stream gets its own fresh random input.
        for (int j = 0; j < MAXN; j++)
            hostIn[j] = rand();
        CheckCuda(cudaMemcpy(cuIn[i], hostIn, bytes, cudaMemcpyHostToDevice));
    }

    int m[] = {1, 2, 4, 8};
    for (int i = 0; i < 5; i++) {
        pipelTest(cuIn, cuOut, MAXN, m);
        // Wait for the whole round before starting the next one; this also
        // surfaces any asynchronous kernel execution error.
        // (cudaThreadSynchronize is deprecated in favour of this call.)
        CheckCuda(cudaDeviceSynchronize());
    }

    for (int i = 0; i < nStreams; i++) {
        CheckCuda(cudaMemcpy(hostOut, cuOut[i], bytes, cudaMemcpyDeviceToHost));
        uint32_t sum = 0;
        for (int j = 0; j < MAXN; j++)
            sum += hostOut[j];
        printf("%u\n", sum);
    }

    // Release everything that was created above: both buffers and the stream.
    for (int i = 0; i < nStreams; i++) {
        CheckCuda(cudaFree(cuIn[i]));
        CheckCuda(cudaFree(cuOut[i]));
        CheckCuda(cudaStreamDestroy(stream[i]));
    }
    return 0;
}