#include <stdio.h>
#include <assert.h>
#include <stdint.h>
#include <cuda.h>
#include <omp.h>
#define MAXN (64)
#define nStreams 4
/**
 * Busy-work kernel used to keep the GPU occupied for a controllable time.
 *
 * Each thread computes its flat 1D global index, reads IN[index]
 * (m * 500000) times while accumulating into a 32-bit unsigned sum
 * (wrapping on overflow), and stores the result in OUT[index].
 *
 * @param IN   device input array, indexed by the global thread index
 * @param m    work multiplier; the loop runs m * 500000 iterations
 * @param OUT  device output array, indexed by the global thread index
 *
 * There is no bounds guard: the launch must not create more threads than
 * IN and OUT have elements. Only the x dimension of the launch is used.
 */
__global__ void test_global1(uint32_t IN[], int m, uint32_t OUT[]) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    const int iterations = m * 500000;
    uint32_t acc = 0;
    int step = 0;
    while (step < iterations) {
        acc += IN[idx];
        ++step;
    }
    OUT[idx] = acc;
}
// Host-side staging buffers of MAXN elements: hostIn is filled with random
// values before each upload to the device, hostOut receives the results
// copied back from the device.
uint32_t hostIn[MAXN], hostOut[MAXN];
// Checks the cudaError_t returned by a CUDA runtime call and reports the
// call site (file and line) on failure.
// Wrapped in do { } while (0) so that `CheckCuda(x);` is exactly one
// statement and remains valid inside an unbraced if/else.
#define CheckCuda(status) do { gpuAssert((status), __FILE__, __LINE__); } while (0)

/**
 * Reports a CUDA runtime error and optionally terminates the process.
 *
 * @param code   status returned by a CUDA runtime call
 * @param file   source file of the call site (normally __FILE__)
 * @param line   source line of the call site (normally __LINE__)
 * @param abort  when non-zero (the default), exit with `code` as the
 *               process exit status after printing the message
 *
 * Does nothing when code == cudaSuccess. Otherwise prints
 * "GPUassert: <error string> <file> <line>" to stderr.
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, int abort=true) {
    if (code == cudaSuccess) {
        return;
    }
    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) {
        exit(code);
    }
}
// One CUDA stream per concurrently launched kernel; the streams are created
// in main() and used as launch targets by pipelTest().
cudaStream_t stream[nStreams];
/**
 * Launches test_global1 once on each of the nStreams global streams so the
 * kernels can overlap on devices that support concurrent kernel execution.
 *
 * @param cuIn   per-stream device input buffers (at least nStreams entries)
 * @param cuOut  per-stream device output buffers (at least nStreams entries)
 * @param n      number of elements to process per stream; the kernel has no
 *               bounds guard, so every buffer must hold at least n elements
 * @param m      per-stream work multipliers (at least nStreams entries)
 *
 * The launches are asynchronous; this function does not wait for the
 * kernels to finish. A launch-configuration error is reported through
 * CheckCuda right after the offending launch. Nothing is launched when
 * n <= 0, since that would be an invalid grid size.
 */
void pipelTest(uint32_t *cuIn[], uint32_t *cuOut[], int n, int m[]) {
    if (n <= 0) {
        return;
    }
    // Launch shape is kept as in the original experiment: n blocks of one
    // thread each, i.e. one thread per element.
    dim3 cuBlock(1);
    dim3 cuGrid(n);
    for (int s = 0; s < nStreams; s++) {
        test_global1<<<cuGrid, cuBlock, 0, stream[s]>>>(cuIn[s], m[s], cuOut[s]);
        // Kernel launches return no status; launch errors are only visible
        // through cudaGetLastError(). This call does not synchronize.
        CheckCuda(cudaGetLastError());
    }
}
/**
 * Driver for the multi-stream kernel concurrency experiment.
 *
 * Selects a device, reports its clock rate and whether it supports
 * concurrent kernels, uploads random input for each stream, runs
 * pipelTest() five times (waiting for the device after each round), then
 * prints one checksum per stream (the wrapping 32-bit sum of that stream's
 * output buffer) and releases all device resources.
 *
 * Every CUDA runtime call is checked with CheckCuda, which terminates the
 * process on failure.
 */
int main() {
    // Preferred device index. Fall back to device 0 when the host exposes
    // fewer devices, instead of continuing after a failed cudaSetDevice.
    int device = 2;
    int deviceCount = 0;
    CheckCuda(cudaGetDeviceCount(&deviceCount));
    if (device >= deviceCount) {
        fprintf(stderr, "Device %d not available (%d device(s) found), using device 0\n",
                device, deviceCount);
        device = 0;
    }
    CheckCuda(cudaSetDevice(device));

    cudaDeviceProp deviceProp;
    CheckCuda(cudaGetDevice(&device));
    CheckCuda(cudaGetDeviceProperties(&deviceProp, device));
    int clockRate = deviceProp.clockRate;
    printf("Device clock rate: %.3f GHz\n", (float)clockRate/1000000);
    if (deviceProp.concurrentKernels == 0) {
        printf("GPU does not support concurrent kernel execution\n");
        printf("CUDA kernel runs will be serialised\n");
    }

    srand(time(NULL));
    uint32_t *cuIn[nStreams];
    uint32_t *cuOut[nStreams];
    const size_t bufferBytes = MAXN * sizeof(uint32_t);

    // Create one stream plus an input/output buffer pair per stream, and
    // upload a fresh set of random values to each input buffer.
    for (int i = 0; i < nStreams; i++) {
        CheckCuda(cudaStreamCreate(&stream[i]));
        CheckCuda(cudaMalloc((void **)&cuIn[i], bufferBytes));
        CheckCuda(cudaMalloc((void **)&cuOut[i], bufferBytes));
        for (int j = 0; j < MAXN; j++)
            hostIn[j] = rand();
        CheckCuda(cudaMemcpy(cuIn[i], hostIn, bufferBytes, cudaMemcpyHostToDevice));
    }

    // Work multipliers: stream i runs m[i] * 500000 loop iterations.
    int m[] = {1, 2, 4, 8};
    for (int i = 0; i < 5; i++) {
        pipelTest(cuIn, cuOut, MAXN, m);
        // Wait for all streams before starting the next round; this also
        // surfaces errors raised while the kernels were executing.
        // cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize.
        CheckCuda(cudaDeviceSynchronize());
    }
    CheckCuda(cudaDeviceSynchronize());

    // Copy each stream's output back and print its checksum.
    for (int i = 0; i < nStreams; i++) {
        CheckCuda(cudaMemcpy(hostOut, cuOut[i], bufferBytes, cudaMemcpyDeviceToHost));
        uint32_t sum = 0;
        for (int j = 0; j < MAXN; j++)
            sum += hostOut[j];
        printf("%u\n", sum);
    }

    // Release everything that was acquired above: both buffers of every
    // stream and the streams themselves.
    for (int i = 0; i < nStreams; i++) {
        CheckCuda(cudaFree(cuIn[i]));
        CheckCuda(cudaFree(cuOut[i]));
        CheckCuda(cudaStreamDestroy(stream[i]));
    }
    return 0;
}