2016-06-28

批改娘 10098. Print Device Information (CUDA)

Problem

使用 CUDA 印出裝置訊息。請參考課程講義。

Sample Input

no input

Sample Output

3 devices found supporting CUDA
----------------------------------
Device GeForce GTX 980 Ti
----------------------------------
 Device memory: 	6442254336
 Memory per-block: 	49152
 Register per-block: 	65536
 Warp size: 		32
 Memory pitch: 		2147483647
 Constant Memory: 	65536
 Max thread per-block: 	1024
 Max thread dim: 	1024 / 1024 / 64
 Max grid size: 	2147483647 / 65535 / 65535
 Ver: 			5.2
 Clock: 		1190000
 Texture Alignment: 	512
----------------------------------
Device GeForce GTX 970
----------------------------------
 Device memory: 	4294770688
 Memory per-block: 	49152
 Register per-block: 	65536
 Warp size: 		32
 Memory pitch: 		2147483647
 Constant Memory: 	65536
 Max thread per-block: 	1024
 Max thread dim: 	1024 / 1024 / 64
 Max grid size: 	2147483647 / 65535 / 65535
 Ver: 			5.2
 Clock: 		1228000
 Texture Alignment: 	512
----------------------------------
Device GeForce GTX 770
----------------------------------
 Device memory: 	2147287040
 Memory per-block: 	49152
 Register per-block: 	65536
 Warp size: 		32
 Memory pitch: 		2147483647
 Constant Memory: 	65536
 Max thread per-block: 	1024
 Max thread dim: 	1024 / 1024 / 64
 Max grid size: 	2147483647 / 65535 / 65535
 Ver: 			3.0
 Clock: 		1137000
 Texture Alignment: 	512

編譯參數

$ nvcc hello.cu -o hello
$ ./hello

備註

請參考題解頁面的輸出格式。

Solution

以防萬一還是處理一下抓不到 device 的判斷，有時候因為驅動版本不對，抓不到 device 是很正常的。接下來就藉由 cudaDeviceProp 下的資訊全部打印。而在 %zu 則是處理型態 size_t 的輸出。

#include <stdio.h>
#include <cuda.h>
// Horizontal rule printed above and below each device name.
const char splitLine[] = "----------------------------------";

/**
 * Print one device's properties in the fixed report layout.
 *
 * @param devInfo  Properties of the device to report.
 */
void output(const cudaDeviceProp devInfo) {
    // Short aliases for the two 3-element limit arrays.
    const int *threadLimit = devInfo.maxThreadsDim;
    const int *gridLimit = devInfo.maxGridSize;

    // -- banner
    puts(splitLine);
    printf("Device %s\n", devInfo.name);
    puts(splitLine);

    // -- memory-related fields (size_t values use %zu)
    printf(" Device memory: \t%zu\n", devInfo.totalGlobalMem);
    printf(" Memory per-block: \t%zu\n", devInfo.sharedMemPerBlock);
    printf(" Register per-block: \t%d\n", devInfo.regsPerBlock);
    printf(" Warp size: \t\t%d\n", devInfo.warpSize);
    printf(" Memory pitch: \t\t%zu\n", devInfo.memPitch);
    printf(" Constant Memory: \t%zu\n", devInfo.totalConstMem);

    // -- launch limits
    printf(" Max thread per-block: \t%d\n", devInfo.maxThreadsPerBlock);
    printf(" Max thread dim: \t%d / %d / %d\n",
        threadLimit[0], threadLimit[1], threadLimit[2]);
    printf(" Max grid size: \t%d / %d / %d\n",
        gridLimit[0], gridLimit[1], gridLimit[2]);

    // -- version, clock and alignment
    printf(" Ver: \t\t\t%d.%d\n", devInfo.major, devInfo.minor);
    printf(" Clock: \t\t%d\n", devInfo.clockRate);
    printf(" Texture Alignment: \t%zu\n", devInfo.textureAlignment);
}
/**
 * Enumerate CUDA devices and print a report for each one.
 *
 * Runtime API failures are reported on stderr so that stdout keeps the
 * expected report format. A failed device-count query is treated as
 * "no devices"; a failed property query skips that device instead of
 * printing an uninitialised cudaDeviceProp.
 */
int main() {
    int cudaDeviceCnt = 0;
    cudaError_t err = cudaGetDeviceCount(&cudaDeviceCnt);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
        cudaDeviceCnt = 0;
    }
    printf("%d devices found supporting CUDA\n", cudaDeviceCnt);

    if (cudaDeviceCnt == 0) {
        printf("No supported GPU\n");
        return 0;
    }
    for (int i = 0; i < cudaDeviceCnt; i++) {
        cudaDeviceProp devInfo;
        err = cudaGetDeviceProperties(&devInfo, i);
        if (err != cudaSuccess) {
            fprintf(stderr, "cudaGetDeviceProperties(%d) failed: %s\n",
                i, cudaGetErrorString(err));
            continue;
        }
        output(devInfo);
    }
    return 0;
}

Read More +

2016-06-26

學校課程/平行程式

批改娘 10105. Multiple Device (OpenCL)

題目描述

小明的數學作業要計算方陣，現在請你幫幫他！

題目給定數個 $N \times N$ 的矩陣和 $2$ 小題。

$X = AB+CD$
$Y = ABE+CDF$

輸入格式

多組測資，每組第一行會有一個整數 $N$，表示題目給定 $N \times N$ 矩陣，第二行上會有 $6$ 個整數，分別為矩陣 $A, B, C, D, E, F$ 的生成種子。

$1 \le N \le 1024$
$0 \le S_i \le 2^{31}$

輸出格式

輸出兩行 $X$ 和 $Y$ 的雜湊值，可參考 sequence.c 的流程。

Sample Input

2
0 1 2 3 4 5
10
0 1 2 3 4 5

Sample Output

Solution

這一題要充分實作使用 real-time 分配工作到沒有運行的 GPU 上，利用在 OpenMP 學到的平行技巧，讓多個 thread 等待工作，一抓到工作立即運行。

main.c

#include <stdio.h>
#include <assert.h>
#include <inttypes.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <CL/cl.h>
#include <omp.h>
// Number of GPU slots; sizes every per-device array below.
#define MAXGPU 3
// Largest supported matrix dimension; flat matrices hold MAXN*MAXN elements.
#define MAXN 1024
// Per-device host copies of the six generated input matrices (filled by readIn()).
uint32_t    hostMtx[MAXGPU][6][MAXN*MAXN];
// Per-device host buffers that receive the two result matrices read back in solver().
uint32_t    hostMid[MAXGPU][2][MAXN*MAXN];
// Kernel source text as read from disk by initAllGPU().
char clSrcFormat[32767] = ""; 
// Kernel source text handed to clCreateProgramWithSource().
char clSrc[32767] = "";
// -- start working with OpenCL
// Number of GPU devices the program requires and drives.
const int clNeedDevCnt = 3;
// One context, program, add/multiply kernel pair and command queue per device.
cl_context	clCtx[MAXGPU];
cl_program	clPrg[MAXGPU];
cl_kernel	clKrnAdd[MAXGPU], clKrnMul[MAXGPU];
cl_command_queue        clQue[MAXGPU];
// Per-device buffers: clMtx holds the six inputs, clMtxTmp the six intermediates/results.
cl_mem                    clMtx[MAXGPU][6], clMtxTmp[MAXGPU][6];
 
/*
 * CheckFailAndExit(status): if `status` is not CL_SUCCESS, report the error
 * code together with the call site on stderr and tear everything down through
 * destroyGPU(), which terminates the process.
 *
 * The body is wrapped in do { ... } while (0) so the macro behaves as a single
 * statement (safe inside un-braced if/else), and `status` is evaluated exactly
 * once. It must be used in a scope where the names listed in clCallFunc are
 * visible (either the globals or same-named parameters).
 */
#define CheckFailAndExit(status) \
    do { \
        cl_int checkFailStatus_ = (status); \
        if (checkFailStatus_ != CL_SUCCESS) { \
            fprintf(stderr, "Error %d: Line %d in file %s\n\n", checkFailStatus_, __LINE__, __FILE__); \
            destroyGPU(clCtx, clPrg, clKrnAdd, clKrnMul, clQue, clMtx, clMtxTmp); \
        } \
    } while (0)
/* Parameter list shared by every function that needs the OpenCL handles. */
#define clFuncArgs cl_context clCtx[], cl_program clPrg[], cl_kernel clKrnAdd[], \
    cl_kernel clKrnMul[], cl_command_queue clQue[], cl_mem clMtx[][6], cl_mem clMtxTmp[][6]
/* Matching argument list used to forward those handles to a callee. */
#define clCallFunc clCtx, clPrg, clKrnAdd, clKrnMul, clQue, clMtx, clMtxTmp
/* Same argument list, used where the file-scope globals are passed in. */
#define clCallFuncOuter clCtx, clPrg, clKrnAdd, clKrnMul, clQue, clMtx, clMtxTmp
 
/*
 * Fold an N x N matrix (stored flat) into a 32-bit signature.
 * Each element is added to the running value, which is then multiplied by
 * 2654435761; all arithmetic wraps modulo 2^32.
 */
uint32_t writeOut(uint32_t *hostC, int N) {
    const int total = N * N;
    uint32_t digest = 0;
    for (int k = 0; k < total; k++) {
        digest += hostC[k];
        digest *= 2654435761LU;
    }
    return digest;
}
/*
 * Release every OpenCL object that was created for each device and terminate
 * the process with exit status 0. Handles that are still NULL are skipped.
 */
void destroyGPU(clFuncArgs) {
    fprintf(stderr, "Starting Cleanup ...\n\n");
    for (int dev = 0; dev < clNeedDevCnt; dev++) {
        // buffers first: input and temporary slot of the same index together
        for (int slot = 0; slot < 6; slot++) {
            if (clMtx[dev][slot] != NULL) {
                clReleaseMemObject(clMtx[dev][slot]);
            }
            if (clMtxTmp[dev][slot] != NULL) {
                clReleaseMemObject(clMtxTmp[dev][slot]);
            }
        }
        // then kernels, program, queue and finally the context
        if (clKrnAdd[dev] != NULL) {
            clReleaseKernel(clKrnAdd[dev]);
        }
        if (clKrnMul[dev] != NULL) {
            clReleaseKernel(clKrnMul[dev]);
        }
        if (clPrg[dev] != NULL) {
            clReleaseProgram(clPrg[dev]);
        }
        if (clQue[dev] != NULL) {
            clReleaseCommandQueue(clQue[dev]);
        }
        if (clCtx[dev] != NULL) {
            clReleaseContext(clCtx[dev]);
        }
    }
    exit(0);
}
/*
 * Load the kernel source from `fileName` and create, for each of the
 * clNeedDevCnt GPUs, a context, command queue, built program, the
 * "matrixAdd"/"matrixMul" kernels and 6 + 6 device buffers of MAXN*MAXN
 * uint32_t elements.
 *
 * Any failure is reported on stderr and ends the process through
 * CheckFailAndExit(). Returns 1 on success.
 *
 * Changes compared with the previous version:
 *  - the file read is no longer placed inside assert() (it would disappear
 *    under NDEBUG);
 *  - the source is copied verbatim instead of being used as a printf format,
 *    because the kernel text contains '%' (modulo) characters;
 *  - the build log is queried for the program/device that actually failed;
 *  - platform and device queries are checked.
 */
int initAllGPU(char fileName[], clFuncArgs) {
    // -- load kernel code
    FILE *codefin = fopen(fileName, "r");
    if (codefin == NULL) {
        fprintf(stderr, "Cannot open kernel source %s\n", fileName);
        CheckFailAndExit(CL_INVALID_VALUE);
    }
    size_t srcRead = fread(clSrcFormat, 1, sizeof(clSrcFormat) - 1, codefin);
    // anything left in the file means the source does not fit the buffer
    int srcTruncated = (fgetc(codefin) != EOF);
    fclose(codefin);
    if (srcTruncated) {
        fprintf(stderr, "Kernel source %s is too large\n", fileName);
        CheckFailAndExit(CL_INVALID_VALUE);
    }
    clSrcFormat[srcRead] = '\0';
    strcpy(clSrc, clSrcFormat);
    size_t clSrcLen = strlen(clSrc);

    cl_int                    clStat;
    cl_uint                    clPlatN = 0, clDevN = 0;
    cl_platform_id            clPlatID;
    cl_device_id            clGPUID[MAXGPU];
    const char                *clSrcPtr = clSrc;
    // -- basic OpenCL setup
    clStat = clGetPlatformIDs(1, &clPlatID, &clPlatN);
    CheckFailAndExit(clStat);
    clStat = clGetDeviceIDs(clPlatID, CL_DEVICE_TYPE_GPU, MAXGPU, clGPUID, &clDevN);
    CheckFailAndExit(clStat);
    if (clDevN < (cl_uint) clNeedDevCnt) {
        fprintf(stderr, "Need %d GPU devices, found %u\n", clNeedDevCnt, (unsigned) clDevN);
        CheckFailAndExit(CL_DEVICE_NOT_FOUND);
    }
    for (int i = 0; i < clNeedDevCnt; i++) {
        clCtx[i] = clCreateContext(NULL, 1, clGPUID+i, NULL, NULL, &clStat);
        CheckFailAndExit(clStat);
        clQue[i] = clCreateCommandQueue(clCtx[i], clGPUID[i], 0, &clStat);
        CheckFailAndExit(clStat);
        clPrg[i] = clCreateProgramWithSource(clCtx[i], 1, &clSrcPtr, &clSrcLen, &clStat);
        CheckFailAndExit(clStat);
        clStat = clBuildProgram(clPrg[i], 1, clGPUID+i, NULL, NULL, NULL);
        if (clStat != CL_SUCCESS) {
            fprintf(stderr, "Error: Line %u in file %s\n\n", __LINE__, __FILE__);
            // fetch the log of the program/device pair that failed to build
            size_t log_size = 0;
            clGetProgramBuildInfo(clPrg[i], clGPUID[i],
                    CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
            char *program_log = (char *) calloc(log_size+1, sizeof(char));
            if (program_log != NULL) {
                clGetProgramBuildInfo(clPrg[i], clGPUID[i],
                        CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL);
                printf("%s", program_log);
                free(program_log);
            }
            CheckFailAndExit(CL_BUILD_PROGRAM_FAILURE);
        }
        clKrnAdd[i] = clCreateKernel(clPrg[i], "matrixAdd", &clStat);
        CheckFailAndExit(clStat);
        clKrnMul[i] = clCreateKernel(clPrg[i], "matrixMul", &clStat);
        CheckFailAndExit(clStat);
        for (int j = 0; j < 6; j++) {
            clMtx[i][j] = clCreateBuffer(clCtx[i], CL_MEM_READ_WRITE, 
                    sizeof(uint32_t)*MAXN*MAXN, NULL, &clStat);
            CheckFailAndExit(clStat);
            clMtxTmp[i][j] = clCreateBuffer(clCtx[i], CL_MEM_READ_WRITE, 
                    sizeof(uint32_t)*MAXN*MAXN, NULL, &clStat);
            CheckFailAndExit(clStat);
        }
    }
    return 1;
}
/*
 * Enqueue OUT = LIN * RIN (N x N, flat) on device `devIdx`.
 *
 * One work-item is launched per output element (global size N*N) and the
 * work-group size is left to the OpenCL runtime (NULL local size). The call
 * only enqueues the kernel; it does not wait for completion.
 *
 * The former `localSize` search loop was removed: its result was never passed
 * to clEnqueueNDRangeKernel.
 */
void matrix_mul(int N, int devIdx, cl_mem *LIN, cl_mem *RIN, cl_mem *OUT, clFuncArgs) {
    cl_int clStat;
    size_t globalOffset[] = {0};
    size_t globalSize[] = {N*N};
    // -- set argument to kernel
    clStat = clSetKernelArg(clKrnMul[devIdx], 0, sizeof(cl_mem), LIN);
    CheckFailAndExit(clStat);
    clStat = clSetKernelArg(clKrnMul[devIdx], 1, sizeof(cl_mem), RIN);
    CheckFailAndExit(clStat);
    clStat = clSetKernelArg(clKrnMul[devIdx], 2, sizeof(cl_mem), OUT);
    CheckFailAndExit(clStat);
    clStat = clSetKernelArg(clKrnMul[devIdx], 3, sizeof(cl_int), &N);
    CheckFailAndExit(clStat);
    // -- execute
    clStat = clEnqueueNDRangeKernel(clQue[devIdx], clKrnMul[devIdx], 1, globalOffset,
            globalSize, NULL, 0, NULL, NULL);
    CheckFailAndExit(clStat);
}
/*
 * Enqueue OUT = LIN + RIN (element-wise over N*N values) on device `devIdx`.
 * The kernel is only enqueued; the runtime picks the work-group size.
 */
void matrix_add(int N, int devIdx, cl_mem *LIN, cl_mem *RIN, cl_mem *OUT, clFuncArgs) {
    cl_kernel addKrn = clKrnAdd[devIdx];
    cl_mem *operands[3] = {LIN, RIN, OUT};
    cl_int clStat;

    // -- bind the three buffers in argument order
    for (cl_uint argIdx = 0; argIdx < 3; argIdx++) {
        clStat = clSetKernelArg(addKrn, argIdx, sizeof(cl_mem), operands[argIdx]);
        CheckFailAndExit(clStat);
    }

    // -- launch one work-item per element
    size_t launchOffset[] = {0};
    size_t launchSize[] = {N*N};
    clStat = clEnqueueNDRangeKernel(clQue[devIdx], addKrn, 1, launchOffset,
            launchSize, NULL, 0, NULL, NULL);
    CheckFailAndExit(clStat);
}
/*
 * Solve one test case on device `devId`.
 *
 * Uploads the six input matrices from hostMtx[devId], computes
 * X = AB + CD and Y = ABE + CDF on the device, reads both results back into
 * hostMid[devId] and stores their signatures in ret[0] (X) and ret[1] (Y).
 * Always returns 1; OpenCL failures end the process via CheckFailAndExit().
 */
int solver(int N, int devId, uint32_t ret[], clFuncArgs) {
    cl_command_queue que = clQue[devId];
    cl_mem *in = clMtx[devId];      // A, B, C, D, E, F
    cl_mem *tmp = clMtxTmp[devId];  // intermediates and results
    uint32_t memSz = N*N*sizeof(uint32_t);
    cl_int clStat;

    // -- upload inputs (non-blocking writes)
    for (int m = 0; m < 6; m++) {
        clStat = clEnqueueWriteBuffer(que, in[m], 0, 0, memSz,
                hostMtx[devId][m], 0, NULL, NULL);
        CheckFailAndExit(clStat);
    }

    // -- products
    matrix_mul(N, devId, &in[0], &in[1], &tmp[0], clCallFunc);    // tmp[0] = AB
    matrix_mul(N, devId, &in[2], &in[3], &tmp[1], clCallFunc);    // tmp[1] = CD
    matrix_mul(N, devId, &tmp[0], &in[4], &tmp[2], clCallFunc);   // tmp[2] = ABE
    matrix_mul(N, devId, &tmp[1], &in[5], &tmp[3], clCallFunc);   // tmp[3] = CDF
    // -- sums
    matrix_add(N, devId, &tmp[0], &tmp[1], &tmp[4], clCallFunc);  // tmp[4] = AB + CD
    matrix_add(N, devId, &tmp[2], &tmp[3], &tmp[5], clCallFunc);  // tmp[5] = ABE + CDF

    // -- blocking read-back of X (tmp[4]) then Y (tmp[5])
    for (int k = 0; k < 2; k++) {
        clStat = clEnqueueReadBuffer(que, tmp[4 + k], CL_TRUE, 0,
                sizeof(uint32_t)*N*N, hostMid[devId][k], 0, NULL, NULL);
        CheckFailAndExit(clStat);
    }

    // -- hash both results, one task each
    for (int k = 0; k < 2; k++)
#pragma omp task
    {
        ret[k] = writeOut(hostMid[devId][k], N);
    }
#pragma omp taskwait
    return 1;
}
int readIn(uint32_t S[], int *n, int devId) {
    int N, M;
    if (scanf("%d", &N) != 1)
        return 0;
    M = 6;
    for (int i = 0; i < M; i++)
        assert(scanf("%d", &S[i]) == 1);
    for (int p = 0; p < M; p++)
#pragma omp task
    {
        uint32_t x = 2, n = N*N, c = S[p];
        x = 2;
        for (int i = 0; i < N; i++) {
            for (int j = 0; j < N; j++) {
                x = (x * x + c + i + j)%n;
                hostMtx[devId][p][i*N+j] = x;
            }
        }
    }
#pragma omp taskwait
    *n = N;
    return 1;
}
void onStart(clFuncArgs) {
    initAllGPU("matrix-lib.cl", clCallFunc);
    int inN = 0;
    static uint32_t ansQue[32767][2];
#pragma omp parallel sections
    {
#pragma omp section
        {
            while (1) {
                int f = 0, N, pid = 0;
                uint32_t S[32];
#pragma omp critical
                {
                    f = readIn(S, &N, 0);
                    pid = inN;
                    inN += f;
                }
                if (f == 0)
                    break;
                solver(N, 0, ansQue[pid], clCallFunc);
            }
        }
#pragma omp section
        {
            while (1) {
                int f = 0, N, pid = 0;
                uint32_t S[32];
#pragma omp critical
                {
                    f = readIn(S, &N, 1);
                    pid = inN;
                    inN += f;
                }
                if (f == 0)
                    break;
                solver(N, 1, ansQue[pid], clCallFunc);
            }
        }
#pragma omp section
        {
            while (1) {
                int f = 0, N, pid = 0;
                uint32_t S[32];
#pragma omp critical
                {
                    f = readIn(S, &N, 2);
                    pid = inN;
                    inN += f;
                }
                if (f == 0)
                    break;
                solver(N, 2, ansQue[pid], clCallFunc);
            }
        }
    }
    for (int i = 0; i < inN; i++)
        printf("%u\n%u\n", ansQue[i][0], ansQue[i][1]);	
    destroyGPU(clCallFunc);
}
/*
 * Signal handler: print a farewell line on stdout, release the OpenCL
 * resources and end the process with status 0.
 */
void sigHandler(int signo) {
    (void) signo;   // the signal number is not used
    fputs("God Bless Me\n", stdout);
    destroyGPU(clCallFuncOuter);
    exit(0);
}
/*
 * Entry point: install sigHandler for the listed signals (reporting on stderr
 * each one that cannot be installed), then run the solver.
 */
int main(int argc, char *argv[]) {
    const char sigErr[] = "I can't catch signal.\n";
    const int trapped[] = {SIGTRAP, SIGSEGV, SIGILL, SIGFPE, SIGINT};
    const int trappedCnt = (int) (sizeof(trapped) / sizeof(trapped[0]));

    for (int k = 0; k < trappedCnt; k++) {
        if (signal(trapped[k], sigHandler) == SIG_ERR)
            fprintf(stderr, sigErr);
    }
    onStart(clCallFuncOuter);
    return 0;
}

matrix-lib.cl

#define CTYPE unsigned int
// Element-wise sum: each work-item adds the pair of elements at its global id.
__kernel void matrixAdd(__global CTYPE *in1,
        __global CTYPE *in2,
        __global CTYPE *out) {
    const int gid = get_global_id(0);
    const CTYPE lhs = in1[gid];
    const CTYPE rhs = in2[gid];
    out[gid] = lhs + rhs;
}
// N x N matrix product on flat arrays: each work-item computes the single
// output element whose flat index equals its global id.
__kernel void matrixMul(__global CTYPE *in1,
        __global CTYPE *in2,
        __global CTYPE *out, int N) {
    const int gid = get_global_id(0);
    const int row = gid / N;
    const int col = gid % N;
    __global const CTYPE *lhsRow = in1 + row * N;
    CTYPE acc = 0;
    for (int k = 0; k < N; k++) {
        acc += lhsRow[k] * in2[k * N + col];
    }
    out[row * N + col] = acc;
}

Read More +

2016-06-26

學校課程/平行程式

批改娘 10097. Advanced Matrix Calculator (OpenCL)

題目描述

小明的數學作業要計算方陣，現在請你幫幫他！

題目給定數個 $N \times N$ 的矩陣和 $Q$ 小題，每一小題只由加法和乘法構成。

sequence.c

#include <stdio.h>
#include <stdint.h>
// #define DEBUG
#define UINT uint32_t
#define MAXN 1024
/* C = A * B for the leading N x N part; sums wrap modulo 2^32 on purpose. */
void multiply(int N, UINT A[][MAXN], UINT B[][MAXN], UINT C[][MAXN]) {
    for (int row = 0; row < N; row++) {
        const UINT *lhs = A[row];
        for (int col = 0; col < N; col++) {
            UINT acc = 0;
            for (int k = 0; k < N; k++) {
                acc += lhs[k] * B[k][col];
            }
            C[row][col] = acc;
        }
    }
}
/* C = A + B, element by element, over the leading N x N part. */
void add(int N, UINT A[][MAXN], UINT B[][MAXN], UINT C[][MAXN]) {
    for (int row = 0; row < N; row++) {
        const UINT *lhs = A[row];
        const UINT *rhs = B[row];
        UINT *dst = C[row];
        for (int col = 0; col < N; col++) {
            dst[col] = lhs[col] + rhs[col];
        }
    }
}
/*
 * Fill the leading N x N part of A with the pseudo-random sequence
 * x <- (x*x + c + i + j) mod (N*N), starting from x = 2 and visiting the
 * cells in row-major order.
 */
void rand_gen(UINT c, int N, UINT A[][MAXN]) {
    const UINT modulus = N*N;
    UINT state = 2;
    for (int row = 0; row < N; row++) {
        UINT *dst = A[row];
        for (int col = 0; col < N; col++) {
            state = (state * state + c + row + col) % modulus;
            dst[col] = state;
        }
    }
}
/* Dump the leading N x N part of A to stderr, one bracketed row per line. */
void print_matrix(int N, UINT A[][MAXN]) {
    for (int row = 0; row < N; row++) {
        const UINT *cells = A[row];
        fprintf(stderr, "[");
        for (int col = 0; col < N; col++) {
            fprintf(stderr, " %u", cells[col]);
        }
        fprintf(stderr, " ]\n");
    }
}
/*
 * Hash the leading N x N part of A in row-major order: add each element to
 * the running value, then multiply by 2654435761 (wrapping modulo 2^32).
 */
UINT signature(int N, UINT A[][MAXN]) {
    UINT digest = 0;
    for (int row = 0; row < N; row++) {
        const UINT *cells = A[row];
        for (int col = 0; col < N; col++) {
            digest += cells[col];
            digest *= 2654435761LU;
        }
    }
    return digest;
}
/* Input matrices A..F and the six intermediate/result matrices. */
UINT IN[6][MAXN][MAXN], TMP[6][MAXN][MAXN];
/*
 * Reference solution: read N and six seeds, generate A..F, then print the
 * signatures of X = AB + CD and Y = ABE + CDF.
 *
 * Returns 1 (with a message on stderr) when the input is missing or N is
 * outside 1..MAXN. Seeds are read as unsigned values because the statement
 * allows 2^31, which does not fit in an int.
 */
int main() {
    int N;
    if (scanf("%d", &N) != 1 || N < 1 || N > MAXN) {
        fprintf(stderr, "Invalid or missing matrix size\n");
        return 1;
    }
    for (int i = 0; i < 6; i++) {
        unsigned int seed;
        if (scanf("%u", &seed) != 1) {
            fprintf(stderr, "Missing seed %d\n", i);
            return 1;
        }
        rand_gen(seed, N, IN[i]);
    }
    // AB
    multiply(N, IN[0], IN[1], TMP[0]);
    // CD
    multiply(N, IN[2], IN[3], TMP[1]);
    // AB+CD
    add(N, TMP[0], TMP[1], TMP[2]);
    printf("%u\n", signature(N, TMP[2]));
    
    // ABE
    multiply(N, TMP[0], IN[4], TMP[3]);
    // CDF
    multiply(N, TMP[1], IN[5], TMP[4]);
    // ABE+CDF
    add(N, TMP[3], TMP[4], TMP[5]);
    printf("%u\n", signature(N, TMP[5]));
    return 0;
}

輸入格式

測資只有一組，第一行會有兩個整數 $M,N$，表示題目給定 $M$ 個 $N \times N$ 矩陣，第二行上會有 $M$ 個整數 $S_i$，表示第 $i$ 個矩陣的生成種子。最後會有一行一個整數 $Q$，表示接下來有 $Q$ 行詢問，每一行上會有一個字串 $E$ 表示接下來要處理的矩陣表達式，$E$ 只包含 A-Z 以及 +。

$1 \le M \le 26$
$1 \le N \le 1024$
$0 \le S_i \le 2^{31}$
$1 \le Q \le 100$
$|E| \le 26$

輸出格式

對於每一組測資輸出一行。

範例輸入 1

6 2
0 1 2 3 4 5
2
AB+CD
ABE+CDF

範例輸出 1

2385860290
1374821695

編譯參數

$ gcc -std=c99 -O2 main.c -lm -lOpenCL -fopenmp
$ ./main

Solution

這一題是 10095. Matrix Calculator (OpenCL) 的強化版，針對計算量在多個 GPU 裝置上分配工作。由於每一個表達式的計算量多寡不定，為了批次解決一坨工作，讓三個 GPU 的執行時間最大值最小化，貪心分配表達式，將計算量由大排到小後，依序取出，挑選目前 workload 最小的 GPU 分配到這之上，但 GPU 計算能力不同 (例如頻率或傳輸效率 … 等)，需要多乘上一個常數比較。

main.c

#include <stdio.h>
#include <assert.h>
#include <inttypes.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <CL/cl.h>
#include <omp.h>
// Number of GPU slots; sizes every per-device array below.
#define MAXGPU 3
// Largest supported matrix dimension.
#define MAXN 1024
// Largest number of input matrices (one per letter A-Z).
#define MAXM 26
// Number of intermediate device buffers created per GPU.
#define MAXMID 32
// Host copies of the generated input matrices, stored flat.
uint32_t	hostMtx[MAXM][MAXN*MAXN];
// Matrix dimension, number of input matrices, number of queries.
int N, M, Q;
// Buffer holding the expression currently being read.
char expr[1024];
// Kernel source template as read from disk by initAllGPU().
char clSrcFormat[32767] = ""; 
// Kernel source after formatting, handed to clCreateProgramWithSource().
char clSrc[32767] = "";
// -- start working with OpenCL
// Number of GPU devices the program requires and drives.
const int clNeedDevCnt = 3;
// One context, program, add/multiply kernel pair and command queue per device.
cl_context				clCtx[MAXGPU];
cl_program				clPrg[MAXGPU];
cl_kernel				clKrnAdd[MAXGPU], clKrnMul[MAXGPU];
cl_command_queue		clQue[MAXGPU];
// Per-device buffers: clMemIn holds the inputs, clMemMid the intermediate results.
cl_mem					clMemIn[MAXGPU][MAXM], clMemMid[MAXGPU][MAXMID];
// One node of a parsed matrix expression tree (built by parseExpr()).
typedef struct Node {
    // Left/right operands; both are NULL for a leaf (a single input matrix).
    struct Node *l, *r;
    // '+' or '*' for inner nodes; left at 0 for leaves.
    int opcode;
    // Host-side matrix data associated with this node.
    uint32_t *hostV;
    // Device buffer holding this node's value.
    cl_mem	clV;
    // event: completion event of the kernel producing this node;
    // waitEvents: heap array of the operand events that kernel waits on.
    cl_event event, *waitEvents;
    // Number of entries in waitEvents.
    int waitEventsN;
    // pid: index of the query this node belongs to; mid: input matrix index (leaves).
    int pid, mid;
    // Estimated cost of the subtree, used to order and balance the queries.
    long long h;
} Node;
/*
 * CheckFailAndExit(status): if `status` is not CL_SUCCESS, report the error
 * code together with the call site on stderr and tear everything down through
 * destroyGPU(), which terminates the process.
 *
 * The body is wrapped in do { ... } while (0) so the macro behaves as a single
 * statement (safe inside un-braced if/else), and `status` is evaluated exactly
 * once. It must be used in a scope where the names listed in clCallFunc are
 * visible (either the globals or same-named parameters).
 */
#define CheckFailAndExit(status) \
    do { \
        cl_int checkFailStatus_ = (status); \
        if (checkFailStatus_ != CL_SUCCESS) { \
            fprintf(stderr, "Error %d: Line %d in file %s\n\n", checkFailStatus_, __LINE__, __FILE__); \
            destroyGPU(clCtx, clPrg, clKrnAdd, clKrnMul, clQue, clMemIn); \
        } \
    } while (0)
/* Parameter list shared by every function that needs the OpenCL handles. */
#define clFuncArgs cl_context clCtx[], cl_program clPrg[], cl_kernel clKrnAdd[], \
    cl_kernel clKrnMul[], cl_command_queue clQue[], cl_mem clMemIn[][MAXM]
/* Matching argument list used to forward those handles to a callee. */
#define clCallFunc clCtx, clPrg, clKrnAdd, clKrnMul, clQue, clMemIn
/* Same argument list, used where the file-scope globals are passed in. */
#define clCallFuncOuter clCtx, clPrg, clKrnAdd, clKrnMul, clQue, clMemIn
/*
 * Bind every leaf of the expression tree rooted at `u` to device `gpuIdx`:
 * each leaf gets its host matrix and that device's copy of the input buffer.
 * Inner nodes are only traversed.
 */
void assignGPU(Node *u, int gpuIdx) {
    if (u != NULL) {
        if (u->l != NULL) {
            // inner node: descend into both operands
            assignGPU(u->l, gpuIdx);
            assignGPU(u->r, gpuIdx);
        } else {
            // leaf: point at the input matrix on the chosen device
            const int matrixId = u->mid;
            u->hostV = hostMtx[matrixId];
            u->clV = clMemIn[gpuIdx][matrixId];
        }
    }
}
/*
 * Recursively parse expr[l..r] (inclusive) into an expression tree.
 *
 * Grammar handled by the code: single letters 'A'..'Z' are operands, '+' is
 * addition, juxtaposition is multiplication, and parentheses group.
 * '+' binds weaker than multiplication because it is searched for first.
 * Every node is tagged with `procId`; `h` accumulates a cost estimate
 * (N per addition, N*N per multiplication, 0 for a leaf).
 *
 * Returns a heap-allocated tree; the nodes are freed by the executor.
 * NOTE(review): there is no guard for l > r or for a failed calloc, so
 * malformed input is presumably never passed in -- confirm against callers.
 */
Node* parseExpr(int l, int r, char expr[], int procId, clFuncArgs) {
    cl_int clStat;
    Node *u = (Node *) calloc(1, sizeof(Node));
    u->pid = procId;
    // Single character: a leaf referring to input matrix expr[l] - 'A'.
    if (l == r) {
        int idx = expr[l] - 'A';
        u->hostV = hostMtx[idx];
        u->mid = idx;
        u->h = 0;
        return u;
    }
    // Pass 1: split at the first '+' that is outside all parentheses.
    int cnt = 0;
    for (int i = l; i <= r; i++) {
        if (expr[i] == '(') {
            cnt++;
        } else if (expr[i] == ')') {
            cnt--;
        } else if (expr[i] == '+' && cnt == 0) {
            u->l = parseExpr(l, i-1, expr, procId, clCallFunc);
            u->r = parseExpr(i+1, r, expr, procId, clCallFunc);
            u->opcode = '+';
            u->h = u->l->h + u->r->h + N;
            return u;
        }
    }
    // Pass 2: no top-level '+'; split at the first top-level factor boundary
    // after position l (an opening parenthesis or a letter) as a product.
    for (int i = l; i <= r; i++) {
        if (expr[i] == '(') {
            if (cnt == 0 && i != l) {
                u->l = parseExpr(l, i-1, expr, procId, clCallFunc);
                u->r = parseExpr(i, r, expr, procId, clCallFunc);
                u->opcode = '*';
                u->h = u->l->h + u->r->h + N*N;
                return u;
            }
            cnt++;
        } else if (expr[i] == ')') {
            cnt--;
        } else if (expr[i] >= 'A' && expr[i] <= 'Z' && cnt == 0 && i != l) {
            u->l = parseExpr(l, i-1, expr, procId, clCallFunc);
            u->r = parseExpr(i, r, expr, procId, clCallFunc);
            u->opcode = '*';
            u->h = u->l->h + u->r->h + N*N;
            return u;
        }
    }
    // No split point found: drop the first and last character (treated as an
    // enclosing pair of parentheses) and parse what is inside.
    free(u);
    return parseExpr(l+1, r-1, expr, procId, clCallFunc);
}
/*
 * Hash a flat N x N matrix in row-major order: add each element to the
 * running value, then multiply by 2654435761 (wrapping modulo 2^32).
 */
uint32_t writeMatrixOut(int N, uint32_t *A) {
    uint32_t digest = 0;
    for (int row = 0; row < N; row++) {
        const uint32_t *cells = A + row * N;
        for (int col = 0; col < N; col++) {
            digest += cells[col];
            digest *= 2654435761LU;
        }
    }
    return digest;
}
/*
 * Release all OpenCL objects of every device, one object class at a time
 * (input buffers, intermediate buffers, kernels and programs, queues,
 * contexts), then terminate the process with exit status 0.
 * Handles that are still NULL are skipped.
 */
void destroyGPU(clFuncArgs) {
    int dev, slot;
    fprintf(stderr, "Starting Cleanup ...\n\n");

    // -- input buffers (only the first M slots were created)
    for (dev = 0; dev < clNeedDevCnt; dev++) {
        for (slot = 0; slot < M; slot++) {
            if (clMemIn[dev][slot] != NULL) {
                clReleaseMemObject(clMemIn[dev][slot]);
            }
        }
    }
    // -- intermediate buffers
    for (dev = 0; dev < clNeedDevCnt; dev++) {
        for (slot = 0; slot < MAXMID; slot++) {
            if (clMemMid[dev][slot] != NULL) {
                clReleaseMemObject(clMemMid[dev][slot]);
            }
        }
    }
    // -- kernels and programs
    for (dev = 0; dev < clNeedDevCnt; dev++) {
        if (clKrnAdd[dev] != NULL) {
            clReleaseKernel(clKrnAdd[dev]);
        }
        if (clKrnMul[dev] != NULL) {
            clReleaseKernel(clKrnMul[dev]);
        }
        if (clPrg[dev] != NULL) {
            clReleaseProgram(clPrg[dev]);
        }
    }
    // -- command queues
    for (dev = 0; dev < clNeedDevCnt; dev++) {
        if (clQue[dev] != NULL) {
            clReleaseCommandQueue(clQue[dev]);
        }
    }
    // -- contexts
    for (dev = 0; dev < clNeedDevCnt; dev++) {
        if (clCtx[dev] != NULL) {
            clReleaseContext(clCtx[dev]);
        }
    }
    exit(0);
}
/*
 * Load the kernel source template from `fileName`, substitute the global
 * matrix size N into it, and create for each of the clNeedDevCnt GPUs a
 * context, command queue, built program, the "matrixAdd"/"matrixMul" kernels,
 * M read-only input buffers initialised from hostMtx and MAXMID intermediate
 * buffers of N*N uint32_t elements.
 *
 * Any failure is reported on stderr and ends the process through
 * CheckFailAndExit(). Returns 1 on success.
 *
 * Changes compared with the previous version:
 *  - the file read is no longer placed inside assert() (it would disappear
 *    under NDEBUG);
 *  - formatting uses snprintf with the destination size and is checked;
 *  - the build log is queried for the program/device that actually failed;
 *  - platform and device queries are checked.
 *
 * NOTE(review): the template is still used as a printf format, so any '%'
 * other than the intended "%d" (for example a modulo operator in the kernel)
 * is formally an invalid conversion; it should be written as "%%" in the
 * template, or N should be passed another way. Confirm with the .cl file.
 */
int initAllGPU(char fileName[], clFuncArgs) {
    // -- generate kernel code
    FILE *codefin = fopen(fileName, "r");
    if (codefin == NULL) {
        fprintf(stderr, "Cannot open kernel source %s\n", fileName);
        CheckFailAndExit(CL_INVALID_VALUE);
    }
    size_t srcRead = fread(clSrcFormat, 1, sizeof(clSrcFormat) - 1, codefin);
    // anything left in the file means the source does not fit the buffer
    int srcTruncated = (fgetc(codefin) != EOF);
    fclose(codefin);
    if (srcTruncated) {
        fprintf(stderr, "Kernel source %s is too large\n", fileName);
        CheckFailAndExit(CL_INVALID_VALUE);
    }
    clSrcFormat[srcRead] = '\0';
    int srcWritten = snprintf(clSrc, sizeof(clSrc), clSrcFormat, N);
    if (srcWritten < 0 || (size_t) srcWritten >= sizeof(clSrc)) {
        fprintf(stderr, "Formatted kernel source does not fit its buffer\n");
        CheckFailAndExit(CL_INVALID_VALUE);
    }
    size_t clSrcLen = strlen(clSrc);

    cl_int					clStat;
    cl_uint					clPlatN = 0, clDevN = 0;
    cl_platform_id			clPlatID;
    cl_device_id			clGPUID[MAXGPU];
    const char				*clSrcPtr = clSrc;
    // -- basic OpenCL setup
    clStat = clGetPlatformIDs(1, &clPlatID, &clPlatN);
    CheckFailAndExit(clStat);
    clStat = clGetDeviceIDs(clPlatID, CL_DEVICE_TYPE_GPU, MAXGPU, clGPUID, &clDevN);
    CheckFailAndExit(clStat);
    if (clDevN < (cl_uint) clNeedDevCnt) {
        fprintf(stderr, "Need %d GPU devices, found %u\n", clNeedDevCnt, (unsigned) clDevN);
        CheckFailAndExit(CL_DEVICE_NOT_FOUND);
    }
    for (int i = 0; i < clNeedDevCnt; i++) {
        clCtx[i] = clCreateContext(NULL, 1, clGPUID+i, NULL, NULL, &clStat);
        CheckFailAndExit(clStat);
    }
    for (int i = 0; i < clNeedDevCnt; i++) {
        clQue[i] = clCreateCommandQueue(clCtx[i], clGPUID[i], 
                /*CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE*/ 0, &clStat);
        CheckFailAndExit(clStat);
    }
    for (int i = 0; i < clNeedDevCnt; i++) {
        clPrg[i] = clCreateProgramWithSource(clCtx[i], 1, &clSrcPtr, &clSrcLen, &clStat);
        CheckFailAndExit(clStat);
        clStat = clBuildProgram(clPrg[i], 1, clGPUID+i, NULL, NULL, NULL);
        if (clStat != CL_SUCCESS) {
            fprintf(stderr, "Error: Line %u in file %s\n\n", __LINE__, __FILE__);
            // fetch the log of the program/device pair that failed to build
            size_t log_size = 0;
            clGetProgramBuildInfo(clPrg[i], clGPUID[i],
                    CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
            char *program_log = (char *) calloc(log_size+1, sizeof(char));
            if (program_log != NULL) {
                clGetProgramBuildInfo(clPrg[i], clGPUID[i],
                        CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL);
                printf("%s", program_log);
                free(program_log);
            }
            CheckFailAndExit(CL_BUILD_PROGRAM_FAILURE);
        }
        clKrnAdd[i] = clCreateKernel(clPrg[i], "matrixAdd", &clStat);
        CheckFailAndExit(clStat);
        clKrnMul[i] = clCreateKernel(clPrg[i], "matrixMul", &clStat);
        CheckFailAndExit(clStat);
    }
    // -- create all buffers
    cl_mem_flags clInBuffFlag = CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR;
    for (int d = 0; d < clNeedDevCnt; d++) {
        for (int i = 0; i < M; i++) {
            clMemIn[d][i] = clCreateBuffer(clCtx[d], clInBuffFlag, sizeof(uint32_t)*N*N,
                    hostMtx[i], &clStat);
            CheckFailAndExit(clStat);
        }
    }
    for (int d = 0; d < clNeedDevCnt; d++) {
        for (int i = 0; i < MAXMID; i++) {
            clMemMid[d][i] = clCreateBuffer(clCtx[d], CL_MEM_READ_WRITE, 
                    sizeof(uint32_t)*N*N, NULL, &clStat);
            CheckFailAndExit(clStat);
        }
    }
    return 1;
}
/*
 * Enqueue U = L * R (N x N) on device `devIdx`.
 *
 * The kernel waits on the completion events of L and R when they have one,
 * and its own completion event is stored in U->event. The wait list is
 * heap-allocated and kept in U->waitEvents / U->waitEventsN so that it can be
 * freed later together with the node.
 *
 * When neither operand has an event, the wait list is NULL with a count of 0:
 * OpenCL rejects a non-NULL list paired with a zero count, and malloc(0) may
 * return a non-NULL pointer.
 */
void GPUmultiply(int N, Node *U, Node *L, Node *R, int devIdx, clFuncArgs) {
    cl_int clStat;
    size_t globalOffset[] = {0};
    size_t globalSize[] = {N*N};
    size_t localSize[] = {0};
    // largest work-group size (at most 1024) that divides the global size
    for (int i = 1; i <= 1024; i++) {
        if (N*N%i == 0)
            localSize[0] = i;
    }
    // -- set argument to kernel
    clStat = clSetKernelArg(clKrnMul[devIdx], 0, sizeof(cl_mem), &(L->clV));
    CheckFailAndExit(clStat);
    clStat = clSetKernelArg(clKrnMul[devIdx], 1, sizeof(cl_mem), &(R->clV));
    CheckFailAndExit(clStat);
    clStat = clSetKernelArg(clKrnMul[devIdx], 2, sizeof(cl_mem), &(U->clV));
    CheckFailAndExit(clStat);
    // -- find wait events
    int waitN = 0, waitCnt = 0;
    if (L->event)	waitCnt++;
    if (R->event)	waitCnt++;
    cl_event *events = NULL;
    if (waitCnt > 0) {
        events = (cl_event*) malloc(sizeof(cl_event) * waitCnt);
        if (events == NULL)
            CheckFailAndExit(CL_OUT_OF_HOST_MEMORY);
        if (L->event)	events[waitN++] = L->event;
        if (R->event)	events[waitN++]	= R->event;
    }
    U->waitEvents = events, U->waitEventsN = waitCnt;
    // -- execute
    clStat = clEnqueueNDRangeKernel(clQue[devIdx], clKrnMul[devIdx], 1, globalOffset,
            globalSize, localSize, U->waitEventsN, U->waitEvents, &(U->event) );
    CheckFailAndExit(clStat);
}
/*
 * Enqueue U = L + R (element-wise over N*N values) on device `devIdx`.
 *
 * The kernel waits on the completion events of L and R when they have one,
 * and its own completion event is stored in U->event. The wait list is
 * heap-allocated and kept in U->waitEvents / U->waitEventsN so that it can be
 * freed later together with the node.
 *
 * When neither operand has an event, the wait list is NULL with a count of 0:
 * OpenCL rejects a non-NULL list paired with a zero count, and malloc(0) may
 * return a non-NULL pointer.
 */
void GPUadd(int N, Node *U, Node *L, Node *R, int devIdx, clFuncArgs) {
    cl_int clStat;
    size_t globalOffset[] = {0};
    size_t globalSize[] = {N*N};
    size_t localSize[] = {0};
    // largest work-group size (at most 1024) that divides the global size
    for (int i = 1; i <= 1024; i++) {
        if (N*N%i == 0)
            localSize[0] = i;
    }
    // -- set argument to kernel	
    clStat = clSetKernelArg(clKrnAdd[devIdx], 0, sizeof(cl_mem), &(L->clV));
    CheckFailAndExit(clStat);
    clStat = clSetKernelArg(clKrnAdd[devIdx], 1, sizeof(cl_mem), &(R->clV));
    CheckFailAndExit(clStat);
    clStat = clSetKernelArg(clKrnAdd[devIdx], 2, sizeof(cl_mem), &(U->clV));
    CheckFailAndExit(clStat);
    // -- find wait events
    int waitN = 0, waitCnt = 0;
    if (L->event)	waitCnt++;
    if (R->event)	waitCnt++;
    cl_event *events = NULL;
    if (waitCnt > 0) {
        events = (cl_event*) malloc(sizeof(cl_event) * waitCnt);
        if (events == NULL)
            CheckFailAndExit(CL_OUT_OF_HOST_MEMORY);
        if (L->event)	events[waitN++] = L->event;
        if (R->event)	events[waitN++]	= R->event;
    }
    U->waitEvents = events, U->waitEventsN = waitCnt;
    // -- execute
    clStat = clEnqueueNDRangeKernel(clQue[devIdx], clKrnAdd[devIdx], 1, globalOffset,
            globalSize, localSize, U->waitEventsN, U->waitEvents, &(U->event) );
    CheckFailAndExit(clStat);
}
/*
 * Run every queued expression tree on its assigned device.
 *
 * workQue[p][0 .. workQueSz[p]-1] are the trees assigned to device p; one
 * OpenMP thread handles each device. For each tree the nodes are flattened
 * breadth-first, the inner nodes are evaluated from the last one back to the
 * root (so operands are enqueued before the node using them), the root value
 * is read back and its signature stored in resultBuff[root->pid]. All nodes
 * of the tree are freed afterwards. Always returns 1.
 *
 * Changes compared with the previous version:
 *  - the status variable is local to each iteration instead of being shared
 *    by all OpenMP threads, and the read-back status is now checked;
 *  - the read-back uses its own temporary buffer, which is always freed
 *    (previously it leaked when the root was a leaf).
 */
int executeGPU(Node *workQue[][128], int workQueSz[], uint32_t resultBuff[], clFuncArgs) {
    Node* nodes[MAXGPU][128];
    int offset[MAXGPU] = {0};
#pragma omp parallel for
    for (int p = 0; p < clNeedDevCnt; p++) {
        for (int q = 0; q < workQueSz[p]; q++) {
            cl_int clStat;
            // -- flatten binary tree
            offset[p] = 0;
            nodes[p][offset[p]++] = workQue[p][q];
            for (int i = 0; i < offset[p]; i++) {
                Node *u = nodes[p][i];
                if (u->l != NULL)
                    nodes[p][offset[p]++] = u->l;
                if (u->r != NULL)
                    nodes[p][offset[p]++] = u->r;
            }
            // -- execute in order
            int reuseId = 0;
            for (int i = offset[p]-1; i >= 0; i--) {
                Node *u = nodes[p][i];
                if (u->l == NULL)	// is leaf
                    continue;
                u->clV = clMemMid[p][reuseId++];
                if (u->opcode == '*')
                    GPUmultiply(N, u, u->l, u->r, p, clCallFunc);	
                else
                    GPUadd(N, u, u->l, u->r, p, clCallFunc);
            }
            clFlush(clQue[p]);
            clFinish(clQue[p]);
            // -- read the root value back and hash it
            Node *root = nodes[p][0];
            uint32_t *hostResult = (uint32_t *) malloc(sizeof(uint32_t)*N*N);
            if (hostResult == NULL)
                CheckFailAndExit(CL_OUT_OF_HOST_MEMORY);
            int waitN = root->event != NULL;
            clStat = clEnqueueReadBuffer(clQue[p], root->clV, CL_TRUE, 0, 
                    sizeof(uint32_t)*N*N, hostResult, waitN, 
                    waitN ? &(root->event): NULL, NULL);
            CheckFailAndExit(clStat);
            resultBuff[root->pid] = writeMatrixOut(N, hostResult);
            free(hostResult);
            // -- free inner node resources and the nodes themselves
            for (int i = 0; i < offset[p]; i++) {
                Node *u = nodes[p][i];
                if (u->l != NULL && u->event)
                    clReleaseEvent(u->event);
                if (u->l != NULL && u->waitEvents)
                    free(u->waitEvents);
                free(u);
            }
        }
    }
    return 1;
}
/*
 * Read the next whitespace-delimited expression from stdin into the global
 * `expr` buffer. The field width keeps the read inside the 1024-byte buffer
 * (1023 characters plus the terminator).
 * Returns 1 when an expression was read, 0 otherwise.
 */
int readIn() {
    if (scanf("%1023s", expr) != 1)
        return 0;
    return 1;
}
/*
 * qsort comparator over Node* elements: orders by estimated cost `h`,
 * largest first (returns 1 when the first is cheaper, -1 when it is more
 * expensive, 0 on a tie).
 */
int balance_cmp(const void *a, const void *b) {
    const long long costA = (*(Node **) a)->h;
    const long long costB = (*(Node **) b)->h;
    return (costA < costB) - (costA > costB);
}
void onStart(clFuncArgs) {
    int S[64];
    assert(scanf("%d %d", &M, &N) == 2);
    for (int i = 0; i < M; i++)
        assert(scanf("%d", &S[i]) == 1);
#pragma omp parallel for
    for (int p = 0; p < M; p++) {
        uint32_t x = 2, n = N*N;
        uint32_t c = S[p];
        for (int i = 0; i < N; i++) {
            for (int j = 0; j < N; j++) {
                x = (x * x + c + i + j)%n;
                hostMtx[p][i*N+j] = x;
            }
        }
    }
    initAllGPU("matrix-lib.cl", clCallFunc);
    Node *procBuff[128];
    if (scanf("%d", &Q) != 1)
        return ;
    for (int i = 0; i < Q; i++) {
        readIn();
        int expr_len = strlen(expr);
        procBuff[i] = parseExpr(0, expr_len-1, expr, i, clCallFunc);
    }
    /*	
		for (int i = 0; i < Q; i++)
		executeCPU(procBuff[i]);
		return ; 
	 */	
    qsort(procBuff, Q, sizeof(Node*), balance_cmp);
    float gpuSpeed[16] = {1.f, 1.8f, 3.2f};
    long long workload[16] = {};
    int workQueSz[MAXGPU] = {};
    uint32_t resultBuff[128] = {};
    Node *workQue[MAXGPU][128];
    for (int i = 0; i < Q; i++) {
        int mn = 0;
        for (int j = 0; j < clNeedDevCnt; j++) {
            if (workload[j]*gpuSpeed[j] < workload[mn]*gpuSpeed[mn])
                mn = j;
        }
        assignGPU(procBuff[i], mn);
        workload[mn] += procBuff[i]->h;
        workQue[mn][workQueSz[mn]++] = procBuff[i];
    }
    executeGPU(workQue, workQueSz, resultBuff, clCallFunc);
    for (int i = 0; i < Q; i++)
        printf("%u\n", resultBuff[i]);
    destroyGPU(clCallFunc);
}
/*
 * Signal handler: print a farewell line on stdout, release the OpenCL
 * resources and end the process with status 0.
 */
void sigHandler(int signo) {
    (void) signo;   // the signal number is not used
    fputs("God Bless Me\n", stdout);
    destroyGPU(clCallFuncOuter);
    exit(0);
}
/*
 * Entry point: install sigHandler for the catchable signals of interest
 * (reporting on stderr each one that cannot be installed), then run the
 * solver.
 *
 * SIGKILL is intentionally not in the list: it can never be caught, so
 * signal() always failed for it and the error message was printed on every
 * run.
 */
int main(int argc, char *argv[]) {
    const char sigErr[] = "I can't catch signal.\n";
    const int handled[] = {SIGTRAP, SIGSEGV, SIGILL, SIGFPE, SIGINT};
    const int handledCnt = (int) (sizeof(handled) / sizeof(handled[0]));
    (void) argc;
    (void) argv;
    for (int k = 0; k < handledCnt; k++) {
        if (signal(handled[k], sigHandler) == SIG_ERR)
            fputs(sigErr, stderr);
    }
    onStart(clCallFuncOuter);
    return 0;
}

matrix-lib.cl

#define N %d
#define CTYPE unsigned int
#define UNLOOP 8
/* Element-wise sum: the work-item with global id g writes
 * out[g] = in1[g] + in2[g]. */
__kernel void matrixAdd(__global CTYPE *in1,
        __global CTYPE *in2,
        __global CTYPE *out) {
    const int gid = get_global_id(0);
    const CTYPE lhs = in1[gid];
    const CTYPE rhs = in2[gid];
    out[gid] = lhs + rhs;
}
/*
 * Naive N x N product on row-major buffers with stride N: the work-item with
 * global id g computes one output element, the dot product of row g / N of
 * in1 with column g mod N of in2.
 *
 * The column is derived by subtraction rather than with the remainder
 * operator. NOTE(review): this file carries a printf-style placeholder
 * ("#define N %d"); if the host expands it with sprintf, a bare percent sign
 * in the kernel text would be parsed as a conversion -- confirm against the
 * host code.
 */
__kernel void matrixMul(__global CTYPE *in1,
        __global CTYPE *in2,
        __global CTYPE *out) {
    const int gid = get_global_id(0);
    const int row = gid / N;
    const int col = gid - row * N;
    unsigned int acc = 0;
    int k = 0;
    while (k < N) {
        acc += in1[row*N+k] * in2[k*N+col];
        k++;
    }
    out[row*N+col] = acc;
}

Read More +

2016-06-26

學校課程/平行程式

批改娘 10096. Fast Game of Life (OpenCL)

題目描述

生命遊戲中，對於任意細胞，規則如下：
每個細胞有兩種狀態-存活或死亡，每個細胞與以自身為中心的周圍八格細胞產生互動。

當前細胞為存活狀態時，當周圍低於 2 個 (不包含 2 個) 存活細胞時，該細胞變成死亡狀態。
當前細胞為存活狀態時，當周圍有 2 個或 3 個存活細胞時，該細胞保持原樣。
當前細胞為存活狀態時，當周圍超過 3 個 (不包含 3 個) 存活細胞時，該細胞變成死亡狀態。
當前細胞為死亡狀態時，當周圍有 3 個存活細胞時，該細胞變成存活狀態。

可以把最初的細胞結構定義為種子，當所有在種子中的細胞同時被以上規則處理後，可以得到第一代細胞圖。按規則繼續處理當前的細胞圖，可以得到下一代的細胞圖，周而復始。

輸入格式

輸入第一行有兩個整數 $N$, $M$，表示盤面大小為 $N \times N$，模擬週期次數 $M$。接下來會有 $N$ 行，每一行上會有 $N$ 個字符，以 0 表示 $(i, j)$ 格子上的細胞屬於死亡狀態，反之 1 為存活狀態。

$1 \le N \le 2000$
$0 \le M \le 5000$

輸出格式

對於每一組測資輸出 $N$ 行，每一行上有 $N$ 個字元表示模擬 $M$ 次的最終盤面結果。

範例輸入 1

範例輸出 1

範例輸入 2

範例輸出 2

編譯參數

$ gcc -std=c99 -O2 main.c -lOpenCL -fopenmp -o main
$ ./main

備註

2016/05/07 放寬時間限制，請減少 clCreateBuffer 數量並重複使用那些已經建立好的。
2016/05/09 提供測資下載

by Morris

Solution

簡單的模擬題目，平行化只需要套用滾動數組即可。

當我們拚命優化 local memory 存取，卻在替同學 debug 時發現意外地加速，於是新境界到來，順便跟同學交流一下加速部份，甚至連開檔時間都要省！一起追尋神乎其技的感覺非常不賴。

3571 ms (24-core CPU) -> 2567 ms (GPU, partial local memory) -> 2472 ms (GPU, full local memory) -> 1675 ms (GPU, full local memory + work group opt) -> 967 ms (GPU, global memory + I/O opt + embedded kernel code)

partial local memory

#define N %d
#define binN %d
#define CTYPE char
 
/*
 * One Game-of-Life generation, partial local-memory variant.
 *
 * Each work-item owns cell (x, y) of the board stored in IN with row stride
 * binN. The work-group first copies its own cells into a local tile; a
 * neighbour is then read from the tile when it lies inside the work-group and
 * from global memory otherwise. Neighbours outside [0, N) are ignored.
 *
 * The tile is a fixed 16 x 16 array indexed by local id, so the launch must
 * not use work-groups larger than 16 per dimension.
 */
__kernel void simulate(__global CTYPE *IN,
        __global CTYPE *OUT) {
    const int gx = get_global_id(0), gy = get_global_id(1);
    const int lx = get_local_id(0), ly = get_local_id(1);
    const int tile = get_local_size(0);
    __local char cache[16][16];

    const char self = IN[gx * binN + gy];
    cache[lx][ly] = self;

    /* every cell of the tile must be stored before any neighbour is read */
    barrier(CLK_LOCAL_MEM_FENCE);

    int alive = 0;
    for (int ox = -1; ox <= 1; ox++) {
        for (int oy = -1; oy <= 1; oy++) {
            if (ox == 0 && oy == 0)
                continue;           /* the cell itself is not a neighbour */
            const int nx = gx + ox, ny = gy + oy;
            if (nx < 0 || ny < 0 || nx >= N || ny >= N)
                continue;           /* off the board */
            const int cx = lx + ox, cy = ly + oy;
            const int inTile = cx >= 0 && cx < tile && cy >= 0 && cy < tile;
            if (inTile)
                alive += cache[cx][cy];
            else
                alive += IN[nx * binN + ny];
        }
    }

    int next;
    if (self == 0)
        next = (alive == 3);                  /* birth */
    else if (self == 1)
        next = (alive == 2 || alive == 3);    /* survival */
    else
        next = 0;
    OUT[gx * binN + gy] = next;
}

full local memory

#define N %d
#define binN %d
#define localN %d
#define CTYPE char
 
/*
 * Copies the halo cells around a work-group's tile from IN into the local
 * array g. localX/localY are 1-based positions inside the tile, localSz is
 * the tile edge length, and (x, y) is the matching position in IN (row
 * stride binN). Only work-items on the first/last row or column of the tile
 * copy anything; the four corner work-items also copy the diagonal halo cell.
 */
inline void move_border(__local char g[][localN+2], __global CTYPE *IN,
    int localX, int localY, int localSz, int x, int y) {
    const int firstRow = (localX == 1);
    const int lastRow  = (localX == localSz);
    const int firstCol = (localY == 1);
    const int lastCol  = (localY == localSz);

    if (firstRow) {
        g[localX-1][localY] = IN[(x-1) * binN + y];
        if (firstCol)
            g[localX-1][localY-1] = IN[(x-1) * binN + (y-1)];
        if (lastCol)
            g[localX-1][localY+1] = IN[(x-1) * binN + (y+1)];
    }
    if (firstCol)
        g[localX][localY-1] = IN[x * binN + (y-1)];
    if (lastCol)
        g[localX][localY+1] = IN[x * binN + (y+1)];
    if (lastRow) {
        g[localX+1][localY] = IN[(x+1) * binN + y];
        if (firstCol)
            g[localX+1][localY-1] = IN[(x+1) * binN + (y-1)];
        if (lastCol)
            g[localX+1][localY+1] = IN[(x+1) * binN + (y+1)];
    }
}
/*
 * One Game-of-Life generation, full local-memory variant.
 *
 * Global and local ids are shifted by one so that index 0 addresses a border
 * cell. Each work-item stores its own cell in the (localN+2) x (localN+2)
 * local tile, move_border() fills the halo, and after the barrier all eight
 * neighbours are read from local memory only.
 */
__kernel void simulate(__global CTYPE *IN,
        __global CTYPE *OUT) {
    const int gx = get_global_id(0) + 1;
    const int gy = get_global_id(1) + 1;
    const int lx = get_local_id(0) + 1;
    const int ly = get_local_id(1) + 1;
    const int tile = get_local_size(0);

    __local char g[localN+2][localN+2];

    /* own cell first, then the halo around the tile */
    const char self = IN[gx * binN + gy];
    g[lx][ly] = self;
    move_border(g, IN, lx, ly, tile, gx, gy);
    barrier(CLK_LOCAL_MEM_FENCE);

    /* work-items past the board only helped to load the tile */
    if (gx > N || gy > N)
        return ;

    int alive = 0;
    for (int ox = -1; ox <= 1; ox++)
        for (int oy = -1; oy <= 1; oy++)
            if (ox != 0 || oy != 0)
                alive += g[lx + ox][ly + oy];

    const int born = (self == 0) && (alive == 3);
    const int survives = (self == 1) && (alive == 2 || alive == 3);
    OUT[gx * binN + gy] = born || survives;
}

最終優化

#include <stdio.h>
#include <assert.h>
#include <inttypes.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <CL/cl.h>
#include <omp.h>
#define OPENCL_MAXGPU 2
#define KERNEL_CODE_LEN 32767
#define MAXN 2048
#define MAXM 2
char hostMtx[2][MAXN*MAXN];
int N, M, binN;
// -- start working with OpenCL
const int clNeedDevCnt = 1;
#define CheckFailAndExit(status) \
    if (status != CL_SUCCESS) { \
        fprintf(stderr, "Error %d: Line %u in file %s\n", status, __LINE__, __FILE__), \
        destroyGPU(clCtx, clPrg, clKrn, clQue, clMemIn); \
    }
#define clFuncArgs cl_context clCtx[], cl_program clPrg[], cl_kernel clKrn[], \
    cl_command_queue clQue[], cl_mem clMemIn[][MAXM]
#define clCallFunc clCtx, clPrg, clKrn, clQue, clMemIn
#define clCallFuncOuter clCtx, clPrg, clKrn, clQue, clMemIn
void destroyGPU(clFuncArgs) {
    fprintf(stderr, "Starting Cleanup ...\n\n");
    for (int i = 0; i < clNeedDevCnt; i++) {
        for (int j = 0; j < M; j++) { 
            if (clMemIn[i][j])
                clReleaseMemObject(clMemIn[i][j]);
        }
    }
    for (int i = 0; i < clNeedDevCnt; i++) {
        if (clKrn[i])
            clReleaseKernel(clKrn[i]);
        if (clPrg[i])
            clReleaseProgram(clPrg[i]);
    }
    for (int i = 0; i < clNeedDevCnt; i++) {
        if (clQue[i])	
            clReleaseCommandQueue(clQue[i]);
    }
    for (int i = 0; i < clNeedDevCnt; i++) {
        if (clCtx[i])
            clReleaseContext(clCtx[i]);
    }
    exit(0);
}
/*
 * Builds the embedded Game-of-Life kernel and creates all OpenCL objects.
 *
 * The kernel source is a printf-style template: the first %d receives the
 * padded row stride (N+2) and the second the board size N. Because the text
 * goes through snprintf, the kernel's remainder operator is written "%%".
 *
 * fileName is unused (the kernel text is compiled into the binary); the
 * parameter is kept so the call site does not change.
 *
 * On any OpenCL failure CheckFailAndExit reports the error and calls
 * destroyGPU, which exits. Returns 1 on success.
 */
int initAllGPU(char fileName[], clFuncArgs) {
static const char clSrcFormat[] =
"#define N %d\n"
"#define M %d\n"
"#define CTYPE char\n"
"__kernel void simulate(__global CTYPE *IN,\n"
"        __global CTYPE *OUT) {\n"
"    int id = get_global_id(0);\n"
"        int x = id / M+1, y = id %% M +1;\n"
"#define G(x, y) IN[(x) * N + (y)]\n"
"        char t = G(x, y);\n"
"        char adj = G(x-1, y-1) + G(x-1, y) + G(x-1, y+1) + G(x, y-1) + G(x, y+1)\n"
"                        + G(x+1, y-1) + G(x+1, y) + G(x+1, y+1);\n"
"        OUT[x * N + y] = (t == 0 && adj == 3) || (t == 1 && (adj == 2 || adj == 3));\n"
"}";
    static char clSrc[KERNEL_CODE_LEN] = "";
    (void) fileName;
    // -- generate kernel code
    int written = snprintf(clSrc, sizeof(clSrc), clSrcFormat, N+2, N);
    assert(written > 0 && (size_t) written < sizeof(clSrc));
    size_t clSrcLen = strlen(clSrc);
    cl_int                  clStat;
    cl_uint                 clPlatN = 0, clDevN = 0;
    cl_platform_id          clPlatID;
    cl_device_id            clGPUID[OPENCL_MAXGPU];
    const char              *clSrcPtr = clSrc;
    // -- basic OpenCL setup
    // A failed query would otherwise leave clDevN / clGPUID undefined.
    clStat = clGetPlatformIDs(1, &clPlatID, &clPlatN);
    CheckFailAndExit(clStat);
    clStat = clGetDeviceIDs(clPlatID, CL_DEVICE_TYPE_GPU, OPENCL_MAXGPU, clGPUID, &clDevN);
    CheckFailAndExit(clStat);
    assert(clDevN >= clNeedDevCnt);
    for (int i = 0; i < clNeedDevCnt; i++) {
        clCtx[i] = clCreateContext(NULL, 1, clGPUID+i, NULL, NULL, &clStat);
        CheckFailAndExit(clStat);
    }
    for (int i = 0; i < clNeedDevCnt; i++) {
        clQue[i] = clCreateCommandQueue(clCtx[i], clGPUID[i],
                0, &clStat);
        CheckFailAndExit(clStat);
    }
    for (int i = 0; i < clNeedDevCnt; i++) {
        clPrg[i] = clCreateProgramWithSource(clCtx[i], 1, &clSrcPtr, &clSrcLen, &clStat);
        CheckFailAndExit(clStat);
        clStat = clBuildProgram(clPrg[i], 1, clGPUID+i, "-cl-fast-relaxed-math", NULL, NULL);
        if (clStat != CL_SUCCESS) {
            fprintf(stderr, "Error: Line %u in file %s\n\n", __LINE__, __FILE__);
            // Query the log of the program/device pair that actually failed.
            size_t log_size = 0;
            clGetProgramBuildInfo(clPrg[i], clGPUID[i],
                    CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
            char *program_log = (char *) calloc(log_size+1, sizeof(char));
            if (program_log != NULL) {
                clGetProgramBuildInfo(clPrg[i], clGPUID[i],
                        CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL);
                printf("%s", program_log);
                free(program_log);
            }
            CheckFailAndExit(CL_BUILD_PROGRAM_FAILURE);
        }
        clKrn[i] = clCreateKernel(clPrg[i], "simulate", &clStat);
        CheckFailAndExit(clStat);
    }
    // -- create all buffers
    // Two boards per device, swapped every generation; both are seeded from
    // the host copies.
    cl_mem_flags clInBuffFlag = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR;
    for (int d = 0; d < clNeedDevCnt; d++) {
        for (int i = 0; i < 2; i++) {
            clMemIn[d][i] = clCreateBuffer(clCtx[d], clInBuffFlag,
                sizeof(char)*binN*binN, hostMtx[i], &clStat);
            CheckFailAndExit(clStat);
        }
    }
    return 1;
}
/*
 * Runs M generations on device 0 and prints the final board.
 *
 * The two device buffers are used as a rolling pair: each launch reads
 * clMemIn[0][flag] and writes clMemIn[0][!flag], then flag flips. One
 * work-item is launched per board cell (N*N) and the work-group size is left
 * to the runtime. After the last generation the current buffer is read back
 * with a blocking read, the cells are turned into '0'/'1' characters and each
 * row is printed with puts().
 *
 * Returns 1; any OpenCL failure goes through CheckFailAndExit.
 */
int executeGPU(clFuncArgs) {
    cl_int clStat;
    size_t globalOffset[] = {0};
    size_t globalSize[] = {(size_t) N * N};
    int flag = 0;
    for (int it = 0; it < M; it++) {
        // -- set argument to kernel
        clStat = clSetKernelArg(clKrn[0], 0, sizeof(cl_mem), &clMemIn[0][flag]);
        CheckFailAndExit(clStat);
        clStat = clSetKernelArg(clKrn[0], 1, sizeof(cl_mem), &clMemIn[0][!flag]);
        CheckFailAndExit(clStat);
        // -- execute
        clStat = clEnqueueNDRangeKernel(clQue[0], clKrn[0], 1, globalOffset,
                globalSize, 0, 0, NULL, NULL);
        CheckFailAndExit(clStat);
        flag = !flag;
    }
    // -- read back
    clStat = clEnqueueReadBuffer(clQue[0], clMemIn[0][flag], CL_TRUE, 0,
            sizeof(char)*binN*binN, hostMtx[flag], 0, NULL, NULL);
    // A failed read used to be ignored, printing stale host data as the answer.
    CheckFailAndExit(clStat);
    for (int i = 1; i <= N; i++) {
        for (int j = 1; j <= N; j++)
            hostMtx[flag][i*binN+j] += '0';
        // Column N+1 of every row is a border cell that is never written and
        // stays 0, so it doubles as the string terminator for puts().
        puts(hostMtx[flag]+i*binN+1);
    }
    return 1;
}
void onStart(clFuncArgs) {
    assert(scanf("%d %d", &N, &M) == 2);
    while (getchar() != '\n');
    static char str[2048][2048];
    for (int i = 1; i <= N; i++)
        assert(fgets(str[i]+1, 2048, stdin) != NULL);
    binN = N+2;
    for (int i = 1; i <= N; i++) {
        for (int j = 1; j <= N; j++)
            hostMtx[0][i*binN + j] = str[i][j] - '0';
    }	
    initAllGPU("game-of-life.cl", clCallFunc);
    executeGPU(clCallFunc);
    return ;
}
cl_context				clCtx[OPENCL_MAXGPU];
cl_program				clPrg[OPENCL_MAXGPU];
cl_kernel				clKrn[OPENCL_MAXGPU];
cl_command_queue		clQue[OPENCL_MAXGPU];
cl_mem					clMemIn[OPENCL_MAXGPU][MAXM];
/* Signal handler: writes a one-line notice to stdout and then releases the
 * global OpenCL objects through destroyGPU(), which ends the process. */
void sigHandler(int signo) {
    (void) signo;   /* every registered signal is treated the same way */
    fputs("God Bless Me\n", stdout);
    destroyGPU(clCallFuncOuter);
    exit(0);
}
/*
 * Program entry point: installs sigHandler (which releases the OpenCL objects
 * through destroyGPU) for the trap/fault/interrupt signals, then runs the
 * simulation through onStart().
 *
 * SIGKILL is intentionally not registered: it cannot be caught, so
 * signal(SIGKILL, ...) always returns SIG_ERR and only produced a spurious
 * "I can't catch signal." line on every run.
 */
int main(int argc, char *argv[]) {
    static const int caught[] = {SIGTRAP, SIGSEGV, SIGILL, SIGFPE, SIGINT};
    const char sigErr[] = "I can't catch signal.\n";
    (void) argc;
    (void) argv;
    for (size_t i = 0; i < sizeof(caught) / sizeof(caught[0]); i++) {
        /* fputs, not fprintf: the message is data, not a format string */
        if (signal(caught[i], sigHandler) == SIG_ERR)
            fputs(sigErr, stderr);
    }
    onStart(clCallFuncOuter);
    return 0;
}

Read More +

2016-06-26

學校課程/平行程式

批改娘 10095. Matrix Calculator (OpenCL)

題目描述

小明的數學作業要計算方陣，現在請你幫幫他！

題目給定數個 $N \times N$ 的矩陣和 $2$ 小題。

$X = AB+CD$
$Y = ABE+CDF$

sequence.c

#include <stdio.h>
#include <stdint.h>
// #define DEBUG
#define UINT uint32_t
#define MAXN 1024
/* C = A * B for the leading N x N part of the matrices; sums wrap around on
 * unsigned overflow, which is intended. */
void multiply(int N, UINT A[][MAXN], UINT B[][MAXN], UINT C[][MAXN]) {
    for (int row = 0; row < N; row++) {
        const UINT *lhs = A[row];
        UINT *dst = C[row];
        for (int col = 0; col < N; col++) {
            UINT acc = 0;
            int k = 0;
            while (k < N) {
                acc += lhs[k] * B[k][col];
                k++;
            }
            dst[col] = acc;
        }
    }
}
/* C = A + B, element by element, over the leading N x N part. */
void add(int N, UINT A[][MAXN], UINT B[][MAXN], UINT C[][MAXN]) {
    for (int row = 0; row < N; row++) {
        const UINT *a = A[row];
        const UINT *b = B[row];
        UINT *c = C[row];
        for (int col = 0; col < N; col++)
            c[col] = a[col] + b[col];
    }
}
/* Fills the leading N x N part of A in row-major order with the sequence
 * x <- (x*x + c + i + j) mod N*N, starting from x = 2; c is the seed. */
void rand_gen(UINT c, int N, UINT A[][MAXN]) {
    const UINT modulus = N*N;
    UINT state = 2;
    for (int row = 0; row < N; row++) {
        UINT *dst = A[row];
        for (int col = 0; col < N; col++) {
            state = (state * state + c + row + col) % modulus;
            dst[col] = state;
        }
    }
}
/* Debug helper: dumps the leading N x N part of A to stderr, one bracketed
 * row per line. */
void print_matrix(int N, UINT A[][MAXN]) {
    for (int row = 0; row < N; row++) {
        const UINT *src = A[row];
        fputs("[", stderr);
        for (int col = 0; col < N; col++)
            fprintf(stderr, " %u", src[col]);
        fputs(" ]\n", stderr);
    }
}
/* Folds the leading N x N part of A, in row-major order, into a 32-bit hash:
 * h <- (h + a) * 2654435761, truncated to UINT. */
UINT signature(int N, UINT A[][MAXN]) {
    UINT hash = 0;
    for (int row = 0; row < N; row++) {
        const UINT *cur = A[row];
        const UINT *end = cur + N;
        while (cur != end) {
            hash = (hash + *cur) * 2654435761LU;
            cur++;
        }
    }
    return hash;
}
UINT IN[6][MAXN][MAXN], TMP[6][MAXN][MAXN];
int main() {
    int N, S[6];
    scanf("%d", &N);
    for (int i = 0; i < 6; i++) {
        scanf("%d", &S[i]);
        rand_gen(S[i], N, IN[i]);
    }
    // AB
    multiply(N, IN[0], IN[1], TMP[0]);
    // CD
    multiply(N, IN[2], IN[3], TMP[1]);
    // AB+CD
    add(N, TMP[0], TMP[1], TMP[2]);
    printf("%u\n", signature(N, TMP[2]));
    
    // ABE
    multiply(N, TMP[0], IN[4], TMP[3]);
    // CDF
    multiply(N, TMP[1], IN[5], TMP[4]);
    // ABE+CDF
    add(N, TMP[3], TMP[4], TMP[5]);
    printf("%u\n", signature(N, TMP[5]));
    return 0;
}

輸入格式

測資只有一組，第一行會有一個整數 $N$，表示題目給定 $N \times N$ 矩陣，第二行上會有 $6$ 個整數，分別為矩陣 $A, B, C, D, E, F$ 的生成種子。

$1 \le N \le 1024$
$0 \le S_i \le 2^{31}$

輸出格式

輸出兩行 $X$ 和 $Y$ 的雜湊值，可參考 sequence.c 的流程。

範例輸入 1

2
0 1 2 3 4 5

$$A = \begin{bmatrix} 0 & 1\\ 2 & 2 \end{bmatrix}, B = \begin{bmatrix} 1 & 3\\ 3 & 0 \end{bmatrix}, C = \begin{bmatrix} 2 & 3\\ 0 & 0 \end{bmatrix}, D = \begin{bmatrix} 3 & 1\\ 1 & 2 \end{bmatrix}, E = \begin{bmatrix} 0 & 1\\ 2 & 2 \end{bmatrix}, F = \begin{bmatrix} 1 & 3\\ 3 & 0 \end{bmatrix}$$ $$AB = \begin{bmatrix} 3 & 0\\ 8 & 6 \end{bmatrix}, CD = \begin{bmatrix} 9 & 8\\ 0 & 0 \end{bmatrix}, AB+CD = \begin{bmatrix} 12 & 8\\ 8 & 6 \end{bmatrix}\\ ABE = \begin{bmatrix} 0 & 3\\ 12 & 20 \end{bmatrix}, CDF = \begin{bmatrix} 33 & 27\\ 0 & 0 \end{bmatrix}, ABE+CDF = \begin{bmatrix} 33 & 30\\ 12 & 20 \end{bmatrix}$$

範例輸出 1

2385860290
1374821695

範例輸入 2

10
0 1 2 3 4 5

範例輸出 2

617438354
1897844131

編譯參數

$ gcc -std=c99 -O2 main.c -lm -lOpenCL -fopenmp
$ ./main

Solution

這一題用來設計多個 device 共同合作計算一個矩陣表達式，通常會有兩個面向 fine-grain 或者是 coarse-grain，從 fine-grain 角度看來，只需要針對矩陣劃分成數個區塊，例如 device 0 計算 [0, B], device 1 計算 [B+1, 2B] 等方法。而 coarse-grain 則看起來會像是直接從表達式那裡拆分，有可能會重複計算相同的計算值，這裡就不特別消除。

雖然 OpenCL 提供多個 device 共同合作的平台，藉由 context 建立 buffer，但是他們傳輸還是得透過 CPU 控制，沒辦法直接存取另一個 GPU 的 global memory，但寫起來方便許多。

coarse grain 版本

這個版本會針對計算能力做 scheduling，兩個表達式 $X\;, Y$ 分別拆到兩個裝置上運行，重複計算就不理會。將計算量大的表達式丟到較高運算能力的 GPU 上執行。

main.c

#include <stdio.h>
#include <assert.h>
#include <inttypes.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <CL/cl.h>
#include <omp.h>
#define MAXGPU 2
#define MAXN 1024
#define MAXM 26
#define GPULOCAL 64
#define MAXMID 20
uint32_t	hostMtx[MAXM][MAXN*MAXN];
int N, binN, M, Q;
char expr[1024];
char clSrcFormat[32767] = ""; 
char clSrc[32767] = "";
// -- start working with OpenCL
const int clNeedDevCnt = 2;
cl_context				clCtx[2];
cl_program				clPrg[2];
cl_kernel				clKrnAdd[2], clKrnMul[2];
cl_command_queue		clQue[2];
cl_mem					clMemIn[2][MAXM], clMemMid[2][MAXM*2];
/* One node of a parsed matrix expression tree (built by parseExpr). */
typedef struct Node {
    struct Node *l, *r;             /* operands; both NULL for a leaf (a single input matrix) */
    int opcode;                     /* '+' or '*' for inner nodes; left at 0 for leaves */
    uint32_t *hostV;                /* host-side matrix data for this node */
    cl_mem	clV;                    /* device buffer holding this node's value */
    cl_event event, *waitEvents;    /* event of the kernel producing this node / malloc'd list of operand events */
    int waitEventsN;                /* number of entries in waitEvents */
    int pid, mid;                   /* pid: index of the expression this node belongs to; mid: input matrix index (leaves) */
    long long h;                    /* cost estimate: sum over the subtree of N per '+' and N*N per '*' */
} Node;
#define CheckFailAndExit(status) \
    if (status != CL_SUCCESS) { \
        fprintf(stderr, "Error %d: Line %u in file %s\n\n", status, __LINE__, __FILE__), \
        destroyGPU(clCtx, clPrg, clKrnAdd, clKrnMul, clQue, clMemIn); \
    }
#define clFuncArgs cl_context clCtx[], cl_program clPrg[], cl_kernel clKrnAdd[], \
    cl_kernel clKrnMul[], cl_command_queue clQue[], cl_mem clMemIn[][MAXM]
#define clCallFunc clCtx, clPrg, clKrnAdd, clKrnMul, clQue, clMemIn
#define clCallFuncOuter clCtx, clPrg, clKrnAdd, clKrnMul, clQue, clMemIn
/* Binds every leaf below u to device gpuIdx: the leaf's host pointer is set
 * to its input matrix and its device buffer to that matrix's copy on the
 * chosen device. Inner nodes are only traversed. */
void assignGPU(Node *u, int gpuIdx) {
    if (u == NULL)
        return ;
    if (u->l != NULL) {
        /* inner node: descend into both operands */
        assignGPU(u->l, gpuIdx);
        assignGPU(u->r, gpuIdx);
        return ;
    }
    const int matrix = u->mid;
    u->hostV = hostMtx[matrix];
    u->clV = clMemIn[gpuIdx][matrix];
}
/*
 * Recursively parses expr[l..r] (inclusive) into an expression tree.
 *
 * Grammar handled here: single upper-case letters name input matrices, '+' is
 * addition, juxtaposition is multiplication, and parentheses group.
 * Every node is tagged with procId; the returned tree is heap-allocated.
 *
 * Splitting order:
 *   1. the leftmost '+' outside all parentheses,
 *   2. otherwise the leftmost position (not the first character) outside all
 *      parentheses where a '(' or a letter starts a new factor,
 *   3. otherwise the range is assumed to be wrapped in one pair of
 *      parentheses, which is stripped before parsing again.
 *
 * NOTE(review): nothing here validates the input; a malformed expression is
 * not detected -- confirm callers only pass well-formed strings.
 */
Node* parseExpr(int l, int r, char expr[], int procId, clFuncArgs) {
    cl_int clStat;
    Node *u = (Node *) calloc(1, sizeof(Node));
    u->pid = procId;
    // Single character: a leaf referring to input matrix 'A' + idx.
    if (l == r) {
        int idx = expr[l] - 'A';
        u->hostV = hostMtx[idx];
        u->mid = idx;
        u->h = 0;
        return u;
    }
    // cnt tracks the parenthesis depth while scanning.
    int cnt = 0;
    // Pass 1: split at the first top-level '+'.
    for (int i = l; i <= r; i++) {
        if (expr[i] == '(') {
            cnt++;
        } else if (expr[i] == ')') {
            cnt--;
        } else if (expr[i] == '+' && cnt == 0) {
            u->l = parseExpr(l, i-1, expr, procId, clCallFunc);
            u->r = parseExpr(i+1, r, expr, procId, clCallFunc);
            u->opcode = '+';
            // an addition adds N to the cost estimate
            u->h = u->l->h + u->r->h + N;
            return u;
        }
    }
    // Pass 2: no top-level '+', so look for an implicit multiplication.
    // cnt is not reset here; it is back at 0 when the parentheses balance.
    for (int i = l; i <= r; i++) {
        if (expr[i] == '(') {
            // a top-level '(' that is not the first character starts the right factor
            if (cnt == 0 && i != l) {
                u->l = parseExpr(l, i-1, expr, procId, clCallFunc);
                u->r = parseExpr(i, r, expr, procId, clCallFunc);
                u->opcode = '*';
                // a multiplication adds N*N to the cost estimate
                u->h = u->l->h + u->r->h + N*N;
                return u;
            }
            cnt++;
        } else if (expr[i] == ')') {
            cnt--;
        } else if (expr[i] >= 'A' && expr[i] <= 'Z' && cnt == 0 && i != l) {
            // a top-level letter that is not the first character starts the right factor
            u->l = parseExpr(l, i-1, expr, procId, clCallFunc);
            u->r = parseExpr(i, r, expr, procId, clCallFunc);
            u->opcode = '*';
            u->h = u->l->h + u->r->h + N*N;
            return u;
        }
    }
    // No split point found: drop the first and last character (the enclosing
    // parentheses) and parse what is inside.
    free(u);
    return parseExpr(l+1, r-1, expr, procId, clCallFunc);
}
/* Hashes the leading N x N part of A (row-major, row stride binN) with
 * h <- (h + a) * 2654435761, truncated to 32 bits, and returns the hash. */
uint32_t writeMatrixOut(int N, uint32_t *A) {
    uint32_t hash = 0;
    for (int row = 0; row < N; row++) {
        const uint32_t *cur = A + row*binN;
        for (int col = 0; col < N; col++)
            hash = (hash + cur[col]) * 2654435761LU;
    }
    return hash;
}
/*
 * Releases all OpenCL objects of every device in use -- input buffers,
 * intermediate buffers (clMemMid), kernels, programs, queues and contexts, in
 * that order, skipping null handles -- and then ends the process with
 * exit(0). Never returns.
 */
void destroyGPU(clFuncArgs) {
    int dev, slot;
    fprintf(stderr, "Starting Cleanup ...\n\n");
    /* input matrices: M buffers per device */
    for (dev = 0; dev < clNeedDevCnt; dev++)
        for (slot = 0; slot < M; slot++)
            if (clMemIn[dev][slot] != NULL)
                clReleaseMemObject(clMemIn[dev][slot]);
    /* intermediate results: MAXMID buffers per device */
    for (dev = 0; dev < clNeedDevCnt; dev++)
        for (slot = 0; slot < MAXMID; slot++)
            if (clMemMid[dev][slot] != NULL)
                clReleaseMemObject(clMemMid[dev][slot]);
    for (dev = 0; dev < clNeedDevCnt; dev++) {
        if (clKrnAdd[dev] != NULL)
            clReleaseKernel(clKrnAdd[dev]);
        if (clKrnMul[dev] != NULL)
            clReleaseKernel(clKrnMul[dev]);
        if (clPrg[dev] != NULL)
            clReleaseProgram(clPrg[dev]);
    }
    for (dev = 0; dev < clNeedDevCnt; dev++)
        if (clQue[dev] != NULL)
            clReleaseCommandQueue(clQue[dev]);
    for (dev = 0; dev < clNeedDevCnt; dev++)
        if (clCtx[dev] != NULL)
            clReleaseContext(clCtx[dev]);
    exit(0);
}
/*
 * Loads the kernel template from fileName, substitutes the padded matrix size
 * binN for its single %d placeholder, and creates one context, queue, program
 * and kernel pair per device, plus all input and intermediate buffers.
 *
 * The template is used as a printf format, so any literal percent sign in the
 * .cl file must be written as "%%".
 *
 * Failures are reported on stderr and end in destroyGPU (which exits).
 * Returns 1 on success.
 */
int initAllGPU(char fileName[], clFuncArgs) {
    // -- generate kernel code
    // The I/O calls are kept outside assert() so they still run under NDEBUG.
    FILE *codefin = fopen(fileName, "r");
    assert(codefin != NULL);
    if (codefin == NULL) {
        fprintf(stderr, "Cannot open kernel source %s\n", fileName);
        destroyGPU(clCallFunc);
    }
    size_t fmtLen = fread(clSrcFormat, 1, sizeof(clSrcFormat), codefin);
    fclose(codefin);
    assert(fmtLen < sizeof(clSrcFormat));
    if (fmtLen >= sizeof(clSrcFormat)) {
        fprintf(stderr, "Kernel source %s is too large\n", fileName);
        destroyGPU(clCallFunc);
    }
    clSrcFormat[fmtLen] = '\0';     // fread does not terminate the buffer
    int written = snprintf(clSrc, sizeof(clSrc), clSrcFormat, binN);
    assert(written > 0 && (size_t) written < sizeof(clSrc));
    size_t clSrcLen = strlen(clSrc);
    cl_int                  clStat;
    cl_uint                 clPlatN = 0, clDevN = 0;
    cl_platform_id          clPlatID;
    cl_device_id            clGPUID[MAXGPU];
    const char              *clSrcPtr = clSrc;
    // -- basic OpenCL setup
    // A failed query would otherwise leave clDevN / clGPUID undefined.
    clStat = clGetPlatformIDs(1, &clPlatID, &clPlatN);
    CheckFailAndExit(clStat);
    clStat = clGetDeviceIDs(clPlatID, CL_DEVICE_TYPE_GPU, MAXGPU, clGPUID, &clDevN);
    CheckFailAndExit(clStat);
    assert(clDevN >= clNeedDevCnt);
    for (int i = 0; i < clNeedDevCnt; i++) {
        clCtx[i] = clCreateContext(NULL, 1, clGPUID+i, NULL, NULL, &clStat);
        CheckFailAndExit(clStat);
    }
    for (int i = 0; i < clNeedDevCnt; i++) {
        clQue[i] = clCreateCommandQueue(clCtx[i], clGPUID[i],
                CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &clStat);
        CheckFailAndExit(clStat);
    }
    for (int i = 0; i < clNeedDevCnt; i++) {
        clPrg[i] = clCreateProgramWithSource(clCtx[i], 1, &clSrcPtr, &clSrcLen, &clStat);
        CheckFailAndExit(clStat);
        clStat = clBuildProgram(clPrg[i], 1, clGPUID+i, NULL, NULL, NULL);
        if (clStat != CL_SUCCESS) {
            fprintf(stderr, "Error: Line %u in file %s\n\n", __LINE__, __FILE__);
            // Query the program/device pair that failed; the log of device 0
            // says nothing about a build failure on device 1.
            size_t log_size = 0;
            clGetProgramBuildInfo(clPrg[i], clGPUID[i],
                    CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
            char *program_log = (char *) calloc(log_size+1, sizeof(char));
            if (program_log != NULL) {
                clGetProgramBuildInfo(clPrg[i], clGPUID[i],
                        CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL);
                printf("%s", program_log);
                free(program_log);
            }
            CheckFailAndExit(CL_BUILD_PROGRAM_FAILURE);
        }
        clKrnAdd[i] = clCreateKernel(clPrg[i], "matrixAdd", &clStat);
        CheckFailAndExit(clStat);
        clKrnMul[i] = clCreateKernel(clPrg[i], "matrixMul", &clStat);
        CheckFailAndExit(clStat);
    }
    // -- create all buffers
    // Every device gets its own read-only copy of all M input matrices ...
    cl_mem_flags clInBuffFlag = CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR;
    for (int d = 0; d < clNeedDevCnt; d++) {
        for (int i = 0; i < M; i++) {
            clMemIn[d][i] = clCreateBuffer(clCtx[d], clInBuffFlag, sizeof(uint32_t)*binN*binN,
                    hostMtx[i], &clStat);
            CheckFailAndExit(clStat);
        }
    }
    // ... and MAXMID scratch buffers for intermediate results.
    for (int d = 0; d < clNeedDevCnt; d++) {
        for (int i = 0; i < MAXMID; i++) {
            clMemMid[d][i] = clCreateBuffer(clCtx[d], CL_MEM_READ_WRITE,
                    sizeof(uint32_t)*binN*binN, NULL, &clStat);
            CheckFailAndExit(clStat);
        }
    }
    return 1;
}
/*
 * Enqueues U = L * R on device devIdx.
 *
 * The kernel is launched with binN work-items in groups of GPULOCAL. The
 * launch waits on the events of those operands that are themselves kernel
 * results; the wait list is stored in U->waitEvents and the launch's own
 * event in U->event. Nothing here releases them.
 *
 * The N parameter is not used.
 */
void GPUmultiply(int N, Node *U, Node *L, Node *R, int devIdx, clFuncArgs) {
    cl_int clStat;
    size_t globalOffset[] = {0};
    size_t globalSize[] = {binN};
    size_t localSize[] = {GPULOCAL};
    // -- set argument to kernel
    clStat = clSetKernelArg(clKrnMul[devIdx], 0, sizeof(cl_mem), &(L->clV));
    CheckFailAndExit(clStat);
    clStat = clSetKernelArg(clKrnMul[devIdx], 1, sizeof(cl_mem), &(R->clV));
    CheckFailAndExit(clStat);
    clStat = clSetKernelArg(clKrnMul[devIdx], 2, sizeof(cl_mem), &(U->clV));
    CheckFailAndExit(clStat);
    // -- find wait events
    int waitN = 0, waitCnt = 0;
    if (L->event)	waitCnt++;
    if (R->event)	waitCnt++;
    // OpenCL requires a NULL wait list when the count is 0, and malloc(0) may
    // return a non-NULL pointer, so only allocate when there is something to
    // wait for.
    cl_event *events = NULL;
    if (waitCnt > 0) {
        events = (cl_event*) malloc(sizeof(cl_event) * waitCnt);
        if (events == NULL) {
            CheckFailAndExit(CL_OUT_OF_HOST_MEMORY);
        }
        if (L->event)	events[waitN++] = L->event;
        if (R->event)	events[waitN++] = R->event;
    }
    U->waitEvents = events, U->waitEventsN = waitCnt;
    // -- execute
    clStat = clEnqueueNDRangeKernel(clQue[devIdx], clKrnMul[devIdx], 1, globalOffset,
            globalSize, localSize, U->waitEventsN, U->waitEvents, &(U->event) );
    CheckFailAndExit(clStat);
}
/*
 * Enqueues U = L + R on device devIdx, one work-item per element of the
 * padded binN x binN matrix.
 *
 * The launch waits on the events of those operands that are themselves kernel
 * results; the wait list is stored in U->waitEvents and the launch's own
 * event in U->event. Nothing here releases them.
 *
 * The N parameter is not used.
 */
void GPUadd(int N, Node *U, Node *L, Node *R, int devIdx, clFuncArgs) {
    cl_int clStat;
    size_t globalOffset[] = {0};
    size_t globalSize[] = {(size_t) binN * binN};
    // -- set argument to kernel
    clStat = clSetKernelArg(clKrnAdd[devIdx], 0, sizeof(cl_mem), &(L->clV));
    CheckFailAndExit(clStat);
    clStat = clSetKernelArg(clKrnAdd[devIdx], 1, sizeof(cl_mem), &(R->clV));
    CheckFailAndExit(clStat);
    clStat = clSetKernelArg(clKrnAdd[devIdx], 2, sizeof(cl_mem), &(U->clV));
    CheckFailAndExit(clStat);
    // -- find wait events
    int waitN = 0, waitCnt = 0;
    if (L->event)	waitCnt++;
    if (R->event)	waitCnt++;
    // OpenCL requires a NULL wait list when the count is 0, and malloc(0) may
    // return a non-NULL pointer, so only allocate when there is something to
    // wait for.
    cl_event *events = NULL;
    if (waitCnt > 0) {
        events = (cl_event*) malloc(sizeof(cl_event) * waitCnt);
        if (events == NULL) {
            CheckFailAndExit(CL_OUT_OF_HOST_MEMORY);
        }
        if (L->event)	events[waitN++] = L->event;
        if (R->event)	events[waitN++] = R->event;
    }
    U->waitEvents = events, U->waitEventsN = waitCnt;
    // -- execute
    // The work-group size is left to the runtime (NULL): the add kernel uses
    // no barriers or local memory, and the former size of 1 forced one
    // work-item per work-group.
    clStat = clEnqueueNDRangeKernel(clQue[devIdx], clKrnAdd[devIdx], 1, globalOffset,
            globalSize, NULL, U->waitEventsN, U->waitEvents, &(U->event) );
    CheckFailAndExit(clStat);
}
/*
 * Evaluates every queued expression tree, one OpenMP iteration per device.
 *
 * For each tree in workQue[p]:
 *   1. flatten it breadth-first into nodes[p][], so that walking the array
 *      backwards visits operands before the node that uses them;
 *   2. enqueue one add/multiply kernel per inner node, giving each a scratch
 *      buffer from clMemMid[p];
 *   3. wait for the queue, read the root's buffer back, hash it with
 *      writeMatrixOut and store the hash in resultBuff[root->pid];
 *   4. free the whole tree (host result, events, wait lists, nodes).
 *
 * Returns 1. OpenCL failures go through CheckFailAndExit.
 *
 * NOTE(review): nothing bounds a tree to 128 nodes or MAXMID inner nodes --
 * confirm the expressions in use stay below both limits.
 */
int executeGPU(Node *workQue[][128], int workQueSz[], uint32_t resultBuff[], clFuncArgs) {
    Node* nodes[2][128];
    int offset[2] = {};
    #pragma omp parallel for
    for (int p = 0; p < clNeedDevCnt; p++) {
        // Declared inside the loop so that every thread has its own status;
        // a single shared variable was written by all threads concurrently.
        cl_int clStat;
        for (int q = 0; q < workQueSz[p]; q++) {
            // -- flatten binary tree
            offset[p] = 0;
            nodes[p][offset[p]++] = workQue[p][q];
            for (int i = 0; i < offset[p]; i++) {
                Node *u = nodes[p][i];
                if (u->l != NULL)
                    nodes[p][offset[p]++] = u->l;
                if (u->r != NULL)
                    nodes[p][offset[p]++] = u->r;
            }
            // -- execute in order
            int reuseId = 0;
            for (int i = offset[p]-1; i >= 0; i--) {
                Node *u = nodes[p][i];
                if (u->l == NULL)	// is leaf
                    continue;
                u->clV = clMemMid[p][reuseId++];
                if (u->opcode == '*')
                    GPUmultiply(N, u, u->l, u->r, p, clCallFunc);
                else
                    GPUadd(N, u, u->l, u->r, p, clCallFunc);
            }
            clFlush(clQue[p]);
            clFinish(clQue[p]);
            // -- read the root back and hash it
            Node *root = nodes[p][0];
            root->hostV = (uint32_t *) malloc(sizeof(uint32_t)*binN*binN);
            if (root->hostV == NULL) {
                CheckFailAndExit(CL_OUT_OF_HOST_MEMORY);
            }
            int waitN = root->event != NULL;
            clStat = clEnqueueReadBuffer(clQue[p], root->clV, CL_TRUE, 0,
                    sizeof(uint32_t)*binN*binN, root->hostV, waitN,
                    waitN ? &(root->event): NULL, NULL);
            // A failed read used to be ignored, hashing uninitialised memory.
            CheckFailAndExit(clStat);
            uint32_t ret = writeMatrixOut(N, root->hostV);
            resultBuff[root->pid] = ret;
            // -- free inner node buffer
            for (int i = 0; i < offset[p]; i++) {
                Node *u = nodes[p][i];
                if (u->l != NULL && u->hostV)
                    free(u->hostV);
                if (u->l != NULL && u->event)
                    clReleaseEvent(u->event);
                if (u->l != NULL && u->waitEvents)
                    free(u->waitEvents);
                free(u);
            }
        }
    }
    return 1;
}
/* Host reference: C = A * B over the leading N x N part of flat row-major
 * matrices with row stride binN; output rows are split across OpenMP threads. */
void CPUmultiply(int N, uint32_t *A, uint32_t *B, uint32_t *C) {
#pragma omp parallel for
    for (int row = 0; row < N; row++) {
        const uint32_t *lhs = A + row*binN;
        uint32_t *dst = C + row*binN;
        for (int col = 0; col < N; col++) {
            uint32_t acc = 0;
            for (int k = 0; k < N; k++)
                acc += lhs[k] * B[k*binN+col];
            dst[col] = acc;
        }
    }
}
/* Host reference: C = A + B over the leading N x N part of flat row-major
 * matrices with row stride binN. */
void CPUadd(int N, uint32_t *A, uint32_t *B, uint32_t *C) {
    for (int row = 0; row < N; row++) {
        const int base = row*binN;
        for (int col = 0; col < N; col++) {
            const int at = base + col;
            C[at] = A[at] + B[at];
        }
    }
}
/*
 * Host-only evaluation of one expression tree, kept as a debugging
 * alternative to the GPU path.
 *
 * The tree is flattened breadth-first and evaluated back to front, so both
 * operands of a node are ready before the node itself. Intermediate results
 * are freed as soon as their parent has consumed them. The hash of the root
 * matrix is printed on stdout and the whole tree is freed.
 *
 * Returns 1 (the function is declared int but used to fall off its end).
 */
int executeCPU(Node *root) {
    // -- flatten binary tree
    Node* nodes[128];
    int offset = 0;
    nodes[offset++] = root;
    for (int i = 0; i < offset; i++) {
        Node *u = nodes[i];
        if (u->l != NULL)
            nodes[offset++] = u->l;
        if (u->r != NULL)
            nodes[offset++] = u->r;
    }
    for (int i = offset-1; i >= 0; i--) {
        Node *u = nodes[i];
        if (u->l == NULL)	// is leaf
            continue;
        u->hostV = (uint32_t *) calloc(1, sizeof(uint32_t)*binN*binN);
        if (u->opcode == '*')
            CPUmultiply(N, u->l->hostV, u->r->hostV, u->hostV);
        else
            CPUadd(N, u->l->hostV, u->r->hostV, u->hostV);
        // -- free inner node buffer
        // (leaf operands point into hostMtx and must not be freed)
        if (u->l->l != NULL)
            free(u->l->hostV), u->l->hostV = NULL;
        if (u->r->l != NULL)
            free(u->r->hostV), u->r->hostV = NULL;
    }
    /*
	   for (int k = 0; k < M; k++) {
	   printf("=== Matrix %c ===\n", k + 'A');
	   for (int i = 0; i < N; i++) {
	   for (int j = 0; j < N; j++)
	   printf("%u ", hostMtx[k][i*N+j]);
	   puts("");
	   }
	   }
	*/
/*	puts("=== final");
	for (int i = 0; i < N; i++) {
		for (int j = 0; j < N; j++)
			printf("%u ", nodes[0]->hostV[i*binN+j]);
		puts("");
	}
*/
    uint32_t ret = writeMatrixOut(N, nodes[0]->hostV);
    printf("%u\n", ret);
    for (int i = 0; i < offset; i++) {
        Node *u = nodes[i];
        if (u->l != NULL && u->hostV)
            free(u->hostV);
        free(u);
    }
    return 1;
}
/*
 * Reads the next whitespace-delimited token from stdin into the global expr
 * buffer. Returns 1 when a token was read and 0 otherwise.
 *
 * The field width keeps the token within expr[1024] (1023 characters plus the
 * terminator); the unbounded "%s" could write past the buffer.
 */
int readIn() {
    if (scanf("%1023s", expr) != 1)
        return 0;
    return 1;
}
/* qsort comparator over Node* elements: orders by the cost estimate h,
 * largest first. */
int balance_cmp(const void *a, const void *b) {
    const long long ha = (*(Node **) a)->h;
    const long long hb = (*(Node **) b)->h;
    /* 1 when a is cheaper, -1 when a is more expensive, 0 on a tie */
    return (ha < hb) - (ha > hb);
}
void onStart(clFuncArgs) {
    int S[64];
    M = 6;
    assert(scanf("%d", &N) == 1);
    binN = N;
    while (binN % GPULOCAL)
        binN++;
    for (int i = 0; i < M; i++)
        assert(scanf("%d", &S[i]) == 1);
#pragma omp parallel for
    for (int p = 0; p < M; p++) {
        uint32_t x = 2, n = N*N;
        memset(hostMtx[p], 0, sizeof(uint32_t)*binN*binN);
        for (int i = 0; i < N; i++) {
            for (int j = 0; j < N; j++) {
                x = (x * x + S[p] + i + j)%n;
                hostMtx[p][i*binN+j] = x;
            }
        }
    }
    initAllGPU("matrix-lib.cl", clCallFunc);
    Node *procBuff[128];
    Q = 2;
    for (int i = 0; i < Q; i++) {
        if (i == 0)	strcpy(expr, "AB+CD");
        else		strcpy(expr, "ABE+CDF");
        int expr_len = strlen(expr);
        procBuff[i] = parseExpr(0, expr_len-1, expr, i, clCallFunc);
    }
    /*
	   for (int i = 0; i < Q; i++)
	   executeCPU(procBuff[i]);
	   return ; 
	 */
    qsort(procBuff, Q, sizeof(Node*), balance_cmp);
    long long workload[16] = {};
    int workQueSz[2] = {};
    uint32_t resultBuff[128];
    Node *workQue[2][128];
    for (int i = 0; i < Q; i++) {
        int mn = 0;
        for (int j = 0; j < clNeedDevCnt; j++) {
            if (workload[j] < workload[mn])
                mn = j;
        }
        assignGPU(procBuff[i], mn);
        workload[mn] += procBuff[i]->h;
        workQue[mn][workQueSz[mn]++] = procBuff[i];
    }
    executeGPU(workQue, workQueSz, resultBuff, clCallFunc);
    for (int i = 0; i < Q; i++)
        printf("%u\n", resultBuff[i]);
    destroyGPU(clCallFunc);
}
/* Signal handler: emits a one-line notice on stdout, then lets destroyGPU()
 * release the global OpenCL objects and end the process. */
void sigHandler(int signo) {
    (void) signo;   /* all registered signals share this handler */
    fputs("God Bless Me\n", stdout);
    destroyGPU(clCallFuncOuter);
    exit(0);
}
/*
 * Program entry point: installs sigHandler (which releases the OpenCL objects
 * through destroyGPU) for the trap/fault/interrupt signals, then runs the
 * computation through onStart().
 *
 * SIGKILL is intentionally not registered: it cannot be caught, so
 * signal(SIGKILL, ...) always returns SIG_ERR and only produced a spurious
 * "I can't catch signal." line on every run.
 */
int main(int argc, char *argv[]) {
    static const int caught[] = {SIGTRAP, SIGSEGV, SIGILL, SIGFPE, SIGINT};
    const char sigErr[] = "I can't catch signal.\n";
    (void) argc;
    (void) argv;
    for (size_t i = 0; i < sizeof(caught) / sizeof(caught[0]); i++) {
        /* fputs, not fprintf: the message is data, not a format string */
        if (signal(caught[i], sigHandler) == SIG_ERR)
            fputs(sigErr, stderr);
    }
    onStart(clCallFuncOuter);
    return 0;
}

matrix-lib.cl

#define N %d
#define CTYPE unsigned int
#define UNLOOP 8
/* Element-wise sum: the work-item with global id g writes
 * out[g] = in1[g] + in2[g]. */
__kernel void matrixAdd(__global CTYPE *in1,
        __global CTYPE *in2,
        __global CTYPE *out) {
    const int gid = get_global_id(0);
    const CTYPE lhs = in1[gid];
    const CTYPE rhs = in2[gid];
    out[gid] = lhs + rhs;
}
/*
 * out = in1 * in2 for N x N row-major matrices, one work-item per output row.
 *
 * Each work-item first copies its row of in1 into the private array rbuf.
 * For every output column the work-group cooperatively loads that column of
 * in2 into the local array cbuf, and each work-item then forms the dot
 * product of its row with the shared column.
 *
 * Two barriers per column are required: one after the cooperative load, so
 * the column is complete before anyone reads it, and one after the dot
 * product, so no work-item starts overwriting cbuf with the next column while
 * another is still reading the current one.
 */
__kernel void matrixMul(__global CTYPE *in1,
        __global CTYPE *in2,
        __global CTYPE *out) {
    CTYPE rbuf[N];
    int r = get_global_id(0);
    int localID = get_local_id(0);
    int localSz = get_local_size(0);
    __local CTYPE cbuf[N];
    for (int i = 0; i < N; i++)
        rbuf[i] = in1[r * N + i];
    for (int c = 0; c < N; c++) {
        for (int cr = localID; cr < N; cr += localSz)
            cbuf[cr] = in2[cr * N + c];
        barrier(CLK_LOCAL_MEM_FENCE);
        CTYPE sum = 0;
        int k = 0;
        for (; k+UNLOOP-1 < N; k += UNLOOP) {
            sum += rbuf[k+0] * cbuf[k+0];
            sum += rbuf[k+1] * cbuf[k+1];
            sum += rbuf[k+2] * cbuf[k+2];
            sum += rbuf[k+3] * cbuf[k+3];
            sum += rbuf[k+4] * cbuf[k+4];
            sum += rbuf[k+5] * cbuf[k+5];
            sum += rbuf[k+6] * cbuf[k+6];
            sum += rbuf[k+7] * cbuf[k+7];
        }
        /* tail for an N that is not a multiple of UNLOOP (previously dropped) */
        for (; k < N; k++)
            sum += rbuf[k] * cbuf[k];
        out[r * N + c] = sum;
        /* everyone is done with this column before cbuf is reused */
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}

作弊版本

單一個 device 完成，因為 create context 的 overhead 過大，倒不如直接用一個最好的 device 完成所有計算。

main.c

#include <stdio.h>
#include <assert.h>
#include <inttypes.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <CL/cl.h>
#include <omp.h>
#define MAXGPU 1
#define MAXN 1024
#define MAXM 6
#define GPULOCAL 32
#define MAXMID 8
uint32_t	hostMtx[MAXM][MAXN*MAXN];
uint32_t	hostX[MAXN*MAXN], hostY[MAXN*MAXN];
int N, M, Q;
char clSrcFormat[32767] = ""; 
char clSrc[32767] = "";
// -- start working with OpenCL
const int clNeedDevCnt = 1;
cl_context				clCtx[2];
cl_program				clPrg[2];
cl_kernel				clKrnAdd[2], clKrnMul[2];
cl_command_queue		clQue[2];
cl_mem					clMemIn[2][MAXM], clMemMid[2][MAXMID];
#define CheckFailAndExit(status) \
    if (status != CL_SUCCESS) { \
        fprintf(stderr, "Error %d: Line %u in file %s\n\n", status, __LINE__, __FILE__), \
        destroyGPU(clCtx, clPrg, clKrnAdd, clKrnMul, clQue, clMemIn); \
    }
#define clFuncArgs cl_context clCtx[], cl_program clPrg[], cl_kernel clKrnAdd[], \
    cl_kernel clKrnMul[], cl_command_queue clQue[], cl_mem clMemIn[][MAXM]
#define clCallFunc clCtx, clPrg, clKrnAdd, clKrnMul, clQue, clMemIn
#define clCallFuncOuter clCtx, clPrg, clKrnAdd, clKrnMul, clQue, clMemIn
/* Fold the N*N entries of A, in storage order, into a 32-bit signature:
 * h = (h + entry) * 2654435761, truncated to 32 bits after every step. */
uint32_t writeMatrixOut(int N, uint32_t *A) {
    uint32_t hash = 0;
    const int total = N*N;
    for (int idx = 0; idx < total; idx++)
        hash = (hash + A[idx]) * 2654435761LU;
    return hash;
}
/* Release every OpenCL object created so far (handles that are still NULL
 * are skipped) and terminate the process with status 0. */
void destroyGPU(clFuncArgs) {
    int dev, slot;
    fprintf(stderr, "Starting Cleanup ...\n\n");
    /* input matrices */
    for (dev = 0; dev < clNeedDevCnt; dev++)
        for (slot = 0; slot < M; slot++)
            if (clMemIn[dev][slot] != NULL)
                clReleaseMemObject(clMemIn[dev][slot]);
    /* intermediate results */
    for (dev = 0; dev < clNeedDevCnt; dev++)
        for (slot = 0; slot < MAXMID; slot++)
            if (clMemMid[dev][slot] != NULL)
                clReleaseMemObject(clMemMid[dev][slot]);
    /* kernels and the program they came from */
    for (dev = 0; dev < clNeedDevCnt; dev++) {
        if (clKrnAdd[dev] != NULL)
            clReleaseKernel(clKrnAdd[dev]);
        if (clKrnMul[dev] != NULL)
            clReleaseKernel(clKrnMul[dev]);
        if (clPrg[dev] != NULL)
            clReleaseProgram(clPrg[dev]);
    }
    /* queues first, contexts last */
    for (dev = 0; dev < clNeedDevCnt; dev++)
        if (clQue[dev] != NULL)
            clReleaseCommandQueue(clQue[dev]);
    for (dev = 0; dev < clNeedDevCnt; dev++)
        if (clCtx[dev] != NULL)
            clReleaseContext(clCtx[dev]);
    exit(0);
}
/*
 * Load the kernel template from fileName, expand its "%d" with N, and build
 * the single-device OpenCL state: context, queue(s), program, both kernels,
 * M input buffers initialised from hostMtx and MAXMID scratch buffers.
 * Any OpenCL failure goes through CheckFailAndExit, which cleans up and
 * exits.  Returns 1 on success.
 */
int initAllGPU(char fileName[], clFuncArgs) {
    // -- generate kernel code
    FILE *codefin = fopen(fileName, "r");
    assert(codefin != NULL);
    /* Read outside of assert() so the file is still loaded under NDEBUG. */
    size_t clFmtLen = fread(clSrcFormat, 1, sizeof(clSrcFormat), codefin);
    fclose(codefin);
    /* The template must fit with room for its terminating NUL. */
    assert(clFmtLen < sizeof(clSrcFormat));
    clSrcFormat[clFmtLen] = '\0';
    int clSrcWritten = snprintf(clSrc, sizeof(clSrc), clSrcFormat, N);
    assert(clSrcWritten >= 0 && (size_t) clSrcWritten < sizeof(clSrc));
    (void) clSrcWritten;
    size_t clSrcLen = strlen(clSrc);
    cl_int          clStat;
    cl_uint         clPlatN, clDevN;
    cl_platform_id  clPlatID;
    cl_device_id    clGPUID[MAXGPU];
    const char      *clSrcPtr = clSrc;
    // -- basic OpenCL setup
    clStat = clGetPlatformIDs(1, &clPlatID, &clPlatN);
    CheckFailAndExit(clStat);
    clStat = clGetDeviceIDs(clPlatID, CL_DEVICE_TYPE_GPU, MAXGPU, clGPUID, &clDevN);
    CheckFailAndExit(clStat);
    assert(clDevN >= clNeedDevCnt);
    clCtx[0] = clCreateContext(NULL, 1, clGPUID, NULL, NULL, &clStat);
    CheckFailAndExit(clStat);
    for (int i = 0; i < clNeedDevCnt; i++) {
        clQue[i] = clCreateCommandQueue(clCtx[0], clGPUID[i],
                /*CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE*/ 0, &clStat);
        CheckFailAndExit(clStat);
    }
    clPrg[0] = clCreateProgramWithSource(clCtx[0], 1, &clSrcPtr, &clSrcLen, &clStat);
    CheckFailAndExit(clStat);
    clStat = clBuildProgram(clPrg[0], 1, clGPUID, NULL, NULL, NULL);
    if (clStat != CL_SUCCESS) {
        /* Print the compiler log before giving up. */
        fprintf(stderr, "Error: Line %u in file %s\n\n", __LINE__, __FILE__);
        size_t log_size;
        clGetProgramBuildInfo(*clPrg, clGPUID[0],
                CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
        char *program_log = (char *) calloc(log_size+1, sizeof(char));
        clGetProgramBuildInfo(*clPrg, clGPUID[0],
                CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL);
        printf("%s", program_log);
        free(program_log);
        CheckFailAndExit(CL_BUILD_PROGRAM_FAILURE);
    }
    clKrnAdd[0] = clCreateKernel(clPrg[0], "matrixAdd", &clStat);
    CheckFailAndExit(clStat);
    clKrnMul[0] = clCreateKernel(clPrg[0], "matrixMul", &clStat);
    CheckFailAndExit(clStat);
    // -- create all buffers
    cl_mem_flags clInBuffFlag = CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR;
    for (int j = 0; j < clNeedDevCnt; j++) {
        for (int i = 0; i < M; i++) {
            clMemIn[j][i] = clCreateBuffer(clCtx[0], clInBuffFlag, sizeof(uint32_t)*N*N,
                    hostMtx[i], &clStat);
            CheckFailAndExit(clStat);
        }
    }
    for (int j = 0; j < clNeedDevCnt; j++) {
        for (int i = 0; i < MAXMID; i++) {
            clMemMid[j][i] = clCreateBuffer(clCtx[0], CL_MEM_READ_WRITE,
                    sizeof(uint32_t)*N*N, NULL, &clStat);
            CheckFailAndExit(clStat);
        }
    }
    return 1;
}
/*
 * Enqueue OUT = LIN * RIN (N x N) on queue clQue[devIdx] using the
 * matrixMul kernel with one work-item per output element.
 *   waitN / events : wait list handed to clEnqueueNDRangeKernel
 *   ret_event      : receives the completion event (may be NULL)
 * The work-group size is left to the OpenCL runtime (local size NULL).
 */
void GPUmultiply(int N, int waitN, cl_event events[], cl_event *ret_event,
        int devIdx, cl_mem *LIN, cl_mem *RIN, cl_mem *OUT, clFuncArgs) {
    cl_int clStat;
    size_t globalOffset[] = {0};
    size_t globalSize[] = {(size_t) N * N};
    // -- set argument to kernel
    clStat = clSetKernelArg(clKrnMul[0], 0, sizeof(cl_mem), LIN);
    CheckFailAndExit(clStat);
    clStat = clSetKernelArg(clKrnMul[0], 1, sizeof(cl_mem), RIN);
    CheckFailAndExit(clStat);
    clStat = clSetKernelArg(clKrnMul[0], 2, sizeof(cl_mem), OUT);
    CheckFailAndExit(clStat);
    // -- execute
    clStat = clEnqueueNDRangeKernel(clQue[devIdx], clKrnMul[0], 1, globalOffset,
            globalSize, NULL, waitN, events, ret_event);
    CheckFailAndExit(clStat);
}
/*
 * Enqueue OUT = LIN + RIN (N x N, element-wise) on queue clQue[devIdx]
 * using the matrixAdd kernel, one work-item per element.  waitN/events form
 * the wait list; ret_event receives the completion event (may be NULL).
 */
void GPUadd(int N, int waitN, cl_event events[], cl_event *ret_event,
        int devIdx, cl_mem *LIN, cl_mem *RIN, cl_mem *OUT, clFuncArgs) {
    cl_int status;
    size_t origin[] = {0};
    size_t extent[] = {N*N};
    cl_mem *kernelArgs[] = { LIN, RIN, OUT };
    // -- set argument to kernel
    for (cl_uint slot = 0; slot < 3; slot++) {
        status = clSetKernelArg(clKrnAdd[0], slot, sizeof(cl_mem), kernelArgs[slot]);
        CheckFailAndExit(status);
    }
    // -- execute
    status = clEnqueueNDRangeKernel(clQue[devIdx], clKrnAdd[0], 1, origin,
            extent, NULL, waitN, events, ret_event);
    CheckFailAndExit(status);
}
/*
 * Compute X = AB + CD and Y = ABE + CDF on device 0 and print the
 * signature of each result.  The four products are chained through events;
 * the two sums wait on the pair of products they consume.
 * Returns 1 on success; OpenCL failures go through CheckFailAndExit.
 */
int executeGPU(clFuncArgs) {
    cl_int clStat;
    cl_event events[4];
    // AB
    GPUmultiply(N, 0, NULL, &events[0], 0, &clMemIn[0]['A'-'A'], &clMemIn[0]['B'-'A'],
            &clMemMid[0][0], clCallFunc);
    fprintf(stderr, "AB\n");
    // CD
    GPUmultiply(N, 0, NULL, &events[1], 0, &clMemIn[0]['C'-'A'], &clMemIn[0]['D'-'A'],
            &clMemMid[0][1], clCallFunc);
    fprintf(stderr, "CD\n");
    // ABE
    GPUmultiply(N, 1, &events[0], &events[2], 0, &clMemMid[0][0], &clMemIn[0]['E'-'A'],
            &clMemMid[0][2], clCallFunc);
    fprintf(stderr, "ABE\n");
    // CDF
    GPUmultiply(N, 1, &events[1], &events[3], 0, &clMemMid[0][1], &clMemIn[0]['F'-'A'],
            &clMemMid[0][3], clCallFunc);
    fprintf(stderr, "CDF\n");
    // AB+CD (waits on events[0] and events[1])
    GPUadd(N, 2, &events[0], NULL, 0, &clMemMid[0][0], &clMemMid[0][1], &clMemMid[0][4],
            clCallFunc);
    fprintf(stderr, "AB+CD\n");
    // ABE+CDF (waits on events[2] and events[3])
    GPUadd(N, 2, &events[2], NULL, 0, &clMemMid[0][2], &clMemMid[0][3], &clMemMid[0][5],
            clCallFunc);
    fprintf(stderr, "ABE+CDF\n");
    clStat = clFinish(clQue[0]);
    CheckFailAndExit(clStat);
    /* Every command has completed, so the events are no longer needed. */
    for (int i = 0; i < 4; i++)
        clReleaseEvent(events[i]);
    clStat = clEnqueueReadBuffer(clQue[0], clMemMid[0][4], CL_TRUE, 0,
            sizeof(uint32_t)*N*N, hostX, 0, NULL, NULL);
    CheckFailAndExit(clStat);
    clStat = clEnqueueReadBuffer(clQue[0], clMemMid[0][5], CL_TRUE, 0,
            sizeof(uint32_t)*N*N, hostY, 0, NULL, NULL);
    CheckFailAndExit(clStat);

    printf("%u\n", writeMatrixOut(N, hostX));
    printf("%u\n", writeMatrixOut(N, hostY));
    return 1;
}
void onStart(clFuncArgs) {
    int S[64];
    assert(scanf("%d", &N) == 1);
    M = 6;
    for (int i = 0; i < M; i++)
        assert(scanf("%d", &S[i]) == 1);
#pragma omp parallel for
    for (int p = 0; p < M; p++) {
        uint32_t x = 2, n = N*N;
        for (int i = 0; i < N; i++) {
            for (int j = 0; j < N; j++) {
                x = (x * x + S[p] + i + j)%n;
                hostMtx[p][i*N+j] = x;
            }
        }
    }
    initAllGPU("matrix-lib.cl", clCallFunc);
    executeGPU(clCallFunc);
    destroyGPU(clCallFunc);
}
/* Handler shared by every registered signal: report on stdout, then let
 * destroyGPU() release the OpenCL objects.  destroyGPU() itself ends the
 * process with exit(0); the trailing exit(0) is a fallback. */
void sigHandler(int signo) {
    (void) signo;  /* every signal is treated the same way */
    fputs("God Bless Me\n", stdout);
    destroyGPU(clCallFuncOuter);
    exit(0);
}
/* Entry point: route SIGTRAP, SIGSEGV, SIGILL, SIGFPE and SIGINT to
 * sigHandler (complaining on stderr for each one that cannot be installed),
 * then run the program proper via onStart(). */
int main(int argc, char *argv[]) {
    const char sigErr[] = "I can't catch signal.\n";
    const int watched[] = { SIGTRAP, SIGSEGV, SIGILL, SIGFPE, SIGINT };
    const int watchedCnt = sizeof(watched) / sizeof(watched[0]);
    for (int s = 0; s < watchedCnt; s++) {
        if (signal(watched[s], sigHandler) == SIG_ERR)
            fprintf(stderr, sigErr);
    }
    onStart(clCallFuncOuter);
    return 0;
}

matrix-lib.cl

#define N %d
#define CTYPE unsigned int
/* Element-wise sum: work-item g stores in1[g] + in2[g] into out[g]. */
__kernel void matrixAdd(__global CTYPE *in1,
        __global CTYPE *in2,
        __global CTYPE *out) {
    const int gid = get_global_id(0);
    const CTYPE lhs = in1[gid];
    const CTYPE rhs = in2[gid];
    out[gid] = lhs + rhs;
}
/*
 * One work-item per output element: global id g maps to row g / N and
 * column g - row * N, and the work-item computes that entry of in1 * in2.
 *
 * The column is derived without the remainder operator on purpose: the host
 * expands this file with sprintf (to fill in N), so a percent sign anywhere
 * in the source would be parsed as a conversion specification.
 */
__kernel void matrixMul(__global CTYPE *in1,
        __global CTYPE *in2,
        __global CTYPE *out) {
    int id = get_global_id(0);
    int x = id / N;
    int y = id - x * N;
    CTYPE sum = 0;
    for (int i = 0; i < N; i++)
        sum += in1[x*N + i] * in2[i*N + y];
    out[x * N + y] = sum;
}

Read More +

2016-06-26

學校課程/平行程式

批改娘 10092. OpenCL Build Program Debug

題目描述

為 OpenCL 中的 clBuildProgram() Debug 鋪路。請嘗試從標準輸入得到要編譯的檔案名稱，並把編譯的錯誤訊息輸出。

err1.cl

typedef unsigned int uint32_t;
__kernel void mul(__global uint32_t A[], __global uint32_t C[], const int N)
{
    opencl;
}

輸入格式

輸入只有一行，字串長度不大於 30 的檔案名稱。

輸出格式

將錯誤訊息印出，如 printf("%s", program_log);。

範例輸入 1

err1.cl

範例輸出 1

1
2
3

<kernel>:4:2: error: use of undeclared identifier 'opencl'
        opencl;
        ^

Solution

盡量使用較少的 device，減少建立文本的 overhead，反正都是錯誤的代碼要找編譯錯誤資訊，那麼就直接用其中一個 device 編譯即可，除非牽涉到 compute version 問題，原則上都會是一樣的。

#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <CL/cl.h>
#define MAXGPU 1
#define MAXN 2048
int N = MAXN;
char clSrc[1024] = "";
char clSrcMain[1024] = "notused";
// -- start working with OpenCL
cl_context             clCtx;
cl_program             clPrg;
#define clCallFunc &clCtx, &clPrg
#define clFuncArgs cl_context *clCtx, cl_program *clPrg
#define CheckFailAndExit(status) \
    if (status != CL_SUCCESS) { \
        fprintf(stderr, "Error: Line %u in file %s\n\n", __LINE__, __FILE__), \
        destroyGPU(clCtx, clPrg); \
    }
#define clPrint(fmt, ...) fprintf(stdout, fmt, ##__VA_ARGS__)
/* Release the context and the program if they were created, then end the
 * process with status 0. */
void destroyGPU(clFuncArgs) {
    fprintf(stderr, "Starting Cleanup ...\n\n");
    if (*clCtx != NULL) {
        clReleaseContext(*clCtx);
    }
    if (*clPrg != NULL) {
        clReleaseProgram(*clPrg);
    }
    exit(0);
}
/*
 * Read at most sizeof(clSrc) bytes of OpenCL source from fileName and try to
 * build it for the first GPU of the first platform.  When the build fails
 * the compiler log is printed to stdout and the program cleans up and exits
 * through CheckFailAndExit.  Returns normally only if the build succeeded.
 */
void clCompile(char fileName[], clFuncArgs) {
    FILE *codefin = fopen(fileName, "r");
    assert(codefin != NULL);
    size_t clSrcLen = fread(clSrc, 1, sizeof(clSrc), codefin);
    fclose(codefin);
    cl_int          clStat;
    cl_uint         clPlatN, clGPUN;
    cl_platform_id  clPlatID;
    cl_device_id    clGPUID[MAXGPU];
    const char      *clSrcPtr = clSrc;
    // -- basic OpenCL setup
    /* Without these checks a missing platform or GPU would leave clPlatID /
     * clGPUID uninitialised for the calls below. */
    clStat = clGetPlatformIDs(1, &clPlatID, &clPlatN);
    CheckFailAndExit(clStat);
    clStat = clGetDeviceIDs(clPlatID, CL_DEVICE_TYPE_GPU, MAXGPU, clGPUID, &clGPUN);
    CheckFailAndExit(clStat);
    *clCtx = clCreateContext(NULL, 1, clGPUID, NULL, NULL, &clStat);
    CheckFailAndExit(clStat);
    *clPrg = clCreateProgramWithSource(*clCtx, 1, &clSrcPtr, &clSrcLen, &clStat);
    CheckFailAndExit(clStat);
    clStat = clBuildProgram(*clPrg, 1, clGPUID, NULL, NULL, NULL);
    if (clStat != CL_SUCCESS) {
        static char program_log[32767];
        clGetProgramBuildInfo(*clPrg, clGPUID[0],
                CL_PROGRAM_BUILD_LOG, sizeof(program_log), program_log, NULL);
        printf("%s", program_log);
        CheckFailAndExit(CL_BUILD_PROGRAM_FAILURE);
    }
}
/* Read the kernel file name from stdin, try to compile it, and clean up. */
int main() {
    char fileName[128];
    /* Width-limited so an over-long token cannot overflow fileName, and kept
     * outside assert() so the read still happens under NDEBUG. */
    int got = scanf("%127s", fileName);
    assert(got == 1);
    (void) got;
    clCompile(fileName, clCallFunc);
    // Compile Success
    destroyGPU(clCallFunc);
    return 0;
}

Read More +

2016-06-26

學校課程/平行程式

批改娘 10091. Fast Matrix Multiplication (OpenCL)

題目描述

計算兩個大小為 $N \times N$ 方陣 $A, \; B$ 相乘結果 $C = A \times B$。為了節省輸入輸出時間，採用亂數產生，可以參考下述程式碼，並改寫成 OpenCL 的版本進行加速。

使用 Profile 可以透過 NVIDIA Visual Profiler (GUI) 查看，遠端連線使用 ssh -X username@host，nvprof.sh, nvvp.cfg 下載

sequence.c

#include <stdio.h>
#include <stdint.h>
// #define DEBUG
#define UINT uint32_t
#define MAXN 1024
/* C = A * B for the leading N x N part of the arrays; sums wrap modulo the
 * width of UINT. */
void multiply(int N, UINT A[][MAXN], UINT B[][MAXN], UINT C[][MAXN]) {
    for (int row = 0; row < N; row++) {
        for (int col = 0; col < N; col++) {
            UINT acc = 0;	// overflow is intentional
            int k = 0;
            while (k < N) {
                acc += A[row][k] * B[k][col];
                k++;
            }
            C[row][col] = acc;
        }
    }
}
/* Fill the leading N x N part of A, row by row, with the sequence
 * x <- (x*x + c + i + j) mod N*N starting from x = 2. */
void rand_gen(UINT c, int N, UINT A[][MAXN]) {
    const UINT modulus = N*N;
    UINT state = 2;
    for (int i = 0; i < N; i++) {
        UINT *row = A[i];
        for (int j = 0; j < N; j++) {
            state = state * state + c + i + j;
            state %= modulus;
            row[j] = state;
        }
    }
}
/* Dump the leading N x N part of A to stderr, one bracketed row per line. */
void print_matrix(int N, UINT A[][MAXN]) {
    for (int row = 0; row < N; row++) {
        const UINT *cells = A[row];
        fprintf(stderr, "[");
        for (int col = 0; col < N; col++) {
            fprintf(stderr, " %u", cells[col]);
        }
        fprintf(stderr, " ]\n");
    }
}
/* Hash the leading N x N part of A in row order:
 * h = (h + entry) * 2654435761, truncated to UINT after every step. */
UINT signature(int N, UINT A[][MAXN]) {
    UINT hash = 0;
    for (int row = 0; row < N; row++) {
        const UINT *cells = A[row];
        for (int col = 0; col < N; col++) {
            hash = (hash + cells[col]) * 2654435761LU;
        }
    }
    return hash;
}
UINT A[MAXN][MAXN], B[MAXN][MAXN], C[MAXN][MAXN];
int main() {
    int N;
    uint32_t S1, S2;
    scanf("%d %u %u", &N, &S1, &S2);
    rand_gen(S1, N, A);
    rand_gen(S2, N, B);
    multiply(N, A, B, C);
#ifdef DEBUG
    print_matrix(N, A);
    print_matrix(N, B);
    print_matrix(N, C);
#endif
    printf("%u\n", signature(N, C));
    return 0;
}

輸入格式

測資只有一組，包含三個整數 $N, S_1, S_2$，分別為方陣大小 $N \times N$，產生矩陣 $A$、$B$ 的亂數種子。

$64 \le N \le 1024$，保證 $N \mod 64 \equiv 0$
$0 \le S_1, \; S_2 < 2^{31}$

輸出格式

輸出一行雜湊值 $H$，可參考 sequence.c 的流程。

範例輸入

64 1 2

範例輸出

3376147904

編譯參數

gcc -std=c99 -O2 main.c -lm -lOpenCL -fopenmp -I/usr/include/CL

Solution

兩個 $N \times N$ 乘法計算，從網路上常見的作法通常直接針對最終答案進行切塊，大致上分成三種運行方式

每一個 thread 只處理一個向量內積，因此會需要 $N^2$ 個 threads，彼此之間獨立。特別注意到在 GPU 程式中，每一個維度的索引值不可大於 65535，但是拆成兩個維度就沒有上限。
每一個 thread 只處理一個向量內積，但數個 thread 會分配到同一個 block，並且合作將 global memory 搬到 on-chip 的 local/shared memory 來加快速度。特別注意到 shared memory 是 on-chip 的，通常能儲存的量都非常小，儘管能藉由 data reuse 加快存取速度，但因為 shared memory 的大小限制，加速幅度有其上限。
每一個 thread 處理一個或數個列上的所有值。

這些牽涉到 warp scheduling 和 memory coalesce 問題，有時候理論分析平行度看起來很高，但實際運作還是得看 warp size 和發生 branch 的情況。下述程式是按照第三個做法完成。

main.c

#include <stdio.h>
#include <assert.h>
#include <inttypes.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <CL/cl.h>
#define MAXGPU 1
#define MAXN 1024
uint32_t  hostA[MAXN*MAXN], hostB[MAXN*MAXN], hostC[MAXN*MAXN];
int N = MAXN;
char clSrcFormat[1024]; 
char clSrc[1024] = "";
char clSrcMain[1024] = "matrixMul";
// -- start working with OpenCL
cl_context 			clCtx;
cl_program 			clPrg;
cl_kernel  			clKrn;
cl_command_queue 	clQue;
cl_mem 		clMemIn1, clMemIn2, clMemOut;
#define CheckFailAndExit(state) \
    if (state != CL_SUCCESS) { \
        printf("Error: Line %u in file %s\n\n", __LINE__, __FILE__), \
        destroyGPU(clCtx, clPrg, clKrn, clQue, clMemIn1, clMemIn2, clMemOut); \
    }
#define clFuncArgs cl_context *clCtx, cl_program *clPrg, cl_kernel *clKrn, \
    cl_command_queue *clQue, cl_mem *clMemIn1, cl_mem *clMemIn2, \
    cl_mem *clMemOut 
#define clCallFunc &clCtx, &clPrg, &clKrn, &clQue, &clMemIn1, &clMemIn2, &clMemOut
void destroyGPU(clFuncArgs) {
    fprintf(stderr, "Starting Cleanup ...\n\n");
    if (*clMemOut)	clReleaseMemObject(*clMemOut);
    if (*clMemIn2)	clReleaseMemObject(*clMemIn2);
    if (*clMemIn1)	clReleaseMemObject(*clMemIn1);
    if (*clKrn)	clReleaseKernel(*clKrn);
    if (*clPrg) clReleaseProgram(*clPrg);
    if (*clQue)	clReleaseCommandQueue(*clQue);
    if (*clCtx)	clReleaseContext(*clCtx);
    exit(0);
}
/*
 * Load the kernel template from fileName, expand its "%d" with N, build it
 * for the first GPU, create the three matrix buffers and bind them as the
 * kernel's arguments.  OpenCL failures go through CheckFailAndExit, which
 * cleans up and exits.  Returns 1 on success.
 */
int initAllGPU(char fileName[], clFuncArgs) {
    // -- generate kernel code
    FILE *codefin = fopen(fileName, "r");
    assert(codefin != NULL);
    /* Keep one byte for the terminator: the template is used as a C string. */
    size_t clFmtLen = fread(clSrcFormat, 1, sizeof(clSrcFormat) - 1, codefin);
    fclose(codefin);
    clSrcFormat[clFmtLen] = '\0';
    int clSrcWritten = snprintf(clSrc, sizeof(clSrc), clSrcFormat, N);
    assert(clSrcWritten >= 0 && (size_t) clSrcWritten < sizeof(clSrc));
    (void) clSrcWritten;
    /* Length of the expanded source.  The template length is shorter once
     * N has more than two digits and would cut off the end of the kernel. */
    size_t clSrcLen = strlen(clSrc);
    cl_int          clStat;
    cl_uint         clPlatN, clGPUN;
    cl_platform_id  clPlatID;
    cl_device_id    clGPUID[MAXGPU];
    const char      *clSrcPtr = clSrc;
    // -- basic OpenCL setup
    clStat = clGetPlatformIDs(1, &clPlatID, &clPlatN);
    CheckFailAndExit(clStat);
    clStat = clGetDeviceIDs(clPlatID, CL_DEVICE_TYPE_GPU, MAXGPU, clGPUID, &clGPUN);
    CheckFailAndExit(clStat);
    *clCtx = clCreateContext(NULL, 1, clGPUID, NULL, NULL, &clStat);
    CheckFailAndExit(clStat);
    *clQue = clCreateCommandQueue(*clCtx, clGPUID[0], 0, &clStat);
    CheckFailAndExit(clStat);
    *clPrg = clCreateProgramWithSource(*clCtx, 1, &clSrcPtr, &clSrcLen, &clStat);
    CheckFailAndExit(clStat);
    clStat = clBuildProgram(*clPrg, 1, clGPUID, NULL, NULL, NULL);
    if (clStat != CL_SUCCESS) {
        /* Print the compiler log before giving up. */
        printf("Error in clBuildProgram, Line %u in file %s\n\n", __LINE__, __FILE__);
        size_t log_size;
        clGetProgramBuildInfo(*clPrg, clGPUID[0],
                CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
        char *program_log = (char *) calloc(log_size+1, sizeof(char));
        clGetProgramBuildInfo(*clPrg, clGPUID[0],
                CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL);
        printf("%s", program_log);
        free(program_log);
        CheckFailAndExit(CL_BUILD_PROGRAM_FAILURE);
    }
    *clKrn = clCreateKernel(*clPrg, clSrcMain, &clStat);
    CheckFailAndExit(clStat);
    // -- create all buffers
    cl_mem_flags clInBuffFlag = CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR;
    cl_mem_flags clOutBuffFlag = CL_MEM_WRITE_ONLY;
    *clMemIn1 = clCreateBuffer(*clCtx, clInBuffFlag, sizeof(uint32_t)*N*N,
            hostA, &clStat);
    CheckFailAndExit(clStat);
    *clMemIn2 = clCreateBuffer(*clCtx, clInBuffFlag, sizeof(uint32_t)*N*N,
            hostB, &clStat);
    CheckFailAndExit(clStat);
    /* No host pointer here: one is only valid together with
     * CL_MEM_USE_HOST_PTR or CL_MEM_COPY_HOST_PTR.  The result is fetched
     * with clEnqueueReadBuffer instead. */
    *clMemOut = clCreateBuffer(*clCtx, clOutBuffFlag, sizeof(uint32_t)*N*N,
            NULL, &clStat);
    CheckFailAndExit(clStat);
    // -- set argument to kernel
    clStat = clSetKernelArg(*clKrn, 0, sizeof(cl_mem), (void *) clMemIn1);
    CheckFailAndExit(clStat);
    clStat = clSetKernelArg(*clKrn, 1, sizeof(cl_mem), (void *) clMemIn2);
    CheckFailAndExit(clStat);
    clStat = clSetKernelArg(*clKrn, 2, sizeof(cl_mem), (void *) clMemOut);
    CheckFailAndExit(clStat);
    return 1;
}
/* Smaller of the two arguments. */
int min(int x, int y) {
    if (y < x)
        return y;
    return x;
}
/*
 * Launch the kernel with N work-items in groups of min(N, 64), wait for it
 * to finish and copy the N x N result into hostC.  Every OpenCL call is
 * checked; a failure cleans up and exits via CheckFailAndExit.
 * Returns 1 on success.
 */
int executeGPU(clFuncArgs) {
    cl_int clStat;
    size_t globalOffset[] = {0};
    size_t globalSize[] = {N};
    size_t localSize[] = {min(N, 64)};
    clStat = clEnqueueNDRangeKernel(*clQue, *clKrn, 1, globalOffset,
            globalSize, localSize, 0, NULL, NULL);
    CheckFailAndExit(clStat);
    clStat = clFinish(*clQue);
    CheckFailAndExit(clStat);
    // -- read back
    clStat = clEnqueueReadBuffer(*clQue, *clMemOut, CL_TRUE, 0, sizeof(uint32_t)*N*N,
            hostC, 0, NULL, NULL);
    CheckFailAndExit(clStat);
    return 1;
}
void readIn() {
    uint32_t c1, c2;
    assert(scanf("%d %u %u", &N, &c1, &c2) == 3);
    uint32_t x = 2, n = N*N;
    x = 2;
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            x = (x * x + c1 + i + j)&(n-1);
            hostA[i*N+j] = x;
        }
    }
    x = 2;
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            x = (x * x + c2 + i + j)&(n-1);
            hostB[i*N+j] = x;
        }
    }
}
/* Print the signature of hostC: h = (h + entry) * 2654435761 over its N*N
 * entries in storage order, truncated to 32 bits. */
void writeOut() {
    uint32_t hash = 0;
    const int total = N*N;
    for (int idx = 0; idx < total; idx++)
        hash = (hash + hostC[idx]) * 2654435761LU;
    printf("%u\n", hash);
}
/* Whole run in order: input, GPU setup, kernel execution, output, and
 * finally cleanup (destroyGPU also ends the process). */
void onStart() {
    readIn();                                 /* N, seeds, hostA, hostB */
    initAllGPU("matrixmul.cl", clCallFunc);   /* build kernel, buffers  */
    executeGPU(clCallFunc);                   /* result lands in hostC  */
    writeOut();                               /* print the signature    */
    destroyGPU(clCallFunc);
}
/* Handler shared by every registered signal: report on stdout, then let
 * destroyGPU() release the OpenCL objects.  destroyGPU() itself ends the
 * process with exit(0); the trailing exit(0) is a fallback. */
void sigHandler(int signo) {
    (void) signo;  /* every signal is treated the same way */
    fputs("God Bless Me\n", stdout);
    destroyGPU(clCallFunc);
    exit(0);
}
/*
 * Entry point: install sigHandler for the signals that can be intercepted,
 * then run the program proper via onStart().
 *
 * SIGKILL is intentionally not registered.  It cannot be caught, so
 * signal(SIGKILL, ...) always fails and only produced a spurious
 * "I can't catch signal." line on stderr on every run.
 */
int main(int argc, char *argv[]) {
    static const int caught[] = { SIGTRAP, SIGSEGV, SIGILL, SIGFPE, SIGINT };
    (void) argc;
    (void) argv;
    for (size_t i = 0; i < sizeof(caught) / sizeof(caught[0]); i++) {
        if (signal(caught[i], sigHandler) == SIG_ERR)
            fputs("I can't catch signal.\n", stderr);
    }
    onStart();
    return 0;
}

matrixmul.cl

有些人也許會想問，為什麼合作搬運一個在 global memory 的陣列，要採用 for (int cr = localID; cr < N; cr += localSz) 的方式，這是因為 GPU 設計有 memory coalesce 問題，一個 warp 運作時，存取最好的是連續的，這麼一來 memory coalesce 帶進來的一坨連續的記憶體就能充份被利用，而不是每一個 thread 讀取一塊連續的記憶體片段，採用跳躍的方式在 warp scheduling 時，讀取順序看起來才比較連續。

#define N %d
#define CTYPE unsigned int
#define UNLOOP 8
/*
 * out = in1 * in2 for N x N matrices indexed as [row * N + column].
 * Work-item r copies row r of in1 into a private array.  For each column c
 * the work-group shares the job of copying column c of in2 into local
 * memory, after which every work-item forms one dot product.
 */
__kernel void matrixMul(__global CTYPE *in1,
        __global CTYPE *in2,
        __global CTYPE *out) {
    CTYPE rbuf[N];
    int r = get_global_id(0);
    int localID = get_local_id(0);
    int localSz = get_local_size(0);
    __local CTYPE cbuf[N];
    for (int i = 0; i < N; i++)
        rbuf[i] = in1[r * N + i];
    for (int c = 0; c < N; c++) {
        /* The work-items of the group split the column between them. */
        for (int cr = localID; cr < N; cr += localSz)
            cbuf[cr] = in2[cr * N + c];
        barrier(CLK_LOCAL_MEM_FENCE);
        CTYPE sum = 0;
        int k = 0;
        for (; k+UNLOOP-1 < N; k += UNLOOP) {
            sum += rbuf[k+0] * cbuf[k+0];
            sum += rbuf[k+1] * cbuf[k+1];
            sum += rbuf[k+2] * cbuf[k+2];
            sum += rbuf[k+3] * cbuf[k+3];
            sum += rbuf[k+4] * cbuf[k+4];
            sum += rbuf[k+5] * cbuf[k+5];
            sum += rbuf[k+6] * cbuf[k+6];
            sum += rbuf[k+7] * cbuf[k+7];
        }
        /* Elements left over when N is not a multiple of UNLOOP. */
        for (; k < N; k++)
            sum += rbuf[k] * cbuf[k];
        out[r * N + c] = sum;
        /* cbuf must not be refilled with the next column while another
         * work-item of the group is still reading the current one. */
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}

Read More +

2016-06-26

學校課程/平行程式

批改娘 10090. Dot Product (OpenCL)

題目描述

請用 OpenCL 改寫下段的計算：

main.c

#include <stdio.h>
#include <assert.h>
#include <omp.h>
#include <inttypes.h>
#include "utils.h"
 
#define MAXGPU 8
#define MAXCODESZ 32767
#define MAXN 16777216
static cl_uint A[MAXN], B[MAXN], C[MAXN];
int main(int argc, char *argv[]) {
    omp_set_num_threads(4);
    int N;
    uint32_t key1, key2;
    while (scanf("%d %" PRIu32 " %" PRIu32, &N, &key1, &key2) == 3) {
        int chunk = N / 4;
        for (int i = 0; i < N; i++) {
            A[i] = encrypt(i, key1);
            B[i] = encrypt(i, key2);
        }
 
        for (int i = 0; i < N; i++)
            C[i] = A[i] * B[i];
 
        uint32_t sum = 0;
        for (int i = 0; i < N; i++)
            sum += C[i];
        printf("%" PRIu32 "\n", sum);
    }
    return 0;
}

utils.h

#ifndef _UTILS_H
#define _UTILS_H
#include <stdint.h>
/* Rotate the 32-bit value x left by n bits, 0 <= n <= 31.
 * The right-shift count is reduced modulo 32 so that n == 0 does not shift
 * by the full width of the type, which is undefined behaviour in C. */
static inline uint32_t rotate_left(uint32_t x, uint32_t n) {
    return  (x << n) | (x >> ((32-n) & 31));
}
/* Rotate m left by the low five bits of key, add key (wrapping at 32 bits),
 * then XOR the result with key. */
static inline uint32_t encrypt(uint32_t m, uint32_t key) {
    const uint32_t rotated = rotate_left(m, key&31);
    const uint32_t shifted = rotated + key;
    return shifted ^ key;
}
#endif

範例輸入

16777216 1 2
16777216 3 5

範例輸出

2885681152
2147483648

編譯參數

gcc -std=c99 -O2 main.c -lOpenCL -fopenmp

Solution

這一題藉由兩個亂數產生長度為 $N$ 的兩個向量，計算內積結果為何。

由於這是第一份計算 OpenCL 的應用，特別注意 Memory Leak 的問題，確定每一次執行都有正常釋放資源，可以透過 $ htop 或者 $ top 指令監控，若在 nvidia 平台下，可以使用 $ nvidia-smi 觀察程式佔有的記憶體量以及排隊情況，同時要小心繁重工作導致熱當機。

這一題原本預設要從 CPU 產生兩個向量，再傳送到 GPU 上面計算，同學一問就不小心將加密的檔案一起釋出，結果就能直接在 GPU 上產生，並且內積完使用 $O(\log N)$ 進行 work-group 內部進行加總，這大幅度地降低需要回到 CPU 計算總和的時間。

特別注意實驗環境最多允許一個 work-group 有 1024 個 work-item，從效率結果上來看，work-item 並不是越多越好，因為牽涉到 register 數量以及 memory access 的效率，這部分編譯器無法幫忙，全權交給程序員決定。而且 GPU 還有嚴重的 bank conflict，在計算一個 work-group 總和時，特殊的寫法減少 bank conflict 的發生。

main.c

#include <stdio.h>
#include <assert.h>
#include <inttypes.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <CL/cl.h>
#include "utils.h"
#include <omp.h>
#define MAXGPU 1
#define MAXN 16777216
#define GPULOCAL 256
uint32_t	hostC[MAXN/GPULOCAL];
int N;
uint32_t keyA, keyB;
char clSrcFormat[1024] = ""; 
char clSrc[1024] = "";
char clSrcMain[1024] = "vecdot";
// -- start working with OpenCL
cl_context				clCtx;
cl_program 				clPrg;
cl_kernel				clKrn;
cl_command_queue		clQue;
cl_mem         			clMemOut;
#define CheckFailAndExit(status) \
    if (status != CL_SUCCESS) { \
        fprintf(stderr, "Error %d: Line %u in file %s\n\n", status, __LINE__, __FILE__), \
        destroyGPU(clCtx, clPrg, clKrn, clQue, clMemOut); \
    }
#define clFuncArgs cl_context *clCtx, cl_program *clPrg, cl_kernel *clKrn, \
    cl_command_queue *clQue, cl_mem *clMemOut  
#define clCallFunc &clCtx, &clPrg, &clKrn, &clQue, &clMemOut
/* Release whichever OpenCL objects exist (buffer, kernel, program, queue,
 * context, in that order) and end the process with status 0. */
void destroyGPU(clFuncArgs) {
    fprintf(stderr, "Starting Cleanup ...\n\n");
    if (*clMemOut != NULL) {
        clReleaseMemObject(*clMemOut);
    }
    if (*clKrn != NULL) {
        clReleaseKernel(*clKrn);
    }
    if (*clPrg != NULL) {
        clReleaseProgram(*clPrg);
    }
    if (*clQue != NULL) {
        clReleaseCommandQueue(*clQue);
    }
    if (*clCtx != NULL) {
        clReleaseContext(*clCtx);
    }
    exit(0);
}
/*
 * Read the kernel source from fileName, build it for the first GPU and
 * create the output buffer holding one partial sum per work-group
 * (MAXN/GPULOCAL entries).  OpenCL failures go through CheckFailAndExit,
 * which cleans up and exits.  Returns 1 on success.
 */
int initAllGPU(char fileName[], clFuncArgs) {
    // -- generate kernel code
    FILE *codefin = fopen(fileName, "r");
    assert(codefin != NULL);
    size_t clSrcLen = fread(clSrc, 1, sizeof(clSrc), codefin);
    fclose(codefin);  /* the handle used to be left open */
    cl_int          clStat;
    cl_uint         clPlatN;
    cl_platform_id  clPlatID;
    cl_device_id    clGPUID[MAXGPU];
    const char      *clSrcPtr = clSrc;
    // -- basic OpenCL setup
    clStat = clGetPlatformIDs(1, &clPlatID, &clPlatN);
    CheckFailAndExit(clStat);
    clStat = clGetDeviceIDs(clPlatID, CL_DEVICE_TYPE_GPU, MAXGPU, clGPUID, NULL);
    CheckFailAndExit(clStat);
    *clCtx = clCreateContext(NULL, 1, clGPUID, NULL, NULL, &clStat);
    CheckFailAndExit(clStat);
    *clQue = clCreateCommandQueue(*clCtx, clGPUID[0], 0, &clStat);
    CheckFailAndExit(clStat);
    *clPrg = clCreateProgramWithSource(*clCtx, 1, &clSrcPtr, &clSrcLen, &clStat);
    CheckFailAndExit(clStat);
    clStat = clBuildProgram(*clPrg, 1, clGPUID, NULL, NULL, NULL);
    if (clStat != CL_SUCCESS) {
        /* Print the compiler log before giving up. */
        fprintf(stderr, "Error: Line %u in file %s\n\n", __LINE__, __FILE__);
        size_t log_size;
        clGetProgramBuildInfo(*clPrg, clGPUID[0],
                CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
        char *program_log = (char *) calloc(log_size+1, sizeof(char));
        clGetProgramBuildInfo(*clPrg, clGPUID[0],
                CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL);
        printf("%s", program_log);
        free(program_log);
        CheckFailAndExit(CL_BUILD_PROGRAM_FAILURE);
    }
    *clKrn = clCreateKernel(*clPrg, clSrcMain, &clStat);
    CheckFailAndExit(clStat);
    // -- create all buffers
    cl_mem_flags clOutBuffFlag = CL_MEM_WRITE_ONLY;
    /* No host pointer here: one is only valid together with
     * CL_MEM_USE_HOST_PTR or CL_MEM_COPY_HOST_PTR.  The partial sums are
     * fetched with clEnqueueReadBuffer instead. */
    *clMemOut = clCreateBuffer(*clCtx, clOutBuffFlag, sizeof(uint32_t)*MAXN/GPULOCAL,
            NULL, &clStat);
    CheckFailAndExit(clStat);
    return 1;
}
/*
 * Compute and print the dot product for the current N, keyA and keyB.
 * N is rounded up to a multiple of GPULOCAL so that every work-group is
 * full; the contribution of the extra elements is computed on the host and
 * subtracted again.  The kernel leaves one partial sum per work-group in
 * the output buffer and the host adds them up.  Returns 1 on success.
 */
int executeGPU(clFuncArgs) {
    /* An empty vector has dot product 0.  Launching it would request a
     * zero-sized NDRange, which OpenCL versions before 2.1 reject. */
    if (N == 0) {
        printf("%u\n", 0u);
        return 1;
    }
    uint32_t padding = 0;
    while (N%GPULOCAL) {
        padding += encrypt(N, keyA) * encrypt(N, keyB);
        N++;
    }
    cl_int clStat;
    size_t globalOffset[] = {0};
    size_t globalSize[] = {N};
    size_t localSize[] = {GPULOCAL};

    // -- set argument to kernel
    clStat = clSetKernelArg(*clKrn, 0, sizeof(cl_uint), (void *) &keyA);
    CheckFailAndExit(clStat);
    clStat = clSetKernelArg(*clKrn, 1, sizeof(cl_uint), (void *) &keyB);
    CheckFailAndExit(clStat);
    clStat = clSetKernelArg(*clKrn, 2, sizeof(cl_mem), (void *) clMemOut);
    CheckFailAndExit(clStat);
    // -- execute
    clStat = clEnqueueNDRangeKernel(*clQue, *clKrn, 1, globalOffset,
            globalSize, localSize, 0, NULL, NULL);
    CheckFailAndExit(clStat);

    // -- read back (blocking, so hostC is valid once the call returns)
    clStat = clEnqueueReadBuffer(*clQue, *clMemOut, CL_TRUE, 0, sizeof(uint32_t)*N/GPULOCAL,
            hostC, 0, NULL, NULL);
    CheckFailAndExit(clStat);
    uint32_t sum = 0;
    omp_set_num_threads(4);
#pragma omp parallel for reduction(+: sum)
    for (int i = 0; i < N/GPULOCAL; i++)
        sum += hostC[i];
    printf("%u\n", sum - padding);
    return 1;
}
int readIn() {
    int has = 0;
    if (scanf("%d %u %u", &N, &keyA, &keyB) != 3)
        return 0;
    return 1;
}
/* Build the kernel once, answer every test case on stdin, then clean up
 * (destroyGPU also ends the process). */
void onStart() {
    initAllGPU("vecdot.cl", clCallFunc);
    for (;;) {
        if (!readIn())
            break;
        executeGPU(clCallFunc);
    }
    destroyGPU(clCallFunc);
}
/* Handler shared by every registered signal: report on stdout, then let
 * destroyGPU() release the OpenCL objects.  destroyGPU() itself ends the
 * process with exit(0); the trailing exit(0) is a fallback. */
void sigHandler(int signo) {
    (void) signo;  /* every signal is treated the same way */
    fputs("God Bless Me\n", stdout);
    destroyGPU(clCallFunc);
    exit(0);
}
/*
 * Entry point: install sigHandler for the signals that can be intercepted,
 * then run the program proper via onStart().
 *
 * SIGKILL is intentionally not registered.  It cannot be caught, so
 * signal(SIGKILL, ...) always fails and only produced a spurious
 * "I can't catch signal." line on stderr on every run.
 */
int main(int argc, char *argv[]) {
    static const int caught[] = { SIGTRAP, SIGSEGV, SIGILL, SIGFPE, SIGINT };
    (void) argc;
    (void) argv;
    for (size_t i = 0; i < sizeof(caught) / sizeof(caught[0]); i++) {
        if (signal(caught[i], sigHandler) == SIG_ERR)
            fputs("I can't catch signal.\n", stderr);
    }
    onStart();
    return 0;
}

vecdot.cl

#define uint32_t unsigned int
/* Rotate x left by n bits: the bits shifted out at the top are OR-ed back
 * in at the bottom. */
inline uint32_t rotate_left(uint32_t x, uint32_t n) {
    uint32_t upper = x << n;
    uint32_t lower = x >> (32-n);
    return upper | lower;
}
/* Rotate m left by the low five bits of key, add key, then XOR with key. */
inline uint32_t encrypt(uint32_t m, uint32_t key) {
    uint32_t rotated = rotate_left(m, key&31);
    uint32_t shifted = rotated + key;
    return shifted ^ key;
}
/*
 * Each work-item computes encrypt(g, keyA) * encrypt(g, keyB) for its global
 * id g; the work-group then sums its products with a tree reduction in
 * local memory and work-item 0 writes the group total to C[group id].
 *
 * Expects a work-group size that is a power of two and at most 256 (the
 * size of buf).  The products are unsigned, so they are accumulated in an
 * unsigned buffer: wrap-around is then well defined, which it is not for
 * signed int.
 */
__kernel void vecdot(uint32_t keyA, uint32_t keyB, __global int* C) {
    __local uint32_t buf[256];
    uint32_t globalId = get_global_id(0);
    int groupId = get_group_id(0);
    int localId = get_local_id(0);
    int localSz = get_local_size(0);
    buf[localId] = encrypt(globalId, keyA) * encrypt(globalId, keyB);
    barrier(CLK_LOCAL_MEM_FENCE);
    /* Halve the number of active work-items each round. */
    for (int i = localSz>>1; i; i >>= 1) {
        if (localId < i)
            buf[localId] += buf[localId + i];
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (localId == 0)
        C[groupId] = buf[0];
}

Read More +

2016-06-26

學校課程/平行程式

批改娘 10089. Print Platform Information (OpenCL)

Problem

使用 OpenCL 印出裝置訊息。請參考課程講義。

Sample Input

no input

Sample Output

1 platform found
Platform Name NVIDIA CUDA
Platform Vendor NVIDIA Corporation
Platform Version OpenCL 1.2 CUDA 7.5.23
Platform Profile FULL_PROFILE
3 Devices
0 CPU Devices
3 GPU Devices
Device name GeForce GTX 980 Ti
Global memory size 6442254336
Local memory size 49152
# of compute units 22
max # of work items in a work group 1024
Device name GeForce GTX 970
Global memory size 4294770688
Local memory size 49152
# of compute units 13
max # of work items in a work group 1024
<other ...>

備註

可參考上方的題解頁面

解法

#include <stdio.h>
#include <assert.h>
#include <inttypes.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <CL/cl.h>
#define MAXDEV 8
#define MAXPLAT 8
#define MAXN 2048
#define MAXSTRBUF 1024
// -- start working with OpenCL
#define CheckFailAndExit(status) \
        if (status != CL_SUCCESS) {\
            fprintf(stderr, "Error: Line %u in file %s\n\n", __LINE__, __FILE__); \
        }
#define clPrint(fmt, ...) fprintf(stdout, fmt, ##__VA_ARGS__)
int infoGPU() {
    size_t 				logLen;
    char 				logBuf[MAXSTRBUF];
    cl_int				clStat;
    cl_uint				clPlatN, clDevN;
    cl_platform_id		clPlatIDs[MAXPLAT], clPlatID;
    cl_device_id		clDevIDs[MAXDEV], clDevID;
    
    clGetPlatformIDs(MAXPLAT, clPlatIDs, &clPlatN);
    clPrint("%d platform found\n", clPlatN);
    for (int i = 0; i < clPlatN; i++) {
        clPlatID = clPlatIDs[i];
        const cl_platform_info clPlatInfoQuery[] = {
            CL_PLATFORM_NAME, CL_PLATFORM_VENDOR, 
            CL_PLATFORM_VERSION, CL_PLATFORM_PROFILE
        };
        const cl_platform_info clPlatDevInfoQuery[] = {
            CL_DEVICE_TYPE_ALL, CL_DEVICE_TYPE_CPU, 
            CL_DEVICE_TYPE_GPU
        };
        const cl_platform_info clDevInfoQuery[] = {
            CL_DEVICE_GLOBAL_MEM_SIZE, CL_DEVICE_LOCAL_MEM_SIZE,
            CL_DEVICE_MAX_COMPUTE_UNITS, CL_DEVICE_MAX_WORK_GROUP_SIZE
        };
        const char const queryPlatFmt[][32] = {
            "Platform Name %s\n", "Platform Vendor %s\n", 
            "Platform Version %s\n", "Platform Profile %s\n"
        };
        const char const queryPlatDevFmt[][32] = {
            "%u Devices\n", "%u CPU Devices\n",
            "%u GPU Devices\n"
        };
        const char const queryDevFmt[][64] = {
            "Global memory size %lld\n", "Local memory size %lld\n",
            "# of compute units %lld\n", "max # of work items in a work group %lld\n"
        };
        const int platQcnt = sizeof(clPlatInfoQuery) / sizeof(cl_platform_info);
        const int platDevQcnt = sizeof(clPlatDevInfoQuery) / sizeof(cl_platform_info);
        const int devQcnt = sizeof(clDevInfoQuery) / sizeof(cl_platform_info);
        
        for (int j = 0; j < platQcnt; j++) {
            clStat = clGetPlatformInfo(clPlatID, clPlatInfoQuery[j], MAXSTRBUF, logBuf, &logLen);
            CheckFailAndExit(clStat);
            clPrint(queryPlatFmt[j], logBuf);			
        }
        
        for (int j = 0; j < platDevQcnt; j++) {
            clGetDeviceIDs(clPlatID, clPlatDevInfoQuery[j], MAXDEV, clDevIDs, &clDevN);
            clPrint(queryPlatDevFmt[j], clDevN);
            if (clPlatDevInfoQuery[j] == CL_DEVICE_TYPE_ALL)
                continue;
            for (int k = 0; k < clDevN; k++) {
            	clDevID = clDevIDs[k];
            	clStat = clGetDeviceInfo(clDevID, CL_DEVICE_NAME, MAXSTRBUF, logBuf, &logLen);
            	CheckFailAndExit(clStat);
            	clPrint("Device name %s\n", logBuf);
            	for (int p = 0; p < devQcnt; p++) {
            		cl_ulong clVal;
            		clStat = clGetDeviceInfo(clDevID, clDevInfoQuery[p], sizeof(cl_ulong), &clVal, NULL);
            		CheckFailAndExit(clStat);
            		clPrint(queryDevFmt[p], clVal);
                }
            }
        }
    }
}
// Entry point: prints the OpenCL platform/device report and exits with
// status 0. The value returned by infoGPU is not used.
int main(void) {
    (void) infoGPU();
    return 0;
}

Read More +

2016-04-30

學校課程/平行程式

批改娘 10093. Fast Matrix Chain Multiplication (OpenMP)

題目描述

計算矩陣鏈乘積 $A_{r_1, c_1} B_{r_2, c_2} \cdots$ 的值。

sample.c

// Generates an R x C matrix stored in row-major order.
// Every entry comes from the recurrence x = (x*x + seed + row + col) % (R*C),
// starting from x = 2 and evaluated in 32-bit unsigned arithmetic.
// The buffer is obtained from malloc and returned to the caller.
uint32_t* rand_gen(uint32_t seed, int R, int C) {
    const uint32_t modulus = R*C;
    uint32_t *matrix = (uint32_t *) malloc(sizeof(uint32_t) * R*C);
    uint32_t *cell = matrix;
    uint32_t state = 2;
    for (int row = 0; row < R; row++) {
        for (int col = 0; col < C; col++) {
            state = (state * state + seed + row + col) % modulus;
            *cell++ = state;
        }
    }
    return matrix;
}
// Multiplies x by the constant 2654435761 and keeps the low 32 bits.
uint32_t hash(uint32_t x) {
    const uint32_t multiplier = 2654435761LU;
    return x * multiplier;
}
// Folds an r x c row-major matrix into one 32-bit signature: starting from 0,
// each element in storage order is added to the running value and the sum is
// passed through hash().
uint32_t signature(uint32_t *A, int r, int c) {
    // Row-major order visits the elements exactly in buffer order, so one
    // linear pass is equivalent to the row/column double loop.
    const size_t total = (r > 0 && c > 0) ? (size_t) r * (size_t) c : 0;
    uint32_t acc = 0;
    for (size_t idx = 0; idx < total; idx++)
        acc = hash(acc + A[idx]);
    return acc;
}

輸入格式

有多組測資，每組第一行會有一個整數 $N$ 表示矩陣鏈上有 $N$ 個矩陣，第二行上會有 $N+1$ 個整數 $Z_i$，表示矩陣鏈的每一個行列大小，例如當 $N = 3$ 時，輸入 10 30 5 60 表示矩陣 $A_{10, 30} B_{30, 5} C_{5, 60}$ 相乘。第三行會有 $N$ 個整數，第 $i$ 個整數 $S_i$ 為第 $i$ 個矩陣生成種子。

$1 \le N \le 100$
$1 \le Z_i \le 1000$
$0 \le S_i \le 32767$

輸出格式

對於每組測資輸出一行，將最後的矩陣結果輸出雜湊值。

範例輸入

2
2 2 2
2 5
3
10 30 5 60
0 0 0
3
1 5 20 1
0 0 0
3
5 10 20 35
0 0 0 
6
30 35 15 5 10 20 25
0 0 0 0 0 0

範例輸出

備註

輸出請用 printf("%u", answer);，計算 Dynamic Programming 時，請使用 64-bit 型態紀錄，因為最慘情況下會超過 32-bit 所能容納的範圍。

Solution

充分地運用當初在演算法學到的，計算矩陣鏈乘積的最少乘法數，接著再針對優化後的乘法順序進行平行。平行可以從單純矩陣乘法，又或者針對可以同時進行矩陣乘法操作開始。甚至可以套用編譯器學到的最少暫存器算法，想辦法從少量的空間換取好的快取效果。

下述程式只針對矩陣乘法計算平行，而非兩個乘法同時進行，其一原因在於很難保證 load balance。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <assert.h>
// Capacity of the per-chain arrays below.
#define MAXN 128
// Unroll factor of the dot-product loop in multiplyAndDel.
#define LOOP_UNROLL 8
// Initial "no split found yet" cost used while minimising dp[l][r].
#define INF (1LL<<60)
// N: number of matrices in the chain; SZ[i], SZ[i+1]: rows and columns of
// matrix i; SEED[i]: generator seed of matrix i.
int N, SZ[MAXN], SEED[MAXN];
// dp[l][r]: smallest multiplication cost found for the sub-chain l..r.
long long dp[MAXN][MAXN] = {};
// argdp[l][r]: split index k that achieved dp[l][r].
int argdp[MAXN][MAXN];
// Generates an R x C row-major matrix from seed c.
// Entries follow x = (x*x + c + row + col) % (R*C), starting from x = 2 and
// evaluated in 32-bit unsigned arithmetic. The buffer comes from malloc and
// is returned to the caller; allocation failure trips the assert.
uint32_t* rand_gen(uint32_t c, int R, int C) {
    uint32_t *matrix = (uint32_t *) malloc(sizeof(uint32_t) * R*C);
    assert(matrix != NULL);
    const uint32_t modulus = R*C;
    uint32_t state = 2;
    uint32_t *out = matrix;
    for (int row = 0; row < R; row++) {
        for (int col = 0; col < C; col++, out++) {
            state = (state * state + c + row + col) % modulus;
            *out = state;
        }
    }
    return matrix;
}
// Returns the product of A (r x rc) and B (rc x c) as a newly malloc'ed
// r x c row-major matrix, with every sum taken modulo 2^32.
// Both inputs are consumed: A and B are freed before returning, so they must
// be malloc'ed buffers the caller no longer needs.
//
// B is first transposed into tB so that each output element is a dot product
// of two contiguous runs (a row of A and a row of tB). The rows of the result
// are distributed over threads by the OpenMP pragma.
uint32_t* multiplyAndDel(uint32_t *A, uint32_t *B, int r, int rc, int c) {
    // Ask for at least one element so that an empty operand cannot make
    // malloc(0) return NULL and trip the asserts below.
    const size_t outCnt = (r > 0 && c > 0) ? (size_t) r * (size_t) c : 0;
    const size_t tbCnt = (rc > 0 && c > 0) ? (size_t) rc * (size_t) c : 0;
    uint32_t *C = (uint32_t *) malloc(sizeof(uint32_t) * (outCnt ? outCnt : 1));
    uint32_t *tB = (uint32_t *) malloc(sizeof(uint32_t) * (tbCnt ? tbCnt : 1));
    assert(C != NULL);
    assert(tB != NULL);
    for (int i = 0; i < rc; i++) {
        for (int j = 0; j < c; j++)
            tB[j*rc + i] = B[i*c + j];
    }
    free(B);
    #pragma omp parallel for
    for (int i = r-1; i >= 0; i--) {
        for (int j = c-1; j >= 0; j--) {
            uint32_t sum = 0;
            uint32_t *a = &A[i*rc], *b = &tB[j*rc];
            int k = rc;
            // Duff's device: jump into the middle of the unrolled body to
            // handle the rc % LOOP_UNROLL leftover terms, then continue in
            // full groups. The case labels are written for LOOP_UNROLL == 8.
            // Without the k > 0 guard an empty inner dimension would enter
            // case 0 and read eight elements past both rows.
            if (k > 0) {
                switch (k % LOOP_UNROLL) {
                    case 0: do { sum += *a * *b, a++, b++;
                    case 7: sum += *a * *b, a++, b++;
                    case 6: sum += *a * *b, a++, b++;
                    case 5: sum += *a * *b, a++, b++;
                    case 4: sum += *a * *b, a++, b++;
                    case 3: sum += *a * *b, a++, b++;
                    case 2: sum += *a * *b, a++, b++;
                    case 1: sum += *a * *b, a++, b++;
                    } while ((k -= LOOP_UNROLL) > 0);
                }
            }
            C[i*c + j] = sum;
        }
    }
    free(A), free(tB);
    return C;
}
// Multiplies x by the constant 2654435761 and keeps the low 32 bits.
uint32_t hash(uint32_t x) {
    const uint32_t multiplier = 2654435761LU;
    return x * multiplier;
}
// Folds an r x c row-major matrix into one 32-bit signature and then frees
// the matrix. Starting from 0, each element in storage order is added to the
// running value and the sum is passed through hash().
uint32_t signatureAndDel(uint32_t *A, int r, int c) {
    // Row-major order visits the elements exactly in buffer order, so one
    // linear pass is equivalent to the row/column double loop.
    const size_t total = (r > 0 && c > 0) ? (size_t) r * (size_t) c : 0;
    uint32_t acc = 0;
    for (size_t idx = 0; idx < total; idx++)
        acc = hash(acc + A[idx]);
    free(A);
    return acc;
}
// Evaluates the product of matrices l..r of the chain, following the split
// points recorded in argdp. A single matrix is generated from its seed; a
// longer range is split at argdp[l][r], both halves are evaluated
// recursively (left first) and then multiplied.
// The dimensions of the result are written to *mR (rows) and *mC (columns);
// the returned buffer comes from rand_gen or multiplyAndDel.
uint32_t* dfs(int l, int r, int *mR, int *mC) {
    if (l != r) {
        const int mid = argdp[l][r];
        int rowsL, colsL, rowsR, colsR;
        uint32_t *lhs = dfs(l, mid, &rowsL, &colsL);
        uint32_t *rhs = dfs(mid+1, r, &rowsR, &colsR);
        // Adjacent sub-products must agree on the shared dimension.
        assert(colsL == rowsR);
        *mR = rowsL;
        *mC = colsR;
        return multiplyAndDel(lhs, rhs, rowsL, colsL, colsR);
    }
    // Leaf: matrix l is SZ[l] x SZ[l+1].
    *mR = SZ[l];
    *mC = SZ[l+1];
    return rand_gen(SEED[l], *mR, *mC);
}
int main() {
    while (scanf("%d", &N) == 1) {
        for (int i = 0; i <= N; i++)
            scanf("%d", &SZ[i]);
        for (int i = 0; i < N; i++)
            scanf("%d", &SEED[i]);
        memset(dp, 0, sizeof(dp));
        for (int i = 1; i <= N; i++) {
            for (int j = 0; j+i < N; j++) {
                int l = j, r = j+i;
                dp[l][r] = INF;
                for (int k = l; k < r; k++) {
                    long long t = dp[l][k] + dp[k+1][r] + (long long) SZ[l] * SZ[k+1] * SZ[r+1];
                    if (t < dp[l][r])
                        dp[l][r] = t, argdp[l][r] = k;
                }
            }
        }
        int retR, retC;
        uint32_t *retM;
        uint32_t hval;
        retM = dfs(0, N-1, &retR, &retC);
        hval = signatureAndDel(retM, retR, retC);
        printf("%u\n", hval);
 
        long long test = 0;
        for (int i = 1; i < N; i++) {
            test += SZ[0] * SZ[i] * SZ[i+1];
        }
        fprintf(stderr, "best %lld, origin %lld, %lf\n", dp[0][N-1], test, dp[0][N-1]*1.f / test);
    }
    return 0;
}

Read More +