批改娘 10080. Fast Matrix Multiplication (pthread)

contents

  1. 1. 題目描述
    1. 1.1. matrix.h
    2. 1.2. matrix.c
    3. 1.3. main.c
  2. 2. 輸入格式
  3. 3. 輸出格式
  4. 4. 範例輸入
  5. 5. 範例輸出
  6. 6. 解釋
  7. 7. Makefile
  8. 8. Solution
    1. 8.1. matrix.h
    2. 8.2. matrix.c

題目描述

相信不少人都已經實作所謂的矩陣乘法,計算兩個方陣大小為 $N \times N$ 的矩陣 $A, B$。為了方便起見,提供一個偽隨機數的生成,減少在輸入處理浪費的時間,同時也減少上傳測資的辛苦。

根據種子 $c = S1$ 生成矩陣 $A$,種子 $c = S2$ 生成矩陣 $B$,計算矩陣相乘 $A \times B$,為了方便起見,使用 hash 函數進行簽章,最後輸出一個值。由於會牽涉到 overflow 問題,此題作為快取實驗就不考慮這個,overflow 問題都會相同。

matrix.h

1
void multiply(int N, unsigned long A[][2048], unsigned long B[][2048], unsigned long C[][2048]);

matrix.c

1
2
3
4
5
6
7
8
9
10
11
12
#include "matrix.h"
void multiply(int N, unsigned long A[][2048], unsigned long B[][2048], unsigned long C[][2048]) {
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
unsigned long sum = 0; // overflow, let it go.
for (int k = 0; k < N; k++)
sum += A[i][k] * B[k][j];
C[i][j] = sum;
}
}
}

main.c

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#include <stdio.h>
#include "matrix.h"
// #define DEBUG
#define UINT unsigned long
#define MAXN 2048
void rand_gen(UINT c, int N, UINT A[][MAXN]) {
UINT x = 2, n = N*N;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
x = (x * x + c + i + j)%n;
A[i][j] = x;
}
}
}
void print_matrix(int N, UINT A[][MAXN]) {
for (int i = 0; i < N; i++) {
fprintf(stderr, "[");
for (int j = 0; j < N; j++)
fprintf(stderr, " %u", A[i][j]);
fprintf(stderr, " ]\n");
}
}
UINT hash(UINT x) {
return (x * 2654435761LU);
}
UINT signature(int N, UINT A[][MAXN]) {
UINT h = 0;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++)
h = hash(h + A[i][j]);
}
return h;
}
static UINT A[MAXN][MAXN], B[MAXN][MAXN], C[MAXN][MAXN];
int main() {
int N, S1, S2;
while (scanf("%d %d %d", &N, &S1, &S2) == 3) {
rand_gen(S1, N, A);
rand_gen(S2, N, B);
multiply(N, A, B, C);
#ifdef DEBUG
print_matrix(N, A);
print_matrix(N, B);
print_matrix(N, C);
#endif
printf("%u\n", signature(N, C));
}
return 0;
}

輸入格式

多組測資,每組測資一行三個整數 $N, S1, S2$

  • $1 \le N \le 1000$
  • $0 \le S1, S2 \le 65536$

輸出格式

每組測資輸出一行,一個整數 signature 的結果。

範例輸入

1
2
2 2 5
2 2 5

範例輸出

1
2
573770929
573770929

解釋

範例輸入產生 $2 \times 2$ 的矩陣。

$$A = \begin{bmatrix} 2 & 3\\ 0 & 0 \end{bmatrix} , B = \begin{bmatrix} 1 & 3\\ 3 & 0 \end{bmatrix} , A \times B = \begin{bmatrix} 11 & 6\\ 0 & 0 \end{bmatrix}$$

Makefile

1
2
matrix.out: matrix.h matrix.c main.c
gcc -std=c99 -O2 matrix.h matrix.c main.c -o matrix -lpthread -lm

Solution

針對矩陣乘法平行很簡單,在 CPU 上由於需要搬運資料,不一定開的 thread 數量接近 core 數量就一定最好。而在平行的部分要把握資料局部性的加速,以下實作根據最終答案進行 row-based 策略切割,每一個 thread 計算那一個 row 的所有答案。

matrix.h

1
2
3
#define UINT unsigned long
void multiply(int N, unsigned long A[][2048], unsigned long B[][2048], unsigned long C[][2048]);

matrix.c

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#include "matrix.h"
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define LOOP_UNROLL 8
#define MAXTHREAD 8
#define MAXN 2048
typedef struct Argu {
int l, r, N;
UINT *A, *B, *C;
} Argu;
static void transpose(int N, UINT A[][2048]) {
UINT t;
for (int i = 0; i < N; i++) {
for (int j = i+1; j < N; j++)
t = A[i][j], A[i][j] = A[j][i], A[j][i] = t;
}
}
void* thread_multiply(void *arg) {
Argu data = * ((Argu *) arg);
int l = data.l, r = data.r;
int N = data.N;
UINT *A = data.A, *B = data.B, *C = data.C;
UINT stkA[MAXN];
for (int i = l; i <= r; i++) {
memcpy(stkA, A + (i * MAXN), sizeof(UINT) * N);
for (int j = 0; j < N; j++) {
UINT sum = 0;
// UINT *a = A + (i * MAXN), *b = B + (j * MAXN);
UINT *a = stkA, *b = B + (j * MAXN);
int k;
for (k = 0; k+LOOP_UNROLL < N; k += LOOP_UNROLL) {
sum += *a * *b, a++, b++;
sum += *a * *b, a++, b++;
sum += *a * *b, a++, b++;
sum += *a * *b, a++, b++;
sum += *a * *b, a++, b++;
sum += *a * *b, a++, b++;
sum += *a * *b, a++, b++;
sum += *a * *b, a++, b++;
}
for (; k < N; k++)
sum += *a * *b, a++, b++;
*(C + (i * MAXN + j)) = sum;
}
}
return NULL;
}
void multiply(int N, UINT A[][2048], UINT B[][2048], UINT C[][2048]) {
transpose(N, B);
pthread_t *threads;
threads = (pthread_t *) malloc(MAXTHREAD * sizeof(pthread_t));
for (int i = 0; i < MAXTHREAD; i++) {
Argu *data = (Argu *) malloc(sizeof(Argu));
data->l = i * N / MAXTHREAD;
data->r = (i+1) * N / MAXTHREAD - 1;
data->N = N;
data->A = &A[0][0], data->B = &B[0][0], data->C = &C[0][0];
pthread_create(&threads[i], NULL, thread_multiply, (void *) (data));
}
for (int i = 0; i < MAXTHREAD; i++)
pthread_join(threads[i], NULL);
return ;
}