UVa 10476 - Spam or Not Spam

contents

  1. 1. Problem
  2. 2. Sample Input
  3. 3. Sample Output
  4. 4. Solution

Problem

判斷訊息是否為垃圾訊息。

給數組垃圾訊息、非垃圾訊息,從中統計語言的出現頻率。例如 ABCDEFG 取 k-shingle 中 k = 3,那麼總共出現 ABCBCDCDEDEF … 等。

接著按照題目給定的公式,計算輸入的訊息離垃圾訊息比較接近、還是非垃圾訊息。

n-gram 通常指的是以單詞 (word) 為切割單位,而 k-shingle 則是以字符 (char) 為切割單位。在巨量資料的開放課程中,以 shingle 去描述這樣的切割方式。

Sample Input

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
2 1 1
AAAA
BBBB CCCC
ENDMESSAGE
BBBB
ENDMESSAGE
AAAABBBB
ENDMESSAGE
AAABB
ENDMESSAGE
2 1 2
AAAA
BBBB CCCC
ENDMESSAGE
BBBB
ENDMESSAGE
AAAABBBB
ENDMESSAGE
AAABB
ENDMESSAGE
AAABB
ENDMESSAGE
0 0 0

Sample Output

1
2
3
4
5
6
7
8
Set 1:
0.21822 0.73030
non-spam
Set 2:
0.21822 0.73030
non-spam
0.21822 0.73030
non-spam

Solution

相當有趣的題目,終於把自然語言學習到的部分拿出來玩。但可惜題目沒有巨量資料那樣瘋狂,用 hash 去失真的操作,由於記憶體不夠,用 1-pass/2-pass 找 frequent items 之類的。

為了加速運算,改成 hash 會比較快,但只要一碰撞輸出機率時就會炸掉,所以還是別貿然嘗試。

特別用了 std::inner_product 來寫中間的數學運算,但仔細想想如果是兩個 std::map 要進行 std::inner_product() 仍然會因為元素不見得都會在兩個的 key set 而無法運作。std::inner_product 內建實作中只有單純的迭代,用在 vector 類型比較合適。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
// NLP, n-gram, shingle
#include <stdio.h>
#include <map>
#include <iostream>
#include <algorithm>
#include <numeric>
#include <math.h>
#include <string.h>
using namespace std;
int map_product(pair<string, int> l, pair<string, int> r) {
return l.second * r.second;
}
class Stat {
public:
int kShingle;
int sqsum;
map<string, int> count;
void init(int k) {
kShingle = k, sqsum = 0;
count.clear();
}
string toKey(string &s) { // retain for hash function
return s;
}
void add(char s[]) {
if (s[0] == '\0' || s[1] == '\0')
return;
char end = '\0';
for (int i = 2; s[i]; i++) {
swap(s[i+1], end);
string shingle = s+i-2;
swap(s[i+1], end);
count[toKey(shingle)] ++;
}
}
void flush() {
sqsum = inner_product(count.begin(), count.end(), count.begin(), 0,
plus<int>(), map_product);
}
double distance(Stat &other) {
double sum = 0;
for (map<string, int>::iterator it = count.begin(), jt;
it != count.end(); it++) {
jt = other.count.find(it->first);
if (jt != other.count.end()) {
sum += it->second * jt->second;
}
}
return (double) sum / sqrt((double) sqsum * other.sqsum);
}
} spam, nonspam, msg;
const int MAXN = 1048576;
char buf[MAXN];
int main() {
int cases = 0;
int n, m, q;
while (scanf("%d %d %d", &n, &m, &q) == 3 && n + m + q) {
while (getchar() != '\n');
spam.init(3), nonspam.init(3);
for (int i = 0; i < n; i++) {
while(gets(buf) && strcmp("ENDMESSAGE", buf))
spam.add(buf);
}
for (int i = 0; i < m; i++) {
while(gets(buf) && strcmp("ENDMESSAGE", buf))
nonspam.add(buf);
}
spam.flush(), nonspam.flush();
printf("Set %d:\n", ++cases);
for (int i = 0; i < q; i++) {
msg.init(3);
while(gets(buf) && strcmp("ENDMESSAGE", buf))
msg.add(buf);
msg.flush();
double spam_dist = msg.distance(spam);
double nonspam_dist = msg.distance(nonspam);
printf("%.5lf %.5lf\n", spam_dist, nonspam_dist);
puts(spam_dist > nonspam_dist + 1e-10 ? "spam" : "non-spam");
}
}
return 0;
}
/*
2 1 1
AAAA
BBBB CCCC
ENDMESSAGE
BBBB
ENDMESSAGE
AAAABBBB
ENDMESSAGE
AAABB
ENDMESSAGE
2 1 2
AAAA
BBBB CCCC
ENDMESSAGE
BBBB
ENDMESSAGE
AAAABBBB
ENDMESSAGE
AAABB
ENDMESSAGE
AAABB
ENDMESSAGE
0 0 0
*/