資料庫正規化 Database BCNF

1. Problem
2. Sample Input
3. Sample Output
4. Solution

Problem

stackoverflow : Decomposing a relation into BCNF.

資料庫講到正規化，針對 1NF, 2NF, 3NF 的做法。正規化的目的
－要避免資料重複或相互矛盾的情形，並使資料庫在使用時能更有效率、更容易維護。

關於 1NF, 2NF, 3NF 的細節操作就上網查吧。也就是在還沒正規化前，可能會遭遇到

修改一個 f(x) = y 值時，可能修改非常大量的 row，因為有可能 f(x, z) = y，無用相依關係的 z 很多。
刪除某個元素時，造成 y 值沒有被任何相依，導致 y 的資訊無法存入資料庫。 … 如此類推。

現在給定 relation 和 functional dependencies，正規化它們！

舉個例子：(直接從 stackoverflow 的例子來說)

Relation : R(A, C, B, D, E)
Functional dependencies (FD): A -> B, C -> D

Functional dependencies 簡單來說就是一個決定性的一對多、或者是一對一的相依關係，可以根據 LHS (left hand side) 決策 RHS (right hand side)，當然 function dependencies 的訊息只有部分，也暗示著具有遞移關係。因此例如 A->B, B->C 則可以推論出 A->C 。

定義 X+ 為 X 的遞移閉包，因此假設知道 A->B, B->C 則 {A}+ = {A, B, C} ，假設知道 AB->C 則 {A, B}+ = {A, B, C}

回到例子，每一步要找到違反 BCNF 規則 (violates BCNF) 的 R’ 進行拆分。怎麼樣才算違反 BCNF 規則？可以找到一個 X != X+ != [R’ all attributes] ，就可以對 R’ 進行拆分。

Step 1:
S = {ABCDE}       
    R' = ABCDE, 
    find {A} != {A, B} != {A, B, C, D, E}, 
    R1 = (X+) = {A, B}
    R2 = R' - ((X+) - X) = {A, C, D, E}
Step 2:
S = {ACDE, AB}
    R' = ACDE,
    find {C} != {C, D} != {A, C, D, E}, 
    R1 = (X+) = {C, D}
    R2 = R' - ((X+) - X) = {A, C, E}
Step 3:
S = {ACE, AB, CD}
    // NOT FOUND END.

在程式實作 (非人工) 的時候，如何找到 X 滿足上述的條件很重要，由於 X 是所有變數的 subset 情況，相當於 O(2ⁿ) ，很多項目是無用的。假如 R’ = ABE ，針對 X = AC 是無效的操作，基本上只要拿所有 function dependencies 的 X = LHS 進行測試即可？這貪心的做法應該是對的。

BCNF_Decompose(R)
    find X s.t.: X != (X+) != [all attributes]
    if (not found) then “R is in BCNF”
    let Y = (X+) - X
    let Z = [all attributes] - (X+)
    decompose R into R1(X union Y) and R2(X union Z) 
    continue to decompose recursively R1 and R2

說真的，老師上課講的練習答案還有錯，講算法時也不夠確切，真不知道教授怎麼教的，還是我沒認真上課。當教授發現全班答題有極端的相似錯誤時，到底是我們學不好？還是他沒教好？教授一點都不會反思。

Sample Input

5 2
A->B
C->D
6 3
AC->B
C->D
A->EF
6 6
AB->C
BC->AD
D->E
CF->B
BC->A
BC->D 
5 3
AB->C
DE->C
B->E

Sample Output

A->B
S = {AB, ACDE}
C->D
S = {AB, CD, ACE}
  
C->D
S = {CD, ABCEF}
A->EF
S = {ABC, CD, AEF}
  
AB->C
S = {ABCDE, ABF}
D->E
S = {ABCD, DE, ABF}
  
AB->C
S = {ABD, ABCE}
B->E
S = {ABC, ABD, BE}

Solution

#include <stdio.h>
#include <string.h>
#include <map>
#include <set>
#include <queue>
#include <iostream>
#include <sstream>
#include <algorithm>
#include <assert.h>
using namespace std;
#define INF 0x3f3f3f3f
// stackoverflow Database: Decomposing a relation into BCNF
void parsingStmt(string s, string &l, string &r) {
    size_t pos = s.find("->");
    l = s.substr(0, pos);
    r = s.substr(pos + 2);
}
int toBitmask(string s) {
    int ret = 0;
    for (int i = 0; i < s.length(); i++) {
        assert(s[i] >= 'A' && s[i] <= 'Z');
        ret |= 1<<(s[i] - 'A');
    }
    return ret;
}
void printBitmask(int state) {
    for (int i = 0; (1<<i) <= state; i++) {
        if ((state>>i)&1)
            putchar(i + 'A');
    }
}
vector<int> LHS, RHS;
int findFD(int i) {
    int ss = i;
    for (int j = 0; j < LHS.size(); j++) {
        if ((LHS[j]&i) == LHS[j])
            ss |= RHS[j];
    }
    if (ss != i)
    	return findFD(ss);
    return ss;
}
vector<int> decomposing(vector<int> state, int fdlhs, int fdrhs) {
    vector<int> ret;
    for (int i = 0; i < state.size(); i++) { // X->Y
        int X = fdlhs, Y = fdrhs&state[i];
        if (Y != state[i] && (X&state[i]) == X && Y && X && X != Y) {
            ret.push_back(state[i]^X^Y);
            ret.push_back(Y);
        } else {
            ret.push_back(state[i]);
        }
    }
    sort(ret.begin(), ret.end());
    ret.resize(unique(ret.begin(), ret.end()) - ret.begin());
    return ret;
}
void printResult(vector<int> &ret) {
    printf("S = {");
    for (int i = 0; i < ret.size(); i++) {
        printBitmask(ret[i]);
        if (i != ret.size() - 1) {
            printf(", ");
        }
    }
    puts("}");
}
int main() {
    int n, m;
    string s;
    while (cin >> n >> m) {
        LHS.clear(), RHS.clear();
        for (int i = 0; i < m; i++) {
            cin >> s;
            string lhs, rhs;
            parsingStmt(s, lhs, rhs);
            LHS.push_back(toBitmask(lhs));
            RHS.push_back(toBitmask(rhs));
        }
        vector<int> ret;
        int update = 0;
        ret.push_back((1<<n) - 1);
        do {
            update = (int) ret.size();
            for (int i = 0; i < m; i++) {
                ret = decomposing(ret, LHS[i], findFD(LHS[i]));
                if (ret.size() > update) {	
                    printBitmask(LHS[i]);
                	printf("->");
                	printBitmask(RHS[i]);
                	puts("");
                    printResult(ret);
                	break;
                }            
            }
            update = ret.size() > update;
        } while (update);
    }
    return 0;
}

Morris' Blog

資料庫正規化 Database BCNF

contents

Problem

Sample Input

Sample Output

Solution