2015-08-04

HDU 5307 - He is Flying

1. Problem
2. Sample Input
3. Sample Output
4. Solution

Problem

題目連結，加速以下的程序計算，下方程式需要 $O(N^3)$，若用前綴維護總和也需要 $O(N^2)$。

for l = 0 to n-1
    for r = l+1 to n-1
        sum = 0
        for k = l to r
            sum += A[k]
        ret[sum] += r-l+1

最後輸出所有 ret[sum] 的對應結果。

Sample Input

Sample Output

Solution

由於是一個很樸素的計算，為了加速運算不套點資料結構和算法是不行的。看到對於每一個結果都要輸出，因此可以想到快速傅立葉 FFT 的旋積計算，接下來就要思考如何構造多項式 (向量)。

假設前 $i$ 個數字的前綴和$s_i$，為了要計數反應在係數，而索引值要反應在項數，因此得到兩個 $x$ 多項式相乘，若要統計區間 $[l, r]$ 的總和，則反應在$(i - j) x^{s_i} \times x^{- s_j} = (i-j) x^{s_i - s_j}$。但這樣的計算無法一次完成，因此要拆成兩次計算，分別得到$i x^{s_i - s_j}$ 和$-j x^{s_i - s_j}$。

明顯地前者構造$(\sum i x^s_i) \times (\sum x^{-s_j})$，後者構造$(\sum x^s_i) \times (\sum -j x^{-s_j})$，利用快速傅立葉 $O(n \log n)$ 計算多項式相乘，隨後相扣即可。

特別注意到總和 0 要特別判斷，因為構造法無法計算。此外這題非常講究精準度，可以利用 NTT/FNT 全部都在整數運算，又或者使用 FFT 在 double 形態下完成，特別小心 FFT 通常會利用角度疊加 (合角公式) 來加速運算，但不幸地這裡會遇到精準度誤差，必須採用 cos, sin 全建表。其他人容易遇到要用 long double 取代 double 計算是因為這種寫法的問題。

FFT

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <vector>
#include <algorithm>
#include <complex>
using namespace std;
template<typename T> class TOOL_FFT {
public:
typedef unsigned int UINT32;
#define MAXN 262144
    complex<T> p[2][MAXN];
    int pre_n;
    T PI;
    TOOL_FFT() {
        pre_n = 0;
        PI = acos(-1);
    }
    int NumberOfBitsNeeded(int PowerOfTwo) {
        for (int i = 0; ; ++i) {
            if (PowerOfTwo & (1 << i)) {
                return i;
            }
        }
    }
    inline UINT32 FastReverseBits(UINT32 a, int NumBits) {
        a = ( ( a & 0x55555555U ) << 1 ) | ( ( a & 0xAAAAAAAAU ) >> 1 ) ;
        a = ( ( a & 0x33333333U ) << 2 ) | ( ( a & 0xCCCCCCCCU ) >> 2 ) ;
        a = ( ( a & 0x0F0F0F0FU ) << 4 ) | ( ( a & 0xF0F0F0F0U ) >> 4 ) ;
        a = ( ( a & 0x00FF00FFU ) << 8 ) | ( ( a & 0xFF00FF00U ) >> 8 ) ;
        a = ( ( a & 0x0000FFFFU ) << 16 ) | ( ( a & 0xFFFF0000U ) >> 16 ) ;
        return a >> (32 - NumBits);
    }
    
    void FFT(bool InverseTransform, vector<complex<T> >& In, vector<complex<T> >& Out) {
        // simultaneous data copy and bit-reversal ordering into outputs
        int NumSamples = In.size();
        int NumBits = NumberOfBitsNeeded(NumSamples);
        for (int i = 0; i < NumSamples; ++i) {
            Out[FastReverseBits(i, NumBits)] = In[i];
        }
        // the FFT process
        for (register int i = 1; i <= NumBits; i++) {
            int BlockSize = 1<<i, BlockEnd = BlockSize>>1, BlockCnt = NumSamples/BlockSize;
            for (register int j = 0; j < NumSamples; j += BlockSize) {
                complex<T> *t = p[InverseTransform];
                for (register int k = 0; k < BlockEnd; k++, t += BlockCnt) {
                    complex<T> a = (*t) * Out[k+j+BlockEnd];
                    Out[k+j+BlockEnd] = Out[k+j] - a;
                    Out[k+j] += a;
                }
            }
        }
        // normalize if inverse transform
        if (InverseTransform) {
            for (int i = 0; i < NumSamples; ++i) {
                Out[i] /= NumSamples;
            }
        }
    }
    void prework(int n) {
        if (pre_n == n)
            return ;
        pre_n = n;
        p[0][0] = complex<T>(1, 0);
        p[1][0] = complex<T>(1, 0);
        for (register int i = 1; i < n; i++) {
            p[0][i] = complex<T>(cos(2*i*PI / n ) , sin(2*i*PI / n ));
            p[1][i] = complex<T>(cos(2*i*PI / n ) , -sin(2*i*PI / n ));
        }
    }
    vector<T> convolution(complex<T> *a, complex<T> *b, int n) {
        prework(n);
        vector< complex<T> > s(a, a+n), d1(n), d2(n), y(n);
        vector<T> ret(n);
        FFT(false, s, d1);
        s[0] = b[0];
        for (int i = 1, j = n-1; i < n; ++i, --j)
            s[i] = b[j];
        FFT(false, s, d2);
        for (int i = 0; i < n; ++i) {
            y[i] = d1[i] * d2[i];
        }
        FFT(true, y, s);
        for (int i = 0; i < n; ++i) {
            ret[i] = s[i].real();
        }
        return ret;
    }
};
TOOL_FFT<double> tool;
complex<double> a[MAXN], b[MAXN];
vector<double> c;
int A[MAXN], sum[MAXN];
long long ret[MAXN], pr[262144];
int main() {
    pr[0] = 0;
    for (int i = 1; i < MAXN; i++)
        pr[i] = pr[i-1] + (long long)i*(i+1)/2;
    int testcase, n, m, s;
    scanf("%d", &testcase);
    while (testcase--) {
        scanf("%d", &n);
        for (int i = 1; i <= n; i++)
            scanf("%d", &A[i]);
        sum[0] = 0;
        for (int i = 1; i <= n; i++)
            sum[i] = sum[i-1] + A[i];
        
        s = sum[n];
        memset(ret, 0, sizeof(ret[0]) * (s+1));
        for (m = 1; m <= (s<<1); m <<= 1);
        
        memset(a, 0, sizeof(a[0])*m);
        memset(b, 0, sizeof(b[0])*m);
        for (int i = 1; i <= n; i++) {
            a[sum[i]] += complex<double>(i, 0);
            b[sum[i-1]] += complex<double>(1, 0);
        }
        
        c = tool.convolution(a, b, m);
        for (int i = 1; i < m; i++)
            ret[i] += round(c[i]);
        memset(a, 0, sizeof(a[0])*m);
        memset(b, 0, sizeof(b[0])*m);
        for (int i = 1; i <= n; i++) {
            a[sum[i]] += complex<double>(1, 0);
            b[sum[i-1]] += complex<double>(i-1, 0);
        }
        c = tool.convolution(a, b, m);
        for (int i = 1; i <= s; i++)
            ret[i] -= round(c[i]);
        for (int i = 1, z = 0; i <= n+1; i++) {
            if (i == n+1 || A[i] != 0)
                ret[0] += pr[z], z = 0;
            else
                z++;
        }        
        for (int i = 0; i <= s; i++) {
            printf("%lld\n", ret[i]);
        }
    }
    return 0;
}

NTT

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <vector>
#include <algorithm>
using namespace std;
typedef unsigned int UINT32;
typedef long long INT64;
class TOOL_NTT {
public:
#define MAXN 262144
    const INT64 P = 50000000001507329LL; // prime m = kn+1
    const INT64 G = 3;
    INT64 wn[20];
    INT64 s[MAXN], d1[MAXN], d2[MAXN], y[MAXN];
    TOOL_NTT() {
        for (int i = 0; i < 20; i++)
            wn[i] = mod_pow(G, (P-1) / (1<<i), P);
    }
    INT64 mod_mul(INT64 a, INT64 b, INT64 mod) {
        return (a*b - (long long)(a/(long double)mod*b+1e-3)*mod+mod)%mod;
        //      INT64 ret = 0;
        //      for (a = a >= mod ? a%mod : a, b = b >= mod ? b%mod : b; b != 0; b>>=1, a <<= 1, a = a >= mod ? a - mod : a) {
        //          if (b&1) {
        //              ret += a;
        //              if (ret >= mod)
        //                  ret -= mod;
        //          }
        //      }
        //      return ret;
    }
    INT64 mod_pow(INT64 n, INT64 e, INT64 m) {
        INT64 x = 1;
        for (n = n >= m ? n%m : n; e; e >>= 1) {
            if (e&1)
                x = mod_mul(x, n, m);
            n = mod_mul(n, n, m);
        }
        return x;
    }
    int NumberOfBitsNeeded(int PowerOfTwo) {
        for (int i = 0;; ++i) {
            if (PowerOfTwo & (1 << i)) {
                return i;
            }
        }
    }
    inline UINT32 FastReverseBits(UINT32 a, int NumBits) {
        a = ( ( a & 0x55555555U ) << 1 ) | ( ( a & 0xAAAAAAAAU ) >> 1 ) ;
        a = ( ( a & 0x33333333U ) << 2 ) | ( ( a & 0xCCCCCCCCU ) >> 2 ) ;
        a = ( ( a & 0x0F0F0F0FU ) << 4 ) | ( ( a & 0xF0F0F0F0U ) >> 4 ) ;
        a = ( ( a & 0x00FF00FFU ) << 8 ) | ( ( a & 0xFF00FF00U ) >> 8 ) ;
        a = ( ( a & 0x0000FFFFU ) << 16 ) | ( ( a & 0xFFFF0000U ) >> 16 ) ;
        return a >> (32 - NumBits);
    }
    void NTT(int on, INT64 *In, INT64 *Out, int n) {
        int NumBits = NumberOfBitsNeeded(n);
        for (int i = 0; i < n; ++i)
            Out[FastReverseBits(i, NumBits)] = In[i];
        for(int h = 2, id = 1; h <= n; h <<= 1, id++) {
            for(int j = 0; j < n; j += h) {
                INT64 w = 1, u, t;
                int block = h/2, blockEnd = j + h/2;
                for(int k = j; k < blockEnd; k++) {
                    u = Out[k], t = mod_mul(w, Out[k+block], P);
                    Out[k] = u + t;
                    Out[k + block] = u - t + P;
                    if (Out[k] >= P)        Out[k] -= P;
                    if (Out[k+block] >= P)  Out[k+block] -= P;
                    w = mod_mul(w, wn[id], P);
                }
            }
        }
        if (on == 1) {
            for (int i = 1; i < n/2; i++)
                swap(Out[i], Out[n-i]);
            INT64 invn = mod_pow(n, P-2, P);
            for (int i = 0; i < n; i++)
                Out[i] = mod_mul(Out[i], invn, P);
        }
    }
    void convolution(INT64 *a, INT64 *b, int n, INT64 *c) {
        NTT(0, a, d1, n);
        s[0] = b[0];
        for (int i = 1; i < n; ++i)
            s[i] = b[n-i];
        NTT(0, s, d2, n);
        for (int i = 0; i < n; i++)
            s[i] = mod_mul(d1[i], d2[i], P);
        NTT(1, s, c, n);
    }
} tool;
INT64 a[MAXN], b[MAXN], c[MAXN];
int A[MAXN], sum[MAXN];
long long ret[MAXN], pr[262144];
int main() {
    pr[0] = 0;
    for (int i = 1; i < MAXN; i++)
        pr[i] = pr[i-1] + (long long)i*(i+1)/2;
    int testcase, n, m, s;
    scanf("%d", &testcase);
    while (testcase--) {
        scanf("%d", &n);
        for (int i = 1; i <= n; i++)
            scanf("%d", &A[i]);
        sum[0] = 0;
        for (int i = 1; i <= n; i++)
            sum[i] = sum[i-1] + A[i];
        
        s = sum[n];
        memset(ret, 0, sizeof(ret[0]) * (s+1));
        for (m = 1; m <= (s<<1); m <<= 1);
        
        memset(a, 0, sizeof(a[0])*m);
        memset(b, 0, sizeof(b[0])*m);
        for (int i = 1; i <= n; i++) {
            a[sum[i]] += i;
            b[sum[i-1]] ++;
        }
        tool.convolution(a, b, m, c);
        for (int i = 1; i < m; i++)
            ret[i] += c[i];
        memset(a, 0, sizeof(a[0])*m);
        memset(b, 0, sizeof(b[0])*m);
        for (int i = 1; i <= n; i++) {
            a[sum[i]] ++;
            b[sum[i-1]] += i-1;
        }
        tool.convolution(a, b, m, c);
        for (int i = 1; i <= s; i++)
            ret[i] -= c[i];
        for (int i = 1, z = 0; i <= n+1; i++) {
            if (i == n+1 || A[i] != 0)
                ret[0] += pr[z], z = 0;
            else
                z++;
        }        
        for (int i = 0; i <= s; i++) {
            printf("%lld\n", ret[i]);
        }
    }
    return 0;
}

NTT CRT

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <vector>
#include <algorithm>
using namespace std;
typedef uint_fast32_t UINT32;
typedef long long INT64;
typedef uint_fast32_t INT32;
class TOOL_NTT {
public:
#define MAXN 262144
    //  INT64 P = 50000000001507329LL; // prime m = kn+1
    //  INT64 G = 3;
    INT32 P = 3, G = 2;
    INT32 wn[20];
    INT32 s[MAXN], d1[MAXN], d2[MAXN], c1[MAXN], c2[MAXN];
    const INT32 P1 = 998244353; // P1 = 2^23 * 7 * 17 + 1
    const INT32 G1 = 3;
    const INT32 P2 = 995622913; // P2 = 2^19 *3*3*211 + 1
    const INT32 G2 = 5;
    const INT64 M1 = 397550359381069386LL;
    const INT64 M2 = 596324591238590904LL;
    const INT64 MM = 993874950619660289LL; // MM = P1*P2
    TOOL_NTT() {
        for (int i = 0; i < 20; i++)
            wn[i] = mod_pow(G, (P-1) / (1<<i), P);
    }
    void reset(INT32 p, INT32 g) {
        P = p, G = g;
        for (int i = 0; i < 20; i++)
            wn[i] = mod_pow(G, (P-1) / (1<<i), P);
    }
    INT64 mod_mul(INT64 a, INT64 b, INT64 mod) {
        return (a*b - (long long)(a/(long double)mod*b+1e-3)*mod+mod)%mod;
        //      INT64 ret = 0;
        //      for (a = a >= mod ? a%mod : a, b = b >= mod ? b%mod : b; b != 0; b>>=1, a <<= 1, a = a >= mod ? a - mod : a) {
        //          if (b&1) {
        //              ret += a;
        //              if (ret >= mod)
        //                  ret -= mod;
        //          }
        //      }
        //      return ret;
    }
    INT64 mod_pow(INT64 n, INT64 e, INT64 m) {
        INT64 x = 1;
        for (n = n >= m ? n%m : n; e; e >>= 1) {
            if (e&1)
                x = mod_mul(x, n, m);
            n = mod_mul(n, n, m);
        }
        return x;
    }
    int NumberOfBitsNeeded(int PowerOfTwo) {
        for (int i = 0;; ++i) {
            if (PowerOfTwo & (1 << i)) {
                return i;
            }
        }
    }
    inline UINT32 FastReverseBits(UINT32 a, int NumBits) {
        a = ( ( a & 0x55555555U ) << 1 ) | ( ( a & 0xAAAAAAAAU ) >> 1 ) ;
        a = ( ( a & 0x33333333U ) << 2 ) | ( ( a & 0xCCCCCCCCU ) >> 2 ) ;
        a = ( ( a & 0x0F0F0F0FU ) << 4 ) | ( ( a & 0xF0F0F0F0U ) >> 4 ) ;
        a = ( ( a & 0x00FF00FFU ) << 8 ) | ( ( a & 0xFF00FF00U ) >> 8 ) ;
        a = ( ( a & 0x0000FFFFU ) << 16 ) | ( ( a & 0xFFFF0000U ) >> 16 ) ;
        return a >> (32 - NumBits);
    }
    void NTT(int on, INT32 *In, INT32 *Out, int n) {
        int NumBits = NumberOfBitsNeeded(n);
        for (int i = 0; i < n; ++i)
            Out[FastReverseBits(i, NumBits)] = In[i];
        for (int h = 2, id = 1; h <= n; h <<= 1, id++) {
            for (int j = 0; j < n; j += h) {
                INT32 w = 1, u, t;
                int block = h/2, blockEnd = j + h/2;
                for (int k = j; k < blockEnd; k++) {
                    u = Out[k], t = (INT64)w*Out[k+block]%P;
                    Out[k] = (u + t)%P;
                    Out[k+block] = (u - t + P)%P;
                    w = (INT64)w * wn[id]%P;
                }
            }
        }
        if (on == 1) {
            for (int i = 1; i < n/2; i++)
                swap(Out[i], Out[n-i]);
            INT32 invn = mod_pow(n, P-2, P);
            for (int i = 0; i < n; i++)
                Out[i] = (INT64)Out[i]*invn%P;
        }
    }
    INT64 crt(INT32 a, INT32 b) {
        return (mod_mul(a, M1, MM) + mod_mul(b, M2, MM))%MM;
    }
    void convolution(INT32 *a, INT32 *b, int n, INT64 *c) {
        reset(P1, G1);
        NTT(0, a, d1, n);
        s[0] = b[0];    for (int i = 1; i < n; ++i) s[i] = b[n-i];
        NTT(0, s, d2, n);
        for (int i = 0; i < n; i++) s[i] = (INT64)d1[i] * d2[i]%P;
        NTT(1, s, c1, n);
        reset(P2, G2);
        NTT(0, a, d1, n);
        s[0] = b[0];    for (int i = 1; i < n; ++i) s[i] = b[n-i];
        NTT(0, s, d2, n);
        for (int i = 0; i < n; i++) s[i] = (INT64)d1[i] * d2[i]%P;
        NTT(1, s, c2, n);
        for (int i = 0; i < n; i++)
            c[i] = crt(c1[i], c2[i]);
    }
} tool;
INT32 a[262144], b[262144];
INT64 c[262144];
int A[MAXN], sum[MAXN];
long long ret[MAXN], pr[262144];
int main() {
    pr[0] = 0;
    for (int i = 1; i < MAXN; i++)
        pr[i] = pr[i-1] + (long long)i*(i+1)/2;
    int testcase, n, m, s;
    scanf("%d", &testcase);
    while (testcase--) {
        scanf("%d", &n);
        for (int i = 1; i <= n; i++)
            scanf("%d", &A[i]);
        
        sum[0] = 0;
        for (int i = 1; i <= n; i++)
            sum[i] = sum[i-1] + A[i];
        
        s = sum[n];
        memset(ret, 0, sizeof(ret[0]) * (s+1));
        for (m = 1; m <= (s<<1); m <<= 1);
        
        memset(a, 0, sizeof(a[0])*m);
        memset(b, 0, sizeof(b[0])*m);
        for (int i = 1; i <= n; i++) {
            a[sum[i]] += i;
            b[sum[i-1]] ++;
        }
        tool.convolution(a, b, m, c);
        for (int i = 1; i < m; i++)
            ret[i] += c[i];
        memset(a, 0, sizeof(a[0])*m);
        memset(b, 0, sizeof(b[0])*m);
        for (int i = 1; i <= n; i++) {
            a[sum[i]] ++;
            b[sum[i-1]] += i-1;
        }
        tool.convolution(a, b, m, c);
        for (int i = 1; i <= s; i++)
            ret[i] -= c[i];
        for (int i = 1, z = 0; i <= n+1; i++) {
            if (i == n+1 || A[i] != 0)
                ret[0] += pr[z], z = 0;
            else
                z++;
        }
        for (int i = 0; i <= s; i++) {
            printf("%lld\n", ret[i]);
        }
    }
    return 0;
}

Morris' Blog