Project Euler 691

Project Euler 691

题目

Long substring with many repetitions

Given a character string \(s\), we define \(L(k,s)\) to be the length of the longest substring of \(s\) which appears at least \(k\) times in \(s\), or \(0\) if such a substring does not exist. For example, \(L(3,\text{“bbabcabcabcacba”})=4\) because of the three occurrences of the substring \(\text{“abca”}\), and \(L(2,\text{“bbabcabcabcacba”})=7\) because of the repeated substring \(\text{“abcabca”}\). Note that the occurrences can overlap.

Let \(a_n\), \(b_n\) and \(c_n\) be the \(0/1\) sequences defined by:

  • \(a_0 = 0\)
  • \(a_{2n} = a_{n}\)
  • \(a_{2n+1} = 1-a_{n}\)
  • \(b_n = \left\lfloor\dfrac{n+1}{\varphi}\right\rfloor - \left\lfloor\dfrac{n}{\varphi}\right\rfloor\) (where \(\varphi\) is the golden ratio)
  • \(c_n = a_n + b_n - 2a_nb_n\)

and \(S_n\) the character string \(c_0\ldots c_{n-1}\). You are given that \(L(2,S_{10})=5, L(3,S_{10})=2, L(2,S_{100})=14, L(4,S_{100})=6, L(2,S_{1000})=86, L(3,S_{1000}) = 45, L(5,S_{1000}) = 31\), and that the sum of non-zero \(L(k,S_{1000})\) for \(k\ge 1\) is \(2460\).

Find the sum of non-zero \(L(k,S_{5000000})\) for \(k\ge 1\).

后缀数组(Suffix Array)

后缀树组是一个将字符串\(s\)的所有后缀进行排序后得到的数组,\(sa[i]\)表示第\(i\)小的后缀\(s_{sa[i]}s_{sa[i]+1}\dots s_n\)(用第一个字符的下标代表后缀)。可以以\(O(n\log n)\)的时间复杂度求出,最优秀的算法甚至达到\(O(n)\)

\(sa\)可以以\(O(n)\)的时间推导出另一个数组\(h\),其中\(h[i](i\ge 2)\)表示后缀\(sa[i-1]\)\(sa[i]\)最长公共前缀的长度。可以认为\(h[1]=0\)

解决方案

为了方便,我们将\(S_n\)假设成是\(1\)下标的,也就是\(S_n=c_1c_2\dots c_n\),并且和原来的字符串相同。

直接使用模板计算出字符串\(S_n\)\(sa\)数组和\(h\)数组后,不难知道\(\displaystyle{L(k,S_n)=\text{argmax}_{i}\{\exists p\in[2,n-k+2]: \min_{j=p}^{p+k-2}\{h[j]\}\ge i\}}\),也就是找到最大的\(i\),使得\(h\)某一个长度为\(k-1\)的子数组都不低于\(i\)

可以知道\(L\)是单调非递增的,计算\(L\)的过程可以考虑使用区间并查集解决(这种并查集只会将相邻被填入集合中的数进行合并)。从大到小枚举可以填入的答案\(i\),并且将\(h[k]\ge i\)的所有连续段的最大值\(M\)求出来。最终,\(\forall j\le M\),如果\(L(j,S_n)\)之前没被计算过,那么\(L(j,S_n)=M\)

代码

这里的后缀数组求解使用了OI-WIKI中的模板,其基本思想是基数排序。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#include <bits/stdc++.h>
using namespace std;
typedef long long ll;

const int N = 5000000;

int c[N+4];
int sa[N+4], rk[N+4], oldrk[(N+4) << 1], id[N+4], key1[N+4], cnt[N+4];
int h[N + 4];
bool cmp(int x, int y, int w) {
return oldrk[x] == oldrk[y] && oldrk[x + w] == oldrk[y + w];
}
void run_sa(int *s,int n,int m) {
for (int i = 1; i <= n; ++i) ++cnt[rk[i] = s[i]];
for (int i = 1; i <= m; ++i) cnt[i] += cnt[i - 1];
for (int i = n; i >= 1; --i) sa[cnt[rk[i]]--] = i;
for (int w = 1, p;; w <<= 1, m = p) {
p = 0;
for (int i = n; i > n - w; --i) id[++p] = i;
for (int i = 1; i <= n; ++i)
if (sa[i] > w) id[++p] = sa[i] - w;

memset(cnt, 0, sizeof(cnt));
for (int i = 1; i <= n; ++i) ++cnt[key1[i] = rk[id[i]]];
for (int i = 1; i <= m; ++i) cnt[i] += cnt[i - 1];
for (int i = n; i >= 1; --i) sa[cnt[key1[i]]--] = id[i];
memcpy(oldrk + 1, rk + 1, n * sizeof(int));
p = 0;
for (int i = 1; i <= n; ++i)
rk[sa[i]] = cmp(sa[i], sa[i - 1], w) ? p : ++p;
if (p == n) {
for (int i = 1; i <= n; ++i) sa[rk[i]] = i;
break;
}
}
for (int i = 1, k = 0; i <= n; ++i) {
if (rk[i] == 0) continue;
if (k) --k;
while (max(sa[rk[i] - 1], i) + k <= n && s[i + k] == s[sa[rk[i] - 1] + k]) ++k;
h[rk[i]] = k;
}
}
int fa[N+4],sz[N+4],vis[N+4],mx = 0;
int find(int x){return x==fa[x]?x:fa[x]=find(fa[x]);}
void merge(int x,int y){
int u=find(x),v=find(y);
if(u!=v){
fa[v]=u;
sz[u]+=sz[v];
}
}
vector<int>pt[N+4];
int l[N+4];
int main(){
clock_t t = clock();
for(int i=1;i<=N;i++)
c[i]=__builtin_popcount(i-1)&1;

double ph=(1+sqrt(5.0))/2;
double pre=0;
for(int i=1;i<=N;i++){
double now=1.0*i/ph;
c[i]^=int(now)-int(pre);
pre=now;
}
run_sa(c,N,1);
for(int i=2;i<=N;i++) {
fa[i] = i, sz[i] = 1;
pt[h[i]].push_back(i);
}

for(int i=N,p=1;i>=1;i--){
for(int x:pt[i]){
vis[x]=1;
if(vis[x-1]) merge(x-1,x);
if(vis[x+1]) merge(x+1,x);
mx=max(mx,sz[find(x)]);
}
for(;p-1<=mx;p++)
l[p] = i;
}
ll ans = 0;
for(int i=1;i<=N;i++)
ans += l[i];
printf("%lld\n",ans);
cout << clock()-t<<endl;
}