dipu-bd
6/29/2017 - 1:53 AM

## Count occurrences of a sub-string within the given text using Suffix Array

Count occurrences of a sub-string within the given text using Suffix Array

``````/*====================================================================
Title: Computes Longest Repeated Substring from a given string
Complexity: O(n.log(n))
Author : Sudipto Chandra (Dipu)
=====================================================================*/
#include <bits/stdc++.h>
using namespace std;
#define mem(a,b) memset(a, b, sizeof(a))
#define loop(i, x) for(__typeof((x).begin()) i=(x).begin(); i!=(x).end(); ++i)
#define rloop(i, x) for(__typeof((x).rbegin()) i=(x).rbegin(); i!=(x).rend(); ++i)
/*------------------------------------------------------------------------------------*/

int test, cas = 1;

const int SIZ = 10000050; // maximum possible size

int n; // text length
int m; // pattern length
char T[SIZ]; // text
char P[SIZ]; // pattern
int SA[SIZ], tempSA[SIZ]; // the sorted suffixes
int RA[SIZ], tempRA[SIZ]; // ranks of suffix array
int L[SIZ]; // used in counting sort

inline int getRA(int i)
{
return (i < n) ? RA[i] : 0;
}

{
mem(L, 0);
// count frequencies
for(int i = 0; i < n; ++i)
{
L[getRA(i + k)]++;
}
// save first index of every characters
int mx = max(n, 130);
for(int i = 0, s = 0; i < mx; ++i)
{
int x = L[i];
L[i] = s;
s += x;
}
// build sorted tempSA
for(int i = 0; i < n; ++i)
{
int& x = L[getRA(SA[i] + k)];
tempSA[x++] = SA[i];
}
// copy tempSA to SA
for(int i = 0; i < n; ++i)
{
SA[i] = tempSA[i];
}
}
// text must ends with a \$
void buildSA()
{
// initialize
n = strlen(T);
T[n++] = '\$', T[n] = 0; // append \$
for(int i = 0; i < n; ++i)
{
SA[i] = i;
RA[i] = T[i];
}

// algorithm loop
for(int k = 1; k < n; k <<= 1)
{
// sort by k-th ranks
// compute new ranks
tempRA[SA[0]] = 0;
for(int i = 1, r = 0; i < n; ++i)
{
if(getRA(SA[i-1]) != getRA(SA[i])) {
r++;
}
else if(getRA(SA[i-1]+k) != getRA(SA[i]+k)) {
r++;
}
tempRA[SA[i]] = r;
}
for(int i = 0; i < n; ++i)
{
RA[i] = tempRA[i];
}
if(RA[SA[n - 1]] == n - 1) break;
}
}

// Count the number of times a given pattern P appears
// in the string T. Complexity: O(n.log(n))
int countOccurences()
{
m = strlen(P);
SA[n] = n;

int first, last;
int low, high, mid;

// find lower bound
low = 0, high = n;
while(low < high)
{
mid = (low + high) >> 1;
printf("lower: %d %d %d\n", low, mid, high);
if(strncmp(P, T + SA[mid], m) <= 0)
{
high = mid;
}
else
{
low = mid + 1;
}
}
printf("lower: %d %d %d\n", low, mid, high);
first = low;

// find higher bound
low = 0, high = n;
while(low < high)
{
mid = (low + high) >> 1;
printf("upper: %d %d %d\n", low, mid, high);
if(strncmp(P, T + SA[mid], m) < 0)
{
high = mid;
}
else
{
low = mid + 1;
}
}
printf("upper: %d %d %d\n", low, mid, high);
last = low;

printf("%d %d\n", first, last);
return last - first;
}

int main()
{
/*
GATAGACA
GA

abcabxabcd
abc

mississippi
i
*/

printf("Text: ");
scanf("%s", T);
printf("Pattern: ");
scanf("%s", P);

buildSA();

for(int i = 0; i < n; ++i)
{
printf("%2d | %2d | %s\n", i, SA[i], T + SA[i]);
}

int ans = countOccurences();

printf("Pattern found %d times\n", ans);

return 0;
}
``````