NMN - Editorial

PROBLEM LINK:

Practice
Contest

Author: Jatin Nagpal
Tester: Taranpreet Singh
Editorialist: Jatin Nagpal

DIFFICULTY:

MEDIUM

PREREQUISITES:

Map, Strings, String Hashing, Suffix Array Construction Link-1 or Link-2. Refer Google and Youtube for more on prerequisites.

PROBLEM:

Given 2 strings A and B, you have to find a Substring of length K such that it is common in both the strings and the sum of occurrences in both the strings is maximum. If there exist multiple such substrings, then find the one which is lexicographically smaller among them.

QUICK EXPLANATION:

It’s can be divided into 4 parts:-

  1. Find Hash of all substrings of String N and M of length K in O(N+M), and create 2 arrays for N and M to store the hash values of each substring of length K at the starting index of substring.
  2. With the help of map of Frequencies of Hash values, find all the substrings which are common in both strings and have sum of occurrences in both strings maximum.
  3. Construct a suffix array for one of the string let’s say N either with Suffix tree or with any methods available, an O(N*logN*logN) also works
  4. With the help of suffix array, u can find the lexicographically smallest substring.

EXPLANATION:

The 4 parts mentioned in quick explanation are explained below as:

Part 1
Why do we need Hash?

Because with the help of a Hash, we can compare if 2 strings are equal just by looking at their Hash values, which is O(1) complexity, while without it can take upto O(N) complexity, where N is length of string.

How do we calculate all Hash of length K in O(N+M)?

Either u can follow this link, or u can first calculate the Hash of string starting from index 0 of length K, and then for every index,

Why do we need to store the Hash values of each substring of length K at the starting index of substring?

We’d need it in the part 4

It is recommended to make a double Hash instead of single Hash, since it reduces the chances of collision.

From the next Part onwards, I’d refer to Hash of All the Substrings of Length K of string N as ASLKN.

Part 2

Make 2 maps which stores the frequencies of ASLKN and ASLKM respectively.
With the help of these 2 maps, we make a single map which stores the frequencies of ASLK(N&M) collectively such that the string is common in both i.e. the hashes which have frequency value positive in ASLKN & ASLKM.
With the map ASLK(N&M) , we can know the max value of frequency. With that value, we can make a special map let’s say SP_MAP which stores only those values of map ASLK(N&M), which have max frequency.

Part 3

U need to create suffix array of one of the strings of ur choice, i.e. N or M
For creating suffix array, u can refer to Prerequsites.

Part 4

U know from Part 3, that suffix array is the sorted array of suffixes of a string, and element of array is the starting index of the suffix string.
Now, U should be able to observe that sorting suffixes of atleast length K of a string is equivalent to sorting all the Substrings of length K of string N.
With that observation, we can iterate over the suffix array from start to end, which means, we are iterating in lexicographical order, and at each point, we check that the hash value at that Index (which we stored int Part 1) is actually present in SP_MAP or not. If it’s present, u can simply stop iterating print the string starting at that index, since U’ve got the required answer.

ALTERNATE SOLUTION:

In part 3 and 4, Instead of using a Suffix Array, u could have used Rolling Hashes to reach the answer.

SOLUTIONS:

Setter's Solution
#include <bits/stdc++.h>
#define ff first
#define ss second
using namespace std;
#define MP make_pair
#define PB push_back
#define ll long long
#define int long long
#define f(i,x,n) for(int i=x;i<n;i++)
#define ld long double
#define SZ 200005
char a[2][200005];
char txt[SZ];
int suffixArr[SZ];
struct suffix 
{ 
    int index; // To store original index 
    int rank[2]; // To store ranks and next rank pair 
}; 
int cmp(struct suffix a, struct suffix b) 
{ 
    return (a.rank[0] == b.rank[0])? (a.rank[1] < b.rank[1] ?1: 0): 
               (a.rank[0] < b.rank[0] ?1: 0); 
} 
void buildSuffixArray(int n) 
{ 
    struct suffix suffixes[n]; 
    for (int i = 0; i < n; i++) 
    { 
        suffixes[i].index = i; 
        suffixes[i].rank[0] = txt[i] - 'a'; 
        suffixes[i].rank[1] = ((i+1) < n)? (txt[i + 1] - 'a'): -1; 
    } 
    sort(suffixes, suffixes+n, cmp); 
    int ind[n];
    for (int k = 4; k < 2*n; k = k*2) 
    { 
        int rank = 0; 
        int prev_rank = suffixes[0].rank[0]; 
        suffixes[0].rank[0] = rank; 
        ind[suffixes[0].index] = 0; 
        for (int i = 1; i < n; i++) 
        { 
            if (suffixes[i].rank[0] == prev_rank && suffixes[i].rank[1] == suffixes[i-1].rank[1]) 
            { 
                prev_rank = suffixes[i].rank[0]; 
                suffixes[i].rank[0] = rank; 
            } 
            else
            { 
                prev_rank = suffixes[i].rank[0]; 
                suffixes[i].rank[0] = ++rank; 
            } 
            ind[suffixes[i].index] = i; 
        } 
        for (int i = 0; i < n; i++) 
        { 
            int nextindex = suffixes[i].index + k/2; 
            suffixes[i].rank[1] = (nextindex < n)?suffixes[ind[nextindex]].rank[0]: -1; 
        }
        sort(suffixes, suffixes+n, cmp); 
    } 
    for (int i = 0; i < n; i++) 
    	suffixArr[i] = suffixes[i].index; 
    return; 
}
int Mi[2]={758620695,838709685}; // Mod-inverse Values
int M[2]={1000000007,1000000009};
int D[2]={29,31};
int n[2],k;
pair <int,int> h[2][SZ]; //Double Hash
map < pair <int,int> , pair <int,int> > mp;
void compute_hash(int i)
{
	pair <int,int> val={0,0};
	pair <int,int> power={1,1};
	f(j,0,k)
	{
		val.ff=(val.ff+power.ff*(a[i][j]-'a'+1))%M[0];
		val.ss=(val.ss+power.ss*(a[i][j]-'a'+1))%M[1];
		power.ff=(power.ff*D[0])%M[0];
		power.ss=(power.ss*D[1])%M[1];
	}
	h[i][0]=val;
	if(i==0)
		mp[val].ff++;
	else
		mp[val].ss++;
	f(j,k,n[i])
	{
		val.ff=(val.ff + power.ff*(a[i][j]-'a'+1) - (a[i][j-k]-'a'+1) + M[0])%M[0];
		val.ss=(val.ss + power.ss*(a[i][j]-'a'+1) - (a[i][j-k]-'a'+1) + M[1])%M[1];
		val.ff=(val.ff*Mi[0])%M[0];
		val.ss=(val.ss*Mi[1])%M[1];
		h[i][j-k+1]=val;
		if(i==0)
			mp[val].ff++;
		else
			mp[val].ss++;
	}
}
int32_t main()
{
	ios_base::sync_with_stdio(false);
	cin.tie(NULL);
	cout.tie(NULL);
	int t;
	cin>>t;
	while(t--)
	{
		mp.clear();
		cin>>n[0]>>n[1]>>k>>a[0]>>a[1];
		compute_hash(0);
		compute_hash(1);
		int mx=0;
		for(auto i: mp)
		{
			if(i.ss.ff>0&&i.ss.ss>0)
			{
				mx=max(mx,i.ss.ff+i.ss.ss);
			}
		}
		if(mx==0)
		{
			cout<<"-1\n";
			continue;
		}
		set <pair <int,int> > se;
		for(auto i: mp)
		{
			if(i.ss.ff>0&&i.ss.ss>0&&i.ss.ff+i.ss.ss==mx)
			{
				se.insert({i.ff.ff,i.ff.ss});
			}
		}
		int l=0;
		if(n[1]<n[0])
			l=1;
	  	f(i,0,n[0])
	    	txt[i]=a[l][i];
	  	buildSuffixArray(n[l]);
		f(i,0,n[l])
		{
			if(suffixArr[i]<=n[l]-k)
			{
				if(se.count(h[l][suffixArr[i]])==1)
				{
		      		f(j,0,k)
					{
						cout<<a[l][j+suffixArr[i]];
					}
					cout<<'\n';
		      		break;
				}
			}
		}
	}
	return 0;
} 
Tester's Solution
import java.util.*;
import java.io.*;
import java.text.*;
public class Main{
    //SOLUTION BEGIN
    //Into the Hardware Mode
    void pre() throws Exception{}
    void solve(int TC)throws Exception{
        int n = ni(), m = ni(), k = ni();
        String a = n(), b = n();
        
        long p1 = 31, p2 = 37;
        long m1 = (long)1e8+7, m2 = (long)1e9+7;
        int mx = 1+Math.max(n, m);
        long[][] pip1 = pow(mx, p1, m1), pip2 = pow(mx, p2, m2);
        long[] a1 = new long[1+n], a2 = new long[1+n], b1 = new long[1+m], b2 = new long[1+m];
        for(int i = 1; i<= n; i++){
            a1[i] = add(a1[i-1], mul(a.charAt(i-1)-'a'+1, pip1[0][i], m1), m1);
            a2[i] = add(a2[i-1], mul(a.charAt(i-1)-'a'+1, pip2[0][i], m2), m2);
        }
        for(int i = 1; i<= m; i++){
            b1[i] = add(b1[i-1], mul(b.charAt(i-1)-'a'+1, pip1[0][i], m1), m1);
            b2[i] = add(b2[i-1], mul(b.charAt(i-1)-'a'+1, pip2[0][i], m2), m2);
        }
        TreeMap<Pair, Integer> map1 = new TreeMap<>(), map2 = new TreeMap<>();
        for(int i = 0; i<= n-k; i++){
            Pair hash = new Pair(hash(a1, pip1, m1, i+1, i+k), hash(a2, pip2, m2, i+1, i+k));
            map1.put(hash, map1.getOrDefault(hash, 0)+1);
        }
        for(int i = 0; i<= m-k; i++){
            Pair hash = new Pair(hash(b1, pip1, m1, i+1, i+k), hash(b2, pip2, m2, i+1, i+k));
            map2.put(hash, map2.getOrDefault(hash, 0)+1);
        }
        int maxCount = 0;
        TreeSet<Pair> set = new TreeSet<>();
        for(Map.Entry<Pair, Integer> e:map1.entrySet()){
            int c = map2.getOrDefault(e.getKey(), 0);
            if(c == 0)continue;
            c += e.getValue();
            if(c > maxCount){
                maxCount = c;
                set = new TreeSet<>();
            }
            if(c == maxCount)set.add(e.getKey());
        }
        
        if(maxCount == 0){
            pn(-1);
            return;
        }
        
        int[] sufArray = suffixArray(a);
        
        for(int i = 0; i< sufArray.length; i++){
            if(sufArray[i]+k > a.length())continue;
            int ind = sufArray[i];
            Pair hash = new Pair(hash(a1, pip1, m1, ind+1, ind+k), hash(a2, pip2, m2, ind+1, ind+k));
            if(set.contains(hash)){
                pn(a.substring(ind, ind+k));
                return;
            }
        }
        hold(false);
    }
    int[] suffixArray(String s){
        int n = s.length();
        Suffix[] su = new Suffix[n];
        for(int i = 0; i< n; i++){
            su[i] = new Suffix(i, s.charAt(i)-'$', 0);
        }
        for(int i = 0; i< n; i++)su[i].next = (i+1 < n?su[i+1].rank:-1);
        Arrays.sort(su);
        int[] ind = new int[n];
        for(int length = 4; length < 2*n; length<<=1){
            int rank = 0, prev = su[0].rank;
            su[0].rank = rank;
            ind[su[0].index] = 0;
            for(int i = 1; i< n; i++){
                if(su[i].rank == prev && su[i].next == su[i-1].next){
                    prev = su[i].rank;
                    su[i].rank = rank;
                }else{
                    prev = su[i].rank;
                    su[i].rank = ++rank;
                }
                ind[su[i].index] = i;
            }
            for(int i = 0; i< n; i++){
                int nextP = su[i].index+length/2;
                su[i].next = nextP<n?su[ind[nextP]].rank:-1;
            }
            Arrays.sort(su);
        }
        int[] suf = new int[n];
        for(int i = 0; i< n; i++)suf[i] = su[i].index;
        return suf;
    }
    class Suffix implements Comparable<Suffix>{
        int index, rank, next;
        public Suffix(int ind, int r, int nr){
            index = ind; rank = r; next = nr;
        }
        public int compareTo(Suffix s){
            if(rank != s.rank)return Integer.compare(rank, s.rank);
            return Integer.compare(next, s.next);
        }
    }
    class Pair implements Comparable<Pair>{
        long h1, h2;
        public Pair(long a, long b){
            h1 = a;h2 = b;
        }
        public int compareTo(Pair p){
            if(h1 != p.h1)return Long.compare(h1, p.h1);
            return Long.compare(h2, p.h2);
        }
    }
    long hash(long[] h, long[][] p, long m, int l, int r){
        return mul(add(h[r], m-h[l-1], m), p[1][l-1], m);
    }
    long[][] pow(int mx, long p, long mod){
        long[] P = new long[mx], IP = new long[mx];
        P[0] = 1;
        for(int i = 1; i< mx; i++)P[i] = (P[i-1]*p)%mod;
        long M = mod; 
        long y = 0, x = 1; 
        long a = P[mx-1];
        while(a> 1){ 
            long q = a/M;
            long t = M; 
            M = a%M;
            a = t;
            t = y;
            y = x-q*y;
            x = t;
        } 
        if(x<0)x+=mod;
        IP[mx-1] = x;
        for(int i = mx-2; i>= 0; i--)IP[i] = (IP[i+1]*p)%mod;
        return new long[][]{P, IP};
    }
    long mul(long a, long b, long m){
        if(a>=m)a%=m;
        if(b>=m)b%=m;
        a*=b;
        if(a>=m)a%=m;
        return a;
    }
    long add(long a, long b, long mod){
        if(Math.abs(a)>=mod)a%=mod;
        if(a<0)a+=mod;
        if(Math.abs(b)>=mod)b%=mod;
        if(b<0)b+=mod;
        a+=b;
        if(Math.abs(a)>=mod)a%=mod;
        return a;
    }
    long pow(long a, long p, long mod){
        long o = 1;
        while(p>0){
            if(p%2==1)o = (a*o)%mod;
            a = (a*a)%mod;
            p>>=1;
        }
        return o;
    }
    //SOLUTION END
    void hold(boolean b)throws Exception{if(!b)throw new Exception("Hold right there, Sparky!");}
    void exit(boolean b){if(!b)System.exit(0);}
    long IINF = (long)1e18, mod = (long)1e9+7;
    final int INF = (int)1e9, MX = (int)2e6+5;
    DecimalFormat df = new DecimalFormat("0.00000000");
    double PI = 3.141592653589793238462643383279502884197169399, eps = 1e-6;
    static boolean multipleTC = true, memory = false, fileIO = false;
    FastReader in;PrintWriter out;
    void run() throws Exception{
        if(fileIO){
            in = new FastReader("input.txt");
            out = new PrintWriter("output.txt");
        }else {
            in = new FastReader();
            out = new PrintWriter(System.out);
        }
        //Solution Credits: Taranpreet Singh
        int T = (multipleTC)?ni():1;
        pre();
        for(int t = 1; t<= T; t++)solve(t);
        out.flush();
        out.close();
    }
    public static void main(String[] args) throws Exception{
        if(memory)new Thread(null, new Runnable() {public void run(){try{new Main().run();}catch(Exception e){e.printStackTrace();}}}, "1", 1 << 28).start();
        else new Main().run();
    }
    
    int digit(long s){int ans = 0;while(s>0){s/=10;ans++;}return ans;}
    long gcd(long a, long b){return (b==0)?a:gcd(b,a%b);}
    int gcd(int a, int b){return (b==0)?a:gcd(b,a%b);}
    int bit(long n){return (n==0)?0:(1+bit(n&(n-1)));}
    void p(Object o){out.print(o);}
    void pn(Object o){out.println(o);}
    void pni(Object o){out.println(o);out.flush();}
    String n()throws Exception{return in.next();}
    String nln()throws Exception{return in.nextLine();}
    int ni()throws Exception{return Integer.parseInt(in.next());}
    long nl()throws Exception{return Long.parseLong(in.next());}
    double nd()throws Exception{return Double.parseDouble(in.next());}
 
    class FastReader{
        BufferedReader br;
        StringTokenizer st;
        public FastReader(){
            br = new BufferedReader(new InputStreamReader(System.in));
        }
 
        public FastReader(String s) throws Exception{
            br = new BufferedReader(new FileReader(s));
        }
 
        String next() throws Exception{
            while (st == null || !st.hasMoreElements()){
                try{
                    st = new StringTokenizer(br.readLine());
                }catch (IOException  e){
                    throw new Exception(e.toString());
                }
            }
            return st.nextToken();
        }
 
        String nextLine() throws Exception{
            String str = "";
            try{   
                str = br.readLine();
            }catch (IOException e){
                throw new Exception(e.toString());
            }  
            return str;
        }
    }
} 

I’d also like to know if there exist other solutions for this problem, I even wanted to know how it’d be implemented with rolling Hashes, since I simply have no Idea about it.

3 Likes

I had used unordered_map without doing hashing of strings and I am getting TLE in Sub-Task 3. So can you tell me can I optimise my code without doing string hashing or the string hashing is only way? But, I think it is running nlogn times only.

I’ve already explained it in the Part 1, why Hashing is important. Not only that, ur solution should also lead to MLE as u have stored Order of N^2 memory in the ordered_map <string,int> part.
For Example, if K equals N/2, U have N/2 strings of N/2 length storing which would require N^{2}/4 memory.

can you please help me with some counter cases for my code? I am trying from yesterday night but didn’t find any now :(.

I have created Suffix Tree for both the strings and then do a dfs to store the count of leaves beneath my current node (so we get the number of occurences) and then I am iterating on the second string (it may be first one doesn’t matter) and try to extend to the current node by some characters to make it point to a node which is at K distance (shift variable in my code). Now, if we can extend the node by required characters we are sure that current suffix is present and I calculate the count (using the value stored during dfs).

After reading the editorial I know it is an overkill to the problem :stuck_out_tongue_winking_eye: but still want to know where I am going wrong :frowning:

Please help!

I think there’s something wrong in your lex_small function, just have a look at if block on line 156 in ur code, ur function is indeed comparing if s[negative value]>s[negative value]. Maybe ur formula is wrong, or the foo function is wrong. Try debugging this part

Hashing was not needed to solve this problem. Here’s how I solved it. Consider the string s = a + ‘$’ + b. Obtain the suffix array for the string. Also obtain the type of the i-th suffix (type[i] = 0 if suf[i] is a suffix of the first string and 1 otherwise). Now, fix a particular index l in the suffix array and obtain the largest r s.t. lcp(suf[l … r]) = lcp(suf[l], suf[r]) >= k. Now, the k length substring starting from position suf[l] in s is a common substring for a and b iff within the range [l, r] there exists some suffix of type 0 and some suffix of type 1. We can simply iterate over all l and obtain the corresponding r. This gives an O(n^2 lg n) algorithm. Also, note that we can binary search for r which makes the complexity O(n (lg n)^2).
The next observation is that as l increases, r may only increase or stay the same. So, r is monotonic w.r.t. l. This can be used to ensure that the total movement of both l and r is O(n) and now, we have an algorithm that works in O(n lg n). My solution: https://www.codechef.com/viewsolution/26679442

Turns out that using prefix hashes, you can also compare substrings lexicographically in O(lg N) time. This blog post describes how to do this: https://codeforces.com/blog/entry/60445

Thanks for your help! If you don’t mind I want to ask you something.
Why did you think that it is comparing (s[neg_value] > s[neg_value])? Because I have seen that you haven’t submitted the code with any additional assert statements. I mean whether you read my code and find the mistake or you find something fishy and make a guess?

Okay, let me explain it a bit, i was checking the bug in ur code, check this test case:-

1
20 20 10
zbbbbbbbbbbbbbbbbbbb
bbbbbbbbbbbbbbbbbbbz

in this case, it’s comparing s[-8] > s[-7]. I didn’t actually dry run ur Code, since it’d hard, but here’s the bug I found.
Talking about Assert, I should have included the assert solution in the editorial, would take care next time, but it’s not that it hasn’t been tested that way for asserts, here is the code for testing, with asserts and it gives a Wrong Answer instead of Runtime Error:

Dummy Code
#include <bits/stdc++.h>
#define ff first
#define ss second
using namespace std;
#define MP make_pair
#define PB push_back
#define ll long long
#define int long long
#define f(i,x,n) for(int i=x;i<n;i++)
#define ld long double
#define mod 1000000007
 
long long readInt(long long l,long long r,char endd){
    long long x=0;
    int cnt=0;
    int fi=-1;
    bool is_neg=false;
    while(true){
        char g=getchar();
        if(g=='-'){
            assert(fi==-1);
            is_neg=true;
            continue;
        }
        if('0'<=g && g<='9'){
            x*=10;
            x+=g-'0';
            if(cnt==0){
                fi=g-'0';
            }
            cnt++;
            assert(fi!=0 || cnt==1);
            assert(fi!=0 || is_neg==false);
           
            assert(!(cnt>19 || ( cnt==19 && fi>1) ));
        } else if(g==endd){
            if(is_neg){
                x= -x;
            }
            assert(l<=x && x<=r);
            return x;
        } else {
            assert(false);
        }
    }
}
string readString(int l,int r,char endd){
    string ret="";
    int cnt=0;
    while(true){
        char g=getchar();
        assert(g!=-1);
        if(g==endd){
            break;
        }
        cnt++;
        ret+=g;
    }
    assert(l<=cnt && cnt<=r);
    return ret;
}
long long readIntSp(long long l,long long r){
    return readInt(l,r,' ');
}
long long readIntLn(long long l,long long r){
    return readInt(l,r,'\n');
}
string readStringLn(int l,int r){
    return readString(l,r,'\n');
}
string readStringSp(int l,int r){
    return readString(l,r,' ');
}
 
int32_t main()
{
	int t;
	t=readIntLn(1,200000);
	while(t--)
	{
		int n,m,k;
		n=readIntSp(1,200000);
		m=readIntSp(1,200000);
		k=readIntLn(1,200000);
		string a,b;
		a=readStringLn(n,n);
		b=readStringLn(m,m);
	}
	assert(getchar()==-1);
	return 0;
} 

Thanks!
But sorry to disturb you again. can you please just once look into the code? Just one WA and some TLE. Don’t know why…

U have Wrong Answer on the case where there is only 1 common substring with frequency of 1 in both strings
TLE on those where test cases are more than 1, or when a strings don’t have equal length.