sundeepblue
4/15/2014 - 2:38 PM

Remove all duplicate words from a string of words. Not just the duplicate but all the instances of the duplicates need to be removed.

Remove all duplicate words from a string of words. Not just the duplicate but all the instances of the duplicates need to be removed.

#include <string>
#include <unordered_map>
#include <iostream>
using namespace std;

/* 
Remove all duplicate words from a string of words. Not just the duplicate but all the instances of the duplicates need to be removed.
idea: put every word into hashtable, and count its frequency. Then, scan sentence again, if hash count > 1, ignore this word
*/

bool is_letter(char c) {
    return c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z';
}

string remove_all_duplicated_words(string &s) {
    if(s.empty()) return "";
    unordered_map<string, int> hash;
    
    int i = 0, N = s.size();
    
    while(i < N) {
        while(i < N && is_letter(s[i]) == false) i++;
        if(i == N) break;
        
        int word_begin = i, word_end = i;
        while(word_end < N && is_letter(s[word_end])) word_end++;
        string word = s.substr(word_begin, word_end - word_begin);
        
        hash[word]++;
        i = word_end;
    }
    if(hash.empty()) return s;
    //for(auto s : hash) cout << s.first << endl;
    
    i = 0;
    int tail = 0;
    while(i < N) {
        while(i < N && is_letter(s[i]) == false) {
            //i++;
            //tail++;
            s[tail++] = s[i++];
        }
        int word_begin = i, word_end = i;
        while(word_end < N && is_letter(s[word_end])) word_end++;
        string word = s.substr(word_begin, word_end - word_begin);
        
        if(hash[word] == 1) {
            int j = 0;
            while(j < word.size()) s[tail++] = word[j++];
        } 
        i = word_end;
    }
    return s.substr(0, tail);
}

int main() {
    string s = "a b ab ab b";
    cout << "[" << remove_all_duplicated_words(s) << "]";
}