freemo
6/10/2015 - 8:17 PM

A utility class that uses various methods to determine the similarity between two strings.

A utility class that uses various methods to determine the similarity between two strings.

import org.apache.commons.lang3.StringUtils;

import java.util.ArrayList;
import java.util.List;

public final class StringSimilarityUtils {
    private StringSimilarityUtils() {
        throw new IllegalStateException("Should not be able to instantiate this class");
    }

    public static int levenshteinDistance(final String original, final String modified) {
        if(original == null)
            throw new IllegalArgumentException("original can not be null");
        if(modified == null)
            throw new IllegalArgumentException("modified can not be null");

        return StringUtils.getLevenshteinDistance(original, modified);
    }

    public static int levenshteinDistance(final String original, final String modified, final int threshold) throws ThresholdExceeded {
        if(original == null)
            throw new IllegalArgumentException("original can not be null");
        if(modified == null)
            throw new IllegalArgumentException("modified can not be null");
        if(threshold < 0)
            throw new IllegalArgumentException("threshold can not be a negative number");

        int retVal = StringUtils.getLevenshteinDistance(original, modified, threshold);
        if(retVal < 0)
            throw new ThresholdExceeded("Distance exceeded threshold limit");
        return retVal;
    }

    public static double levenshteinDistanceRatio(final String original, final String modified) {
        if(original == null)
            throw new IllegalArgumentException("original can not be null");
        if(modified == null)
            throw new IllegalArgumentException("modified can not be null");

        return levenshteinDistance(original, modified) / ((double)original.length());
    }

    public static double levenshteinDistanceRatio(final String original, final String modified, final double threshold) throws ThresholdExceeded {
        if(original == null)
            throw new IllegalArgumentException("original can not be null");
        if(modified == null)
            throw new IllegalArgumentException("modified can not be null");

        final int absoluteThreshold = Double.valueOf(Math.ceil(original.length() * threshold)).intValue();
        final int distance = levenshteinDistance(original, modified, absoluteThreshold);
        return distance / ((double)original.length());
    }

    public static boolean isSimilarByWordGroup(final String original, final String modified, final int wordGroupThreshold) {
        if(original == null)
            throw new IllegalArgumentException("original can not be null");
        if(modified == null)
            throw new IllegalArgumentException("modified can not be null");
        if(wordGroupThreshold <0)
            throw new IllegalArgumentException("wordGroupThreshold can not be a negative value");

        //Since the Levenshtein distance between the two is above the threshold value lets compare word sequences. If a
        //word sequence containing the number of words int he threshold (or more) appear in the original then it will
        //be considered a derivative.
        final String originalStripped = original.replaceAll("[^A-Za-z0-9 ]", "").replaceAll("[ ]+", " ").toLowerCase();
        final String[] modifiedWords = modified.replaceAll("[^A-Za-z0-9 ]", "").replaceAll("[ ]+", " ").toLowerCase().split(" ");
        final List<String> modifiedWordGroups = new ArrayList<>();
        for(int wordIndex = 0; wordIndex < modifiedWords.length - (wordGroupThreshold - 1); wordIndex++) {
            final StringBuilder wordGroup = new StringBuilder(modifiedWords[wordIndex]);
            for(int wordCount = 0; wordCount < (wordGroupThreshold - 1); wordCount++) {
                wordGroup.append(" ").append(modifiedWords[wordIndex+wordCount+1]);
            }
            modifiedWordGroups.add(wordGroup.toString());
        }
        for(String modifiedWordGroup : modifiedWordGroups)
            if(originalStripped.contains(modifiedWordGroup))
                return true;

        return false;
    }

    /**
     * Determines how similar two strings are using the Levenshtein distance between the two and comparing it
     * against a threshold value, as well as checking contigous word blocks.
     */
    public static boolean isSimilarByDistanceAndWordGroup(final String original, final String modified, final double distanceThreshold, final int wordGroupThreshold) {
        if(original == null)
            throw new IllegalArgumentException("original can not be null");
        if(modified == null)
            throw new IllegalArgumentException("modified can not be null");
        if(wordGroupThreshold <0)
            throw new IllegalArgumentException("wordGroupThreshold can not be a negative value");
        if((distanceThreshold < 0d) || (distanceThreshold > 1d) )
            throw new IllegalArgumentException("distanceThreshold must be a value between 0.0 and 1.0 inclusive");

        //First lets use Levenshtein distance to calculate the distance (similarlity) between the two strings. A 0
        //inidcates they are identical, a number greater than 0 inidicates the number of edits needed to convert one
        //string into the other. This is then converted into a percentage of the total number of characters in the
        //original string. If this number is less thant he threshold then we can simply assume it is a derivative and
        //return. Otherwise we need to resoort to a more rigerous comparison.

        try {
            levenshteinDistanceRatio(original, modified, distanceThreshold);
        } catch (ThresholdExceeded thresholdExceeded) {
            return isSimilarByWordGroup(original, modified, wordGroupThreshold);
        }
        return true;
    }
}
public class ThresholdExceeded extends Exception {
    public ThresholdExceeded() {
    }

    public ThresholdExceeded(String s) {
        super(s);
    }

    public ThresholdExceeded(String s, Throwable throwable) {
        super(s, throwable);
    }

    public ThresholdExceeded(Throwable throwable) {
        super(throwable);
    }

    public ThresholdExceeded(String s, Throwable throwable, boolean b, boolean b1) {
        super(s, throwable, b, b1);
    }
}