vgrabovets
9/10/2017 - 6:14 PM

infer_spaces

import numpy as np

# Build a cost dictionary, assuming Zipf'text law and cost = -math.log(probability).
words = open(r'E:\Documents\work\vocab.txt').read().split()
word_cost = dict((k, np.log((i + 1) * np.log(len(words)))) for i, k in enumerate(words))
max_word_length = max(len(word) for word in words)


def infer_spaces(text):
    """Uses dynamic programming to infer the location of spaces in a string
    without spaces."""

    def _best_match(num_chars):
        # Find the best match for the i first characters, assuming cost has
        # been built for the i-1 first characters.
        # Returns a pair (match_cost, match_length).

        candidates = reversed(cost_array[max(0, num_chars - max_word_length):num_chars])

        result = []
        for iter_num, candidate in enumerate(candidates):
            chars_to_check = text[num_chars - iter_num - 1:num_chars]
            temp = (candidate + word_cost.get(chars_to_check, np.inf), iter_num + 1)
            result.append(temp)

        return min(result)

    # Build the cost array.
    cost_array = [0]
    for num_chars in range(1, len(text) + 1):
        cost, match_length = _best_match(num_chars)
        cost_array.append(cost)

    # Backtrack to recover the minimal-cost string.
    result = []
    text_length = len(text)
    while text_length > 0:
        cost, match_length = _best_match(text_length)
        assert cost == cost_array[text_length]
        result.append(text[text_length - match_length:text_length])
        text_length -= match_length

    return ' '.join(reversed(result))