import numpy as np
# Build a cost dictionary, assuming Zipf'text law and cost = -math.log(probability).
words = open(r'E:\Documents\work\vocab.txt').read().split()
word_cost = dict((k, np.log((i + 1) * np.log(len(words)))) for i, k in enumerate(words))
max_word_length = max(len(word) for word in words)
def infer_spaces(text):
"""Uses dynamic programming to infer the location of spaces in a string
without spaces."""
def _best_match(num_chars):
# Find the best match for the i first characters, assuming cost has
# been built for the i-1 first characters.
# Returns a pair (match_cost, match_length).
candidates = reversed(cost_array[max(0, num_chars - max_word_length):num_chars])
result = []
for iter_num, candidate in enumerate(candidates):
chars_to_check = text[num_chars - iter_num - 1:num_chars]
temp = (candidate + word_cost.get(chars_to_check, np.inf), iter_num + 1)
result.append(temp)
return min(result)
# Build the cost array.
cost_array = [0]
for num_chars in range(1, len(text) + 1):
cost, match_length = _best_match(num_chars)
cost_array.append(cost)
# Backtrack to recover the minimal-cost string.
result = []
text_length = len(text)
while text_length > 0:
cost, match_length = _best_match(text_length)
assert cost == cost_array[text_length]
result.append(text[text_length - match_length:text_length])
text_length -= match_length
return ' '.join(reversed(result))