cxfans
11/22/2019 - 11:27 AM

字符串匹配算法

# 哈希滚动字符串匹配算法理解与模除运算法则

string = "It is a test, but not just a test."
sub_string = "test"
string_len = len(string)
sub_len = len(sub_string)

# 暴力匹配
for i in range(string_len):
    if string[i:i + sub_len] == sub_string:
        # 打印匹配起始索引
        print(f"{sub_string} is found at index {i}.")

# 再原始一点
for i in range(len(string)):
    match = True
    for j in range(sub_len):
        if string[i + j] != sub_string[j]:
            match = False
            break
    if match:
        print(f"{sub_string} is found at index {i}.")


# 哈希滚动 —— Rabin Karp 算法
# 散列空间大小
base = 256
# 模运算除数
modulus = 101

# 用于哈希滚动时每次去除首字符的哈希成分,
# 使其余部分不受影响,避免多次重复计算
h = 1
for i in range(sub_len-1):
    h = (h*base) % modulus

string_hash = 0
sub_hash = 0

for i in range(sub_len):
    # 从字符串首部计算与子字符串等长字符串的哈希值
    string_hash = (base * string_hash + ord(string[i])) % modulus
    # 计算匹配子字符串哈希值
    sub_hash = (base*sub_hash+ord(sub_string[i])) % modulus

for i in range(string_len - sub_len):
    if (string_hash == sub_hash):
        print(f"{sub_string} is found at index {i}.")
    tail_hash = base*(string_hash-ord(string[i])*h)  # 去除首字符的哈希成分,基本原理是模除运算法则
    string_hash = (tail_hash+ord(string[i+sub_len])) % modulus