yehosef
1/31/2012 - 3:00 PM

Levenshtein distance in PHP, with support for multibyte (UTF-8) characters

Levenshtein distance in PHP, with support for multibyte (UTF-8) characters

<?php
function levenshtein_php($str1, $str2){
	$length1 = mb_strlen( $str1, 'UTF-8');
	$length2 = mb_strlen( $str2, 'UTF-8');
	if( $length1 < $length2) return levenshtein_php($str2, $str1);
	if( $length1 == 0 ) return $length2;
	if( $str1 === $str2) return 0;
	$prevRow = range( 0, $length2);
	$currentRow = array();
	for ( $i = 0; $i < $length1; $i++ ) {
		$currentRow=array();
		$currentRow[0] = $i + 1;
		$c1 = mb_substr( $str1, $i, 1, 'UTF-8') ;
		for ( $j = 0; $j < $length2; $j++ ) {
			$c2 = mb_substr( $str2, $j, 1, 'UTF-8' );
			$insertions = $prevRow[$j+1] + 1;
			$deletions = $currentRow[$j] + 1;
			$substitutions = $prevRow[$j] + (($c1 != $c2)?1:0);
			$currentRow[] = min($insertions, $deletions, $substitutions);
		}
		$prevRow = $currentRow;
	}
	return $prevRow[$length2];
}

// Demo: print the edit distance for a few sample pairs, one result per line.
printf( "%d\n", levenshtein_php( 'കട', 'കടല' ) ); // one character appended
printf( "%d\n", levenshtein_php( 'കട', 'കല' ) ); // one character replaced
printf( "%d\n", levenshtein_php( 'കട', 'കടി' ) ); // one vowel sign appended
printf( "%d\n", levenshtein_php( 'abce', 'abcdf' ) ); // plain ASCII input