Reduce (UTF-8) strings to alphanumeric

4/8/2011 - 11:52 AM

Reduce (UTF-8) strings to alphanumeric

urlify.test.php

<?php

include dirname(__FILE__) . '/urlify.php';

// urlify() converts from/to the mbstring internal encoding, and the literals
// in this file are UTF-8.
mb_internal_encoding('UTF-8');

/**
 * Run urlify() over a table of cases and print one result line per case.
 *
 * Each line has the form "input - actual --- OK|FAILED --- hex(actual)" and the
 * whole table is wrapped in a <pre> element.
 *
 * @param array $cases map of input string => expected urlify() result
 * @param boolean $trim forwarded to urlify() as $trim
 * @param array|string $allow forwarded to urlify() as $allow
 * @param array $replace forwarded to urlify() as $replace
 * @return void
 */
function urlify_test_run($cases, $trim = true, $allow = null, $replace = null)
{
    echo '<pre>';
    foreach ($cases as $input => $expected) {
        $actual = urlify($input, $trim, $allow, $replace);
        // strict comparison: both sides are strings, no type juggling wanted
        $status = ($expected === $actual) ? 'OK' : 'FAILED';
        echo $input .' - '. $actual .' --- '. $status .' --- '. bin2hex($actual) ."\n";
    }
    echo '</pre>';
}

// defaults: dashes are collapsed and trimmed
urlify_test_run(array(
    'hällö wörld' => 'haelloe-woerld',
    'hällö wörldß' => 'haelloe-woerldss',
    'hällö wörld ' => 'haelloe-woerld',
    'hällö wörld %' => 'haelloe-woerld',
    'héllò peôplë ÑO?' => 'hello-people-nO',
));

// $trim = false: every replaced character leaves its own dash behind
urlify_test_run(array(
    'hällö wörld' => 'haelloe-woerld',
    'hällö wörldß' => 'haelloe-woerldss',
    'hällö wörld ' => 'haelloe-woerld-',
    'hällö wörld %' => 'haelloe-woerld--',
    'héllò peôplë ÑO?' => 'hello-people-nO-',
), false);

// $allow as array( unicode => true ): ñ (U+00F1) is passed through untouched
urlify_test_run(array(
    'hällö wörld' => 'haelloe-woerld',
    'hällö wörldß' => 'haelloe-woerldss',
    'hällö wörld ' => 'haelloe-woerld',
    'hällö wörld %' => 'haelloe-woerld',
    'héllò peôplë ÑO?' => 'hello-people-nO',
    'héllò peôplë ñO?' => 'hello-people-ñO',
), true, array(0xF1 => true));

// $replace: ñ (U+00F1) is substituted by a custom string
urlify_test_run(array(
    'hällö wörld' => 'haelloe-woerld',
    'hällö wörldß' => 'haelloe-woerldss',
    'hällö wörld ' => 'haelloe-woerld',
    'hällö wörld %' => 'haelloe-woerld',
    'héllò peôplë ÑO?' => 'hello-people-nO',
    'héllò peôplë ñO?' => 'hello-people-XXXO',
), true, null, array(0xF1 => 'XXX'));

// $allow as string: both ñ and ? are passed through untouched
urlify_test_run(array(
    'hällö wörld' => 'haelloe-woerld',
    'hällö wörldß' => 'haelloe-woerldss',
    'hällö wörld ' => 'haelloe-woerld',
    'hällö wörld %' => 'haelloe-woerld',
    'héllò peôplë ÑO?' => 'hello-people-nO?',
    'héllò peôplë ñO?' => 'hello-people-ñO?',
), true, 'ñ?');

urlify.php

<?php

/*
  consider decomposing the characters to "capture" more "obscure" characters such as ṩ
    - http://www.php.net/manual/en/normalizer.normalize.php#92592
*/

/**
 * Normalize a string to only contain ASCII alphanumerics, underscores and dashes.
 *
 * Characters in [a-zA-Z0-9_-] are kept as they are. German umlauts, sharp s and
 * the ae ligature are transliterated to two letters (ä => ae, ß => ss, ...).
 * Other accented Latin letters from the handled ranges are folded to their
 * lowercase base letter (é => e, Ñ => n). Everything else becomes a "-" (dash).
 *
 * The string is processed as a list of Unicode code points, so no operation can
 * ever cut a character in half.
 *
 * @note mb_internal_encoding() must be set to whatever encoding $string had originally;
 *       the result is returned in that same encoding
 * @param string $string String to normalize
 * @param boolean $trim Trim string to not start/end with a dash and not contain dash sequences
 * @param array|string $allow List of Characters that are to be ignored while urlifying array( unicode => true ) or string like "$é@"
 * @param array $replace List of Characters that are to be replaced while urlifying array( unicode => "character" )
 * @return string normalized string
 * @throws InvalidArgumentException if $string is not a string
 * @author Christian Kruse <cjk+os@wwwtech.de>
 * @author Rodney Rehm <rodney.rehm@medialize.de>
 */
function urlify($string, $trim=true, $allow=null, $replace=null)
{
    if (!is_string($string)) {
        throw new InvalidArgumentException('$string must be a string');
    }

    if ($string === '') {
        return '';
    }

    $dash = 0x2D;

    // two-letter transliterations, stored as lists of code points
    $_replace = array(
        0xE4 => array(0x61, 0x65), // ä => ae
        0xC4 => array(0x41, 0x65), // Ä => Ae
        0xF6 => array(0x6F, 0x65), // ö => oe
        0xD6 => array(0x4F, 0x65), // Ö => Oe
        0xFC => array(0x75, 0x65), // ü => ue
        0xDC => array(0x55, 0x65), // Ü => Ue
        0xDF => array(0x73, 0x73), // ß => ss
        0xE6 => array(0x61, 0x65), // æ => ae
        0xC6 => array(0x41, 0x65), // Æ => Ae
    );

    // caller supplied replacements extend / override the defaults
    if ($replace && is_array($replace)) {
        foreach ($replace as $k => $v) {
            $_replace[$k] = array_values(unpack('N*', mb_convert_encoding((string) $v, 'UTF-32BE')));
        }
    }

    // normalize $allow to a lookup table array( unicode => true )
    if (is_string($allow) && $allow !== '') {
        $allow = array_fill_keys(unpack('N*', mb_convert_encoding($allow, 'UTF-32BE')), true);
    } elseif (!is_array($allow)) {
        $allow = array();
    }

    // UTF-32BE is fixed width, so unpack("N*") yields one integer per character
    $unicodes = unpack('N*', mb_convert_encoding($string, 'UTF-32BE'));
    $out = array();

    foreach ($unicodes as $code) {
        if (($code >= 97 && $code <= 122) || ($code >= 65 && $code <= 90) || ($code >= 48 && $code <= 57) || $code == 95 || $code == 45) {
            // skip normalization for alphanumeric characters [a-zA-Z0-9_-]
            $out[] = $code;
        } elseif (isset($allow[$code])) {
            // skip normalization for allowed characters
            $out[] = $code;
        } elseif (isset($_replace[$code])) {
            // replace as defined
            foreach ($_replace[$code] as $r) {
                $out[] = $r;
            }
        } elseif (($code >= 0xC0 && $code <= 0xC6) || ($code >= 0xE0 && $code <= 0xE6) || ($code >= 0x100 && $code <= 0x105)) {
            $out[] = 0x61; // a
        } elseif ($code == 0xC7 || $code == 0xE7 || ($code >= 0x106 && $code <= 0x10D)) {
            $out[] = 0x63; // c
        } elseif ($code == 0xD0 || $code == 0xF0 || ($code >= 0x10E && $code <= 0x111)) {
            $out[] = 0x64; // d (Ð and its lowercase counterpart ð)
        } elseif (($code >= 0xC8 && $code <= 0xCB) || ($code >= 0xE8 && $code <= 0xEB) || ($code >= 0x112 && $code <= 0x11B)) {
            $out[] = 0x65; // e
        } elseif (($code >= 0xCC && $code <= 0xCF) || ($code >= 0xEC && $code <= 0xEF)) {
            $out[] = 0x69; // i
        } elseif ($code == 0xD1 || $code == 0xF1) {
            $out[] = 0x6E; // n
        } elseif (($code >= 0xD2 && $code <= 0xD6) || $code == 0xD8 || ($code >= 0xF2 && $code <= 0xF6) || $code == 0xF8) {
            // 0xD7 (multiplication sign) and 0xF7 (division sign) sit inside
            // these ranges but are not letters, so they are left to the dash
            $out[] = 0x6F; // o
        } elseif (($code >= 0xD9 && $code <= 0xDB) || ($code >= 0xF9 && $code <= 0xFB)) {
            $out[] = 0x75; // u
        } elseif ($code == 0xDD || $code == 0xFD || $code == 0xFF) {
            $out[] = 0x79; // y
        } else {
            $out[] = $dash; // -
        }
    }

    if ($trim) {
        // collapse dash sequences into a single dash
        $collapsed = array();
        $previous = null;
        foreach ($out as $code) {
            if ($code === $dash && $previous === $dash) {
                continue;
            }
            $collapsed[] = $code;
            $previous = $code;
        }

        // after collapsing there is at most one dash at either end
        if ($collapsed && $collapsed[0] === $dash) {
            array_shift($collapsed);
        }
        if ($collapsed && $collapsed[count($collapsed) - 1] === $dash) {
            array_pop($collapsed);
        }

        $out = $collapsed;
    }

    // back to UTF-32BE bytes, then to the caller's encoding
    $res = '';
    foreach ($out as $code) {
        $res .= pack('N', $code);
    }

    return mb_convert_encoding($res, mb_internal_encoding(), 'UTF-32BE');
}

Cacher is the code snippet organizer for pro developers

We empower you and your team to get more done, faster

Reduce (UTF-8) strings to alphanumeric