fredyounan
8/26/2015 - 11:24 AM

Fixing Broken UTF-8

Fixing Broken UTF-8

<?php

function fixBrokenUTF8($contents) {
    static $replacement = chr(0xEF) . chr(0xBF) . chr(0xBD);

    $expectingNewChar = true;
    $expectedLength = 0;
    $expectedIndex = 0;
    $i = 0;
    $out = "";

    while ($i < strlen($contents)) {
        $byte = ord($contents[$i]);
        // Start of a new character?
        if ($expectingNewChar) {
            // 1-byte ASCII chars, append to output
            if ($byte < 0x80) {
                $out .= $contents[$i];
            }
            // First byte >= 0xF0 means a 4-byte char
            elseif ($byte >= 0xF0) {
                $expectingNewChar = false;
                $expectedLength = 4;
                $expectedIndex = 2;
            }
            // First byte >= 0xE0 means a 3-byte char
            elseif ($byte >= 0xE0) {
                $expectingNewChar = false;
                $expectedLength = 3;
                $expectedIndex = 2;
            }
            // First byte >= 0xC0 means a 2-byte char
            elseif ($byte >= 0xC0) {
                $expectingNewChar = false;
                $expectedLength = 2;
                $expectedIndex = 2;
            }
        }
        // Not the start of a new char, but expecting
        // one of the bytes in a multibyte char
        else {
            // Subsequent bytes in a multibyte char have
            // to be between 0x80 and 0xC0. If it's not
            // then something is corrupt, so add the replacement
            // char to the output and look for a new char
            if (! (($byte >= 0x80) && ($byte < 0xC0))) {
                $out .= $replacement;
                $expectingNewChar = true;
                // This prevents $i++ at the end of the loop,
                // since this "corrupt" byte could be the start
                // of a new character
                continue;
            } else {
                // A valid byte, but we haven't gotten as many
                // bytes for this char yet as we need
                if ($expectedIndex < $expectedLength) {
                    $expectedIndex++;
                }
                // Gotten all the bytes for the valid multibyte char,
                // put them in the output and expect a new character
                // next.
                else {
                    $out .= substr($contents, $i - $expectedLength + 1,
                                   $expectedLength);
                    $expectingNewChar = true;
                }
            }
        }
        // Advance to the next byte
        $i++;
    }
    return $out;
}