thuai
8/15/2014 - 2:02 AM

NSData+OADataHelpers.m

// Author: Oleg Andreev <oleganza@gmail.com> 
// May 28, 2011
// Do What The Fuck You Want Public License <http://www.wtfpl.net>

#import "NSData+OADataHelpers.h"

#if !__has_feature(objc_arc)
  #error ARC must be enabled!
#endif

@implementation NSData (OADataHelpers)

- (NSString*) UTF8String
{
  // First we try strict decoding to avoid iconv overhead when not needed (majority of cases).
  NSString* str = [[NSString alloc] initWithData:self encoding:NSUTF8StringEncoding];
  if (!str)
  {
    // Here data contains invalid characters, so we'll try to clean them up.
    return [[NSString alloc] initWithData:[self dataByHealingUTF8Stream] encoding:NSUTF8StringEncoding];
  }
  return str;
}

- (NSData*) dataByHealingUTF8Stream
{
  NSUInteger length = [self length];
  
  if (length == 0) return self;

  // Replaces all broken sequences by � character and returns NSData with valid UTF-8 bytes.
  
#if DEBUG
  int warningsCounter = 10;
#endif
  
  //  bits
  //  7   	U+007F      0xxxxxxx
  //  11   	U+07FF      110xxxxx	10xxxxxx
  //  16  	U+FFFF      1110xxxx	10xxxxxx	10xxxxxx
  //  21  	U+1FFFFF    11110xxx	10xxxxxx	10xxxxxx	10xxxxxx
  //  26  	U+3FFFFFF   111110xx	10xxxxxx	10xxxxxx	10xxxxxx	10xxxxxx
  //  31  	U+7FFFFFFF  1111110x	10xxxxxx	10xxxxxx	10xxxxxx	10xxxxxx	10xxxxxx
  
  #define b00000000 0x00
  #define b10000000 0x80
  #define b11000000 0xc0
  #define b11100000 0xe0
  #define b11110000 0xf0
  #define b11111000 0xf8
  #define b11111100 0xfc
  #define b11111110 0xfe
  
  static NSString* replacementCharacter = @"�";
  NSData* replacementCharacterData = [replacementCharacter dataUsingEncoding:NSUTF8StringEncoding];
  
  NSMutableData* resultData = [NSMutableData dataWithCapacity:[self length]];
  
  const char *bytes = [self bytes];
  
  
  static const NSUInteger bufferMaxSize = 1024;
  char buffer[bufferMaxSize]; // not initialized, but will be filled in completely before copying to resultData
  NSUInteger bufferIndex = 0;
  
  #define FlushBuffer() if (bufferIndex > 0) { \
    [resultData appendBytes:buffer length:bufferIndex]; \
    bufferIndex = 0; \
  }
  #define CheckBuffer() if ((bufferIndex+5) >= bufferMaxSize) { \
    [resultData appendBytes:buffer length:bufferIndex]; \
    bufferIndex = 0; \
  }
  
  NSUInteger byteIndex = 0;
  BOOL invalidByte = NO;
  while (byteIndex < length)
  {
    char byte = bytes[byteIndex];
    
    // ASCII character is always a UTF-8 character
    if ((byte & b10000000) == b00000000) // 0xxxxxxx
    {
      CheckBuffer();
      buffer[bufferIndex++] = byte;
    }
    else if ((byte & b11100000) == b11000000) // 110xxxxx 10xxxxxx
    {
      if (byteIndex+1 >= length) {
        FlushBuffer();
        return resultData;
      }
      char byte2 = bytes[++byteIndex];
      if ((byte2 & b11000000) == b10000000)
      {
        // This 2-byte character still can be invalid. Check if we can create a string with it.
        unsigned char tuple[] = {byte, byte2};
        CFStringRef cfstr = CFStringCreateWithBytes(kCFAllocatorDefault, tuple, 2, kCFStringEncodingUTF8, false);
        if (cfstr)
        {
          CFRelease(cfstr);
          CheckBuffer();
          buffer[bufferIndex++] = byte;
          buffer[bufferIndex++] = byte2;
        }
        else
        {
          invalidByte = YES;
        }
      }
      else
      {
        invalidByte = YES;
      }
    }
    else if ((byte & b11110000) == b11100000) // 1110xxxx 10xxxxxx 10xxxxxx
    {
      if (byteIndex+2 >= length) {
        FlushBuffer();
        return resultData;
      }
      char byte2 = bytes[++byteIndex];
      char byte3 = bytes[++byteIndex];
      if ((byte2 & b11000000) == b10000000 && 
          (byte3 & b11000000) == b10000000)
      {
        // This 3-byte character still can be invalid. Check if we can create a string with it.
        unsigned char tuple[] = {byte, byte2, byte3};
        CFStringRef cfstr = CFStringCreateWithBytes(kCFAllocatorDefault, tuple, 3, kCFStringEncodingUTF8, false);
        if (cfstr)
        {
          CFRelease(cfstr);
          CheckBuffer();
          buffer[bufferIndex++] = byte;
          buffer[bufferIndex++] = byte2;
          buffer[bufferIndex++] = byte3;
        }
        else
        {
          invalidByte = YES;
        }
      }
      else
      {
        invalidByte = YES;
      }
    }
    else if ((byte & b11111000) == b11110000) // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    {
      if (byteIndex+3 >= length) {
        FlushBuffer();
        return resultData;
      }
      char byte2 = bytes[++byteIndex];
      char byte3 = bytes[++byteIndex];
      char byte4 = bytes[++byteIndex];
      if ((byte2 & b11000000) == b10000000 && 
          (byte3 & b11000000) == b10000000 && 
          (byte4 & b11000000) == b10000000)
      {
        // This 4-byte character still can be invalid. Check if we can create a string with it.
        unsigned char tuple[] = {byte, byte2, byte3, byte4};
        CFStringRef cfstr = CFStringCreateWithBytes(kCFAllocatorDefault, tuple, 4, kCFStringEncodingUTF8, false);
        if (cfstr)
        {
          CFRelease(cfstr);
          CheckBuffer();
          buffer[bufferIndex++] = byte;
          buffer[bufferIndex++] = byte2;
          buffer[bufferIndex++] = byte3;
          buffer[bufferIndex++] = byte4;
        }
        else
        {
          invalidByte = YES;
        }
      }
      else
      {
        invalidByte = YES;
      }
    }
    else if ((byte & b11111100) == b11111000) // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    {
      if (byteIndex+4 >= length) {
        FlushBuffer();
        return resultData;
      }
      char byte2 = bytes[++byteIndex];
      char byte3 = bytes[++byteIndex];
      char byte4 = bytes[++byteIndex];
      char byte5 = bytes[++byteIndex];
      if ((byte2 & b11000000) == b10000000 && 
          (byte3 & b11000000) == b10000000 && 
          (byte4 & b11000000) == b10000000 && 
          (byte5 & b11000000) == b10000000)
      {
        // This 5-byte character still can be invalid. Check if we can create a string with it.
        unsigned char tuple[] = {byte, byte2, byte3, byte4, byte5};
        CFStringRef cfstr = CFStringCreateWithBytes(kCFAllocatorDefault, tuple, 5, kCFStringEncodingUTF8, false);
        if (cfstr)
        {
          CFRelease(cfstr);
          CheckBuffer();
          buffer[bufferIndex++] = byte;
          buffer[bufferIndex++] = byte2;
          buffer[bufferIndex++] = byte3;
          buffer[bufferIndex++] = byte4;
          buffer[bufferIndex++] = byte5;
        }
        else
        {
          invalidByte = YES;
        }
      }
      else
      {
        invalidByte = YES;
      }
    }
    else if ((byte & b11111110) == b11111100) // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    {
      if (byteIndex+5 >= length) {
        FlushBuffer();
        return resultData;
      }
      char byte2 = bytes[++byteIndex];
      char byte3 = bytes[++byteIndex];
      char byte4 = bytes[++byteIndex];
      char byte5 = bytes[++byteIndex];
      char byte6 = bytes[++byteIndex];
      if ((byte2 & b11000000) == b10000000 && 
          (byte3 & b11000000) == b10000000 && 
          (byte4 & b11000000) == b10000000 && 
          (byte5 & b11000000) == b10000000 &&
          (byte6 & b11000000) == b10000000)
      {
        // This 6-byte character still can be invalid. Check if we can create a string with it.
        unsigned char tuple[] = {byte, byte2, byte3, byte4, byte5, byte6};
        CFStringRef cfstr = CFStringCreateWithBytes(kCFAllocatorDefault, tuple, 6, kCFStringEncodingUTF8, false);
        if (cfstr)
        {
          CFRelease(cfstr);
          CheckBuffer();
          buffer[bufferIndex++] = byte;
          buffer[bufferIndex++] = byte2;
          buffer[bufferIndex++] = byte3;
          buffer[bufferIndex++] = byte4;
          buffer[bufferIndex++] = byte5;
          buffer[bufferIndex++] = byte6;
        }
        else
        {
          invalidByte = YES;
        }
        
      }
      else
      {
        invalidByte = YES;
      }
    }
    else
    {
      invalidByte = YES;
    }
    
    if (invalidByte)
    {
#if DEBUG
      if (warningsCounter)
      {
        warningsCounter--;
        //NSLog(@"NSData dataByHealingUTF8Stream: broken byte encountered at index %d", byteIndex);
      }
#endif
      invalidByte = NO;
      FlushBuffer();
      [resultData appendData:replacementCharacterData];
    }
    
    byteIndex++;
  }
  FlushBuffer();
  return resultData;
}

@end