/*
 * ryochack
 * 8/30/2016 - 11:15 AM
 *
 * is_utf8.c
 */

#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdlib.h>

/**
  https://tools.ietf.org/html/rfc3629
  -----------------------------------------------------------------------------
  |  Code Points        | First Byte | Second Byte | Third Byte | Fourth Byte |
  |  U+0000..U+007F     |     00..7F |             |            |             |
  |  U+0080..U+07FF     |     C2..DF |      80..BF |            |             |
  |  U+0800..U+0FFF     |         E0 |      A0..BF |     80..BF |             |
  |  U+1000..U+CFFF     |     E1..EC |      80..BF |     80..BF |             |
  |  U+D000..U+D7FF     |         ED |      80..9F |     80..BF |             |
  |  U+E000..U+FFFF     |     EE..EF |      80..BF |     80..BF |             |
  |  U+10000..U+3FFFF   |         F0 |      90..BF |     80..BF |      80..BF |
  |  U+40000..U+FFFFF   |     F1..F3 |      80..BF |     80..BF |      80..BF |
  |  U+100000..U+10FFFF |         F4 |      80..8F |     80..BF |      80..BF |
  -----------------------------------------------------------------------------
**/

/**
 * Test whether the byte at str is a one-byte UTF-8 sequence.
 *
 * Reads exactly one byte from str.
 *
 * @param str  pointer to at least one readable byte
 * @return true when the byte is in 00..7F (NUL included), false otherwise
 */
bool is_utf8_1(const char *str)
{
	const uint8_t lead = *(const uint8_t *)str;

	/* U+0000..U+007F | 00..7F: exactly the bytes whose top bit is clear. */
	return (lead & 0x80) == 0;
}


/**
 * Test whether str starts with a well-formed two-byte UTF-8 sequence.
 *
 * The second byte is read only after the first byte has been accepted.
 *
 * @param str  pointer to the candidate sequence
 * @return true for C2..DF followed by 80..BF, false otherwise
 */
bool is_utf8_2(const char *str)
{
	const uint8_t *bytes = (const uint8_t *)str;
	const uint8_t lead = bytes[0];

	/* U+0080..U+07FF | C2..DF | 80..BF */
	if (lead < 0xC2 || lead > 0xDF) {
		return false;
	}

	const uint8_t second = bytes[1];
	return second >= 0x80 && second <= 0xBF;
}


/**
 * Test whether str starts with a well-formed three-byte UTF-8 sequence.
 *
 * The permitted range of the second byte depends on the lead byte:
 *
 *   U+0800..U+0FFF |     E0 | A0..BF | 80..BF
 *   U+1000..U+CFFF | E1..EC | 80..BF | 80..BF
 *   U+D000..U+D7FF |     ED | 80..9F | 80..BF
 *   U+E000..U+FFFF | EE..EF | 80..BF | 80..BF
 *
 * Each following byte is read only after the previous one has been accepted.
 *
 * @param str  pointer to the candidate sequence
 * @return true when the three bytes match one of the rows above
 */
bool is_utf8_3(const char *str)
{
	const uint8_t *bytes = (const uint8_t *)str;
	const uint8_t lead = bytes[0];
	uint8_t second_min = 0x80;
	uint8_t second_max = 0xBF;

	if (lead == 0xE0) {
		second_min = 0xA0;
	} else if (lead == 0xED) {
		second_max = 0x9F;
	} else if (lead < 0xE1 || lead > 0xEF) {
		/* Not a three-byte lead byte at all. */
		return false;
	}

	if (bytes[1] < second_min || bytes[1] > second_max) {
		return false;
	}

	return bytes[2] >= 0x80 && bytes[2] <= 0xBF;
}


/**
 * Test whether str starts with a well-formed four-byte UTF-8 sequence.
 *
 * The permitted range of the second byte depends on the lead byte:
 *
 *   U+10000..U+3FFFF   |     F0 | 90..BF | 80..BF | 80..BF
 *   U+40000..U+FFFFF   | F1..F3 | 80..BF | 80..BF | 80..BF
 *   U+100000..U+10FFFF |     F4 | 80..8F | 80..BF | 80..BF
 *
 * Lead bytes F5..FF never start a valid sequence and are rejected.
 * Each following byte is read only after the previous one has been accepted.
 *
 * @param str  pointer to the candidate sequence
 * @return true when the four bytes match one of the rows above
 */
bool is_utf8_4(const char *str)
{
	const uint8_t *p = (const uint8_t *)str;
	const uint8_t lead = p[0];
	uint8_t second_min = 0x80;
	uint8_t second_max = 0xBF;

	if (lead == 0xF0) {
		second_min = 0x90;
	} else if (lead == 0xF4) {
		/* Must be exactly F4: anything above would encode past U+10FFFF. */
		second_max = 0x8F;
	} else if (lead < 0xF1 || lead > 0xF3) {
		return false;
	}

	if (p[1] < second_min || p[1] > second_max) {
		return false;
	}
	if (p[2] < 0x80 || p[2] > 0xBF) {
		return false;
	}
	return p[3] >= 0x80 && p[3] <= 0xBF;
}


/**
 * Check whether a byte buffer holds well-formed UTF-8.
 *
 * The buffer is scanned one encoded character at a time. Scanning stops
 * successfully at the first NUL byte, so anything after an embedded NUL is
 * not examined. A multi-byte sequence is only tried when enough bytes remain
 * in the buffer, so no byte at or beyond str[length] is ever read.
 *
 * @param str     buffer to validate; need not be NUL-terminated
 * @param length  number of readable bytes in str
 * @return true when every character up to the first NUL (or up to length)
 *         is well-formed, false on the first malformed or truncated sequence
 */
bool is_utf8(const char *str, uint32_t length)
{
	/* Unsigned offset: stays exact for the whole uint32_t range. */
	uint32_t pos = 0;

	while (pos < length) {
		const char *p = str + pos;
		const uint32_t remain = length - pos;

		if (*p == '\0') {
			return true;
		}

		if (is_utf8_1(p)) {
			pos += 1;
		} else if ((remain >= 2) && is_utf8_2(p)) {
			pos += 2;
		} else if ((remain >= 3) && is_utf8_3(p)) {
			pos += 3;
		} else if ((remain >= 4) && is_utf8_4(p)) {
			pos += 4;
		} else {
			return false;
		}
	}

	return true;
}


/**
 * Return the size in bytes of the file at path.
 *
 * @param path  path handed to stat()
 * @return the file size, or -1 when stat() fails or the size cannot be
 *         represented in an int; a diagnostic is written to stderr in
 *         both failure cases
 */
int get_fsize(const char *path) {
	struct stat sb;

	if (stat(path, &sb) != 0) {
		/* perror appends the errno text, so the cause is visible. */
		perror(path);
		return -1;
	}

	/* st_size is an off_t; refuse sizes the int return type would mangle. */
	if (sb.st_size < 0 || sb.st_size > INT_MAX) {
		fprintf(stderr, "%s: file size does not fit in int\n", path);
		return -1;
	}

	return (int)sb.st_size;
}

/**
 * Read the file named by argv[1] into memory and report whether its
 * contents pass is_utf8().
 *
 * Prints the number of bytes read, the allocation size and the is_utf8()
 * result to stdout. Diagnostics go to stderr.
 *
 * @return 0 when the file was read and checked, 1 on a usage, open, size,
 *         allocation or read error
 */
int main(int argc, char **argv)
{
	if (argc < 2) {
		fprintf(stderr, "require test data's path\n");
		return 1;
	}

	/* Binary mode: the validator must see the bytes exactly as stored. */
	FILE *fp = fopen(argv[1], "rb");
	if (fp == NULL) {
		fprintf(stderr, "fp = NULL\n");
		return 1;
	}

	int rc = 1;
	char *mem = NULL;
	size_t rdsize = 0;

	int allocsize = get_fsize(argv[1]);
	if (allocsize < 0) {
		/* get_fsize already reported the reason. */
		goto out;
	}

	/* malloc(0) may legally return NULL; keep empty files off that path. */
	mem = malloc(allocsize > 0 ? (size_t)allocsize : 1);
	if (mem == NULL) {
		fprintf(stderr, "malloc return NULL\n");
		goto out;
	}

	rdsize = fread(mem, 1, (size_t)allocsize, fp);
	if (ferror(fp)) {
		fprintf(stderr, "fread error\n");
		goto out;
	}

	printf("size read=%zu (alloc=%d)\n", rdsize, allocsize);
	printf("is_utf8=%d\n", is_utf8(mem, (uint32_t)rdsize));
	rc = 0;

out:
	free(mem);
	fclose(fp);
	return rc;
}