#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>

#include <errno.h>
#include <limits.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
/**
https://tools.ietf.org/html/rfc3629
-----------------------------------------------------------------------------
| Code Points | First Byte | Second Byte | Third Byte | Fourth Byte |
| U+0000..U+007F | 00..7F | | | |
| U+0080..U+07FF | C2..DF | 80..BF | | |
| U+0800..U+0FFF | E0 | A0..BF | 80..BF | |
| U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | |
| U+D000..U+D7FF | ED | 80..9F | 80..BF | |
| U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | |
| U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF |
| U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF |
| U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF |
-----------------------------------------------------------------------------
**/
/*
 * Reports whether the byte at str is a one-byte UTF-8 sequence.
 *
 *   U+0000..U+007F | 00..7F
 *
 * Only str[0] is read. Returns true when it lies in 00..7F, false otherwise.
 */
bool is_utf8_1(const char *str)
{
    const uint8_t *bytes = (const uint8_t *)str;

    /* An unsigned byte is never below 0x00, so only the upper bound matters. */
    return bytes[0] <= 0x7F;
}
/*
 * Reports whether str begins with a two-byte UTF-8 sequence.
 *
 *   U+0080..U+07FF | C2..DF | 80..BF
 *
 * str[1] is read only when str[0] is an acceptable lead byte.
 * Returns true when both bytes are in range, false otherwise.
 */
bool is_utf8_2(const char *str)
{
    const uint8_t *bytes = (const uint8_t *)str;

    /* Lead byte outside C2..DF: not a two-byte sequence. */
    if (bytes[0] < 0xC2 || bytes[0] > 0xDF) {
        return false;
    }

    /* Continuation byte must be 80..BF. */
    return bytes[1] >= 0x80 && bytes[1] <= 0xBF;
}
/*
 * Reports whether str begins with a three-byte UTF-8 sequence.
 *
 *   U+0800..U+0FFF | E0     | A0..BF | 80..BF
 *   U+1000..U+CFFF | E1..EC | 80..BF | 80..BF
 *   U+D000..U+D7FF | ED     | 80..9F | 80..BF
 *   U+E000..U+FFFF | EE..EF | 80..BF | 80..BF
 *
 * Each following byte is read only after the bytes before it were accepted.
 * Returns true when all three bytes are in range, false otherwise.
 */
bool is_utf8_3(const char *str)
{
    const uint8_t *bytes = (const uint8_t *)str;
    const uint8_t lead = bytes[0];

    /* Every three-byte lead lies in E0..EF. */
    if (lead < 0xE0 || lead > 0xEF) {
        return false;
    }

    /* The lead byte selects the permitted range of the second byte. */
    uint8_t second_lo = 0x80;
    uint8_t second_hi = 0xBF;
    if (lead == 0xE0) {
        second_lo = 0xA0;
    } else if (lead == 0xED) {
        second_hi = 0x9F;
    }

    if (bytes[1] < second_lo || bytes[1] > second_hi) {
        return false;
    }

    /* The third byte is always a plain continuation byte. */
    return bytes[2] >= 0x80 && bytes[2] <= 0xBF;
}
/*
 * Reports whether str begins with a four-byte UTF-8 sequence.
 *
 *   U+10000..U+3FFFF   | F0     | 90..BF | 80..BF | 80..BF
 *   U+40000..U+FFFFF   | F1..F3 | 80..BF | 80..BF | 80..BF
 *   U+100000..U+10FFFF | F4     | 80..8F | 80..BF | 80..BF
 *
 * Lead bytes F5..FF are rejected: they would encode code points above
 * U+10FFFF and never appear in well-formed UTF-8.
 *
 * Each following byte is read only after the bytes before it were accepted.
 * Returns true when all four bytes are in range, false otherwise.
 */
bool is_utf8_4(const char *str)
{
    const uint8_t *p = (const uint8_t *)str;

    /* The lead byte selects the permitted range of the second byte. */
    uint8_t second_min = 0x80;
    uint8_t second_max = 0xBF;

    if (p[0] == 0xF0) {
        second_min = 0x90;
    } else if (p[0] == 0xF4) {
        /* Exactly F4, not F4 and above. */
        second_max = 0x8F;
    } else if (p[0] < 0xF1 || p[0] > 0xF3) {
        return false;
    }

    if (p[1] < second_min || p[1] > second_max) {
        return false;
    }
    if (p[2] < 0x80 || p[2] > 0xBF) {
        return false;
    }
    return p[3] >= 0x80 && p[3] <= 0xBF;
}
/*
 * Reports whether the first `length` bytes of str form valid UTF-8.
 *
 * The buffer is scanned one encoded character at a time, trying the one-,
 * two-, three- and four-byte forms in turn. A multi-byte form is attempted
 * only when that many bytes remain, so a sequence truncated by the end of the
 * buffer is rejected without reading past it.
 *
 * A NUL byte ends the scan early: everything before it has been accepted, so
 * true is returned and the bytes after the NUL are not examined.
 *
 * Returns true if the scan reaches the end of the buffer (or a NUL) without
 * meeting a malformed sequence, false otherwise. An empty buffer is valid.
 */
bool is_utf8(const char *str, uint32_t length)
{
    uint32_t pos = 0;

    while (pos < length) {
        const char *p = str + pos;
        /*
         * Kept unsigned and as wide as `length`: a signed 32-bit count would
         * be out of range for buffers larger than INT32_MAX bytes.
         */
        const uint32_t remain = length - pos;

        if (*p == '\0') {
            return true;
        }

        if (is_utf8_1(p)) {
            pos += 1;
        } else if ((remain >= 2) && is_utf8_2(p)) {
            pos += 2;
        } else if ((remain >= 3) && is_utf8_3(p)) {
            pos += 3;
        } else if ((remain >= 4) && is_utf8_4(p)) {
            pos += 4;
        } else {
            return false;
        }
    }
    return true;
}
/*
 * Returns the size in bytes of the file at `path`, as reported by stat().
 *
 * Returns -1, after printing a message to stderr, when stat() fails or when
 * the reported size cannot be represented in an int.
 */
int get_fsize(const char *path) {
    struct stat sb;

    if (stat(path, &sb) != 0) {
        fprintf(stderr, "stat error!\n");
        return -1;
    }

    /* st_size is an off_t and may exceed INT_MAX; refuse rather than truncate. */
    if (sb.st_size < 0 || sb.st_size > INT_MAX) {
        fprintf(stderr, "file size out of range\n");
        return -1;
    }

    return (int)sb.st_size;
}
/*
 * Reads the file named by argv[1] into memory and prints how many bytes were
 * read and the result of is_utf8() on them.
 *
 * Returns 0 after printing the result, 1 on a missing argument or on an open,
 * size, allocation or read failure. The stream and the buffer are released on
 * every path.
 */
int main(int argc, char **argv)
{
    if (argc < 2) {
        fprintf(stderr, "require test data's path\n");
        return 1;
    }

    /* Binary mode: the raw bytes are validated, so no text translation. */
    FILE *fp = fopen(argv[1], "rb");
    if (fp == NULL) {
        fprintf(stderr, "fp = NULL\n");
        return 1;
    }

    int status = 1;
    char *mem = NULL;
    size_t rdsize = 0;

    int allocsize = get_fsize(argv[1]);
    if (allocsize < 0) {
        /* get_fsize() has already written a message to stderr. */
        goto out;
    }

    /* malloc(0) may return NULL; always request at least one byte. */
    mem = malloc(allocsize > 0 ? (size_t)allocsize : 1);
    if (mem == NULL) {
        fprintf(stderr, "malloc return NULL\n");
        goto out;
    }

    rdsize = fread(mem, 1, (size_t)allocsize, fp);
    if (ferror(fp)) {
        fprintf(stderr, "fread error\n");
        goto out;
    }

    /* rdsize never exceeds allocsize, so both narrowing casts are safe. */
    printf("size read=%d (alloc=%d)\n", (int)rdsize, allocsize);
    printf("is_utf8=%d\n", is_utf8(mem, (uint32_t)rdsize));
    status = 0;

out:
    free(mem);
    fclose(fp);
    return status;
}