UTF-8 decoder class for C++ std::string
// Decodes a UTF-8 encoded byte string into a sequence of Unicode code points.
//
// The whole input is decoded once, in the constructor. Afterwards the object
// exposes the original bytes (str) and the decoded code points (len / code /
// codes).
//
// Malformed input is reported through check(), which is declared outside this
// class; whether it throws, aborts or merely logs is not visible from here.
// NOTE(review): the decoder is written so that it stays well-defined even if
// check() returns - confirm that matches the intended contract of check().
//
// Not validated: overlong encodings, UTF-16 surrogate code points and values
// above U+10FFFF are decoded as-is.
class utf8 {
    std::vector<uint32_t> _codes;  // decoded code points, in input order
    std::string _s;                // copy of the original byte string

    // Returns the total length in bytes (1-4) of the sequence introduced by
    // the lead byte `byte`, or -1 when `byte` cannot start a sequence.
    static int nunits(unsigned char byte) {
        return
            byte < 0x80 ? 1 :
            byte < 0xc0 ? -1 :  // 10xxxxxx is a continuation byte, never a lead
            byte < 0xe0 ? 2 :
            byte < 0xf0 ? 3 :
            byte < 0xf8 ? 4 :
            -1;
    }

    // Decodes exactly one UTF-8 sequence held in `s` and returns its code point.
    //
    // `s` is well formed when its length matches the length announced by its
    // lead byte and every following byte has the form 10xxxxxx. The result of
    // that test is passed to check(); if check() returns on failure, U+FFFD
    // (REPLACEMENT CHARACTER) is returned instead of a garbage value.
    static uint32_t unit(const std::string& s) {
        const int n = static_cast<int>(s.size());

        bool well_formed = n > 0 && n == nunits(static_cast<unsigned char>(s[0]));
        for (int i = 1; well_formed && i < n; ++i) {
            well_formed = (static_cast<unsigned char>(s[i]) & 0xc0) == 0x80;
        }
        check(well_formed, "wrong unit:", s);
        if (!well_formed) return 0xfffd;

        // Payload bits of the lead byte: 7 for ASCII, otherwise whatever is
        // left after the n one-bits and the terminating zero-bit of the prefix.
        const unsigned lead_mask = n == 1 ? 0x7fu : 0xffu >> (n + 1);

        // Bytes are read as unsigned char so a signed `char` cannot
        // sign-extend into the upper bits of the result.
        uint32_t c = static_cast<unsigned char>(s[0]) & lead_mask;
        for (int i = 1; i < n; ++i) {
            c = (c << 6) | (static_cast<unsigned char>(s[i]) & 0x3fu);
        }
        return c;
    }

public:
    // Number of decoded code points.
    int len() const { return static_cast<int>(_codes.size()); }

    // Copy of the original byte string.
    std::string str() const { return _s; }

    // Code point at index `i`; out-of-range indices are handled by
    // std::vector::at, which throws std::out_of_range.
    uint32_t code(int i) const { return _codes.at(i); }

    // Copy of all decoded code points, in input order.
    std::vector<uint32_t> codes() const { return _codes; }

    // Decodes `s`. The constructor is left implicit so existing call sites
    // that rely on conversion from a string keep compiling.
    utf8(std::string s) : _s(s) {
        const std::string::size_type total = s.size();
        for (std::string::size_type i = 0; i < total;) {
            const int n = nunits(static_cast<unsigned char>(s[i]));
            // An invalid lead byte is passed on as a one-byte unit so that
            // unit() reports it, and the index always moves forward. A
            // sequence truncated by the end of the input reaches unit() short
            // (substr clamps the length) and is rejected there.
            const std::string::size_type width =
                n > 0 ? static_cast<std::string::size_type>(n) : 1;
            _codes.push_back(unit(s.substr(i, width)));
            i += width;
        }
    }
};