iyak
2/5/2017 - 9:31 AM

UTF8 decoder class for cpp string

UTF-8 decoder class for C++ `std::string`: it splits the encoded string into its Unicode code points.

// Decodes a UTF-8 encoded string into its sequence of Unicode code points.
//
// The whole input is decoded once, in the constructor; the accessors only read
// the stored results afterwards.
//
// Every byte sequence is validated (lead byte, length, continuation bytes) and
// the verdict is passed to the `check` helper, which is defined outside this
// class, together with the message "wrong unit:" and the offending bytes.
// NOTE(review): `check` presumably aborts or throws on failure -- confirm.
// Should it return instead, the bad sequence is stored as U+FFFD and decoding
// resumes after the bytes that were consumed.
//
// Not rejected: overlong encodings, UTF-16 surrogate values, and values above
// U+10FFFF.
class utf8 {
  std::vector<uint32_t> _codes;  // decoded code points, in input order
  std::string _s;                // the original, still encoded, input

  // Returns how many bytes the sequence introduced by lead byte `byte`
  // occupies (1..4), or -1 when `byte` cannot start a sequence.
  static int nunits(unsigned char byte) {
    if (byte < 0x80) return 1;   // 0xxxxxxx
    if (byte < 0xc0) return -1;  // 10xxxxxx: continuation byte, never a lead
    if (byte < 0xe0) return 2;   // 110xxxxx
    if (byte < 0xf0) return 3;   // 1110xxxx
    if (byte < 0xf8) return 4;   // 11110xxx
    return -1;                   // 11111xxx: not used by UTF-8
  }

  // Decodes `s`, which must hold exactly one complete UTF-8 sequence, and
  // returns its code point. The validation result goes through `check`; if
  // `check` returns on a failed validation, U+FFFD is returned.
  static uint32_t unit(std::string s) {
    const std::size_t n = s.size();

    // The lead byte must announce exactly as many bytes as were supplied ...
    bool ok = n > 0 &&
              nunits(static_cast<unsigned char>(s[0])) == static_cast<int>(n);
    // ... and every following byte must look like 10xxxxxx.
    for (std::size_t i = 1; ok && i < n; ++i) {
      ok = (static_cast<unsigned char>(s[i]) & 0xc0) == 0x80;
    }
    check(ok, "wrong unit:", s);
    if (!ok) return 0xfffd;  // never build a value out of malformed bytes

    // Go through unsigned char so a negative `char` is not sign-extended.
    const uint32_t lead = static_cast<unsigned char>(s[0]);
    if (n == 1) return lead;

    // The lead byte of an n-byte sequence carries its payload in the low
    // 7-n bits; each continuation byte contributes six more bits.
    uint32_t c = lead & (0xffu >> (n + 1));
    for (std::size_t i = 1; i < n; ++i) {
      c = (c << 6) | (static_cast<unsigned char>(s[i]) & 0x3fu);
    }
    return c;
  }

public:
  // Number of decoded code points.
  int len() const { return static_cast<int>(_codes.size()); }

  // Copy of the original encoded string.
  std::string str() const { return _s; }

  // The i-th code point; uses vector::at, so an out-of-range `i` throws
  // std::out_of_range.
  uint32_t code(int i) const { return _codes.at(i); }

  // Copy of all decoded code points, in input order.
  std::vector<uint32_t> codes() const { return _codes; }

  // Stores `s` and decodes it into code points.
  utf8(std::string s) {
    _s.swap(s);  // take over the by-value argument's buffer instead of copying
    for (std::size_t i = 0; i < _s.size();) {
      const int n = nunits(static_cast<unsigned char>(_s[i]));
      // An invalid lead byte is handed to unit() on its own (which reports
      // it), and the index still moves forward so the loop always terminates.
      const std::size_t take = n < 1 ? 1 : static_cast<std::size_t>(n);
      _codes.push_back(unit(_s.substr(i, take)));
      i += take;
    }
  }
};