# Helpers for checking whether a character or a whole word is CJK.
def is_cjk_char(char):
    """Return True if *char* is a single CJK ideograph.

    The character is tested against the inclusive code-point ranges of the
    CJK ideograph blocks listed below.

    Args:
        char: A one-character text string, or a UTF-8 encoded byte string
            that decodes to exactly one character.

    Returns:
        True if the character's code point falls inside one of the ranges,
        False otherwise.

    Raises:
        ValueError: If the (decoded) input is not exactly one character.
    """
    # Ranges are stored as integers on purpose: a "\u" escape consumes only
    # four hex digits, so a literal such as u'\u20000' is really U+2000
    # followed by '0', which silently breaks string comparisons for the
    # supplementary-plane blocks.
    cjk_ranges = (
        (0x4E00, 0x9FFF),    # CJK Unified Ideographs
        (0x3400, 0x4DBF),    # CJK Unified Ideographs Extension A
        (0x20000, 0x2A6DF),  # CJK Unified Ideographs Extension B
        (0x2A700, 0x2B73F),  # CJK Unified Ideographs Extension C
        (0x2B740, 0x2B81F),  # CJK Unified Ideographs Extension D
        (0x2B820, 0x2CEAF),  # CJK Unified Ideographs Extension E
        (0xF900, 0xFAFF),    # CJK Compatibility Ideographs
        (0x2F800, 0x2FA1F),  # CJK Compatibility Ideographs Supplement
    )

    # Byte strings (Python 2 ``str`` / Python 3 ``bytes``) are treated as
    # UTF-8; text strings are used as they are.
    if isinstance(char, bytes):
        char = char.decode('utf8')

    if len(char) != 1:
        raise ValueError('must be a char')

    code_point = ord(char)
    return any(low <= code_point <= high for low, high in cjk_ranges)
def is_cjk_word(word):
    """Return True if every character of *word* is a CJK character.

    Each character is checked with ``is_cjk_char``; the scan stops at the
    first character that is not CJK.

    Args:
        word: A text string, or a UTF-8 encoded byte string.

    Returns:
        True if all characters pass ``is_cjk_char`` (an empty word yields
        True because there is no failing character), False otherwise.
    """
    # Byte strings (Python 2 ``str`` / Python 3 ``bytes``) are treated as
    # UTF-8; text strings are iterated as they are.
    if isinstance(word, bytes):
        word = word.decode('utf8')

    # all() short-circuits on the first non-CJK character.
    return all(is_cjk_char(char) for char in word)