yuliji
7/19/2017 - 11:05 AM

is cjk word

is cjk word

def is_cjk_char(char):
    """Return True if *char* is a single CJK ideograph.

    A character counts as CJK when its code point falls inside one of the
    inclusive ranges listed in ``cjk_ranges`` below.

    Args:
        char: A one-character text string, or a byte string holding the
            UTF-8 encoding of exactly one character.

    Returns:
        True if the character lies in one of the CJK ranges, else False.

    Raises:
        ValueError: If the (decoded) input is not exactly one character
            long. The message is 'must be a char'.
        UnicodeDecodeError: If a byte string is not valid UTF-8.
    """
    # Ranges are stored as integer code points. Writing them as string
    # escapes such as u'\u20000' is a trap: ``\u`` consumes exactly four hex
    # digits, so that literal is the two-character string U+2000 followed
    # by '0', which breaks every supplementary-plane range.
    cjk_ranges = (
        (0x4E00, 0x9FFF),    # CJK Unified Ideographs
        (0x3400, 0x4DBF),    # CJK Unified Ideographs Extension A
        (0x20000, 0x2A6DF),  # CJK Unified Ideographs Extension B
        (0x2A700, 0x2B73F),  # CJK Unified Ideographs Extension C
        (0x2B740, 0x2B81F),  # CJK Unified Ideographs Extension D
        (0x2B820, 0x2CEAF),  # CJK Unified Ideographs Extension E
        (0xF900, 0xFAFF),    # CJK Compatibility Ideographs
        (0x2F800, 0x2FA1F),  # CJK Compatibility Ideographs Supplement
    )

    # ``bytes`` is an alias of ``str`` on Python 2.6+, so this one check
    # covers Python 2 byte strings and Python 3 ``bytes`` alike.
    if isinstance(char, bytes):
        char = char.decode('utf8')

    if len(char) != 1:
        raise ValueError('must be a char')

    code_point = ord(char)
    return any(low <= code_point <= high for low, high in cjk_ranges)
        

def is_cjk_word(word):
    """Return True if every character of *word* is a CJK character.

    Each character is checked with ``is_cjk_char``. An empty word yields
    True, because no character fails the check.

    Args:
        word: A text string, or a byte string holding UTF-8 encoded text.

    Returns:
        True if all characters pass ``is_cjk_char``, else False.

    Raises:
        UnicodeDecodeError: If a byte string is not valid UTF-8.
    """
    # ``bytes`` is an alias of ``str`` on Python 2.6+, so this one check
    # covers Python 2 byte strings and Python 3 ``bytes`` alike, without
    # relying on the Python 2-only ``unicode`` builtin.
    if isinstance(word, bytes):
        word = word.decode('utf8')

    # all() stops at the first non-CJK character, like the explicit loop
    # with ``break`` it replaces.
    return all(is_cjk_char(char) for char in word)