reorx
12/1/2012 - 6:47 PM

re_chinese.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re


# zh_CN
def sharp(s):
    """Return the names of all ``#``-prefixed tags found in *s*.

    A tag is a ``#`` immediately followed by one or more ASCII word
    characters (letters, digits, underscore) or characters in the range
    U+2E80-U+9FFF (CJK radicals through CJK unified ideographs). The
    leading ``#`` is not part of the returned names.

    :param s: text to scan; anything that is not already a unicode string
        is decoded as UTF-8 first.
    :returns: list of tag names (unicode strings) in order of appearance,
        empty when *s* contains no tag.
    """
    # ``type(u'')`` is ``unicode`` on Python 2 and ``str`` on Python 3, so the
    # decoding step works on both without naming either builtin.
    text_type = type(u'')
    if not isinstance(s, text_type):
        s = text_type(s, 'utf8')
    # The ASCII class is spelled out instead of \w so the match set is the
    # same on Python 2 (where \w is ASCII-only without re.UNICODE) and on
    # Python 3. The capture group makes findall() return the name without '#'.
    return re.findall(u'#([A-Za-z0-9_\u2E80-\u9FFF]+)', s)


# zh_CN
def frank(s):
    """Split *s* into runs of ASCII word characters and CJK characters.

    A run is one or more characters that are ASCII letters, digits,
    underscore, or fall in the range U+2E80-U+9FFF. Everything else acts
    as a separator and is dropped.

    :param s: text to scan; anything that is not already a unicode string
        is decoded as UTF-8 first.
    :returns: list of runs (unicode strings) in order of appearance, empty
        when nothing matches.
    """
    # ``type(u'')`` is ``unicode`` on Python 2 and ``str`` on Python 3.
    text_type = type(u'')
    if not isinstance(s, text_type):
        s = text_type(s, 'utf8')
    # Explicit ASCII class instead of \w keeps the match set identical on
    # Python 2 (ASCII-only \w without re.UNICODE) and Python 3.
    return re.findall(u'[A-Za-z0-9_\u2E80-\u9FFF]+', s)


def src2name(src):
    """Turn *src* into a name made of its word runs joined by underscores.

    Every maximal run of ASCII letters, digits and underscores in *src* is
    kept; all other characters are treated as separators.

    :param src: string to convert (for example a path or URL).
    :returns: the runs joined with ``'_'``, or ``None`` when *src* contains
        no word characters at all.
    """
    # Explicit ASCII class instead of \w: same meaning as the flag-less \w on
    # Python 2, and keeps the result identical on Python 3.
    parts = re.findall(u'[A-Za-z0-9_]+', src)
    if not parts:
        return None
    return '_'.join(parts)


def plainstr(s):
    """Return the first single-line chunk of *s* that starts with a word char.

    The chunk begins at the first ASCII letter, digit or underscore that is
    followed by at least one non-newline character, and extends up to (not
    including) the next newline. A word character standing alone on its
    line is therefore skipped.

    :param s: text to scan.
    :returns: the matched chunk, or ``None`` when there is no such chunk.
    """
    # search() yields the same first match findall() would, without scanning
    # the rest of the input. Explicit ASCII class instead of \w keeps the
    # behaviour identical on Python 2 and Python 3.
    found = re.search(u'[A-Za-z0-9_][^\n]+', s)
    if found is None:
        return None
    return found.group()


class TextFilter(object):
    """Pull the first value of a given type out of a string.

    ``mode`` selects what :meth:`clean` looks for: ``str`` (the default)
    takes the first run of non-newline characters, ``int`` takes the first
    run of digits and converts it.
    """

    def __init__(self, mode=str):
        # The type both used to pick the pattern and to convert the match.
        self.mode = mode

    def _search(self, p, s):
        """Return ``self.mode`` applied to the first match of pattern *p*
        in *s*, or ``None`` when *p* does not match."""
        found = re.search(p, s)
        if found is None:
            return None
        return self.mode(found.group())

    def clean(self, s):
        """Extract a value of type ``self.mode`` from *s*.

        :returns: for ``str`` mode, the first run of one or more
            non-newline characters; for ``int`` mode, the first run of
            digits as an ``int``; ``None`` when nothing matches or the
            mode is neither ``str`` nor ``int``.
        """
        mode = self.mode
        if mode is str:
            return self._search(r'.+', s)
        if mode is int:
            # Raw string: '\d' in a plain literal is an invalid escape.
            return self._search(r'\d+', s)