JokerQyou
5/23/2016 - 7:26 AM

Regular expression to split by punctuation

Regular expression to split by punctuation

# coding: utf-8
from __future__ import print_function

import re

# Regular expression matching one or more consecutive separator characters
# (punctuation, symbols and the plain space). "@", letters and digits are
# deliberately left out so that @mentions and words survive a re.split().
# Line breaks (U+000A / U+000D) lie below U+0020 and are therefore NOT
# separators.
pattern = u''.join([
    u'[',
    # Basic Latin without "@", letters and digits.
    # See http://jrgraphix.net/r/Unicode/0020-007F for details.
    u'\u0020-\u002f',  # space ! " # $ % & ' ( ) * + , - . /
    u'\u003a-\u003f',  # : ; < = > ?
    u'\u005b-\u0060',  # [ \ ] ^ _ `
    u'\u007b-\u007f',  # { | } ~ DEL
    u'\u2000-\u206f',  # General Punctuation
    u'\u2e00-\u2e7f',  # Supplemental Punctuation
    u'\u3000-\u303f',  # CJK Symbols and Punctuation
    # Halfwidth and Fullwidth Forms: only the punctuation/symbol sub-ranges.
    # The fullwidth "@" (U+FF20), fullwidth digits and letters, and the
    # halfwidth katakana / hangul letters (U+FF66-U+FFDC) are skipped, in
    # line with the Basic Latin ranges above.
    u'\uff00-\uff0f',  # fullwidth counterparts of U+0020-U+002F
    u'\uff1a-\uff1f',  # fullwidth : ; < = > ?
    u'\uff3b-\uff40',  # fullwidth [ \ ] ^ _ `
    u'\uff5b-\uff65',  # fullwidth { | } ~, white parentheses, halfwidth CJK punctuation
    u'\uffe0-\uffef',  # fullwidth / halfwidth currency signs, arrows and other symbols
    u'\ufff0-\uffff',  # Specials
    u']+',
])

# The above form explains how the pattern works, but you can use this equivalent too
# u'[ -/:-?[-`{-\x7f\u2000-\u206f\u2e00-\u2e7f\u3000-\u303f\uff00-\uff0f\uff1a-\uff1f\uff3b-\uff40\uff5b-\uff65\uffe0-\uffef\ufff0-\uffff]+'

# Sample text: @mentions separated by a mix of punctuation, a plain space,
# a line break followed by a space, and a bare line break.
a = u'''@测试员1,关注一下这个。@测试员2?@测试员3!@测试员4,@tester5:你好 @其他人
 @更多人,换行带了空格
@还有谁?换行不带空格'''

# Print every fragment produced by the split, one per print() call.
# A plain loop is used because print() is called only for its side effect;
# a list comprehension would build a throwaway list of None values.
# The newline character itself is not part of the separator class, so a
# fragment next to a bare line break keeps the embedded newline.
for piece in re.split(pattern, a):
    print(piece)