akellehe
11/30/2015 - 1:21 AM

Parse article links out of a Wikipedia string

Parse article links out of a Wikipedia string

import mwlib.parser.nodes
import mwlib.uparser 
import codecs


# Script body: read the raw page text from 'us.dat', parse it with mwlib and
# print every ArticleLink node found in the parsed article.

# codecs.open is given encoding='utf-8', so read() returns decoded text.
# Only the read needs the file handle; it is released before parsing starts.
with codecs.open('us.dat', 'rb', encoding='utf-8') as fp:
    text = fp.read()

# Parse the text under the title 'us'.
article = mwlib.uparser.parseString(title='us', raw=text)

# Ask the parsed article for its ArticleLink nodes.
# NOTE(review): presumably find() searches the whole tree, not only direct
# children -- confirm against the mwlib version in use.
nodes = article.find(mwlib.parser.nodes.ArticleLink)
for n in nodes:
    # Single-argument parenthesised form: prints the same on Python 2 and is
    # valid syntax on Python 3 (the bare `print n` statement is not).
    print(n)