technocrat
10/9/2015 - 1:45 AM

Preparing text to find a specific pattern of words, then collecting the matches that contain a specific keyword in a list

Prepares text to find a specific pattern of words (a part-of-speech tag sequence) and then collects the matches that contain a specific keyword into a list.

import nltk
from nltk.chunk import *
from nltk.chunk.util import *
from nltk.chunk.regexp import *
from nltk import Tree

# Scan the tagged Brown corpus for four-token chunks whose tags are, in order,
# NN, VB, IN, NN, and collect the text of every such chunk that mentions the
# keyword 'sciatica'.
# NOTE(review): in the Brown tagset these presumably stand for singular noun,
# base-form verb, preposition and singular noun -- confirm against the tagset.
cp = nltk.RegexpParser('CHUNK: {<NN> <VB> <IN> <NN>}')

# Matching chunk phrases, each stored as its words joined by single spaces.
bucket = []

brown = nltk.corpus.brown
for sent in brown.tagged_sents():
    tree = cp.parse(sent)
    for subtree in tree.subtrees():
        # subtrees() also yields the sentence root and anything that is not a
        # CHUNK; only the chunks produced by the grammar are of interest.
        if subtree.label() != 'CHUNK':
            continue
        # Each child of a CHUNK is indexed as leaf[0] for its word (assumed to
        # be a (word, tag) pair with a plain-string word). Build the phrase
        # once and reuse it for both the keyword test and the stored value.
        phrase = ' '.join(leaf[0] for leaf in subtree)
        # Substring test: the keyword also matches inside a longer word.
        if 'sciatica' in phrase:
            bucket.append(phrase)