aurora1625
11/20/2013 - 11:46 PM

Extract only the abstracts from raw PubMed data

Extract only the abstracts from raw PubMed data

__author__ = 'sean'

from bs4 import BeautifulSoup
import os
import cPickle as pickle

# Script: read every file in `path`, pull the text of the element(s) marked
# with class "abstract_text", and pickle the collected texts as a list to
# 'pubmed_abstract.pkl' in the current working directory.

# Directory of raw PubMed files to scan (machine-specific absolute path).
path = '/Users/sean/ml/dataset/pubmed-bioinfo-abstracts/paperAbstracts/'
filenames = os.listdir(path)

# Collected abstract texts: at most one entry per input file.
txt_corpus = list()
for thefile in filenames:
    # Progress output; a single parenthesised expression prints the same
    # under the Python 2 print statement.
    print(thefile)
    # Skip the .DS_Store metadata file that macOS creates in folders.
    if thefile == ".DS_Store":
        continue
    with open(os.path.join(path, thefile), "rb") as f:
        strings = f.read()
    soup = BeautifulSoup(strings)
    # Reset for every file. Without this, a file with no matching element
    # would either raise NameError (first file) or silently re-append the
    # previous file's abstract, duplicating it in the corpus.
    abstract = None
    for hit in soup.findAll(attrs={'class': 'abstract_text'}):
        # NOTE(review): contents[1] is presumably the child node holding the
        # abstract body -- confirm against the actual page layout.
        # If a file has several matches, the last one wins (as before).
        abstract = hit.contents[1].text
    if abstract is None:
        print('no abstract found, skipping: ' + thefile)
        continue
    txt_corpus.append(abstract)
print('done')
with open('pubmed_abstract.pkl', 'wb') as dicpkl:
    pickle.dump(txt_corpus, dicpkl)
print('pickle saved')