Extract only the abstract of PUBMED raw data

11/20/2013 - 11:46 PM

Extract only the abstract of PUBMED raw data

__author__ = 'sean'

from bs4 import BeautifulSoup
import os
import cPickle as pickle

# Build a corpus of abstract texts from a directory of raw PubMed pages and
# pickle it as a list. Each input file contributes at most one entry: the
# text of the last element carrying class "abstract_text" in that file.

# Directory that holds the raw pages, one file per paper.
path = '/Users/sean/ml/dataset/pubmed-bioinfo-abstracts/paperAbstracts/'
filenames = os.listdir(path)

txt_corpus = list()
for thefile in filenames:
    print(thefile)
    # Skip the Finder metadata file that macOS drops into directories.
    if thefile == ".DS_Store":
        continue
    # Read the raw bytes and close the file before parsing.
    with open(os.path.join(path, thefile), "rb") as f:
        strings = f.read()
    soup = BeautifulSoup(strings)
    # Reset for every file: otherwise a file without any "abstract_text"
    # element would re-append the previous file's abstract (or raise
    # NameError if it happened to be the first file processed).
    abstract = None
    for hit in soup.findAll(attrs={'class': 'abstract_text'}):
        # The second child node of the matching element is taken as the
        # abstract body -- presumably the first child is whitespace or a
        # heading; verify against the raw page layout.
        abstract = hit.contents[1].text
    if abstract is None:
        # Nothing to extract from this file; report it and move on instead
        # of polluting the corpus with a stale entry.
        print('no abstract found in ' + thefile)
        continue
    txt_corpus.append(abstract)
print('done')
# Persist the list of abstracts in the current working directory.
with open('pubmed_abstract.pkl', 'wb') as dicpkl:
    pickle.dump(txt_corpus, dicpkl)
print('pickle saved')

Cacher is the code snippet organizer for pro developers

We empower you and your team to get more done, faster

Extract only the abstract of PUBMED raw data