alexanderholt
12/18/2017 - 3:48 AM

Wikipedia section loop

For when pages have different names for the same type of section.

import wikipedia
import numpy as np

# You'll need the exact titles of the Wikipedia pages beforehand.
example_titles = [
    'Algol (film)',
    'Dr. Jekyll and Mr. Hyde (1920 Haydon film)',
    'Figures of the Night',
    'The Invisible Ray (1920 serial)',
    'The Man from Beyond',
    'Black Oxen',
    'Aelita',
    'The Hands of Orlac (1924 film)',
]

# Every name you think/know the section might be called.
possibles = [
    'Plot', 'Synopsis', 'Plot synopsis', 'Plot summary',
    'Story', 'Plotline', 'The Beginning', 'Summary',
    'Content', 'Premise',
]
# Sometimes those names have 'Edit' latched onto the end due to user error
# on Wikipedia (e.g. 'PlotEdit'), so it's easiest to build a second list
# that accounts for that.
possibles_edit = [name + 'Edit' for name in possibles]
# Plain names first, then the 'Edit' variants.
all_possibles = possibles + possibles_edit

# Now for the actual fetching!
# `plots` maps each page title to its cleaned-up section text, or to np.nan
# when the page could not be loaded or no candidate section name was found.
plots = {}
for title in example_titles:
    # Start every title as "missing" so a failed lookup can never leave the
    # previous title's text behind.
    plot = np.nan

    # Load the page once and keep it in a variable; otherwise the page would
    # be requested again for every lookup.
    # Always guard calls to the API, in case it gets confused by the title.
    # `except Exception` keeps this best-effort without swallowing
    # KeyboardInterrupt / SystemExit the way a bare `except:` would.
    try:
        wik = wikipedia.WikipediaPage(title)
    except Exception:
        wik = np.nan
    else:
        # A separate guard for the section lookup itself.
        try:
            for section_name in all_possibles:
                section_text = wik.section(section_name)
                # section() returns None when the page has no such section.
                if section_text is not None:
                    # Then that's the plot! Strip newlines and apostrophes
                    # and stop at the first candidate that exists.
                    plot = section_text.replace('\n', '').replace("'", "")
                    break
        except Exception:
            plot = np.nan

    plots[title] = plot