# Regular Expressions (RegEx)

# 10/8/2019 - 8:17 PM
# Regular Expressions (RegEx)

########################################
### FIND TEXT PATTERNS WITH REGULAR EXPRESSIONS ###
########################################

'''
Regular expressions, or regexes, are descriptions for a pattern of text. \d stands for a digit character. All regex functions
in Python are in the re module.

You first create a Regex object that contains the pattern you're looking for. You then use the search() method of the Regex object
which searches whatever string you pass to it. The search() method will return None if the regex pattern is not found in the string.
Match objects have a group() method that will return the actual matched text from the string.
'''

# import re module that holds the regex functions
import re
text = '''
<a href="/babyname/Liiam" class="flex-1"><br />
    <a href="/babyname/Kirk" class="flex-1"><br />
    <a href="/babyname/Liv" class="flex-1"><br />
    <em>312-294-8293</em><br />
    <em>312-783-4938</em><br />
    <h1>x</h1>
    <h1>xx</h1>
    <h1>xxx</h1>
    <h1>xxxx</h1>
    '''

# 12 characters have special meanings - \, ^, $, ., |, ?, *, +, (, ), [, {
# these special characters are escaped by using \
# \d matches a digit character
# \w matches a word character (a letter, a digit, or an underscore), but not other punctuation like /, !, #, etc.
# \s matches a whitespace character, including tabs and line breaks
# [] creates a character class, such as [ae] which matches an a or an e
# . matches any single character except a line break - however, usually a class or negated class is faster and more precise

# There are different ways to find matches:
# match(): determines if the RE matches at the beginning of the string
# search(): scans a string, finding the location of the first match
# findall(): finds all substrings where the RE matches, returning them as a list
# finditer(): finds all substrings where the RE matches, returning them as an iterator of match objects


# search text for the digit pattern 'ddd-ddd-dddd'
re.search(r'\d\d\d-\d\d\d-\d\d\d\d', text) # result is <_sre.SRE_Match object; span=(158, 170), match='312-294-8293'>, span being the location and match being the string matched
re.findall(r'\d\d\d-\d\d\d-\d\d\d\d', text) # result is ['312-294-8293', '312-783-4938']
re.finditer(r'\d\d\d-\d\d\d-\d\d\d\d', text) # finditer() returns an iterator of match objects that you can loop through
for match in re.finditer(r'\d\d\d-\d\d\d-\d\d\d\d', text):
    print(match.span())

''' Result is:
(158, 170)
(190, 202)
'''

# search for a '/' followed by seven word characters
re.findall(r'/\w\w\w\w\w\w\w', text) # result is ['/babynam', '/babynam', '/babynam']

# search for one word character, then a whitespace character, then two word characters
re.search(r'\w\s\w\w', text) # result is <_sre.SRE_Match object; span=(2, 6), match='a hr'>
re.search(r'\w \w\w', text) # result is also <_sre.SRE_Match object; span=(2, 6), match='a hr'>

# example of character class, matching any one of the values in the class (and not multiple values).
# in this example, it will search for i or x, finding the first
re.search(r'[ix]', text) # result is <_sre.SRE_Match object; span=(21, 22), match='i'>, note the text is 'Liiam' but it only matches the first instance of 'i'

# example of character class range, matching lowercase letters from d through j
re.search(r'[d-j]', text) # result is <_sre.SRE_Match object; span=(4, 5), match='h'>
re.findall(r'[d-j]', text) # result is one entry per matching character, in order; it begins ['h', 'e', 'f', 'e', 'i', 'i', 'f', 'e', ...]

# example of character class with digits
re.search(r'[0-5]', text) # result is <_sre.SRE_Match object; span=(39, 40), match='1'>
re.findall(r'[0-5]', text) # result begins ['1', '1', '1', '3', '1', '2', '2', '4', '2', '3', ...]

# multiple ranges in one class, searching for digits from 0 through 6 or letters from r through y
re.search(r'[0-6r-y]', text) # result is <_sre.SRE_Match object; span=(5, 6), match='r'>
re.findall(r'[0-6r-y]', text) # result begins ['r', 'y', 's', 's', 'x', '1', 'r', ...]

# definition of your own character class
re.findall(r'[mfqt]', text) # result is ['f', 'm', 'm', 'f', 'f', 'm', 'f', 'f', 'm', 'f', 'm', 'm', 'm', 'm']

# example of ^ (first character in brackets), which negates the class: in this case, it'll search for characters that aren't \n (break) or any character in '<a href'
re.search(r'[^\n<a href]', text) # result is <_sre.SRE_Match object; span=(8, 9), match='='>
re.findall(r'[^\n<a href]', text) # result is every remaining character, one entry each; it begins ['=', '"', '/', 'b', 'b', 'y', 'n', 'm', '/', 'L', 'i', 'i', 'm', '"', 'c', 'l', 's', 's', ...]

# . which matches a single wildcard character, like a _ in SQL
# note this won't find b followed by more than one character followed by another b
re.findall(r'b.b', text) # ['bab', 'bab', 'bab']

# * matches ZERO OR MORE repetitions of the preceding character: 'b*y' is any run of b's (possibly none) followed by a y
re.findall('b*y', text) # result is ['by', 'by', 'by']

# .* matches any run of characters (except line breaks), like a % in SQL; it is greedy, so 'b.*y' runs from a b to the last y on the same line
re.findall('b.*y', text) # result is ['baby', 'baby', 'baby']

# ^ which anchors the search at the beginning of the string (note this is different than ^ in a class [], which negates the class)
re.search('^hey', 'hey look at that bird') # there is a match since 'hey' is at the beginning of the string <_sre.SRE_Match object; span=(0, 3), match='hey'>
re.search('^look', 'hey look at that bird') # there is no match since even though 'look' is in the string, it's not at the beginning


# $ which anchors the search at the end of the string
re.search('bird$', 'hey look at that bird') # there is a match since 'bird' is at the end of the string <_sre.SRE_Match object; span=(17, 21), match='bird'>
re.search('look$', 'hey look at that bird') # there is no match since even though 'look' is in the string, it's not at the end

# using {} to find a character repeating a given number of times
re.search('x{3}', text) # finds 'x' character repeating 3 times in a row, result is <_sre.SRE_Match object; span=(253, 256), match='xxx'>
re.findall('x{2,}', text) # finds 'x' character repeating 2 or more times in a row, result is ['xx', 'xxx', 'xxxx']
re.findall('x{2,4}', text) # character repeating between 2 to 4 times in a row, result is ['xx', 'xxx', 'xxxx']
re.findall('x{2,4}?', text) # adding a ? makes the quantifier non-greedy, meaning it'll match as short a string as possible: ['xx', 'xx', 'xx', 'xx']

# how to search for OR using |, in this case searching for m or x
re.search(r'm|x', text) # result is <_sre.SRE_Match object; span=(17, 18), match='m'>
re.findall(r'm|x', text) # result begins ['m', 'm', 'x', 'm', 'x', 'm', 'x', ...]

# you can do a case insensitive search using re.IGNORECASE or re.I
re.search('bird', 'hey look at that bird') # result is <_sre.SRE_Match object; span=(17, 21), match='bird'>
re.search('BiRd', 'hey look at that bird') # result is no match
re.search('BiRd', 'hey look at that bird', re.I) # result is <_sre.SRE_Match object; span=(17, 21), match='bird'>

###################################################
### WORKING EXAMPLE - FIND ALL MATCHES OF A WILDCARD PATTERN IN TEXT ###
###################################################
import re
# sample anchor tags: each href carries a baby name right after '/babyname/'
text = '''
    <a href="/babyname/Liam" class="flex-1">
    <a href="/babyname/Kirk" class="flex-1">
    <a href="/babyname/Liv" class="flex-1">
    '''

# the capture group (.*?) lazily takes everything after '/babyname/' up to the next double quote,
# and findall() returns only the captured part of each hit
babyNamePattern = re.compile(r'/babyname/(.*?)"')
matches = babyNamePattern.findall(text) # result is ['Liam', 'Kirk', 'Liv']

# print one captured name per line
for match in matches:
    print(match)


###################################################
### FIND ALL MATCHES - GREEDY VS. NON-GREEDY ###
###################################################

import re
# two rows of table markup; each row is a single line of nested tags
text = '''
<td><span data-sort-value="Blyton, Enid"><span class="vcard"><span class="fn"><a href="/wiki/Enid_Blyton">
<td><span data-sort-value="Sheldon, Sidney"><span class="vcard"><span class="fn"><a href="/wiki/Sidney_Sheldon" title="Sidney Sheldon">Sidney Sheldon</a></span></span></span>
    '''

# GREEDY: '.*' consumes as much as it can, so each hit runs from the first '<' to the LAST '>' on a line.
# Because '.' does not cross line breaks, the result is two entries, each one a whole row of markup.
greedyTagRun = re.compile(r'<.*>')
matches = greedyTagRun.findall(text)

# NON-GREEDY: the added '?' makes '.*?' stop at the NEXT '>', so every tag comes back on its own.
# Result is: '<td>', '<span data-sort-value="Blyton, Enid">', '<span class="vcard">', etc.
singleTag = re.compile(r'<.*?>')
matches = singleTag.findall(text)

# Same non-greedy search, but the wildcard sits inside () so findall() returns only the captured
# text and leaves out the '<' and '>' delimiters.
# Result is: 'td', 'span data-sort-value="Blyton, Enid"', 'span class="vcard"', etc.
tagContents = re.compile(r'<(.*?)>')
matches = tagContents.findall(text)

# Non-greedy capture anchored on a literal prefix.
# Result is: ['"Blyton, Enid"', '"Sheldon, Sidney"']
sortValue = re.compile(r'<span data-sort-value=(.*?)>')
matches = sortValue.findall(text)

# Wildcards can also appear OUTSIDE the capture group. Here the first '.*?' skips from
# '<span data-sort-value="' to the next ', ' (the surname), and the captured '(.*?)' then
# takes everything up to the next '">' (the first name).
# Result is: ['Enid', 'Sidney']
firstName = re.compile(r'<span data-sort-value=".*?, (.*?)">')
matches = firstName.findall(text)

# print the result of the last search
for match in matches:
    print(match)

###################################################
### USING COMPILE WITH MATCHING OBJECTS AND GROUPS ###
###################################################

# you can place your regex pattern into a compile object so you don't have to specify it for each query

import re
text = '''
<a href="/babyname/Liiam" class="flex-1"><br />
    <a href="/babyname/Kirk" class="flex-1"><br />
    <a href="/babyname/Liv" class="flex-1"><br />
    <em>312-294-8293</em><br />
    <em>312-783-4938</em><br />
    <h1>x</h1>
    <h1>xx</h1>
    <h1>xxx</h1>
    '''

# compile the pattern into a regex object that searches for digit groups separated by -
myRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')

# create a match object mo for the first place myRegex is found in the variable text
mo = myRegex.search(text)
mo # result is the first match only: <_sre.SRE_Match object; span=(158, 170), match='312-294-8293'>
mo.group() # group() returns the string matched by the RE, '312-294-8293'
mo.start() # start() returns the starting position of the match, 158
mo.end() # end() returns the end position, 170
mo.span() # span() returns both positions as a tuple, (158, 170)

# in most programs, the common style is to store the match object in a variable, and then check if it is None
text = '''
<a href="/babyname/Liiam" class="flex-1"><br />
    <a href="/babyname/Kirk" class="flex-1"><br />
    <a href="/babyname/Liv" class="flex-1"><br />
    <em>312-294-8293</em><br />
    <em>312-783-4938</em><br />
    <h1>x</h1>
    <h1>xx</h1>
    <h1>xxx</h1>
    '''

myRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
mo = myRegex.search(text)

if mo: # search() returns None (which is falsy) when nothing matches
    print('Match found: ', mo.group()) # group() gives the matched string
else:
    print('No match found')
# result is 'Match found:  312-294-8293'

# alternate example using findall(), which returns a list; an empty list is falsy, so the same check works
mo = myRegex.findall(text)
if mo:
    for match in mo:
        print(match)
else:
    print('No match found')

''' Result is:
312-294-8293
312-783-4938
'''

# alternate example using finditer()
# finditer() returns an iterator, and an iterator object is always truthy even when it yields nothing,
# so the matches are collected in a list first; otherwise the 'No match found' branch could never run
mo = list(myRegex.finditer(text))
if mo:
    for match in mo:
        print(match.span())
else:
    print('No match found')

# you can use () to create groups within your regex string
text = '''
<a href="/babyname/Liiam" class="flex-1"><br />
    <a href="/babyname/Kirk" class="flex-1"><br />
    <a href="/babyname/Liv" class="flex-1"><br />
    <em>312-294-8293</em><br />
    <em>312-783-4938</em><br />
    <h1>x</h1>
    <h1>xx</h1>
    <h1>xxx</h1>
    '''

myRegex = re.compile(r'(\d\d\d)-(\d\d\d)-(\d\d\d\d)') # regex string with three groups, each wrapped in ()
mo = myRegex.search(text)
mo.group() # result is the whole match, '312-294-8293'
mo.group(0) # group 0 is also the whole match, '312-294-8293'
mo.group(1) # result is '312'
mo.group(2) # result is '294'
mo.group(3) # result is '8293'
mo.groups() # result is all groups at once in a tuple, ('312', '294', '8293')

# if you use groups in your search string, findall() returns one tuple per match, holding that match's groups
myRegex = re.compile(r'(\d\d\d)-(\d\d\d)-(\d\d\d\d)') # regex string with three groups, each wrapped in ()
mo = myRegex.findall(text) # result is [('312', '294', '8293'), ('312', '783', '4938')]

# you can use a pipe | to match different options for a group in the same statement (a group that uses an OR statement)
text = 'I really like red cardinals'

# specify a group with pipes that looks for either 'red ', 'blue ', or 'green '
myRegex = re.compile(r'I really like (red |blue |green )cardinals')
mo = myRegex.search(text)
mo.group() # result is 'I really like red cardinals'
mo.group(1) # result is 'red ' (the trailing space is part of the group)

# the ? character makes the previous group optional

text = 'I really like cardinals'
# regex pattern where 'red ' is an optional group, and a match will be found even if that part doesn't exist
myRegex = re.compile(r'I really like (red )?cardinals')
mo = myRegex.search(text)
mo.group() # result is 'I really like cardinals'

text = 'I really like red cardinals'
# regex pattern where 'red ' is an optional group
myRegex = re.compile(r'I really like (red )?cardinals')
mo = myRegex.search(text)
mo.group() # result is 'I really like red cardinals'

# the * character means match ZERO OR MORE instances (an optional group that can optionally be repeated)

# example with zero instances of the optional group
text = 'I really like cardinals'
# regex pattern where 'red ' may appear any number of times, including not at all
myRegex = re.compile(r'I really like (red )*cardinals')
mo = myRegex.search(text)
mo.group() # result is 'I really like cardinals'

# example with one instance of the optional group
text = 'I really like red cardinals'
# regex pattern where 'red ' may appear any number of times, including not at all
myRegex = re.compile(r'I really like (red )*cardinals')
mo = myRegex.search(text)
mo.group() # result is 'I really like red cardinals'

# example with multiple instances of the optional group
text = 'I really like red red red red cardinals'
# regex pattern where 'red ' may appear any number of times, including not at all
myRegex = re.compile(r'I really like (red )*cardinals')
mo = myRegex.search(text)
mo.group() # result is 'I really like red red red red cardinals'

# the + character means match ONE OR MORE instances (a group that can optionally be repeated, but must have at least one)

# example with zero instances of the group
text = 'I really like cardinals'
# regex pattern where 'red ' must appear at least once
myRegex = re.compile(r'I really like (red )+cardinals')
mo = myRegex.search(text)
mo # result is None: there is no match, so calling mo.group() here would raise AttributeError

# example with one instance of the group
text = 'I really like red cardinals'
# regex pattern where 'red ' must appear at least once
myRegex = re.compile(r'I really like (red )+cardinals')
mo = myRegex.search(text)
mo.group() # result is 'I really like red cardinals'

# example with multiple instances of the group
text = 'I really like red red red red cardinals'
# regex pattern where 'red ' must appear at least once
myRegex = re.compile(r'I really like (red )+cardinals')
mo = myRegex.search(text)
mo.group() # result is 'I really like red red red red cardinals'

# curly brackets {} can be used to match specific repetitions

# example looking for red twice but not finding it
text = 'I really like red cardinals'
# regex pattern where 'red ' must appear exactly twice
myRegex = re.compile(r'I really like (red ){2}cardinals')
mo = myRegex.search(text)
mo # result is None: 'red ' appears only once, so calling mo.group() here would raise AttributeError

# example looking for red twice and finding it
text = 'I really like red red cardinals'
# regex pattern where 'red ' must appear exactly twice
myRegex = re.compile(r'I really like (red ){2}cardinals')
mo = myRegex.search(text)
mo.group() # result is 'I really like red red cardinals'

# example using a range in curly brackets, looking for 3, 4 or 5 repetitions of red
text = 'I really like red red red red cardinals'
# regex pattern where 'red ' must appear between 3 and 5 times
myRegex = re.compile(r'I really like (red ){3,5}cardinals')
mo = myRegex.search(text)
mo.group() # result is 'I really like red red red red cardinals'

# example using a range in curly brackets with ? to make it non-greedy, looking for 3, 4 or 5 repetitions of red
text = 'I really like red red red red cardinals'
# nothing follows the group in this pattern, so the non-greedy quantifier stops at the minimum of 3 repetitions
myRegex = re.compile(r'I really like (red ){3,5}?')
mo = myRegex.search(text)
mo.group() # result is 'I really like red red red '

###################################################
### SUBSTITUTING STRINGS WITH THE sub() METHOD ###
###################################################

# sub() (short for "substitute") returns a copy of the text with every match replaced
text = '''
<a href="/babyname/Liiam" class="flex-1"><br />
    <em>312-294-8293</em><br />
    <em>312-783-4938</em><br />
    '''

# lazily match one <em>...</em> element at a time
myRegex = re.compile(r'<em>.*?</em>')

# first, list what the pattern matches
mo = re.findall(myRegex, text)
mo # result is ['<em>312-294-8293</em>', '<em>312-783-4938</em>']

# then swap every match for the replacement text
replacementText = 'This text has been substringed! Booyah!'
mo = re.sub(myRegex, replacementText, text)
mo # result is '\n<a href="/babyname/Liiam" class="flex-1"><br />\n    This text has been substringed! Booyah!<br />\n    This text has been substringed! Booyah!<br />\n    '

###################################################
### MANAGING COMPLEX REGEXES WITH VERBOSE AND COMMENTS ###
###################################################

# for complicated regexes, you can break them into different lines with comments:
# re.VERBOSE makes the pattern ignore whitespace and treat everything after a # as a comment

phoneRegex = re.compile(r'''(
    (\d{3}|\(\d{3}\))?  # area code, with or without parentheses (optional)
    (\s|-|\.)?          # separator (optional)
    \d{3}               # first 3 digits
    (\s|-|\.)           # separator
    \d{4}               # last 4 digits
)''', re.VERBOSE)

########################################
### WORKING EXAMPLE - HOW TO DO REGEX PATTERN MATCHING ON VALUES IN A DICTIONARY ###
########################################

import re
# toy customer table: some values are phone numbers, the rest are arbitrary words
customerDatabase = {
    'ID-1': 'Whiskers',
    'ID-2': 'cat',
    'ID-3': '353-555-2523',
    'ID-4': 'sloth',
    'ID-5': '278-465-6475',
    'ID-6': '773-293-9382',
}

# compiled once, then reused for every value in the dictionary
phoneNumberFinder = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
print('PHONE NUMBERS FOUND:')

# walk the dictionary and test the value stored under each customer ID
for customerId in customerDatabase:
    found = phoneNumberFinder.search(customerDatabase[customerId])
    # search() returns None when the value holds no phone number. Calling .group() on None
    # would raise an error and stop the program, so those entries are skipped instead.
    if found is None:
        continue
    print('Customer: ', customerId, ', Match: ', found.group(), sep='')


########################################
### WORKING EXAMPLE - HOW TO DO PATTERN REGEX AGAINST VARIABLE THAT HOLDS TWO COLUMNS, AKA HOW TO SEARCH FOR A VALUE WITHIN A TABULAR DATASET WITH COLUMNS AND ROWS ###
########################################

import re
import pandas as pd
import numpy as np

# load the file
dataframe = pd.read_csv('C:\\Users\\mcmah\\Dropbox\\Code\\Python\\Jupyter\\Data Science Course\\Simple Linear Regression\\1.01. Simple linear regression.csv')

# preview what's in the GPA column
dataframe['GPA']
# specify the search pattern. Including 'GPA=' restricts the match to the GPA column, because each row is
# searched as the string form of its itertuples() tuple, which labels every value with its column name.
# The decimal point is escaped (\.) so it matches a literal '.' rather than any character.
# NOTE(review): the variable name is left over from the phone-number example; it holds a GPA pattern.
phoneNumberFinderRegex = re.compile(r'GPA=\d\.\d\d')
print('GPA VALUES FOUND:')
# loop through the rows. itertuples() yields one named tuple per row, whose string form looks like 'Pandas(Index=69, SAT=1931, GPA=3.58)'
for row in dataframe.itertuples():
    # search the string form of the whole row for the pattern
    matchingObject = phoneNumberFinderRegex.search(str(row))
    # check for None first: calling matchingObject.group() when nothing matched would raise an error
    if matchingObject:
        # row[0] is the first element of the tuple, which is Index, so you can identify the row where the match was found
        print('Row: ' + str(row[0]) + ', Match: ' + str(matchingObject.group()))




########################################
### WORKING EXAMPLE - HOW COPY A WEB PAGE'S CODE TO A VARIABLE AND SEARCH THE CODE FOR PATTERNS ###
########################################
import urllib.request
import re

# open the URL; the with-block closes the connection even if reading fails
with urllib.request.urlopen("https://nameberry.com/list/196/British-Boy-Names?all=1") as urlPath:
    # read the page source as raw bytes
    websiteBytes = urlPath.read()
# decode the bytes as utf8, exposing the actual HTML code
htmlCode = websiteBytes.decode("utf8")

# preview that the HTML code has been loaded
print(htmlCode)

# search for text within the HTML tags
matches  = re.findall(r'/babyname/(.*?)" ', htmlCode) # capture everything between '/babyname/' and the next '" '
for match in matches:
    print(match + ' McMahon')


########################################
### BABY NAME COPY WORK BABYNAMEWIZARD ###
########################################
import urllib.request
import re

# name-list pages to scrape, keyed by an arbitrary sequence number
# NOTE(review): entries 3-12 repeat the Russian URL and look like placeholders - replace them with the real pages
urls = {1: "http://www.babynamewizard.com/name-list/danish-boys-names-most-popular-names-for-boys-in-denmark", 
        2: "http://www.babynamewizard.com/name-list/russian-boys-names-most-popular-names-for-boys-in-russia",
        3: "http://www.babynamewizard.com/name-list/russian-boys-names-most-popular-names-for-boys-in-russia",
        4: "http://www.babynamewizard.com/name-list/russian-boys-names-most-popular-names-for-boys-in-russia",
        5: "http://www.babynamewizard.com/name-list/russian-boys-names-most-popular-names-for-boys-in-russia",
        6: "http://www.babynamewizard.com/name-list/russian-boys-names-most-popular-names-for-boys-in-russia",
        7: "http://www.babynamewizard.com/name-list/russian-boys-names-most-popular-names-for-boys-in-russia",
        8: "http://www.babynamewizard.com/name-list/russian-boys-names-most-popular-names-for-boys-in-russia",
        9: "http://www.babynamewizard.com/name-list/russian-boys-names-most-popular-names-for-boys-in-russia",
        10: "http://www.babynamewizard.com/name-list/russian-boys-names-most-popular-names-for-boys-in-russia",
        11: "http://www.babynamewizard.com/name-list/russian-boys-names-most-popular-names-for-boys-in-russia",
        12: "http://www.babynamewizard.com/name-list/russian-boys-names-most-popular-names-for-boys-in-russia",
        }

# names collected across all pages
matches = []

# dict.fromkeys() drops the repeated URLs while keeping their order, so each page is downloaded only once
for url in dict.fromkeys(urls.values()):
    # open the URL; the with-block closes the connection even if reading fails
    with urllib.request.urlopen(url) as urlPath:
        # read the page source as raw bytes
        mybytes = urlPath.read()

    # decode the bytes as utf8, exposing the actual HTML code
    htmlCode = mybytes.decode("utf8")

    # preview that the HTML code has been loaded
    print(htmlCode)

    # capture everything between '/namipedia/boy/' and the next '"', and add this page's names to the running list
    matches = matches + (re.findall(r'/namipedia/boy/(.*?)\"', htmlCode))

# print the final list!
for match in matches:
    print(match + ' McMahon')




########################################
### WORKING EXAMPLE - BABY NAME COPY WORK WIKI IRISH PEOPLE ###
########################################
import urllib.request
import re

# open the Wikipedia page this example is about; the with-block closes the connection even if reading fails
with urllib.request.urlopen("https://en.wikipedia.org/wiki/List_of_Irish_people") as urlPath:
    # read the page source as raw bytes
    mybytes = urlPath.read()

# decode the bytes as utf8, exposing the actual HTML code
htmlCode = mybytes.decode("utf8")

# preview that the HTML code has been loaded
print(htmlCode)

# capture the text between 'wiki/' and the next '_' in each link, i.e. the first word of each article name
matches = re.findall(r'wiki/(.*?)_', htmlCode) 
for match in matches:
    print(match.capitalize() + ' McMahon')
# Cacher is the code snippet organizer for pro developers
#
# We empower you and your team to get more done, faster
#
# Regular Expressions (RegEx)