# Remove HTML tags when adding words to index.
# Question 4: Remove Tags
# When we add our words to the index, we don't really want to include
# html tags such as <body>, <head>, <table>, <a href="..."> and so on.
# Write a procedure, remove_tags, that takes as input a string and returns
# a list of words, in order, with the tags removed.
# Tags are defined to be strings surrounded by < >.
# Words are separated by whitespace or tags.
# You may assume the input does not include any unclosed tags, that is,
# there will be no '<' without a following '>'.
def remove_tags(string):
    """Return the words of `string`, in order, with HTML-style tags removed.

    A tag is everything from a '<' up to and including the next '>'.
    Words are separated by whitespace or by tags.

    Args:
        string: the text to scan; may be empty.

    Returns:
        A list of words (possibly empty). A '>' that appears outside of
        any tag is ordinary text and is kept as part of a word. If a tag
        is never closed, everything from its '<' to the end of the input
        is discarded.
    """
    kept = []        # characters outside of tags; each tag becomes one space
    in_tag = False
    for char in string:
        if in_tag:
            # Everything inside a tag is skipped; only '>' ends the tag.
            if char == '>':
                in_tag = False
        elif char == '<':
            in_tag = True
            # A tag separates words just like whitespace does.
            kept.append(' ')
        else:
            kept.append(char)
    # split() with no arguments breaks on any run of whitespace and yields
    # [] for empty or all-whitespace text, so no special cases are needed.
    return ''.join(kept).split()
def remove_tags_teacher_method(string):
    """Return the words of `string`, in order, with HTML-style tags removed.

    Uses str.find to locate each '<' ... '>' pair and keeps only the text
    between tags. Each tag acts as a word separator.

    Args:
        string: the text to scan; may be empty.

    Returns:
        A list of words (possibly empty). If a tag is never closed,
        everything from its '<' to the end of the input is discarded.
    """
    pieces = []      # text segments found between tags
    pos = 0          # index just past the most recently removed tag
    start = string.find('<')
    while start != -1:
        pieces.append(string[pos:start])
        end = string.find('>', start)
        if end == -1:
            # Unclosed tag: nothing after it is text, so stop scanning
            # instead of looping forever on the same '<'.
            pos = len(string)
            break
        # Resume after the '>' so the closing bracket itself is dropped.
        pos = end + 1
        start = string.find('<', pos)
    pieces.append(string[pos:])
    # Joining with a space makes every removed tag a word boundary.
    return ' '.join(pieces).split()
# Demo call. The parenthesized single-argument form prints the same
# output under Python 2 and is also valid syntax under Python 3.
print(remove_tags_teacher_method('This sentence has no tags.'))
# Incorrect. Your procedure did not return
# "['This', 'sentence', 'has', 'no', 'tags.']" for the input
# 'This sentence has no tags.'
#
# print remove_tags('''<h1>Title</h1><p>This is a
# <a href="http://www.udacity.com">link</a>.<p>''')
# #>>> ['Title','This','is','a','link','.']
#
# print remove_tags('''<table cellpadding='3'>
# <tr><td>Hello</td><td>World!</td></tr>
# </table>''')
# #>>> ['Hello','World!']
#
# print remove_tags("<hello><goodbye>")
# #>>> []
#
# print remove_tags("This is plain text.")
# #>>> ['This', 'is', 'plain', 'text.']