Juuggo
4/25/2017 - 1:07 AM

Remove HTML tags when adding words to index.

Remove HTML tags when adding words to index.

# Question 4: Remove Tags

# When we add our words to the index, we don't really want to include
# html tags such as <body>, <head>, <table>, <a href="..."> and so on.

# Write a procedure, remove_tags, that takes as input a string and returns
# a list of words, in order, with the tags removed.
# Tags are defined to be strings surrounded by < >.
# Words are separated by whitespace or tags.
# You may assume the input does not include any unclosed tags, that is,
# there will be no '<' without a following '>'.

def remove_tags(string):
    """Return the words of *string*, in order, with tags removed.

    A tag is any run of characters starting at '<' and ending at the next
    '>'. Words are separated by whitespace or by tags.

    Args:
        string: The text to process; may be empty.

    Returns:
        A list of the words found outside of tags. If a '<' is never
        closed, everything from that '<' onward is discarded.
    """
    # Collect the characters that lie outside tags, substituting a single
    # space for each tag so adjacent words stay separated; the final
    # split() then handles every kind of whitespace uniformly.
    pieces = []
    in_tag = False
    for char in string:
        if in_tag:
            # Skip tag contents; only the closing '>' ends the tag.
            if char == '>':
                in_tag = False
        elif char == '<':
            in_tag = True
            pieces.append(' ')
        else:
            # A '>' seen outside a tag is ordinary text and is kept.
            pieces.append(char)
    return ''.join(pieces).split()

def remove_tags_teacher_method(string):
    """Return the words of *string*, in order, with tags removed.

    Repeatedly finds the next '<', locates the '>' that follows it, and
    replaces the whole tag with a single space before splitting the
    remaining text on whitespace.

    Args:
        string: The text to process; may be empty.

    Returns:
        A list of the words found outside of tags. If a '<' has no
        following '>', everything from that '<' onward is discarded.
    """
    start = string.find('<')
    while start != -1:
        end = string.find('>', start)
        if end == -1:
            # Unclosed tag: drop the remainder instead of looping forever.
            string = string[:start]
            break
        # end + 1 so the closing '>' is removed along with the tag.
        string = string[:start] + ' ' + string[end + 1:]
        # Everything before `start` is already tag-free, so resume there.
        start = string.find('<', start)
    return string.split()

# Demo call on a sentence without tags. The parenthesised single-argument
# form of print behaves the same on Python 2 and Python 3.
print(remove_tags_teacher_method('This sentence has no tags.'))
# Incorrect. Your procedure did not return
#     "['This', 'sentence', 'has', 'no', 'tags.']" for the input
#     'This sentence has no tags.'

#
# print remove_tags('''<h1>Title</h1><p>This is a
#                     <a href="http://www.udacity.com">link</a>.<p>''')
# #>>> ['Title','This','is','a','link','.']
#
# print remove_tags('''<table cellpadding='3'>
#                      <tr><td>Hello</td><td>World!</td></tr>
#                      </table>''')
# #>>> ['Hello','World!']
#
# print remove_tags("<hello><goodbye>")
# #>>> []
#
# print remove_tags("This is plain text.")
# #>>> ['This', 'is', 'plain', 'text.']