"""Finds references to external scripts and stylesheets in an HTML page and prints download instructions (curl commands) for those files."""
import sys
import argparse
from HTMLParser import HTMLParser
# URLs of the external assets found so far, in document order.  Kept at
# module level because the script entry point reads it after feeding the
# parser; every AssetFinder instance appends to this same list.
file_list = []


class AssetFinder(HTMLParser):
    """HTML parser that records the URLs of external scripts and stylesheets.

    Feed it markup with ``feed()``; each ``<script src=...>`` and each
    ``<link href=...>`` whose ``rel`` contains ``stylesheet`` adds its URL to
    the module-level ``file_list``.  Other ``<link>`` elements (icons,
    canonical URLs, feeds, ...) are ignored because they are not stylesheets.

    End tags and text are not overridden: the base class already ignores them.
    """

    def handle_starttag(self, tag, attrs):
        """Record the asset URL carried by a ``script`` or stylesheet ``link`` tag.

        Args:
            tag: Tag name, already lower-cased by ``HTMLParser``.
            attrs: List of ``(name, value)`` pairs; ``value`` is ``None`` for
                an attribute written without a value.
        """
        if tag == 'script':
            url_attribute = 'src'
        elif tag == 'link' and self._is_stylesheet_link(attrs):
            url_attribute = 'href'
        else:
            return

        for name, value in attrs:
            if name != url_attribute or not value:
                # Wrong attribute, or no value at all (``<script src>`` gives
                # None): nothing that could be downloaded.
                continue
            url = value.strip()
            if url:
                file_list.append(url)

    @staticmethod
    def _is_stylesheet_link(attrs):
        """Return True when a ``rel`` attribute lists the ``stylesheet`` keyword.

        ``rel`` holds space-separated, case-insensitive keywords, so values
        such as ``"Alternate StyleSheet"`` match as well.
        """
        return any(
            name == 'rel' and value and 'stylesheet' in value.lower().split()
            for name, value in attrs
        )
def download_command(url):
    """Return the shell command line that downloads ``url`` with curl.

    The URL is shell-quoted when it contains characters a shell would
    interpret (``&``, ``?``, ``;``, spaces, ...), so the printed line can be
    pasted or piped into a shell as-is.  URLs made only of safe characters
    are emitted unchanged.
    """
    # Function-scope import keeps this block self-contained.  ``shlex.quote``
    # is the Python 3 name; Python 2 ships the same function as ``pipes.quote``.
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote
    # NOTE(review): ``-l`` is curl's --list-only switch; ``-L`` (follow
    # redirects) may have been intended.  Flags are kept as they were -- confirm.
    return "curl -lsO %s" % quote(url)


def main(argv=None):
    """Scan the HTML file named on the command line and print curl commands.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('-i', '--input', required=True,
                            help='path of the HTML file to scan')
    args = arg_parser.parse_args(argv)

    # The context manager closes the file even when reading fails.
    with open(args.input, 'r') as html_file:
        html = html_file.read()

    parser = AssetFinder()
    parser.feed(html)

    for url in file_list:
        if not url:
            # A valueless attribute is reported as None; there is nothing to
            # download, so do not print a bogus command for it.
            continue
        # A single parenthesised argument prints identically on Python 2 and 3.
        print(download_command(url))


if __name__ == "__main__":
    main()