kkroesch
6/30/2016 - 10:34 AM

Finds references to external scripts and stylesheets in an HTML page and prints curl commands for downloading those files.

Finds references to external scripts and stylesheets in an HTML page and prints curl commands for downloading those files.

import sys
import argparse
from HTMLParser import HTMLParser

# Module-level accumulator: AssetFinder appends every asset URL it finds
# here, and the script entry point reads it back to print the commands.
file_list = []


class AssetFinder(HTMLParser):
    """HTML parser that records the URLs of referenced external assets.

    For every ``<script>`` start tag the value of its ``src`` attribute is
    appended to the module-level ``file_list``; for every ``<link>`` start
    tag the value of its ``href`` attribute is appended. All other tags,
    end tags and text content are ignored (the base-class handlers are
    no-ops, so they are not overridden here).
    """

    # Maps each tag of interest to the attribute that carries its URL.
    _URL_ATTRIBUTES = {'script': 'src', 'link': 'href'}

    def handle_starttag(self, tag, attrs):
        """Record the asset URL of a ``<script>`` or ``<link>`` start tag.

        Args:
            tag: Tag name as passed in by ``HTMLParser``.
            attrs: List of ``(name, value)`` attribute pairs; ``value`` is
                ``None`` for an attribute written without a value.
        """
        wanted = self._URL_ATTRIBUTES.get(tag)
        if wanted is None:
            return
        for name, value in attrs:
            # Skip valueless/empty attributes (e.g. ``<script src>``): they
            # carry no URL and would otherwise yield a bogus entry.
            if name == wanted and value:
                file_list.append(value)

if __name__ == "__main__":
    # Command-line entry point: parse the HTML file named by -i/--input and
    # print one curl command per asset reference collected in file_list.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('-i', '--input', required=True)
    args = arg_parser.parse_args()

    # The context manager guarantees the file is closed, even if the read
    # raises.
    with open(args.input, 'r') as html_file:
        html = html_file.read()

    parser = AssetFinder()
    parser.feed(html)
    # NOTE(review): URLs are taken verbatim from the HTML and are not
    # shell-quoted; inspect the output before piping it into a shell.
    for url in file_list:
        # Single-argument print() emits the same text under Python 2's print
        # statement and Python 3's print function.
        print("curl -lsO %s" % url)