CodyKochmann
5/9/2016 - 5:11 PM

Regex for all supported HTML tags in a page

These are regular expressions for matching all supported HTML tags in a page.

import re

# Pattern matching ONE opening, closing or self-closing tag of any element in
# the static list below (e.g. `<div class="x">`, `</p>`, `<br/>`).
#
# * The single capturing group holds the element name, so `re.findall`
#   returns the tag names and `re.sub(pattern, '', text)` strips the tags.
# * `script` / `noscript` are in the list: the tag markers themselves are
#   matched, but the text between `<script>` and `</script>` is NOT.
# * `(?=[\s\/>])` requires the name to end at whitespace, `/` or `>`, so the
#   short names (`a`, `b`, `p`, `s`, ...) cannot match the prefix of some other
#   tag such as `<bogus>`.
# * `[^>]*` stops at the first `>`; a greedy `.` here would swallow everything
#   up to the LAST `>` on the line and merge neighbouring tags into one match.
#   A literal `>` inside a quoted attribute value still ends the match early.
# * Names are lowercase only; pass `re.IGNORECASE` to match uppercase markup.
all_html_tag_regex = (
    r'<\/?('
    # document structure, headings (h1-h6) and basic blocks
    r'html|title|body|h[1-6]|p|br|hr'
    # inline text / phrasing elements
    r'|abbr|address|b|bdi|bdo|blockquote|cite|code|del|dfn|em|i|ins|kbd|mark'
    r'|meter|pre|progress|q|rp|rt|ruby|s|samp|small|strong|sub|sup|time|u|var'
    r'|wbr'
    # forms
    r'|form|input|textarea|button|select|optgroup|option|label|fieldset'
    r'|legend|datalist|keygen|output'
    # frames, images and media
    r'|iframe|img|map|area|canvas|figcaption|figure|audio|source|track|video'
    # links, navigation and lists
    r'|a|link|nav|ul|ol|li|dl|dt|dd|menu|menuitem'
    # tables
    r'|table|caption|th|tr|td|thead|tbody|tfoot|col|colgroup'
    # styling and sectioning
    r'|style|div|span|header|footer|main|section|article|aside|details'
    r'|dialog|summary'
    # metadata, scripting and embedded content
    r'|head|meta|base|script|noscript|embed|object|param'
    r')(?=[\s\/>])[^>]*>'
)

# Pattern for stripping markup from a page, including whole <script> and
# <style> elements together with their contents. The whole match is wrapped in
# one capturing group, so `re.findall` returns the matched text itself.
#
# Alternatives are tried in this order:
#   1. character references: named (`&amp;`, `&Eacute;`, `&frac12;`), decimal
#      (`&#39;`) and hexadecimal (`&#x27;`);
#   2. a `<script ...>` element up to the first `</script>`, contents included;
#   3. a `<style ...>` element up to the first `</style>`, contents included;
#   4. an HTML comment `<!-- ... -->`, even when it contains `>` or tags;
#   5. any other tag, doctype or processing instruction: `<` followed directly
#      by a letter, `/`, `!` or `?`, then anything except `<` / `>`, then `>`.
#
# Requiring a tag-start character right after `<` keeps plain text such as
# `1 < 2 and 3 > 2` intact, while `[^<>]*` accepts every attribute character
# (`#`, `&`, `{`, tabs, non-ASCII, ...) instead of a hand-picked whitelist.
# `script` / `style` are lowercase only; pass `re.IGNORECASE` for uppercase
# markup.
all_script_and_html_tags_regex = (
    r'('
    r'&(?:[a-zA-Z][a-zA-Z0-9]*|#[0-9]+|#[xX][0-9a-fA-F]+);'
    r'|<script\b[\s\S]*?<\/script\s*>'
    r'|<style\b[\s\S]*?<\/style\s*>'
    r'|<!--[\s\S]*?-->'
    r'|<[a-zA-Z\/!?][^<>]*>'
    r')'
)