bpeterso2000
5/1/2014 - 3:05 AM

Safe encoding whitelist -- test whether the selected encoding name is safely supported by Python

Safe encoding whitelist -- test whether the selected encoding name is safely supported by Python

import argparse
import codecs
import re


"""
Reference: 'U+DEADBEEF: Why you shouldn't trust arbitrary text encodings'
<http://rspeer.github.io/blog/2014/03/30/unicode-deadbeef/>
"""

# Whitelist of encoding names this application accepts.  It is applied to
# the canonical name reported by ``codecs.lookup(...).name``.
#
# The hyphen after 'iso' is optional because Python's canonical spelling of
# the ISO-8859 family is 'iso8859-N'; a pattern that insists on 'iso-8859-N'
# can never match a looked-up name, so latin-1 and friends were rejected.
# The two-digit alternative is tried first and a trailing ``(?!\d)`` guards
# the branch so that 'iso8859-16' is not accepted via its 'iso8859-1' prefix.
#
# The pattern has no end anchor: when used with ``.match`` a name that merely
# starts with a listed one (e.g. 'utf-8-sig', 'utf-16-le') also passes.
# NOTE(review): confirm whether those suffixed variants are meant to be
# accepted before tightening this to a full match.
ENCODING_WHITELIST = re.compile(
    r'ascii|'
    r'utf-(8|16)|'
    r'cp(437|125[0-8])|'
    r'iso-?8859-(1[0-5]|[1-9])(?!\d)|'
    r'mac-roman')


def is_supported_encoding(encoding):
    """
    Check an encoding name against ENCODING_WHITELIST.

    The name is first resolved through ``codecs.lookup`` so that the
    whitelist is tested against the codec's reported name rather than
    against whatever spelling the caller supplied.

    :param encoding: encoding name to check
    :returns: the codec name reported by ``codecs.lookup`` when it matches
              the whitelist, otherwise None
    :raises: LookupError (Python unable to locate encoding name)
    """
    canonical = codecs.lookup(encoding).name
    if ENCODING_WHITELIST.match(canonical) is None:
        return None
    return canonical


if __name__ == '__main__':
    # Command-line check: look up the encoding given with -e/--encoding
    # (default 'utf-8') and report whether it is whitelisted.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-e', '--encoding',
        default='utf-8',
        help=("check to see if encoding is supported "
              "and return normalized encoding name"))
    options = cli.parse_args()

    try:
        normalized = is_supported_encoding(options.encoding)
    except LookupError:
        # Python itself has no codec registered under this name.
        message = (
            "LookupError: Specified encoding is not supported by Python.")
    else:
        if normalized:
            message = "Supported encoding: '{}'".format(normalized)
        else:
            # Known to Python, but not on the application's whitelist.
            message = (
                "Specified encoding is not supported by this application.")
    print(message)