5/8/2013 - 4:22 AM

A li’l class for data URI manipulation in Python.

A li’l class for data URI manipulation in Python.

import binascii
import mimetypes
import re
import textwrap
import urllib

try:  # Python 2
    from urllib import quote as _url_quote
    from urllib import unquote as _url_unquote
except ImportError:  # Python 3
    from urllib.parse import quote as _url_quote
    from urllib.parse import unquote_to_bytes as _url_unquote

# Media type of the form ``type/subtype`` (e.g. ``text/plain``).
MIMETYPE_REGEX = r'[\w]+\/[\w\-\+\.]+'
_MIMETYPE_RE = re.compile('^{}$'.format(MIMETYPE_REGEX))

# Charset label (e.g. ``utf-8``).
CHARSET_REGEX = r'[\w\-\+\.]+'
_CHARSET_RE = re.compile('^{}$'.format(CHARSET_REGEX))

# Whole data URI: ``data:[<mimetype>][;charset=<charset>][;base64],<data>``.
# Every header component is optional; the ``data`` group captures everything
# after the first comma.
DATA_URI_REGEX = (
    r'data:' +
    r'(?P<mimetype>{})?'.format(MIMETYPE_REGEX) +
    r'(?:\;charset\=(?P<charset>{}))?'.format(CHARSET_REGEX) +
    r'(?P<base64>\;base64)?' +
    r',(?P<data>.*)'
)
# DOTALL lets ``.*`` cross newlines, so a payload that has been broken over
# several lines still matches.
_DATA_URI_RE = re.compile(r'^{}$'.format(DATA_URI_REGEX), re.DOTALL)

class DataURI(str):
    """A ``str`` subclass holding a ``data:`` URI.

    The string is validated when the instance is created. The ``mimetype``,
    ``charset``, ``is_base64`` and ``data`` properties re-parse the string on
    each access.
    """

    @classmethod
    def make(cls, mimetype, charset, base64, data):
        """Build a data URI from its components.

        Args:
            mimetype: Media type such as ``'text/plain'``, or ``None`` to
                leave it out of the URI.
            charset: Charset label such as ``'utf-8'``, or ``None`` to leave
                it out of the URI.
            base64: If true, base64-encode the payload and add the
                ``;base64`` flag; otherwise percent-encode the payload.
            data: The payload as a byte string. A text (unicode) string is
                first encoded with *charset*, or UTF-8 when *charset* is
                ``None``.

        Returns:
            A new instance of *cls*.

        Raises:
            ValueError: If *mimetype* or *charset* is malformed.
        """
        parts = ['data:']
        if mimetype is not None:
            if not _MIMETYPE_RE.match(mimetype):
                raise ValueError("Invalid mimetype: %r" % mimetype)
            parts.append(mimetype)
        if charset is not None:
            if not _CHARSET_RE.match(charset):
                raise ValueError("Invalid charset: %r" % charset)
            parts.extend([';charset=', charset])
        # Both encoders below work on bytes, so turn text into bytes first.
        if isinstance(data, type(u'')):
            data = data.encode(charset or 'utf-8')
        if base64:
            parts.append(';base64')
            # binascii is used instead of the base64 module because the
            # ``base64`` parameter shadows that module name in this method.
            # b2a_base64 appends a newline, which must not end up in the URI.
            encoded_data = binascii.b2a_base64(data).rstrip(b'\n')
            if not isinstance(encoded_data, str):
                # Python 3 returns bytes; the URI itself is a text string.
                encoded_data = encoded_data.decode('ascii')
        else:
            encoded_data = _url_quote(data)
        parts.extend([',', encoded_data])
        return cls(''.join(parts))

    @classmethod
    def from_file(cls, filename, charset=None, base64=True):
        """Build a data URI from the contents of a file.

        The mimetype is guessed from *filename*; it is omitted from the URI
        when no guess is available.

        Args:
            filename: Path of the file to embed.
            charset: Optional charset label to record in the URI.
            base64: Whether to base64-encode the payload (default ``True``).

        Returns:
            A new instance of *cls*.
        """
        mimetype, _ = mimetypes.guess_type(filename, strict=False)
        # Binary mode: the payload must be the file's exact bytes.
        with open(filename, 'rb') as fp:
            data = fp.read()
        return cls.make(mimetype, charset, base64, data)

    def __new__(cls, *args, **kwargs):
        """Create the string and reject it if it is not a valid data URI.

        Raises:
            ValueError: If the string does not match the data URI pattern.
        """
        uri = super(DataURI, cls).__new__(cls, *args, **kwargs)
        uri._parse  # Trigger any ValueErrors on instantiation.
        return uri

    def __repr__(self):
        """Return ``DataURI(<repr of the underlying string>)``."""
        return 'DataURI(%s)' % (super(DataURI, self).__repr__(),)

    def wrap(self, width=76):
        """Return a copy broken into lines of at most *width* characters.

        The wrapped text goes through the constructor again, so it is
        re-validated.
        """
        return type(self)('\n'.join(textwrap.wrap(self, width)))

    @property
    def mimetype(self):
        """The media type, or ``None`` when the URI does not specify one."""
        return self._parse[0]

    @property
    def charset(self):
        """The charset label, or ``None`` when the URI does not specify one."""
        return self._parse[1]

    @property
    def is_base64(self):
        """``True`` when the URI carries the ``;base64`` flag."""
        return self._parse[2]

    @property
    def data(self):
        """The decoded payload as a byte string (not decoded via charset)."""
        return self._parse[3]

    @property
    def _parse(self):
        """Split the URI into ``(mimetype, charset, is_base64, data)``.

        Raises:
            ValueError: If the string does not match the data URI pattern.
            binascii.Error: If a base64 payload is malformed (on Python 3
                this is a ``ValueError`` subclass).
        """
        match = _DATA_URI_RE.match(self)
        if not match:
            raise ValueError("Not a valid data URI: %r" % self)
        mimetype = match.group('mimetype') or None
        charset = match.group('charset') or None
        is_base64 = bool(match.group('base64'))
        payload = match.group('data')
        if is_base64:
            data = binascii.a2b_base64(payload)
        else:
            data = _url_unquote(payload)
        return mimetype, charset, is_base64, data


Data URI manipulation made easy.

This isn't very robust, and will reject a number of valid data URIs. However, it meets the most useful case: a mimetype, a charset, and the base64 flag.


>>> uri = DataURI('data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu')
>>> uri.mimetype
'text/plain'
>>> uri.charset
'utf-8'
>>> uri.is_base64
True
>>> uri.data
'The quick brown fox jumped over the lazy dog.'

Note that DataURI.data won't decode the data bytestring into a unicode string based on the charset.

Creating from a string

>>> made = DataURI.make('text/plain', charset='us-ascii', base64=True, data='This is a message.')
>>> made
DataURI('data:text/plain;charset=us-ascii;base64,VGhpcyBpcyBhIG1lc3NhZ2Uu')
>>> made.data
'This is a message.'

Creating from a file

This is really just a convenience method.

>>> png_uri = DataURI.from_file('somefile.png')
>>> png_uri.mimetype
>>> png_uri.data


This code is released under the Unlicense (see http://unlicense.org/).