smotaal
7/12/2019 - 5:37 PM

Data URL Header

Data URL Header

Data URL Header

To extract the specific [<mediatype>][;base64] portions of Data URLs — per MDN's doc — the following aspects were explored:

  1. Hard-coding a specific list of allowed <type>/<subtype> into the expression versus hard-coding only the <type>.

  2. A more complete capture of the *( ";" parameter ) portion — per RFC2397 — returning separately the attribute-value pairs and/or the last base64 portions.

The recommended expression for #28614 would roughly be the following (pending obvious refinements, if it is going to be used):

/^(?:((?:text|application)\/(?:[A-Z][-.0-9A-Z]*)?[A-Z]+)((?:;[A-Z][!%'()*\-.0-9A-Z_~]*=[!%'()*\-.0-9A-Z_~]*)*)(;base64)?),/i;

Note: See the annotated code snippet for more details.

This is expected to work as follows:

  • matcher.exec('text/javascript,')

    NOTE: Assuming text/javascript;, to be invalid

    [
      // 0: valid data-uri head
      'text/javascript,',
      // 1: mime
      'text/javascript',
      // 2: attributes
      '',
      // 3: base64
      undefined,
    ];
    
  • matcher.exec('text/javascript;base64,')

    NOTE: Assuming ;base64, and base64, to be invalid

    [
      // 0: valid data-uri head
      'text/javascript;base64,',
      // 1: mime
      'text/javascript',
      // 2: attributes
      '',
      // 3: base64
      ';base64',
    ];
    
  • matcher.exec('text/javascript;a=b;base64,')

    [
      // 0: valid data-uri head
      'text/javascript;a=b;base64,',
      // 1: mime
      'text/javascript',
      // 2: attributes
      ';a=b',
      // 3: base64
      ';base64',
    ];
    
[
  'text/javascript',
  'application/json',
  'application/wasm'
].flatMap(v => [v, `${v},`, `${v};base64,`, `${v};charset=US-ASCII,`]).reduce((r, v) => (r[v] = /^(?:((?:text|application)\/(?:[A-Z][-.0-9A-Z]*)?[A-Z]+)((?:;[A-Z][!%'()*\-.0-9A-Z_~]*=[!%'()*\-.0-9A-Z_~]*)*)(;base64)?),/i.exec(v),r),{});

result: ({
  "text/javascript": null,
  "text/javascript,": ["text/javascript,", "text/javascript", "", undefined],
  "text/javascript;base64,": ["text/javascript;base64,", "text/javascript", "", ";base64"],
  "text/javascript;charset=US-ASCII,": [
    "text/javascript;charset=US-ASCII,",
    "text/javascript",
    ";charset=US-ASCII",
    undefined
  ],
  "application/json": null,
  "application/json,": ["application/json,", "application/json", "", undefined],
  "application/json;base64,": ["application/json;base64,", "application/json", "", ";base64"],
  "application/json;charset=US-ASCII,": [
    "application/json;charset=US-ASCII,",
    "application/json",
    ";charset=US-ASCII",
    undefined
  ],
  "application/wasm": null,
  "application/wasm,": ["application/wasm,", "application/wasm", "", undefined],
  "application/wasm;base64,": ["application/wasm;base64,", "application/wasm", "", ";base64"],
  "application/wasm;charset=US-ASCII,": [
    "application/wasm;charset=US-ASCII,",
    "application/wasm",
    ";charset=US-ASCII",
    undefined
  ]
})
/**
 * Creates a case-insensitive matcher for the header of a Data URL, i.e. the
 * `<mediatype>[;<parameter>=<value>]*[;base64],` portion. The expression is
 * anchored at the start of the input and begins directly with the media type,
 * so callers are expected to pass the text without any leading scheme.
 *
 * Capture groups of a successful match:
 *   1. the `<type>/<subtype>` media type (required by this matcher),
 *   2. the raw `;attribute=value` parameters (an empty string when absent),
 *   3. `;base64` when present, otherwise `undefined`.
 *
 * @param {object} [options]
 * @param {Record<string, string>} [options.formats] - Additional media types
 *   (as keys) to accept in strict mode; 'text/javascript' and
 *   'application/json' are always included.
 * @param {boolean} [options.strict=false] - When true, only the media types
 *   listed in `formats` are accepted; otherwise any "text/*" or
 *   "application/*" media type of the supported shape is accepted.
 * @returns {RegExp} The header matcher (flags: `i`).
 */
const createDataURLHeaderMatcher = options => {
	// `{...options}` tolerates a missing (null/undefined) options argument.
	const {formats: customFormats, strict = false} = {...options};

	// <MEDIATYPE> data-uri forms only
	//
	//   SEE: https://tools.ietf.org/html/rfc2045#section-5.1
	//
	// Loose form: <TYPE>/<SUBTYPE> where <TYPE> is "text" or "application" only.
	const looseMediaType = /(?:text|application)\/(?:[A-Z][-.0-9A-Z]*)?[A-Z]+/i.source;

	// Strict form: an alternation of the known format keys only.
	const createStrictMediaType = () => {
		const formats = {
			...customFormats,
			// Null prototype so that only explicitly listed keys are considered.
			__proto__: null,
			'text/javascript': 'module',
			'application/json': 'json',
		};

		const alternatives = Object.getOwnPropertyNames(formats)
			// Skip keys that are not shaped like <TYPE>/<SUBTYPE>.
			.filter(type => /^[A-Z]+\/(?:[A-Z][-.0-9A-Z]*)?[A-Z]+$/i.test(type))
			// Escape regular expression metacharacters (e.g. "." in a subtype).
			.map(type => type.replace(/[\\^$*+?.()|[\]{}]/g, '\\$&'));

		return `(?:${alternatives.join('|')})`;
	};

	const mediaType = strict ? createStrictMediaType() : looseMediaType;

	// <PARAMETER> "safe" uri tokens only (no quotes/spaces)
	//
	// Derived from:
	//   Array(255)
	//     .fill(undefined)
	//     .map((v,i)=>encodeURIComponent(v=String.fromCodePoint(i))===v&&v)
	//     .filter(Boolean)
	const parameters = /(?:;[A-Z][!%'()*\-.0-9A-Z_~]*=[!%'()*\-.0-9A-Z_~]*)*/i.source;

	return new RegExp(`^(?:(${mediaType})(${parameters})(;base64)?),`, 'i');
};