sebacruz
2/18/2012 - 8:53 AM

Normalizes URL according to RFC 3986 to use it in comparison operations.

Normalizes URL according to RFC 3986 to use it in comparison operations.

<?php
/**
 * Normalizes a URL according to RFC 3986 so it can be used in comparison
 * operations.
 *
 * The URL is taken by value: the caller's variable is never modified, the
 * normalized form is only available through the return value.
 *
 * Normalization steps applied:
 *  - 6.2.2.2: percent-encoded unreserved characters (ALPHA, DIGIT, "-", ".",
 *    "_", "~") are decoded; all other escapes get upper-case hex digits.
 *  - 6.2.2.1: scheme and host are lower-cased.
 *  - 6.2.2.3: "." and ".." path segments are resolved; runs of consecutive
 *    slashes are additionally collapsed into a single slash.
 *  - 6.2.3:   the default port (80 for http, 443 for https) is removed and an
 *    empty path becomes "/".
 * The fragment part is always stripped from the result.
 * 
 * @copyright  Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 * @package    Zend_OpenId
 * @filesource https://github.com/zendframework/zf2/blob/master/library/Zend/OpenId/OpenId.php
 * 
 * @param  string $url URL to be normalized
 * @return string|bool Normalized URL on success and FALSE on failure
 *                     (non-string input, malformed percent-escape, or a URL
 *                     that is not of the form scheme://host...).
 */
function normalize_url($url) {
	if (!is_string($url)) {
		return FALSE;
	}

	// RFC 3986, 6.2.2.  Syntax-Based Normalization

	// RFC 3986, 6.2.2.2 Percent-Encoding Normalization
	// Unreserved set from RFC 3986, 2.3 (note that it includes the digits).
	$unreserved = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~';
	$hexDigits  = '0123456789ABCDEFabcdef';

	$i = 0;
	$n = strlen($url);
	$res = '';

	while ($i < $n) {
		if ($url[$i] !== '%') {
			$res .= $url[$i++];
			continue;
		}

		// A '%' must be followed by exactly two hexadecimal digits.
		if ($i + 2 >= $n) {
			return FALSE;
		}
		$hex = substr($url, $i + 1, 2);
		if (strspn($hex, $hexDigits) !== 2) {
			return FALSE;
		}
		$i += 3;

		$ch = chr(hexdec($hex));
		if (strpos($unreserved, $ch) !== FALSE) {
			// Escapes of unreserved characters are equivalent to the
			// character itself, so decode them.
			$res .= $ch;
		} else {
			// Keep the escape, but with upper-case hex digits.
			$res .= '%' . strtoupper($hex);
		}
	}

	// Groups: 1 scheme, 2 userinfo (with trailing '@'), 3 host, 4 port,
	// 5 path, 6 query (with leading '?'), 7 fragment (with leading '#').
	if (!preg_match('|^([^:]+)://([^:@]*(?:[:][^@]*)?@)?([^/:@?#]*)(?:[:]([^/?#]*))?(/[^?#]*)?((?:[?](?:[^#]*))?)((?:#.*)?)$|', $res, $reg)) {
		return FALSE;
	}
	
	$scheme = $reg[1];
	$auth   = $reg[2];
	$host   = $reg[3];
	$port   = $reg[4];
	$path   = $reg[5];
	$query  = $reg[6];
	// $reg[7] is the fragment; it is intentionally dropped.

	// Compare against '' explicitly: empty() would also reject the host "0".
	if ($scheme === '' || $host === '') {
		return FALSE;
	}

	// RFC 3986, 6.2.2.1.  Case Normalization
	$scheme = strtolower($scheme);
	$host   = strtolower($host);

	// RFC 3986, 6.2.2.3.  Path Segment Normalization
	if ($path !== '') {
		$pos     = 0;
		$len     = strlen($path);
		$cleaned = '';
		
		while ($pos < $len) {
			if ($path[$pos] !== '/') {
				$cleaned .= $path[$pos++];
				continue;
			}

			// Skip the slash and any slashes directly following it.
			++$pos;
			while ($pos < $len && $path[$pos] === '/') {
				++$pos;
			}

			if ($pos < $len && $path[$pos] === '.') {
				++$pos;
				if ($pos < $len && $path[$pos] === '.') {
					++$pos;
					if ($pos === $len || $path[$pos] === '/') {
						// Complete ".." segment: drop the last segment
						// collected so far (if there is one).
						$slash = strrpos($cleaned, '/');
						if ($slash !== FALSE) {
							$cleaned = substr($cleaned, 0, $slash);
						}
					} else {
						// Segment merely starts with ".." (e.g. "..foo").
						$cleaned .= '/..';
					}
				} else if ($pos !== $len && $path[$pos] !== '/') {
					// Segment merely starts with "." (e.g. ".foo").
					$cleaned .= '/.';
				}
				// Otherwise it is a complete "." segment: emit nothing.
			} else {
				$cleaned .= '/';
			}
		}
		$path = $cleaned;
	}

	// RFC 3986, 6.2.3.  Scheme-Based Normalization
	// Compare the port as a digit string (ignoring leading zeros) instead of
	// with a loose ==, which would treat e.g. "8e1" as equal to 80.
	$portDigits = ltrim($port, '0');
	if (($scheme === 'http' && $portDigits === '80') ||
		($scheme === 'https' && $portDigits === '443')) {
		$port = '';
	}
	
	if ($path === '') {
		$path = '/';
	}

	// Test the port against '' so that an explicit port "0" is preserved.
	return $scheme
		. '://'
		. $auth
		. $host
		. ($port === '' ? '' : (':' . $port))
		. $path
		. $query;
}