<#
.SYNOPSIS
Requests the pages of a web application so that they are loaded once.

.DESCRIPTION
Starts at the root url, the favicon and the urls listed in /sitemap.xml and
follows the <link>, <img> and <a> references found in the returned content.

.PARAMETER baseUrl
Root url of the web application. Relative urls are resolved against it.

.PARAMETER proxyUrl
Url of the proxy that is passed to Invoke-WebRequest (-Proxy).
#>
Param(
    [Parameter (Mandatory=$false)]
    [string] $baseUrl = "http://localhost:83",
    [Parameter (Mandatory=$false)]
    # The "//" after the scheme is required for a well-formed proxy url.
    [string] $proxyUrl = "http://localhost:9999"
)
# Returns $true when $url must not be requested from the web application:
# absolute and protocol-relative urls, parent-relative paths, fragments, bare
# query strings and non-http schemes. Returns $false for everything else.
function IsIgnoredUrl ([string] $url)
{
    $ignoredPrefixes = @(
        "http://", "https://", "//", "../", "#", "?",
        "javascript:", "mailto:",
        # inline data and phone links can never be fetched from the site
        "data:", "tel:"
    )
    foreach($prefix in $ignoredPrefixes)
    {
        # Ordinal comparison: a culture-sensitive ToLower() turns "MAILTO:" into
        # "maılto:" under a Turkish culture, so the scheme would not be recognised.
        if($url.StartsWith($prefix, [System.StringComparison]::OrdinalIgnoreCase))
        {
            return $true
        }
    }
    return $false
}
# Turns $url into the url that is actually requested.
#   - protocol-relative urls ("//host/path") get the scheme of $baseUrl
#     ("http:" when $baseUrl is not an https url)
#   - urls for which IsIgnoredUrl returns $true are returned unchanged
#   - everything else is appended to $baseUrl with exactly one "/" in between
function ResolveUrl ([string] $baseUrl, [string] $url)
{
    if($url.StartsWith("//"))
    {
        # A protocol-relative url inherits the scheme of the site that references it.
        $scheme = "http:"
        if($baseUrl.StartsWith("https://", [System.StringComparison]::OrdinalIgnoreCase))
        {
            $scheme = "https:"
        }
        return $scheme + $url
    }
    if(IsIgnoredUrl $url)
    {
        return $url
    }
    return $baseUrl.TrimEnd("/") + "/" + $url.TrimStart("/")
}
# Requests $url (resolved against $baseUrl) once and recursively requests the
# stylesheets, images and anchors referenced by the returned content.
#
# Every url that is seen is recorded in $global:processedUrls, every url that is
# actually requested in $global:invokedUrls.
#
# Returns the response content, or nothing when the url was already processed,
# is ignored (see IsIgnoredUrl), is an image, or could not be loaded.
function PreloadUrl ([string] $url)
{
    $url = $url.Trim()
    if($global:processedUrls.Contains($url))
    {
        Write-Verbose ("Skipping url {0} (already loaded)" -f $url)
        return
    }
    $global:processedUrls += $url
    if(IsIgnoredUrl $url)
    {
        Write-Verbose ("Skipping url {0}" -f $url)
        return
    }
    $url = ResolveUrl $baseUrl $url
    Write-Host $url
    $global:invokedUrls += $url

    # Declare $content locally first: if the request fails and the variable is
    # never assigned, dynamic scoping would otherwise expose the caller's $content.
    $content = $null
    try
    {
        # $proxyUrl is the script parameter; it is never stored in the global
        # scope, so $global:proxyUrl would be empty here.
        $content = (Invoke-WebRequest $url -Proxy $proxyUrl).Content
    }
    catch
    {
        Write-Warning ("Failed to load {0}: {1}" -f $url, $_.Exception.Message)
        return
    }

    # Images are only requested, there is nothing to parse in them.
    foreach($imageExtension in @(".jpg", ".png", ".tiff", ".gif"))
    {
        if($url.EndsWith($imageExtension, [System.StringComparison]::OrdinalIgnoreCase))
        {
            return
        }
    }

    # Non-text content (e.g. a byte array) cannot contain references to follow.
    if($content -isnot [string])
    {
        return $content
    }

    $ignoreCase = [System.Text.RegularExpressions.RegexOptions]::IgnoreCase

    # "[^>]*?" keeps each match inside a single tag, so several tags on one line
    # are all found; "\b" prevents "<a" from matching e.g. "<abbr".
    # Group 1 is the quote character, group 2 the attribute value.
    $links = [regex]::Matches($content, "<link\b[^>]*?\shref\s*=\s*([`"'])(.*?)\1", $ignoreCase) | ForEach-Object { $_.Groups[2].Value }
    $links | ForEach-Object {
        $linkContent = PreloadUrl $_
        if($linkContent -is [string])
        {
            # url(...) references inside the stylesheet; optional quotes around the
            # url are stripped. Note that ResolveUrl resolves them against $baseUrl,
            # not against the location of the stylesheet.
            $cssFiles = [regex]::Matches($linkContent, "url\(\s*([`"']?)(.*?)\1\s*\)", $ignoreCase) | ForEach-Object { $_.Groups[2].Value }
            $cssFiles | ForEach-Object { PreloadUrl $_ | Out-Null }
        }
    }

    $imgs = [regex]::Matches($content, "<img\b[^>]*?\ssrc\s*=\s*([`"'])(.*?)\1", $ignoreCase) | ForEach-Object { $_.Groups[2].Value }
    $imgs | ForEach-Object { PreloadUrl $_ | Out-Null }

    $hrefs = [regex]::Matches($content, "<a\b[^>]*?\shref\s*=\s*([`"'])(.*?)\1", $ignoreCase) | ForEach-Object { $_.Groups[2].Value }
    $hrefs | ForEach-Object { PreloadUrl $_ | Out-Null }

    return $content
}
# keep a list of processed URLs so that no URL is processed twice
$global:processedUrls = @()
$global:invokedUrls = @()
# urls to index
$urls = @()
$urls += "/" # take the root url as a starting point
$urls += "/favicon.ico" # load the favicon, this is not in the HTML so needs to be added "manually"
# add all urls from the sitemap
$sitemapUrl = ResolveUrl $baseUrl "/sitemap.xml"
Write-Host ("Getting sitemap from {0}" -f $sitemapUrl)
$sitemap = $null
try
{
    $sitemap = [xml] (Invoke-WebRequest -Uri $sitemapUrl -Proxy $proxyUrl).Content
}
catch
{
    # a missing or invalid sitemap must not prevent the other urls from being loaded
    Write-Warning ("Could not load sitemap {0}: {1}" -f $sitemapUrl, $_.Exception.Message)
}
if($sitemap -ne $null)
{
    foreach($loc in @($sitemap.urlset.url.loc))
    {
        $locText = ([string] $loc).Trim()
        if($locText.Length -eq 0)
        {
            continue
        }
        # The sitemap protocol requires absolute urls in <loc>; PreloadUrl skips
        # absolute urls, so only the path and query are kept and later resolved
        # against $baseUrl.
        $parsedLoc = $null
        if([System.Uri]::TryCreate($locText, [System.UriKind]::Absolute, [ref] $parsedLoc) -and ($parsedLoc.Scheme -eq "http" -or $parsedLoc.Scheme -eq "https"))
        {
            $urls += $parsedLoc.PathAndQuery
        }
        else
        {
            $urls += $locText
        }
    }
}
foreach ($url in $urls)
{
    PreloadUrl $url | Out-Null
}
Write-Host ("Processed {0} urls of which {1} webapp urls are invoked" -f $global:processedUrls.Count, $global:invokedUrls.Count)