# jan-h
# 5/3/2017 - 6:50 AM
#
# crawl-site.ps1

# Script parameters (comments are allowed before Param, which must stay the
# first executable statement of the script).
Param(
    # Root URL of the web application to crawl.
    [Parameter (Mandatory=$false)]
    [string] $baseUrl = "http://localhost:83",

    # Proxy to route all requests through (e.g. Fiddler on port 9999).
    # BUGFIX: default was "http:localhost:9999" — missing "//" made it an
    # invalid URI, so Invoke-WebRequest -Proxy would reject it.
    [Parameter (Mandatory=$false)]
    [string] $proxyUrl = "http://localhost:9999"
)

function IsIgnoredUrl ([string] $url)
{
    # A URL is "ignored" (i.e. not resolved against the site base) when it is
    # absolute, protocol-relative, a parent path, a fragment/query-only link,
    # or uses a non-crawlable scheme. Comparison is case-insensitive.
    $lowered = $url.ToLower()
    $ignoredPrefixes = @(
        "http://", "https://", "//", "../", "#", "?", "javascript:", "mailto:"
    )
    foreach ($prefix in $ignoredPrefixes)
    {
        if ($lowered.StartsWith($prefix))
        {
            return $true
        }
    }
    return $false
}

function ResolveUrl ([string] $baseUrl, [string] $url)
{
    # Protocol-relative URLs get an explicit http scheme prepended.
    if ($url.StartsWith("//"))
    {
        return "http:" + $url
    }

    # Ignored URLs (absolute, fragments, other schemes, ...) pass through as-is.
    if (IsIgnoredUrl $url)
    {
        return $url
    }

    # Join base and relative path with exactly one slash between them.
    $trimmedBase = $baseUrl.TrimEnd("/")
    $trimmedPath = $url.TrimStart("/")
    return "{0}/{1}" -f $trimmedBase, $trimmedPath
}

function PreloadUrl ([string] $url)
{
    # Requests a URL through the configured proxy, then recursively preloads
    # every resource referenced from the returned HTML: <link> hrefs (plus any
    # url(...) references inside the linked CSS), <img> srcs and <a> hrefs.
    # Returns the downloaded content, or nothing for skipped/image URLs.
    $url = $url.Trim()

    # Never process the same URL twice (guards against crawl cycles).
    if($global:processedUrls.Contains($url))
    {
        Write-Verbose ("Skipping url {0} (already loaded)" -f $url)
        return
    }
    $global:processedUrls += $url

    if(IsIgnoredUrl($url))
    {
        Write-Verbose ("Skipping url {0}" -f $url)
        return
    }

    $url = ResolveUrl $baseUrl $url
    Write-Host $url

    $global:invokedUrls += $url
    # BUGFIX: was $global:proxyUrl — the script parameter lives in *script*
    # scope, not global scope, so the proxy was silently $null and ignored.
    $content = (Invoke-WebRequest $url -Proxy $script:proxyUrl).Content

    # Image responses have no crawlable content; stop recursing here.
    if($url.ToLower().EndsWith(".jpg") -or $url.ToLower().EndsWith(".png") -or $url.ToLower().EndsWith(".tiff") -or $url.ToLower().EndsWith(".gif"))
    {
        return
    }

    # <link>ed resources (stylesheets etc.); scan returned CSS for url(...) refs.
    $links = ([regex]::matches($content, "<link.+href\s*=\s*([`"'])(.*?)\1")) | ForEach { $_.Groups[2].Value }
    $links | ForEach {
            $linkContent = PreloadUrl $_
            if($linkContent -ne $null)
            {
                $cssFiles = ([regex]::matches($linkContent, ":url\((.*?)\)")) | ForEach { $_.Groups[1].Value }
                # BUGFIX: a bare `$cssFiles` statement stood here, leaking the
                # array into this function's pipeline output and polluting the
                # content returned to recursive callers.
                $cssFiles | ForEach { PreloadUrl $_ | Out-Null }
            }
        }

    # Images referenced from the page.
    $imgs = ([regex]::matches($content, "<img.+src\s*=\s*([`"'])(.*?)\1")) | ForEach { $_.Groups[2].Value }
    $imgs | ForEach { PreloadUrl $_ | Out-Null }

    # Follow anchors to crawl the rest of the site; discard their content.
    $hrefs = ([regex]::matches($content, "<a.+href\s*=\s*([`"'])(.*?)\1")) | ForEach { $_.Groups[2].Value }
    $hrefs | ForEach { PreloadUrl $_ | Out-Null }

    # (Removed: an unused $locs variable that duplicated the <a href> scan.)

    return $content
}

# keep a list of processed URLs so that no URL is processed twice,
# plus a separate list of the URLs that were actually requested
$global:processedUrls = @()
$global:invokedUrls = @()

# urls to index
$urls = @()
$urls += "/"            # take the root url as a starting point
$urls += "/favicon.ico" # the favicon is not referenced in HTML, so add it manually

# add all urls from the sitemap
$sitemapUrl = ResolveUrl $baseUrl "/sitemap.xml"
Write-Host ("Getting sitemap from {0}" -f $sitemapUrl)
$sitemap = [xml] (Invoke-WebRequest -Uri $sitemapUrl -Proxy $proxyUrl).Content
$urls += $sitemap.urlset.url.loc

# crawl each seed url; PreloadUrl recurses into every linked resource
foreach ($url in $urls)
{
    PreloadUrl $url | Out-Null
}

# BUGFIX: message previously read "ivoked"
Write-Host ("Processed {0} urls of which {1} webapp urls are invoked" -f $global:processedUrls.Count, $global:invokedUrls.Count)