dylan-k
9/30/2016 - 2:19 PM

wget crawler

wget crawler

#!/bin/bash
#
# Crawls a domain
# Retrieves all visible URLs and their page titles
# Saves to CSV
# $1 = URL
# $2 = csv filename
#
# USAGE:
# Save this script as, say, "crawler.sh".
# Then "chmod +x crawler.sh".
# Then run the script, passing the URL of the site to crawl and an output filename, for example: ./crawler.sh http://www.someSite.com output

# Terminal formatting variables (empty strings on terminals without these capabilities)
txtund=$(tput sgr 0 1)            # Underline
txtbld=$(tput bold)               # Bold
bldred=${txtbld}$(tput setaf 1)   # red
bldblu=${txtbld}$(tput setaf 4)   # blue
bldgreen=${txtbld}$(tput setaf 2) # green
bldwht=${txtbld}$(tput setaf 7)   # white
txtrst=$(tput sgr0)               # Reset
info=${bldwht}*${txtrst}          # Feedback
pass=${bldblu}*${txtrst}
warn=${bldred}*${txtrst}
ques=${bldblu}?${txtrst}

# Pass the URL as a printf ARGUMENT, never in the format string:
# a URL containing '%' would otherwise be interpreted as a format directive.
printf '%s=== Crawling %s ===  %s' "$bldgreen" "$1" "$txtrst"

# wget in spider mode; output mirrored to the wglog file via tee.
# man wget is your friend, but...
# --reject ignores specific file types (images, javascript etc.)
# --reject-regex ignores URL parts (e.g. URLs with query strings, via "(.*)\?(.*)")
# --no-check-certificate (be careful!) skips TLS certificate validation
wget --reject-regex "(.*)\?(.*)" --no-check-certificate --spider --recursive \
  --no-clobber --no-directories "$1" \
  --reject bmp,css,gif,ico,jpg,jpeg,js,mp3,mp4,pdf,png,swf,txt,xml,xls,zip,eot,svg,ttf,woff,woff2,rdf \
  2>&1 | tee wglog
printf '%s==========================================%s \n' "$bldgreen" "$txtrst"
printf '%s=== Crawl Finished... ===%s \n' "$bldgreen" "$txtrst"
printf '%s=== Begin retrieving page titles... ===%s \n' "$bldgreen" "$txtrst"
printf '%s==========================================%s \n' "$bldgreen" "$txtrst"
# Titles are written to "$2".csv below, so that is the file to tail.
printf '%s** Run tail -f %s.csv for progress%s \n' "$bldred" "$2" "$txtrst"

# From wglog, extract each URL wget visited (logged as "--<timestamp>--  URL"),
# then curl each one and append '"url","title"' to "$2".csv.
grep '^--' wglog | awk '{print $3}' | sort -u | while IFS= read -r url; do
  # URL goes in as a printf argument, not the format string (it may contain '%').
  printf '%s* Retrieving title for: %s%s%s \n' "$bldgreen" "$txtrst$txtbld" "$url" "$txtrst"
  # NOTE(review): the 'i' flag and 'T' command are GNU sed extensions — confirm GNU sed is available.
  title=$(curl -s "$url" | sed -n 's/.*<title>\(.*\)<\/title>.*/\1/ip;T;q')
  # Escape embedded double quotes so the CSV stays well-formed.
  printf '"%s","%s"\n' "${url//\"/\"\"}" "${title//\"/\"\"}" >> "$2".csv
done

# Clean up the intermediate wget log; -f so a missing log (failed crawl) is not an error,
# and exit 0 explicitly so rm's status is never the script's exit code.
rm -f -- wglog
exit 0