cloudcalvin
2/3/2018 - 4:58 PM

Unix Shell-Script to crawl a list of website URLs using curl

Unix Shell-Script to crawl a list of website URLs using curl

#!/usr/bin/env bash
# Crawl a list of website URLs using curl and write the results to a logfile.
# Bash is required: the script relies on BASH_SOURCE and [[ ]] tests, which
# are not available in a plain POSIX /bin/sh such as dash.
timezone="Europe/Zurich"
# List of valid timezones: wikipedia.org/wiki/List_of_tz_database_time_zones
# File name of this script with any leading directory part stripped.
script="${0##*/}"
# Absolute directory that contains this script. Everything is quoted so a
# path with spaces works; cd's stdout is discarded because cd may print the
# resolved directory when CDPATH is set, which would pollute the captured value.
rootdir=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" > /dev/null && pwd)
# The logfile is created next to the script and named after it.
logfile="$script.log"
log="$rootdir/$logfile"
# Start timestamp rendered in the configured timezone (used in the mail subject).
now=$(TZ=":$timezone" date)
# Uncomment 'mailto=' (remove #) to enable emailing the log upon completion
#mailto="your@email.com"
mailsubj="$script log from $now"
#######################################
# Append one timestamped entry of the form "<date> [<level>] <message>"
# to the logfile.
# Globals:   timezone (read) - TZ name used to render the timestamp
#            log      (read) - path of the file that is appended to
# Arguments: $1 - level label, e.g. INFO or ERROR
#            $2 - message text
# Outputs:   a usage error on stderr when an argument is missing or empty
# Returns:   does not return on bad usage: exits the shell with status 2
#######################################
logging() {
  # Keep the timestamp local so the caller's global $now is not overwritten.
  local now
  now=$(TZ=":$timezone" date)
  if [[ -z "${1:-}" || -z "${2:-}" ]]; then
    # printf turns \n into a real line break (bash's echo prints it literally);
    # diagnostics go to stderr rather than stdout.
    printf '%s [ERROR] Nothing to log. Use:\nlogging <level> <result>\n' "$now" >&2
    exit 2
  fi
  # Quote the target so a logfile path containing spaces is a valid redirect.
  printf '%s [%s] %s\n' "$now" "$1" "$2" >> "$log"
}
# --- Main: validate input, crawl every URL, optionally mail the log ---------

# The path to the URL list (one URL per line) is the only, mandatory argument.
if [[ -z "${1:-}" ]]; then
  # printf renders \n as a real line break; usage errors belong on stderr.
  printf '%s [ERROR] Missing file input. Use:\n%s/%s /path/to/urls.txt\n' \
    "$now" "$rootdir" "$script" >&2
  exit 2
fi
input="$1"

# Fail early with a clear message instead of silently crawling nothing.
if [[ ! -r "$input" ]]; then
  logging "ERROR" "Cannot read file: $input"
  printf '%s [ERROR] Cannot read file: %s\n' "$now" "$input" >&2
  exit 1
fi

logging "INFO" "Reading file: $input"
# read -r keeps backslashes intact; the default IFS trims surrounding
# whitespace from each line. The '|| [[ -n ... ]]' part also processes a
# final line that has no trailing newline.
while read -r line || [[ -n "$line" ]]; do
  # Skip blank lines; curl has nothing to fetch for them.
  [[ -n "$line" ]] || continue
  logging "INFO" "Crawling URL: $line"
  curlstart=$(date +"%s")
  # curl parameters: -sS = silent but show errors; -L = follow redirects;
  # -w = custom output format; -o = discard the response body.
  # stdin is detached so curl can never consume the URL list feeding the loop.
  curlresult=$(curl -sSL -w '%{http_code} %{url_effective}' -o /dev/null "$line" < /dev/null)
  curldone=$(date +"%s")
  # logging exits the script on an empty message, so substitute a placeholder
  # if curl produced no output at all (e.g. curl is not installed).
  logging "INFO" "${curlresult:-no output from curl}"
  difftime=$((curldone - curlstart))
  # h:m:s - minutes are the remainder within the hour, not the total minutes.
  logging "INFO" "Crawl-time: $((difftime / 3600)):$((difftime % 3600 / 60)):$((difftime % 60))"
done < "$input"
logging "INFO" "Done reading file: $input"

# Mail the log only when a recipient has been configured via 'mailto'.
if [[ -n "${mailto:-}" && "$mailto" != " " ]]; then
  logging "INFO" "Sending Email to: $mailto"
  # Using postfix mail command to email the logfile contents
  mail -s "$mailsubj" "$mailto" < "$log"
fi
exit
https://www.apple.com/
https://wikipedia.org
https://swissmacuser.ch/
https://twitter.com/swissmacuser
# Example log output generated by curl-crawler
Sun Feb 19 21:56:07 CET 2017 [INFO] Reading file: ./urls.txt
Sun Feb 19 21:56:07 CET 2017 [INFO] Crawling URL: https://www.apple.com/
Sun Feb 19 21:56:07 CET 2017 [INFO] 200 https://www.apple.com/
Sun Feb 19 21:56:07 CET 2017 [INFO] Crawl-time: 0:0:0
Sun Feb 19 21:56:07 CET 2017 [INFO] Crawling URL: https://wikipedia.org
Sun Feb 19 21:56:07 CET 2017 [INFO] 200 https://www.wikipedia.org/
Sun Feb 19 21:56:07 CET 2017 [INFO] Crawl-time: 0:0:0
Sun Feb 19 21:56:07 CET 2017 [INFO] Crawling URL: https://swissmacuser.ch/
Sun Feb 19 21:56:08 CET 2017 [INFO] 200 https://swissmacuser.ch/
Sun Feb 19 21:56:08 CET 2017 [INFO] Crawl-time: 0:0:1
Sun Feb 19 21:56:08 CET 2017 [INFO] Crawling URL: https://twitter.com/swissmacuser
Sun Feb 19 21:56:09 CET 2017 [INFO] 200 https://twitter.com/swissmacuser
Sun Feb 19 21:56:09 CET 2017 [INFO] Crawl-time: 0:0:1
Sun Feb 19 21:56:09 CET 2017 [INFO] Done reading file: ./urls.txt