steveosoule
9/22/2016 - 4:24 PM

WGET Crawl Script

WGET Crawl Script

#!/bin/sh

# wget --mirror --adjust-extension --page-requisites --execute robots=off --wait=30 --random-wait --convert-links --user-agent=Mozilla http://www.example.com

### V1
# wget \
#      --recursive \
#      --no-clobber \
#      --page-requisites \
#      --html-extension \
#      --convert-links \
#      --restrict-file-names=windows \
#      --domains www.example.com \
#      --no-parent \
#          www.example.com


### V2
# wget \
#      --recursive \
#      --no-clobber \
#      --page-requisites \
#      --html-extension \
#      --convert-links \
#      --execute robots=off \
#      --restrict-file-names=windows \
#      --domains www.example.com \
#      --no-parent \
#          www.example.com



# wget \
# 	--user-agent='Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1 (KHTML, like Gecko) CriOS/52.0.2725.0 Mobile/13B143 Safari/601.1.46' \
# 	--execute robots=off \
# 	--recursive \
# 	--mirror \
# 	--wait=10 \
# 	--random-wait \
# 		www.example.com \
# 			2>&1 | grep '^--' | awk '{ print $3 }' | grep -v '\.\(css\|js\|png\|gif\|jpg\|JPG\)$' > www.example.com.txt

# wget \
# 	--mirror \
# 	--recursive \
# 	--execute robots=off \
# 	--user-agent='Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1 (KHTML, like Gecko) CriOS/52.0.2725.0 Mobile/13B143 Safari/601.1.46' \
# 	--timestamping \
# 	--page-requisites \
# 	--html-extension \
# 	--restrict-file-names=windows \
# 	--wait=1 \
# 	--random-wait \
# 	--domains www.example.com \
# 	--debug \
# 	--output-file=sample.log \
# 	--progress=dot \
# 	--directory-prefix=sample \
# 		www.example.com


# wget \
# 	--mirror \
# 	--recursive \
# 	--execute robots=off \
# 	--user-agent='Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1 (KHTML, like Gecko) CriOS/52.0.2725.0 Mobile/13B143 Safari/601.1.46' \
# 	--timestamping \
# 	--page-requisites \
# 	--html-extension \
# 	--restrict-file-names=windows \
# 	--wait=1 \
# 	--random-wait \
# 	--domains www.example.com \
# 	--progress=bar \
# 		www.example.com

# Current version: mirror www.example.com into ./www.example.com for offline
# browsing, presenting a desktop Chrome user agent.
#
#   --mirror                       recursive, unlimited-depth, timestamped
#                                  download (already implies --recursive and
#                                  --timestamping, so those are not repeated)
#   --execute robots=off           do not honor robots.txt exclusions
#   --page-requisites              also fetch the images/CSS/JS each page needs
#   --adjust-extension             current name of the deprecated
#                                  --html-extension option
#   --restrict-file-names=windows  escape characters that are not valid in
#                                  Windows file names
#   --wait=1 --random-wait         pause between requests; --random-wait only
#                                  scales the --wait interval (0.5x to 1.5x),
#                                  so it has no effect unless --wait is set
#   --convert-links                rewrite links so the local copy is browsable
#   --domains www.example.com      do not follow links to other hosts
wget \
	--mirror \
	--execute robots=off \
	--user-agent='Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2725.0 Safari/537.36' \
	--page-requisites \
	--adjust-extension \
	--restrict-file-names=windows \
	--wait=1 \
	--random-wait \
	--convert-links \
	--domains www.example.com \
		www.example.com