pcoughlin
4/8/2016 - 11:10 PM

Mirror a site. Start with the base URL and $URL/sitemap.xml

Mirror a site. Start with the base URL and $URL/sitemap.xml

# Use wget to mirror a site, then mirror every URL listed in its sitemaps.
#   $1 is site URL
#     - format is http(s)://www.sitename.com/ (or no slash at end)
#   ${1%%+(/)} removes all slashes at end (requires extglob)
#   ${1%/} removes 1 slash at end
# The mirror is written under the current directory; the log goes to
# <basename of URL>/_MirrorSite.log.

if [[ $# -lt 1 || -z $1 ]]; then
  printf 'Usage: %s URL\n' "${0##*/}" >&2
  exit 2
fi

# Set extglob setting in bash for pattern matching (used for "${1%%+(/)}").
# It has to be enabled before that expansion runs.
shopt -s extglob

Mirror_Start=$(date)
# Site URL without trailing slashes, so "$MirrorDir/sitemap.xml" never
# ends up containing "//".
MirrorDir="${1%%+(/)}"
LogDir=$(basename -- "$MirrorDir")
MirrorLog="$LogDir/_MirrorSite.log"

# -p so that re-running against an existing mirror does not fail here.
if ! mkdir -p -- "$LogDir"; then
  printf 'Cannot create log directory: %s\n' "$LogDir" >&2
  exit 1
fi

# With Fix of URLs after download
# (an array keeps every option a separate word without relying on splitting)
WgetParms=(-E -Kk --mirror -p -e robots=off)
# Without Fix of URLs after download
#WgetParms=(--mirror -e robots=off)

# Fetch one sitemap and mirror every URL found in its <loc> elements.
#   $1 - URL of the sitemap to fetch
# Globals read: WgetParms (wget options), MirrorLog (log file path)
# Prints a 'Sitemap=' status line; a failed fetch is also appended to the log
# together with wget's exit status.
mirror_sitemap() {
  local sitemap_url=$1
  local sitemap_xml url_list rc

  # Capture the document itself: the extraction below must work on what was
  # just downloaded, not on a local file that may not exist.
  if sitemap_xml=$(wget -q -O- "$sitemap_url"); then
    url_list=$(grep -o '<loc>.*</loc>' <<<"$sitemap_xml" | grep -o 'http[^<"]*')
    # Skip wget entirely when the sitemap lists no URLs.
    if [[ -n $url_list ]]; then
      # -i - makes wget read the URL list (one per line) from stdin.
      # wget logs to stderr, hence 2>&1.
      wget "${WgetParms[@]}" -i - <<<"$url_list" >>"$MirrorLog" 2>&1
    fi
    echo 'Sitemap='"|$sitemap_url|"
  else
    rc=$?
    echo 'Sitemap='"|$sitemap_url| FAILED! Error=$rc" | tee -a "$MirrorLog"
  fi
}

# Mirror from default home page (wget logs to stderr, hence 2>&1)
wget "${WgetParms[@]}" "$1" >"$MirrorLog" 2>&1

# Mirror from /sitemap.xml and /sitemap/ if they exist
echo '========================== Trying sitemap.xml =============================' | tee -a "$MirrorLog"
mirror_sitemap "$MirrorDir/sitemap.xml"
mirror_sitemap "$MirrorDir/sitemap/"

printf '%s %s\n' "$Mirror_Start" "<-- Start time" | tee -a "$MirrorLog"
printf '%s %s\n' "$(date)" "<---- End Time" | tee -a "$MirrorLog"