dkrashen
3/6/2015 - 8:20 PM

change urls in website directories with unfortunate encodings

change urls in website directories with unfortunate encodings

#!/bin/bash

# script to change homepage urls from fully qualified to absolute i.e.
# from http://my.domain/~user/somestuff/etc
# to   /~user/somestuff/etc
#
# due to the unfortunate circumstances of life, this script assumes
# that the pages are either in an encoding which sed can easily deal
# with, or perhaps in UTF-16LE, which is what Word produced. the strategy
# is kludgey, and relies on the fact that when sed fails for bad
# character encoding reasons, it produces an empty file.

# site prefix to strip, pre-escaped for use on the pattern side of a sed
# "s/.../.../" command: slashes are escaped because / is the delimiter, and
# dots are escaped so they match a literal "." rather than any character.
domain='http:\/\/www\.mydomain\.com'

# print the absolute path of the directory given as $1.
# the cd runs inside a subshell, so the caller's working directory is
# not changed. if the directory cannot be entered, nothing is printed on
# stdout and the return status is non-zero.
getpath(){
  # CDPATH is cleared so cd never echoes a resolved directory to stdout
  (CDPATH= cd -- "$1" && pwd)
}

# validate the script's command line; call as: checkargs "$@"
#   $1 - target directory, must exist
#   $2 - personal directory name
# on a problem, prints a message to stderr and exits the script with
# status 1. returns normally when the arguments are acceptable.
checkargs(){
  if [ "$#" -lt 2 ]; then
    echo "need more arguments. correct syntax is:" >&2
    echo "$0 <target files directoryname> <personal directory name>" >&2
    exit 1
  elif [ ! -d "$1" ]; then
    echo "$1 is not a directory, exiting" >&2
    exit 1
  fi
}

#function below will expect a file's full path
#arguments are: <target html file> [<ignored>]
#  $1 - file to rewrite in place, assumed to be UTF-16LE
#  $2 - accepted so existing callers keep working, but not used: the sed
#       expression is always read from the global variable sed_string
#the file is decoded to UTF-8, run through sed, and re-encoded. the result
#goes to "$1.changed_" first and replaces the original only when it is
#non-empty; an empty result is discarded and the original is kept.
convert_target_utf16le(){
  #convert file assuming UTF-16LE format
  iconv -f UTF-16LE -t UTF-8 < "$1" | sed -e "$sed_string" | \
    iconv -f UTF-8 -t UTF-16LE > "$1.changed_"
  #possible that the resulting file is empty!
  if [ -s "$1.changed_" ]; then
    mv -- "$1.changed_" "$1"
  else
    #looks like it's empty! get rid of it.
    rm -f -- "$1.changed_"
  fi
}

#function below will expect a file's full path
#arguments are: <target html file> [<ignored>]
#  $1 - file to rewrite in place, read by sed as-is (no re-encoding)
#  $2 - accepted so existing callers keep working, but not used: the sed
#       expression is always read from the global variable sed_string
#the result goes to "$1.changed_" first and replaces the original only
#when it is non-empty; an empty result is discarded and the original kept.
convert_target_standard(){
  #convert file assuming no special formatting
  sed -e "$sed_string" < "$1" > "$1.changed_"
  #possible that the resulting file is empty!
  if [ -s "$1.changed_" ]; then
    mv -- "$1.changed_" "$1"
  else
    #looks like it's empty! get rid of it.
    rm -f -- "$1.changed_"
  fi
}

#function below will expect a file's full path
#arguments are: <script> <target file/dir> <personal directory name>
#  $1 - path of this script; re-run with bash on each subdirectory
#  $2 - full path of the file or directory to process
#  $3 - forwarded unchanged as the second argument of the recursive run
#       and of the converter. the loop at the bottom of this file passes
#       the personal directory name here, not a sed expression.
#directories are recursed into, files ending in .htm/.html are converted,
#anything else is ignored.
process_target(){
  local script=$1
  local file=$2
  local name=$3
  local new_target_dir
  if [ -d "$file" ]; then
    new_target_dir=$(getpath "$file")
    # nothing sensible to recurse into if the path could not be resolved
    [ -n "$new_target_dir" ] || return
    bash "$script" "$new_target_dir" "$name"
  elif [[ $file =~ \.html?$ ]]; then
    # append newline- sed doesn't like files that don't end in newlines!
    # NOTE(review): this adds one byte on every run, also to UTF-16LE
    # files where a lone byte is not a complete character - confirm this
    # is still wanted.
    echo >> "$file"
    convert_target_utf16le "$file" "$name"
#    convert_target_standard "$file" "$name"
  fi
}

checkargs "$@"

# $2 is spliced into a sed "s" command below, so escape whatever is special
# on each side of it: regex metacharacters (and the / delimiter) for the
# pattern, and \ / & for the replacement.
user_pattern=$(printf '%s' "$2" | sed 's|[][\/.^$*]|\\&|g')
user_replacement=$(printf '%s' "$2" | sed 's|[\/&]|\\&|g')

personal_directory="$domain\/\~$user_pattern"
new_path="\/\~$user_replacement"
sed_string="s/$personal_directory/$new_path/g"

# get the full paths
script=$0
target_dir=$(getpath "$1")
# an empty path would turn the glob below into /*, so stop here instead
[ -n "$target_dir" ] || exit 1

# glob instead of parsing ls so names with whitespace stay in one piece.
# like ls without -a, * skips dotfiles.
for t in "$target_dir"/*; do
  # an unmatched glob is left as the literal pattern; skip it
  [ -e "$t" ] || [ -L "$t" ] || continue
  process_target "$script" "$t" "$2"
done