wikiti
2/6/2017 - 3:31 PM

MOSES Utilities

MOSES Utilities

#!/bin/bash
# -----------------------------------------------------------------------------
#  MOSES VARS UTILITY
# -----------------------------------------------------------------------------
# Author: Daniel Herzog, GearTranslations S.L.
# Date: 6th February, 2017
#
#   This script will serve as a source file for multiple common variables
# across the different utility scripts.
#
#   Disclaimer: This is a simple utility that I (Daniel) wrote to manage and
# allow quick, basic and replicable installations of the Moses toolchain.
# This is not a perfect script, and errors may occur during execution.
# -----------------------------------------------------------------------------

# Installation root; MOSES_DIR is read from the environment.
ROOT="${MOSES_DIR}"
MOSES_ROOT="${ROOT}/mosesdecoder"
MOSES_SCRIPTS="${MOSES_ROOT}/scripts"
MGIZA_PATH="${ROOT}/mgiza/mgizapp/inst/bin/"

# Folders appended to PATH: the decoder binaries plus the training, tokenizer
# and recaser script folders.
MOSES_PATHS="${MOSES_ROOT}/bin"
MOSES_PATHS+=":${MOSES_SCRIPTS}/training/"
MOSES_PATHS+=":${MOSES_SCRIPTS}/tokenizer/"
MOSES_PATHS+=":${MOSES_SCRIPTS}/recaser/"

# MOSES_INI_FILE="$DIR/build/model/model/moses-bin.ini"

# Folder that holds the models; MOSES_DIR_MODELS is read from the environment.
DIR_MODELS="${MOSES_DIR_MODELS}"

PATH="${PATH}:${MOSES_PATHS}"
export PATH
#!/bin/bash
# -----------------------------------------------------------------------------
#  MOSES TRANSLATOR UTILITY
# -----------------------------------------------------------------------------
# Author: Daniel Herzog, GearTranslations S.L.
# Date: 6th February, 2017
#
#   This script will facilitate the task of translating a group of phrases
# given a (trained) translation model.
#
#   Usage: Just run the command `moses-translate.sh` with the model's unique
# name, the input file and the output file. Make sure to back up any existing
# <output> file first, since it will be overwritten. The translations will be
# placed in the <output> file; one line per translated sentence. Note that the
# word alignment will be placed in the <output>.alignment file.
#
#     moses-translate.sh <model> <input strings file> <output> [dictionary]
#
#   Disclaimer: This is a simple utility that I (Daniel) wrote to manage and
# allow quick, basic and replicable installations of the Moses toolchain.
# This is not a perfect script, and errors may occur during execution.
# -----------------------------------------------------------------------------

# Abort on the first failing command.
set -e

USAGE="moses-translate.sh <model> <input strings file> <output> [dictionary]"

# -----------------------------------------------------------------------------
#  IMPORT MOSES VARS AND ARGUMENTS CHECK
# -----------------------------------------------------------------------------

# Load the shared variables file. The ":?" expansion stops with a readable
# message when MOSES_VARS_FILE is missing from the environment, instead of
# the bare "filename argument required" error of an empty `source`.
source "${MOSES_VARS_FILE:?MOSES_VARS_FILE is not set}"

# The model name, the input file and the output file are mandatory.
if [ "$#" -lt 3 ]; then
  # Quoted on purpose: unquoted, "[dictionary]" is a glob pattern and would
  # be replaced by any matching one-letter file name in the current folder.
  echo "$USAGE"
  exit 1
fi

# -----------------------------------------------------------------------------
#  PROCESS VARIABLES
# -----------------------------------------------------------------------------

# Positional arguments: <model> <input strings file> <output> [dictionary]
MODEL_NAME="${1}"
INPUT_FILE="${2}"
OUTPUT_FILE="${3}"
DICTIONARY_FILE="${4}"

# Locations derived from the model name
MODEL_FOLDER="${DIR_MODELS}/${MODEL_NAME}"
MODEL_BUILD_FOLDER="${MODEL_FOLDER}/build"
MODEL_BUILD_TMP_FOLDER="${MODEL_BUILD_FOLDER}/tmp"
MOSES_INI_FILE="${MODEL_BUILD_FOLDER}/model/model/moses-bin.ini"

# The word alignment is written next to the translated output
OUTPUT_FILE_ALIGNMENT="${OUTPUT_FILE}.alignment"

# Load the model's own settings (LANGUAGE_SOURCE, LANGUAGE_TARGET and
# XML_INPUT are used below). Quoted so that a models folder containing
# whitespace still resolves to a single path.
source "$MODEL_FOLDER/config"

# -----------------------------------------------------------------------------
#  PREPARE INPUT
# -----------------------------------------------------------------------------

# Variables & folders
mkdir -p -- "$MODEL_BUILD_TMP_FOLDER"

# Write the input string into a temp file
cp -- "$INPUT_FILE" "$MODEL_BUILD_TMP_FOLDER/in.raw"

# Tokenize string
tokenizer.perl -l "$LANGUAGE_SOURCE" < "$MODEL_BUILD_TMP_FOLDER/in.raw" \
  > "$MODEL_BUILD_TMP_FOLDER/in.tok"

# Lowercase the tokenized string
lowercase.perl < "$MODEL_BUILD_TMP_FOLDER/in.tok" \
  > "$MODEL_BUILD_TMP_FOLDER/in"

#######################################
# Marks dictionary terms inside a tokenized corpus with XML tags.
# Every dictionary line is expected to look like
#     <original> --- <translation> --- <probability>
# and each whole-token occurrence of <original> in the corpus becomes
#     <n translation="<translation>" prob="<probability>"><original></n>
# The prob attribute is left out when the line carries no probability.
# Arguments:
#   $1 - dictionary file
#   $2 - corpus file, edited in place
# Returns:
#   0 on success, non-zero as soon as a sed/escape step fails
#######################################
apply_dictionary() {
  local dictionary=$1 corpus=$2
  local IFS=$' \t\n'
  local -a fields
  local entry rest original translated prob attributes search markup

  # Add extra white spaces, so that the last token of every line is followed
  # by a space and can be matched by the pattern below.
  sed -i 's/$/  /' "$corpus" || return

  # The "|| (( ... ))" part keeps a last line that lacks a trailing newline.
  while read -r -a fields || (( ${#fields[@]} )); do
    # Re-joining the words trims the line and squeezes runs of blanks without
    # exposing the text to pathname expansion.
    entry="${fields[*]}"

    # Skip blank or malformed lines: an empty term would turn the search
    # pattern into "two spaces" and corrupt the padding added above.
    [[ "$entry" == *' --- '* ]] || continue

    original=${entry%% --- *}
    rest=${entry#* --- }
    translated=${rest%% --- *}
    prob=
    if [[ "$rest" == *' --- '* ]]; then
      prob=${rest#* --- }
      prob=${prob%% --- *}
    fi
    [[ -n "$original" && -n "$translated" ]] || continue

    attributes="translation=\"$translated\""
    [[ -z "$prob" ]] || attributes+=" prob=\"$prob\""

    # Escape the term for the regex side and the markup for the replacement
    # side, so that "/", "&", "." and friends are taken literally.
    search=$(printf '%s' "$original" | sed 's|[][\\/.^$*]|\\&|g') || return
    markup=$(printf '%s' "<n $attributes>$original</n>" \
      | sed 's|[\\/&]|\\&|g') || return

    # "\(^\| \)" also accepts the beginning of the line, so a term that is
    # the first token of a sentence is replaced as well.
    sed -i "s/\\(^\\| \\)$search /\\1$markup /g" "$corpus" || return
  done < "$dictionary"
}

# Use dictionary on this step (if present), and replace required strings.
if [[ -n "$DICTIONARY_FILE" && -f "$DICTIONARY_FILE" ]]; then
  apply_dictionary "$DICTIONARY_FILE" "$MODEL_BUILD_TMP_FOLDER/in"
fi

# -----------------------------------------------------------------------------
#  TRANSLATE CONTENT
# -----------------------------------------------------------------------------

# Run decoder. The tokenized input is read from the temp folder, the raw
# translation is written next to it and the word alignment goes to
# $OUTPUT_FILE_ALIGNMENT. XML_INPUT comes from the model's config; it falls
# back to 'exclusive' (the value moses-init.sh writes into new configs) so
# that "-xml-input" is never left without a value.
moses -f "$MOSES_INI_FILE" \
  -alignment-output-file "$OUTPUT_FILE_ALIGNMENT" \
  -xml-input "${XML_INPUT:-exclusive}" \
  < "$MODEL_BUILD_TMP_FOLDER/in" \
  > "$MODEL_BUILD_TMP_FOLDER/out.raw"

# -----------------------------------------------------------------------------
#  PREPARE OUTPUT
# -----------------------------------------------------------------------------

# Recase the output with the model's recaser, using the target language.
if ! recase.perl --in "$MODEL_BUILD_TMP_FOLDER/out.raw" \
    --lang "$LANGUAGE_TARGET" \
    --model "$MODEL_BUILD_FOLDER/recaser-model/moses.ini" \
    > "$MODEL_BUILD_TMP_FOLDER/out.cased"; then

  # Use default language 'en' if the previous command fails
  recase.perl --in "$MODEL_BUILD_TMP_FOLDER/out.raw" \
    --lang "en" \
    --model "$MODEL_BUILD_FOLDER/recaser-model/moses.ini" \
    > "$MODEL_BUILD_TMP_FOLDER/out.cased"
fi

# Detokenize the output
detokenizer.perl -l "$LANGUAGE_TARGET" \
  < "$MODEL_BUILD_TMP_FOLDER/out.cased" \
  > "$MODEL_BUILD_TMP_FOLDER/out"

# Copy the output to the output file (quoted: the path comes straight from
# the command line and may contain whitespace)
cp -- "$MODEL_BUILD_TMP_FOLDER/out" "$OUTPUT_FILE"

# Clear data
#rm -Rf $MODEL_BUILD_TMP_FOLDER

# -----------------------------------------------------------------------------
#  NOTIFY SUCCESS
# -----------------------------------------------------------------------------

echo "Translated results were placed on $OUTPUT_FILE"
#!/bin/bash
# -----------------------------------------------------------------------------
#  MOSES INITIALIZER UTILITY
# -----------------------------------------------------------------------------
# Author: Daniel Herzog, GearTranslations S.L.
# Date: 6th February, 2017
#
#   This script should be used to create the skeleton of a translation model.
# The generated tree should look like this:
#
# model_dir           # Root folder
# |- build            # Build folder
# |  |- corpus        # Processed corpus content
# |  |- lm            # Language model content
# |  |- model         # Translation model content
# |  |- recaser-model # Recasing model content
# |- config           # Config file, used as a variables file
# |- corpus           # Raw corpus folder
#   |- raw.source     # Source language corpus
#   |- raw.target     # Target language corpus
#
#   Models will be generated under the folder $MOSES_DIR_MODELS. Since the
# models are subfolders, prefixes may be added to their names to generate
# hierarchical subdirectories. Check moses-train.sh for more information
# about training models.
#
#   Usage: Just run this script with a model name, a source language and a
# target language. After that, the model template will be created in
# $MOSES_DIR_MODELS/<model name>. You can specify subfolders as a model
# name:
#
#     moses-init.sh category1/model_name en es
#
#   Disclaimer: This is a simple utility that I (Daniel) wrote to manage and
# allow quick, basic and replicable installations of the Moses toolchain.
# This is not a perfect script, and errors may occur during execution.
# -----------------------------------------------------------------------------

# Abort on the first failing command.
set -e

USAGE="moses-init.sh <model name> <source language> <target language>"

# -----------------------------------------------------------------------------
#  IMPORT MOSES VARS AND ARGUMENTS CHECK
# -----------------------------------------------------------------------------

# Load the shared variables file. The ":?" expansion stops with a readable
# message when MOSES_VARS_FILE is missing from the environment, instead of
# the bare "filename argument required" error of an empty `source`.
source "${MOSES_VARS_FILE:?MOSES_VARS_FILE is not set}"

# The model name and both languages are mandatory.
if [ "$#" -lt 3 ]; then
  echo "$USAGE"
  exit 1
fi

# -----------------------------------------------------------------------------
#  PROCESS VARIABLES
# -----------------------------------------------------------------------------

# Positional arguments: <model name> <source language> <target language>
MODEL_NAME="${1}"
LANGUAGE_SOURCE="${2}"
LANGUAGE_TARGET="${3}"

# Locations of the model tree
MODEL_FOLDER="${DIR_MODELS}/${MODEL_NAME}"
MODEL_CORPUS_FOLDER="${MODEL_FOLDER}/corpus"
MODEL_BUILD_FOLDER="${MODEL_FOLDER}/build"

# -----------------------------------------------------------------------------
#  CREATE AND CHECK FOLDERS
# -----------------------------------------------------------------------------

# Refuse to overwrite a model that already exists.
if [ -d "$MODEL_FOLDER" ]; then
  echo "'$MODEL_NAME' model already exists."
  exit 1
fi

# Create the raw corpus folder and the build tree: build/corpus, build/lm,
# build/model and build/recaser-model. The extra "build/build" folder that
# used to be created here is not part of that tree, so it is no longer made.
mkdir -p -- "$MODEL_CORPUS_FOLDER" \
  "$MODEL_BUILD_FOLDER/corpus" "$MODEL_BUILD_FOLDER/lm" \
  "$MODEL_BUILD_FOLDER/model" "$MODEL_BUILD_FOLDER/recaser-model"

# -----------------------------------------------------------------------------
#  CREATE CORPUS FILES
# -----------------------------------------------------------------------------

# Placeholder sentences for the new corpus. The text is always the same
# English/Spanish sample, whatever languages were requested; it is meant to
# be replaced with the real corpus before training.
SOURCE_STRINGS="A first string.
The second string.
One third phrase.
Finally, the fourth text."

TARGET_STRINGS="Una primera cadena.
La segunda cadena.
Una tercera frase.
Finalmente, el cuarto texto."

# printf writes the text verbatim (no backslash processing), and the quoted
# targets survive whitespace in the models folder or the model name.
printf '%s\n' "$SOURCE_STRINGS" > "$MODEL_CORPUS_FOLDER/raw.$LANGUAGE_SOURCE"
printf '%s\n' "$TARGET_STRINGS" > "$MODEL_CORPUS_FOLDER/raw.$LANGUAGE_TARGET"

# -----------------------------------------------------------------------------
#  SET CONFIGURATION FILES
# -----------------------------------------------------------------------------

#######################################
# Prints the content of a model's config file, which is later sourced as a
# variables file.
# The user-supplied values are written with printf's %q so that quotes,
# spaces or other shell metacharacters in them cannot break the file.
# Arguments:
#   $1 - model name
#   $2 - source language
#   $3 - target language
# Outputs:
#   the config assignments, one per line, on stdout
#######################################
render_model_config() {
  printf 'MODEL_NAME=%q\n' "$1"
  printf 'LANGUAGE_SOURCE=%q\n' "$2"
  printf 'LANGUAGE_TARGET=%q\n' "$3"
  # Fixed defaults for every new model
  printf '%s\n' \
    "LM_ORDER=3" \
    "REORDERING='wbe-msd-bidirectional-fe'" \
    "ALIGNMENT='grow-diag-final-and'" \
    "XML_INPUT='exclusive'"
}

CONFIG="$(render_model_config "$MODEL_NAME" "$LANGUAGE_SOURCE" "$LANGUAGE_TARGET")"

# Store the configuration inside the model folder (quoted: the models folder
# or the model name may contain whitespace).
echo "$CONFIG" > "$MODEL_FOLDER/config"

# -----------------------------------------------------------------------------
#  NOTIFY SUCCESS
# -----------------------------------------------------------------------------

echo "Model '$MODEL_NAME' was created on $MODEL_FOLDER"
#!/bin/bash
# -----------------------------------------------------------------------------
#  MOSES CLEAN UTILITY
# -----------------------------------------------------------------------------
# Author: Daniel Herzog, GearTranslations S.L.
# Date: 6th February, 2017
#
#   This script will facilitate the task of cleaning the content of a
# translation model folder, given its name. It will just clear the `build/`
# folder located in the model's subdirectory. Don't use wildcards in the
# model name.
#
#   Usage: Just run the command `moses-clean.sh` with the model's unique name.
# Make sure to create some backups before cleaning.
#
#     moses-clean.sh <your model name>
#
#   Disclaimer: This is a simple utility that I (Daniel) wrote to manage and
# allow quick, basic and replicable installations of the Moses toolchain.
# This is not a perfect script, and errors may occur during execution.
# -----------------------------------------------------------------------------

# Abort on the first failing command.
set -e

USAGE="moses-clean.sh <your model name>"

# -----------------------------------------------------------------------------
#  IMPORT MOSES VARS AND ARGUMENTS CHECK
# -----------------------------------------------------------------------------

# Load the shared variables file. The ":?" expansion stops with a readable
# message when MOSES_VARS_FILE is missing from the environment, instead of
# the bare "filename argument required" error of an empty `source`.
source "${MOSES_VARS_FILE:?MOSES_VARS_FILE is not set}"

# The model name is mandatory.
if [ "$#" -lt 1 ]; then
  echo "$USAGE"
  exit 1
fi

# -----------------------------------------------------------------------------
#  PROCESS VARIABLES
# -----------------------------------------------------------------------------

# The model to clean and the build folder inside it
MODEL_NAME="${1}"
MODEL_FOLDER="${DIR_MODELS}/${MODEL_NAME}"
MODEL_BUILD_FOLDER="${MODEL_FOLDER}/build"

# -----------------------------------------------------------------------------
#  CLEAN BUILD FOLDER
# -----------------------------------------------------------------------------

# Only models that already have a build folder can be cleaned.
if ! [ -d "$MODEL_BUILD_FOLDER" ]; then
  echo "'$MODEL_NAME' model was not found on '$MODEL_FOLDER'"
  exit 1
fi

# Quoted, with "--" and the ":?" guard: an unquoted path containing
# whitespace or glob characters would make rm delete unrelated paths.
rm -Rf -- "${MODEL_BUILD_FOLDER:?}"

# Recreate the empty build tree
mkdir -p -- "$MODEL_BUILD_FOLDER/corpus" "$MODEL_BUILD_FOLDER/lm" \
  "$MODEL_BUILD_FOLDER/model" "$MODEL_BUILD_FOLDER/recaser-model"

# -----------------------------------------------------------------------------
#  NOTIFY SUCCESS
# -----------------------------------------------------------------------------

echo "Contents of '$MODEL_NAME' were cleaned"
#!/bin/bash
# -----------------------------------------------------------------------------
#  MOSES TRAINER UTILITY
# -----------------------------------------------------------------------------
# Author: Daniel Herzog, GearTranslations S.L.
# Date: 6th February, 2017
#
#   Note: This script may be EXTREMELY slow due to moses training system; for
# a corpus of 1 million sentences, it may take up to 8 hours to train.
#
#   This script will facilitate the task of training a translation model. It's
# probably the most complex script in this Moses bundle. After an empty model
# has been created, and the corpus files have been filled, the model can be
# trained with this script.
#
#
#   Usage: Make sure your model's corpus, located in
# $MOSES_DIR_MODELS/<your model>/corpus/raw.* is filled with your sentences.
# Then, run the command `moses-train.sh` with the model's unique name. Make
# sure to create some backups before re-training.
#
#     moses-train.sh <your model name>
#
#   Disclaimer: This is a simple utility that I (Daniel) wrote to manage and
# allow quick, basic and replicable installations of the Moses toolchain.
# This is not a perfect script, and errors may occur during execution.
# -----------------------------------------------------------------------------

# -----------------------------------------------------------------------------
#  IMPORT MOSES VARS AND ARGUMENTS CHECK
# -----------------------------------------------------------------------------

# Abort on the first failing command.
set -e

USAGE="moses-train.sh <your model name>"

# Load the shared variables file. The ":?" expansion stops with a readable
# message when MOSES_VARS_FILE is missing from the environment, instead of
# the bare "filename argument required" error of an empty `source`.
source "${MOSES_VARS_FILE:?MOSES_VARS_FILE is not set}"

# The model name is mandatory.
if [ "$#" -lt 1 ]; then
  echo "$USAGE"
  exit 1
fi

# -----------------------------------------------------------------------------
#  PROCESS VARIABLES
# -----------------------------------------------------------------------------

MODEL_NAME="$1"
MODEL_FOLDER="$DIR_MODELS/$MODEL_NAME"

# -----------------------------------------------------------------------------
#  CLEAN FOLDER & CHANGE ENV
# -----------------------------------------------------------------------------

if ! [ -d "$MODEL_FOLDER" ]; then
  echo "'$MODEL_NAME' model was not found on '$MODEL_FOLDER'"
  exit 1
fi

# Start from an empty build tree
moses-clean.sh "$MODEL_NAME"

# The remaining steps use paths relative to the model folder
cd "$MODEL_FOLDER"

# "./" is required: for a name without a slash, `source` searches PATH first
# and could pick up an unrelated file called "config".
source ./config

# -----------------------------------------------------------------------------
#  PREPARE CORPUS DATA
# -----------------------------------------------------------------------------

# Tokenisation (expansions are quoted so that values coming from the model's
# config can never be split into several arguments)

tokenizer.perl -threads 2 -l "$LANGUAGE_SOURCE" \
  < "corpus/raw.$LANGUAGE_SOURCE" \
  > "build/corpus/tok.$LANGUAGE_SOURCE"
tokenizer.perl -threads 2 -l "$LANGUAGE_TARGET" \
  < "corpus/raw.$LANGUAGE_TARGET" \
  > "build/corpus/tok.$LANGUAGE_TARGET"

# Recaser model, trained on the tokenized (still cased) target corpus

train-recaser.perl --dir "$MODEL_FOLDER/build/recaser-model" \
  --corpus "build/corpus/tok.$LANGUAGE_TARGET" \
  -train-script "$MOSES_SCRIPTS/training/train-model.perl" \
  -scripts-root-dir "$MOSES_SCRIPTS"

# Lowercase corpus (the files keep the "cased" prefix used by later steps)
lowercase.perl < "build/corpus/tok.$LANGUAGE_SOURCE" \
  > "build/corpus/cased.$LANGUAGE_SOURCE"
lowercase.perl < "build/corpus/tok.$LANGUAGE_TARGET" \
  > "build/corpus/cased.$LANGUAGE_TARGET"

# Clean corpus, using 1 and 50 as the length limits passed to the cleaner

clean-corpus-n.perl build/corpus/cased "$LANGUAGE_SOURCE" "$LANGUAGE_TARGET" \
  build/corpus/clean 1 50

# -----------------------------------------------------------------------------
#  TRAIN LANGUAGE MODEL
# -----------------------------------------------------------------------------

# Build the ARPA language model from the cleaned target corpus, then
# binarize it. Expansions are quoted so that values coming from the model's
# config can never be split into several arguments.
lmplz -o "$LM_ORDER" < "build/corpus/clean.$LANGUAGE_TARGET" \
  > "build/lm/language-model.arpa.$LANGUAGE_TARGET"

build_binary "build/lm/language-model.arpa.$LANGUAGE_TARGET" \
  "build/lm/language-model.blm.$LANGUAGE_TARGET"

# -----------------------------------------------------------------------------
#  TRAIN TRANSLATION MODEL
# -----------------------------------------------------------------------------

# Absolute paths are used here, unlike in the steps above.
train-model.perl -root-dir "$MODEL_FOLDER/build/model" \
  --corpus "$MODEL_FOLDER/build/corpus/clean" \
  --f "$LANGUAGE_SOURCE" --e "$LANGUAGE_TARGET" \
  --mgiza --external-bin-dir="$MGIZA_PATH" \
  --lm \
  "0:$LM_ORDER:$MODEL_FOLDER/build/lm/language-model.blm.$LANGUAGE_TARGET" \
  -alignment "$ALIGNMENT" -reordering "$REORDERING"

#######################################
# Copies a moses.ini file and points the copy at the binarized tables.
# Three rewrites are applied to the copy:
#   phrase-table.gz                   -> phrase-table.minphr
#   PhraseDictionaryMemory            -> PhraseDictionaryCompact
#   reordering-table.<reordering>.gz  -> reordering-table
# Arguments:
#   $1 - source ini file
#   $2 - target ini file (overwritten)
#   $3 - reordering name used in the reordering table's file name
# Returns:
#   0 on success, non-zero if the copy or the rewrite fails
#######################################
write_binarized_ini() {
  local source_ini=$1 target_ini=$2 reordering=$3
  local reordering_re

  # Escape regex metacharacters so that the name is matched literally.
  reordering_re=$(printf '%s' "$reordering" \
    | sed 's|[][\\/.^$*]|\\&|g') || return

  cp -- "$source_ini" "$target_ini" || return

  # One pass instead of three; the dots of the file names are escaped so
  # that they no longer match arbitrary characters.
  sed -i \
    -e 's/phrase-table\.gz/phrase-table.minphr/g' \
    -e 's/PhraseDictionaryMemory/PhraseDictionaryCompact/g' \
    -e "s/reordering-table\\.$reordering_re\\.gz/reordering-table/g" \
    "$target_ini"
}

# Binarize phrases table
processPhraseTableMin -nscores 4 -threads 4 \
  -in build/model/model/phrase-table.gz \
  -out build/model/model/phrase-table

# Binarize reordering table
processLexicalTableMin -threads 4 \
  -in "build/model/model/reordering-table.$REORDERING.gz" \
  -out build/model/model/reordering-table

# Update moses file
write_binarized_ini build/model/model/moses.ini \
  build/model/model/moses-bin.ini "$REORDERING"

# -----------------------------------------------------------------------------
#  NOTIFY SUCCESS
# -----------------------------------------------------------------------------

echo "Model '$MODEL_NAME' was successfully trained!"
#!/bin/bash
# -----------------------------------------------------------------------------
#  MOSES INSTALLER UTILITY
# -----------------------------------------------------------------------------
# Author: Daniel Herzog, GearTranslations S.L.
# Date: 6th February, 2017
#
#   This script will facilitate the task of installing the Moses system on any
# Ubuntu machine. Instead of compiling and installing it manually, this script
# will do the whole process automatically. An Internet connection is required
# to download sources, system packages and binaries. Compilation may take a
# couple of hours to complete. The sibling scripts (moses-*.sh) must be placed
# in the same folder as this script.
#
#   The following steps are executed, in sequential order:
#
# 1. Setup user configuration variables (installation directory, sources
#    location, etc).
# 2. Setup process variables (installation directory per app, etc).
# 3. Download source files.
# 4. Compile all source files.
# 5. Install Moses path into the environment.
#
#   To keep the installation clean, the trained models should be located on
# $DIR_INSTALLATION/models/. Each subfolder will represent a custom and
# independent translation model, which should include the following elements:
#
# - Used corpus (raw and processed).
# - Language model (raw and binarized).
# - Recasing model (raw and binarized).
# - Reordering table (raw and binarized).
# - Translation model (raw and binarized).
#
#   After the installation is done, Moses is ready to be used. To
# manage translation models and language models (training), it's recommended
# to use the downloaded bash scripts after the installation is done, located in
# $DIR_INSTALLATION/scripts/. Review each script file to understand what they
# do, and how they work.
#
#   Usage: Just edit the variables on the section 'CONFIGURATION VARIABLES'
# according to your needs, add execution permissions, and execute the script:
#
#     vim install-moses.sh
#     chmod +x install-moses.sh
#     ./install-moses.sh
#
#   Disclaimer: This is a simple utility that I (Daniel) wrote to manage and
# allow quick, basic and replicable installations of the Moses toolchain.
# This is not a perfect script, and errors may occur during execution.
# -----------------------------------------------------------------------------

# Abort on the first failing command and trace every command as it runs.
set -ex

# -----------------------------------------------------------------------------
#  CONFIGURATION VARIABLES
# -----------------------------------------------------------------------------

# Target directories
DIR_INSTALLATION="${HOME}/moses"
DIR_MODELS="${HOME}/moses/models"

# Repositories urls (git)
REPO_MOSES="https://github.com/moses-smt/mosesdecoder"
REPO_MGIZA="https://github.com/moses-smt/mgiza"

# Versioned packages
PACKAGE_BOOST="libboost1.55-all-dev"

# Installation extra arguments
ARGS_INSTALL_MOSES=""

# Profile configuration file
BASH_CONFIG_FILE="${HOME}/.bashrc"

# -----------------------------------------------------------------------------
#  PROCESS VARIABLES
# -----------------------------------------------------------------------------

# Absolute path of the folder that contains this script
script_location="$(dirname "${BASH_SOURCE[0]}")"
CURRENT_DIR="$(cd "${script_location}" && pwd)"

# Per-application locations inside the installation folder
PATH_MOSES="${DIR_INSTALLATION}/mosesdecoder"
PATH_SCRIPTS="${DIR_INSTALLATION}/scripts"
PATH_MGIZA="${DIR_INSTALLATION}/mgiza"
PATH_MGIZA_APP="${PATH_MGIZA}/mgizapp"

# -----------------------------------------------------------------------------
#  INSTALL DEPENDENCIES
# -----------------------------------------------------------------------------

# Install apt-get packages (build tools and libraries)
APT_PACKAGES=(
  build-essential git-core pkg-config automake libtool
  wget zlib1g-dev python-dev libbz2-dev cmake
)
sudo apt-get install -y "${APT_PACKAGES[@]}"

# Install Boost library, in its own apt-get run. PACKAGE_BOOST is expanded
# unquoted, so it is split on whitespace like any other package list.
sudo apt-get install -y $PACKAGE_BOOST


# -----------------------------------------------------------------------------
#  DOWNLOAD SOURCE
# -----------------------------------------------------------------------------

mkdir -p -- "$DIR_INSTALLATION"
cd "$DIR_INSTALLATION"

# Skip repositories that were already cloned, so the installer can be run
# again after a failed or interrupted compilation: git clone refuses to
# write into an existing, non-empty folder and would abort the script.
if ! [ -d "$PATH_MOSES/.git" ]; then
  git clone "$REPO_MOSES" "$PATH_MOSES"
fi
if ! [ -d "$PATH_MGIZA/.git" ]; then
  git clone "$REPO_MGIZA" "$PATH_MGIZA"
fi

# -----------------------------------------------------------------------------
#  COMPILE AND MANAGE SOURCE CODE
# -----------------------------------------------------------------------------

# Compile Moses
cd "$PATH_MOSES"
make -f contrib/Makefiles/install-dependencies.gmake
# No sudo: the file belongs to the user who cloned the repository, and
# asking for a password here only risks blocking an unattended run.
chmod +x compile.sh
# Unquoted on purpose: ARGS_INSTALL_MOSES may hold several arguments.
./compile.sh $ARGS_INSTALL_MOSES
cd "$DIR_INSTALLATION"

# Compile MGIZA
cd "$PATH_MGIZA_APP"
cmake .
make
make install
# Put the helper scripts next to the binaries
cp inst/scripts/* inst/bin/
cd "$DIR_INSTALLATION"

#######################################
# Copies every *.sh file of a folder into the scripts folder and makes the
# copies executable.
# Arguments:
#   $1 - folder that holds the scripts to install
#   $2 - target scripts folder (created when missing)
# Returns:
#   0 on success, non-zero if a step fails
#######################################
install_custom_scripts() {
  local source_dir=$1 target_dir=$2

  # The target folder is not created anywhere else; copying several files
  # into a folder that does not exist makes cp fail.
  mkdir -p -- "$target_dir" || return
  cp -- "$source_dir"/*.sh "$target_dir"/ || return

  # No sudo: the copies were just created by the current user.
  chmod +x -- "$target_dir"/*.sh
}

# Manage custom scripts
install_custom_scripts "$CURRENT_DIR" "$PATH_SCRIPTS"
cd "$DIR_INSTALLATION"

# -----------------------------------------------------------------------------
#  SETUP FOLDERS & ENVIRONMENT VARIABLES
# -----------------------------------------------------------------------------

#######################################
# Prints the block of exports that is appended to the bash profile.
# Paths are written with printf's %q so that whitespace or other shell
# metacharacters in them survive when the profile is sourced.
# Arguments:
#   $1 - installation folder
#   $2 - models folder
#   $3 - scripts folder
# Outputs:
#   the commented block of export lines on stdout
#######################################
render_profile_variables() {
  local install_dir=$1 models_dir=$2 scripts_dir=$3

  printf '\n'
  printf '%s\n' \
    '# ------------------------------------' \
    '# MOSES CONFIGURATION VARIABLES' \
    '# ------------------------------------'
  printf 'export MOSES_DIR=%q\n' "$install_dir"
  printf 'export MOSES_DIR_MODELS=%q\n' "$models_dir"
  printf 'export MOSES_SCRIPTS=%q\n' "$scripts_dir"
  printf 'export MOSES_VARS_FILE=%q\n' "$scripts_dir/moses-vars.sh"
  # $PATH stays literal here; it is expanded when the profile is sourced.
  printf 'export PATH="$PATH":%q\n' "$scripts_dir"
}

# Create missing folders
mkdir -p -- "$DIR_MODELS"

# Create some configuration variables
VARIABLES="$(render_profile_variables "$DIR_INSTALLATION" "$DIR_MODELS" "$PATH_SCRIPTS")"

# Add config script to bash profile.
echo "$VARIABLES" >> "$BASH_CONFIG_FILE"


# -----------------------------------------------------------------------------
#  NOTIFY SUCCESS
# -----------------------------------------------------------------------------


echo "Moses has been installed on $DIR_INSTALLATION. Restart your session."