MOSES Utilities
#!/bin/bash
# -----------------------------------------------------------------------------
# MOSES VARS UTILITY
# -----------------------------------------------------------------------------
# Author: Daniel Herzog, GearTranslations S.L.
# Date: 6th February, 2017
#
# This script will serve as a source file for multiple common variables
# across the different utility scripts.
#
# Disclaimer: This is a simple utility that I (Daniel) wrote to manage and
# allow quick, basic and replicable installations of the Moses toolchain.
# This is not a perfect script, and errors may occur during execution.
# -----------------------------------------------------------------------------
# Installation root; every Moses location below is derived from it.
ROOT="${MOSES_DIR}"
MOSES_ROOT="${ROOT}/mosesdecoder"
MOSES_SCRIPTS="${MOSES_ROOT}/scripts"
# Directory holding the MGIZA binaries.
MGIZA_PATH="${ROOT}/mgiza/mgizapp/inst/bin/"
# Colon-separated list of the Moses binary and helper-script directories.
MOSES_PATHS="${MOSES_ROOT}/bin:${MOSES_SCRIPTS}/training/:${MOSES_SCRIPTS}/tokenizer/:${MOSES_SCRIPTS}/recaser/"
# MOSES_INI_FILE="$DIR/build/model/model/moses-bin.ini"
# Folder under which the translation models live.
DIR_MODELS="${MOSES_DIR_MODELS}"
# Append the Moses directories to the search path of the sourcing script.
PATH="${PATH}:${MOSES_PATHS}"
export PATH
#!/bin/bash
# -----------------------------------------------------------------------------
# MOSES TRANSLATOR UTILITY
# -----------------------------------------------------------------------------
# Author: Daniel Herzog, GearTranslations S.L.
# Date: 6th February, 2017
#
# This script will facilitate the task of translating a group of phrases
# given a (trained) translation model.
#
# Usage: Just run the command `moses-translate.sh` with the model's unique
# name, the input file and the output file. Back up any existing <output> file
# first, since it will be overwritten. The translations will be placed in the
# <output> file, one line per translated sentence. Note that the word
# alignment will be placed in the <output>.alignment file.
#
# moses-translate.sh <model> <input strings file> <output> [dictionary]
#
# Disclaimer: This is a simple utility that I (Daniel) wrote to manage and
# allow quick, basic and replicable installations of the Moses toolchain.
# This is not a perfect script, and errors may occur during execution.
# -----------------------------------------------------------------------------
set -e
USAGE="moses-translate.sh <model> <input strings file> <output> [dictionary]"
# -----------------------------------------------------------------------------
# HELPERS
# -----------------------------------------------------------------------------
# Escape a literal string so it can be used as a sed basic-regex pattern
# inside an s/.../.../ command (escapes regex metacharacters and '/').
escape_sed_pattern() {
  printf '%s' "$1" | sed -e 's|[][\.*^$/]|\\&|g'
}
# Escape a literal string so it can be used in the replacement part of an
# s/.../.../ command (escapes '\', '&' and '/').
escape_sed_replacement() {
  printf '%s' "$1" | sed -e 's|[\&/]|\\&|g'
}
# -----------------------------------------------------------------------------
# IMPORT MOSES VARS AND ARGUMENTS CHECK
# -----------------------------------------------------------------------------
# Fail with a clear message when the Moses environment variables are missing.
: "${MOSES_VARS_FILE:?MOSES_VARS_FILE is not set}"
# shellcheck source=/dev/null
source "$MOSES_VARS_FILE"
if [ $# -lt 3 ]; then
  # Quoted so that "[dictionary]" is never expanded as a glob pattern.
  echo "$USAGE"
  exit 1
fi
# -----------------------------------------------------------------------------
# PROCESS VARIABLES
# -----------------------------------------------------------------------------
MODEL_NAME="$1"
MODEL_FOLDER="$DIR_MODELS/$MODEL_NAME"
MODEL_BUILD_FOLDER="$MODEL_FOLDER/build"
MODEL_BUILD_TMP_FOLDER="$MODEL_BUILD_FOLDER/tmp"
INPUT_FILE="$2"
OUTPUT_FILE="$3"
OUTPUT_FILE_ALIGNMENT="$OUTPUT_FILE.alignment"
# The dictionary is optional; default to an empty value when it is omitted.
DICTIONARY_FILE="${4:-}"
MOSES_INI_FILE="$MODEL_BUILD_FOLDER/model/model/moses-bin.ini"
# The model config provides LANGUAGE_SOURCE, LANGUAGE_TARGET and XML_INPUT.
# shellcheck source=/dev/null
source "$MODEL_FOLDER/config"
# -----------------------------------------------------------------------------
# PREPARE INPUT
# -----------------------------------------------------------------------------
# Variables & folders
mkdir -p "$MODEL_BUILD_TMP_FOLDER"
# Write the input string into a temp file
cp "$INPUT_FILE" "$MODEL_BUILD_TMP_FOLDER/in.raw"
# Tokenize string
tokenizer.perl -l "$LANGUAGE_SOURCE" < "$MODEL_BUILD_TMP_FOLDER/in.raw" \
  > "$MODEL_BUILD_TMP_FOLDER/in.tok"
# Lowercase string
lowercase.perl < "$MODEL_BUILD_TMP_FOLDER/in.tok" \
  > "$MODEL_BUILD_TMP_FOLDER/in"
# Use dictionary on this step (if present), and replace required strings.
# Each dictionary line has the form: <original> --- <translation> --- <prob>
if [ -n "$DICTIONARY_FILE" ] && [ -f "$DICTIONARY_FILE" ]; then
  # Pad every line with one space on both sides, so that terms at the very
  # beginning or end of a sentence are matched by the " term " pattern too.
  sed -i 's/^/ /; s/$/ /' "$MODEL_BUILD_TMP_FOLDER/in"
  # For each dictionary term, try to replace it in the corpus.
  # Reading into an array collapses runs of whitespace without glob expansion;
  # the "|| [ ... ]" part keeps a last line that lacks a trailing newline.
  while read -r -a words || [ "${#words[@]}" -gt 0 ]; do
    line="${words[*]}"
    original=$(awk -F' --- ' '{print $1}' <<< "$line")
    translated=$(awk -F' --- ' '{print $2}' <<< "$line")
    prob=$(awk -F' --- ' '{print $3}' <<< "$line")
    # Skip blank lines: an empty term would match every double space.
    if [ -z "$original" ]; then
      continue
    fi
    # Dictionary content is treated as literal text, never as sed syntax.
    original_pattern=$(escape_sed_pattern "$original")
    original_text=$(escape_sed_replacement "$original")
    translated_text=$(escape_sed_replacement "$translated")
    prob_text=$(escape_sed_replacement "$prob")
    sed -i "s/ $original_pattern / <n translation=\"$translated_text\" prob=\"$prob_text\">$original_text<\/n> /g" \
      "$MODEL_BUILD_TMP_FOLDER/in"
  done < "$DICTIONARY_FILE"
  # Remove the padding added above.
  sed -i 's/^ //; s/ $//' "$MODEL_BUILD_TMP_FOLDER/in"
fi
# -----------------------------------------------------------------------------
# TRANSLATE CONTENT
# -----------------------------------------------------------------------------
# Only pass -xml-input when the model config defines a value for it; otherwise
# the flag is omitted instead of being passed without an argument.
xml_input_args=()
if [ -n "$XML_INPUT" ]; then
  xml_input_args=(-xml-input "$XML_INPUT")
fi
# Run decoder
moses -f "$MOSES_INI_FILE" -alignment-output-file "$OUTPUT_FILE_ALIGNMENT" \
  "${xml_input_args[@]}" \
  < "$MODEL_BUILD_TMP_FOLDER/in" \
  > "$MODEL_BUILD_TMP_FOLDER/out.raw"
# -----------------------------------------------------------------------------
# PREPARE OUTPUT
# -----------------------------------------------------------------------------
# Recase the output
if ! recase.perl --in "$MODEL_BUILD_TMP_FOLDER/out.raw" \
  --lang "$LANGUAGE_TARGET" \
  --model "$MODEL_BUILD_FOLDER/recaser-model/moses.ini" \
  > "$MODEL_BUILD_TMP_FOLDER/out.cased"; then
  # Use default language 'en' if the previous command fails
  recase.perl --in "$MODEL_BUILD_TMP_FOLDER/out.raw" --lang "en" \
    --model "$MODEL_BUILD_FOLDER/recaser-model/moses.ini" \
    > "$MODEL_BUILD_TMP_FOLDER/out.cased"
fi
# Detokenize the output
detokenizer.perl -l "$LANGUAGE_TARGET" < "$MODEL_BUILD_TMP_FOLDER/out.cased" \
  > "$MODEL_BUILD_TMP_FOLDER/out"
# Copy the output to the output file
cp "$MODEL_BUILD_TMP_FOLDER/out" "$OUTPUT_FILE"
# Clear data
#rm -Rf $MODEL_BUILD_TMP_FOLDER
# -----------------------------------------------------------------------------
# NOTIFY SUCCESS
# -----------------------------------------------------------------------------
echo "Translated results were placed on $OUTPUT_FILE"
#!/bin/bash
# -----------------------------------------------------------------------------
# MOSES INITIALIZER UTILITY
# -----------------------------------------------------------------------------
# Author: Daniel Herzog, GearTranslations S.L.
# Date: 6th February, 2017
#
# This script should be used to create the skeleton of a translation model.
# The generated tree should look like this:
#
# model_dir # Root folder
# |- build # Build folder
# | |- corpus # Processed corpus content
# | |- lm # Language model content
# | |- model # Translation model content
# | |- recaser-model # Recasing model content
# |- config # Config file, used as a variables file
# |- corpus # Raw corpus folder
# |- raw.<source language> # Source language corpus (e.g. raw.en)
# |- raw.<target language> # Target language corpus (e.g. raw.es)
#
# Models will be generated under the folder $MOSES_DIR_MODELS. Since the
# models are subfolders, prefixes may be added to their names to generate
# hierarchical subdirectories. Check moses-train.sh for more information
# about training models.
#
# Usage: Just run this script with a model name, a source language and a
# target language. After that, the model template will be created in
# $MOSES_DIR_MODELS/<model name>. You can specify subfolders as a model
# name:
#
# moses-init.sh category1/model_name en es
#
# Disclaimer: This is a simple utility that I (Daniel) wrote to manage and
# allow quick, basic and replicable installations of the Moses toolchain.
# This is not a perfect script, and errors may occur during execution.
# -----------------------------------------------------------------------------
set -e
USAGE="moses-init.sh <model name> <source language> <target language>"
# -----------------------------------------------------------------------------
# IMPORT MOSES VARS AND ARGUMENTS CHECK
# -----------------------------------------------------------------------------
# Fail with a clear message when the Moses environment variables are missing.
: "${MOSES_VARS_FILE:?MOSES_VARS_FILE is not set}"
# shellcheck source=/dev/null
source "$MOSES_VARS_FILE"
if [ $# -lt 3 ]; then
  echo "$USAGE"
  exit 1
fi
# Without a models folder the skeleton would be created under "/".
: "${DIR_MODELS:?DIR_MODELS is empty; check MOSES_DIR_MODELS}"
# -----------------------------------------------------------------------------
# PROCESS VARIABLES
# -----------------------------------------------------------------------------
MODEL_NAME="$1"
MODEL_FOLDER="$DIR_MODELS/$MODEL_NAME"
MODEL_BUILD_FOLDER="$MODEL_FOLDER/build"
MODEL_CORPUS_FOLDER="$MODEL_FOLDER/corpus"
LANGUAGE_SOURCE="$2"
LANGUAGE_TARGET="$3"
# -----------------------------------------------------------------------------
# CREATE AND CHECK FOLDERS
# -----------------------------------------------------------------------------
# Never overwrite an existing model.
if [ -d "$MODEL_FOLDER" ]; then
  echo "'$MODEL_NAME' model already exists."
  exit 1
fi
# Create the corpus folder and the build tree (corpus, lm, model and
# recaser-model directly under build/).
mkdir -p "$MODEL_CORPUS_FOLDER" \
  "$MODEL_BUILD_FOLDER/corpus" "$MODEL_BUILD_FOLDER/lm" \
  "$MODEL_BUILD_FOLDER/model" "$MODEL_BUILD_FOLDER/recaser-model"
# -----------------------------------------------------------------------------
# CREATE CORPUS FILES
# -----------------------------------------------------------------------------
# Tiny placeholder corpus; replace it with real sentences before training.
SOURCE_STRINGS="A first string.
The second string.
One third phrase.
Finally, the fourth text."
TARGET_STRINGS="Una primera cadena.
La segunda cadena.
Una tercera frase.
Finalmente, el cuarto texto."
printf '%s\n' "$SOURCE_STRINGS" > "$MODEL_CORPUS_FOLDER/raw.$LANGUAGE_SOURCE"
printf '%s\n' "$TARGET_STRINGS" > "$MODEL_CORPUS_FOLDER/raw.$LANGUAGE_TARGET"
# -----------------------------------------------------------------------------
# SET CONFIGURATION FILES
# -----------------------------------------------------------------------------
# The config file is a shell fragment that the other scripts `source`.
CONFIG="
MODEL_NAME='$MODEL_NAME'
LANGUAGE_SOURCE='$LANGUAGE_SOURCE'
LANGUAGE_TARGET='$LANGUAGE_TARGET'
LM_ORDER=3
REORDERING='wbe-msd-bidirectional-fe'
ALIGNMENT='grow-diag-final-and'
XML_INPUT='exclusive'
"
printf '%s\n' "$CONFIG" > "$MODEL_FOLDER/config"
# -----------------------------------------------------------------------------
# NOTIFY SUCCESS
# -----------------------------------------------------------------------------
echo "Model '$MODEL_NAME' was created on $MODEL_FOLDER"
#!/bin/bash
# -----------------------------------------------------------------------------
# MOSES CLEAN UTILITY
# -----------------------------------------------------------------------------
# Author: Daniel Herzog, GearTranslations S.L.
# Date: 6th February, 2017
#
# This script will facilitate the task of cleaning the content of a
# translation model folder, given its name. It will just clear the `build/`
# folder located in the model's subdirectory. Don't use wildcards in the
# model name.
#
# Usage: Just run the command `moses-clean.sh` with the model's unique name.
# Make sure to create some backups before cleaning.
#
# moses-clean.sh <your model name>
#
# Disclaimer: This is a simple utility that I (Daniel) wrote to manage and
# allow quick, basic and replicable installations of the Moses toolchain.
# This is not a perfect script, and errors may occur during execution.
# -----------------------------------------------------------------------------
set -e
USAGE="moses-clean.sh <your model name>"
# -----------------------------------------------------------------------------
# IMPORT MOSES VARS AND ARGUMENTS CHECK
# -----------------------------------------------------------------------------
# Fail with a clear message when the Moses environment variables are missing.
: "${MOSES_VARS_FILE:?MOSES_VARS_FILE is not set}"
# shellcheck source=/dev/null
source "$MOSES_VARS_FILE"
if [ $# -lt 1 ]; then
  echo "$USAGE"
  exit 1
fi
# Without a models folder the path below would resolve under "/".
: "${DIR_MODELS:?DIR_MODELS is empty; check MOSES_DIR_MODELS}"
# -----------------------------------------------------------------------------
# PROCESS VARIABLES
# -----------------------------------------------------------------------------
MODEL_NAME="$1"
MODEL_FOLDER="$DIR_MODELS/$MODEL_NAME"
MODEL_BUILD_FOLDER="$MODEL_FOLDER/build"
# -----------------------------------------------------------------------------
# CLEAN BUILD FOLDER
# -----------------------------------------------------------------------------
if ! [ -d "$MODEL_BUILD_FOLDER" ]; then
  echo "'$MODEL_NAME' model was not found on '$MODEL_FOLDER'"
  exit 1
fi
# Quoted and guarded: whitespace or glob characters in the model name must
# never make `rm` touch anything other than this model's build folder.
rm -Rf -- "${MODEL_BUILD_FOLDER:?}"
# Recreate the empty build tree.
mkdir -p "$MODEL_BUILD_FOLDER/corpus" "$MODEL_BUILD_FOLDER/lm" \
  "$MODEL_BUILD_FOLDER/model" "$MODEL_BUILD_FOLDER/recaser-model"
# -----------------------------------------------------------------------------
# NOTIFY SUCCESS
# -----------------------------------------------------------------------------
echo "Contents of '$MODEL_NAME' were cleaned"
#!/bin/bash
# -----------------------------------------------------------------------------
# MOSES TRAINER UTILITY
# -----------------------------------------------------------------------------
# Author: Daniel Herzog, GearTranslations S.L.
# Date: 6th February, 2017
#
# Note: This script may be EXTREMELY slow due to moses training system; for
# a corpus of 1 million sentences, it may take up to 8 hours to train.
#
# This script will facilitate the task of training a translation model (and
# its language model). It's probably the most complex script in this Moses
# bundle. After an empty model has been created, and the corpus files have
# been filled, the model can be trained with this script.
#
#
# Usage: Make sure your model's corpus, located in
# $MOSES_DIR_MODELS/<your model>/corpus/raw.* is filled with your sentences.
# Then, run the command `moses-train.sh` with the model's unique name. Make
# sure to create some backups before re-training.
#
# moses-train.sh <your model name>
#
# Disclaimer: This is a simple utility that I (Daniel) wrote to manage and
# allow quick, basic and replicable installations of the Moses toolchain.
# This is not a perfect script, and errors may occur during execution.
# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
# IMPORT MOSES VARS AND ARGUMENTS CHECK
# -----------------------------------------------------------------------------
set -e
USAGE="moses-train.sh <your model name>"
# Fail with a clear message when the Moses environment variables are missing.
: "${MOSES_VARS_FILE:?MOSES_VARS_FILE is not set}"
# shellcheck source=/dev/null
source "$MOSES_VARS_FILE"
if [ $# -le 0 ]; then
  echo "$USAGE"
  exit 1
fi
# Without a models folder the paths below would resolve under "/".
: "${DIR_MODELS:?DIR_MODELS is empty; check MOSES_DIR_MODELS}"
# -----------------------------------------------------------------------------
# PROCESS VARIABLES
# -----------------------------------------------------------------------------
MODEL_NAME="$1"
MODEL_FOLDER="$DIR_MODELS/$MODEL_NAME"
# -----------------------------------------------------------------------------
# CLEAN FOLDER & CHANGE ENV
# -----------------------------------------------------------------------------
if ! [ -d "$MODEL_FOLDER" ]; then
  echo "'$MODEL_NAME' model was not found on '$MODEL_FOLDER'"
  exit 1
fi
# Start from an empty build tree.
moses-clean.sh "$MODEL_NAME"
cd "$MODEL_FOLDER"
# "./" is required: a bare `source config` searches PATH before the current
# directory and could load an unrelated file named "config".
# shellcheck source=/dev/null
source ./config
# -----------------------------------------------------------------------------
# PREPARE CORPUS DATA
# -----------------------------------------------------------------------------
# Tokenisation
tokenizer.perl -threads 2 -l "$LANGUAGE_SOURCE" \
  < "corpus/raw.$LANGUAGE_SOURCE" \
  > "build/corpus/tok.$LANGUAGE_SOURCE"
tokenizer.perl -threads 2 -l "$LANGUAGE_TARGET" \
  < "corpus/raw.$LANGUAGE_TARGET" \
  > "build/corpus/tok.$LANGUAGE_TARGET"
# Recaser model (trained on the tokenized, still-cased target corpus)
train-recaser.perl --dir "$MODEL_FOLDER/build/recaser-model" \
  --corpus "build/corpus/tok.$LANGUAGE_TARGET" \
  -train-script "$MOSES_SCRIPTS/training/train-model.perl" \
  -scripts-root-dir "$MOSES_SCRIPTS"
# Lowercase corpus (the output files keep the "cased" prefix)
lowercase.perl < "build/corpus/tok.$LANGUAGE_SOURCE" \
  > "build/corpus/cased.$LANGUAGE_SOURCE"
lowercase.perl < "build/corpus/tok.$LANGUAGE_TARGET" \
  > "build/corpus/cased.$LANGUAGE_TARGET"
# Clean corpus (the trailing "1 50" are presumably the minimum and maximum
# sentence length kept -- confirm against the Moses documentation)
clean-corpus-n.perl build/corpus/cased "$LANGUAGE_SOURCE" "$LANGUAGE_TARGET" \
  build/corpus/clean 1 50
# -----------------------------------------------------------------------------
# TRAIN LANGUAGE MODEL
# -----------------------------------------------------------------------------
lmplz -o "$LM_ORDER" < "build/corpus/clean.$LANGUAGE_TARGET" \
  > "build/lm/language-model.arpa.$LANGUAGE_TARGET"
build_binary "build/lm/language-model.arpa.$LANGUAGE_TARGET" \
  "build/lm/language-model.blm.$LANGUAGE_TARGET"
# -----------------------------------------------------------------------------
# TRAIN TRANSLATION MODEL
# -----------------------------------------------------------------------------
train-model.perl -root-dir "$MODEL_FOLDER/build/model" \
  --corpus "$MODEL_FOLDER/build/corpus/clean" \
  --f "$LANGUAGE_SOURCE" --e "$LANGUAGE_TARGET" \
  --mgiza --external-bin-dir="$MGIZA_PATH" \
  --lm \
  "0:$LM_ORDER:$MODEL_FOLDER/build/lm/language-model.blm.$LANGUAGE_TARGET" \
  -alignment "$ALIGNMENT" -reordering "$REORDERING"
# Binarize phrases table
processPhraseTableMin -nscores 4 -threads 4 \
  -in build/model/model/phrase-table.gz \
  -out build/model/model/phrase-table
# Binarize reordering table
processLexicalTableMin -threads 4 \
  -in "build/model/model/reordering-table.$REORDERING.gz" \
  -out build/model/model/reordering-table
# Update moses file: moses-bin.ini is a copy of moses.ini that points at the
# binarized tables. Dots are escaped so they only match literal dots.
cp build/model/model/moses.ini build/model/model/moses-bin.ini
sed -i 's/phrase-table\.gz/phrase-table.minphr/g' \
  build/model/model/moses-bin.ini
sed -i 's/PhraseDictionaryMemory/PhraseDictionaryCompact/g' \
  build/model/model/moses-bin.ini
sed -i "s/reordering-table\.$REORDERING\.gz/reordering-table/g" \
  build/model/model/moses-bin.ini
# -----------------------------------------------------------------------------
# NOTIFY SUCCESS
# -----------------------------------------------------------------------------
echo "Model '$MODEL_NAME' was successfully trained!"
#!/bin/bash
# -----------------------------------------------------------------------------
# MOSES INSTALLER UTILITY
# -----------------------------------------------------------------------------
# Author: Daniel Herzog, GearTranslations S.L.
# Date: 6th February, 2017
#
# This script will facilitate the task of installing the Moses system on any
# Ubuntu machine. Instead of compiling and installing it manually, this script
# will do the whole process automatically. An Internet connection is required
# to download sources, system packages and binaries. Compilation may take a
# couple of hours to complete. The sibling scripts (moses-*.sh) must be placed
# in the same folder as this script.
#
# The following steps are executed, in sequential order:
#
# 1. Setup user configuration variables (installation directory, sources
# location, etc).
# 2. Setup process variables (installation directory per app, etc).
# 3. Download source files.
# 4. Compile all source files.
# 5. Install Moses path into the environment.
#
# To keep the installation clean, the trained models should be located on
# $DIR_INSTALLATION/models/. Each subfolder will represent a custom and
# independent translation model, which should include the following elements:
#
# - Used corpus (raw and processed).
# - Language model (raw and binarized).
# - Recasing model (raw and binarized).
# - Reordering table (raw and binarized).
# - Translation model (raw and binarized).
#
# After the installation is done, Moses is ready to be used. To
# manage translation models and language models (training), it's recommended
# to use the bash scripts copied during the installation, located in
# $DIR_INSTALLATION/scripts/. Review each script file to understand what they
# do, and how they work.
#
# Usage: Just edit the variables on the section 'CONFIGURATION VARIABLES'
# according to your needs, add execution permissions, and execute the script:
#
# vim install-moses.sh
# chmod +x install-moses.sh
# ./install-moses.sh
#
# Disclaimer: This is a simple utility that I (Daniel) wrote to manage and
# allow quick, basic and replicable installations of the Moses toolchain.
# This is not a perfect script, and errors may occur during execution.
# -----------------------------------------------------------------------------
# Abort on the first failing command and trace every command that is run.
set -e
set -x
# -----------------------------------------------------------------------------
# CONFIGURATION VARIABLES
# -----------------------------------------------------------------------------
# Target directories
DIR_INSTALLATION="$HOME/moses"
DIR_MODELS="$HOME/moses/models"
# Versioned packages
PACKAGE_BOOST="libboost1.55-all-dev"
# Repositories urls (git)
REPO_MOSES="https://github.com/moses-smt/mosesdecoder"
REPO_MGIZA="https://github.com/moses-smt/mgiza"
# Installation extra arguments
ARGS_INSTALL_MOSES=""
# Profile configuration file
BASH_CONFIG_FILE="$HOME/.bashrc"
# -----------------------------------------------------------------------------
# PROCESS VARIABLES
# -----------------------------------------------------------------------------
# Folder containing this script; the sibling *.sh utilities are copied from it.
CURRENT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
PATH_MOSES="$DIR_INSTALLATION/mosesdecoder"
PATH_MGIZA="$DIR_INSTALLATION/mgiza"
PATH_MGIZA_APP="$PATH_MGIZA/mgizapp"
PATH_SCRIPTS="$DIR_INSTALLATION/scripts"
# -----------------------------------------------------------------------------
# INSTALL DEPENDENCIES
# -----------------------------------------------------------------------------
# Install apt-get packages
sudo apt-get install -y build-essential git-core pkg-config automake libtool \
  wget zlib1g-dev python-dev libbz2-dev cmake
# Install Boost library
sudo apt-get install -y "$PACKAGE_BOOST"
# -----------------------------------------------------------------------------
# DOWNLOAD SOURCE
# -----------------------------------------------------------------------------
mkdir -p "$DIR_INSTALLATION"
cd "$DIR_INSTALLATION"
git clone "$REPO_MOSES" "$PATH_MOSES"
git clone "$REPO_MGIZA" "$PATH_MGIZA"
# -----------------------------------------------------------------------------
# COMPILE AND MANAGE SOURCE CODE
# -----------------------------------------------------------------------------
# Compile Moses
cd "$PATH_MOSES"
make -f contrib/Makefiles/install-dependencies.gmake
sudo chmod +x compile.sh
# Intentionally unquoted: the extra arguments must be split into words, and an
# empty value must expand to no argument at all.
# shellcheck disable=SC2086
./compile.sh $ARGS_INSTALL_MOSES
cd "$DIR_INSTALLATION"
# Compile MGIZA
cd "$PATH_MGIZA_APP"
cmake .
make
make install
# Put the MGIZA helper scripts next to its binaries.
cp inst/scripts/* inst/bin/
cd "$DIR_INSTALLATION"
# Manage custom scripts. The scripts folder must exist before copying into it,
# otherwise `cp` fails and the installation aborts after the compilation.
mkdir -p "$PATH_SCRIPTS"
cp "$CURRENT_DIR"/*.sh "$PATH_SCRIPTS/"
cd "$PATH_SCRIPTS"
sudo chmod +x ./*.sh
cd "$DIR_INSTALLATION"
# -----------------------------------------------------------------------------
# SETUP FOLDERS & ENVIRONMENT VARIABLES
# -----------------------------------------------------------------------------
# Create missing folders
mkdir -p "$DIR_MODELS"
# Create some configuration variables. Values are written quoted so that
# paths containing spaces survive when the profile is loaded.
VARIABLES="
# ------------------------------------
# MOSES CONFIGURATION VARIABLES
# ------------------------------------
export MOSES_DIR=\"$DIR_INSTALLATION\"
export MOSES_DIR_MODELS=\"$DIR_MODELS\"
export MOSES_SCRIPTS=\"$PATH_SCRIPTS\"
export MOSES_VARS_FILE=\"$PATH_SCRIPTS/moses-vars.sh\"
export PATH=\"\$PATH:$PATH_SCRIPTS\"
"
# Add config script to bash profile.
echo "$VARIABLES" >> "$BASH_CONFIG_FILE"
# -----------------------------------------------------------------------------
# NOTIFY SUCCESS
# -----------------------------------------------------------------------------
echo "Moses has been installed on $DIR_INSTALLATION. Restart your session."