Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

removed merging, dummy steps in favor of file prefixes #10

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 16 additions & 38 deletions confs/data.tconf
Original file line number Diff line number Diff line change
@@ -1,43 +1,21 @@
global {

SRC=(TrainDataSource:
iwslt_deen_2014="de"
)
TRG=(TrainDataSource:
iwslt_deen_2014="en"
)
trg_lang=en # FIXME (only used by wrap_xml, under some rare cases)
SRC=de
TRG=en

train_data=(TrainDataSource:
iwslt_deen_2014=(side:
src="/path/to/iwslt/train.tags.nourl.de-en.de"
trg="/path/to/iwslt/train.tags.nourl.de-en.en"
)
)
# IWSLT
train_prefix="/home/hltcoe/mpost/code/tape4nmt/iwslt/train.tags.nourl.de-en"
dev_prefix="/home/hltcoe/mpost/code/tape4nmt/iwslt/IWSLT14.TED.dev2010.de-en"
# Test-set file prefixes, one branch of the TestSet branch point per IWSLT14 test set.
# Each branch must point at its own set: iwslt12 previously repeated the tst2011
# prefix, so the 2012 test set was never actually evaluated.
test_prefix=(TestSet:
iwslt10="/home/hltcoe/mpost/code/tape4nmt/iwslt/IWSLT14.TED.tst2010.de-en"
iwslt11="/home/hltcoe/mpost/code/tape4nmt/iwslt/IWSLT14.TED.tst2011.de-en"
iwslt12="/home/hltcoe/mpost/code/tape4nmt/iwslt/IWSLT14.TED.tst2012.de-en")

dev_data=(DevDataSource:
iwslt_deen_dev2010=(side:
src="/path/to/iwslt/IWSLT14.TED.dev2010.de-en.de.xml"
trg="/path/to/iwslt/IWSLT14.TED.dev2010.de-en.en.xml"
)
iwslt_deen_dev2012=(side:
src="/path/to/iwslt/IWSLT14.TEDX.dev2012.de-en.de.xml"
trg="/path/to/iwslt/IWSLT14.TEDX.dev2012.de-en.en.xml"
)
)

test_data=(TestDataSource:
iwslt_deen_test2010=(side:
src="/path/to/iwslt/IWSLT14.TED.tst2010.de-en.de.xml"
trg="/path/to/iwslt/IWSLT14.TED.tst2010.de-en.en.xml"
)
iwslt_deen_test2011=(side:
src="/path/to/iwslt/IWSLT14.TED.tst2011.de-en.de.xml"
trg="/path/to/iwslt/IWSLT14.TED.tst2011.de-en.en.xml"
)
iwslt_deen_test2012=(side:
src="/path/to/iwslt/IWSLT14.TED.tst2012.de-en.de.xml"
trg="/path/to/iwslt/IWSLT14.TED.tst2012.de-en.en.xml"
)
)
# WMT18
# These are the file prefixes, to which $SRC and $TRG are appended.
# You can list any number of space-separated prefixes; the corresponding files will be concatenated.
# You can also use SacreBLEU to generate data (it will call `--echo src|ref` depending on the side).
# train_prefix="/export/common/data/corpora/bitext/de-en/train/commoncrawl.de-en /export/common/data/corpora/bitext/de-en/train/europarl-v7.de-en /export/common/data/corpora/bitext/de-en/train/news-commentary-v13.de-en /export/common/data/corpora/bitext/raw/wmt17/rapid2016.de-en"
# dev_prefix="/home/hltcoe/mpost/data/bitext/de-en/test/newstest2016.de-en /home/hltcoe/mpost/data/bitext/de-en/test/newstest2017.de-en"
# test_prefix="sacrebleu://wmt18 en-de"
}
73 changes: 0 additions & 73 deletions confs/pipeline.tconf

This file was deleted.

10 changes: 0 additions & 10 deletions fairseq.tape
Original file line number Diff line number Diff line change
Expand Up @@ -46,16 +46,6 @@ import "tapes/bleu.tape"

# ==== pipeline ends here ====

plan test {
reach sacrebleu, multi_bleu via (SgmDev: yes) * (SgmTest: yes) * (MergeTest: yes) *
(UseExistingTruecaser: no) * (TrainSampleSize: DontSample) * (DoTokenize: yes) *
(DoTruecase: yes) * (SubwordMethod: bpe) * (TrainDataSource: iwslt_deen_2014) *
(DevDataSource: iwslt_deen_dev2010 iwslt_deen_dev2012) *
(TestDataSource: iwslt_deen_test2010 iwslt_deen_test2011 iwslt_deen_test2012) *
(Architecture: fconv_iwslt_de_en) * (ClipNorm: 0.1) * (Dropout: 0.1) * (BpeMergeOps: 49500) *
(TestMode: no)
}

# Nuts and bolts:
global {
ducttape_experimental_packages=true
Expand Down
9 changes: 9 additions & 0 deletions fairseq.tconf
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,12 @@ global {

use_cpu=(TestMode: no yes)
}

# Test plan for the fairseq pipeline: reaches the sacrebleu and multi_bleu tasks over
# the cross-product of the branches selected below (BPE with 49500 merge operations,
# tokenization and truecasing enabled, fconv_iwslt_de_en architecture, ClipNorm 0.1,
# Dropout 0.1, and the listed IWSLT de-en train/dev/test data sources).
# NOTE(review): the data-related branch points named here (TrainDataSource,
# DevDataSource, TestDataSource, SgmDev, SgmTest, MergeTest) follow the older
# data-source layout; the prefix-based confs/data.tconf defines train_prefix /
# dev_prefix / test_prefix (TestSet) instead. Confirm these branch points are still
# defined for this toolkit.
# NOTE(review): TestMode is not selected in this plan, so presumably its baseline
# branch applies -- confirm that is intended.
plan test {
reach sacrebleu, multi_bleu via (SgmDev: yes) * (SgmTest: yes) * (MergeTest: yes) *
(UseExistingTruecaser: no) * (TrainSampleSize: DontSample) * (DoTokenize: yes) *
(DoTruecase: yes) * (SubwordMethod: bpe) * (TrainDataSource: iwslt_deen_2014) *
(DevDataSource: iwslt_deen_dev2010 iwslt_deen_dev2012) *
(TestDataSource: iwslt_deen_test2010 iwslt_deen_test2011 iwslt_deen_test2012) *
(Architecture: fconv_iwslt_de_en) * (ClipNorm: 0.1) * (Dropout: 0.1) * (BpeMergeOps: 49500)
}
15 changes: 12 additions & 3 deletions run
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

set -u

# Directory containing this script; toolkit tape files are resolved relative to it.
# "$0" is quoted so a script path containing whitespace is passed to dirname intact.
TAPEDIR="$(dirname "$0")"

TOOLKIT=${1:-}

if [[ -z $TOOLKIT ]]; then
Expand All @@ -12,16 +14,23 @@ if [[ -z $TOOLKIT ]]; then
fi
shift

TCONF=$TOOLKIT.tconf
TAPEFILE=$TAPEDIR/$TOOLKIT.tape
if [[ ! -e $TAPEFILE ]]; then
echo "Fatal: Couldn't find toolkit tape file $TAPEFILE"
exit 1
fi

TCONF=${1:-}
if [[ ! -e $TCONF ]]; then
echo "Fatal: Couldn't find $TOOLKIT.tconf"
echo "Fatal: Couldn't find $TCONF"
exit 1
fi
shift

DUCTTAPE=$(which ducttape)
if [[ $? -ne 0 ]]; then
echo "Can't find ducttape."
exit 1
fi

${DUCTTAPE} ${TOOLKIT}.tape -C ${TCONF} $@
# Run ducttape on the toolkit tape file with the chosen config, forwarding all
# remaining command-line arguments. Everything is quoted so that paths and
# arguments containing whitespace or glob characters are passed through unchanged.
"${DUCTTAPE}" "${TAPEFILE}" -C "${TCONF}" "$@"
15 changes: 0 additions & 15 deletions sockeye.tape
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import "tapes/packages.tape"
import "tapes/submitters.tape"
import "tapes/versioners.tape"
import "tapes/dummy.tape"

# ==== pipeline starts here ====

Expand All @@ -17,11 +16,6 @@ import "tapes/prepare_train.tape"
# - extract dev/test from sgm format, if the wrapping exists
import "tapes/prepare_devtest.tape"

# merge multiple train/dev/test sets
# note that merging of train/dev is mandatory,
# while test is controlled by the branch point `MergeTest`
import "tapes/merge.tape"

# tasks related to tokenize
import "tapes/tokenize.tape"

Expand All @@ -46,15 +40,6 @@ import "tapes/bleu.tape"

# ==== pipeline ends here ====

plan test {
reach sacrebleu via
(SubwordMethod: sentencepiece) * (DoTokenize: no) * (DoTruecase: no) *
(TrainDataSource: iwslt_deen_2014) *
(SgmDev: yes) * (DevDataSource: iwslt_deen_dev2012) *
(SgmTest: yes) * (TestDataSource: iwslt_deen_test2012_small) *
(TestMode: yes)
}

# Nuts and bolts:
global {
ducttape_experimental_packages=true
Expand Down
61 changes: 22 additions & 39 deletions sockeye.tconf
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import "confs/data.tconf"
import "confs/pipeline.tconf"

global {
##################################################################################################
Expand All @@ -9,55 +8,23 @@ global {
# All ducttape files will be written underneath this directory
ducttape_output="out"

num_layers=(TestMode: no="6:6" yes="1:1")
num_layers=(TestMode: no=(NumLayers: 6_6="6:6" 10_2="10:2") yes="1:1")
model_size=512
embed_size="512:512"

# all default is consistent with nematus
train_train_from="" # if there is a previous model to start with
train_train_from_state_dict="" # if there is a previous dict to start with
train_start_epoch="" # if trained for certain amount of epochs previously

train_batch_type=(TestMode: no="word" yes="sentence")
train_batch_size=(TestMode: no="80" yes=8)
train_optim="adam"
train_dropout=(Dropout: 0.1 0.3 0.5)
train_lr="0.001"

# train_lr_min="1e-8"
train_lr_min=""
train_lr_shrink="0.5"

# train_lr_scheduler="inverse_sqrt"
# train_warmup_init_lr="1e-07"
# train_warmup_updates="4000"
# train_criterion="label_smoothed_cross_entropy"
# train_label_smoothing="0.1"
train_lr_scheduler=""
train_warmup_init_lr=""
train_warmup_updates=""
train_criterion=""
train_label_smoothing=""
train_clip_norm=(ClipNorm: 0.0 0.1 0.5 1 5)
train_max_tokens="4000"
train_arch=(Architecture: conv="fconv" transformer="transformer" fconv_iwslt_de_en="fconv_iwslt_de_en" transformer_iwslt_de_en="transformer_iwslt_de_en")
train_share_input_output_embed=""
train_skip_invalid_size_inputs_valid_test="yes"
train_adam_beta1="0.9"
train_adam_beta2="0.999"
train_batch_size=(TestMode: no="4096" yes=8)

# Sockeye
train_checkpoint_freq=(TestMode: no=5000 yes=100)
train_max_checkpoints_not_improved=(TestMode: no=16 yes=0)
train_num_decode_and_eval=(TestMode: no=500 yes=10)

# TEST CONFIGURATIONS
test_model_selection_strategy="acc"
test_max_sent_length="300"
test_beam_size=(TestMode: no="12" yes="1")
test_batch_size=1
test_replace_unk="True"
test_remove_bpe=""
test_max_sent_length=100


##################################################################################################
# Job submission parameters
Expand All @@ -76,7 +43,7 @@ global {
resource_flags_decode="-q gpu.q -l gpu=1,mem_free=4g"

# SGE: flags for notifying about job completion (put in your email address!)
action_flags="-m ae -M YOUR_EMAIL_HERE"
action_flags="-m ae -M [email protected]"
mjpost marked this conversation as resolved.
Show resolved Hide resolved

# The default submitter: shell (run locally) or sge (run on a grid)
submitter=(TestMode: no="sge" yes="shell")
Expand All @@ -99,8 +66,24 @@ global {
bpe_operations=32000

# options for cleaning training data
MaxLen=80
MaxLen=100
Ratio=1

use_cpu=(TestMode: no yes)
}

# Test plan for the sockeye pipeline: reaches the sacrebleu task with TestMode set to
# yes for two families of preprocessing settings:
#   1. sentencepiece subwords, with tokenization and truecasing disabled;
#   2. BPE subwords with tokenization enabled, both with and without truecasing.
plan test {
reach sacrebleu via
(SubwordMethod: sentencepiece) * (DoTokenize: no) * (DoTruecase: no) *
(TestMode: yes)

reach sacrebleu via
(SubwordMethod: bpe) * (DoTokenize: yes) * (DoTruecase: yes no) *
(TestMode: yes)
}

# Transformer plan: reaches the sacrebleu task with sentencepiece subwords (no
# tokenization, no truecasing) for both NumLayers branches, 6_6 and 10_2.
# NOTE(review): TestMode is not selected here, so presumably its baseline branch
# applies; NumLayers is only defined under the TestMode "no" branch of num_layers --
# confirm "no" is the baseline.
plan transformer {
reach sacrebleu via
(SubwordMethod: sentencepiece) * (DoTokenize: no) * (DoTruecase: no) *
(NumLayers: 6_6 10_2)
}
8 changes: 4 additions & 4 deletions tapes/bleu.tape
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
# as that will involve creating a wrap template for merged xml
task nist_bleu : mosesdecoder
< in=$out@wrap_xml
< wrap_template=$out@download_or_link[DevtestDataSection:test,side:src]
< ref=$out@dummy_aggregate_merge[DevtestDataSection:test,side:trg]
< wrap_template=$out@download_or_link[DataSection:test,side:src]
< ref=$tokenized_data[DataSection:test,side:trg]
> bleu
> bleu_c
:: .submitter=$submitter .action_flags=$action_flags .resource_flags=$resource_flags {
Expand All @@ -14,7 +14,7 @@ task nist_bleu : mosesdecoder

task multi_bleu : mosesdecoder
< in=$detokenized_output
< ref=$out@dummy_aggregate_merge[DataSection:devtest,DevtestDataSection:test,side:trg]
< ref=$tokenized_data[DataSection:test,side:trg]
> bleu
> bleu_c
:: .submitter=$submitter .action_flags=$action_flags .resource_flags=$resource_flags {
Expand All @@ -34,7 +34,7 @@ task multi_bleu : mosesdecoder

task sacrebleu : sacrebleu
< in=$detokenized_output
< ref=$out@dummy_aggregate_merge[DataSection:devtest,DevtestDataSection:test,side:trg]
< ref=$raw_data_test_trg
> bleu
> signature
:: .submitter=$submitter .action_flags=$action_flags .resource_flags=$resource_flags
Expand Down
Loading