# sockeye.tconf

##################################################################################################
# Packages used
##################################################################################################

# sockeye: git checkout of the mjpost/sockeye repository, tracking HEAD (not pinned to a revision).
package sockeye
    :: .versioner=git .repo="https://github.com/mjpost/sockeye" .ref=HEAD { }

# sacrebleu: obtained through the pip versioner, pinned to version 1.2.12.
package sacrebleu
    :: .versioner=pip .package="sacrebleu" .tag="1.2.12" { }

# subword_nmt: obtained through the pip versioner (PyPI name "subword-nmt"), pinned to version 0.3.5.
package subword_nmt
    :: .versioner=pip .package="subword-nmt" .tag="0.3.5" { }

# mosesdecoder: git checkout of moses-smt/mosesdecoder, tracking HEAD; no build step is run.
package mosesdecoder
    :: .versioner=git .repo="https://github.com/moses-smt/mosesdecoder" .ref=HEAD { }

# sentencepiece: git checkout pinned to tag v0.1.5 (v0.1.6 throws a segfault, see inline note).
# The build step configures with CMake in ./build and compiles with one make job per
# processor reported by nproc.
package sentencepiece :: .versioner=git .repo="https://github.com/google/sentencepiece" .ref="tags/v0.1.5" {  # v0.1.6 throws segfault
  # -p: do not fail if ./build is already present (e.g. the build step is re-run in the same checkout)
  mkdir -p build
  cd build
  cmake ..
  make -j $(nproc)
}

# tools: git checkout of shuoyangd/tape4nmt-tools, tracking HEAD.
# The build step installs the Python requirements listed in the repository's requirements.txt.
package tools :: .versioner=git .repo="https://github.com/shuoyangd/tape4nmt-tools" .ref=HEAD {
  pip install -r requirements.txt
}

# fairseq: taken from a personal fork (shuoyangd/fairseq, tracking HEAD) for now,
# because upstream fairseq changes quickly.
# The build step builds the package and installs it in development mode.
package fairseq :: .versioner=git .repo="https://github.com/shuoyangd/fairseq" .ref=HEAD {
  python setup.py build develop
}

global {

  ##################################################################################################
  # Data-related stuff
  ##################################################################################################

  # Source- and target-side language codes, keyed by the TrainDataSource branch point.
  # Only one branch is defined here: iwslt_deen_2014 (de -> en).
  SRC=(TrainDataSource:
    iwslt_deen_2014="de"
  )
  TRG=(TrainDataSource:
    iwslt_deen_2014="en"
  )
  # NOTE(review): this is a plain literal, not derived from TRG above; it has to be kept in
  # sync by hand if a TrainDataSource branch with a different target language is added.
  trg_lang=en  # FIXME (only used by wrap_xml, under some rare cases)

  # Training corpus: one file per side (src / trg) for each TrainDataSource branch.
  # The "/path/to/iwslt/..." values look like placeholders; replace them with the real
  # location of the data on your system.
  train_data=(TrainDataSource:
    iwslt_deen_2014=(side:
      src="/path/to/iwslt/train.tags.nourl.de-en.de"
      trg="/path/to/iwslt/train.tags.nourl.de-en.en"
    )
  )

  # Development sets: one DevDataSource branch per set, each with a src and a trg file.
  # Unlike the training files, these file names end in .xml.
  dev_data=(DevDataSource:
    iwslt_deen_dev2010=(side:
      src="/path/to/iwslt/IWSLT14.TED.dev2010.de-en.de.xml"
      trg="/path/to/iwslt/IWSLT14.TED.dev2010.de-en.en.xml"
    )
    iwslt_deen_dev2012=(side:
      src="/path/to/iwslt/IWSLT14.TEDX.dev2012.de-en.de.xml"
      trg="/path/to/iwslt/IWSLT14.TEDX.dev2012.de-en.en.xml"
    )
  )

  # Test sets: one TestDataSource branch per set (tst2010, tst2011, tst2012),
  # each with a src and a trg .xml file.
  test_data=(TestDataSource:
    iwslt_deen_test2010=(side:
      src="/path/to/iwslt/IWSLT14.TED.tst2010.de-en.de.xml"
      trg="/path/to/iwslt/IWSLT14.TED.tst2010.de-en.en.xml"
    )
    iwslt_deen_test2011=(side:
      src="/path/to/iwslt/IWSLT14.TED.tst2011.de-en.de.xml"
      trg="/path/to/iwslt/IWSLT14.TED.tst2011.de-en.en.xml"
    )
    iwslt_deen_test2012=(side:
      src="/path/to/iwslt/IWSLT14.TED.tst2012.de-en.de.xml"
      trg="/path/to/iwslt/IWSLT14.TED.tst2012.de-en.en.xml"
    )
  )

  ##################################################################################################
  # General options you should set for your environment
  ##################################################################################################
  # Several values below are keyed by the TestMode branch point: the "yes" branch selects
  # smaller / cheaper settings than the "no" branch.

  # All ducttape files will be written underneath this directory
  ducttape_output="out"

  # Model dimensions.
  # NOTE(review): the "a:b" values are presumably encoder:decoder pairs — confirm against the
  # task that consumes them.
  num_layers=(TestMode: no="6:6" yes="1:1")
  model_size=512
  embed_size="512:512"

  # All defaults are meant to be consistent with nematus (original author's note).
  # The three options below are left empty unless training resumes from earlier work.
  train_train_from="" # if there is a previous model to start with
  train_train_from_state_dict="" # if there is a previous dict to start with
  train_start_epoch="" # if trained for certain amount of epochs previously

  # Batching and optimizer settings. train_dropout is itself a branch point (Dropout) with
  # three candidate values.
  train_batch_type=(TestMode: no="word" yes="sentence")
  train_batch_size=(TestMode: no="80" yes=8)
  train_optim="adam"
  train_dropout=(Dropout: 0.1 0.3 0.5)
  train_lr="0.001"

  # Minimum learning rate is currently left empty; the commented-out line keeps the
  # alternative value for reference.
  # train_lr_min="1e-8"
  train_lr_min=""
  train_lr_shrink="0.5"

  # The commented-out lines keep an alternative schedule / criterion setup for reference;
  # the active assignments below leave all five options empty.
  # NOTE(review): an empty value presumably means the option is not passed to the trainer —
  # confirm against the training task.
  # train_lr_scheduler="inverse_sqrt"
  # train_warmup_init_lr="1e-07"
  # train_warmup_updates="4000"
  # train_criterion="label_smoothed_cross_entropy"
  # train_label_smoothing="0.1"
  train_lr_scheduler=""
  train_warmup_init_lr=""
  train_warmup_updates=""
  train_criterion=""
  train_label_smoothing=""
  # Gradient clipping threshold: a branch point (ClipNorm) with five candidate values.
  train_clip_norm=(ClipNorm: 0.0 0.1 0.5 1 5)
  train_max_tokens="4000"
  # Model architecture name, keyed by the Architecture branch point (four branches).
  train_arch=(Architecture: conv="fconv" transformer="transformer" fconv_iwslt_de_en="fconv_iwslt_de_en" transformer_iwslt_de_en="transformer_iwslt_de_en")
  train_share_input_output_embed=""
  train_skip_invalid_size_inputs_valid_test="yes"
  train_adam_beta1="0.9"
  train_adam_beta2="0.999"

  # Sockeye
  # NOTE(review): presumably only read by the Sockeye training task — confirm.
  train_checkpoint_freq=(TestMode: no=5000 yes=100)
  train_max_checkpoints_not_improved=(TestMode: no=16 yes=0)
  train_num_decode_and_eval=(TestMode: no=500 yes=10)

  # TEST CONFIGURATIONS
  # Decoding-time settings; beam size drops from 12 to 1 in the TestMode "yes" branch.
  test_model_selection_strategy="acc"
  test_max_sent_length="300"
  test_beam_size=(TestMode: no="12" yes="1")
  test_batch_size=1
  test_replace_unk="True"
  test_remove_bpe=""

  ##################################################################################################
  # Job submission parameters
  ##################################################################################################

  # SGE: generic job flags (2g of free memory)
  resource_flags="-l mem_free=2g"

  # SGE: larger job flags (16g of free memory)
  resource_flags_16g="-l mem_free=16g"

  # SGE: flags for training a model (gpu.q queue, one GPU, 4g of free memory)
  resource_flags_train="-q gpu.q -l gpu=1,mem_free=4g"

  # SGE: flags for decoding (same request as training)
  resource_flags_decode="-q gpu.q -l gpu=1,mem_free=4g"

  # SGE: flags for notifying about job completion.
  # YOUR_EMAIL_HERE is a placeholder: put in your email address!
  action_flags="-m ae -M YOUR_EMAIL_HERE"

  # The default submitter: shell (run locally) or sge (run on a grid).
  # Selected by the TestMode branch point: sge normally, shell when TestMode is "yes".
  submitter=(TestMode: no="sge" yes="shell")

  # Python environment to load before tasks run. Both conda and Python's virtualenv are supported.
  # For conda, use "conda:ENV" as the value, where "ENV" is the name of the conda environment
  # that should be loaded. For virtualenv, supply the path to the script that should be loaded.
  # Here the TestMode "yes" branch switches to the "sockeye-cpu" conda environment.
  pyenv=(TestMode: no="conda:sockeye" yes="conda:sockeye-cpu")

  ##################################################################################################
  # Preprocessing options
  ##################################################################################################

  # sentencepiece options: vocabulary size and model type
  sentencepiece_vocab_size=8000
  sentencepiece_model_type="unigram"

  # number of BPE operations
  bpe_operations=32000

  # options for cleaning training data
  # NOTE(review): presumably a maximum sentence length and a maximum source/target length
  # ratio — confirm against the cleaning task that reads them.
  MaxLen=80
  Ratio=1

  # flags for moses tokenizer
  tokenizer_flags="-no-escape -a -q"

  # Keyed by the TestMode branch point, written in the shorthand form without explicit values.
  # NOTE(review): presumably each branch name ("no" / "yes") doubles as the value — confirm
  # against the ducttape documentation.
  use_cpu=(TestMode: no yes)
}