Commit 361ac5a

Split vocabs support (#1051)
* Split vocabs 1
* Split vocabs 2
* Fix export
* Fix test
* Add an option for split vocabs
* Remove todo
* Output a joint vocab
* Output a joint vocab
* Update docs
* Handle tied embeddings
* Update OpusTrainer
* Use tuple
* Fix formatting
* Use vocabs content for comparison
* Support single vocab for using pretrained models
* Fix linter issue
* Add logging
* Add extra export and var check
* Use artifacts dir
* Disable split vocabs
* Run linter
* Fix configuration
* Minor fixes
* Enable split vocabs
1 parent 26402fb commit 361ac5a

File tree: 61 files changed, +1300 −1013 lines


Taskfile.yml (1 addition, 1 deletion)

@@ -65,7 +65,7 @@ tasks:
       summary: |
         The models will be saved to: ./data/taskcluster-model
         Example: `task config-generator -- en fi`
-      deps: [poetry-install-utils]
+      deps: [poetry-install-utils, poetry-install-utils-docker]
       cmds:
         - >-
           PYTHONPATH=$(pwd) poetry run python -W ignore utils/config_generator.py {{.CLI_ARGS}}

docs/training/README.md (12 additions, 0 deletions)

@@ -192,6 +192,18 @@ for example [teacher.train.yml](https://github.com/mozilla/translations/tree/mai

 ### Model training

+#### Vocabulary
+
+Use separate SentencePiece vocabularies for the source and target languages if they use different scripts (for example, Latin and Cyrillic):
+```yaml
+spm-vocab-split: true
+```
+
+The default SentencePiece vocabulary size is 32k; increase it to 64k when using a joint vocabulary for CJK languages:
+```yaml
+spm-vocab-size: 64000
+```
+
 #### Teacher ensemble

 Change to 1 not to use an ensemble of two teachers. The ensemble is more expensive to train and run decoding for,
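The two new README options combine naturally. As an illustration, a config for a CJK pair that keeps a joint vocabulary but enlarges it might contain the following (a hedged sketch; the exact nesting of these keys depends on the experiment config schema):

```yaml
# Hypothetical excerpt of a training config for a CJK language pair:
# keep a single joint vocabulary, but enlarge it from the 32k default.
spm-vocab-split: false
spm-vocab-size: 64000
```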

docs/training/opus-trainer.md (4 additions, 3 deletions)

@@ -90,7 +90,8 @@ modifiers:
   custom_detok_trg: "icu:{trg}"
   augment: 1
   tag: 0
-  spm_vocab: {vocab}
+  spm_vocab_src: {vocab_src}
+  spm_vocab_trg: {vocab_trg}
 seed: 1111

 # parallel sentences + token alignments

@@ -101,8 +102,8 @@ num_fields: 3

 The `Tags` modifier requires whitespace-, Moses- or ICU-tokenized alignments as input.
 Marian requires SentencePiece-tokenized alignments and raw text input.
-To make them compatible, the `Tags` modifier can remap the alignments at the end using the passed SentencePiece model `spm_vocab: vocab.spm` (student model use case).
-If the `spm_vocab` argument is missing, the `Tags` modifier will remove alignments and output only the parallel sentences (teacher model use case).
+To make them compatible, the `Tags` modifier can remap the alignments at the end using the passed SentencePiece model `spm_vocab_*: vocab.spm` (student model use case).
+If the `spm_vocab_trg` argument is missing, the `Tags` modifier will remove alignments and output only the parallel sentences (teacher model use case).

 Currently, ICU-tokenized text and its alignments are passed to OpusTrainer (to work around CJK languages where whitespace-based tokenization doesn't make sense).
 Whitespace is represented with the special symbol "▁" to allow lossless text reconstruction on the OpusTrainer side.
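To make the two use cases concrete, a `Tags` modifier entry for the student case would pass both vocab paths so alignments are remapped, while the teacher case simply omits them. A hedged sketch, using only the options shown above (the `0.05` probability and the vocab filenames are hypothetical):

```yaml
modifiers:
- Tags: 0.05                    # hypothetical application probability
  custom_detok_trg: "icu:{trg}"
  augment: 1
  tag: 0
  # Student use case: both models are passed, so alignments are remapped
  # to SentencePiece tokenization before being handed to Marian.
  spm_vocab_src: vocab.src.spm
  spm_vocab_trg: vocab.trg.spm
```

For the teacher use case, drop the `spm_vocab_src`/`spm_vocab_trg` lines and the modifier removes alignments, emitting only the parallel sentences.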

pipeline/alignments/generate-shortlist.sh (7 additions, 6 deletions)

@@ -16,9 +16,10 @@ echo "###### Generating alignments and shortlist"
 [[ -z "${TRG}" ]] && echo "TRG is empty"

 corpus_prefix=$1
-vocab_path=$2
-output_dir=$3
-threads=$4
+vocab_src=$2
+vocab_trg=$3
+output_dir=$4
+threads=$5

 if [ "$threads" = "auto" ]; then
   threads=$(nproc)

@@ -36,11 +37,11 @@ corpus_trg="${corpus_prefix}.${TRG}.zst"

 echo "### Subword segmentation with SentencePiece"
 zstdmt -dc "${corpus_src}" |
-  parallel --no-notice --pipe -k -j "${threads}" --block 50M "${MARIAN}/spm_encode" --model "${vocab_path}" \
+  parallel --no-notice --pipe -k -j "${threads}" --block 50M "${MARIAN}/spm_encode" --model "${vocab_src}" \
   >"${dir}/corpus.spm.${SRC}"

 zstdmt -dc "${corpus_trg}" |
-  parallel --no-notice --pipe -k -j "${threads}" --block 50M "${MARIAN}/spm_encode" --model "${vocab_path}" \
+  parallel --no-notice --pipe -k -j "${threads}" --block 50M "${MARIAN}/spm_encode" --model "${vocab_trg}" \
   >"${dir}/corpus.spm.${TRG}"

 python3 align.py \

@@ -65,7 +66,7 @@ rm "${dir}/corpus.spm.${SRC}"
 rm "${output_dir}/corpus.aln"

 echo "### Shortlist pruning"
-"${MARIAN}/spm_export_vocab" --model="${vocab_path}" --output="${dir}/vocab.txt"
+"${MARIAN}/spm_export_vocab" --model="${vocab_trg}" --output="${dir}/vocab.txt"
 zstdmt -dc "${dir}/lex.s2t.zst" |
   grep -v NULL |
   python3 "prune_shortlist.py" 100 "${dir}/vocab.txt" |

pipeline/cefilter/score.sh (5 additions, 4 deletions)

@@ -15,9 +15,10 @@ test -v TRG
 test -v WORKSPACE

 model=$1
-vocab=$2
-corpus_prefix=$3
-output=$4
+vocab_src=$2
+vocab_trg=$3
+corpus_prefix=$4
+output=$5

 zstdmt --rm -d "${corpus_prefix}.${SRC}.zst"
 zstdmt --rm -d "${corpus_prefix}.${TRG}.zst"

@@ -27,7 +28,7 @@ mkdir -p "${dir}"

 "${MARIAN}/marian-scorer" \
   --model "${model}" \
-  --vocabs "${vocab}" "${vocab}" \
+  --vocabs "${vocab_src}" "${vocab_trg}" \
   --train-sets "${corpus_prefix}.${TRG}" "${corpus_prefix}.${SRC}" \
   --mini-batch 32 \
   --mini-batch-words 1500 \

pipeline/common/command_runner.py (1 addition, 1 deletion)

@@ -37,7 +37,7 @@ def apply_command_args(dict: dict[str, any]):
         if value is None:
             continue

-        if isinstance(value, list):
+        if isinstance(value, (list, tuple)):
             for v in value:
                 yield str(v)
             continue
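The change lets the helper expand tuples as well as lists into repeated CLI values, which is how the `(vocab_src, vocab_trg)` pair reaches Marian. A minimal sketch of the behavior (a simplified stand-in, not the repository's exact function):

```python
def apply_command_args(args: dict):
    """Yield CLI arguments from a dict: None values become bare flags,
    lists and tuples expand into multiple values after the flag."""
    for key, value in args.items():
        yield f"--{key}"
        if value is None:
            continue
        if isinstance(value, (list, tuple)):
            for v in value:
                yield str(v)
            continue
        yield str(value)
```

For example, `{"vocabs": ("src.spm", "trg.spm")}` yields `--vocabs src.spm trg.spm`.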

pipeline/data/requirements/data.in (1 addition, 1 deletion)

@@ -1,4 +1,4 @@
-opustrainer==0.3
+opustrainer==0.4
 simalign==0.4
 mtdata==0.4.1
 psutil==6.0.0

pipeline/data/requirements/data.txt (3 additions, 3 deletions)

@@ -489,9 +489,9 @@ opencc==1.1.9 \
     --hash=sha256:c6d5f9756ed08e67de36c53dc4d8f0bdc72889d6f57a8fc4d8b073d99c58d4dc \
     --hash=sha256:f4267b66ed6e656b5d8199f94e9673950ac39d49ebaf0e7927330801f06f038f
     # via -r pipeline/data/requirements/data.in
-opustrainer==0.3 \
-    --hash=sha256:75d10317ccf92c4ac8618debe23fe35d02b364ed69bd80c7815035c7d10dc5ad \
-    --hash=sha256:acf7050550d08409c12b634e26d1cee279aea8534161214232e6a826715f8a21
+opustrainer==0.4 \
+    --hash=sha256:0bdf4adbabd0cdc4e73c99b36d01c0e69178e237adfd28293498b413e26c415c \
+    --hash=sha256:bb973c52c7b4303e68ebc805cb8ad9e55518930131228a62ba112d2b2ab52ec6
     # via -r pipeline/data/requirements/data.in
 packaging==24.1 \
     --hash=sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002 \

pipeline/eval/eval.py (10 additions, 5 deletions)

@@ -117,10 +117,16 @@ def main(args_list: Optional[list[str]] = None) -> None:
         help="The Marian model (or models if its an ensemble) to use for translations",
     )
     parser.add_argument(
-        "--vocab",
+        "--vocab_src",
         required=False,
         type=str,
-        help="The path to a vocab file (optional)",
+        help="The path to a src vocab file (optional)",
+    )
+    parser.add_argument(
+        "--vocab_trg",
+        required=False,
+        type=str,
+        help="The path to a trg vocab file (optional)",
     )
     parser.add_argument(
         "--shortlist",

@@ -176,9 +182,8 @@ def main(args_list: Optional[list[str]] = None) -> None:
     elif not args.model_variant == "cpu":
         raise Exception(f"Unsupported model variant {args.model_variant}")

-    if args.vocab:
-        # Pass in the vocab twice as it's shared between the source and the target.
-        marian_extra_args = [*marian_extra_args, "--vocabs", args.vocab, args.vocab]
+    if args.vocab_src and args.vocab_trg:
+        marian_extra_args = [*marian_extra_args, "--vocabs", args.vocab_src, args.vocab_trg]

     if args.shortlist:
         # The final "false" argument tells Marian not to verify the correctness of the shortlist.
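The evaluation script now only appends `--vocabs` when both paths are supplied; with a joint vocabulary the same file is simply passed for both positions. That branch as a standalone helper (a hedged sketch with a hypothetical function name, not the script's actual structure):

```python
def build_vocab_args(vocab_src, vocab_trg):
    """Return the Marian --vocabs arguments, or nothing when either path
    is absent. A joint vocab can be passed as the same path twice."""
    if vocab_src and vocab_trg:
        return ["--vocabs", vocab_src, vocab_trg]
    return []
```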

pipeline/quantize/export.sh (17 additions, 7 deletions)

@@ -17,8 +17,9 @@ test -v BMT_MARIAN

 model_dir=$1
 shortlist=$2
-vocab=$3
-output_dir=$4
+vocab_src=$3
+vocab_trg=$4
+output_dir=$5

 mkdir -p "${output_dir}"

@@ -30,13 +31,22 @@ shortlist_bin="${output_dir}/lex.50.50.${SRC}${TRG}.s2t.bin"
 "${BMT_MARIAN}"/marian-conv \
   --shortlist "${shortlist}" 50 50 0 \
   --dump "${shortlist_bin}" \
-  --vocabs "${vocab}" "${vocab}"
+  --vocabs "${vocab_src}" "${vocab_trg}"
 pigz "${shortlist_bin}"

-vocab_out="${output_dir}/vocab.${SRC}${TRG}.spm"
-cp "${vocab}" "${vocab_out}"
-pigz "${vocab_out}"
-
+if cmp --silent "${vocab_src}" "${vocab_trg}"; then
+  echo "Vocab files are identical, output a joint vocab"
+  vocab_out="${output_dir}/vocab.${SRC}${TRG}.spm"
+  cp "${vocab_src}" "${vocab_out}"
+  pigz "${vocab_out}"
+else
+  vocab_src_out="${output_dir}/srcvocab.${SRC}${TRG}.spm"
+  vocab_trg_out="${output_dir}/trgvocab.${SRC}${TRG}.spm"
+  cp "${vocab_src}" "${vocab_src_out}"
+  cp "${vocab_trg}" "${vocab_trg_out}"
+  pigz "${vocab_src_out}"
+  pigz "${vocab_trg_out}"
+fi

 echo "### Export is completed. Results: ${output_dir}"
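The export step decides between a joint and a split vocab by byte-comparing the two files (`cmp --silent`), matching the "Use vocabs content for comparison" commit in the message above. The same decision can be sketched in Python with `filecmp` (a hypothetical helper mirroring the shell logic, not part of the pipeline):

```python
import filecmp
import shutil
from pathlib import Path

def export_vocabs(vocab_src: str, vocab_trg: str, output_dir: str,
                  src: str, trg: str) -> list:
    """Copy vocab files into output_dir: one joint vocab when the files
    are byte-identical, otherwise separate src/trg vocabs.
    Returns the list of output paths."""
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)
    # shallow=False compares file contents, like `cmp --silent`.
    if filecmp.cmp(vocab_src, vocab_trg, shallow=False):
        joint = out / f"vocab.{src}{trg}.spm"
        shutil.copy(vocab_src, joint)
        return [str(joint)]
    src_out = out / f"srcvocab.{src}{trg}.spm"
    trg_out = out / f"trgvocab.{src}{trg}.spm"
    shutil.copy(vocab_src, src_out)
    shutil.copy(vocab_trg, trg_out)
    return [str(src_out), str(trg_out)]
```

Comparing contents rather than paths is what lets a pretrained model with a single shared vocab still produce the joint `vocab.{SRC}{TRG}.spm` artifact that downstream consumers expect.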
