Add punctuation modifier (#1163)

evgenyrp · web-flow · commit 8d8bab91cf7a · 2025-07-02T17:40:01.000-07:00
* Update opus trainer

* Use punct modifier

* Add evaluation support

* Update config generation

* Update readme
diff --git a/docs/training/opus-trainer.md b/docs/training/opus-trainer.md
@@ -14,8 +14,9 @@ Data augmentation helps make translation models more robust, which is especially
 OpusTrainer augments data on the fly, meaning it will generate unique data for each epoch of training.
 
 Supported augmentations:
-- **Upper case** - make some sentences from the dataset upper case
-- **Title case** - use title case for some sentences from the dataset
+- **UpperCase** - make some sentences from the dataset upper case
+- **TitleCase** - use title case for some sentences from the dataset
+- **RemoveEndPunct** - remove the terminal punctuation mark from the source and target sentences if the marks match by type (e.g. `.` and `。`)
 - **Typos** - add random typos in some words
 - **Noise** - insert lines with random unicode noise
 - **Tags (inline noise)** - add emojis and other random Unicode symbols in the source and target sentences in the appropriate positions
@@ -80,6 +81,7 @@ finetune:
 modifiers:
 - UpperCase: 0.1 # Apply randomly to 10% of sentences
 - TitleCase: 0.1
+- RemoveEndPunct: 0.2
 - Typos: 0.05
 - Noise: 0.0005
   min_word_length: 2 # Minimum word length for each word in the noisy sentence
@@ -146,6 +148,8 @@ For example:
 
 `aug-upper` -  applies upper case to the whole dataset
 
+`aug-punct` -  removes terminal punctuation from the source and target sentences
+
 `aug-noise` -  generates extra lines with noise (1 line of noise for each line of the dataset, so the dataset becomes twice as long)
 
 `aug-inline-noise` -  inserts the same random noise in the appropriate positions of the source and target sentences based on dynamically generated alignments. 
@@ -168,6 +172,7 @@ so it should only be used on small evaluation datasets.
     - flores_aug-mix_devtest
     - flores_aug-title_devtest
     - flores_aug-upper_devtest
+    - flores_aug-punct_devtest
     - flores_aug-typos_devtest
     - flores_aug-noise_devtest
     - flores_aug-inline-noise_devtest
diff --git a/pipeline/data/parallel_importer.py b/pipeline/data/parallel_importer.py
@@ -20,6 +20,7 @@
 
 from opustrainer.modifiers.noise import NoiseModifier
 from opustrainer.modifiers.placeholders import PlaceholderTagModifier
+from opustrainer.modifiers.punctuation import RemoveEndPunctuationModifier
 from opustrainer.modifiers.surface import TitleCaseModifier, UpperCaseModifier
 from opustrainer.modifiers.typos import TypoModifier
 from opustrainer.types import Modifier
@@ -73,10 +74,12 @@ def get_typos_probs() -> Dict[str, float]:
     "aug-typos": lambda: TypoModifier(PROB_1, **get_typos_probs()),
     "aug-title": lambda: TitleCaseModifier(PROB_1),
     "aug-upper": lambda: UpperCaseModifier(PROB_1),
+    "aug-punct": lambda: RemoveEndPunctuationModifier(PROB_1),
     "aug-noise": lambda: NoiseModifier(PROB_1),
     "aug-inline-noise": lambda: PlaceholderTagModifier(NOISE_PROB, augment=1),
     "aug-mix": lambda: CompositeModifier(
         [
+            RemoveEndPunctuationModifier(MIX_PROB),
             TypoModifier(MIX_PROB, **get_typos_probs()),
             TitleCaseModifier(MIX_PROB),
             UpperCaseModifier(MIX_PROB),
@@ -86,6 +89,7 @@ def get_typos_probs() -> Dict[str, float]:
     ),
     "aug-mix-cjk": lambda: CompositeModifier(
         [
+            RemoveEndPunctuationModifier(MIX_PROB),
             NoiseModifier(MIX_PROB),
             PlaceholderTagModifier(NOISE_MIX_PROB, augment=1),
         ]
diff --git a/pipeline/data/requirements/data.in b/pipeline/data/requirements/data.in
@@ -1,4 +1,4 @@
-opustrainer==0.4
+opustrainer==0.5
 simalign==0.4
 mtdata==0.4.1
 psutil==6.0.0
diff --git a/pipeline/data/requirements/data.txt b/pipeline/data/requirements/data.txt
@@ -489,9 +489,9 @@ opencc==1.1.9 \
     --hash=sha256:c6d5f9756ed08e67de36c53dc4d8f0bdc72889d6f57a8fc4d8b073d99c58d4dc \
     --hash=sha256:f4267b66ed6e656b5d8199f94e9673950ac39d49ebaf0e7927330801f06f038f
     # via -r pipeline/data/requirements/data.in
-opustrainer==0.4 \
-    --hash=sha256:0bdf4adbabd0cdc4e73c99b36d01c0e69178e237adfd28293498b413e26c415c \
-    --hash=sha256:bb973c52c7b4303e68ebc805cb8ad9e55518930131228a62ba112d2b2ab52ec6
+opustrainer==0.5 \
+    --hash=sha256:d8533040747d23c128859d1948e464bbe991d2ae60fd036f416536680c8f08ea \
+    --hash=sha256:e3c61b6ce1c3a7225b1ed927d317bc1d7c0314c0a7bb2278528f341d366ccc70
     # via -r pipeline/data/requirements/data.in
 packaging==24.1 \
     --hash=sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002 \
diff --git a/pipeline/train/configs/opustrainer/student.cjk.yml b/pipeline/train/configs/opustrainer/student.cjk.yml
@@ -11,6 +11,8 @@ train:
 # The default values of the modifiers are taken from the paper https://arxiv.org/pdf/2311.14838.pdf
 # Please refer to docs/opus-trainer.md for further details
 modifiers:
+# Remove terminal punctuation to teach the model to translate text without it
+- RemoveEndPunct: 0.2
# Insert new sentences composed from Unicode noise
 - Noise: 0.0005
   min_word_length: 2 # Minimum word length for each word in the noisy sentence
diff --git a/pipeline/train/configs/opustrainer/student.yml b/pipeline/train/configs/opustrainer/student.yml
@@ -14,6 +14,8 @@ modifiers:
 # boost upper case a little as we see that the models underperform on upper case dataset on evaluation
 - UpperCase: 0.07 # Apply randomly to 7% of sentences
 - TitleCase: 0.05
+# Remove terminal punctuation to teach the model to translate text without it
+- RemoveEndPunct: 0.2
 # Introduce artificial typos in the source text
 - Typos: 0.05
 # Insert new sentences composed from Unicode noise
diff --git a/pipeline/train/configs/opustrainer/teacher.one-stage.cjk.yml b/pipeline/train/configs/opustrainer/teacher.one-stage.cjk.yml
@@ -18,6 +18,8 @@ train:
 # The default values of the modifiers are taken from the paper https://arxiv.org/pdf/2311.14838.pdf
 # Please refer to docs/opus-trainer.md for further details
 modifiers:
+# Remove terminal punctuation to teach the model to translate text without it
+- RemoveEndPunct: 0.2
 ## Insert new sentences composed from Unicode noise
 - Noise: 0.0005
   min_word_length: 2 # Minimum word length for each word in the noisy sentence
diff --git a/pipeline/train/configs/opustrainer/teacher.one-stage.yml b/pipeline/train/configs/opustrainer/teacher.one-stage.yml
@@ -15,6 +15,8 @@ train:
 # The default values of the modifiers are taken from the paper https://arxiv.org/pdf/2311.14838.pdf
 # Please refer to docs/opus-trainer.md for further details
 modifiers:
+# Remove terminal punctuation to teach the model to translate text without it
+- RemoveEndPunct: 0.2
 # boost upper case a little as we see that the models underperform on upper case dataset on evaluation
 - UpperCase: 0.07 # Apply randomly to 7% of sentences
 - TitleCase: 0.05
diff --git a/pipeline/train/configs/opustrainer/teacher.two-stage.cjk.yml b/pipeline/train/configs/opustrainer/teacher.two-stage.cjk.yml
@@ -20,6 +20,8 @@ finetune:
 # The default values of the modifiers are taken from the paper https://arxiv.org/pdf/2311.14838.pdf
 # Please refer to docs/opus-trainer.md for further details
 modifiers:
+# Remove terminal punctuation to teach the model to translate text without it
+- RemoveEndPunct: 0.2
 ## Insert new sentences composed from Unicode noise
 - Noise: 0.0005
   min_word_length: 2 # Minimum word length for each word in the noisy sentence
diff --git a/pipeline/train/configs/opustrainer/teacher.two-stage.yml b/pipeline/train/configs/opustrainer/teacher.two-stage.yml
@@ -20,6 +20,8 @@ finetune:
 # The default values of the modifiers are taken from the paper https://arxiv.org/pdf/2311.14838.pdf
 # Please refer to docs/opus-trainer.md for further details
 modifiers:
+# Remove terminal punctuation to teach the model to translate text without it
+- RemoveEndPunct: 0.2
 # boost upper case a little as we see that the models underperform on upper case dataset on evaluation
 - UpperCase: 0.07 # Apply randomly to 7% of sentences
 - TitleCase: 0.05
diff --git a/pipeline/train/requirements/train.in b/pipeline/train/requirements/train.in
@@ -1,2 +1,2 @@
-opustrainer==0.4
+opustrainer==0.5
 gpustat==1.1.1
diff --git a/pipeline/train/requirements/train.txt b/pipeline/train/requirements/train.txt
@@ -23,9 +23,9 @@ nvidia-ml-py==12.560.30 \
     --hash=sha256:f0254dc7400647680a072ee02509bfd46102b60bdfeca321576d4d4817e7fe97 \
     --hash=sha256:fea371c94d63e38a611c17bbb85fe400e9c8ddb9e8684a9cd0e47786a4bc3c73
     # via gpustat
-opustrainer==0.4 \
-    --hash=sha256:0bdf4adbabd0cdc4e73c99b36d01c0e69178e237adfd28293498b413e26c415c \
-    --hash=sha256:bb973c52c7b4303e68ebc805cb8ad9e55518930131228a62ba112d2b2ab52ec6
+opustrainer==0.5 \
+    --hash=sha256:d8533040747d23c128859d1948e464bbe991d2ae60fd036f416536680c8f08ea \
+    --hash=sha256:e3c61b6ce1c3a7225b1ed927d317bc1d7c0314c0a7bb2278528f341d366ccc70
     # via -r pipeline/train/requirements/train.in
 psutil==6.1.0 \
     --hash=sha256:000d1d1ebd634b4efb383f4034437384e44a6d455260aaee2eca1e9c1b55f047 \
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
diff --git a/taskcluster/configs/config.prod.yml b/taskcluster/configs/config.prod.yml
diff --git a/tests/test_data_importer.py b/tests/test_data_importer.py
diff --git a/utils/config_generator.py b/utils/config_generator.py

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-opustrainer==0.4`
	`1`	`+opustrainer==0.5`
`2`	`2`	`simalign==0.4`
`3`	`3`	`mtdata==0.4.1`
`4`	`4`	`psutil==6.0.0`