Add id to chunks and alignments, bump to v0.2.0

Lauler · Lauler · commit 687f7fc5d8fa · 2026-03-02T09:24:50.000+01:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -3,7 +3,7 @@ requires = ["setuptools>=67.0.0"]
 build-backend = "setuptools.build_meta"
 
 [project]
-version = "0.1.1"
+version = "0.2.0"
 name = "easyaligner"
 requires-python = ">= 3.10"
 description = "Forced alignment pipeline designed for efficiency and ease of use."
diff --git a/src/easyaligner/alignment/pytorch.py b/src/easyaligner/alignment/pytorch.py
@@ -152,11 +152,13 @@ def align_chunks(
     """
     tokenizer_case = _get_processor_case(processor)  # determine if processor is cased or uncased
     chunk_mappings = []
-    for speech in metadata.speeches:
+    for speech_idx, speech in enumerate(metadata.speeches):
+        speech_id = speech.speech_id if speech.speech_id is not None else speech_idx
         emissions_filepath = Path(emissions_dir) / speech.probs_path
         emissions = np.load(emissions_filepath)
 
         for i, chunk in enumerate(speech.chunks):
+            chunk.id = f"{speech_id}-{i}"
             normalized_tokens, mapping = text_normalizer_fn(chunk.text)
             emissions_chunk = emissions[i]
             emissions_chunk = emissions_chunk[: chunk.num_logits]
@@ -177,6 +179,8 @@ def align_chunks(
                 alignment_mapping = process_fallback_alignment(
                     mapping, chunk.start, chunk.end, chunk.text, tokenizer, None, ndigits
                 )
+                for j, seg in enumerate(alignment_mapping):
+                    seg.id = f"{speech_id}-{i}-{j}"
                 chunk_mappings.extend(alignment_mapping)
                 speech.alignments.extend(alignment_mapping)
                 continue
@@ -225,6 +229,8 @@ def align_chunks(
             )
 
             alignment_mapping = encode_alignments(mapping, ndigits=ndigits)
+            for j, seg in enumerate(alignment_mapping):
+                seg.id = f"{speech_id}-{i}-{j}"
 
             chunk_mappings.extend(alignment_mapping)
             speech.alignments.extend(alignment_mapping)
@@ -253,7 +259,8 @@ def align_speech(
 ) -> list:
     tokenizer_case = _get_processor_case(processor)
     speech_mappings = []
-    for speech in metadata.speeches:
+    for speech_idx, speech in enumerate(metadata.speeches):
+        speech_id = speech.speech_id if speech.speech_id is not None else speech_idx
         emissions_filepath = Path(emissions_dir) / speech.probs_path
         emissions = np.load(emissions_filepath)
         emissions = np.vstack(emissions)
@@ -296,6 +303,8 @@ def align_speech(
                 speech.text_spans,
                 ndigits,
             )
+            for j, seg in enumerate(alignment_mapping):
+                seg.id = f"{speech_id}-{j}"
             speech.alignments.extend(alignment_mapping)
             speech_mappings.extend(alignment_mapping)
             if delete_emissions:
@@ -346,6 +355,8 @@ def align_speech(
         )
 
         alignment_mapping = encode_alignments(mapping, ndigits=ndigits)
+        for j, seg in enumerate(alignment_mapping):
+            seg.id = f"{speech_id}-{j}"
         speech.alignments.extend(alignment_mapping)
         speech_mappings.extend(alignment_mapping)
 
diff --git a/src/easyaligner/data/datamodel.py b/src/easyaligner/data/datamodel.py
@@ -30,7 +30,7 @@ def to_dict(self):
 
 class AudioChunk(msgspec.Struct):
     """
-    Segment of audio, usually created by VAD.
+    Segment of audio, usually created by Voice Activity Detection (VAD).
 
     Attributes
     ----------
@@ -50,6 +50,8 @@ class AudioChunk(msgspec.Struct):
         Language code for the chunk.
     language_prob : float, optional
         Probability/confidence of the detected language.
+    id : str or int, optional
+        Unique identifier for the chunk; set to "{speech_id}-{chunk_index}" by `align_chunks`.
     """
 
     start: float
@@ -60,6 +62,7 @@ class AudioChunk(msgspec.Struct):
     num_logits: int | None = None
     language: str | None = None
     language_prob: float | None = None
+    id: str | int | None = None
 
     def to_dict(self):
         return {f: getattr(self, f) for f in self.__struct_fields__}
@@ -89,6 +92,8 @@ class AlignmentSegment(msgspec.Struct):
         The aligned text segment.
     words : list of WordSegment
         List of word-level alignment data within this segment.
+    id : str or int, optional
+        Unique identifier for the segment. None until set, e.g. by `align_chunks` or `align_speech`.
     duration : float, optional
         Duration of the aligned segment in seconds.
     score : float, optional
@@ -99,6 +104,7 @@ class AlignmentSegment(msgspec.Struct):
     end: float  # in seconds
     text: str
     words: list[WordSegment] = []
+    id: str | int | None = None
     duration: float | None = None  # in seconds
     score: float | None = None  # Optional confidence score
 
@@ -116,7 +122,7 @@ def __post_init__(self):
 
 class SpeechSegment(msgspec.Struct):
     """
-    A slice of the audio that contains speech of interest to be aligned.
+    A slice of the audio file that contains speech of interest to be aligned.
 
     A `SpeechSegment` may be a speech given by a single speaker, a dialogue between
     multiple speakers, a book chapter, or whatever unit of organisational abstraction
diff --git a/src/easyaligner/text/__init__.py b/src/easyaligner/text/__init__.py
@@ -1,7 +1,6 @@
 from easyaligner.text.normalization import (
     SpanMapNormalizer,
     add_deletions_to_mapping,
-    format_symbols_abbreviations,
     merge_multitoken_expressions,
     text_normalizer,
 )
@@ -10,7 +9,6 @@
 __all__ = [
     "SpanMapNormalizer",
     "add_deletions_to_mapping",
-    "format_symbols_abbreviations",
     "load_tokenizer",
     "merge_multitoken_expressions",
     "text_normalizer",
diff --git a/src/easyaligner/text/normalization.py b/src/easyaligner/text/normalization.py
@@ -45,7 +45,37 @@ def text_normalizer(text: str) -> str:
     normalized_tokens = [item["normalized_token"] for item in mapping]
     return normalized_tokens, mapping
 
+
 class SpanMapNormalizer:
+    r"""
+    Apply regex text transformations while keeping track of the character spans
+    in the original text.
+
+    Parameters
+    ----------
+    text : str
+        The input text to be normalized.
+
+    Examples
+    --------
+    ```python
+    from easyaligner.text.normalization import SpanMapNormalizer
+
+    text = '''Book 1. Chapter 1, The Period. It was the best of times. It was the worst of times.
+    It was the age of wisdom. It was the age of foolishness. It was the epoch of belief.
+    It was the epoch of incredulity. It was the season of light.
+    It was the season of darkness. It was the spring of hope.'''
+
+    normalizer = SpanMapNormalizer(text)
+    normalizer.transform(r"[^\w\s]", "")  # Remove punctuation and special characters
+    normalizer.transform(r"\S+", lambda m: m.group().lower())  # Lowercase
+    normalizer.transform(r"\s+", " ")  # Normalize whitespace to a single space
+    normalizer.transform(r"^\s+|\s+$", "")  # Strip leading and trailing whitespace
+    print(normalizer.current_text)
+    print(normalizer.get_token_map())
+    ```
+    """
+
     def __init__(self, text: str):
         self.original_text = text
         self.current_text = text