Skip to content

Commit 687f7fc

Browse files
committed
Add id to chunks and alignments, bump to v0.2.0
1 parent fd94539 commit 687f7fc

5 files changed

Lines changed: 52 additions & 7 deletions

File tree

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@ requires = ["setuptools>=67.0.0"]
33
build-backend = "setuptools.build_meta"
44

55
[project]
6-
version = "0.1.1"
6+
version = "0.2.0"
77
name = "easyaligner"
88
requires-python = ">= 3.10"
99
description = "Forced alignment pipeline designed for efficiency and ease of use."

src/easyaligner/alignment/pytorch.py

Lines changed: 13 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -152,11 +152,13 @@ def align_chunks(
152152
"""
153153
tokenizer_case = _get_processor_case(processor) # determine if processor is cased or uncased
154154
chunk_mappings = []
155-
for speech in metadata.speeches:
155+
for speech_idx, speech in enumerate(metadata.speeches):
156+
speech_id = speech.speech_id if speech.speech_id is not None else speech_idx
156157
emissions_filepath = Path(emissions_dir) / speech.probs_path
157158
emissions = np.load(emissions_filepath)
158159

159160
for i, chunk in enumerate(speech.chunks):
161+
chunk.id = f"{speech_id}-{i}"
160162
normalized_tokens, mapping = text_normalizer_fn(chunk.text)
161163
emissions_chunk = emissions[i]
162164
emissions_chunk = emissions_chunk[: chunk.num_logits]
@@ -177,6 +179,8 @@ def align_chunks(
177179
alignment_mapping = process_fallback_alignment(
178180
mapping, chunk.start, chunk.end, chunk.text, tokenizer, None, ndigits
179181
)
182+
for j, seg in enumerate(alignment_mapping):
183+
seg.id = f"{speech_id}-{i}-{j}"
180184
chunk_mappings.extend(alignment_mapping)
181185
speech.alignments.extend(alignment_mapping)
182186
continue
@@ -225,6 +229,8 @@ def align_chunks(
225229
)
226230

227231
alignment_mapping = encode_alignments(mapping, ndigits=ndigits)
232+
for j, seg in enumerate(alignment_mapping):
233+
seg.id = f"{speech_id}-{i}-{j}"
228234

229235
chunk_mappings.extend(alignment_mapping)
230236
speech.alignments.extend(alignment_mapping)
@@ -253,7 +259,8 @@ def align_speech(
253259
) -> list:
254260
tokenizer_case = _get_processor_case(processor)
255261
speech_mappings = []
256-
for speech in metadata.speeches:
262+
for speech_idx, speech in enumerate(metadata.speeches):
263+
speech_id = speech.speech_id if speech.speech_id is not None else speech_idx
257264
emissions_filepath = Path(emissions_dir) / speech.probs_path
258265
emissions = np.load(emissions_filepath)
259266
emissions = np.vstack(emissions)
@@ -296,6 +303,8 @@ def align_speech(
296303
speech.text_spans,
297304
ndigits,
298305
)
306+
for j, seg in enumerate(alignment_mapping):
307+
seg.id = f"{speech_id}-{j}"
299308
speech.alignments.extend(alignment_mapping)
300309
speech_mappings.extend(alignment_mapping)
301310
if delete_emissions:
@@ -346,6 +355,8 @@ def align_speech(
346355
)
347356

348357
alignment_mapping = encode_alignments(mapping, ndigits=ndigits)
358+
for j, seg in enumerate(alignment_mapping):
359+
seg.id = f"{speech_id}-{j}"
349360
speech.alignments.extend(alignment_mapping)
350361
speech_mappings.extend(alignment_mapping)
351362

src/easyaligner/data/datamodel.py

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,7 @@ def to_dict(self):
3030

3131
class AudioChunk(msgspec.Struct):
3232
"""
33-
Segment of audio, usually created by VAD.
33+
Segment of audio, usually created by Voice Activity Detection (VAD).
3434
3535
Attributes
3636
----------
@@ -50,6 +50,8 @@ class AudioChunk(msgspec.Struct):
5050
Language code for the chunk.
5151
language_prob : float, optional
5252
Probability/confidence of the detected language.
53+
id : str or int, optional
54+
Optional unique identifier for the chunk.
5355
"""
5456

5557
start: float
@@ -60,6 +62,7 @@ class AudioChunk(msgspec.Struct):
6062
num_logits: int | None = None
6163
language: str | None = None
6264
language_prob: float | None = None
65+
id: str | int | None = None
6366

6467
def to_dict(self):
6568
return {f: getattr(self, f) for f in self.__struct_fields__}
@@ -89,6 +92,8 @@ class AlignmentSegment(msgspec.Struct):
8992
The aligned text segment.
9093
words : list of WordSegment
9194
List of word-level alignment data within this segment.
95+
id : str or int, optional
96+
Optional unique identifier for the alignment segment.
9297
duration : float, optional
9398
Duration of the aligned segment in seconds.
9499
score : float, optional
@@ -99,6 +104,7 @@ class AlignmentSegment(msgspec.Struct):
99104
end: float # in seconds
100105
text: str
101106
words: list[WordSegment] = []
107+
id: str | int | None = None
102108
duration: float | None = None # in seconds
103109
score: float | None = None # Optional confidence score
104110

@@ -116,7 +122,7 @@ def __post_init__(self):
116122

117123
class SpeechSegment(msgspec.Struct):
118124
"""
119-
A slice of the audio that contains speech of interest to be aligned.
125+
A slice of the audio file that contains speech of interest to be aligned.
120126
121127
A `SpeechSegment` may be a speech given by a single speaker, a dialogue between
122128
multiple speakers, a book chapter, or whatever unit of organisational abstraction

src/easyaligner/text/__init__.py

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,6 @@
11
from easyaligner.text.normalization import (
22
SpanMapNormalizer,
33
add_deletions_to_mapping,
4-
format_symbols_abbreviations,
54
merge_multitoken_expressions,
65
text_normalizer,
76
)
@@ -10,7 +9,6 @@
109
__all__ = [
1110
"SpanMapNormalizer",
1211
"add_deletions_to_mapping",
13-
"format_symbols_abbreviations",
1412
"load_tokenizer",
1513
"merge_multitoken_expressions",
1614
"text_normalizer",

src/easyaligner/text/normalization.py

Lines changed: 30 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,37 @@ def text_normalizer(text: str) -> str:
4545
normalized_tokens = [item["normalized_token"] for item in mapping]
4646
return normalized_tokens, mapping
4747

48+
4849
class SpanMapNormalizer:
50+
r"""
51+
Apply regex text transformations while keeping track of the character spans
52+
in the original text.
53+
54+
Parameters
55+
----------
56+
text : str
57+
The input text to be normalized.
58+
59+
Example
60+
-------
61+
```python
62+
from easyaligner.text.normalization import SpanMapNormalizer
63+
64+
text = '''Book 1. Chapter 1, The Period. It was the best of times. It was the worst of times.
65+
It was the age of wisdom. It was the age of foolishness. It was the epoch of belief.
66+
It was the epoch of incredulity. It was the season of light.
67+
It was the season of darkness. It was the spring of hope.'''
68+
69+
normalizer = SpanMapNormalizer(text)
70+
normalizer.transform(r"[^\w\s]", "") # Remove punctuation and special characters
71+
normalizer.transform(r"\S+", lambda m: m.group().lower()) # Lowercase
72+
normalizer.transform(r"\s+", " ") # Normalize whitespace to a single space
73+
normalizer.transform(r"^\s+|\s+$", "") # Strip leading and trailing whitespace
74+
print(normalizer.current_text)
75+
print(normalizer.get_token_map())
76+
```
77+
"""
78+
4979
def __init__(self, text: str):
5080
self.original_text = text
5181
self.current_text = text

0 commit comments

Comments
 (0)