Skip to content

Commit 0755cd9

Browse files
authored
VTT reader: fix handling of multiple timestamp tags
1 parent 3562f7b commit 0755cd9

File tree

2 files changed

+39
-14
lines changed

2 files changed

+39
-14
lines changed

src/main/python/ttconv/vtt/reader.py

Lines changed: 17 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -53,6 +53,7 @@ class _TextCueParser:
5353

5454
def __init__(self, paragraph: model.P, line_number: int) -> None:
5555
self.line_num: int = line_number
56+
self.paragraph: model.P = paragraph
5657
self.parent: model.ContentElement = paragraph
5758

5859
# handle the special case of ruby elements where children cannot be added one by one
@@ -73,23 +74,25 @@ def handle_token(self, token: Token) -> None:
7374

7475
def _handle_ts(self, token: TimestampTagToken):
    """Open a timed span for a top-level timestamp tag.

    The new span is attached directly to the cue paragraph, its begin is set
    to the tag's timestamp relative to the paragraph's begin, and it becomes
    the current parent for subsequent content.

    The tag is ignored (a warning is logged and the parser state is left
    untouched) when:
      * the timestamp cannot be parsed;
      * the current parent has no begin time, which is treated here as the
        tag being nested inside another element;
      * the paragraph has no begin time, or the timestamp is not strictly
        later than the paragraph's begin.
    """

    ts = vtt_timestamp_to_secs(token.timestamp)
    if ts is None:
        LOGGER.warning("Invalid timestamp tag %s at line %s", token.timestamp, self.line_num)
        return

    # we handle only top-level timestamp tags
    # NOTE(review): relies on only the paragraph and timestamp spans carrying
    # a begin time -- confirm against _make_span and the other tag handlers
    if self.parent.get_begin() is None:
        LOGGER.warning("Nested timestamp tag %s at line %s", token.timestamp, self.line_num)
        return

    p_begin = self.paragraph.get_begin()
    if p_begin is None or p_begin >= ts:
        LOGGER.warning("Invalid timestamp tag %s", token.timestamp)
        # bail out: without this, `ts - p_begin` below raises TypeError when
        # p_begin is None, or yields a zero/negative begin offset otherwise
        return

    span = self._make_span(self.paragraph)
    span.set_begin(ts - p_begin)
    self.paragraph.push_child(span)
    self.parent = span
95+
9396
def _handle_starttag(self, token: StartTagToken):
9497

9598
tag = token.tag.lower()

src/test/python/test_vtt_reader.py

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -141,6 +141,28 @@ def test_single_line_with_space(self):
141141
div = list(body[0])
142142
self.assertEqual(len(div), 2)
143143

144+
def test_toplevel_timestamp_tags(self):
    """Top-level timestamp tags yield sibling spans timed relative to the cue start."""
    # from https://github.com/sandflow/ttconv/issues/439
    SAMPLE = """WEBVTT
Kind: captions
Language: en

00:00:00.799 --> 00:00:02.869 align:start position:0%
\x20
hi<00:00:01.040><c> everyone</c><00:00:01.920><c> today</c><00:00:02.240><c> we're</c><00:00:02.399><c> going</c><00:00:02.639><c> to</c><00:00:02.720><c> be</c>

00:00:02.869 --> 00:00:02.879 align:start position:0%
hi everyone today we're going to be
"""

    doc = to_model(io.StringIO(SAMPLE))
    body = list(doc.get_body())
    spans_and_brs = list(body[0][0])

    # content that precedes the first timestamp tag carries no begin time
    self.assertIsNone(spans_and_brs[0].get_begin()) # \x20
    self.assertIsNone(spans_and_brs[2].get_begin()) # hi

    # offsets are relative to the cue start (0.799 s); compare approximately
    # because the expected values are themselves the result of float arithmetic
    self.assertAlmostEqual(spans_and_brs[3].get_begin(), 1.040 - 0.799) # everyone
    self.assertAlmostEqual(spans_and_brs[4].get_begin(), 1.920 - 0.799) # today
165+
144166
def test_italic(self):
145167
f = io.StringIO(r"""WEBVTT
146168

0 commit comments

Comments
 (0)