Skip to content

Commit e11967a

Browse files
committed
Fix loading manifests from stdin (#810)
Piping a manifest into `lhotse split - <out>` (or any other command using `load_manifest_lazy_or_eager`) failed with a JSONDecodeError.

Root cause: `load_manifest_lazy_or_eager` routes "-" to `load_manifest_lazy`, which consumes one line of stdin to detect the manifest class and then builds a LazyManifestIterator that re-opens the input on every iteration. stdin is a one-shot stream that cannot be re-opened, so subsequent reads see either truncated or empty data.

Fix:
- `load_manifest_lazy_or_eager` now eagerly slurps stdin into a list and dispatches to `from_dicts` (mirroring `load_manifest`'s logic), so the manifest is fully materialized in a single pass.
- The `lhotse split` and `lhotse split-lazy` CLI commands derived output filenames from the input path's stem/suffix, which produced bogus names like `out/-.0` (no extension) when reading from stdin. They now fall back to a `manifest.<idx>.jsonl.gz` naming scheme when the input is "-".

Tests:
- test/test_load_manifest_stdin.py exercises the stdin path with monkeypatched `sys.stdin`, covering full-load, `split()`, double-iteration, type detection, empty input, explicit `manifest_cls`, and garbage input.
1 parent d4d3a74 commit e11967a

3 files changed

Lines changed: 143 additions & 4 deletions

File tree

lhotse/bin/modes/manipulation.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -168,14 +168,22 @@ def split(
168168

169169
output_dir = Path(output_dir)
170170
manifest = Path(manifest)
171-
suffix = "".join(manifest.suffixes)
171+
is_stdin = str(manifest) == "-"
172+
if is_stdin:
173+
# The "-" placeholder for stdin has no useful stem or suffix to derive
174+
# output filenames from, so fall back to sensible defaults.
175+
stem = "manifest"
176+
suffix = ".jsonl.gz"
177+
else:
178+
stem = manifest.stem
179+
suffix = "".join(manifest.suffixes)
172180
any_set = load_manifest_lazy_or_eager(manifest)
173181
parts = any_set.split(num_splits=num_splits, shuffle=shuffle)
174182
output_dir.mkdir(parents=True, exist_ok=True)
175183
num_digits = len(str(num_splits))
176184
for idx, part in enumerate(parts, start=start_idx):
177185
idx = f"{idx}".zfill(num_digits) if pad else str(idx)
178-
part.to_file((output_dir / manifest.stem).with_suffix(f".{idx}{suffix}"))
186+
part.to_file((output_dir / stem).with_suffix(f".{idx}{suffix}"))
179187

180188

181189
@cli.command()
@@ -206,11 +214,12 @@ def split_lazy(
206214

207215
output_dir = Path(output_dir)
208216
manifest = Path(manifest)
217+
prefix = "manifest" if str(manifest) == "-" else manifest.stem
209218
any_set = load_manifest_lazy_or_eager(manifest)
210219
any_set.split_lazy(
211220
output_dir=output_dir,
212221
chunk_size=chunk_size,
213-
prefix=manifest.stem,
222+
prefix=prefix,
214223
start_idx=start_idx,
215224
)
216225

lhotse/serialization.py

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -491,13 +491,52 @@ def load_manifest_lazy_or_eager(
491491
"""
492492
Generic utility for reading an arbitrary manifest.
493493
If possible, opens the manifest lazily, otherwise reads everything into memory.
494+
495+
.. note::
496+
When ``path`` is ``"-"`` (stdin), the manifest is always loaded eagerly,
497+
because stdin is a one-shot stream and the lazy reader needs to re-open
498+
the input to iterate over it (see GitHub issue #810).
494499
"""
495-
if extension_contains(".jsonl", path) or str(path) == "-":
500+
if str(path) == "-":
501+
# stdin cannot be re-opened or seeked, so lazy iteration (which
502+
# reopens the input on every pass) does not work for it. Eagerly
503+
# consume stdin into a list and build the manifest from it.
504+
return _load_manifest_from_stdin(manifest_cls=manifest_cls)
505+
if extension_contains(".jsonl", path):
496506
return load_manifest_lazy(path)
497507
else:
498508
return load_manifest(path, manifest_cls=manifest_cls)
499509

500510

511+
def _load_manifest_from_stdin(manifest_cls=None) -> Optional[Manifest]:
    """
    Read a JSONL manifest from standard input and build it eagerly.

    Stdin is a one-shot stream that cannot be re-opened, which is incompatible
    with Lhotse's lazy iterators. We therefore slurp every line once and then
    try each candidate manifest class's ``from_dicts`` on the collected items.

    :param manifest_cls: optional manifest class to use; when ``None``, the
        classes ``RecordingSet``, ``SupervisionSet``, ``FeatureSet`` and
        ``CutSet`` are tried in that order.
    :return: the first non-empty manifest that a candidate class builds, or
        ``None`` when stdin contained no items.
    :raises ValueError: when no candidate class yields a non-empty manifest;
        the message lists why each candidate was rejected.
    """
    from lhotse import CutSet, FeatureSet, RecordingSet, SupervisionSet

    raw_data = list(load_jsonl("-"))
    if not raw_data:
        return None  # empty manifest

    if manifest_cls is not None:
        candidates = [manifest_cls]
    else:
        candidates = [RecordingSet, SupervisionSet, FeatureSet, CutSet]

    # Keep the reason each candidate was rejected, so that the final error is
    # actionable instead of silently discarding every failure.
    rejections = []
    for manifest_type in candidates:
        type_name = getattr(manifest_type, "__name__", repr(manifest_type))
        try:
            data_set = manifest_type.from_dicts(raw_data)
            is_empty = len(data_set) == 0
        except Exception as exc:
            # Deliberately broad: a mismatched class can fail in many ways
            # while parsing foreign dicts, and each failure only means
            # "try the next candidate".
            rejections.append(f"{type_name}: {type(exc).__name__}: {exc}")
            continue
        if is_empty:
            rejections.append(f"{type_name}: produced an empty manifest")
            continue
        return data_set

    raise ValueError(
        "Unknown type of manifest read from stdin. Tried: " + "; ".join(rejections)
    )
538+
539+
501540
def resolve_manifest_set_class(item):
502541
"""Returns the right *Set class for a manifest, e.g. Recording -> RecordingSet."""
503542
from lhotse import (

test/test_load_manifest_stdin.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
"""Tests for loading manifests from stdin (``"-"``).
2+
3+
See https://github.com/lhotse-speech/lhotse/issues/810.
4+
"""
5+
6+
import io
7+
import json
8+
import sys
9+
10+
import pytest
11+
12+
from lhotse import RecordingSet
13+
from lhotse.serialization import load_manifest_lazy_or_eager
14+
from lhotse.testing.dummies import dummy_recording, dummy_supervision
15+
16+
17+
def _redirect_stdin(monkeypatch, payload: str):
    """Point ``sys.stdin`` at an in-memory text stream containing ``payload``.

    The replacement is installed through the ``monkeypatch`` object passed in.
    """
    fake_stdin = io.StringIO(payload)
    monkeypatch.setattr(sys, "stdin", fake_stdin)
19+
20+
21+
def _to_jsonl(items) -> str:
    """Serialize ``items`` to JSONL text.

    Each item's ``to_dict()`` result becomes one JSON object on its own
    newline-terminated line. An empty ``items`` yields ``""`` rather than a
    lone blank line, so the result is always a well-formed JSONL document.
    """
    return "".join(f"{json.dumps(item.to_dict())}\n" for item in items)
23+
24+
25+
def test_load_manifest_from_stdin_returns_full_manifest(monkeypatch):
    """Loading from ``"-"`` returns a RecordingSet with every piped item, in order."""
    expected = [dummy_recording(idx) for idx in range(8)]
    expected_ids = [rec.id for rec in expected]
    _redirect_stdin(monkeypatch, _to_jsonl(expected))

    loaded = load_manifest_lazy_or_eager("-")

    assert isinstance(loaded, RecordingSet)
    assert len(loaded) == 8
    assert [rec.id for rec in loaded] == expected_ids
34+
35+
36+
def test_load_manifest_from_stdin_supports_split(monkeypatch):
    """Regression test for #810: a manifest loaded from stdin can be split.

    ``load + split`` used to fail because the lazy loader tried to re-read
    stdin to materialize the iterator.
    """
    source = [dummy_recording(idx) for idx in range(8)]
    _redirect_stdin(monkeypatch, _to_jsonl(source))

    chunks = load_manifest_lazy_or_eager("-").split(num_splits=4)

    assert len(chunks) == 4
    total_items = sum(len(chunk) for chunk in chunks)
    assert total_items == len(source)
48+
49+
50+
def test_load_manifest_from_stdin_can_be_iterated_twice(monkeypatch):
    """Two consecutive iterations over a stdin-loaded manifest yield the same ids."""
    source = [dummy_recording(idx) for idx in range(3)]
    expected_ids = [rec.id for rec in source]
    _redirect_stdin(monkeypatch, _to_jsonl(source))

    loaded = load_manifest_lazy_or_eager("-")
    # Iterate the same manifest object twice, back to back.
    passes = [[rec.id for rec in loaded] for _ in range(2)]

    assert passes[0] == passes[1] == expected_ids
59+
60+
61+
def test_load_manifest_from_stdin_detects_supervision_set(monkeypatch):
    """Supervision items piped through stdin are detected as a SupervisionSet."""
    from lhotse import SupervisionSet

    source = [dummy_supervision(idx) for idx in range(2)]
    _redirect_stdin(monkeypatch, _to_jsonl(source))

    loaded = load_manifest_lazy_or_eager("-")

    assert isinstance(loaded, SupervisionSet)
    assert len(loaded) == 2
71+
72+
73+
def test_load_manifest_from_stdin_returns_none_for_empty_input(monkeypatch):
    """An empty stdin stream loads as ``None``."""
    _redirect_stdin(monkeypatch, "")

    loaded = load_manifest_lazy_or_eager("-")

    assert loaded is None
76+
77+
78+
def test_load_manifest_from_stdin_with_explicit_manifest_cls(monkeypatch):
    """Passing ``manifest_cls=RecordingSet`` returns a RecordingSet with all items."""
    source = [dummy_recording(idx) for idx in range(2)]
    _redirect_stdin(monkeypatch, _to_jsonl(source))

    loaded = load_manifest_lazy_or_eager("-", manifest_cls=RecordingSet)

    assert isinstance(loaded, RecordingSet)
    assert len(loaded) == 2
86+
87+
88+
def test_load_manifest_from_stdin_garbage_raises(monkeypatch):
    """Input that is not JSON makes the loader raise instead of returning a manifest."""
    garbage = "this is not json\n"
    _redirect_stdin(monkeypatch, garbage)

    with pytest.raises(Exception):
        load_manifest_lazy_or_eager("-")

0 commit comments

Comments
 (0)