Skip to content

Commit b54acc8

Browse files
committed
Changes handling of the output directory for the merge direct command
1 parent 4f9744e commit b54acc8

2 files changed

Lines changed: 3 additions & 11 deletions

File tree

README.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,6 @@ folders, or to a parent directory containing multiple dataset subdirectories.
7373
If `shard_*` folders are present directly in `--input-dir`, MMIRAGE merges that
7474
root dataset directly and ignores nested internal folders.
7575

76-
By default, merged output is written to `<dataset.output_dir>/merged` for each configured dataset.
77-
7876
For multiple datasets, you can also choose a shared merge root:
7977

8078
```bash

src/mmirage/merge_shards.py

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -158,13 +158,7 @@ def merge_input_dir(input_dir: str, output_dir: str) -> List[MergeReport]:
158158

159159
reports: List[MergeReport] = []
160160
for dataset_dir in dataset_dirs:
161-
if dataset_dir == input_dir:
162-
ds_output_dir = output_dir
163-
else:
164-
dataset_name = os.path.basename(dataset_dir)
165-
ds_output_dir = os.path.join(output_dir, dataset_name)
166-
167-
reports.append(merge_dataset_dir(dataset_dir, ds_output_dir))
161+
reports.append(merge_dataset_dir(dataset_dir, output_dir))
168162

169163
return reports
170164

@@ -216,8 +210,8 @@ def main():
216210
"""CLI entrypoint for directory-based shard merging.
217211
218212
Scans --input-dir for dataset subdirectories containing shard_* folders.
219-
For each dataset directory, merges shard datasets and writes to --output-dir
220-
while preserving the dataset directory name.
213+
For each dataset directory, merges shard datasets and writes directly to
214+
the provided `--output-dir`.
221215
"""
222216
ap = argparse.ArgumentParser("Merge processed shard datasets into HF datasets.")
223217
ap.add_argument(

0 commit comments

Comments
 (0)