-
Notifications
You must be signed in to change notification settings - Fork 152
Expand file tree
/
Copy pathcreate.py
More file actions
74 lines (64 loc) · 2.39 KB
/
create.py
File metadata and controls
74 lines (64 loc) · 2.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import typer
from data_designer.cli.controllers.generation_controller import GenerationController
from data_designer.config.utils.constants import DEFAULT_NUM_RECORDS
def create_command(
    config_source: str = typer.Argument(
        help=(
            "Path or URL to a config file (.yaml/.yml/.json), or a local Python module (.py)"
            " that defines a load_config_builder() function."
        ),
    ),
    num_records: int = typer.Option(
        DEFAULT_NUM_RECORDS, "--num-records", "-n", min=1, help="Number of records to generate."
    ),
    dataset_name: str = typer.Option(
        "dataset", "--dataset-name", "-d", help="Name for the generated dataset folder."
    ),
    artifact_path: str | None = typer.Option(
        None,
        "--artifact-path",
        "-o",
        help="Path where generated artifacts will be stored. Defaults to ./artifacts.",
    ),
    output_format: str | None = typer.Option(
        None,
        "--output-format",
        "-f",
        help=(
            "Export the dataset to a single file after generation. "
            "Supported formats: jsonl, csv, parquet. "
            "The file is written to <artifact-path>/<dataset-name>/dataset.<format>."
        ),
    ),
) -> None:
    """Create a full dataset and save results to disk.

    This runs the complete generation pipeline: building the dataset, profiling
    the data, and storing all artifacts to the specified output path.

    Examples:
        # Create dataset from a YAML config
        data-designer create my_config.yaml

        # Create with custom settings
        data-designer create my_config.yaml --num-records 1000 --dataset-name my_dataset

        # Create from a remote config URL
        data-designer create https://example.com/my_config.json --dataset-name my_dataset

        # Create from a Python module with custom output path
        data-designer create my_config.py --artifact-path /path/to/output
    """
    # NOTE(review): Typer renders a command's docstring as its --help text, so the
    # wording above is presumably user-facing -- confirm before rephrasing it.

    # Collect the parsed CLI values under the keyword names run_create() receives.
    # None is forwarded as-is for artifact_path / output_format when the flags are
    # omitted; this function applies no fallback of its own.
    run_kwargs = {
        "config_source": config_source,
        "num_records": num_records,
        "dataset_name": dataset_name,
        "artifact_path": artifact_path,
        "output_format": output_format,
    }

    # All of the actual work is delegated to a freshly constructed controller.
    GenerationController().run_create(**run_kwargs)