Skip to content

Commit 73e7539

Browse files
committed
fix typing, remove dataset preprocessing until it's migrated
1 parent 6a7d1aa commit 73e7539

File tree

2 files changed

+3
-139
lines changed

2 files changed

+3
-139
lines changed

src/guidellm/__main__.py

Lines changed: 0 additions & 136 deletions
Original file line number | Diff line number | Diff line change
@@ -45,26 +45,12 @@
4545
reimport_benchmarks_report,
4646
)
4747
from guidellm.mock_server import MockServer, MockServerConfig
48-
from guidellm.preprocess.dataset import ShortPromptStrategy, process_dataset
4948
from guidellm.scheduler import StrategyType
5049
from guidellm.schemas import GenerativeRequestType
5150
from guidellm.settings import print_config
5251
from guidellm.utils import Console, DefaultGroupHandler, get_literal_vals
5352
from guidellm.utils import cli as cli_tools
5453

55-
__all__ = [
56-
"STRATEGY_PROFILE_CHOICES",
57-
"benchmark",
58-
"cli",
59-
"config",
60-
"dataset",
61-
"decode_escaped_str",
62-
"from_file",
63-
"mock_server",
64-
"preprocess",
65-
"run",
66-
]
67-
6854
STRATEGY_PROFILE_CHOICES: list[str] = list(get_literal_vals(ProfileType | StrategyType))
6955
"""Available strategy and profile type choices for benchmark execution."""
7056

@@ -469,128 +455,6 @@ def preprocess():
469455
"""Dataset preprocessing utilities."""
470456

471457

472-
@preprocess.command(
473-
"dataset",
474-
help=(
475-
"Process a dataset to have specific prompt and output token sizes. "
476-
"Supports multiple strategies for handling prompts and optional "
477-
"Hugging Face Hub upload.\n\n"
478-
"DATA: Path to the input dataset or dataset ID.\n\n"
479-
"OUTPUT_PATH: Path to save the processed dataset, including file suffix."
480-
),
481-
context_settings={"auto_envvar_prefix": "GUIDELLM"},
482-
)
483-
@click.argument(
484-
"data",
485-
type=str,
486-
required=True,
487-
)
488-
@click.argument(
489-
"output_path",
490-
type=click.Path(file_okay=True, dir_okay=False, writable=True, resolve_path=True),
491-
required=True,
492-
)
493-
@click.option(
494-
"--processor",
495-
type=str,
496-
required=True,
497-
help="Processor or tokenizer name for calculating token counts.",
498-
)
499-
@click.option(
500-
"--processor-args",
501-
default=None,
502-
callback=cli_tools.parse_json,
503-
help="JSON string of arguments to pass to the processor constructor.",
504-
)
505-
@click.option(
506-
"--data-args",
507-
callback=cli_tools.parse_json,
508-
help="JSON string of arguments to pass to dataset creation.",
509-
)
510-
@click.option(
511-
"--short-prompt-strategy",
512-
type=click.Choice([s.value for s in ShortPromptStrategy]),
513-
default=ShortPromptStrategy.IGNORE.value,
514-
show_default=True,
515-
help="Strategy for handling prompts shorter than target length.",
516-
)
517-
@click.option(
518-
"--pad-char",
519-
type=str,
520-
default="",
521-
callback=decode_escaped_str,
522-
help="Character to pad short prompts with when using 'pad' strategy.",
523-
)
524-
@click.option(
525-
"--concat-delimiter",
526-
type=str,
527-
default="",
528-
help=(
529-
"Delimiter for concatenating short prompts (used with 'concatenate' strategy)."
530-
),
531-
)
532-
@click.option(
533-
"--prompt-tokens",
534-
type=str,
535-
default=None,
536-
help="Prompt tokens configuration (JSON, YAML file, or key=value string).",
537-
)
538-
@click.option(
539-
"--output-tokens",
540-
type=str,
541-
default=None,
542-
help="Output tokens configuration (JSON, YAML file, or key=value string).",
543-
)
544-
@click.option(
545-
"--push-to-hub",
546-
is_flag=True,
547-
help="Push the processed dataset to Hugging Face Hub.",
548-
)
549-
@click.option(
550-
"--hub-dataset-id",
551-
type=str,
552-
default=None,
553-
help=("Hugging Face Hub dataset ID for upload (required if --push-to-hub is set)."),
554-
)
555-
@click.option(
556-
"--random-seed",
557-
type=int,
558-
default=42,
559-
show_default=True,
560-
help="Random seed for reproducible token sampling.",
561-
)
562-
def dataset(
563-
data,
564-
output_path,
565-
processor,
566-
processor_args,
567-
data_args,
568-
short_prompt_strategy,
569-
pad_char,
570-
concat_delimiter,
571-
prompt_tokens,
572-
output_tokens,
573-
push_to_hub,
574-
hub_dataset_id,
575-
random_seed,
576-
):
577-
process_dataset(
578-
data=data,
579-
output_path=output_path,
580-
processor=processor,
581-
prompt_tokens=prompt_tokens,
582-
output_tokens=output_tokens,
583-
processor_args=processor_args,
584-
data_args=data_args,
585-
short_prompt_strategy=short_prompt_strategy,
586-
pad_char=pad_char,
587-
concat_delimiter=concat_delimiter,
588-
push_to_hub=push_to_hub,
589-
hub_dataset_id=hub_dataset_id,
590-
random_seed=random_seed,
591-
)
592-
593-
594458
@cli.command(
595459
"mock-server",
596460
help=(

src/guidellm/preprocess/dataset.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -276,8 +276,8 @@ def process_dataset(
276276
processor_args,
277277
"dataset conversion.",
278278
)
279-
prompt_column = column_mappings.get("prompt_column")
280-
output_column = column_mappings.get(
279+
prompt_column = column_mappings.get("prompt_column") # type: ignore[attr-defined]
280+
output_column = column_mappings.get( # type: ignore[attr-defined]
281281
"output_tokens_count_column", "output_tokens_count"
282282
)
283283

@@ -304,7 +304,7 @@ def process_dataset(
304304
)
305305
)
306306

307-
dataset_iterator = iter(dataset)
307+
dataset_iterator = iter(dataset) # type: ignore[call-overload]
308308
processed_prompts = []
309309
prompt_handler = STRATEGY_HANDLERS[short_prompt_strategy]
310310

0 commit comments

Comments
 (0)