Skip to content

Commit 907e0ff

Browse files
feat: load HF datasets in inference
1 parent a2a8f82 commit 907e0ff

File tree

5 files changed

+132
-18
lines changed

5 files changed

+132
-18
lines changed

README.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,9 +163,14 @@ matrix deploy_applications --applications "[{'model_name': 'meta-llama/Llama-4-M
163163
# download math-500 dataset
164164
python -m matrix.scripts.hf_dataset_to_jsonl HuggingFaceH4/MATH-500 test test.jsonl
165165

166-
# query math-500
166+
# query math-500 from local jsonl
167167
matrix inference --app_name maverick-fp8 --input_jsonls test.jsonl --output_jsonl response.jsonl --batch_size=64 \
168168
--system_prompt "Please reason step by step, and put your final answer within \boxed{}." --max_tokens 30000 --text_key problem --timeout_secs 1800
169+
170+
# or query directly from the Hugging Face dataset
171+
matrix inference --app_name maverick-fp8 --input_hf_dataset HuggingFaceH4/MATH-500 --hf_dataset_split test \
172+
--output_jsonl response.jsonl --batch_size=64 \
173+
--system_prompt "Please reason step by step, and put your final answer within \boxed{}." --max_tokens 30000 --text_key problem --timeout_secs 1800
169174
```
170175

171176
#### Input Format

matrix/app_server/app_api.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -406,7 +406,9 @@ def inference(
406406
self,
407407
app_name: str,
408408
output_jsonl: str,
409-
input_jsonls: str,
409+
input_jsonls: str | None = None,
410+
input_hf_dataset: str | None = None,
411+
hf_dataset_split: str = "train",
410412
load_balance: bool = True,
411413
**kwargs,
412414
):
@@ -448,6 +450,8 @@ async def get_one_endpoint() -> str:
448450
input_jsonls,
449451
model=metadata["model_name"],
450452
app_name=metadata["name"],
453+
input_hf_dataset=input_hf_dataset,
454+
hf_dataset_split=hf_dataset_split,
451455
**kwargs,
452456
)
453457
)

matrix/cli.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,15 @@ def deploy_applications(
196196
yaml_config,
197197
)
198198

199-
def inference(self, app_name: str, output_jsonl: str, input_jsonls: str, **kwargs):
199+
def inference(
200+
self,
201+
app_name: str,
202+
output_jsonl: str,
203+
input_jsonls: str | None = None,
204+
input_hf_dataset: str | None = None,
205+
hf_dataset_split: str = "train",
206+
**kwargs,
207+
):
200208
"""
201209
Run batch inference using a deployed application.
202210
@@ -206,7 +214,9 @@ def inference(self, app_name: str, output_jsonl: str, input_jsonls: str, **kwarg
206214
Args:
207215
app_name (str): The name of the deployed application to use.
208216
output_jsonl (str): Path to save inference results in JSONL format.
209-
input_jsonls (str): Path to input data in JSONL format.
217+
input_jsonls (str | None): Path to input data in JSONL format.
218+
input_hf_dataset (str | None): Hugging Face dataset name to load directly.
219+
hf_dataset_split (str): Dataset split to load when using a Hugging Face dataset.
210220
**kwargs: Additional parameters for inference (e.g., temperature, max_tokens).
211221
212222
Returns:
@@ -216,6 +226,8 @@ def inference(self, app_name: str, output_jsonl: str, input_jsonls: str, **kwarg
216226
app_name,
217227
output_jsonl,
218228
input_jsonls,
229+
input_hf_dataset=input_hf_dataset,
230+
hf_dataset_split=hf_dataset_split,
219231
**kwargs,
220232
)
221233

matrix/client/query_llm.py

Lines changed: 75 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,55 @@ def get_text_length(messages: tp.List[tp.Dict[str, str]]) -> int:
136136
return data
137137

138138

139+
def load_from_hf_dataset(
    dataset_name: str,
    split: str,
    text_key: str,
    messages_key: str,
    system_prompt: str,
) -> tp.List[tp.Dict[str, tp.Any]]:
    """Load a Hugging Face dataset and convert each sample into a chat request.

    Each sample must provide either ``text_key`` (raw prompt text, converted to
    llama-instruct messages) or ``messages_key`` (a pre-built message list).
    Both keys may be dotted paths into nested dicts. A sibling "metadata" field
    (the key path with its last segment replaced by "metadata") is propagated
    when present; otherwise ``{"index": idx}`` is used as a fallback.

    Args:
        dataset_name: Hugging Face dataset name passed to ``load_dataset``.
        split: dataset split to load (e.g. "train", "test").
        text_key: dotted key path to a raw-text prompt in each sample.
        messages_key: dotted key path to a pre-built message list.
        system_prompt: when non-empty, overrides an existing system message or
            is prepended as a new one.

    Returns:
        List of ``{"metadata": ..., "messages": ...}`` dicts.

    Raises:
        ValueError: if a sample has neither ``text_key`` nor ``messages_key``.
    """
    # Imported lazily so `datasets` is only required when this code path runs.
    from datasets import load_dataset

    def get_request(key: str, data: tp.Dict[str, tp.Any]) -> tp.Optional[tp.Any]:
        # Walk a dotted key path ("a.b.c") through nested dicts; None if absent.
        current_data: tp.Any = data
        for k in key.split("."):
            if isinstance(current_data, dict) and k in current_data:
                current_data = current_data[k]
            else:
                return None
        return current_data

    def get_metadata_key(key: str) -> str:
        # Sibling key of `key`: same parent path, last segment -> "metadata".
        # (Renamed from `text_key` to avoid shadowing the outer parameter; it
        # is also called with `messages_key`.)
        parts = key.split(".")
        parts[-1] = "metadata"
        return ".".join(parts)

    dataset = load_dataset(dataset_name, split=split)
    data = []
    for idx, sample in enumerate(dataset):
        text = get_request(text_key, sample)
        if text:
            messages = convert_llama_instruct_text(text)
            metadata = get_request(get_metadata_key(text_key), sample)
        else:
            messages = get_request(messages_key, sample)
            # Explicit raise instead of `assert`: asserts are stripped under -O,
            # which would let malformed samples through silently.
            if not messages:
                raise ValueError(f"either {text_key} or {messages_key} should exist")
            metadata = get_request(get_metadata_key(messages_key), sample)

        if system_prompt:
            # Override an existing system message, otherwise prepend one.
            if messages[0]["role"] == "system":
                messages[0]["content"] = system_prompt
            else:
                messages.insert(0, {"role": "system", "content": system_prompt})

        if metadata is None:
            metadata = {"index": idx}
        data.append({"metadata": metadata, "messages": messages})
    logger.info(f"Loaded {len(data)} samples from {dataset_name} split {split}")
    return data
186+
187+
139188
def _convert_token_log_probs(token_log_probs):
140189
if not token_log_probs.token_map:
141190
return None
@@ -617,9 +666,9 @@ def batch_requests(
617666
async def main(
618667
url: tp.Union[str, tp.Callable[[], tp.Awaitable[str]]],
619668
output_file: str,
620-
input_jsonls: str,
621-
app_name: str,
622-
model: str,
669+
input_jsonls: str | None = None,
670+
app_name: str = "",
671+
model: str = "",
623672
batch_size=32,
624673
seed=42,
625674
temperature=0.7,
@@ -632,6 +681,8 @@ async def main(
632681
system_prompt="",
633682
timeout_secs=600,
634683
batch_mode=False,
684+
input_hf_dataset: str | None = None,
685+
hf_dataset_split: str = "train",
635686
) -> tp.Dict[str, int]:
636687
"""Send jsonl llama3 instruct prompt for inference and save both the request and response as jsonl.
637688
params:
@@ -640,6 +691,8 @@ async def main(
640691
input_jsonls: variable num of input jsonl files, each line is a json with two formats
641692
1. {text_key: prompt} if text_key is found, prompt is raw text
642693
2. {messages_key: Iterable[ChatCompletionMessageParam]} if messages_key is found.
694+
input_hf_dataset: name of a Hugging Face dataset to load directly.
695+
hf_dataset_split: dataset split to use when loading from Hugging Face.
643696
model: the huggingface model name or a directory.
644697
batch_size: max number of concurrent requests.
645698
seed: seed.
@@ -661,17 +714,25 @@ async def main(
661714
os.makedirs(save_dir, exist_ok=True)
662715
if os.path.exists(output_file):
663716
logger.warning(f"Output file '{output_file}' already exists, overwriting...")
664-
input_files = glob.glob(input_jsonls)
665-
if not input_files:
666-
logger.error(f"No input files found matching pattern: {input_jsonls}")
667-
return {}
668-
669-
lines = load_from_jsonl(
670-
tuple(input_files),
671-
text_key,
672-
messages_key,
673-
system_prompt=system_prompt,
674-
)
717+
if input_hf_dataset:
718+
lines = load_from_hf_dataset(
719+
input_hf_dataset,
720+
hf_dataset_split,
721+
text_key,
722+
messages_key,
723+
system_prompt=system_prompt,
724+
)
725+
else:
726+
input_files = glob.glob(input_jsonls or "")
727+
if not input_files:
728+
logger.error(f"No input files found matching pattern: {input_jsonls}")
729+
return {}
730+
lines = load_from_jsonl(
731+
tuple(input_files),
732+
text_key,
733+
messages_key,
734+
system_prompt=system_prompt,
735+
)
675736
stats = {"success": 0, "total": 0, "sum_latency": 0}
676737
if batch_mode:
677738
outputs = await batch_requests_async(
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
from datasets import Dataset
8+
9+
10+
def test_load_from_hf_dataset(monkeypatch):
    """load_from_hf_dataset turns raw-text samples into chat messages,
    injecting the system prompt and an {"index": i} metadata fallback."""
    from matrix.client import query_llm

    fake_dataset = Dataset.from_dict({"problem": ["1+1", "2+2"]})

    # Stub out the network-backed loader; the patched attribute is picked up
    # because load_from_hf_dataset imports it lazily at call time.
    monkeypatch.setattr(
        "datasets.load_dataset", lambda *args, **kwargs: fake_dataset
    )

    lines = query_llm.load_from_hf_dataset(
        "dummy",
        "train",
        text_key="problem",
        messages_key="request.messages",
        system_prompt="sys",
    )

    assert len(lines) == 2
    first = lines[0]
    system_msg = first["messages"][0]
    user_msg = first["messages"][1]
    assert system_msg["role"] == "system"
    assert system_msg["content"] == "sys"
    assert user_msg["content"] == "1+1"
    assert first["metadata"]["index"] == 0

0 commit comments

Comments
 (0)