
Commit 912dc2a

Add granite documents format (#1566)
* Add granite documents format

Signed-off-by: elronbandel <[email protected]>

* Update process method to use Optional for stream_name parameter

Signed-off-by: elronbandel <[email protected]>

---------

Signed-off-by: elronbandel <[email protected]>
Co-authored-by: OfirArviv <[email protected]>
1 parent 121c268 commit 912dc2a

File tree

4 files changed: +199 -2 lines changed

prepare/formats/models/granite.py

+6

@@ -0,0 +1,6 @@
+from unitxt.catalog import add_to_catalog
+from unitxt.formats import GraniteDocumentsFormat
+
+format = GraniteDocumentsFormat(model="ibm-granite/granite-3.1-8b-instruct")
+
+add_to_catalog(format, "formats.models.granite_3_1_documents", overwrite=True)
src/unitxt/catalog/formats/models/granite_3_1_documents.json

+4

@@ -0,0 +1,4 @@
+{
+    "__type__": "granite_documents_format",
+    "model": "ibm-granite/granite-3.1-8b-instruct"
+}
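Once registered, the format can be referenced by its catalog name. A minimal usage sketch (the card name below is a placeholder, not part of this commit; any card whose task exposes a "question" field plus "context" or "contexts" would work):

from unitxt.api import load_dataset

# "cards.my_rag_card" is hypothetical -- substitute a real catalog card.
dataset = load_dataset(
    card="cards.my_rag_card",
    format="formats.models.granite_3_1_documents",
    split="test",
)
print(dataset[0]["source"])  # the fully rendered Granite prompt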

src/unitxt/formats.py

+50

@@ -13,6 +13,7 @@
 
 from .dataclass import OptionalField
 from .dict_utils import dict_get
+from .error_utils import UnitxtError
 from .image_operators import image_to_data_url
 from .operator import InstanceOperator
 from .settings_utils import get_constants
@@ -25,6 +26,55 @@ class Format(InstanceOperator):
     pass
 
 
+class GraniteDocumentsFormat(Format):
+    model: str = "ibm-granite/granite-3.1-8b-instruct"
+    citations: bool = True
+    length: str = "long"
+
+    _requirements_list = ["transformers"]
+
+    def prepare(self):
+        super().prepare()
+        from transformers import AutoTokenizer
+
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model)
+
+    def process(
+        self, instance: Dict[str, Any], stream_name: Optional[str] = None
+    ) -> Dict[str, Any]:
+        inputs = instance["input_fields"]
+        if "question" not in inputs:
+            raise UnitxtError(
+                "GraniteDocumentsFormat works only for tasks with field: 'question'"
+            )
+        if "context" not in inputs and "contexts" not in inputs:
+            raise UnitxtError(
+                "GraniteDocumentsFormat works only for tasks with field: 'context' or 'contexts'"
+            )
+
+        if "context" in inputs:
+            texts = [inputs["context"]]
+        if "contexts" in inputs:
+            texts = inputs["contexts"]
+
+        documents = []
+        for text in texts:
+            documents.append({"title": "", "text": text})
+
+        question = inputs["question"]
+
+        instance["source"] = self.tokenizer.apply_chat_template(
+            [
+                {"role": "user", "content": question},
+            ],
+            documents=documents,
+            controls={"citations": self.citations, "length": self.length},
+            add_generation_prompt=True,
+            tokenize=False,
+        )
+        return instance
+
+
 def apply_capital_new_line_notation(text: str) -> str:
     r"""Transforms a given string by applying the Capital New Line Notation.
 
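The formatting itself is delegated to the Granite tokenizer's chat template. A standalone sketch of the equivalent apply_chat_template call (assumes transformers is installed and the tokenizer files can be fetched from the Hugging Face Hub):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("ibm-granite/granite-3.1-8b-instruct")

# The documents= and controls= kwargs are consumed by the Granite 3.1 chat
# template, which renders them into the system and documents roles of the prompt.
prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "what is love?"}],
    documents=[{"title": "", "text": "love is love"}],
    controls={"citations": True, "length": "long"},
    add_generation_prompt=True,
    tokenize=False,
)
print(prompt)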
tests/library/test_formats.py

+139 -2

@@ -1,18 +1,29 @@
+from datetime import datetime
+
+from unitxt.api import load_dataset
 from unitxt.card import TaskCard
-from unitxt.formats import ChatAPIFormat, HFSystemFormat, SystemFormat
+from unitxt.collections_operators import Wrap
+from unitxt.formats import (
+    ChatAPIFormat,
+    GraniteDocumentsFormat,
+    HFSystemFormat,
+    SystemFormat,
+)
 from unitxt.loaders import LoadFromDictionary
+from unitxt.operators import Rename, Set
 from unitxt.settings_utils import get_constants
 from unitxt.standard import DatasetRecipe
 from unitxt.system_prompts import TextualSystemPrompt
 from unitxt.task import Task
-from unitxt.templates import InputOutputTemplate
+from unitxt.templates import InputOutputTemplate, MultiReferenceTemplate, TemplatesDict
 from unitxt.test_utils.operators import (
     check_operator,
 )
 
 from tests.library.test_image_operators import create_random_jpeg_image
 from tests.utils import UnitxtTestCase
 
+# Assume
 constants = get_constants()
 
 
@@ -327,6 +338,132 @@ def test_hf_system_format(self):
             tester=self,
         )
 
+    def test_granite_documents_format(self):
+        inputs = [
+            {
+                "input_fields": {
+                    "question": "what is love?",
+                    "contexts": ["love is love"],
+                },
+            },
+            {
+                "input_fields": {
+                    "question": "what is love?",
+                    "context": "love is love",
+                },
+            },
+        ]
+
+        system_format = GraniteDocumentsFormat()
+
+        today = datetime.today().strftime("%B %d, %Y")
+        targets = [
+            {
+                "input_fields": {
+                    "question": "what is love?",
+                    "contexts": ["love is love"],
+                },
+                "source": "<|start_of_role|>system<|end_of_role|>Knowledge Cutoff Date: April 2024.\nToday's Date: "
+                + today
+                + '.\nYou are Granite, developed by IBM. Write the response to the user\'s input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data.\n\nIn your response, use the symbols <co> and </co> to indicate when a fact comes from a document in the search result, e.g <co>0</co> for a fact from document 0. Afterwards, list all the citations with their corresponding documents in an ordered list.<|end_of_text|>\n<|start_of_role|>documents<|end_of_role|>Document 0\nlove is love<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>what is love?<|end_of_text|>\n<|start_of_role|>assistant {"citations": true, "length": "long"}<|end_of_role|>',
+            },
+            {
+                "input_fields": {
+                    "question": "what is love?",
+                    "context": "love is love",
+                },
+                "source": "<|start_of_role|>system<|end_of_role|>Knowledge Cutoff Date: April 2024.\nToday's Date: "
+                + today
+                + '.\nYou are Granite, developed by IBM. Write the response to the user\'s input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data.\n\nIn your response, use the symbols <co> and </co> to indicate when a fact comes from a document in the search result, e.g <co>0</co> for a fact from document 0. Afterwards, list all the citations with their corresponding documents in an ordered list.<|end_of_text|>\n<|start_of_role|>documents<|end_of_role|>Document 0\nlove is love<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>what is love?<|end_of_text|>\n<|start_of_role|>assistant {"citations": true, "length": "long"}<|end_of_role|>',
+            },
+        ]
+
+        check_operator(
+            operator=system_format,
+            inputs=inputs,
+            targets=targets,
+            tester=self,
+        )
+
+        data = {
+            "test": [
+                {
+                    "query": "What city is the largest in Texas?",
+                    "extracted_chunks": "Austin is the capital of Texas.\nHouston is the largest city in Texas but not the capital of it. ",
+                    "expected_answer": "Houston",
+                },
+                {
+                    "query": "What city is the capital of Texas?",
+                    "extracted_chunks": "Houston is the largest city in Texas but not the capital of it. ",
+                    "expected_answer": "Austin",
+                },
+            ]
+        }
+
+        card = TaskCard(
+            # Assumes the data contains 3 fields:
+            # query (string), extracted_chunks (string), expected_answer (string)
+            loader=LoadFromDictionary(data=data),
+            # Map these fields to the fields of the task.rag.response_generation task.
+            # See https://www.unitxt.ai/en/latest/catalog/catalog.tasks.rag.response_generation.html
+            preprocess_steps=[
+                Rename(field_to_field={"query": "question"}),
+                Wrap(field="extracted_chunks", inside="list", to_field="contexts"),
+                Wrap(
+                    field="expected_answer", inside="list", to_field="reference_answers"
+                ),
+                Set(
+                    fields={
+                        "contexts_ids": [],
+                    }
+                ),
+            ],
+            # Specify the task and the desired metrics (note that these are part of the default
+            # metrics for the task, so the metrics selection can be omitted).
+            task="tasks.rag.response_generation",
+            # Specify a default template
+            templates=TemplatesDict(
+                {
+                    "simple": MultiReferenceTemplate(
+                        instruction="Answer the question based on the information provided in the document given below.\n\n",
+                        input_format="Document: {contexts}\nQuestion: {question}",
+                        references_field="reference_answers",
+                    ),
+                }
+            ),
+        )
+
+        # Select recommended metrics according to your available resources.
+        metrics = [
+            "metrics.rag.response_generation.recommended.cpu_only.all",
+            # "metrics.rag.response_generation.recommended.small_llm.all",
+            # "metrics.rag.response_generation.recommended.llmaj_watsonx.all",
+            # "metrics.rag.response_generation.recommended.llmaj_rits.all",
+            # "metrics.rag.response_generation.recommended.llmaj_azure.all",
+        ]
+
+        # Verbalize the dataset using the template
+        dataset = load_dataset(
+            card=card,
+            template_card_index="simple",
+            format=GraniteDocumentsFormat(),
+            split="test",
+            max_test_instances=10,
+            metrics=metrics,
+        )
+
+        self.assertListEqual(
+            dataset["source"],
+            [
+                "<|start_of_role|>system<|end_of_role|>Knowledge Cutoff Date: April 2024.\nToday's Date: "
+                + today
+                + '.\nYou are Granite, developed by IBM. Write the response to the user\'s input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data.\n\nIn your response, use the symbols <co> and </co> to indicate when a fact comes from a document in the search result, e.g <co>0</co> for a fact from document 0. Afterwards, list all the citations with their corresponding documents in an ordered list.<|end_of_text|>\n<|start_of_role|>documents<|end_of_role|>Document 0\nAustin is the capital of Texas.\nHouston is the largest city in Texas but not the capital of it. <|end_of_text|>\n<|start_of_role|>user<|end_of_role|>What city is the largest in Texas?<|end_of_text|>\n<|start_of_role|>assistant {"citations": true, "length": "long"}<|end_of_role|>',
+                "<|start_of_role|>system<|end_of_role|>Knowledge Cutoff Date: April 2024.\nToday's Date: "
+                + today
+                + '.\nYou are Granite, developed by IBM. Write the response to the user\'s input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data.\n\nIn your response, use the symbols <co> and </co> to indicate when a fact comes from a document in the search result, e.g <co>0</co> for a fact from document 0. Afterwards, list all the citations with their corresponding documents in an ordered list.<|end_of_text|>\n<|start_of_role|>documents<|end_of_role|>Document 0\nHouston is the largest city in Texas but not the capital of it. <|end_of_text|>\n<|start_of_role|>user<|end_of_role|>What city is the capital of Texas?<|end_of_text|>\n<|start_of_role|>assistant {"citations": true, "length": "long"}<|end_of_role|>',
+            ],
+        )
+
     def test_system_format(self):
         instruction = "solve the math exercises"
 
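Beyond the happy path, the new validation branches can be exercised directly. A quick sketch (assumes the tokenizer download succeeds; in unitxt, prepare() may already run on instantiation, in which case the explicit call is redundant but harmless):

from unitxt.error_utils import UnitxtError
from unitxt.formats import GraniteDocumentsFormat

fmt = GraniteDocumentsFormat()
fmt.prepare()  # loads the tokenizer for fmt.model
try:
    # No "question" field, so process() should raise UnitxtError.
    fmt.process({"input_fields": {"contexts": ["love is love"]}})
except UnitxtError as e:
    print(e)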