
Commit 4659b60

Merge branch 'main' into add_completeness_judge

2 parents: 76e7502 + 9777799

File tree

906 files changed: +20096 −3864 lines


.github/workflows/docs.yml  (+3 −3)

@@ -9,7 +9,7 @@ on:
 concurrency:
   group: ${{ github.workflow }}-${{ github.event_name == 'pull_request' && github.event.pull_request.number || github.ref_name }}
   cancel-in-progress: true
-
+
 jobs:
   docs:
 
@@ -23,10 +23,10 @@ jobs:
 
       - uses: actions/setup-python@v5
         with:
-          python-version: '3.9'
+          python-version: '3.8'
 
       - run: curl -LsSf https://astral.sh/uv/install.sh | sh
-      - run: uv pip install --system ".[tests,docs]"
+      - run: uv pip install --system ".[docs]"
 
       - name: Compile Docs
         run: make docs

.github/workflows/library_tests.yml  (+5 −3)

@@ -34,7 +34,9 @@ jobs:
       - run: pip install coverage[toml]
 
       - name: Run Tests
-        run: coverage run --omit=*/preparation -m unittest discover -s tests/library -p "test_*.py"
+        run: coverage run -m unittest discover -s tests/library -p "test_*.py"
 
-      - name: Upload Coverage to Codecov
-        uses: codecov/codecov-action@v2
+      - run: coverage report
+
+      - name: Upload Coverage to Coveralls
+        uses: coverallsapp/github-action@v2

.github/workflows/performance.yml  (+15 −12)

@@ -17,32 +17,35 @@ jobs:
     env:
       OS: ubuntu-latest
       UNITXT_DEFAULT_VERBOSITY: error
+      UNITXT_MOCK_INFERENCE_MODE: "True"
       DATASETS_VERBOSITY: error
       HF_HUB_VERBOSITY: error
       HF_DATASETS_DISABLE_PROGRESS_BARS: "True"
       TQDM_DISABLE: "True"
-
     steps:
       - uses: actions/checkout@v4
 
       - uses: actions/setup-python@v5
         with:
-          python-version: '3.9'
+          python-version: '3.10'
 
       - name: Install Requirements
        run: |
          curl -LsSf https://astral.sh/uv/install.sh | sh
-          uv pip install --system -e ".[tests]"
+          uv pip install --system ".[tests,watsonx,inference-tests]"
+          uv pip install --system litellm
+          uv pip install --system diskcache
+          huggingface-cli login --token ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }}
 
       - name: Prepare the dirs for performance evaluation in main
        run: |
          mkdir -p performance_action
-          mkdir -p performance_action/logs
-          echo "" > performance_action/__init__.py
-          echo " " > performance_action/logs/cards_benchmark.prof
-          echo " " > performance_action/logs/cards_benchmark.json
-          cp performance/card_profiler.py performance_action/card_profiler.py
-          cp performance/compare_performance_results.py performance_action/compare_performance_results.py
+          cp performance/bluebench_profiler.py performance_action/bluebench_profiler.py
+          cp performance/compare_benchmark_performance_results.py performance_action/compare_benchmark_performance_results.py
+
+      - name: Run performance on PR just to warm the cache, output will be overwritten
+        run : |
+          python performance_action/bluebench_profiler.py --output_file performance_action/pr_results.json
 
       - name: Checkout main branch
        uses: actions/checkout@v4

@@ -52,7 +55,7 @@ jobs:
 
       - name: Run performance on main branch
        run: |
-          python performance_action/card_profiler.py --output_file performance_action/main_results.json
+          python performance_action/bluebench_profiler.py --output_file performance_action/main_results.json
 
       - name: Checkout PR branch
        uses: actions/checkout@v4

@@ -62,8 +65,8 @@ jobs:
 
       - name: Run performance on PR branch
        run: |
-          python performance_action/card_profiler.py --output_file performance_action/pr_results.json
+          python performance_action/bluebench_profiler.py --output_file performance_action/pr_results.json
 
       - name: Compare main and PR performance results
        run: |
-          python performance_action/compare_performance_results.py performance_action/main_results.json performance_action/pr_results.json >> $GITHUB_STEP_SUMMARY
+          python performance_action/compare_benchmark_performance_results.py performance_action/main_results.json performance_action/pr_results.json >> $GITHUB_STEP_SUMMARY
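The compare step above appends the script's stdout to $GITHUB_STEP_SUMMARY so the comparison appears on the workflow summary page. The actual compare_benchmark_performance_results.py is not shown in this commit view; the following is only a hypothetical sketch of such a comparison script, with the JSON layout and field names assumed for illustration:

```python
# Hypothetical sketch only -- not the repository's compare_benchmark_performance_results.py.
# Assumes each results file is a flat JSON object mapping a timing name to seconds.
import json
import sys


def load_results(path):
    with open(path) as f:
        return json.load(f)


main_results = load_results(sys.argv[1])  # e.g. performance_action/main_results.json
pr_results = load_results(sys.argv[2])    # e.g. performance_action/pr_results.json

# Emit a Markdown table; the workflow redirects stdout into $GITHUB_STEP_SUMMARY.
print("| measurement | main | PR | PR / main |")
print("|---|---|---|---|")
for name in sorted(set(main_results) & set(pr_results)):
    main_val, pr_val = main_results[name], pr_results[name]
    ratio = pr_val / main_val if main_val else float("nan")
    print(f"| {name} | {main_val:.2f} | {pr_val:.2f} | {ratio:.2f} |")
```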

.github/workflows/test_helm.yml  (+4 −2)

@@ -22,8 +22,10 @@ jobs:
       - uses: actions/setup-python@v5
         with:
           python-version: '3.9'
-          cache: 'pip' # caching pip dependencies
-      - run: pip install --upgrade 'crfm-helm[unitxt]>=0.5.3'
+
+      - run: curl -LsSf https://astral.sh/uv/install.sh | sh
+      - run: uv pip install --upgrade --system "crfm-helm[unitxt]>=0.5.3"
+      - run: uv pip install --system "scikit-learn==1.5.2"
 
       - name: Test Helm
         run: utils/run_helm.sh

.gitignore  (+2)

@@ -157,3 +157,5 @@ src/unitxt/catalog/processors/example/to_string.json
 prod_env/*
 benchmark_output/*
 .litellm_cache
+
+docs/_static/data.js

.pre-commit-config.yaml  (−5)

@@ -10,11 +10,6 @@ repos:
         args: [--fix]
         exclude: src/unitxt/metrics.py|examples/evaluate_existing_dataset_no_install.py
       # Run the linter on the specific file with the ignore flag
-      - id: ruff
-        name: ruff (src/unitxt/metrics.py)
-        files: src/unitxt/metrics.py
-        args: [--fix, --ignore, C901]
-      # Run the linter on the specific file with the ignore flag
       - id: ruff
         name: ruff (examples/evaluate_existing_dataset_no_install.py)
         files: examples/evaluate_existing_dataset_no_install.py

README.md  (+37 −56)

@@ -21,7 +21,7 @@ In the dynamic landscape of generative NLP, traditional text processing pipeline
 ![license](https://img.shields.io/github/license/ibm/unitxt)
 ![python](https://img.shields.io/badge/python-3.8%20|%203.9-blue)
 ![tests](https://img.shields.io/github/actions/workflow/status/ibm/unitxt/library_tests.yml?branch=main&label=tests)
-[![codecov](https://codecov.io/gh/IBM/unitxt/branch/main/graph/badge.svg?token=mlrWq9cwz3)](https://codecov.io/gh/IBM/unitxt)
+[![Coverage Status](https://coveralls.io/repos/github/IBM/unitxt/badge.svg)](https://coveralls.io/github/IBM/unitxt)
 ![Read the Docs](https://img.shields.io/readthedocs/unitxt)
 [![downloads](https://static.pepy.tech/personalized-badge/unitxt?period=total&units=international_system&left_color=grey&right_color=green&left_text=downloads)](https://pepy.tech/project/unitxt)
 
@@ -48,80 +48,61 @@ Then launch the ui by running:
 unitxt-explore
 ```
 
-# 🦄 Example
+# 🦄 Example
 
 This is a simple example of running end-to-end evaluation in self contained python code over user data.
 
 See more examples in examples subdirectory.
 
 ```python
-from unitxt import get_logger
-from unitxt.api import evaluate, load_dataset
-from unitxt.blocks import Task, TaskCard
-from unitxt.inference import HFPipelineBasedInferenceEngine
-from unitxt.loaders import LoadFromDictionary
-from unitxt.templates import InputOutputTemplate, TemplatesDict
-from unitxt.text_utils import print_dict
-
-logger = get_logger()
-
-# Set up question answer pairs in a dictionary
-data = {
-    "test": [
-        {"question": "What is the capital of Texas?", "answer": "Austin"},
-        {"question": "What is the color of the sky?", "answer": "Blue"},
-    ]
-}
-
-card = TaskCard(
-    # Load the data from the dictionary. Data can be also loaded from HF, CSV files, COS and other sources using different loaders.
-    loader=LoadFromDictionary(data=data),
-    # Define the QA task input and output and metrics.
-    task=Task(
-        input_fields={"question": str},
-        reference_fields={"answer": str},
-        prediction_type=str,
-        metrics=["metrics.accuracy"],
-    ),
+# Import required components
+from unitxt import evaluate, create_dataset
+from unitxt.blocks import Task, InputOutputTemplate
+from unitxt.inference import HFAutoModelInferenceEngine
+
+# Question-answer dataset
+data = [
+    {"question": "What is the capital of Texas?", "answer": "Austin"},
+    {"question": "What is the color of the sky?", "answer": "Blue"},
+]
+
+# Define the task and evaluation metric
+task = Task(
+    input_fields={"question": str},
+    reference_fields={"answer": str},
+    prediction_type=str,
+    metrics=["metrics.accuracy"],
 )
 
-# Create a simple template that formats the input.
-# Add lowercase normalization as a post processor on the model prediction.
-
+# Create a template to format inputs and outputs
 template = InputOutputTemplate(
     instruction="Answer the following question.",
     input_format="{question}",
     output_format="{answer}",
     postprocessors=["processors.lower_case"],
 )
-# Verbalize the dataset using the template
-dataset = load_dataset(card=card, template=template)
-test_dataset = dataset["test"]
 
+# Prepare the dataset
+dataset = create_dataset(
+    task=task,
+    template=template,
+    format="formats.chat_api",
+    test_set=data,
+    split="test",
+)
 
-# Infer using flan t5 base using HF API
-# can be replaced with any prediction code,
-# including the built in WMLInferenceEngine and OpenAiInferenceEngine.
-model_name = "google/flan-t5-base"
-inference_model = HFPipelineBasedInferenceEngine(
-    model_name=model_name, max_new_tokens=32
+# Set up the model (supports Hugging Face, WatsonX, OpenAI, etc.)
+model = HFAutoModelInferenceEngine(
    model_name="Qwen/Qwen1.5-0.5B-Chat", max_new_tokens=32
 )
-predictions = inference_model.infer(test_dataset)
-evaluated_dataset = evaluate(predictions=predictions, data=test_dataset)
 
-# Print results
-for instance in evaluated_dataset:
-    print_dict(
-        instance,
-        keys_to_print=[
-            "source",  # input to the model
-            "prediction",  # model prediction
-            "processed_prediction",  # model prediction after post processing
-            "references",  # reference answer
-            "score",  # scores (per instance and global)
-        ],
-    )
+# Generate predictions and evaluate
+predictions = model(dataset)
+results = evaluate(predictions=predictions, data=dataset)
 
+# Print results
+print("Global Results:\n", results.global_scores.summary)
+print("Instance Results:\n", results.instance_scores.summary)
 ```
 
 # 🦄 Contributors
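For convenience, here is the updated README example reassembled from the added (+) lines above into a single script. The assembly is ours, but the code itself is exactly what the commit adds:

```python
# Import required components
from unitxt import evaluate, create_dataset
from unitxt.blocks import Task, InputOutputTemplate
from unitxt.inference import HFAutoModelInferenceEngine

# Question-answer dataset
data = [
    {"question": "What is the capital of Texas?", "answer": "Austin"},
    {"question": "What is the color of the sky?", "answer": "Blue"},
]

# Define the task and evaluation metric
task = Task(
    input_fields={"question": str},
    reference_fields={"answer": str},
    prediction_type=str,
    metrics=["metrics.accuracy"],
)

# Create a template to format inputs and outputs
template = InputOutputTemplate(
    instruction="Answer the following question.",
    input_format="{question}",
    output_format="{answer}",
    postprocessors=["processors.lower_case"],
)

# Prepare the dataset
dataset = create_dataset(
    task=task,
    template=template,
    format="formats.chat_api",
    test_set=data,
    split="test",
)

# Set up the model (supports Hugging Face, WatsonX, OpenAI, etc.)
model = HFAutoModelInferenceEngine(
    model_name="Qwen/Qwen1.5-0.5B-Chat", max_new_tokens=32
)

# Generate predictions and evaluate
predictions = model(dataset)
results = evaluate(predictions=predictions, data=dataset)

# Print results
print("Global Results:\n", results.global_scores.summary)
print("Instance Results:\n", results.instance_scores.summary)
```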

assets/banner.png  (binary image, 26.4 KB)

docs/_static/banner.png  (binary image, 26.4 KB)

docs/blog/inference_engines_blog.rst  (+6 −4)

@@ -1,6 +1,8 @@
 .. title:: Unitxt Embraces Rich Chat Format and Cross API Inference: Simplifying LLM Evaluation
-.. authors:: Elron Bandel
-.. date:: 2024-11-19
+
+:Authors: Elron Bandel
+
+:Date: 2024-11-19
 
 =================================================================================================
 [19/11/2024] Unitxt Embraces Rich Chat Format and Cross API Inference: Simplifying LLM Evaluation

@@ -21,8 +23,8 @@ Introducing Two Major Enhancements
 -----------------------------------
 
 1. **Producing Data in Chat API Format**
-   Unitxt now can produces data in the widely adopted Chat API format.
-   This ensures compatibility with popular LLM Provider APIs and avoid the need from custom per model formatting.
+   Unitxt can produce data in the widely adopted Chat API format.
+   This ensures compatibility with popular LLM Provider APIs and avoid the need for custom per model formatting.
    Additionally, the format supports multiple modalities such as text, images, and videos.
 
 2. **A Comprehensive Array of Inference Engines**
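For context on the "Chat API format" referenced in the changed lines: it is the role/content message list used by most LLM provider APIs. A minimal illustrative sketch (values invented for illustration, not taken from this commit):

```python
# Illustrative only: the role/content message structure referred to as "Chat API format".
chat_formatted_source = [
    {"role": "system", "content": "Answer the following question."},
    {"role": "user", "content": "What is the capital of Texas?"},
]
```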

docs/blog/vision_robustness_blog.rst  (+13 −7)

@@ -1,10 +1,14 @@
 .. title:: If Your LLM sees White Noise, Try Asking Differently: Revealing AI’s Text and Image Sensitivities with Unitxt
-.. sectionauthor:: Elron Bandel and Nimrod Shabtay
-.. date:: 2024-11-01
 
-============================
+:Authors:
+    Elron Bandel
+    Nimrod Shabtay
+
+:Date: 2024-11-01
+
+==========================================================================================================================
 [01/11/2024] If Your LLM sees White Noise, Try Asking Differently: Revealing AI’s Text and Image Sensitivities with Unitxt
-============================
+==========================================================================================================================
 
 **Authors**: Elron Bandel and Nimrod Shabtay
 

@@ -35,7 +39,7 @@ Here’s the code used to set up our tests. This example uses Unitxt to create s
     for card in ["cards.seed_bench", "cards.ai2d"]:
         for enumerator in ["capitals", "lowercase"]:
             for augmentor in [None, "augmentors.image.white_noise"]:
-                subsets[f"{card} {enumerator} {augmentor}"] = StandardRecipe(
+                subsets[f"{card} {enumerator} {augmentor}"] = DatasetRecipe(
                     card=card,
                     template=f"templates.qa.multiple_choice.with_context.lmms_eval[enumerator={enumerator}]",
                     loader_limit=100,

@@ -46,15 +50,17 @@ Here’s the code used to set up our tests. This example uses Unitxt to create s
 
     data = list(benchmark()["test"])
 
-    inference_model = LMMSEvalInferenceEngine(
+    model = LMMSEvalInferenceEngine(
         model_type="llava_onevision",
         model_args={"pretrained": "lmms-lab/llava-onevision-qwen2-7b-ov"},
         max_new_tokens=2,
     )
 
-    predictions = inference_model.infer(data)
+    predictions = model(data)
     results = evaluate(predictions=predictions, data=data)
 
+    print(results.subsets_scores.summary)
+
 In order to run this you will first have to install llms-eval library which might not work on mac.
 
 *Full code example at:* https://github.com/IBM/unitxt/blob/main/examples/robustness_testing_for_vision_text_models.py
