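fix review comments: drop the CatBoost comparison so the tool trains and reports TabPFN only (single predicted-data output and single plot)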

anuprulez · anuprulez · commit 03e7b4cbe146 · 2025-03-26T15:05:11.000Z
diff --git a/tools/tabpfn/main.py b/tools/tabpfn/main.py
@@ -7,7 +7,7 @@
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
-from catboost import CatBoostClassifier, CatBoostRegressor
+
 from sklearn.metrics import (
     average_precision_score,
     precision_recall_curve,
@@ -25,7 +25,7 @@ def separate_features_labels(data):
     return features, labels
 
 
-def classification_plot(y_true, y_scores, m_name):
+def classification_plot(y_true, y_scores):
     plt.figure(figsize=(8, 6))
     is_binary = len(np.unique(y_true)) == 2
     if is_binary:
@@ -37,7 +37,7 @@ def classification_plot(y_true, y_scores, m_name):
             precision,
             label=f"Precision-Recall Curve (AP={average_precision:.2f})",
         )
-        plt.title(f"{m_name}: Precision-Recall Curve (binary classification)")
+        plt.title("Precision-Recall Curve (binary classification)")
     else:
         y_true_bin = label_binarize(y_true, classes=np.unique(y_true))
         n_classes = y_true_bin.shape[1]
@@ -59,16 +59,16 @@ def classification_plot(y_true, y_scores, m_name):
             recall, precision, linestyle="--", color="black", label="Micro-average"
         )
         plt.title(
-            f"{m_name}: Precision-Recall Curve (Multiclass Classification)"
+            "Precision-Recall Curve (Multiclass Classification)"
         )
     plt.xlabel("Recall")
     plt.ylabel("Precision")
     plt.legend(loc="lower left")
     plt.grid(True)
-    plt.savefig(f"output_plot_{m_name}.png")
+    plt.savefig("output_plot.png")
 
 
-def regression_plot(xval, yval, title, xlabel, ylabel, m_name):
+def regression_plot(xval, yval, title, xlabel, ylabel):
     plt.figure(figsize=(8, 6))
     plt.xlabel(xlabel)
     plt.ylabel(ylabel)
@@ -78,7 +78,7 @@ def regression_plot(xval, yval, title, xlabel, ylabel, m_name):
     plt.scatter(xval, yval, alpha=0.8)
     xticks = np.arange(len(xval))
     plt.plot(xticks, xticks, color="red", linestyle="--", label="y = x")
-    plt.savefig(f"output_plot_{m_name}.png")
+    plt.savefig("output_plot.png")
 
 
 def train_evaluate(args):
@@ -95,43 +95,34 @@ def train_evaluate(args):
         te_labels = []
     s_time = time.time()
     if args["selected_task"] == "Classification":
-        models = [
-            ("TabPFN", TabPFNClassifier(random_state=42)),
-            ("CatBoost", CatBoostClassifier(random_state=42, verbose=0)),
-        ]
-        for m_name, model in models:
-            model.fit(tr_features, tr_labels)
-            y_eval = model.predict(te_features)
-            pred_probas_test = model.predict_proba(te_features)
-            if len(te_labels) > 0:
-                classification_plot(te_labels, pred_probas_test, m_name)
-            te_features["predicted_labels"] = y_eval
-            te_features.to_csv(
-                f"output_predicted_data_{m_name}", sep="\t", index=None
-            )
+        classifier = TabPFNClassifier(random_state=42)
+        classifier.fit(tr_features, tr_labels)
+        y_eval = classifier.predict(te_features)
+        pred_probas_test = classifier.predict_proba(te_features)
+        if len(te_labels) > 0:
+            classification_plot(te_labels, pred_probas_test)
+        te_features["predicted_labels"] = y_eval
+        te_features.to_csv(
+            "output_predicted_data", sep="\t", index=None
+        )
     else:
-        models = [
-            ("TabPFN", TabPFNRegressor(random_state=42)),
-            ("CatBoost", CatBoostRegressor(random_state=42, verbose=0)),
-        ]
-        for m_name, model in models:
-            model.fit(tr_features, tr_labels)
-            y_eval = model.predict(te_features)
-            if len(te_labels) > 0:
-                score = root_mean_squared_error(te_labels, y_eval)
-                r2_metric_score = r2_score(te_labels, y_eval)
-                regression_plot(
-                    te_labels,
-                    y_eval,
-                    f"Scatter plot for {m_name}: True vs predicted values. RMSE={score:.2f}, R2={r2_metric_score:.2f}",
-                    "True values",
-                    "Predicted values",
-                    m_name,
-                )
-            te_features["predicted_labels"] = y_eval
-            te_features.to_csv(
-                f"output_predicted_data_{m_name}", sep="\t", index=None
+        regressor = TabPFNRegressor(random_state=42)
+        regressor.fit(tr_features, tr_labels)
+        y_eval = regressor.predict(te_features)
+        if len(te_labels) > 0:
+            score = root_mean_squared_error(te_labels, y_eval)
+            r2_metric_score = r2_score(te_labels, y_eval)
+            regression_plot(
+                te_labels,
+                y_eval,
+                f"Scatter plot: True vs predicted values. RMSE={score:.2f}, R2={r2_metric_score:.2f}",
+                "True values",
+                "Predicted values",
             )
+    te_features["predicted_labels"] = y_eval
+    te_features.to_csv(
+        "output_predicted_data", sep="\t", index=None
+    )
     e_time = time.time()
     print(
         f"Time taken by TabPFN for training and prediction: {e_time - s_time} seconds"
diff --git a/tools/tabpfn/tabpfn.xml b/tools/tabpfn/tabpfn.xml
@@ -2,7 +2,7 @@
     <description>with PyTorch</description>
     <macros>
         <token name="@TOOL_VERSION@">2.0.3</token>
-        <token name="@VERSION_SUFFIX@">1.2</token>
+        <token name="@VERSION_SUFFIX@">1.1</token>
     </macros>
     <creator>
         <organization name="European Galaxy Team" url="https://galaxyproject.org/eu/"/>
@@ -11,7 +11,8 @@
     </creator>
     <requirements>
         <requirement type="package" version="@TOOL_VERSION@">tabpfn</requirement>
-        <requirement type="package" version="1.2.7">catboost</requirement>
+        <requirement type="package" version="2.2.2">pandas</requirement>
+	    <requirement type="package" version="3.9.2">matplotlib</requirement>
     </requirements>
     <version_command>echo "@VERSION@"</version_command>
     <command detect_errors="aggressive">
@@ -33,67 +34,63 @@
         <param name="testhaslabels" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Does test data contain labels?" help="Set this parameter when test data contains labels"/>
     </inputs>
     <outputs>
-        <data format="tabular" name="output_predicted_data_TabPFN" from_work_dir="output_predicted_data_TabPFN" label="Predicted data by TabPFN"/>
-        <data format="tabular" name="output_predicted_data_CatBoost" from_work_dir="output_predicted_data_CatBoost" label="Predicted data by CatBoost"/>
-        <data format="png" name="output_plot_TabPFN" from_work_dir="output_plot_TabPFN.png" label="Prediction plot on test data TabPFN">
-            <filter>testhaslabels is True</filter>
-        </data>
-        <data format="png" name="output_plot_CatBoost" from_work_dir="output_plot_CatBoost.png" label="Prediction plot on test data using CatBoost">
+        <data format="tabular" name="output_predicted_data" from_work_dir="output_predicted_data" label="Predicted data"/>
+        <data format="png" name="output_plot" from_work_dir="output_plot.png" label="Prediction plot on test data">
             <filter>testhaslabels is True</filter>
         </data>
     </outputs>
     <tests>
-        <test expect_num_outputs="2">
+        <test expect_num_outputs="1">
             <param name="selected_task" value="Classification"/>
             <param name="train_data" value="classification_local_train_rows.tabular" ftype="tabular"/>
             <param name="test_data" value="classification_local_test_rows.tabular" ftype="tabular"/>
             <param name="testhaslabels" value="false"/>
-            <output name="output_predicted_data_TabPFN">
+            <output name="output_predicted_data">
                 <assert_contents>
                     <has_n_columns n="42"/>
                     <has_n_lines n="3"/>
                 </assert_contents>
             </output>
         </test>
-        <test expect_num_outputs="4">
+        <test expect_num_outputs="2">
             <param name="selected_task" value="Classification"/>
             <param name="train_data" value="classification_local_train_rows.tabular" ftype="tabular"/>
             <param name="test_data" value="classification_local_test_rows_labels.tabular" ftype="tabular"/>
             <param name="testhaslabels" value="true"/>
-            <output name="output_plot_TabPFN" file="prc_binary.png" compare="sim_size"/>
+            <output name="output_plot" file="prc_binary.png" compare="sim_size"/>
         </test>
-        <test expect_num_outputs="4">
+        <test expect_num_outputs="2">
             <param name="selected_task" value="Classification"/>
             <param name="train_data" value="train_data_multiclass.tabular" ftype="tabular"/>
             <param name="test_data" value="test_data_multiclass_labels.tabular" ftype="tabular"/>
             <param name="testhaslabels" value="true"/>
-            <output name="output_plot_TabPFN" file="prc_multiclass.png" compare="sim_size"/>
+            <output name="output_plot" file="prc_multiclass.png" compare="sim_size"/>
         </test>
-        <test expect_num_outputs="2">
+        <test expect_num_outputs="1">
             <param name="selected_task" value="Classification"/>
             <param name="train_data" value="train_data_multiclass.tabular" ftype="tabular"/>
             <param name="test_data" value="test_data_multiclass_nolabels.tabular" ftype="tabular"/>
             <param name="testhaslabels" value="false"/>
-            <output name="output_predicted_data_CatBoost">
+            <output name="output_predicted_data">
                 <assert_contents>
                     <has_n_columns n="11"/>
                     <has_n_lines n="502"/>
                 </assert_contents>
             </output>
         </test>
-        <test expect_num_outputs="4">
+        <test expect_num_outputs="2">
             <param name="selected_task" value="Regression"/>
             <param name="train_data" value="regression_local_train_rows.tabular" ftype="tabular"/>
             <param name="test_data" value="regression_local_test_rows_labels.tabular" ftype="tabular"/>
             <param name="testhaslabels" value="true"/>
-            <output name="output_plot_TabPFN" file="r2_curve.png" compare="sim_size"/>
+            <output name="output_plot" file="r2_curve.png" compare="sim_size"/>
           </test>
-          <test expect_num_outputs="2">
+          <test expect_num_outputs="1">
             <param name="selected_task" value="Regression"/>
             <param name="train_data" value="regression_local_train_rows.tabular" ftype="tabular"/>
             <param name="test_data" value="regression_local_test_rows.tabular" ftype="tabular"/>
             <param name="testhaslabels" value="false"/>
-            <output name="output_predicted_data_TabPFN">
+            <output name="output_predicted_data">
               <assert_contents>
                 <has_n_columns n="14"/>
                 <has_n_lines n="105"/>
@@ -110,7 +107,6 @@
             **Input files**
                 - Training data: the training data should contain features and the last column should be the class labels. It should be in tabular format.
                 - Test data: the test data should also contain the same features as the training data and the last column should be the class labels if labels are available. It should be in tabular format. It is not required for the test data to have labels.
-                - Above files show performance comparison of TabPFN with CatBoost (https://github.com/catboost/catboost).
 
             **Output files**
                 - Predicted data along with predicted labels.