Skip to content
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 44 additions & 26 deletions tools/tabpfn/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.metrics import (
average_precision_score,
precision_recall_curve,
Expand All @@ -24,7 +25,7 @@ def separate_features_labels(data):
return features, labels


def classification_plot(y_true, y_scores):
def classification_plot(y_true, y_scores, m_name):
plt.figure(figsize=(8, 6))
is_binary = len(np.unique(y_true)) == 2
if is_binary:
Expand All @@ -36,7 +37,7 @@ def classification_plot(y_true, y_scores):
precision,
label=f"Precision-Recall Curve (AP={average_precision:.2f})",
)
plt.title("Precision-Recall Curve (binary classification)")
plt.title("{}: Precision-Recall Curve (binary classification)".format(m_name))
else:
y_true_bin = label_binarize(y_true, classes=np.unique(y_true))
n_classes = y_true_bin.shape[1]
Expand All @@ -57,15 +58,17 @@ def classification_plot(y_true, y_scores):
plt.plot(
recall, precision, linestyle="--", color="black", label="Micro-average"
)
plt.title("Precision-Recall Curve (Multiclass Classification)")
plt.title(
"{}: Precision-Recall Curve (Multiclass Classification)".format(m_name)
)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.legend(loc="lower left")
plt.grid(True)
plt.savefig("output_plot.png")
plt.savefig("output_plot_{}.png".format(m_name))


def regression_plot(xval, yval, title, xlabel, ylabel):
def regression_plot(xval, yval, title, xlabel, ylabel, m_name):
plt.figure(figsize=(8, 6))
plt.xlabel(xlabel)
plt.ylabel(ylabel)
Expand All @@ -75,7 +78,7 @@ def regression_plot(xval, yval, title, xlabel, ylabel):
plt.scatter(xval, yval, alpha=0.8)
xticks = np.arange(len(xval))
plt.plot(xticks, xticks, color="red", linestyle="--", label="y = x")
plt.savefig("output_plot.png")
plt.savefig("output_plot_{}.png".format(m_name))


def train_evaluate(args):
Expand All @@ -92,34 +95,49 @@ def train_evaluate(args):
te_labels = []
s_time = time.time()
if args["selected_task"] == "Classification":
classifier = TabPFNClassifier()
classifier.fit(tr_features, tr_labels)
y_eval = classifier.predict(te_features)
pred_probas_test = classifier.predict_proba(te_features)
if len(te_labels) > 0:
classification_plot(te_labels, pred_probas_test)
models = [
("TabPFN", TabPFNClassifier(random_state=42)),
("CatBoost", CatBoostClassifier(random_state=42, verbose=0)),
]
for m_name, model in models:
model.fit(tr_features, tr_labels)
y_eval = model.predict(te_features)
pred_probas_test = model.predict_proba(te_features)
if len(te_labels) > 0:
classification_plot(te_labels, pred_probas_test, m_name)
te_features["predicted_labels"] = y_eval
te_features.to_csv(
"output_predicted_data_{}".format(m_name), sep="\t", index=None
)
else:
regressor = TabPFNRegressor()
regressor.fit(tr_features, tr_labels)
y_eval = regressor.predict(te_features)
if len(te_labels) > 0:
score = root_mean_squared_error(te_labels, y_eval)
r2_metric_score = r2_score(te_labels, y_eval)
regression_plot(
te_labels,
y_eval,
f"Scatter plot: True vs predicted values. RMSE={score:.2f}, R2={r2_metric_score:.2f}",
"True values",
"Predicted values",
models = [
("TabPFN", TabPFNRegressor(random_state=42)),
("CatBoost", CatBoostRegressor(random_state=42, verbose=0)),
]
for m_name, model in models:
model.fit(tr_features, tr_labels)
y_eval = model.predict(te_features)
if len(te_labels) > 0:
score = root_mean_squared_error(te_labels, y_eval)
r2_metric_score = r2_score(te_labels, y_eval)
regression_plot(
te_labels,
y_eval,
f"Scatter plot for {m_name}: True vs predicted values. RMSE={score:.2f}, R2={r2_metric_score:.2f}",
"True values",
"Predicted values",
m_name,
)
te_features["predicted_labels"] = y_eval
te_features.to_csv(
"output_predicted_data_{}".format(m_name), sep="\t", index=None
)
e_time = time.time()
print(
"Time taken by TabPFN for training and prediction: {} seconds".format(
e_time - s_time
)
)
te_features["predicted_labels"] = y_eval
te_features.to_csv("output_predicted_data", sep="\t", index=None)


if __name__ == "__main__":
Expand Down
206 changes: 105 additions & 101 deletions tools/tabpfn/tabpfn.xml
Original file line number Diff line number Diff line change
@@ -1,119 +1,123 @@
<?xml version="1.0"?>
<tool id="tabpfn" name="Tabular data prediction using TabPFN" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="23.0">
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to change the name now?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

<?xml version="1.0"?> was introduced by xmllint. It is removed now.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree if the tool name is not updated while keeping CatBoost comparison, it may not sound good.

<description>with PyTorch</description>
<macros>
<token name="@TOOL_VERSION@">2.0.3</token>
<token name="@VERSION_SUFFIX@">1.1</token>
</macros>
<creator>
<organization name="European Galaxy Team" url="https://galaxyproject.org/eu/" />
<person givenName="Anup" familyName="Kumar" email="kumara@informatik.uni-freiburg.de" />
<person givenName="Frank" familyName="Hutter" email="fh@cs.uni-freiburg.de" />
</creator>
<requirements>
<requirement type="package" version="@TOOL_VERSION@">tabpfn</requirement>
<requirement type="package" version="2.2.2">pandas</requirement>
<requirement type="package" version="3.9.2">matplotlib</requirement>
</requirements>
<version_command>echo "@VERSION@"</version_command>
<command detect_errors="aggressive">
<![CDATA[
<description>with PyTorch</description>
<macros>
<token name="@TOOL_VERSION@">2.0.3</token>
<token name="@VERSION_SUFFIX@">1.2</token>
</macros>
<creator>
<organization name="European Galaxy Team" url="https://galaxyproject.org/eu/"/>
<person givenName="Anup" familyName="Kumar" email="kumara@informatik.uni-freiburg.de"/>
<person givenName="Frank" familyName="Hutter" email="fh@cs.uni-freiburg.de"/>
</creator>
<requirements>
<requirement type="package" version="@TOOL_VERSION@">tabpfn</requirement>
<requirement type="package" version="1.2.7">catboost</requirement>
</requirements>
<version_command>echo "@VERSION@"</version_command>
<command detect_errors="aggressive"><![CDATA[
python '$__tool_directory__/main.py'
--selected_task '$selected_task'
--train_data '$train_data'
--testhaslabels '$testhaslabels'
--test_data '$test_data'
]]>
</command>
<inputs>
<param name="selected_task" type="select" label="Select a machine learning task">
<option value="Classification" selected="true"></option>
<option value="Regression" selected="false"></option>
</param>
<param name="train_data" type="data" format="tabular" label="Train data" help="Please provide training data for training model. It should contain labels/class/target in the last column" />
<param name="test_data" type="data" format="tabular" label="Test data" help="Please provide test data for evaluating model. It may or may not contain labels/class/target in the last column" />
<param name="testhaslabels" type="boolean" truevalue="haslabels" falsevalue="" checked="false" label="Does test data contain labels?" help="Set this parameter when test data contains labels" />
</inputs>
<outputs>
<data format="tabular" name="output_predicted_data" from_work_dir="output_predicted_data" label="Predicted data"></data>
<data format="png" name="output_plot" from_work_dir="output_plot.png" label="Prediction plot on test data">
<filter>testhaslabels is True</filter>
</data>
</outputs>
<tests>
<test expect_num_outputs="1">
<param name="selected_task" value="Classification" />
<param name="train_data" value="classification_local_train_rows.tabular" ftype="tabular" />
<param name="test_data" value="classification_local_test_rows.tabular" ftype="tabular" />
<param name="testhaslabels" value="" />
<output name="output_predicted_data">
<assert_contents>
<has_n_columns n="42" />
<has_n_lines n="3" />
</assert_contents>
</output>
</test>
<test expect_num_outputs="2">
<param name="selected_task" value="Classification" />
<param name="train_data" value="classification_local_train_rows.tabular" ftype="tabular" />
<param name="test_data" value="classification_local_test_rows_labels.tabular" ftype="tabular" />
<param name="testhaslabels" value="haslabels" />
<output name="output_plot" file="prc_binary.png" compare="sim_size" />
</test>
<test expect_num_outputs="2">
<param name="selected_task" value="Classification" />
<param name="train_data" value="train_data_multiclass.tabular" ftype="tabular" />
<param name="test_data" value="test_data_multiclass_labels.tabular" ftype="tabular" />
<param name="testhaslabels" value="haslabels" />
<output name="output_plot" file="prc_multiclass.png" compare="sim_size" />
</test>
<test expect_num_outputs="1">
<param name="selected_task" value="Classification" />
<param name="train_data" value="train_data_multiclass.tabular" ftype="tabular" />
<param name="test_data" value="test_data_multiclass_nolabels.tabular" ftype="tabular" />
<param name="testhaslabels" value="" />
<output name="output_predicted_data">
<assert_contents>
<has_n_columns n="11" />
<has_n_lines n="502" />
</assert_contents>
</output>
</test>
<test expect_num_outputs="2">
<param name="selected_task" value="Regression" />
<param name="train_data" value="regression_local_train_rows.tabular" ftype="tabular" />
<param name="test_data" value="regression_local_test_rows_labels.tabular" ftype="tabular" />
<param name="testhaslabels" value="haslabels" />
<output name="output_plot" file="r2_curve.png" compare="sim_size" />
</test>
<test expect_num_outputs="1">
<param name="selected_task" value="Regression" />
<param name="train_data" value="regression_local_train_rows.tabular" ftype="tabular" />
<param name="test_data" value="regression_local_test_rows.tabular" ftype="tabular" />
<param name="testhaslabels" value="" />
<output name="output_predicted_data">
<assert_contents>
<has_n_columns n="14" />
<has_n_lines n="105" />
</assert_contents>
</output>
</test>
</tests>
<help>
<![CDATA[
]]></command>
<inputs>
<param name="selected_task" type="select" label="Select a machine learning task">
<option value="Classification" selected="true"/>
<option value="Regression" selected="false"/>
</param>
<param name="train_data" type="data" format="tabular" label="Train data" help="Please provide training data for training model. It should contain labels/class/target in the last column"/>
<param name="test_data" type="data" format="tabular" label="Test data" help="Please provide test data for evaluating model. It may or may not contain labels/class/target in the last column"/>
<param name="testhaslabels" type="boolean" truevalue="haslabels" falsevalue="" checked="false" label="Does test data contain labels?" help="Set this parameter when test data contains labels"/>
</inputs>
<outputs>
<data format="tabular" name="output_predicted_data_TabPFN" from_work_dir="output_predicted_data_TabPFN" label="Predicted data by TabPFN"/>
<data format="tabular" name="output_predicted_data_CatBoost" from_work_dir="output_predicted_data_CatBoost" label="Predicted data by CatBoost"/>
<data format="png" name="output_plot_TabPFN" from_work_dir="output_plot_TabPFN.png" label="Prediction plot on test data TabPFN">
<filter>testhaslabels is True</filter>
</data>
<data format="png" name="output_plot_CatBoost" from_work_dir="output_plot_CatBoost.png" label="Prediction plot on test data using CatBoost">
<filter>testhaslabels is True</filter>
</data>
</outputs>
<tests>
<test expect_num_outputs="2">
<param name="selected_task" value="Classification"/>
<param name="train_data" value="classification_local_train_rows.tabular" ftype="tabular"/>
<param name="test_data" value="classification_local_test_rows.tabular" ftype="tabular"/>
<param name="testhaslabels" value=""/>
<output name="output_predicted_data_TabPFN">
<assert_contents>
<has_n_columns n="42"/>
<has_n_lines n="3"/>
</assert_contents>
</output>
</test>
<test expect_num_outputs="3">
<param name="selected_task" value="Classification"/>
<param name="train_data" value="classification_local_train_rows.tabular" ftype="tabular"/>
<param name="test_data" value="classification_local_test_rows_labels.tabular" ftype="tabular"/>
<param name="testhaslabels" value="haslabels"/>
<output name="output_plot_TabPFN" file="prc_binary.png" compare="sim_size"/>
</test>
<test expect_num_outputs="3">
<param name="selected_task" value="Classification"/>
<param name="train_data" value="train_data_multiclass.tabular" ftype="tabular"/>
<param name="test_data" value="test_data_multiclass_labels.tabular" ftype="tabular"/>
<param name="testhaslabels" value="haslabels"/>
<output name="output_plot_TabPFN" file="prc_multiclass.png" compare="sim_size"/>
</test>
<test expect_num_outputs="2">
<param name="selected_task" value="Classification"/>
<param name="train_data" value="train_data_multiclass.tabular" ftype="tabular"/>
<param name="test_data" value="test_data_multiclass_nolabels.tabular" ftype="tabular"/>
<param name="testhaslabels" value=""/>
<output name="output_predicted_data_CatBoost">
<assert_contents>
<has_n_columns n="11"/>
<has_n_lines n="502"/>
</assert_contents>
</output>
</test>
<test expect_num_outputs="3">
<param name="selected_task" value="Regression"/>
<param name="train_data" value="regression_local_train_rows.tabular" ftype="tabular"/>
<param name="test_data" value="regression_local_test_rows_labels.tabular" ftype="tabular"/>
<param name="testhaslabels" value="haslabels"/>
<output name="output_plot_TabPFN" file="r2_curve.png" compare="sim_size"/>
</test>
<test expect_num_outputs="2">
<param name="selected_task" value="Regression"/>
<param name="train_data" value="regression_local_train_rows.tabular" ftype="tabular"/>
<param name="test_data" value="regression_local_test_rows.tabular" ftype="tabular"/>
<param name="testhaslabels" value=""/>
<output name="output_predicted_data_TabPFN">
<assert_contents>
<has_n_columns n="14"/>
<has_n_lines n="105"/>
</assert_contents>
</output>
</test>
</tests>
<help><![CDATA[
**What it does**

Classification and Regression on tabular data by TabPFN

**Input files**
- Training data: the training data should contain features and the last column should be the class labels. It should be in tabular format.
- Test data: the test data should also contain the same features as the training data and the last column should be the class labels if labels are available. It should be in tabular format. It is not required for the test data to have labels.
- The output files show a performance comparison of TabPFN with CatBoost (https://github.com/catboost/catboost).

**Output files**
- Predicted data along with predicted labels.
- Prediction plot (when test data has labels available).
]]>
</help>
<citations>
<citation type="doi">10.1038/s41586-024-08328-6</citation>
</citations>

**License**
- TabPFN is available under an open source license (https://github.com/PriorLabs/TabPFN?tab=License-1-ov-file) that combines Apache with a LLama-like attribution clause. It requires you to prominently display "Built with TabPFN" when you use a pipeline including it in production.
]]></help>
<citations>
<citation type="doi">10.1038/s41586-024-08328-6</citation>
</citations>
</tool>
Loading