TabPFN: Add ignore_pretraining_limits to classifiers and regressors (#1847)

anuprulez · Anup Kumar · web-flow · commit 5f1f7b83ced6 · 2026-04-22T23:44:04.000+02:00
* Add ignore_pretraining_limits to classifiers and regressors

  Setting `ignore_pretraining_limits=True` lifts the default limit of 1000 training samples.

  References:
  PriorLabs/TabPFN#169
  https://huggingface.co/Prior-Labs/TabPFN-v2-reg/discussions/2

  Currently, the tool fails when the training set has more than 1000 samples:

  ```
  Traceback (most recent call last):
    File "/opt/galaxy/shed_tools/toolshed.g2.bx.psu.edu/repos/bgruening/tabpfn/ed78e1448387/tabpfn/main.py", line 167, in <module>
      train_evaluate(args)
    File "/opt/galaxy/shed_tools/toolshed.g2.bx.psu.edu/repos/bgruening/tabpfn/ed78e1448387/tabpfn/main.py", line 98, in train_evaluate
      classifier.fit(tr_features, tr_labels)
    File "/usr/local/lib/python3.12/contextlib.py", line 81, in inner
      return func(*args, **kwds)
             ^^^^^^^^^^^^^^^^^^^
    File "/usr/local/lib/python3.12/site-packages/tabpfn_common_utils/telemetry/core/decorators.py", line 321, in wrapper
      return _safe_call_with_telemetry(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/usr/local/lib/python3.12/site-packages/tabpfn_common_utils/telemetry/core/decorators.py", line 365, in _safe_call_with_telemetry
      result = func(*args, **kwargs)
               ^^^^^^^^^^^^^^^^^^^^^
    File "/usr/local/lib/python3.12/site-packages/tabpfn/classifier.py", line 758, in fit
      ensemble_configs, X, y = self._initialize_dataset_preprocessing(
                               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/usr/local/lib/python3.12/site-packages/tabpfn/classifier.py", line 642, in _initialize_dataset_preprocessing
      X, y, feature_names, n_features, original_y_name = ensure_compatible_fit_inputs(
                                                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/usr/local/lib/python3.12/site-packages/tabpfn/validation.py", line 75, in ensure_compatible_fit_inputs
      validate_dataset_size(
    File "/usr/local/lib/python3.12/site-packages/tabpfn/validation.py", line 137, in validate_dataset_size
      _validate_num_samples_for_cpu(
    File "/usr/local/lib/python3.12/site-packages/tabpfn/validation.py", line 274, in _validate_num_samples_for_cpu
      raise RuntimeError(
  RuntimeError: Running on CPU with more than 1000 samples is not allowed by default due to slow performance. To override this behavior, set the environment variable TABPFN_ALLOW_CPU_LARGE_DATASET=1 or set ignore_pretraining_limits=True. Alternatively, consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client
  ```

* Toggle pretraining limits based on the number of training samples

---------

Co-authored-by: Anup Kumar <anuprulez@gmail.com>
diff --git a/tools/tabpfn/main.py b/tools/tabpfn/main.py
@@ -84,6 +84,8 @@ def train_evaluate(args):
     """
     Train TabPFN and predict
     """
+    MAX_IGNORE_PRETRAINING_LIMITS_SAMPLES = 1000
+    SEED = 42
     # prepare train data
     tr_features, tr_labels = separate_features_labels(args["train_data"], args["train_header"])
     # prepare test data
@@ -94,7 +96,10 @@ def train_evaluate(args):
         te_labels = []
     s_time = time.time()
     if args["selected_task"] == "Classification":
-        classifier = TabPFNClassifier(random_state=42, model_path=args["model_path"])
+        if tr_features.shape[0] <= MAX_IGNORE_PRETRAINING_LIMITS_SAMPLES:
+            classifier = TabPFNClassifier(random_state=SEED, model_path=args["model_path"])
+        else:
+            classifier = TabPFNClassifier(random_state=SEED, model_path=args["model_path"], ignore_pretraining_limits=True)
         classifier.fit(tr_features, tr_labels)
         y_eval = classifier.predict(te_features)
         pred_probas_test = classifier.predict_proba(te_features)
@@ -105,7 +110,10 @@ def train_evaluate(args):
             "output_predicted_data", sep="\t", index=None
         )
     else:
-        regressor = TabPFNRegressor(random_state=42, model_path=args["model_path"])
+        if tr_features.shape[0] <= MAX_IGNORE_PRETRAINING_LIMITS_SAMPLES:
+            regressor = TabPFNRegressor(random_state=SEED, model_path=args["model_path"])
+        else:
+            regressor = TabPFNRegressor(random_state=SEED, model_path=args["model_path"], ignore_pretraining_limits=True)
         regressor.fit(tr_features, tr_labels)
         y_eval = regressor.predict(te_features)
         if len(te_labels) > 0: