Skip to content

Commit 5f1f7b8

Browse files
anuprulez and Anup Kumar authored
TabPFN: Add ignore_pretraining_limits to classifiers and regressors (#1847)
* Add ignore_pretraining_limits to classifiers and regressors

  Setting the parameter `ignore_pretraining_limits=True` removes the restriction that limits training to at most 1000 samples.

  References:
  - PriorLabs/TabPFN#169
  - https://huggingface.co/Prior-Labs/TabPFN-v2-reg/discussions/2

  Currently, the tool fails when the training set contains more than 1000 samples:

  ```
  Traceback (most recent call last):
    File "/opt/galaxy/shed_tools/toolshed.g2.bx.psu.edu/repos/bgruening/tabpfn/ed78e1448387/tabpfn/main.py", line 167, in <module>
      train_evaluate(args)
    File "/opt/galaxy/shed_tools/toolshed.g2.bx.psu.edu/repos/bgruening/tabpfn/ed78e1448387/tabpfn/main.py", line 98, in train_evaluate
      classifier.fit(tr_features, tr_labels)
    File "/usr/local/lib/python3.12/contextlib.py", line 81, in inner
      return func(*args, **kwds)
             ^^^^^^^^^^^^^^^^^^^
    File "/usr/local/lib/python3.12/site-packages/tabpfn_common_utils/telemetry/core/decorators.py", line 321, in wrapper
      return _safe_call_with_telemetry(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/usr/local/lib/python3.12/site-packages/tabpfn_common_utils/telemetry/core/decorators.py", line 365, in _safe_call_with_telemetry
      result = func(*args, **kwargs)
               ^^^^^^^^^^^^^^^^^^^^^
    File "/usr/local/lib/python3.12/site-packages/tabpfn/classifier.py", line 758, in fit
      ensemble_configs, X, y = self._initialize_dataset_preprocessing(
                               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/usr/local/lib/python3.12/site-packages/tabpfn/classifier.py", line 642, in _initialize_dataset_preprocessing
      X, y, feature_names, n_features, original_y_name = ensure_compatible_fit_inputs(
                                                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/usr/local/lib/python3.12/site-packages/tabpfn/validation.py", line 75, in ensure_compatible_fit_inputs
      validate_dataset_size(
    File "/usr/local/lib/python3.12/site-packages/tabpfn/validation.py", line 137, in validate_dataset_size
      _validate_num_samples_for_cpu(
    File "/usr/local/lib/python3.12/site-packages/tabpfn/validation.py", line 274, in _validate_num_samples_for_cpu
      raise RuntimeError(
  RuntimeError: Running on CPU with more than 1000 samples is not allowed by default due to slow performance. To override this behavior, set the environment variable TABPFN_ALLOW_CPU_LARGE_DATASET=1 or set ignore_pretraining_limits=True. Alternatively, consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client
  ```

* toggle pretraining limits based on number of samples

---------

Co-authored-by: Anup Kumar <anuprulez@gmail.com>
1 parent 16cb5bf commit 5f1f7b8

1 file changed

Lines changed: 10 additions & 2 deletions

File tree

tools/tabpfn/main.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,8 @@ def train_evaluate(args):
8484
"""
8585
Train TabPFN and predict
8686
"""
87+
MAX_IGNORE_PRETRAINING_LIMITS_SAMPLES = 1000
88+
SEED = 42
8789
# prepare train data
8890
tr_features, tr_labels = separate_features_labels(args["train_data"], args["train_header"])
8991
# prepare test data
@@ -94,7 +96,10 @@ def train_evaluate(args):
9496
te_labels = []
9597
s_time = time.time()
9698
if args["selected_task"] == "Classification":
97-
classifier = TabPFNClassifier(random_state=42, model_path=args["model_path"])
99+
if tr_features.shape[0] <= MAX_IGNORE_PRETRAINING_LIMITS_SAMPLES:
100+
classifier = TabPFNClassifier(random_state=SEED, model_path=args["model_path"])
101+
else:
102+
classifier = TabPFNClassifier(random_state=SEED, model_path=args["model_path"], ignore_pretraining_limits=True)
98103
classifier.fit(tr_features, tr_labels)
99104
y_eval = classifier.predict(te_features)
100105
pred_probas_test = classifier.predict_proba(te_features)
@@ -105,7 +110,10 @@ def train_evaluate(args):
105110
"output_predicted_data", sep="\t", index=None
106111
)
107112
else:
108-
regressor = TabPFNRegressor(random_state=42, model_path=args["model_path"])
113+
if tr_features.shape[0] <= MAX_IGNORE_PRETRAINING_LIMITS_SAMPLES:
114+
regressor = TabPFNRegressor(random_state=SEED, model_path=args["model_path"])
115+
else:
116+
regressor = TabPFNRegressor(random_state=SEED, model_path=args["model_path"], ignore_pretraining_limits=True)
109117
regressor.fit(tr_features, tr_labels)
110118
y_eval = regressor.predict(te_features)
111119
if len(te_labels) > 0:

0 commit comments

Comments
 (0)