Skip to content

Commit 7069ce0

Browse files
manwithacat and claude committed
fix: preflight now detects optimizer.pt disk usage in checkpoints
Root cause: Kaggle training failed because HuggingFace Trainer saves optimizer states (optimizer.pt) in checkpoints by default, which is ~4.6GB for NLLB-600M on top of the 2.3GB model weights.

Changes:
- Preflight disk estimation now includes optimizer states (2x model weights)
- Added save_only_model config detection
- Notebook generator now uses save_only_model from TOML config
- Added recommendation to use save_only_model=True on disk-constrained platforms

For NLLB-600M on Kaggle P100:
- Without fix: checkpoint ~7GB, peak disk ~20GB (exceeds 10GB limit)
- With save_only_model=True: checkpoint ~2.3GB, peak disk ~10GB (fits)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 3b0a200 commit 7069ce0

2 files changed

Lines changed: 155 additions & 93 deletions

File tree

src/commands/notebook/build.ts

Lines changed: 26 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -5,13 +5,13 @@
55
* This ensures reproducible notebook generation with explicit ML decisions.
66
*/
77

8-
import { z } from 'zod'
8+
import TOML from '@iarna/toml'
99
import { existsSync, readFileSync } from 'fs'
1010
import { basename, dirname, join } from 'path'
11-
import TOML from '@iarna/toml'
12-
import type { CommandDefinition } from '../../types/commands'
13-
import { success, error } from '../../lib/output'
11+
import { z } from 'zod'
12+
import { error, success } from '../../lib/output'
1413
import { createTemplateEngine, PLATFORM_DISPLAY_NAMES } from '../../templates'
14+
import type { CommandDefinition } from '../../types/commands'
1515
import type { PlatformId } from '../../types/platform'
1616
import type { TemplateContext } from '../../types/template'
1717

@@ -158,8 +158,12 @@ function generateNotebook(config: TrainingConfig, configPath: string): string {
158158
# kaggle:
159159
# accelerator: gpu
160160
# dataSources:
161-
${dataSources.map(s => `# - type: ${s.type}
162-
# name: ${s.name}`).join('\n')}
161+
${dataSources
162+
.map(
163+
(s) => `# - type: ${s.type}
164+
# name: ${s.name}`
165+
)
166+
.join('\n')}
163167
# docker_image: gcr.io/kaggle-gpu-images/python
164168
# isGpuEnabled: true
165169
# isInternetEnabled: true
@@ -274,6 +278,7 @@ CONFIG = {
274278
275279
# Checkpoints
276280
"save_total_limit": ${config.checkpoints.save_total_limit},
281+
"save_only_model": ${config.checkpoints.save_optimizer === false ? 'True' : 'False'}, # True = skip optimizer.pt (saves ~4GB per checkpoint)
277282
"load_best_at_end": ${config.checkpoints.load_best_at_end ? 'True' : 'False'},
278283
279284
# Early stopping
@@ -307,7 +312,7 @@ for k, v in CONFIG.items():
307312
# %%
308313
# Dataset sources (in priority order)
309314
DATASET_SOURCES = [
310-
${dataSources.map(s => ` "${s.path}",`).join('\n')}
315+
${dataSources.map((s) => ` "${s.path}",`).join('\n')}
311316
]
312317
313318
train_df = None
@@ -424,6 +429,7 @@ training_args = Seq2SeqTrainingArguments(
424429
save_strategy="steps",
425430
save_steps=CONFIG["save_steps"],
426431
save_total_limit=CONFIG["save_total_limit"],
432+
save_only_model=CONFIG["save_only_model"], # Skip optimizer.pt to save disk space
427433
logging_steps=CONFIG["logging_steps"],
428434
load_best_model_at_end=CONFIG["load_best_at_end"],
429435
metric_for_best_model=CONFIG["metric_for_best_model"],
@@ -569,8 +575,8 @@ function generateMetadata(config: TrainingConfig, outputPath: string): Record<st
569575
enable_gpu: true,
570576
enable_tpu: false,
571577
enable_internet: true,
572-
dataset_sources: dataSources.filter(s => s.type === 'dataset').map(s => s.name),
573-
competition_sources: dataSources.filter(s => s.type === 'competition').map(s => s.name),
578+
dataset_sources: dataSources.filter((s) => s.type === 'dataset').map((s) => s.name),
579+
competition_sources: dataSources.filter((s) => s.type === 'competition').map((s) => s.name),
574580
kernel_sources: [],
575581
model_sources: [],
576582
}
@@ -634,10 +640,8 @@ Example config structure: see notebooks/kaggle/training.toml
634640
}
635641

636642
// Determine output path
637-
const outputPath = args.output || join(
638-
dirname(args.path),
639-
`${config.meta.name.toLowerCase().replace(/[^a-z0-9]/g, '_')}.py`
640-
)
643+
const outputPath =
644+
args.output || join(dirname(args.path), `${config.meta.name.toLowerCase().replace(/[^a-z0-9]/g, '_')}.py`)
641645

642646
// Generate notebook
643647
const notebook = generateNotebook(config, args.path)
@@ -675,12 +679,15 @@ Example config structure: see notebooks/kaggle/training.toml
675679
if (!args.skipPreflight) {
676680
// Import preflight dynamically to avoid circular deps
677681
const { preflight } = await import('../preflight')
678-
preflightResult = await preflight.run({
679-
path: outputPath,
680-
platform: config.platform.target,
681-
samples: 2000,
682-
verbose: false,
683-
}, ctx)
682+
preflightResult = await preflight.run(
683+
{
684+
path: outputPath,
685+
platform: config.platform.target,
686+
samples: 2000,
687+
verbose: false,
688+
},
689+
ctx
690+
)
684691
}
685692

686693
return success({

0 commit comments

Comments
 (0)