-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathllm-prompts-ci.js
More file actions
109 lines (99 loc) · 3.36 KB
/
llm-prompts-ci.js
File metadata and controls
109 lines (99 loc) · 3.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/env node
/**
* Package a prompt dataset for LLM fine-tuning and publish it via CI.
*
* The `prompts` layer is designed for exactly this use case: versioning prompt
* templates, instruction sets, and RLHF preference data alongside the model
* weights and training code they belong to. Storing everything in a single
* ModelKit keeps the fine-tuning run fully reproducible — you always know
* which prompts produced which checkpoint.
*
* Expected directory layout:
*
* fine-tune/
* ├── Kitfile (generated below if absent)
* ├── checkpoints/ model checkpoint directory
* ├── prompts/
* │ ├── system.txt system prompt template
* │ ├── instructions.jsonl instruction-following examples
* │ └── preferences.jsonl RLHF preference pairs
* ├── data/
* │ └── train.jsonl raw training corpus
* └── train.py training script
*
* Run:
* MODEL_VERSION=0.2.0 REGISTRY_USER=... REGISTRY_PASS=... node examples/llm-prompts-ci.js
*/
import { writeFile, access } from 'fs/promises';
import { stringify as toYaml } from 'yaml';
import { login, pack, push, logout } from '../dist/index.js';
// Registry coordinates come from the CI environment. Credentials are mandatory
// (the job fails fast without them); host and version fall back to defaults.
const registry = process.env.REGISTRY ?? 'registry.example.com';
const user = requireEnv('REGISTRY_USER');
const pass = requireEnv('REGISTRY_PASS');
const version = process.env.MODEL_VERSION ?? 'latest';
// Only prefix real versions with 'v'. The 'latest' fallback must map to the
// conventional ':latest' tag, not the nonsensical ':vlatest'.
const ref = version === 'latest'
  ? `${registry}/org/llm-finetune:latest`
  : `${registry}/org/llm-finetune:v${version}`;
const workdir = './fine-tune';
// Write a Kitfile on the fly unless the repository already ships one.
const kitfilePath = `${workdir}/Kitfile`;
let hasKitfile = true;
try {
  await access(kitfilePath);
} catch {
  hasKitfile = false;
}
if (!hasKitfile) {
  // Everything that shaped the model's behaviour goes into the prompts layer:
  // system prompts, instruction templates, and RLHF preference data.
  const promptLayers = [
    {
      path: './prompts/system.txt',
      description: 'System prompt template used during fine-tuning',
    },
    {
      path: './prompts/instructions.jsonl',
      description: 'Instruction-following examples (ShareGPT format)',
    },
    {
      path: './prompts/preferences.jsonl',
      description: 'RLHF preference pairs for DPO training',
    },
  ];
  const kitfile = {
    manifestVersion: '1.0.0',
    package: {
      name: 'llm-finetune',
      version,
      description: 'Fine-tuning run with versioned prompt dataset',
      authors: ['AI Team'],
    },
    model: {
      name: 'base-checkpoint',
      path: './checkpoints',
      description: 'Latest training checkpoint',
    },
    prompts: promptLayers,
    datasets: [
      {
        name: 'training-corpus',
        path: './data/train.jsonl',
        description: 'Raw pre-training corpus used for continued pre-training',
      },
    ],
    code: [
      { path: './train.py', description: 'Fine-tuning entry point' },
    ],
  };
  await writeFile(kitfilePath, toYaml(kitfile, { lineWidth: 0 }));
  console.log('Kitfile generated.');
}
// Authenticate, publish the ModelKit, and always drop the registry session so
// CI credentials never outlive the job — even when pack/push throws.
await login(registry, user, pass);
try {
  const report = (stage) => console.log(`${stage}: ${ref}`);
  await pack(workdir, { tag: ref });
  report('Packed');
  await push(ref);
  report('Pushed');
} finally {
  await logout(registry);
}
/**
 * Read a required environment variable for the CI run.
 *
 * @param {string} name - Environment variable to look up.
 * @returns {string} The variable's (non-empty) value.
 *
 * Exits the whole process with status 1 when the variable is unset or empty —
 * in a CI context a clear one-line error beats a stack trace.
 */
function requireEnv(name) {
  const { [name]: value } = process.env;
  if (value) {
    return value;
  }
  console.error(`Missing required environment variable: ${name}`);
  process.exit(1);
}