-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathgenerate_data_local.mjs
More file actions
134 lines (108 loc) · 4.24 KB
/
Copy pathgenerate_data_local.mjs
File metadata and controls
134 lines (108 loc) · 4.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/env node
/**
* Generate Training Data for SOMA (Local Ollama Version)
* Uses local Ollama models to generate synthetic examples
* No API calls needed!
*/
import { promises as fs } from 'fs';
import path from 'path';
import { spawn } from 'child_process';
/**
 * Run a prompt through a local Ollama model and return the generated text.
 *
 * Spawns `ollama run <model> <prompt>` and collects everything the command
 * writes to stdout.
 *
 * @param {string} prompt - Prompt text, passed as a single CLI argument.
 * @param {string} [model='gemma3:4b'] - Name of the Ollama model to run.
 * @returns {Promise<string>} The command's stdout with surrounding whitespace trimmed.
 * @throws {Error} Rejects if the process cannot be started (e.g. `ollama` is
 *   not on PATH), is terminated by a signal, or exits with a non-zero code.
 */
async function ollamaGenerate(prompt, model = 'gemma3:4b') {
  return new Promise((resolve, reject) => {
    const proc = spawn('ollama', ['run', model, prompt]);
    let output = '';
    let errorOutput = '';

    // Decode at the stream level so multi-byte UTF-8 characters that straddle
    // a chunk boundary are not corrupted by per-chunk toString().
    proc.stdout.setEncoding('utf8');
    proc.stdout.on('data', (chunk) => {
      output += chunk;
    });

    // Drain stderr so the child never blocks on a full pipe, and keep the
    // text so it can be reported on failure.
    proc.stderr.setEncoding('utf8');
    proc.stderr.on('data', (chunk) => {
      errorOutput += chunk;
    });

    // Without this listener a spawn failure (such as ENOENT) is emitted as an
    // unhandled 'error' event and crashes the whole process.
    proc.on('error', (err) => {
      reject(new Error(`Failed to start Ollama: ${err.message}`, { cause: err }));
    });

    proc.on('close', (code, signal) => {
      if (code === 0) {
        resolve(output.trim());
        return;
      }
      const detail = errorOutput.trim();
      const suffix = detail ? `: ${detail}` : '';
      const reason = signal
        ? `Ollama was terminated by signal ${signal}`
        : `Ollama exited with code ${code}`;
      reject(new Error(`${reason}${suffix}`));
    });
  });
}
/**
 * Entry point: generate a synthetic chat dataset with the local Ollama model
 * and write it as a JSONL file under `<cwd>/SOMA/training-data`.
 *
 * For each sample a random topic and question template are chosen, the
 * question is sent to `ollamaGenerate`, and the question/answer pair is
 * recorded. A failure on an individual sample is logged and skipped. After
 * generation, a fixed SOMA system prompt is prepended to every example.
 *
 * @returns {Promise<void>} Does not return normally on success: the process
 *   exits with code 0.
 * @throws {Error} If no example could be generated, or if creating the output
 *   directory or writing the file fails.
 */
async function main() {
  console.log('🚀 SOMA Training Data Generator (Local Ollama)\n');
  console.log('Using your local gemma3:4b model to generate training data!\n');

  const outputDir = path.join(process.cwd(), 'SOMA', 'training-data');
  await fs.mkdir(outputDir, { recursive: true });

  const dataset = [];

  // Topics for training
  const topics = [
    'artificial intelligence', 'machine learning', 'neural networks',
    'software development', 'system design', 'algorithms',
    'debugging', 'code optimization', 'testing',
    'creative problem solving', 'brainstorming', 'decision making',
    'ethics in AI', 'safety', 'reasoning',
    'quantum computing', 'data structures', 'programming',
    'productivity', 'learning techniques', 'self-improvement',
  ];

  // Uniformly pick one element of a non-empty array.
  const pickRandom = (items) => items[Math.floor(Math.random() * items.length)];

  const numSamples = 100; // Start with 100 (Ollama is slower locally)
  console.log(`🧠 Generating ${numSamples} training examples...\n`);

  // Samples are generated one at a time; each iteration awaits the model.
  for (let i = 0; i < numSamples; i++) {
    try {
      const topic = pickRandom(topics);
      const queryTypes = [
        `Explain ${topic} in simple terms`,
        `What are the key concepts in ${topic}?`,
        `How does ${topic} work?`,
        `What are best practices for ${topic}?`,
        `Can you give me a beginner's guide to ${topic}?`,
      ];
      const query = pickRandom(queryTypes);

      // Get response from local Ollama
      const response = await ollamaGenerate(query);

      dataset.push({
        messages: [
          { role: 'user', content: query },
          { role: 'assistant', content: response },
        ],
        metadata: {
          source: 'synthetic_ollama_gemma3',
          topic,
        },
      });

      if ((i + 1) % 10 === 0) {
        console.log(` Progress: ${i + 1}/${numSamples} (${((i + 1) / numSamples * 100).toFixed(0)}%)`);
      }
    } catch (error) {
      // Best effort: one failed sample must not abort the whole run.
      console.error(` Error at sample ${i}: ${error.message}`);
    }
  }

  console.log(`\n ✅ Generated ${dataset.length} examples`);

  // Do not write an empty file and report success when every sample failed.
  if (dataset.length === 0) {
    throw new Error('No training examples were generated; check that Ollama is installed and the gemma3:4b model is available.');
  }

  // Add basic SOMA personality
  console.log('\n🎭 Adding SOMA personality...');
  const personalityPrompt = `You are SOMA (Self-Organizing Memory Architecture), an AI assistant with the following traits:
- You are helpful, curious, and enthusiastic about learning
- You explain concepts clearly and adapt to the user's level
- You are honest about your limitations
- You think systematically and ask clarifying questions when needed`;

  for (const example of dataset) {
    example.messages.unshift({
      role: 'system',
      content: personalityPrompt,
    });
  }
  console.log(` ✅ Personality added to ${dataset.length} examples`);

  // Save dataset
  const timestamp = Date.now();
  const outputPath = path.join(outputDir, `soma-training-${timestamp}.jsonl`);
  console.log('\n💾 Saving training dataset...');

  // JSONL needs one JSON document per line, so records are joined with a real
  // newline character rather than the two-character sequence backslash + n.
  const jsonl = dataset.map((item) => JSON.stringify(item)).join('\n');
  await fs.writeFile(outputPath, jsonl, 'utf8');

  console.log('\n✅ SUCCESS!');
  console.log(`\n📁 Output: ${outputPath}`);
  console.log(`📊 Total examples: ${dataset.length}`);
  console.log('\n🚀 Next: Train Gemma 3!');
  console.log(' python scripts/finetune_gemma3.py');
  console.log('\n💡 This is your first dataset!');
  console.log(' After training, have conversations and regenerate data.');
  console.log(' Each iteration makes SOMA smarter!');
  process.exit(0);
}
// Run the generator; on any unhandled failure print the error and exit with a
// non-zero status so callers (shells, CI) can detect it.
main().catch((error) => {
  console.error('\n❌ Error:', error);
  process.exit(1);
});