ada-slm/fix_v9g_curriculum.py at trunk · luna-system/ada-slm · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#!/usr/bin/env python3
"""
Fix the v9g curriculum training script to work with messages format.
Quick patch to get training running.
"""

def fix_tokenize_function(
    source_path: str = "train_v9g_curriculum.py",
    output_path: str = "train_v9g_curriculum_fixed.py",
) -> None:
    """Write a patched copy of the training script that tokenizes ``messages``.

    Reads ``source_path``, swaps the opening of its ``tokenize_function``
    (which iterates ``examples["input"]`` / ``examples["target"]``) for a
    version that iterates ``examples["messages"]``, and writes the result to
    ``output_path``. The source file itself is never modified.

    Args:
        source_path: Training script to read. Defaults to the original
            hard-coded ``train_v9g_curriculum.py``.
        output_path: Where the patched copy is written. Defaults to the
            original hard-coded ``train_v9g_curriculum_fixed.py``.

    Raises:
        ValueError: If the expected ``tokenize_function`` snippet is not
            present in ``source_path``. Nothing is written in that case, so
            an unpatched copy is never reported as fixed.
        OSError: If ``source_path`` cannot be read or ``output_path`` cannot
            be written.
    """
    # Read as UTF-8 explicitly: the platform default encoding is not
    # guaranteed to round-trip non-ASCII characters in the script.
    with open(source_path, "r", encoding="utf-8") as f:
        content = f.read()

    # Exact text to replace. The match is whitespace-sensitive, and "\\n"
    # here is a literal backslash + "n" as it appears in the target source.
    old_tokenize = '''    def tokenize_function(examples):
        # Concatenate input and target with special tokens
        texts = []
        for input_text, target_text in zip(examples["input"], examples["target"]):
            text = f"{input_text}\\n{target_text}<|endoftext|>"
            texts.append(text)'''

    # NOTE(review): the patched code takes messages[0] as the user turn and
    # messages[1] as the assistant turn -- confirm the dataset never has a
    # leading system message or more than one exchange per example.
    new_tokenize = '''    def tokenize_function(examples):
        # Extract from messages format: user -> assistant conversation
        texts = []
        for messages in examples["messages"]:
            user_msg = messages[0]["content"]  # user message
            assistant_msg = messages[1]["content"]  # assistant response
            text = f"{user_msg}\\n{assistant_msg}<|endoftext|>"
            texts.append(text)'''

    # str.replace() silently returns the input unchanged when there is no
    # match, so check first instead of claiming success on a no-op.
    if old_tokenize not in content:
        raise ValueError(
            f"Expected tokenize_function snippet not found in {source_path}; "
            "the script may already be patched or its formatting has changed."
        )

    content = content.replace(old_tokenize, new_tokenize)

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(content)

    print(f"✅ Created fixed training script: {output_path}")
    print("🔧 Tokenization function updated to work with messages format")

# Script entry point: apply the patch only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    fix_tokenize_function()