gliner-spacy-comparison-neo/sample_for_annotation.py at main · gauravvij/gliner-spacy-comparison-neo · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import pandas as pd
import os
import json

def sample_emails(input_path: str, output_path: str, n: int = 100) -> None:
    """Sample rows from a CSV of emails and write a JSON annotation template.

    Every sampled row becomes one record of the form
    ``{"id": <row index label>, "text": <message as str>, "entities": []}``.
    Sampling uses ``random_state=42`` so repeated runs on the same input
    select the same rows.

    Args:
        input_path: Path of the CSV file to read. It must contain a
            ``message`` column.
        output_path: Path of the JSON file to write. Missing parent
            directories are created; a bare filename (no directory part)
            is written relative to the current working directory.
        n: Number of rows to sample. Clamped to the number of rows in the
            CSV when the file holds fewer than ``n`` rows.

    Returns:
        None. When ``input_path`` does not exist an error is printed and
        nothing is written.

    Raises:
        KeyError: If the CSV has no ``message`` column.
    """
    if not os.path.exists(input_path):
        print(f"Error: {input_path} not found")
        return

    # low_memory=False makes pandas read the whole file before inferring
    # dtypes, avoiding mixed-type chunks in large CSV files.
    df = pd.read_csv(input_path, low_memory=False)

    # DataFrame.sample raises if asked for more rows than exist
    # (without replacement), so clamp the request.
    n = min(n, len(df))
    sampled_df = df.sample(n=n, random_state=42)

    # Walk the index and the 'message' column directly instead of
    # iterrows(): no per-row Series is built and values are not coerced to
    # a common row dtype.
    records = [
        {"id": int(idx), "text": str(message), "entities": []}
        for idx, message in zip(sampled_df.index, sampled_df["message"])
    ]

    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2)
    print(f"Successfully sampled {n} emails to {output_path}")

if __name__ == "__main__":
    # Script entry point: sample the raw email CSV into the gold-standard
    # annotation template, using the default sample size of sample_emails.
    sample_emails(
        input_path="/root/test_gliner/data/raw/emails.csv",
        output_path="/root/test_gliner/data/processed/gold_standard_template.json",
    )