-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsample_for_annotation.py
More file actions
36 lines (29 loc) · 1.1 KB
/
sample_for_annotation.py
File metadata and controls
36 lines (29 loc) · 1.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import pandas as pd
import os
import json
def sample_emails(input_path: str, output_path: str, n: int = 100) -> None:
    """Write a random sample of emails to a JSON annotation template.

    Reads the CSV at *input_path*, draws up to *n* rows with a fixed seed
    (so reruns select the same rows), and writes a JSON list to
    *output_path*. Each entry holds the row's index label as ``id``, the
    stringified ``message`` column as ``text``, and an empty ``entities``
    list.

    If *input_path* does not exist, an error is printed and nothing is
    written.

    Args:
        input_path: Path to the source CSV; it must have a ``message`` column.
        output_path: Destination JSON file. Missing parent directories are
            created; a bare filename is written to the current directory.
        n: Maximum number of rows to sample; capped at the number of rows
            in the CSV.

    Raises:
        KeyError: If the CSV has no ``message`` column.
    """
    if not os.path.exists(input_path):
        print(f"Error: {input_path} not found")
        return

    # Using low_memory=False to handle potential mixed types in large csv
    df = pd.read_csv(input_path, low_memory=False)

    # Never ask for more rows than exist; DataFrame.sample would raise.
    n = min(n, len(df))
    sampled_df = df.sample(n=n, random_state=42)

    # Enron csv typically has 'message' column containing full email
    records = [
        {"id": int(idx), "text": str(message), "entities": []}
        for idx, message in zip(sampled_df.index, sampled_df["message"])
    ]

    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when one is named.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2)

    print(f"Successfully sampled {n} emails to {output_path}")
if __name__ == "__main__":
    # Script entry point: sample from the raw email CSV into the JSON
    # template file, using the function's default sample size.
    raw_path = "/root/test_gliner/data/raw/emails.csv"
    output_path = "/root/test_gliner/data/processed/gold_standard_template.json"
    sample_emails(input_path=raw_path, output_path=output_path)