
Commit 12f311c

add dataset creation
1 parent fd3f676 commit 12f311c

File tree

7 files changed (+4466, -2 lines)


fastapi/app/ai/dataset/__init__.py

Whitespace-only changes.

fastapi/app/ai/dataset/dataset.json

Lines changed: 3536 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 186 additions & 0 deletions
@@ -0,0 +1,186 @@
import json
import sys
from pathlib import Path
from refine_compare import semantic_compare

def read_text_file(filepath):
    """Read and return the contents of a text file."""
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return f.read()
    except Exception as e:
        print(f"Error reading file {filepath}: {e}")
        sys.exit(1)


def align_sentences_semantically(original_text, translated_text, source_lang, target_lang, sim_threshold=0.75):
    """
    Use semantic comparison to get sentence pairs from both texts.
    Returns the sentences from the semantic_compare function.
    """
    result = semantic_compare(
        original_blob=original_text,
        translated_blob=translated_text,
        source_language=source_lang,
        target_language=target_lang,
        sim_threshold=sim_threshold
    )

    if not result["success"]:
        print("Warning: Semantic comparison was not fully successful")
        print("Continuing with available data...")

    return result["original_sentences"], result["translated_sentences"]

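
# ---------------------------------------------------------------------------
# Note: `refine_compare.semantic_compare` is project-internal and not part of
# this diff. The contract assumed above (inferred from usage, not confirmed
# by this commit) is that it returns a dict of the form:
#
#     {
#         "success": bool,                    # alignment completed cleanly
#         "original_sentences": list[str],    # source-language sentences
#         "translated_sentences": list[str],  # index-aligned translations
#     }
#
# where the two lists pair up by index for matches scoring at or above
# `sim_threshold`.
# ---------------------------------------------------------------------------
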
def create_qna_pairs(original_sentences, translated_sentences, source_lang, target_lang):
    """
    Create QnA pairs from original and translated sentences.

    Returns a list of QnA dictionaries with:
    - Individual sentence pairs (1:1)
    - Batched sentence pairs (10:10)
    - Both directions (original->translated and translated->original)

    Optimized to process each sentence only once.
    """
    qna_dataset = []

    # Determine the minimum length to avoid index errors
    min_length = min(len(original_sentences), len(translated_sentences))

    # Process all individual sentences in one loop
    for i in range(min_length):
        # Original -> Translated
        qna_dataset.append({
            "question": f"Translate this article from {source_lang} to {target_lang}: {original_sentences[i]}",
            "answer": translated_sentences[i],
            "type": "single_sentence",
            "direction": f"{source_lang}_to_{target_lang}",
            "index": i
        })

        # Translated -> Original (Reverse)
        qna_dataset.append({
            "question": f"Translate this article from {target_lang} to {source_lang}: {translated_sentences[i]}",
            "answer": original_sentences[i],
            "type": "single_sentence",
            "direction": f"{target_lang}_to_{source_lang}",
            "index": i
        })

    # Process batched sentences
    batch_size = 10
    for i in range(0, min_length, batch_size):
        end_idx = min(i + batch_size, min_length)

        # Combine sentences once
        original_batch = " ".join(original_sentences[i:end_idx])
        translated_batch = " ".join(translated_sentences[i:end_idx])

        # Original -> Translated
        qna_dataset.append({
            "question": f"Translate this article from {source_lang} to {target_lang}: {original_batch}",
            "answer": translated_batch,
            "type": "batch_sentences",
            "direction": f"{source_lang}_to_{target_lang}",
            "batch_start": i,
            "batch_end": end_idx,
            "batch_size": end_idx - i
        })

        # Translated -> Original (Reverse) - reuse the same batch strings
        qna_dataset.append({
            "question": f"Translate this article from {target_lang} to {source_lang}: {translated_batch}",
            "answer": original_batch,
            "type": "batch_sentences",
            "direction": f"{target_lang}_to_{source_lang}",
            "batch_start": i,
            "batch_end": end_idx,
            "batch_size": end_idx - i
        })

    return qna_dataset

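
# Worked example (illustrative, not from a real run): with 25 aligned
# sentence pairs, create_qna_pairs yields 2 * 25 = 50 single-sentence
# entries (one per direction) plus 2 * 3 = 6 batch entries (batches of
# 10, 10, and 5 sentences, each in both directions): 56 QnA pairs total.
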
def generate_dataset(original_file, translated_file, source_lang, target_lang,
                     sim_threshold=0.75, output_file="dataset.json"):
    """
    Main function to generate QnA dataset from two text files using semantic comparison.
    """
    print(f"Reading original file: {original_file}")
    original_text = read_text_file(original_file)

    print(f"Reading translated file: {translated_file}")
    translated_text = read_text_file(translated_file)

    print(f"Performing semantic comparison between {source_lang} and {target_lang}...")
    print(f"Using similarity threshold: {sim_threshold}")

    original_sentences, translated_sentences = align_sentences_semantically(
        original_text,
        translated_text,
        source_lang,
        target_lang,
        sim_threshold
    )

    print(f"Found {len(original_sentences)} sentences in original text")
    print(f"Found {len(translated_sentences)} sentences in translated text")

    print("Generating QnA pairs...")
    qna_dataset = create_qna_pairs(original_sentences, translated_sentences, source_lang, target_lang)

    # Create output structure
    output_data = {
        "metadata": {
            "source_language": source_lang,
            "target_language": target_lang,
            "original_file": str(original_file),
            "translated_file": str(translated_file),
            "similarity_threshold": sim_threshold,
            "original_sentence_count": len(original_sentences),
            "translated_sentence_count": len(translated_sentences),
            "total_qna_pairs": len(qna_dataset)
        },
        "qna_pairs": qna_dataset
    }

    print(f"Writing dataset to {output_file}...")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)

    print(f"✓ Successfully generated dataset with {len(qna_dataset)} QnA pairs")
    print(f"  - Single sentence pairs: {len([q for q in qna_dataset if q['type'] == 'single_sentence'])}")
    print(f"  - Batch sentence pairs: {len([q for q in qna_dataset if q['type'] == 'batch_sentences'])}")

    return output_data

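
# For reference, dataset.json comes out shaped like this (values are
# illustrative placeholders, not output from a real run):
#
#     {
#       "metadata": {
#         "source_language": "en",
#         "target_language": "es",
#         ...,
#         "total_qna_pairs": 56
#       },
#       "qna_pairs": [
#         {
#           "question": "Translate this article from en to es: ...",
#           "answer": "...",
#           "type": "single_sentence",
#           "direction": "en_to_es",
#           "index": 0
#         },
#         ...
#       ]
#     }
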
def main():
    # ============================================
    # CONFIGURE YOUR INPUT FILES HERE
    # ============================================

    original_file = "door_eng.txt"        # Path to original text file
    translated_file = "door_spanish.txt"  # Path to translated text file
    source_lang = "en"                    # Source language code
    target_lang = "es"                    # Target language code
    sim_threshold = 0.75                  # Similarity threshold for semantic matching (0.0-1.0)
    output_file = "dataset.json"          # Output JSON file path

    # ============================================

    generate_dataset(
        original_file,
        translated_file,
        source_lang,
        target_lang,
        sim_threshold,
        output_file
    )


if __name__ == "__main__":
    main()
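
A quick sanity check of the generated file (a minimal sketch; it assumes the script has been run so that dataset.json exists in the working directory):

import json

with open("dataset.json", encoding="utf-8") as f:
    data = json.load(f)

# Confirm the pair count recorded in metadata matches the actual list
assert data["metadata"]["total_qna_pairs"] == len(data["qna_pairs"])
print(data["qna_pairs"][0]["question"][:80])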
