import json
import sys
from pathlib import Path
from refine_compare import semantic_compare
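# NOTE: semantic_compare comes from the local refine_compare module (not shown here).
# Based on how it is used below, it is assumed to accept the keyword arguments
# original_blob, translated_blob, source_language, target_language and sim_threshold,
# and to return a dict containing at least the keys "success", "original_sentences"
# and "translated_sentences".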


def read_text_file(filepath):
    """Read and return the contents of a text file."""
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return f.read()
    except Exception as e:
        print(f"Error reading file {filepath}: {e}")
        sys.exit(1)


def align_sentences_semantically(original_text, translated_text, source_lang, target_lang, sim_threshold=0.75):
    """
    Use semantic comparison to get sentence pairs from both texts.
    Returns the sentences from the semantic_compare function.
    """
    result = semantic_compare(
        original_blob=original_text,
        translated_blob=translated_text,
        source_language=source_lang,
        target_language=target_lang,
        sim_threshold=sim_threshold
    )

    if not result["success"]:
        print("Warning: Semantic comparison was not fully successful")
        print("Continuing with available data...")

    return result["original_sentences"], result["translated_sentences"]
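
# Note: create_qna_pairs below assumes the two lists returned here are aligned by
# index, i.e. original_sentences[i] and translated_sentences[i] form a matched
# sentence pair, which relies on semantic_compare returning them in that order.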


def create_qna_pairs(original_sentences, translated_sentences, source_lang, target_lang):
    """
    Create QnA pairs from original and translated sentences.

    Returns a list of QnA dictionaries with:
    - Individual sentence pairs (1:1)
    - Batched sentence pairs (10:10)
    - Both directions (original->translated and translated->original)

    Optimized to process each sentence only once.
    """
    qna_dataset = []

    # Determine the minimum length to avoid index errors
    min_length = min(len(original_sentences), len(translated_sentences))

    # Process all individual sentences in one loop
    for i in range(min_length):
        # Original -> Translated
        qna_dataset.append({
            "question": f"Translate this article from {source_lang} to {target_lang}: {original_sentences[i]}",
            "answer": translated_sentences[i],
            "type": "single_sentence",
            "direction": f"{source_lang}_to_{target_lang}",
            "index": i
        })

        # Translated -> Original (reverse)
        qna_dataset.append({
            "question": f"Translate this article from {target_lang} to {source_lang}: {translated_sentences[i]}",
            "answer": original_sentences[i],
            "type": "single_sentence",
            "direction": f"{target_lang}_to_{source_lang}",
            "index": i
        })

    # Process batched sentences
    batch_size = 10
    for i in range(0, min_length, batch_size):
        end_idx = min(i + batch_size, min_length)

        # Combine sentences once
        original_batch = " ".join(original_sentences[i:end_idx])
        translated_batch = " ".join(translated_sentences[i:end_idx])

        # Original -> Translated
        qna_dataset.append({
            "question": f"Translate this article from {source_lang} to {target_lang}: {original_batch}",
            "answer": translated_batch,
            "type": "batch_sentences",
            "direction": f"{source_lang}_to_{target_lang}",
            "batch_start": i,
            "batch_end": end_idx,
            "batch_size": end_idx - i
        })

        # Translated -> Original (reverse) - reuse the same batch strings
        qna_dataset.append({
            "question": f"Translate this article from {target_lang} to {source_lang}: {translated_batch}",
            "answer": original_batch,
            "type": "batch_sentences",
            "direction": f"{target_lang}_to_{source_lang}",
            "batch_start": i,
            "batch_end": end_idx,
            "batch_size": end_idx - i
        })

    return qna_dataset
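
# Illustrative shape of a single entry produced by create_qna_pairs (the text values
# are hypothetical and depend entirely on the input files and language codes):
# {
#     "question": "Translate this article from en to es: The door was open.",
#     "answer": "La puerta estaba abierta.",
#     "type": "single_sentence",
#     "direction": "en_to_es",
#     "index": 0
# }
# For N aligned sentence pairs, the function yields 2*N single-sentence entries plus
# 2*ceil(N/10) batch entries (one per direction in each case).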


def generate_dataset(original_file, translated_file, source_lang, target_lang,
                     sim_threshold=0.75, output_file="dataset.json"):
    """
    Main function to generate a QnA dataset from two text files using semantic comparison.
    """
    print(f"Reading original file: {original_file}")
    original_text = read_text_file(original_file)

    print(f"Reading translated file: {translated_file}")
    translated_text = read_text_file(translated_file)

    print(f"Performing semantic comparison between {source_lang} and {target_lang}...")
    print(f"Using similarity threshold: {sim_threshold}")

    original_sentences, translated_sentences = align_sentences_semantically(
        original_text,
        translated_text,
        source_lang,
        target_lang,
        sim_threshold
    )

    print(f"Found {len(original_sentences)} sentences in original text")
    print(f"Found {len(translated_sentences)} sentences in translated text")

    print("Generating QnA pairs...")
    qna_dataset = create_qna_pairs(original_sentences, translated_sentences, source_lang, target_lang)

    # Create output structure
    output_data = {
        "metadata": {
            "source_language": source_lang,
            "target_language": target_lang,
            "original_file": str(original_file),
            "translated_file": str(translated_file),
            "similarity_threshold": sim_threshold,
            "original_sentence_count": len(original_sentences),
            "translated_sentence_count": len(translated_sentences),
            "total_qna_pairs": len(qna_dataset)
        },
        "qna_pairs": qna_dataset
    }

    print(f"Writing dataset to {output_file}...")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)

    print(f"✓ Successfully generated dataset with {len(qna_dataset)} QnA pairs")
    print(f"  - Single sentence pairs: {len([q for q in qna_dataset if q['type'] == 'single_sentence'])}")
    print(f"  - Batch sentence pairs: {len([q for q in qna_dataset if q['type'] == 'batch_sentences'])}")

    return output_data
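
# generate_dataset can also be imported and called directly from other code, e.g.
# (hypothetical file names; any language codes accepted by refine_compare work):
#   generate_dataset("article_fr.txt", "article_de.txt", "fr", "de",
#                    sim_threshold=0.8, output_file="fr_de_dataset.json")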


def main():
    # ============================================
    # CONFIGURE YOUR INPUT FILES HERE
    # ============================================

    original_file = "door_eng.txt"        # Path to original text file
    translated_file = "door_spanish.txt"  # Path to translated text file
    source_lang = "en"                    # Source language code
    target_lang = "es"                    # Target language code
    sim_threshold = 0.75                  # Similarity threshold for semantic matching (0.0-1.0)
    output_file = "dataset.json"          # Output JSON file path

    # ============================================

    generate_dataset(
        original_file,
        translated_file,
        source_lang,
        target_lang,
        sim_threshold,
        output_file
    )


if __name__ == "__main__":
    main()