5252 CreateDatasetHttpResponse ,
5353)
5454
55- valid_file_types = ["csv" , "json" ]
55+ valid_file_types = ["csv" , "json" , "jsonl" ]
5656
5757evolution_map = {
5858 "Reasoning" : EvolutionTemplate .reasoning_evolution ,
@@ -216,13 +216,15 @@ async def a_generate_goldens_from_docs(
216216 similarity_threshold = context_construction_config .context_similarity_threshold ,
217217 max_retries = context_construction_config .max_retries ,
218218 )
219- contexts , source_files , context_scores = (
220- await context_generator .a_generate_contexts (
221- max_contexts_per_source_file = context_construction_config .max_contexts_per_document ,
222- min_contexts_per_source_file = context_construction_config .min_contexts_per_document ,
223- max_context_size = context_construction_config .max_context_length ,
224- min_context_size = context_construction_config .min_context_length ,
225- )
219+ (
220+ contexts ,
221+ source_files ,
222+ context_scores ,
223+ ) = await context_generator .a_generate_contexts (
224+ max_contexts_per_source_file = context_construction_config .max_contexts_per_document ,
225+ min_contexts_per_source_file = context_construction_config .min_contexts_per_document ,
226+ max_context_size = context_construction_config .max_context_length ,
227+ min_context_size = context_construction_config .min_context_length ,
226228 )
227229 if self .synthesis_cost :
228230 self .synthesis_cost += context_generator .total_cost
@@ -297,7 +299,6 @@ def generate_goldens_from_contexts(
297299 async_mode = False ,
298300 ) as progress_bar :
299301 for i , context in enumerate (contexts ):
300-
301302 # Generate inputs
302303 prompt = SynthesizerTemplate .generate_synthetic_inputs (
303304 context = context ,
@@ -315,7 +316,6 @@ def generate_goldens_from_contexts(
315316 context , synthetic_inputs
316317 )
317318 for j , data in enumerate (qualified_synthetic_inputs ):
318-
319319 # Evolve input
320320 evolved_input , evolutions_used = self ._evolve_input (
321321 input = data .input ,
@@ -458,7 +458,6 @@ async def _a_generate_from_context(
458458 context , synthetic_inputs
459459 )
460460 for i , data in enumerate (qualified_synthetic_inputs ):
461-
462461 # Evolve input
463462 evolved_input , evolutions_used = await self ._a_evolve_input (
464463 input = data .input ,
@@ -536,7 +535,6 @@ async def _a_generate_text_to_sql_from_context(
536535 prompt
537536 )
538537 for data in synthetic_inputs :
539-
540538 # Generate expected output
541539 expected_output = None
542540 if include_expected_output :
@@ -592,7 +590,6 @@ async def a_generate_goldens_from_scratch(
592590 progress_bar = None ,
593591 async_mode = True ,
594592 ) as progress_bar :
595-
596593 # Generate inputs
597594 prompt = PromptSynthesizerTemplate .generate_synthetic_prompts (
598595 scenario = self .styling_config .scenario ,
@@ -665,7 +662,6 @@ def generate_goldens_from_scratch(
665662 progress_bar = None ,
666663 async_mode = False ,
667664 ) as progress_bar :
668-
669665 # Generate inputs
670666 prompt = PromptSynthesizerTemplate .generate_synthetic_prompts (
671667 scenario = self .styling_config .scenario ,
@@ -728,7 +724,6 @@ def generate_goldens_from_goldens(
728724 max_goldens_per_golden : int = 2 ,
729725 include_expected_output : bool = True ,
730726 ) -> List [Golden ]:
731-
732727 if self .async_mode :
733728 loop = get_or_create_event_loop ()
734729 return loop .run_until_complete (
@@ -850,14 +845,12 @@ async def _a_rewrite_inputs(
850845 context : List [str ],
851846 inputs : List [SyntheticData ],
852847 ) -> Tuple [List [SyntheticData ], List [float ]]:
853-
854848 # Evaluate input quality
855849 scores = []
856850 filtered_inputs = []
857851 for item in inputs :
858852 input = item .input
859853 for _ in range (self .filtration_config .max_quality_retries ):
860-
861854 # Evaluate synthetically generated inputs
862855 evaluation_prompt = FilterTemplate .evaluate_synthetic_inputs (
863856 input
@@ -895,14 +888,12 @@ def _rewrite_inputs(
895888 context : List [str ],
896889 inputs : List [SyntheticData ],
897890 ) -> Tuple [List [SyntheticData ], List [float ]]:
898-
899891 # Evaluate input quality
900892 scores = []
901893 filtered_inputs = []
902894 for item in inputs :
903895 input = item .input
904896 for _ in range (self .filtration_config .max_quality_retries ):
905-
906897 # Evaluate synthetically generated inputs
907898 evaluation_prompt = FilterTemplate .evaluate_synthetic_inputs (
908899 input
@@ -1216,7 +1207,7 @@ def push(
12161207
12171208 def save_as (
12181209 self ,
1219- file_type : Literal ["json" , "csv" ],
1210+ file_type : Literal ["json" , "csv" , "jsonl" ],
12201211 directory : str ,
12211212 file_name : Optional [str ] = None ,
12221213 quiet : bool = False ,
@@ -1240,8 +1231,9 @@ def save_as(
12401231 """
12411232 if str (file_type ).lower () not in valid_file_types :
12421233 raise ValueError (
1243- "Invalid file type. Available file types to save as: "
1244- ", " .join (type for type in valid_file_types )
1234+ "Invalid file type. Available file types to save as: , " .join (
1235+ type for type in valid_file_types
1236+ )
12451237 )
12461238
12471239 if file_name and "." in file_name :
@@ -1301,6 +1293,17 @@ def save_as(
13011293 golden .source_file ,
13021294 ]
13031295 )
1296+ elif file_type == "jsonl" :
1297+ with open (full_file_path , "w" , encoding = "utf-8" ) as file :
1298+ for golden in self .synthetic_goldens :
1299+ record = {
1300+ "input" : golden .input ,
1301+ "actual_output" : golden .actual_output ,
1302+ "expected_output" : golden .expected_output ,
1303+ "context" : golden .context ,
1304+ "source_file" : golden .source_file ,
1305+ }
1306+ file .write (json .dumps (record , ensure_ascii = False ) + "\n " )
13041307 if not quiet :
13051308 print (f"Synthetic goldens saved at { full_file_path } !" )
13061309
0 commit comments