Skip to content

Commit def7906

Browse files
authored
Merge pull request #1652 from yudhiesh/feat/add-jsonl-as-file-format
feat: add JSONL file format for saving data
2 parents 1a5f92e + ad8f5a3 commit def7906

3 files changed

Lines changed: 41 additions & 24 deletions

File tree

deepeval/dataset/dataset.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
)
3535
from deepeval.utils import convert_keys_to_snake_case, is_confident
3636

37-
valid_file_types = ["csv", "json"]
37+
valid_file_types = ["csv", "json", "jsonl"]
3838

3939

4040
def validate_test_case_type(
@@ -836,7 +836,7 @@ def generate_goldens_from_scratch(
836836

837837
def save_as(
838838
self,
839-
file_type: Literal["json", "csv"],
839+
file_type: Literal["json", "csv", "jsonl"],
840840
directory: str,
841841
include_test_cases: bool = False,
842842
) -> str:
@@ -924,6 +924,18 @@ def save_as(
924924
golden.source_file,
925925
]
926926
)
927+
elif file_type == "jsonl":
928+
with open(full_file_path, "w", encoding="utf-8") as file:
929+
for golden in goldens:
930+
record = {
931+
"input": golden.input,
932+
"actual_output": golden.actual_output,
933+
"expected_output": golden.expected_output,
934+
"retrieval_context": golden.retrieval_context,
935+
"context": golden.context,
936+
"source_file": golden.source_file,
937+
}
938+
file.write(json.dumps(record, ensure_ascii=False) + "\n")
927939

928940
print(f"Evaluation dataset saved at {full_file_path}!")
929941
return full_file_path

deepeval/synthesizer/synthesizer.py

Lines changed: 25 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@
5252
CreateDatasetHttpResponse,
5353
)
5454

55-
valid_file_types = ["csv", "json"]
55+
valid_file_types = ["csv", "json", "jsonl"]
5656

5757
evolution_map = {
5858
"Reasoning": EvolutionTemplate.reasoning_evolution,
@@ -216,13 +216,15 @@ async def a_generate_goldens_from_docs(
216216
similarity_threshold=context_construction_config.context_similarity_threshold,
217217
max_retries=context_construction_config.max_retries,
218218
)
219-
contexts, source_files, context_scores = (
220-
await context_generator.a_generate_contexts(
221-
max_contexts_per_source_file=context_construction_config.max_contexts_per_document,
222-
min_contexts_per_source_file=context_construction_config.min_contexts_per_document,
223-
max_context_size=context_construction_config.max_context_length,
224-
min_context_size=context_construction_config.min_context_length,
225-
)
219+
(
220+
contexts,
221+
source_files,
222+
context_scores,
223+
) = await context_generator.a_generate_contexts(
224+
max_contexts_per_source_file=context_construction_config.max_contexts_per_document,
225+
min_contexts_per_source_file=context_construction_config.min_contexts_per_document,
226+
max_context_size=context_construction_config.max_context_length,
227+
min_context_size=context_construction_config.min_context_length,
226228
)
227229
if self.synthesis_cost:
228230
self.synthesis_cost += context_generator.total_cost
@@ -297,7 +299,6 @@ def generate_goldens_from_contexts(
297299
async_mode=False,
298300
) as progress_bar:
299301
for i, context in enumerate(contexts):
300-
301302
# Generate inputs
302303
prompt = SynthesizerTemplate.generate_synthetic_inputs(
303304
context=context,
@@ -315,7 +316,6 @@ def generate_goldens_from_contexts(
315316
context, synthetic_inputs
316317
)
317318
for j, data in enumerate(qualified_synthetic_inputs):
318-
319319
# Evolve input
320320
evolved_input, evolutions_used = self._evolve_input(
321321
input=data.input,
@@ -458,7 +458,6 @@ async def _a_generate_from_context(
458458
context, synthetic_inputs
459459
)
460460
for i, data in enumerate(qualified_synthetic_inputs):
461-
462461
# Evolve input
463462
evolved_input, evolutions_used = await self._a_evolve_input(
464463
input=data.input,
@@ -536,7 +535,6 @@ async def _a_generate_text_to_sql_from_context(
536535
prompt
537536
)
538537
for data in synthetic_inputs:
539-
540538
# Generate expected output
541539
expected_output = None
542540
if include_expected_output:
@@ -592,7 +590,6 @@ async def a_generate_goldens_from_scratch(
592590
progress_bar=None,
593591
async_mode=True,
594592
) as progress_bar:
595-
596593
# Generate inputs
597594
prompt = PromptSynthesizerTemplate.generate_synthetic_prompts(
598595
scenario=self.styling_config.scenario,
@@ -665,7 +662,6 @@ def generate_goldens_from_scratch(
665662
progress_bar=None,
666663
async_mode=False,
667664
) as progress_bar:
668-
669665
# Generate inputs
670666
prompt = PromptSynthesizerTemplate.generate_synthetic_prompts(
671667
scenario=self.styling_config.scenario,
@@ -728,7 +724,6 @@ def generate_goldens_from_goldens(
728724
max_goldens_per_golden: int = 2,
729725
include_expected_output: bool = True,
730726
) -> List[Golden]:
731-
732727
if self.async_mode:
733728
loop = get_or_create_event_loop()
734729
return loop.run_until_complete(
@@ -850,14 +845,12 @@ async def _a_rewrite_inputs(
850845
context: List[str],
851846
inputs: List[SyntheticData],
852847
) -> Tuple[List[SyntheticData], List[float]]:
853-
854848
# Evaluate input quality
855849
scores = []
856850
filtered_inputs = []
857851
for item in inputs:
858852
input = item.input
859853
for _ in range(self.filtration_config.max_quality_retries):
860-
861854
# Evaluate synthetically generated inputs
862855
evaluation_prompt = FilterTemplate.evaluate_synthetic_inputs(
863856
input
@@ -895,14 +888,12 @@ def _rewrite_inputs(
895888
context: List[str],
896889
inputs: List[SyntheticData],
897890
) -> Tuple[List[SyntheticData], List[float]]:
898-
899891
# Evaluate input quality
900892
scores = []
901893
filtered_inputs = []
902894
for item in inputs:
903895
input = item.input
904896
for _ in range(self.filtration_config.max_quality_retries):
905-
906897
# Evaluate synthetically generated inputs
907898
evaluation_prompt = FilterTemplate.evaluate_synthetic_inputs(
908899
input
@@ -1216,7 +1207,7 @@ def push(
12161207

12171208
def save_as(
12181209
self,
1219-
file_type: Literal["json", "csv"],
1210+
file_type: Literal["json", "csv", "jsonl"],
12201211
directory: str,
12211212
file_name: Optional[str] = None,
12221213
quiet: bool = False,
@@ -1240,8 +1231,9 @@ def save_as(
12401231
"""
12411232
if str(file_type).lower() not in valid_file_types:
12421233
raise ValueError(
1243-
"Invalid file type. Available file types to save as: "
1244-
", ".join(type for type in valid_file_types)
1234+
"Invalid file type. Available file types to save as: , ".join(
1235+
type for type in valid_file_types
1236+
)
12451237
)
12461238

12471239
if file_name and "." in file_name:
@@ -1301,6 +1293,17 @@ def save_as(
13011293
golden.source_file,
13021294
]
13031295
)
1296+
elif file_type == "jsonl":
1297+
with open(full_file_path, "w", encoding="utf-8") as file:
1298+
for golden in self.synthetic_goldens:
1299+
record = {
1300+
"input": golden.input,
1301+
"actual_output": golden.actual_output,
1302+
"expected_output": golden.expected_output,
1303+
"context": golden.context,
1304+
"source_file": golden.source_file,
1305+
}
1306+
file.write(json.dumps(record, ensure_ascii=False) + "\n")
13041307
if not quiet:
13051308
print(f"Synthetic goldens saved at {full_file_path}!")
13061309

tests/test_synthesizer.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -365,6 +365,8 @@ def test_save_goldens(synthesizer: Synthesizer, file_type: str):
365365
synthesizer.save_as("csv", "./goldens")
366366
elif file_type == "json":
367367
synthesizer.save_as("json", "./goldens")
368+
elif file_type == "jsonl":
369+
synthesizer.save_as("jsonl", "./goldens")
368370

369371

370372
@pytest.fixture

0 commit comments

Comments (0)