forked from jim-plus/llm-abliteration
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathjsonl_to_parquet.py
More file actions
104 lines (80 loc) · 3.67 KB
/
jsonl_to_parquet.py
File metadata and controls
104 lines (80 loc) · 3.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import pandas as pd
import json
import argparse
import sys
from pathlib import Path
def jsonl_to_parquet(input_file, output_file=None, chunk_size=10000):
    """
    Convert a JSONL file to Parquet format.

    Every non-empty line of the input is parsed as JSON. Lines that are not
    valid JSON, or that are valid JSON but not an object (e.g. a bare number
    or a list), are skipped with a warning that names the line number.

    Rows are turned into DataFrames ``chunk_size`` rows at a time so the list
    of raw Python dicts stays small, but all chunk DataFrames are kept and
    concatenated before writing, so the complete dataset is still held in
    memory once.

    Progress, warnings and errors are printed; failures are reported that
    way rather than raised.

    Args:
        input_file (str): Path to input JSONL file
        output_file (str, optional): Path to output Parquet file.
            If None, uses input filename with .parquet extension
        chunk_size (int): Number of rows to buffer before converting them
            to a DataFrame

    Returns:
        None
    """
    input_path = Path(input_file)
    if output_file is None:
        output_file = input_path.with_suffix('.parquet')
    output_path = Path(output_file)

    print(f"Converting {input_path} to {output_path}")

    # --- Read phase -------------------------------------------------------
    # Kept in its own try block so that "input file not found" is only ever
    # reported for a failure to open the input, never for a write failure.
    frames = []
    pending_rows = []
    total_rows = 0
    try:
        # 'utf-8-sig' drops a leading byte-order mark if present; with plain
        # 'utf-8' the BOM stays glued to the first line and json.loads
        # rejects that record as malformed.
        with open(input_path, 'r', encoding='utf-8-sig') as f:
            for line_num, line in enumerate(f, 1):
                line = line.strip()
                if not line:  # Skip empty lines
                    continue

                try:
                    row = json.loads(line)
                except json.JSONDecodeError as e:
                    print(f"Warning: Skipping malformed JSON at line {line_num}: {e}")
                    continue

                # Only JSON objects map onto table rows; a scalar or array
                # mixed into the row list would break the DataFrame build.
                if not isinstance(row, dict):
                    print(
                        f"Warning: Skipping non-object JSON at line {line_num}: "
                        f"expected an object, got {type(row).__name__}"
                    )
                    continue

                pending_rows.append(row)
                total_rows += 1

                # Flush the buffer once it reaches chunk_size rows
                if len(pending_rows) >= chunk_size:
                    frames.append(pd.DataFrame(pending_rows))
                    print(f"Processed {total_rows} lines...")
                    pending_rows = []

        # Convert whatever is left over after the last full chunk
        if pending_rows:
            frames.append(pd.DataFrame(pending_rows))
    except FileNotFoundError:
        print(f"Error: Input file '{input_file}' not found.")
        return
    except Exception as e:
        print(f"Error during conversion: {e}")
        return

    if not frames:
        print("Error: No valid data found in JSONL file.")
        return

    # --- Combine and write phase -----------------------------------------
    try:
        print(f"Combining {len(frames)} chunks...")
        df = pd.concat(frames, ignore_index=True)
        print(f"Created DataFrame with shape: {df.shape}")

        df.to_parquet(output_path, index=False, engine='pyarrow')
    except Exception as e:
        print(f"Error during conversion: {e}")
        return

    print(f"Successfully converted to {output_path}")
def main():
    """
    Command-line entry point: parse arguments and run the conversion.

    Accepts a required input path, an optional ``-o/--output`` path and an
    optional ``-c/--chunk-size`` row count (default 10000). A chunk size
    below 1 is rejected through ``parser.error``, which prints the usage
    message and exits with status 2.
    """
    parser = argparse.ArgumentParser(description="Convert JSONL file to Parquet format")
    parser.add_argument("input", help="Input JSONL file path")
    parser.add_argument("-o", "--output", help="Output Parquet file path (optional)")
    parser.add_argument("-c", "--chunk-size", type=int, default=10000,
                        help="Chunk size for processing (default: 10000)")

    args = parser.parse_args()

    # A zero or negative chunk size would flush the buffer after every single
    # row (one DataFrame per row); fail fast with a clear usage error instead.
    if args.chunk_size < 1:
        parser.error("--chunk-size must be a positive integer")

    jsonl_to_parquet(args.input, args.output, args.chunk_size)
if __name__ == "__main__":
    if len(sys.argv) > 1:
        # Arguments were supplied: hand over to the argparse-based CLI.
        main()
    else:
        # No arguments: only describe the built-in example paths. Nothing is
        # converted here; to run the example, call
        # jsonl_to_parquet(input_file, output_file) after the prints below.
        input_file = "example.jsonl"
        output_file = "example.parquet"  # None would derive it from input_file

        for message in (
            "Running in example mode...",
            "Modify the input_file and output_file variables in the script",
            f"Current input_file: {input_file}",
            f"Current output_file: {output_file}",
        ):
            print(message)