
Commit bd9d6ed (1 parent: 0c8ff19)

Add download and extraction functions to parquet_to_json script; create requirements.txt
2 files changed: 72 additions, 4 deletions

  scripts/parquet_to_json.py
  scripts/requirements.txt

scripts/parquet_to_json.py

Lines changed: 60 additions & 4 deletions
@@ -1,5 +1,27 @@
 import os
+import argparse
 import pyarrow.parquet as pq
+import requests
+import zipfile
+
+
+def download_file(url, output_path):
+    """Download a file from a URL to a specified path."""
+    print(f"Downloading dataset from {url} to {output_path}...")
+    response = requests.get(url, stream=True)
+    response.raise_for_status()
+    with open(output_path, "wb") as f:
+        for chunk in response.iter_content(chunk_size=8192):
+            f.write(chunk)
+    print("Download complete.")
+
+
+def extract_zip(zip_path, extract_to):
+    """Extract a ZIP file to a specified directory."""
+    print(f"Extracting {zip_path} to {extract_to}...")
+    with zipfile.ZipFile(zip_path, "r") as zip_ref:
+        zip_ref.extractall(extract_to)
+    print("Extraction complete.")
 
 
 def convert_parquet_to_json_streaming(
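
The two helpers above compose into a simple fetch-and-unpack step. The snippet below is a minimal, hypothetical usage sketch, not part of the commit: the archive URL and local paths are placeholders, and it assumes the script can be imported as a module.

    import os

    from parquet_to_json import download_file, extract_zip  # assumes the script is importable

    archive_url = "https://example.com/dataset.zip"  # placeholder URL, not from the commit
    zip_path = "downloads/dataset.zip"               # placeholder local paths
    extract_dir = "downloads/extracted"

    os.makedirs(extract_dir, exist_ok=True)  # also creates the parent "downloads" directory
    download_file(archive_url, zip_path)     # streams the archive to disk in 8 KiB chunks
    extract_zip(zip_path, extract_dir)       # unpacks every archive member into extract_dir
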
@@ -30,8 +52,42 @@ def convert_parquet_to_json_streaming(
 
 
 if __name__ == "__main__":
-    directory = "samples/wikipedia/"
-    filenames = ["a.parquet", "b.parquet", "c.parquet", "d.parquet"]
-    output_file = "samples/wikipedia.json"
+    parser = argparse.ArgumentParser(
+        description="Download, extract, and convert Parquet files to JSON Lines format."
+    )
+    parser.add_argument(
+        "--download_url", required=True, help="URL to download the dataset."
+    )
+    parser.add_argument(
+        "--download_dir", required=True, help="Directory to save the downloaded file."
+    )
+    parser.add_argument(
+        "--extract_dir", required=True, help="Directory to extract the dataset."
+    )
+    parser.add_argument(
+        "--output_file",
+        required=True,
+        help="Output JSON file (JSON Lines format).",
+    )
+    parser.add_argument(
+        "--include_files",
+        default="",
+        help="Comma-separated list of Parquet files to include in the JSON output.",
+    )
+    args = parser.parse_args()
+
+    # Step 1: Download the dataset
+    zip_file_path = os.path.join(args.download_dir, "dataset.zip")
+    # download_file(args.download_url, zip_file_path)
+
+    # # Step 2: Extract the dataset
+    # extract_zip(zip_file_path, args.extract_dir)
+
+    # Step 3: Convert Parquet files to JSON
+    include_files = args.include_files.split(",") if args.include_files else None
+    if include_files:
+        filenames = [f for f in os.listdir(args.extract_dir) if f in include_files]
+    else:
+        filenames = [f for f in os.listdir(args.extract_dir) if f.endswith(".parquet")]
 
-    convert_parquet_to_json_streaming(directory, filenames, output_file)
+    convert_parquet_to_json_streaming(args.extract_dir, filenames, args.output_file)
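
Note that in the new main block the download (Step 1) and extraction (Step 2) calls are commented out, so --extract_dir must already contain the Parquet files for a run to succeed. The conversion itself is delegated to convert_parquet_to_json_streaming, whose body lies outside this diff. As a rough sketch of what a streaming Parquet-to-JSON-Lines conversion can look like with pyarrow (the name and call signature come from the diff; the body below is an assumption, not the repository's implementation):

    import json
    import os

    import pyarrow.parquet as pq


    def convert_parquet_to_json_streaming(directory, filenames, output_file):
        """Sketch only: stream each Parquet file batch by batch and write JSON Lines."""
        with open(output_file, "w", encoding="utf-8") as out:
            for name in filenames:
                parquet_file = pq.ParquetFile(os.path.join(directory, name))
                # iter_batches avoids loading an entire file into memory at once
                for batch in parquet_file.iter_batches(batch_size=1024):
                    for row in batch.to_pylist():
                        out.write(json.dumps(row, ensure_ascii=False) + "\n")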

scripts/requirements.txt

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+certifi==2025.6.15
+charset-normalizer==3.4.2
+idna==3.10
+numpy==2.3.1
+pandas==2.3.0
+pyarrow==20.0.0
+python-dateutil==2.9.0.post0
+pytz==2025.2
+requests==2.32.4
+six==1.17.0
+tzdata==2025.2
+urllib3==2.5.0
