@@ -1,5 +1,27 @@
 import os
+import argparse
 import pyarrow.parquet as pq
+import requests
+import zipfile
+
+
+def download_file(url, output_path):
+    """Download a file from a URL to a specified path."""
+    print(f"Downloading dataset from {url} to {output_path}...")
+    response = requests.get(url, stream=True)
+    response.raise_for_status()
+    with open(output_path, "wb") as f:
+        for chunk in response.iter_content(chunk_size=8192):
+            f.write(chunk)
+    print("Download complete.")
+
+
+def extract_zip(zip_path, extract_to):
+    """Extract a ZIP file to a specified directory."""
+    print(f"Extracting {zip_path} to {extract_to}...")
+    with zipfile.ZipFile(zip_path, "r") as zip_ref:
+        zip_ref.extractall(extract_to)
+    print("Extraction complete.")
 
 
 def convert_parquet_to_json_streaming(
@@ -30,8 +52,43 @@ def convert_parquet_to_json_streaming(
 
 
 if __name__ == "__main__":
-    directory = "samples/wikipedia/"
-    filenames = ["a.parquet", "b.parquet", "c.parquet", "d.parquet"]
-    output_file = "samples/wikipedia.json"
+    parser = argparse.ArgumentParser(
+        description="Download, extract, and convert Parquet files to JSON Lines format."
+    )
+    parser.add_argument(
+        "--download_url", required=True, help="URL to download the dataset."
+    )
+    parser.add_argument(
+        "--download_dir", required=True, help="Directory to save the downloaded file."
+    )
+    parser.add_argument(
+        "--extract_dir", required=True, help="Directory to extract the dataset."
+    )
+    parser.add_argument(
+        "--output_file",
+        required=True,
+        help="Output JSON file (JSON Lines format).",
+    )
+    parser.add_argument(
+        "--include_files",
+        default="",
+        help="Comma-separated list of Parquet files to include in the JSON output.",
+    )
+    args = parser.parse_args()
+
+    # Step 1: Download the dataset
+    os.makedirs(args.download_dir, exist_ok=True)
+    zip_file_path = os.path.join(args.download_dir, "dataset.zip")
+    download_file(args.download_url, zip_file_path)
+
+    # Step 2: Extract the dataset
+    extract_zip(zip_file_path, args.extract_dir)
+
+    # Step 3: Convert Parquet files to JSON
+    include_files = [f.strip() for f in args.include_files.split(",")] if args.include_files else None
+    if include_files:
+        filenames = [f for f in os.listdir(args.extract_dir) if f in include_files]
+    else:
+        filenames = [f for f in os.listdir(args.extract_dir) if f.endswith(".parquet")]
 
-    convert_parquet_to_json_streaming(directory, filenames, output_file)
+    convert_parquet_to_json_streaming(args.extract_dir, filenames, args.output_file)
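
The body of convert_parquet_to_json_streaming is collapsed inside the second hunk; only its call site is visible, which fixes the signature as (directory, filenames, output_file). As a reference point, here is a minimal sketch of what a streaming Parquet-to-JSON-Lines conversion can look like using pyarrow's ParquetFile.iter_batches. The batch size and the use of default=str are assumptions, not the committed code:

import json
import os

import pyarrow.parquet as pq


def convert_parquet_to_json_streaming(directory, filenames, output_file):
    """Sketch: stream each Parquet file into output_file as JSON Lines."""
    with open(output_file, "w", encoding="utf-8") as out:
        for filename in filenames:
            parquet_file = pq.ParquetFile(os.path.join(directory, filename))
            # Read one record batch at a time so memory use stays bounded
            # regardless of file size. batch_size=1024 is an assumption.
            for batch in parquet_file.iter_batches(batch_size=1024):
                for row in batch.to_pylist():
                    # default=str guards against values that are not
                    # JSON-native, such as timestamps or binary columns.
                    out.write(json.dumps(row, ensure_ascii=False, default=str) + "\n")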
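
Assuming the script is saved as convert_dataset.py (the filename is not shown on this page, and the URL and file names below are placeholders), a full run of the new CLI would look something like: python convert_dataset.py --download_url https://example.com/dataset.zip --download_dir downloads --extract_dir data/extracted --output_file data/dataset.json --include_files a.parquet,b.parquet. With --include_files omitted, every .parquet file found in --extract_dir is converted.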