Skip to content

Commit 6f3529f

Browse files
authored
feat: jsonl output format, and multi-file input (#7)
1 parent 5f5dae1 commit 6f3529f

File tree

4 files changed

+85
-19
lines changed

4 files changed

+85
-19
lines changed

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "parquet-py"
3-
version = "0.1.0-beta"
3+
version = "0.2.0-beta"
44
edition = "2021"
55

66
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

README.md

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ Using Rust bindings under the hood, Parquet-Py provides a fast and efficient way
99
## Features
1010

1111
- **Convert Parquet to CSV**: Convert your Parquet files into CSV format for easy viewing and processing in spreadsheet applications.
12-
- **Convert Parquet to JSON**: Easily convert your Parquet files into a JSON string format for quick inspection or processing.
12+
- **Convert Parquet to JSON / JSON Lines**: Easily convert your Parquet files into a JSON Array or JSON Lines format for quick inspection or processing.
1313
- **Iterable Parquet Rows**: Access Parquet file rows through an iterator, allowing for efficient row-by-row processing without loading the entire file into memory.
1414
- **Convert Parquet to Python List**: Transform your Parquet files into Python lists, where each row is represented as a dictionary within the list.
1515

@@ -21,6 +21,16 @@ Using Rust bindings under the hood, Parquet-Py provides a fast and efficient way
2121
## Usage
2222
### Command-Line Interface
2323

24+
> [!WARNING]
25+
>
26+
> The CLI is still under development and may not be fully functional.
27+
>
28+
> Breaking changes may occur in future releases.
29+
30+
> [!TIP]
31+
>
32+
> Multiple input files can be specified by repeating the `--input` option. For example, `--input file1.parquet --input file2.parquet`.
33+
2434
#### Converting Parquet to CSV
2535

2636
To convert a Parquet file into a CSV file, use the `parq convert` command.
@@ -29,14 +39,22 @@ To convert a Parquet file into a CSV file, use the `parq convert` command.
2939
parq convert --input path/to/your/file.parquet --format csv --output example.csv
3040
```
3141

32-
#### Converting Parquet to JSON
42+
#### Converting Parquet to JSON Array
3343

34-
To convert a Parquet file into a JSON string, use the `parq convert` command.
44+
To convert a Parquet file into a JSON Array, use the `parq convert` command.
3545

3646
```bash
3747
parq convert --input path/to/your/file.parquet --format json --output example.json
3848
```
3949

50+
#### Converting Parquet to JSON Lines
51+
52+
To convert a Parquet file into JSON Lines format, use the `parq convert` command.
53+
54+
```bash
55+
parq convert --input path/to/your/file.parquet --format jsonl --output example.jsonl
56+
```
57+
4058

4159
### Python
4260

default.nix

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,14 @@ let
66
python3 -m venv env
77
source env/bin/activate
88
pip install -U pip
9-
pip install 'maturin[patchelf]'
9+
pip install 'maturin'
1010
else
11-
source env/bin/activate
11+
source env/bin/activate
1212
fi
1313
'';
1414
in
1515

1616
pkgs.mkShell {
17-
nativeBuildInputs = with pkgs.buildPackages; [ python312 ];
17+
nativeBuildInputs = with pkgs.buildPackages; [ python312 cmake ninja autoconf automake rustc cargo rustfmt libiconv ];
1818
shellHook = createVenv;
1919
}

parq/cli/__main__.py

Lines changed: 60 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
#!/usr/bin/env python3
2+
from csv import DictWriter
3+
from json import dumps
4+
from io import StringIO
5+
from itertools import chain
6+
27
import click
3-
from parq import to_json_str, to_csv_str
8+
from parq import to_iter
49

510

611
@click.group()
@@ -10,7 +15,13 @@ def parq_cli():
1015

1116
@parq_cli.command()
1217
@click.option(
13-
"--input", "-i", "parquet_file_path", help="Input parquet file", type=click.Path(exists=True), required=True
18+
"--input",
19+
"-i",
20+
"parquet_file_paths",
21+
help="Input parquet files",
22+
type=click.Path(exists=True),
23+
required=True,
24+
multiple=True,
1425
)
1526
@click.option(
1627
"--format",
@@ -19,6 +30,7 @@ def parq_cli():
1930
type=click.Choice(
2031
[
2132
"json",
33+
"jsonl",
2234
"csv",
2335
],
2436
case_sensitive=False,
@@ -27,19 +39,55 @@ def parq_cli():
2739
help="Output format",
2840
)
2941
@click.option("--output", "-o", "output_file_path", help="Output file", type=click.Path(), required=False)
30-
def convert(parquet_file_path, output_format, output_file_path):
31-
if output_format == "json":
32-
output_string = to_json_str(str(parquet_file_path))
42+
def convert(parquet_file_paths, output_format, output_file_path):
43+
"""
44+
Convert a list of parquet files to a specified output format.
45+
:param parquet_file_paths: List of parquet file paths
46+
:param output_format: Output format
47+
:param output_file_path: Output file path
48+
:return:
49+
"""
50+
51+
iterchain = chain.from_iterable([to_iter(parquet_file_path) for parquet_file_path in parquet_file_paths])
52+
53+
if output_format == "jsonl":
54+
if output_file_path:
55+
56+
def _iter_jsonl(iterchain_):
57+
for item_ in iterchain_:
58+
yield dumps(item_) + "\n"
59+
60+
with open(output_file_path, "w") as f:
61+
f.writelines(_iter_jsonl(iterchain))
62+
else:
63+
for item in iterchain:
64+
click.echo(dumps(item))
65+
66+
elif output_format == "json":
67+
# FIXME: Optimize this
68+
if output_file_path:
69+
with open(output_file_path, "w") as f:
70+
f.write(dumps(list(iterchain)))
71+
else:
72+
click.echo(dumps(list(iterchain)))
73+
3374
elif output_format == "csv":
34-
output_string = to_csv_str(str(parquet_file_path))
35-
else:
36-
raise ValueError(f"Unsupported output format {output_format}")
75+
# FIXME: Optimize this
76+
buffer = list(iterchain)
77+
if output_file_path:
78+
with open(output_file_path, "w") as f:
79+
writer = DictWriter(f, fieldnames=buffer[0].keys())
80+
writer.writeheader()
81+
writer.writerows(buffer)
82+
else:
83+
f = StringIO()
84+
writer = DictWriter(f, fieldnames=buffer[0].keys())
85+
writer.writeheader()
86+
writer.writerows(buffer)
87+
click.echo(f.getvalue())
3788

38-
if output_file_path:
39-
with open(output_file_path, "w") as f:
40-
f.write(output_string)
4189
else:
42-
click.echo(output_string)
90+
raise ValueError(f"Unsupported output format: {output_format}")
4391

4492

4593
if __name__ == "__main__":

0 commit comments

Comments
 (0)