Skip to content

Commit 5f5dae1

Browse files
authored
feat: convert to csv (#6)
* feat: convert to csv * update docs
1 parent ea33e21 commit 5f5dae1

File tree

6 files changed

+146
-41
lines changed

6 files changed

+146
-41
lines changed

Cargo.lock

Lines changed: 29 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "parquet-py"
3-
version = "0.0.5-alpha"
3+
version = "0.1.0-beta"
44
edition = "2021"
55

66
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
@@ -12,3 +12,4 @@ crate-type = ["cdylib"]
1212
parquet = { version = "52.0.0", features = ["json"] }
1313
pyo3 = "0.22.0"
1414
serde_json = "1.0.120"
15+
csv = "1.3.0"

README.md

Lines changed: 45 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,16 @@
22

33
# Parquet-Py
44

5-
Parquet-Py is a simple Python API & CLI designed to facilitate the interaction with Parquet files. It allows users to convert Parquet files into JSON strings, lists, or iterators for easy manipulation and access in Python applications.
5+
Parquet-Py is a simple command-line interface & Python API designed to facilitate the interaction with Parquet files. It allows users to convert Parquet files into CSV, JSON, lists, and iterators for easy manipulation and access in Python applications.
66

7-
Using Rust bindings under the hood, Parquet-Py provides a fast and efficient way to work with Parquet files, making it ideal for processing large datasets.
7+
Using Rust bindings under the hood, Parquet-Py provides a fast and efficient way to work with Parquet files, making it ideal for converting or processing large datasets.
88

99
## Features
1010

11-
- **Convert Parquet to JSON String**: Easily convert your Parquet files into a JSON string format for quick inspection or processing.
12-
- **Convert Parquet to Python List**: Transform your Parquet files into Python lists, where each row is represented as a dictionary within the list.
11+
- **Convert Parquet to CSV**: Convert your Parquet files into CSV format for easy viewing and processing in spreadsheet applications.
12+
- **Convert Parquet to JSON**: Easily convert your Parquet files into a JSON string format for quick inspection or processing.
1313
- **Iterable Parquet Rows**: Access Parquet file rows through an iterator, allowing for efficient row-by-row processing without loading the entire file into memory.
14+
- **Convert Parquet to Python List**: Transform your Parquet files into Python lists, where each row is represented as a dictionary within the list.
1415

1516
## Installation
1617

@@ -20,6 +21,14 @@ Using Rust bindings under the hood, Parquet-Py provides a fast and efficient way
2021
## Usage
2122
### Command-Line Interface
2223

24+
#### Converting Parquet to CSV
25+
26+
To convert a Parquet file into a CSV file, use the `parq convert` command.
27+
28+
```bash
29+
parq convert --input path/to/your/file.parquet --format csv --output example.csv
30+
```
31+
2332
#### Converting Parquet to JSON
2433

2534
To convert a Parquet file into a JSON string, use the `parq convert` command.
@@ -28,11 +37,42 @@ To convert a Parquet file into a JSON string, use the `parq convert` command.
2837
parq convert --input path/to/your/file.parquet --format json --output example.json
2938
```
3039

40+
3141
### Python
3242

43+
#### Iterating Over Parquet Rows
44+
45+
To iterate over the rows of a Parquet file, use the `to_iter` function. This allows for efficient row-by-row processing without loading the entire file into memory.
46+
47+
```python
48+
from parq import to_iter
49+
50+
# Path to your Parquet file
51+
file_path = "path/to/your/file.parquet"
52+
53+
# Iterate over Parquet rows
54+
for row in to_iter(file_path):
55+
print(row)
56+
```
57+
58+
#### Converting Parquet to CSV String
59+
60+
To convert a Parquet file into a CSV string, use the `to_csv_str` function.
61+
62+
```python
63+
from parq import to_csv_str
64+
65+
# Path to your Parquet file
66+
file_path = "path/to/your/file.parquet"
67+
68+
# Convert to CSV string
69+
csv_str = to_csv_str(file_path)
70+
print(csv_str)
71+
```
72+
3373
#### Converting Parquet to JSON String
3474

35-
To convert a Parquet file into a JSON string, use the `to_json_str` function. This is useful for quick inspection or processing of the data.
75+
To convert a Parquet file into a JSON string, use the `to_json_str` function.
3676

3777
```python
3878
from parq import to_json_str
@@ -59,18 +99,3 @@ file_path = "path/to/your/file.parquet"
5999
data_list = to_list(file_path)
60100
print(len(data_list))
61101
```
62-
63-
#### Iterating Over Parquet Rows
64-
65-
To iterate over the rows of a Parquet file, use the `iter_rows` function. This allows for efficient row-by-row processing without loading the entire file into memory.
66-
67-
```python
68-
from parq import to_iter
69-
70-
# Path to your Parquet file
71-
file_path = "path/to/your/file.parquet"
72-
73-
# Iterate over Parquet rows
74-
for row in to_iter(file_path):
75-
print(row)
76-
```

parq/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
from .lib import to_json_str, to_list, to_iter
1+
from .lib import to_json_str, to_csv_str, to_list, to_iter

parq/cli/__main__.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/usr/bin/env python3
22
import click
3-
from parq import to_json_str
3+
from parq import to_json_str, to_csv_str
44

55

66
@click.group()
@@ -19,6 +19,7 @@ def parq_cli():
1919
type=click.Choice(
2020
[
2121
"json",
22+
"csv",
2223
],
2324
case_sensitive=False,
2425
),
@@ -28,12 +29,17 @@ def parq_cli():
2829
@click.option("--output", "-o", "output_file_path", help="Output file", type=click.Path(), required=False)
2930
def convert(parquet_file_path, output_format, output_file_path):
3031
if output_format == "json":
31-
json_str = to_json_str(str(parquet_file_path))
32-
if output_file_path:
33-
with open(output_file_path, "w") as f:
34-
f.write(json_str)
35-
else:
36-
click.echo(json_str)
32+
output_string = to_json_str(str(parquet_file_path))
33+
elif output_format == "csv":
34+
output_string = to_csv_str(str(parquet_file_path))
35+
else:
36+
raise ValueError(f"Unsupported output format {output_format}")
37+
38+
if output_file_path:
39+
with open(output_file_path, "w") as f:
40+
f.write(output_string)
41+
else:
42+
click.echo(output_string)
3743

3844

3945
if __name__ == "__main__":

src/lib.rs

Lines changed: 56 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
use parquet::file::reader::{FileReader, SerializedFileReader};
22
use parquet::record::reader::RowIter;
3-
use pyo3::exceptions::PyStopIteration;
3+
use pyo3::exceptions::{PyIOError, PyStopIteration, PyValueError};
44
use pyo3::prelude::*;
55
use pyo3::types::{PyBool, PyDict, PyList};
66
use serde_json::Value;
@@ -51,14 +51,46 @@ fn value_to_py_object(py: Python, value: &Value) -> PyResult<PyObject> {
5151
}
5252
}
5353

54-
// convert parquet file to json string
54+
/// to_csv_str(path: str) -> str
55+
/// --
56+
///
57+
/// Read parquet file and convert to csv string.
58+
#[pyfunction]
59+
fn to_csv_str(path: &str) -> PyResult<String> {
60+
let file_path = Path::new(path);
61+
let file = File::open(&file_path).map_err(|e| PyIOError::new_err(e.to_string()))?;
62+
let reader = SerializedFileReader::new(file).map_err(|e| PyValueError::new_err(e.to_string()))?;
63+
let metadata = reader.metadata();
64+
let schema = metadata.file_metadata().schema();
65+
66+
let mut wtr = csv::Writer::from_writer(vec![]);
67+
let fields = schema.get_fields();
68+
let headers: Vec<String> = fields.iter().map(|f| f.name().to_string()).collect();
69+
wtr.write_record(&headers).map_err(|e| PyValueError::new_err(e.to_string()))?;
70+
71+
let row_iter = reader.get_row_iter(None).map_err(|e| PyValueError::new_err(e.to_string()))?;
72+
for row_result in row_iter {
73+
let row = row_result.map_err(|e| PyValueError::new_err(e.to_string()))?;
74+
let csv_record: Vec<String> = row.get_column_iter().map(|(_col_idx, col)| col.to_string()).collect();
75+
wtr.write_record(&csv_record).map_err(|e| PyValueError::new_err(e.to_string()))?;
76+
}
77+
78+
wtr.flush().map_err(|e| PyValueError::new_err(e.to_string()))?;
79+
let csv_data = String::from_utf8(wtr.into_inner().map_err(|e| PyValueError::new_err(e.to_string()))?).map_err(|e| PyValueError::new_err(e.to_string()))?;
80+
Ok(csv_data)
81+
}
82+
83+
/// to_json_str(path: str) -> str
84+
/// --
85+
///
86+
/// Read parquet file and convert to JSON string.
5587
#[pyfunction]
5688
fn to_json_str(path: &str) -> PyResult<String> {
5789
let file_path = Path::new(path);
5890
let file =
59-
File::open(&file_path).map_err(|e| pyo3::exceptions::PyIOError::new_err(e.to_string()))?;
91+
File::open(&file_path).map_err(|e| PyIOError::new_err(e.to_string()))?;
6092
let reader = SerializedFileReader::new(file)
61-
.map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))?;
93+
.map_err(|e| PyValueError::new_err(e.to_string()))?;
6294

6395
// iterate through reader and add to json list string
6496
let mut json_str = "[".to_string();
@@ -74,6 +106,10 @@ fn to_json_str(path: &str) -> PyResult<String> {
74106
return Ok(json_str);
75107
}
76108

109+
/// ParquetRowIterator
110+
/// --
111+
///
112+
/// Iterator over rows in parquet file.
77113
#[pyclass]
78114
struct ParquetRowIterator {
79115
iter: RowIter<'static>,
@@ -85,9 +121,9 @@ impl ParquetRowIterator {
85121
fn new(path: &str) -> PyResult<Self> {
86122
let file_path = Path::new(path);
87123
let file = File::open(&file_path)
88-
.map_err(|e| pyo3::exceptions::PyIOError::new_err(e.to_string()))?;
124+
.map_err(|e| PyIOError::new_err(e.to_string()))?;
89125
let reader = SerializedFileReader::new(file)
90-
.map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))?;
126+
.map_err(|e| PyValueError::new_err(e.to_string()))?;
91127

92128
Ok(Self {
93129
iter: RowIter::from_file_into(Box::new(reader)),
@@ -112,26 +148,34 @@ impl ParquetRowIterator {
112148
}
113149
}
114150

151+
/// to_iter(path: str) -> ParquetRowIterator
152+
/// --
153+
///
154+
/// Return iterator over rows in parquet file.
115155
#[pyfunction]
116156
fn to_iter(path: &str) -> PyResult<ParquetRowIterator> {
117157
let file_path = Path::new(path);
118158
let file =
119-
File::open(&file_path).map_err(|e| pyo3::exceptions::PyIOError::new_err(e.to_string()))?;
159+
File::open(&file_path).map_err(|e| PyIOError::new_err(e.to_string()))?;
120160
let reader = SerializedFileReader::new(file)
121-
.map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))?;
161+
.map_err(|e| PyValueError::new_err(e.to_string()))?;
122162

123163
Ok(ParquetRowIterator {
124164
iter: RowIter::from_file_into(Box::new(reader)),
125165
})
126166
}
127167

168+
/// to_list(path: str) -> List[Dict[str, Any]]
169+
/// --
170+
///
171+
/// Read parquet file and convert to list of dictionaries.
128172
#[pyfunction]
129173
fn to_list(path: &str, py: Python) -> PyResult<PyObject> {
130174
let file_path = Path::new(path);
131175
let file =
132-
File::open(&file_path).map_err(|e| pyo3::exceptions::PyIOError::new_err(e.to_string()))?;
176+
File::open(&file_path).map_err(|e| PyIOError::new_err(e.to_string()))?;
133177
let reader = SerializedFileReader::new(file)
134-
.map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))?;
178+
.map_err(|e| PyValueError::new_err(e.to_string()))?;
135179
let list = PyList::empty_bound(py);
136180
for row in reader.get_row_iter(None).unwrap() {
137181
let row_dict = row.unwrap().to_json_value();
@@ -145,10 +189,11 @@ fn to_list(path: &str, py: Python) -> PyResult<PyObject> {
145189
Ok(list.into())
146190
}
147191

148-
// python module
192+
/// A Parquet file reader and converter, written in Rust.
149193
#[pymodule]
150194
fn lib(m: &Bound<'_, PyModule>) -> PyResult<()> {
151195
m.add_function(wrap_pyfunction!(to_json_str, m)?)?;
196+
m.add_function(wrap_pyfunction!(to_csv_str, m)?)?;
152197
m.add_function(wrap_pyfunction!(to_list, m)?)?;
153198
m.add_function(wrap_pyfunction!(to_iter, m)?)?;
154199
m.add_class::<ParquetRowIterator>()?;

0 commit comments

Comments
 (0)