Skip to content

Commit 4514a34

Browse files
committed
Added tests for parquedit.py
1 parent 9fec1b7 commit 4514a34

File tree

4 files changed: +328 −14 lines

docs/conf.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
import os
1616
import sys
1717

18-
1918
sys.path.insert(0, os.path.abspath("../src"))
2019

2120
# -- Project information -----------------------------------------------------

noxfile.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010

1111
import nox
1212

13-
1413
try:
1514
from nox_poetry import Session
1615
from nox_poetry import session

src/ssb_parquedit/parquedit.py

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -35,17 +35,15 @@ def __init__(
3535
self._conn.sql(f"INSTALL {ext}")
3636
self._conn.sql(f"LOAD {ext}")
3737
# Attach catalog
38-
self._conn.sql(
39-
f"""
38+
self._conn.sql(f"""
4039
ATTACH 'ducklake:postgres:
4140
dbname={db_config["dbname"]}
4241
user={db_config["dbuser"]}
4342
host=localhost
4443
' AS {db_config["catalog_name"]}
4544
(DATA_PATH '{db_config["data_path"]}',
4645
METADATA_SCHEMA {db_config["metadata_schema"]});
47-
"""
48-
)
46+
""")
4947
self._conn.sql(f"USE {db_config['catalog_name']}")
5048

5149
def __enter__(self) -> "ParquEdit":
@@ -136,12 +134,10 @@ def _create_from_parquet(self, table_name: str, parquet_path: str) -> None:
136134
table_name: Name of the table to create.
137135
parquet_path: Path to the Parquet file (supports gs:// URIs).
138136
"""
139-
self._conn.execute(
140-
f"""
137+
self._conn.execute(f"""
141138
CREATE TABLE {table_name} AS
142139
SELECT * FROM read_parquet('{parquet_path}') WHERE 1=2;
143-
"""
144-
)
140+
""")
145141

146142
def _create_from_schema(self, table_name: str, schema: dict[str, Any]) -> None:
147143
"""Create a table from a JSON Schema specification.
@@ -186,12 +182,10 @@ def _fill_from_parquet(self, table_name: str, parquet_path: str) -> None:
186182
table_name: Name of the table to populate.
187183
parquet_path: Path to the Parquet file (supports gs:// URIs).
188184
"""
189-
self._conn.sql(
190-
f"""
185+
self._conn.sql(f"""
191186
INSERT INTO {table_name}
192187
SELECT * FROM read_parquet('{parquet_path}');
193-
"""
194-
)
188+
""")
195189

196190
@staticmethod
197191
def translate(prop: dict[str, Any]) -> str:

tests/test_parquedit.py

Lines changed: 322 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,322 @@
1+
import importlib
2+
import sys
3+
from collections.abc import Generator
4+
from typing import Any
5+
from unittest.mock import MagicMock
6+
from unittest.mock import call
7+
8+
import pytest
9+
10+
11+
# ---- Test scaffolding: stub external modules before importing the SUT ----
@pytest.fixture(autouse=True)
def stub_external_modules(
    monkeypatch: pytest.MonkeyPatch,
) -> Generator[None, None, None]:
    """Replace duckdb, gcsfs and pandas with lightweight in-process fakes.

    The fakes are injected into ``sys.modules`` before the module under
    test is imported, so the tests run hermetically and never touch the
    real (heavy) dependencies.
    """

    class _DuckDBStub:
        class DuckDBPyConnection:  # type-hint placeholder only; runtime uses MagicMock
            pass

        def connect(self) -> MagicMock:
            # Simulate a connection the SUT owns (and must close itself).
            connection = MagicMock()
            for method in ("sql", "execute", "register", "register_filesystem", "close"):
                setattr(connection, method, MagicMock())
            return connection

    class _GCSFSStub:
        class GCSFileSystem:
            def __init__(self, *args: object, **kwargs: object) -> None:
                self.created = True

    class _PandasStub:
        class DataFrame:
            pass

    monkeypatch.setitem(sys.modules, "duckdb", _DuckDBStub())
    monkeypatch.setitem(sys.modules, "gcsfs", _GCSFSStub())
    monkeypatch.setitem(sys.modules, "pandas", _PandasStub())

    yield

    # monkeypatch restores the original sys.modules entries automatically.
54+
55+
56+
@pytest.fixture
def sut() -> Any:
    """Return the ParquEdit class, imported while the stub modules are active."""
    mod = importlib.import_module("ssb_parquedit.parquedit")
    # Reload so a previously cached import is re-executed against the stubs.
    mod = importlib.reload(mod)
    return mod.ParquEdit
62+
63+
64+
@pytest.fixture
def fake_conn() -> MagicMock:
    """A MagicMock standing in for a DuckDB connection.

    Exposes the methods ParquEdit uses — ``sql``, ``execute``, ``register``,
    ``register_filesystem`` and ``close`` — each as a distinct MagicMock so
    tests can assert on their calls independently.
    """
    connection = MagicMock()
    connection.sql = MagicMock()
    connection.execute = MagicMock()
    connection.register = MagicMock()
    connection.register_filesystem = MagicMock()
    connection.close = MagicMock()
    return connection
81+
82+
83+
@pytest.fixture
def db_config() -> dict[str, str]:
    """Minimal catalog configuration shared by every test."""
    return dict(
        dbname="testdb",
        dbuser="testuser",
        catalog_name="testcat",
        data_path="gs://bucket/path",
        metadata_schema="meta_schema",
    )
92+
93+
94+
# -------------------- Behavior tests --------------------
95+
96+
97+
def test_init_registers_fs_and_loads_extensions_and_uses_catalog(
    sut: Any, fake_conn: MagicMock, db_config: dict[str, str]
) -> None:
    """Constructing with an external conn wires up GCS, extensions and catalog."""
    sut(db_config=db_config, conn=fake_conn)

    # The GCS filesystem must be registered on the provided connection.
    assert fake_conn.register_filesystem.called, "Filesystem should be registered"

    # ducklake and postgres extensions are installed and loaded, in order.
    fake_conn.sql.assert_has_calls(
        [
            call("INSTALL ducklake"),
            call("LOAD ducklake"),
            call("INSTALL postgres"),
            call("LOAD postgres"),
        ],
        any_order=False,
    )

    # Catalog is attached and selected; match substrings rather than
    # reconstructing the exact multi-line SQL string.
    issued = [args[0] for (args, _) in fake_conn.sql.call_args_list]
    assert any(
        "ATTACH 'ducklake:postgres" in stmt for stmt in issued
    ), "ATTACH call missing"
    assert any(
        f"USE {db_config['catalog_name']}" in stmt for stmt in issued
    ), "USE catalog call missing"
126+
127+
128+
def test_context_manager_closes_only_if_owns_connection(
    sut: Any, db_config: dict[str, str]
) -> None:
    """An owned connection (none passed in) is closed on exit and on close()."""
    editor = sut(db_config=db_config)

    # Leaving the context manager closes the owned connection exactly once.
    with editor:
        pass
    editor._conn.close.assert_called_once()

    # An explicit close() closes it again, since the instance owns it.
    before = editor._conn.close.call_count
    editor.close()
    assert editor._conn.close.call_count == before + 1
141+
142+
143+
@pytest.mark.parametrize(
    "name,valid",
    [
        ("valid_name", True),
        ("_underscore_ok", True),
        ("1starts_with_digit", False),
        ("has-dash", False),
        ("has space", False),
    ],
)
def test_validate_table_name(sut: Any, name: str, valid: bool) -> None:
    """Identifiers must start with a letter/underscore and use no symbols."""
    if not valid:
        with pytest.raises(ValueError):
            sut._validate_table_name(name)
    else:
        sut._validate_table_name(name)  # must not raise
159+
160+
161+
@pytest.mark.parametrize(
    "prop,expected",
    [
        ({"type": "string"}, "VARCHAR"),
        ({"type": ["null", "string"]}, "VARCHAR"),
        ({"type": "string", "format": "date"}, "DATE"),
        ({"type": "string", "format": "date-time"}, "TIMESTAMP"),
        ({"type": "integer"}, "BIGINT"),
        ({"type": "number"}, "DOUBLE"),
        ({"type": "boolean"}, "BOOLEAN"),
        ({"type": "array", "items": {"type": "integer"}}, "LIST<BIGINT>"),
        (
            {
                "type": "object",
                "properties": {"a": {"type": "string"}, "b": {"type": "integer"}},
            },
            "STRUCT(a VARCHAR, b BIGINT)",
        ),
        ({"type": "object"}, "JSON"),  # object without properties falls back to JSON
        ({}, "JSON"),  # no type at all: same fallback
    ],
)
def test_translate_jsonschema_property(
    sut: Any, prop: dict[str, object], expected: str
) -> None:
    """Each JSON Schema property maps to the expected DuckDB type name."""
    assert sut.translate(prop) == expected
187+
188+
189+
def test_jsonschema_to_duckdb_builds_correct_ddl(sut: Any) -> None:
    """The generated DDL has every column, NOT NULL flags and a terminator."""
    schema = {
        "properties": {
            "id": {"type": "integer"},
            "name": {"type": "string"},
            "tags": {"type": "array", "items": {"type": "string"}},
            "meta": {
                "type": "object",
                "properties": {
                    "active": {"type": "boolean"},
                    "score": {"type": ["null", "number"]},
                },
            },
        },
        "required": ["id", "name"],
    }

    ddl = sut.jsonschema_to_duckdb(schema, "t")

    # Required columns get NOT NULL; nested types are rendered structurally.
    expected_fragments = (
        "CREATE TABLE t (",
        "id BIGINT NOT NULL",
        "name VARCHAR NOT NULL",
        "tags LIST<VARCHAR>",
        "meta STRUCT(active BOOLEAN, score DOUBLE)",
    )
    for fragment in expected_fragments:
        assert fragment in ddl
    assert ddl.strip().endswith(");"), "DDL should end with semicolon"
214+
215+
216+
def test_create_table_from_dataframe_routes_and_applies_flags(
    sut: Any, fake_conn: MagicMock, db_config: dict[str, str]
) -> None:
    """A DataFrame source routes to the dataframe helper and honours all flags."""
    # Instance of the fake pandas DataFrame injected by the autouse fixture.
    source = sys.modules["pandas"].DataFrame()

    editor = sut(db_config=db_config, conn=fake_conn)

    # Replace the internal helpers with spies so only routing is exercised.
    editor._create_from_dataframe = MagicMock()
    editor._add_table_partition = MagicMock()
    editor.fill_table = MagicMock()
    editor._add_table_description = MagicMock()

    editor.create_table(
        table_name="t",
        source=source,
        table_description="desc",
        part_columns=["c1", "c2"],
        fill=True,
    )

    editor._create_from_dataframe.assert_called_once_with("t", source)
    editor._add_table_partition.assert_called_once_with("t", ["c1", "c2"])
    editor.fill_table.assert_called_once_with("t", source)
    editor._add_table_description.assert_called_once_with("t", "desc")
243+
244+
245+
def test_create_table_from_parquet_routes_and_applies_flags(
    sut: Any, fake_conn: MagicMock, db_config: dict[str, str]
) -> None:
    """A path source routes to the parquet helper; disabled flags stay unused."""
    editor = sut(db_config=db_config, conn=fake_conn)

    editor._create_from_parquet = MagicMock()
    editor._add_table_partition = MagicMock()
    editor.fill_table = MagicMock()
    editor._add_table_description = MagicMock()

    editor.create_table(
        table_name="t",
        source="gs://bucket/path/file.parquet",
        table_description="desc",
        part_columns=None,  # treated as []: no partitioning call expected
        fill=False,
    )

    editor._create_from_parquet.assert_called_once()
    editor._add_table_partition.assert_not_called()
    editor.fill_table.assert_not_called()
    editor._add_table_description.assert_called_once_with("t", "desc")
267+
268+
269+
def test_fill_table_routes_to_correct_helper(
    sut: Any, fake_conn: MagicMock, db_config: dict[str, str]
) -> None:
    """fill_table dispatches on source type: DataFrame vs parquet path."""
    editor = sut(db_config=db_config, conn=fake_conn)
    editor._fill_from_dataframe = MagicMock()
    editor._fill_from_parquet = MagicMock()

    frame = sys.modules["pandas"].DataFrame()
    editor.fill_table("t", frame)
    editor._fill_from_dataframe.assert_called_once_with("t", data=frame)

    editor.fill_table("t", "gs://bucket/data.parquet")
    editor._fill_from_parquet.assert_called_once_with(
        "t", parquet_path="gs://bucket/data.parquet"
    )
285+
286+
287+
def test_create_from_dataframe_registers_and_creates_empty_table(
    sut: Any, fake_conn: MagicMock, db_config: dict[str, str]
) -> None:
    """The dataframe helper registers the frame and creates a same-schema shell."""
    editor = sut(db_config=db_config, conn=fake_conn)
    frame = sys.modules["pandas"].DataFrame()

    editor._create_from_dataframe("mytable", frame)

    fake_conn.register.assert_called_once_with("data", frame)
    # An empty table is created via the WHERE 1=2 trick.
    statements = [args[0] for (args, _) in fake_conn.execute.call_args_list]
    assert any(
        stmt.startswith("CREATE TABLE mytable AS SELECT * FROM data WHERE 1=2")
        for stmt in statements
    )
303+
304+
305+
def test_add_table_partition_executes_alter_table(
    sut: Any, fake_conn: MagicMock, db_config: dict[str, str]
) -> None:
    """Partition columns are joined into a single ALTER TABLE statement."""
    editor = sut(db_config=db_config, conn=fake_conn)

    editor._add_table_partition("t", ["a", "b"])

    fake_conn.execute.assert_called_with("ALTER TABLE t SET PARTITIONED BY (a,b);")
313+
314+
315+
def test_add_table_description_executes_comment(
    sut: Any, fake_conn: MagicMock, db_config: dict[str, str]
) -> None:
    """The description is applied via a COMMENT ON TABLE statement."""
    editor = sut(db_config=db_config, conn=fake_conn)

    editor._add_table_description("t", "some desc")

    fake_conn.execute.assert_called_with("COMMENT ON TABLE t IS 'some desc';")

0 commit comments

Comments
 (0)