Skip to content

Commit b5e65df

Browse files
authored
feat: Create Python integration testing (#320)
* Add a workflow step for python integration tests and add a duckdb test * fix some bugs in workflow * Fix a bug in pytest installation * Implement flight test * Replace unittest with pytest and update workflow script * Add two integration tests and add roapi to git workflow * Add ODBC integration test * update path * Add sqliteodbc to pipeline * Split python test pipeline into a separate step * Add sqlite odbc lib * Update step name * Fix bugs in mysql and postgres tests
1 parent cc612fa commit b5e65df

9 files changed

Lines changed: 1247 additions & 1 deletion

File tree

.github/workflows/pr.yaml

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,3 +107,34 @@ jobs:
107107
108108
- name: Run tests
109109
run: make test
110+
111+
# Builds the Python bindings and runs the pytest integration suite.
python-integration-test:
  name: Python Tests
  runs-on: ubuntu-latest

  steps:
    - uses: actions/checkout@v4

    - uses: dtolnay/rust-toolchain@stable

    - uses: astral-sh/setup-uv@v5
      with:
        enable-cache: true

    # Refresh the package index before installing: the index on a runner image
    # can be stale, which makes `apt-get install` fail with 404s.
    - name: Install ODBC, Sqlite and Roapi
      run: |
        sudo apt-get update
        sudo apt-get install -y unixodbc-dev libsqliteodbc libsqlite3-dev
        cargo install --locked --git https://github.com/roapi/roapi --branch main --bins roapi

    - name: Build Python package
      run: |
        cd python
        uv sync --dev --no-install-package datafusion
        uv run --no-project maturin develop --uv

    # The tests are run from inside the tests directory.
    - name: Run Python tests
      run: |
        cd python/python/tests
        uv run --no-project pytest -v .

python/pyproject.toml

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[build-system]
22
requires = ["maturin>=1.5.1,<1.6.0"]
3-
build-backend = "[maturin]"
3+
build-backend = "maturin"
44

55
[project]
66
name = "datafusion_table_providers"
@@ -57,3 +57,15 @@ max-doc-length = 88
5757
"dev/*" = ["D"]
5858
"benchmarks/*" = ["D", "F"]
5959
"docs/*" = ["D"]
60+
61+
[dependency-groups]
62+
dev = [
63+
"maturin>=1.8.1",
64+
"numpy>1.25.0",
65+
"pytest>=7.4.4",
66+
"pytest-asyncio>=0.23.3",
67+
"ruff>=0.9.1",
68+
"toml>=0.10.2",
69+
"pygithub==2.5.0",
70+
"pyarrow>=19.0.1"
71+
]

python/python/tests/test_duckdb.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
import pytest
2+
import os
3+
from datafusion import SessionContext
4+
from datafusion_table_providers import duckdb
5+
6+
class TestDuckDBIntegration:
    """Integration tests for the DuckDB table provider.

    The tests run against ``core/examples/duckdb_example.db``, located relative
    to this test file.
    """

    def setup_method(self):
        """Create a fresh session plus read-only and default-mode table factories."""
        self.ctx = SessionContext()
        self.db_path = os.path.join(
            os.path.dirname(__file__), "..", "..", "..", "core", "examples", "duckdb_example.db"
        )
        self.pool_readonly = duckdb.DuckDBTableFactory(self.db_path, duckdb.AccessMode.ReadOnly)
        self.pool_readwrite = duckdb.DuckDBTableFactory(self.db_path)

    def test_get_tables(self):
        """Test retrieving tables from the database"""
        tables = self.pool_readonly.tables()
        assert isinstance(tables, list)
        # Compare sorted names so the test does not depend on the order in
        # which the provider happens to list the tables.
        assert sorted(tables) == ["companies", "projects"]

    def test_query_companies(self):
        """Test querying companies table with SQL"""
        self.ctx.register_table_provider("companies", self.pool_readonly.get_table("companies"))

        # Run SQL query to select Microsoft row
        df = self.ctx.sql("SELECT name FROM companies WHERE ticker = 'MSFT'")
        result = df.collect()

        # Verify single row returned with name = Microsoft
        assert len(result) == 1
        assert str(result[0]['name'][0]) == "Microsoft"

    def test_complex_query(self):
        """Test joining the companies and projects tables with SQL"""
        self.ctx.register_table_provider("companies", self.pool_readonly.get_table("companies"))
        self.ctx.register_table_provider("projects", self.pool_readonly.get_table("projects"))

        # Join the two registered tables on their id columns
        df = self.ctx.sql(
            """SELECT companies.id, companies.name as company_name, projects.name as project_name
            FROM companies, projects
            WHERE companies.id = projects.id"""
        )
        result = df.collect()

        assert len(result) == 1
        assert str(result[0]['company_name'][0]) == "Microsoft"
        assert str(result[0]['project_name'][0]) == "DataFusion"

    def test_write_fails(self):
        """Test that writing fails on read-only mode"""
        table_name = "companies"
        self.ctx.register_table_provider(table_name, self.pool_readonly.get_table(table_name))

        with pytest.raises(Exception):
            tmp = self.ctx.sql("INSERT INTO companies VALUES (3, 'Test Corp', 'TEST')")
            tmp.collect()  # this will trigger the execution of the query

    def test_write_fails_readwrite(self):
        """Test that writing fails because it is not supported"""
        # Insertion fails because duckdb does not implement write operations even when
        # database is opened in read-write mode.
        table_name = "companies"
        self.ctx.register_table_provider(table_name, self.pool_readwrite.get_table(table_name))

        with pytest.raises(Exception):
            tmp = self.ctx.sql("INSERT INTO companies VALUES (3, 'Test Corp', 'TEST')")
            tmp.collect()  # collecting is what actually executes the statement

python/python/tests/test_flight.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
from datafusion import SessionContext
2+
from datafusion_table_providers import flight
3+
import pytest
4+
import subprocess
5+
import time
6+
7+
8+
class TestFlightIntegration:
    """Integration test for the Flight SQL table provider.

    A local ``roapi`` process serving a public taxi-trip parquet file is started
    once for the whole class and queried through ``FlightTableFactory``.
    """

    @classmethod
    def setup_class(cls):
        """Called once before all test methods in the class"""
        cls.ctx = SessionContext()
        cls.pool = flight.FlightTableFactory()
        # The server's output is deliberately NOT redirected to pipes: nothing
        # ever reads them, so a chatty server would block once a pipe fills.
        cls.process = subprocess.Popen(
            ["roapi", "-t", "taxi=https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet"],
        )
        try:
            cls._wait_for_server()
        except BaseException:
            # pytest skips teardown_class when setup_class fails, so reap the
            # server here instead of leaking it.
            cls.process.kill()
            cls.process.wait()
            raise

    @classmethod
    def _wait_for_server(cls, host="localhost", port=32010, timeout=120.0):
        """Block until the Flight SQL port accepts connections.

        Replaces a fixed sleep: returns as soon as the server is reachable and
        fails fast when roapi dies during startup.

        NOTE(review): assumes roapi only opens its listeners after the table
        given with ``-t`` has been loaded -- confirm against roapi's startup code.

        Raises:
            RuntimeError: if the roapi process exits before becoming reachable.
            TimeoutError: if the port is still closed after ``timeout`` seconds.
        """
        import socket  # local import: keeps this block independent of the file header

        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            exit_code = cls.process.poll()
            if exit_code is not None:
                raise RuntimeError(f"roapi exited during startup with code {exit_code}")
            try:
                with socket.create_connection((host, port), timeout=1):
                    return
            except OSError:
                time.sleep(0.5)
        raise TimeoutError(f"roapi did not start listening on {host}:{port} within {timeout}s")

    @classmethod
    def teardown_class(cls):
        """Called once after all test methods in the class"""
        cls.process.kill()
        # Reap the child so it does not linger as a zombie process.
        cls.process.wait()

    def test_query_companies(self):
        """Test aggregating the taxi table exposed over Flight SQL"""
        print("Running test_query_companies")
        table_name = "taxi_flight_table"
        self.ctx.register_table_provider(table_name, self.pool.get_table("http://localhost:32010", {
            "flight.sql.query": "SELECT * FROM taxi"
        }))
        df = self.ctx.sql(f"""
            SELECT "VendorID", COUNT(*) as count, SUM(passenger_count) as passenger_counts, SUM(total_amount) as total_amounts
            FROM {table_name}
            GROUP BY "VendorID"
            ORDER BY COUNT(*) DESC
        """)
        result = df.collect()

        # Verify the results
        vendor_ids = result[0]['VendorID'].tolist()
        assert vendor_ids == [2, 1, 6]

        counts = result[0]['count'].tolist()
        assert counts == [2234632, 729732, 260]

        passenger_counts = result[0]['passenger_counts'].tolist()
        assert passenger_counts == [2971865, 810883, None]

        total_amounts = result[0]['total_amounts'].tolist()
        assert total_amounts == pytest.approx([60602721.27, 18841261.98, 12401.03])

python/python/tests/test_mysql.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
import subprocess
2+
import time
3+
4+
from datafusion import SessionContext
5+
from datafusion_table_providers import mysql
6+
7+
def run_docker_container():
    """Run the Docker container with the MySQL image.

    Starts a detached ``mysql:9.0`` container named ``mysql`` with root password
    ``password``, database ``mysql_db`` and port 3306 published on the host.

    Raises:
        RuntimeError: if ``docker run`` exits with a non-zero status. Failing
            here keeps the suite from waiting on a container that never started.
    """
    result = subprocess.run(
        ["docker", "run", "--name", "mysql", "-e", "MYSQL_ROOT_PASSWORD=password", "-e", "MYSQL_DATABASE=mysql_db",
         "-p", "3306:3306", "-d", "mysql:9.0"],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE
    )
    if result.returncode != 0:
        raise RuntimeError(f"Failed to start MySQL container: {result.stderr.decode()}")
17+
18+
def create_table_and_insert_data():
    """Create a table and insert data into MySQL.

    Pipes the SQL into the ``mysql`` client inside the ``mysql`` container,
    creating the ``companies`` table with a single row.

    Raises:
        RuntimeError: if the client exits with a non-zero status, so a broken
            fixture is reported at its source rather than as a later assertion
            failure.
    """
    sql_commands = """
    CREATE TABLE companies (
        id INT PRIMARY KEY,
        name VARCHAR(100)
    );

    INSERT INTO companies (id, name) VALUES (1, 'Acme Corporation');
    """

    # Execute the SQL commands inside the Docker container
    result = subprocess.run(
        ["docker", "exec", "-i", "mysql", "mysql", "-uroot", "-ppassword", "mysql_db"],
        input=sql_commands.encode(),  # Pass SQL commands to stdin
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE
    )

    # Check if the SQL execution was successful
    if result.returncode != 0:
        raise RuntimeError(f"Error executing SQL commands: {result.stderr.decode()}")
    print(f"SQL commands executed successfully:\n{result.stdout.decode()}")
42+
43+
def stop_and_remove_container():
    """Stop the ``mysql`` container, then delete it.

    The exit status of the docker commands is not checked.
    """
    for docker_action in ("stop", "rm"):
        subprocess.run(["docker", docker_action, "mysql"])
    print("MySQL container stopped and removed.")
48+
49+
50+
class TestMySQLIntegration:
    """Integration tests for the MySQL table provider, backed by a Docker container."""

    @classmethod
    def setup_class(cls):
        """Start MySQL, load the fixture table and build the table factory."""
        run_docker_container()
        try:
            # Fixed waits, kept from the original: time for the server to accept
            # connections, then for the inserted data to settle.
            time.sleep(30)
            create_table_and_insert_data()
            time.sleep(10)
            cls.ctx = SessionContext()
            connection_param = {
                "connection_string": "mysql://root:password@localhost:3306/mysql_db",
                "sslmode": "disabled"}
            cls.pool = mysql.MySQLTableFactory(connection_param)
        except BaseException:
            # pytest does not run teardown_class when setup_class fails, so
            # remove the container here to avoid leaking it (and its name/port).
            stop_and_remove_container()
            raise

    @classmethod
    def teardown_class(cls):
        """Stop and remove the MySQL container once all tests have run."""
        stop_and_remove_container()

    def test_get_tables(self):
        """Test retrieving tables from the database"""
        tables = self.pool.tables()
        assert isinstance(tables, list)
        assert len(tables) == 1
        assert tables == ["companies"]

    def test_query_companies(self):
        """Test querying companies table with SQL"""
        table_name = "companies"
        self.ctx.register_table_provider(table_name, self.pool.get_table("companies"))
        query = "SELECT * FROM companies"
        df = self.ctx.sql(query).collect()
        assert df is not None
        name_column = df[0]['name']
        assert str(name_column[0]) == "Acme Corporation"

python/python/tests/test_odbc.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
import pytest
2+
import os
3+
from datafusion import SessionContext
4+
from datafusion_table_providers import odbc
5+
6+
class TestOdbcIntegration:
    """Integration tests for the ODBC table provider using the SQLite3 ODBC driver."""

    def setup_method(self):
        """Set up the test environment"""
        self.ctx = SessionContext()
        # Anchor the database path to this file rather than the current working
        # directory, so the tests work no matter where pytest is invoked from.
        db_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), "..", "..", "..", "core", "examples", "sqlite_example.db")
        )
        connection_param: dict = {'connection_string': f'driver=SQLite3;database={db_path};'}
        self.pool = odbc.ODBCTableFactory(connection_param)

    def test_query_companies(self):
        """Test querying companies table with SQL"""
        self.ctx.register_table_provider("companies", self.pool.get_table("companies"))

        # Run SQL query to select Microsoft row
        df = self.ctx.sql("SELECT name FROM companies WHERE ticker = 'MSFT'")
        result = df.collect()

        # Verify single row returned with name = Microsoft
        assert len(result) == 1
        assert str(result[0]['name'][0]) == "Microsoft"

    def test_complex_query(self):
        """Test joining the companies and projects tables with SQL"""
        self.ctx.register_table_provider("companies", self.pool.get_table("companies"))
        self.ctx.register_table_provider("projects", self.pool.get_table("projects"))

        # Join the two registered tables on their id columns
        df = self.ctx.sql(
            """SELECT companies.id, companies.name as company_name, projects.name as project_name
            FROM companies, projects
            WHERE companies.id = projects.id"""
        )
        result = df.collect()

        assert len(result) == 1
        assert str(result[0]['company_name'][0]) == "Microsoft"
        assert str(result[0]['project_name'][0]) == "DataFusion"

    def test_write_fails(self):
        """Test that writing fails because it is not supported"""
        table_name = "companies"
        self.ctx.register_table_provider(table_name, self.pool.get_table(table_name))

        with pytest.raises(Exception):
            tmp = self.ctx.sql("INSERT INTO companies VALUES (3, 'Test Corp', 'TEST')")
            tmp.collect()  # this will trigger the execution of the query

0 commit comments

Comments
 (0)