Skip to content

Commit fef7f75

Browse files
Merge pull request #44 from CSCfi/t-tests-improve-dbutils
Tests: Improvements to database testing utilities
2 parents 2ec3d5c + b189a19 commit fef7f75

File tree

3 files changed

+104
-22
lines changed

3 files changed

+104
-22
lines changed

tests/README.md

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,18 @@ privilege.
6161
The database user should also have the file privilege to load data
6262
from files.
6363

64-
If the test database cannot be created, tests using the database
65-
(fixture `database`) are skipped.
64+
In addition, you can specify the database collation with a custom
65+
`pytest` command-line option:
66+
67+
- `--db-collate=`_COLLATE_: Use _COLLATE_ as the Korp MySQL test
68+
database collation. If not specified, use the collation of the Korp
69+
MySQL database, or if the Korp MySQL database (as specified in the
70+
Korp configuration) cannot be accessed, the default collation for
71+
the database character set specified in the Korp configuration
72+
variable `DBCHARSET`.
73+
74+
If the test database cannot be created, a warning is issued and tests
75+
using the database (fixture `database`) are skipped.
6676

6777

6878
### Test coverage

tests/conftest.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
"""
77

88

9+
import warnings
10+
911
import pytest
1012

1113
from pathlib import Path
@@ -88,6 +90,7 @@ def database(_database):
8890
msg = "Unable to create Korp database: Error " + error["message"]
8991
if error["sql"] is not None:
9092
msg += " when executing SQL statement: " + error["sql"]
93+
warnings.warn(f"Skipping tests using Korp database: {msg}")
9194
pytest.skip(msg)
9295
yield _database
9396

@@ -99,7 +102,8 @@ def database_tables(database):
99102
The returned function takes as its arguments a list of corpora
100103
(corpus ids) or a single corpus id (string) whose data to import,
101104
and the type of table data to import (if omitted, import all
102-
types).
105+
types). The function drops possibly existing tables, so all the
106+
tables for a test should be imported with a single call.
103107
"""
104108

105109
def _database_tables(corpora, tabletype=None):

tests/dbutils.py

Lines changed: 87 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,6 @@
1616
"""
1717

1818

19-
# TODO:
20-
# - Specify default collation for db
21-
# - Remove corpus data before importing to a multi-corpus table
22-
23-
2419
import csv
2520
import re
2621

@@ -63,6 +58,11 @@ class KorpDatabase:
6358
"create-user": "Use user {} to create the Korp MySQL test database",
6459
"create-password": (
6560
"Use password {} to create the Korp MySQL test database"),
61+
"collate": (
62+
"Use {} as the Korp MySQL test database collation."
63+
" If not specified, use the collation of the Korp MySQL database,"
64+
" or if that cannot be accessed, the default collation for the"
65+
" Korp MySQL database character set."),
6666
}
6767
# The custom pytest command-line options
6868
_pytest_db_options = {}
@@ -91,6 +91,8 @@ def __init__(self, datadir):
9191
self._tableinfo = []
9292
# Filename patterns by table type
9393
self._tabletype_patts = defaultdict(list)
94+
# Table info by table type
95+
self._tabletype_info = defaultdict(list)
9496
# Initialize self._tableinfo, self._tabletype_patts
9597
self._read_tableinfo()
9698
# If True, use an existing table in the database, so do not
@@ -221,6 +223,39 @@ def create(self):
221223
_make_db_name, user is taken from _db_options and host from
222224
_conn_params.
223225
"""
226+
227+
def get_collation(korp_conf):
    """Get the collation for the Korp test database.

    If the custom pytest command-line option --db-collate has
    been specified, return its value.
    Otherwise, if the Korp MySQL database can be accessed
    (with the parameters defined in the Korp configuration
    korp_conf), return its collation.
    Otherwise, return the empty string to use the default
    collation for the database character set.

    The return value is always a string (never None), so that
    the caller can safely interpolate it into an SQL statement.
    """
    # An explicitly specified collation always takes precedence
    collate = self._db_options["collate"] or ""
    if collate:
        return collate
    try:
        # Try to access the Korp database
        with MySQLdb.Connect(
            host=korp_conf["DBHOST"],
            port=korp_conf["DBPORT"],
            user=korp_conf["DBUSER"],
            password=korp_conf["DBPASSWORD"],
            database=korp_conf["DBNAME"],
            use_unicode=True,
        ) as conn:
            cursor = conn.cursor()
            # Alternative ways to find out db collation:
            # https://stackoverflow.com/a/76490467
            cursor.execute("SELECT @@collation_database;")
            row = cursor.fetchone()
    except MySQLdb.Error:
        # Deliberate best effort: if the Korp database cannot be
        # accessed, fall back to the default collation for the
        # character set
        return ""
    # Guard against a missing row or a NULL value, either of which
    # would otherwise propagate None to the caller
    return (row[0] if row else "") or ""
258+
224259
if self.dbname is not None:
225260
# If a database has already been created, do not create
226261
# another
@@ -231,13 +266,17 @@ def create(self):
231266
with self._connect() as conn:
232267
cursor = conn.cursor()
233268
dbname = self._make_db_name(cursor)
234-
charset = korp_conf['DBCHARSET']
235-
user = self._db_options['user']
236-
host = self._conn_params['host']
237-
for sql in [
238-
f"CREATE DATABASE {dbname} CHARACTER SET {charset};",
239-
f"GRANT ALL ON {dbname}.* TO '{user}'@'{host}';",
240-
]:
269+
charset = korp_conf["DBCHARSET"]
270+
collate = get_collation(korp_conf)
271+
if collate:
272+
collate = f" COLLATE {collate}"
273+
user = self._db_options["user"]
274+
host = self._conn_params["host"]
275+
sqls = [
276+
f"CREATE DATABASE {dbname} CHARACTER SET {charset}{collate};",
277+
f"GRANT ALL ON {dbname}.* TO '{user}'@'{host}';",
278+
]
279+
for sql in sqls:
241280
self.execute(sql, cursor)
242281
except MySQLdb.Error as exc:
243282
self.create_error = {
@@ -302,7 +341,7 @@ def compile_filenames(filenames):
302341
303342
If a filename does not end in ".tsv", add the suffix. If a
304343
filename does not begin with ".*/", add the prefix.
305-
Replace corpus name placeholder "{corpus} with
344+
Replace corpus name placeholder "{corpus}" with
306345
"(?P<corpus>[a-zA-Z0-9_-]+?)".
307346
"""
308347
filenames_re = []
@@ -369,28 +408,36 @@ def expand_vars(tableinfo_items):
369408
if not filename.startswith(".*/"):
370409
filename = ".*/" + filename
371410
self._tabletype_patts[info["tabletype"]].append(filename)
411+
self._tabletype_info[info["tabletype"]].append(info)
372412
self._tableinfo = tableinfo
373413

374414
def import_tables(self, corpora, tabletypes=None):
375415
"""Import database tables of tabletypes (or all) for corpora.
376416
377417
Import database tables in TSV or SQL files matching patterns
378418
in self._tabletype_patts for the tabletypes and corpora.
419+
Possibly existing tables are first dropped to avoid
420+
interference between tests.
421+
379422
corpora and tabletypes may be single strings or lists of
380423
strings. If tabletypes is None (default), import all types of
381424
tables for corpora.
382425
"""
383-
files = self._find_table_files(corpora, tabletypes)
384-
self.import_table_files(files)
385-
386-
def _find_table_files(self, corpora, tabletypes=None):
387-
"""Return a list of table data file names for corpora and tabletypes."""
388426
if tabletypes is None:
389427
tabletypes = self._tabletype_patts.keys()
390428
elif isinstance(tabletypes, str):
391429
tabletypes = [tabletypes]
392430
if isinstance(corpora, str):
393431
corpora = [corpora]
432+
files = self._find_table_files(corpora, tabletypes)
433+
# It would probably be more efficient to delete existing data
434+
# than to drop and re-create tables, but the latter is simpler
435+
# to implement
436+
self.drop_tables(corpora, tabletypes)
437+
self.import_table_files(files)
438+
439+
def _find_table_files(self, corpora, tabletypes):
440+
"""Return a list of table data file names for corpora and tabletypes."""
394441
files = []
395442
for ext in ["sql", "tsv"]:
396443
for filename in self._datadir.rglob(f"*.{ext}"):
@@ -403,8 +450,29 @@ def _find_table_files(self, corpora, tabletypes=None):
403450
files.append(filename)
404451
return files
405452

453+
def drop_tables(self, corpora, tabletypes):
    """Drop possibly existing tables for tabletypes and corpora.

    corpora and tabletypes are iterables of strings; corpora may
    also be a single corpus id (string).

    For each table info item registered in self._tabletype_info
    for a table type in tabletypes, drop the table: if the table
    name contains a placeholder ("{"), the name is expanded with
    self._make_tablename for each corpus in corpora; otherwise the
    table name is used as such.

    If no table names result (no corpora, or no table info for
    tabletypes), nothing is executed.
    """
    if isinstance(corpora, str):
        corpora = [corpora]
    # Materialize corpora, as it is iterated once for each
    # per-corpus table info item and might be a one-shot iterator
    corpora = list(corpora)
    tables = []
    for tabletype in tabletypes:
        # Use .get() to avoid adding empty entries for unknown
        # table types to the defaultdict
        for info in self._tabletype_info.get(tabletype, []):
            if "{" in info["tablename"]:
                # Table name contains corpus id
                tables.extend(self._make_tablename(info, corpus)
                              for corpus in corpora)
            else:
                tables.append(info["tablename"])
    if not tables:
        # "DROP TABLE IF EXISTS ;" would be an SQL syntax error
        return
    # Remove duplicate table names while preserving their order, as
    # the same table should not be listed twice in the statement
    tablenames = ", ".join(f"`{table}`" for table in dict.fromkeys(tables))
    self.execute(f"DROP TABLE IF EXISTS {tablenames};")
469+
406470
def import_table_files(self, tablefile_globs):
407-
"""Import table data from files matched by tablefile_globs."""
471+
"""Import table data from files matched by tablefile_globs.
472+
473+
Note that unlike import_tables, import_table_files does *not*
474+
first drop possibly existing tables.
475+
"""
408476

409477
def find_files(tablefile_glob):
410478
"""Find files in tablefile_globs or use directly if absolute.

0 commit comments

Comments
 (0)