1616"""
1717
1818
19- # TODO:
20- # - Specify default collation for db
21- # - Remove corpus data before importing to a multi-corpus table
22-
23-
2419import csv
2520import re
2621
@@ -63,6 +58,11 @@ class KorpDatabase:
6358 "create-user" : "Use user {} to create the Korp MySQL test database" ,
6459 "create-password" : (
6560 "Use password {} to create the Korp MySQL test database" ),
61+ "collate" : (
62+ "Use {} as the Korp MySQL test database collation."
63+ " If not specified, use the collation of the Korp MySQL database,"
64+ " or if that cannot be accessed, the default collation for the"
65+ " Korp MySQL database character set." ),
6666 }
6767 # The custom pytest command-line options
6868 _pytest_db_options = {}
@@ -91,6 +91,8 @@ def __init__(self, datadir):
9191 self ._tableinfo = []
9292 # Filename patterns by table type
9393 self ._tabletype_patts = defaultdict (list )
94+ # Table info by table type
95+ self ._tabletype_info = defaultdict (list )
9496 # Initialize self._tableinfo, self._tabletype_patts and self._tabletype_info
9597 self ._read_tableinfo ()
9698 # If True, use an existing table in the database, so do not
@@ -221,6 +223,39 @@ def create(self):
221223 _make_db_name, user is taken from _db_options and host from
222224 _conn_params.
223225 """
226+
def get_collation(korp_conf):
    """Get the collation for the Korp test database.

    If the custom Pytest command-line option --db-collate has been
    specified, return its value.
    Otherwise, if the Korp MySQL database can be accessed (with the
    parameters defined in the Korp configuration korp_conf), return
    its collation.
    Otherwise, return the empty string to use the default collation
    for the database character set.

    The return value is always a string (never None), so that it
    can safely be tested for truth and interpolated into an SQL
    statement.
    """
    collate = self._db_options["collate"] or ""
    if collate:
        # An explicitly specified collation takes precedence
        return collate
    try:
        # Try to access the Korp database
        with MySQLdb.Connect(
            host=korp_conf["DBHOST"],
            port=korp_conf["DBPORT"],
            user=korp_conf["DBUSER"],
            password=korp_conf["DBPASSWORD"],
            database=korp_conf["DBNAME"],
            use_unicode=True,
        ) as conn:
            cursor = conn.cursor()
            # Alternative ways to find out db collation:
            # https://stackoverflow.com/a/76490467
            cursor.execute("SELECT @@collation_database;")
            row = cursor.fetchone()
    except MySQLdb.Error:
        # Deliberate best effort: if the Korp database cannot be
        # accessed, fall back to the default collation of the
        # character set
        return ""
    # Guard against a missing row or a NULL value, so that the
    # result is never None
    return (row[0] if row else None) or ""
258+
224259 if self .dbname is not None :
225260 # If a database has already been created, do not create
226261 # another
@@ -231,13 +266,17 @@ def create(self):
231266 with self ._connect () as conn :
232267 cursor = conn .cursor ()
233268 dbname = self ._make_db_name (cursor )
234- charset = korp_conf ['DBCHARSET' ]
235- user = self ._db_options ['user' ]
236- host = self ._conn_params ['host' ]
237- for sql in [
238- f"CREATE DATABASE { dbname } CHARACTER SET { charset } ;" ,
239- f"GRANT ALL ON { dbname } .* TO '{ user } '@'{ host } ';" ,
240- ]:
269+ charset = korp_conf ["DBCHARSET" ]
270+ collate = get_collation (korp_conf )
271+ if collate :
272+ collate = f" COLLATE { collate } "
273+ user = self ._db_options ["user" ]
274+ host = self ._conn_params ["host" ]
275+ sqls = [
276+ f"CREATE DATABASE { dbname } CHARACTER SET { charset } { collate } ;" ,
277+ f"GRANT ALL ON { dbname } .* TO '{ user } '@'{ host } ';" ,
278+ ]
279+ for sql in sqls :
241280 self .execute (sql , cursor )
242281 except MySQLdb .Error as exc :
243282 self .create_error = {
@@ -302,7 +341,7 @@ def compile_filenames(filenames):
302341
303342 If a filename does not end in ".tsv", add the suffix. If a
304343 filename does not begin with ".*/", add the prefix.
305- Replace corpus name placeholder "{corpus}” with
344+ Replace corpus name placeholder "{corpus}" with
306345 "(?P<corpus>[a-zA-Z0-9_-]+?)".
307346 """
308347 filenames_re = []
@@ -369,28 +408,36 @@ def expand_vars(tableinfo_items):
369408 if not filename .startswith (".*/" ):
370409 filename = ".*/" + filename
371410 self ._tabletype_patts [info ["tabletype" ]].append (filename )
411+ self ._tabletype_info [info ["tabletype" ]].append (info )
372412 self ._tableinfo = tableinfo
373413
def import_tables(self, corpora, tabletypes=None):
    """Import database tables of tabletypes (or all) for corpora.

    Import database tables in TSV or SQL files matching patterns
    in self._tabletype_patts for the tabletypes and corpora.
    Possibly existing tables are first dropped to avoid
    interference between tests.

    corpora and tabletypes may be single strings or iterables
    (typically lists) of strings. If tabletypes is None (default),
    import all types of tables for corpora.
    """
    # Both arguments are consumed twice below (for finding the table
    # files and for dropping the tables), so materialize them as
    # lists: a one-shot iterable would otherwise be exhausted after
    # the first use
    if tabletypes is None:
        tabletypes = list(self._tabletype_patts)
    elif isinstance(tabletypes, str):
        tabletypes = [tabletypes]
    else:
        tabletypes = list(tabletypes)
    if isinstance(corpora, str):
        corpora = [corpora]
    else:
        corpora = list(corpora)
    files = self._find_table_files(corpora, tabletypes)
    # It would probably be more efficient to delete existing data
    # than to drop and re-create tables, but the latter is simpler
    # to implement
    self.drop_tables(corpora, tabletypes)
    self.import_table_files(files)
438+
439+ def _find_table_files (self , corpora , tabletypes ):
440+ """Return a list of table data file names for corpora and tabletypes."""
394441 files = []
395442 for ext in ["sql" , "tsv" ]:
396443 for filename in self ._datadir .rglob (f"*.{ ext } " ):
@@ -403,8 +450,29 @@ def _find_table_files(self, corpora, tabletypes=None):
403450 files .append (filename )
404451 return files
405452
def drop_tables(self, corpora, tabletypes):
    """Drop possibly existing tables for tabletypes and corpora.

    corpora and tabletypes are iterables of strings.

    Each table name is listed only once in the DROP TABLE
    statement. If no table names result (for example, tabletypes is
    empty or has no table info), no SQL is executed.
    """
    # corpora is iterated once for each corpus-specific table info,
    # so make sure that it can be iterated repeatedly
    corpora = list(corpora)
    tables = []
    for tabletype in tabletypes:
        # .get() avoids adding an empty entry to the defaultdict for
        # a table type without info
        for info in self._tabletype_info.get(tabletype, ()):
            if "{" in info["tablename"]:
                # Table name contains corpus id
                tables.extend(self._make_tablename(info, corpus)
                              for corpus in corpora)
            else:
                tables.append(info["tablename"])
    if not tables:
        # "DROP TABLE IF EXISTS ;" would be an SQL syntax error
        return
    # dict.fromkeys removes duplicates while preserving order; a
    # repeated table name is redundant and may be rejected by the
    # database server
    tablenames = ", ".join(f"`{table}`" for table in dict.fromkeys(tables))
    self.execute(f"DROP TABLE IF EXISTS {tablenames} ;")
469+
406470 def import_table_files (self , tablefile_globs ):
407- """Import table data from files matched by tablefile_globs."""
471+ """Import table data from files matched by tablefile_globs.
472+
473+ Note that unlike import_tables, import_table_files does *not*
474+ first drop possibly existing tables.
475+ """
408476
409477 def find_files (tablefile_glob ):
410478 """Find files in tablefile_globs or use directly if absolute.
0 commit comments