3434DEFAULT_STEMMER = "none" # Exact word matches
3535DUCKDB_DEFAULT_INDEX_FILENAME = "index.duckdb"
3636DUCKDB_DEFAULT_PARTIAL_INDEX_FILENAME = "partial-index.duckdb"
37+
38+ ATTACH_DATABASE = "ATTACH '{database}' as db; USE db;"  # template: attach the file at {database} under the alias `db` and switch to it
39+ ATTACH_READ_ONLY_DATABASE = "ATTACH '{database}' as db (READ_ONLY); USE db;"  # same as ATTACH_DATABASE, with the READ_ONLY attach option
40+ LOAD_FTS_COMMAND = "INSTALL 'fts'; LOAD 'fts';"  # installs, then loads, the `fts` extension
41+ DISABLE_EXTERNAL_ACCESS_COMMAND = "SET enable_external_access=false;"  # turns the enable_external_access setting off
42+ LOCK_CONFIG_COMMAND = "SET lock_configuration=true;"  # turns the lock_configuration setting on
43+ SET_EXTENSIONS_DIRECTORY_COMMAND = "SET extension_directory='{directory}';"  # template: {directory} becomes the extension_directory setting
44+
3745CREATE_INDEX_COMMAND = (  # f-string: ROW_IDX_COLUMN is interpolated now; the doubled braces leave literal {columns} and {stemmer} placeholders
3846 f"PRAGMA create_fts_index('data', '{ ROW_IDX_COLUMN } ', {{columns}}, stemmer='{{stemmer}}', overwrite=1);"
3947)
@@ -216,16 +224,7 @@ def _sql(con: duckdb.DuckDBPyConnection, query: str) -> duckdb.DuckDBPyRelation:
216224 return out
217225
218226 with tempfile .TemporaryDirectory (suffix = ".duckdb" ) as tmp_dir :
219- with duckdb .connect (":memory:" ) as con :
220- # configure duckdb extensions
221- if extensions_directory is not None :
222- con .execute (SET_EXTENSIONS_DIRECTORY_COMMAND .format (directory = extensions_directory ))
223- con .execute (INSTALL_AND_LOAD_EXTENSION_COMMAND )
224-
225- # init
226- _sql (con , "ATTACH '%database%' as db;" )
227- _sql (con , "USE db;" )
228-
227+ with duckdb_connect (database = database , extensions_directory = extensions_directory ) as con :
229228 # check input_table and get number of rows
230229 _count = _sql (con , "SELECT count(*) FROM %input_table%;" ).fetchone ()
231230 if _count and isinstance (_count [0 ], int ):
@@ -255,7 +254,8 @@ def _sql(con: duckdb.DuckDBPyConnection, query: str) -> duckdb.DuckDBPyRelation:
255254 )
256255
257256 # create fields table
258- field_values = ", " .join (f"({ i } , '{ field } ')" for i , field in enumerate (columns ))
257+
258+ field_values = ", " .join (f"({ i } , { varchar_sql (field )} )" for i , field in enumerate (columns ))
259259 _sql (
260260 con ,
261261 """
@@ -278,17 +278,8 @@ def _sql(con: duckdb.DuckDBPyConnection, query: str) -> duckdb.DuckDBPyRelation:
278278 batch_size = 1 + count // num_jobs
279279 commands = [
280280 (
281- (
282- SET_EXTENSIONS_DIRECTORY_COMMAND .format (directory = extensions_directory )
283- if extensions_directory is not None
284- else ""
285- )
286- + INSTALL_AND_LOAD_EXTENSION_COMMAND
287- + (
288- "ATTACH IF NOT EXISTS '%database%' as db (READ_ONLY);" # nosec - tmp_dir, batch_size, rank and i are safe
289- "USE db;"
290- f"ATTACH '{ tmp_dir } /tmp_{ rank } _{ i } .duckdb' as tmp_{ rank } _{ i } ;"
291- f"""
281+ f"ATTACH '{ tmp_dir } /tmp_{ rank } _{ i } .duckdb' as tmp_{ rank } _{ i } ;" # nosec - tmp_dir, batch_size, rank and i are safe
282+ f"""
292283 CREATE TABLE tmp_{ rank } _{ i } .tokenized AS (
293284 SELECT unnest(%fts_schema%.tokenize(fts_ii."{ column } ")) AS w,
294285 { rank * batch_size } + row_number() OVER () - 1 AS docid,
@@ -299,14 +290,13 @@ def _sql(con: duckdb.DuckDBPyConnection, query: str) -> duckdb.DuckDBPyRelation:
299290 );
300291 CHECKPOINT;
301292 """
302- )
303293 )
304294 for rank in range (num_jobs )
305295 for i , column in enumerate (columns )
306296 ]
307297
308298 def _parallel_sql (command : str ) -> None :
309- with duckdb . connect ( ":memory:" ) as rank_con :
299+ with duckdb_connect ( database = database , extensions_directory = extensions_directory ) as rank_con :
310300 _sql (rank_con , command )
311301
312302 thread_map (_parallel_sql , commands , desc = "Tokenize" )
@@ -324,16 +314,9 @@ def _parallel_sql(command: str) -> None:
324314 # """)
325315 # union_fields_query = " UNION ALL ".join(f"SELECT * FROM tmp.tokenized_{i}" for i in range(len(columns)))
326316
327- with duckdb .connect (":memory:" ) as con :
328- # configure duckdb extensions
329- if extensions_directory is not None :
330- con .execute (SET_EXTENSIONS_DIRECTORY_COMMAND .format (directory = extensions_directory ))
331- con .execute (INSTALL_AND_LOAD_EXTENSION_COMMAND )
332-
317+ with duckdb_connect (database = database , extensions_directory = extensions_directory ) as con :
333318 # init
334319 _sql (con , f"ATTACH '{ tmp_dir } /tmp.duckdb' as tmp;" ) # nosec - tmp_dir is safe
335- _sql (con , "ATTACH '%database%' as db;" )
336- _sql (con , "USE db;" )
337320 _sql (
338321 con ,
339322 ";" .join (
@@ -526,3 +509,32 @@ def _parallel_sql(command: str) -> None:
526509 """ ,
527510 )
528511 _sql (con , "CHECKPOINT;" )
512+
513+
def varchar_sql(value: str) -> str:
    """Return `value` as a single-quoted SQL string literal.

    Every single quote inside `value` is doubled so that it cannot
    terminate the literal early.
    """
    escaped = value.replace("'", "''")
    return f"'{escaped}'"
517+
518+
def key_sql(value: str) -> str:
    """Return `value` as a double-quoted SQL identifier.

    Every double quote inside `value` is doubled so that it cannot
    terminate the identifier early.
    """
    quote = '"'
    return "".join((quote, value.replace(quote, quote * 2), quote))
522+
523+
def duckdb_connect(
    index_file_location: Optional[str] = None,
    database: Optional[str] = None,
    extensions_directory: Optional[str] = None,
    read_only: bool = False,
    **kwargs: Any,
) -> duckdb.DuckDBPyConnection:
    """Open a DuckDB session with the fts extension loaded and the configuration locked.

    Args:
        index_file_location: DuckDB file to open as the main database. When None,
            an in-memory (":memory:") session is opened instead.
        database: optional DuckDB file to attach under the alias `db`; it is also
            made the current database with `USE db`.
        extensions_directory: optional value for the `extension_directory` setting,
            applied before the fts extension is installed and loaded.
        read_only: when True, `database` is attached with the READ_ONLY option.
            It only affects the attached `database`, not `index_file_location`.
        **kwargs: forwarded unchanged to `duckdb.connect`.

    Returns:
        The configured connection. The caller is responsible for closing it
        (it can be used as a context manager).

    Raises:
        Any exception raised by DuckDB while connecting or configuring the
        session. If configuration fails, the connection is closed before the
        exception propagates, so no handle is leaked.
    """
    con = duckdb.connect(":memory:" if index_file_location is None else index_file_location, **kwargs)
    try:
        if database is not None:
            attach_command = ATTACH_READ_ONLY_DATABASE if read_only else ATTACH_DATABASE
            # The path is embedded in a single-quoted SQL literal: double any quote it contains.
            con.execute(attach_command.format(database=database.replace("'", "''")))
        if extensions_directory is not None:
            # Same escaping as above, for the same reason.
            con.execute(
                SET_EXTENSIONS_DIRECTORY_COMMAND.format(directory=extensions_directory.replace("'", "''"))
            )
        con.sql(LOAD_FTS_COMMAND)
        # Keep these two last: once applied, the settings above can no longer be changed.
        con.sql(DISABLE_EXTERNAL_ACCESS_COMMAND)
        con.sql(LOCK_CONFIG_COMMAND)
    except Exception:
        # Nobody else holds a reference to the connection yet; release it before re-raising.
        con.close()
        raise
    return con
0 commit comments