@@ -161,7 +161,6 @@ def __rows_as_void(rows):
161161 void_dtype = np .dtype ((np .void , arr_contiguous .dtype .itemsize * arr_contiguous .shape [1 ]))
162162 return arr_contiguous .view (void_dtype ).ravel ()
163163
164-
165164 corpus_philo_ids_void = __rows_as_void (corpus_philo_ids )
166165 philo_ids_void = __rows_as_void (philo_ids )
167166 matching_indices_void = np .isin (philo_ids_void , corpus_philo_ids_void )
@@ -228,15 +227,10 @@ def search_word(db_path, hitlist_filename, corpus_file=None):
228227def search_phrase (db_path , hitlist_filename , corpus_file = None ):
229228 """Phrase searches where words need to be in a specific order"""
230229 word_groups = get_word_groups (f"{ hitlist_filename } .terms" )
231- object_level = None
232- corpus_philo_ids = None
233- if corpus_file is not None :
234- corpus_philo_ids , object_level = get_corpus_philo_ids (corpus_file )
235230 common_object_ids = get_cooccurrence_groups (
236- db_path , word_groups , corpus_philo_ids = corpus_philo_ids , object_level = object_level , cooc_order = True
231+ db_path , word_groups , corpus_file = corpus_file , cooc_order = True
237232 )
238233 mapping_order = next (common_object_ids )
239-
240234 with open (hitlist_filename , "wb" ) as output_file :
241235 for philo_id_groups in common_object_ids :
242236 for group_combination in product (* philo_id_groups ):
@@ -256,12 +250,8 @@ def search_phrase(db_path, hitlist_filename, corpus_file=None):
256250def search_within_word_span (db_path , hitlist_filename , n , cooc_order , exact_distance , corpus_file = None ):
257251 """Search for co-occurrences of multiple words within n words of each other in the database."""
258252 word_groups = get_word_groups (f"{ hitlist_filename } .terms" )
259- object_level = None
260- corpus_philo_ids = None
261- if corpus_file is not None :
262- corpus_philo_ids , object_level = get_corpus_philo_ids (corpus_file )
263253 common_object_ids = get_cooccurrence_groups (
264- db_path , word_groups , corpus_philo_ids = corpus_philo_ids , object_level = object_level , cooc_order = cooc_order
254+ db_path , word_groups , corpus_file = corpus_file , cooc_order = cooc_order
265255 )
266256
267257 if cooc_order is True :
@@ -302,16 +292,11 @@ def search_within_word_span(db_path, hitlist_filename, n, cooc_order, exact_dist
302292def search_within_text_object (db_path , hitlist_filename , level , cooc_order , corpus_file = None ):
303293 """Search for co-occurrences of multiple words in the same sentence in the database."""
304294 word_groups = get_word_groups (f"{ hitlist_filename } .terms" )
305- object_level = None
306- corpus_philo_ids = None
307- if corpus_file is not None :
308- corpus_philo_ids , object_level = get_corpus_philo_ids (corpus_file )
309295 common_object_ids = get_cooccurrence_groups (
310296 db_path ,
311297 word_groups ,
312298 level = level ,
313- corpus_philo_ids = corpus_philo_ids ,
314- object_level = object_level ,
299+ corpus_file = corpus_file ,
315300 cooc_order = cooc_order ,
316301 )
317302
@@ -358,7 +343,7 @@ def get_word_groups(terms_file):
358343
359344
360345def get_cooccurrence_groups (
361- db_path , word_groups , level = "sent" , corpus_philo_ids = None , object_level = None , cooc_order = False
346+ db_path , word_groups , level = "sent" , corpus_file = None , cooc_order = False
362347):
363348 cooc_slice = 6
364349 if level == "para" :
@@ -396,8 +381,8 @@ def one_word_generator(word):
396381 else :
397382 group_generators .append (merge_word_group (txn , words , chunk_size = 36 * 1000 ))
398383
399- if corpus_philo_ids is not None :
400- first_group_data = filter_philo_ids (first_group_data , corpus_philo_ids , object_level )
384+ if corpus_file is not None :
385+ first_group_data = filter_philo_ids (corpus_file , first_group_data )
401386
402387 group_data = [None for _ in range (len (word_groups ) - 1 )] # Start with None for each group
403388 break_out = False
0 commit comments