 logger = logging.getLogger(__name__)


-def get_embedder(embedding_model_name: str):
-    """Define embedder to convert text into vectors."""
-    model_kwargs = {"device": DEVICE}
-    embedder = HuggingFaceEmbeddings(
+def get_embedder(embedding_model_name: str) -> HuggingFaceEmbeddings:
+    """Initialize an embedder to convert text into vectors."""
+    return HuggingFaceEmbeddings(
         model_name=embedding_model_name,
-        model_kwargs=model_kwargs,
+        model_kwargs={"device": DEVICE},
         show_progress=True,
     )

-    return embedder
-

 def ingest(
     meta_lookup: dict[pathlib.Path, dict],
@@ -44,7 +41,6 @@ def ingest(
     embedding_model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
     mode: str = "overwrite",
     collection_metadata: dict = {},
-    logs_folder_id: str = None,
 ):
     """Load documents into a vectorstore."""
     # Get documents
@@ -59,7 +55,9 @@ def ingest(
         file_name = source.stem
         document.metadata["_source"] = document.metadata["source"]
         document.metadata["source"] = file_name
-        chunks = split_document(document, extension, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+        chunks = split_document(
+            document, extension, chunk_size=chunk_size, chunk_overlap=chunk_overlap
+        )
         # Attach metadata to each chunk
         for chunk in chunks:
             path_metadata = meta_lookup.get(source, {})
@@ -101,11 +99,15 @@ def ingest(
     logger.info(f"Collection {collection_name} created")

     # Load the documents
-    logger.info(f"Loading {len(all_documents)} embeddings to {PGVECTOR_HOST} - {PGVECTOR_DATABASE_NAME} - {collection_name}")
+    logger.info(
+        f"Loading {len(all_documents)} embeddings to {PGVECTOR_HOST} - {PGVECTOR_DATABASE_NAME} - {collection_name}"
+    )
     db.add_documents(documents=all_documents)
     logger.info(f"Successfully loaded {len(all_documents)} embeddings")

-    directory_source_url_chunks = [list(origin_url) + [chunks] for origin_url, chunks in origin_urls.items()]
+    directory_source_url_chunks = [
+        list(origin_url) + [chunks] for origin_url, chunks in origin_urls.items()
+    ]
     df = pd.DataFrame(directory_source_url_chunks, columns=["origin", "url", "chunks"])
     filename = f"{PGVECTOR_HOST} - {collection_name} - {datetime.now()}.csv"
     outpath = DIRECTORY_PATH / "logs" / filename
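
For reference, a minimal usage sketch of the two refactored helpers (not part of the commit). Only part of ingest()'s signature is visible in the hunks above, so the meta_lookup contents, the document folder, and the collection_metadata values below are assumptions:

import pathlib

# get_embedder() now returns the HuggingFaceEmbeddings instance directly.
embedder = get_embedder("sentence-transformers/all-MiniLM-L6-v2")
vector = embedder.embed_query("example query")  # list[float] embedding

# Hypothetical per-file metadata mapping, keyed by source path as ingest() expects.
docs_dir = pathlib.Path("data/docs")
meta_lookup = {path: {"origin": "docs"} for path in docs_dir.glob("*.md")}

ingest(
    meta_lookup=meta_lookup,
    embedding_model_name="sentence-transformers/all-MiniLM-L6-v2",
    mode="overwrite",
    collection_metadata={"description": "example collection"},  # assumed metadata
)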