Commit 1341795

Use selenium for testing
1 parent 15b3e77

File tree: 8 files changed (+345 / -209 lines)


.gitignore

Lines changed: 3 additions & 1 deletion

@@ -7,4 +7,6 @@ __pycache__/
 .env.*
 .idea/
 .vscode/
-*.db
+*.db
+.mypy_cache/
+.cache/

config.yaml

Lines changed: 0 additions & 19 deletions
This file was deleted (superseded by config/config.yaml, below).

config/config.yaml

Lines changed: 18 additions & 0 deletions

@@ -0,0 +1,18 @@
+version: v1
+ingest_threads: 8
+collections:
+  - name: "Source Collection"
+    id: "source_collection"
+    mode: "overwrite"
+    chunk_size: 500
+    chunk_overlap: 250
+    embedding_model: "all-MiniLM-L6-v2"
+    metadata:
+      key: "value"
+    sources:
+      - type: "source"
+        url_fragment: "/departments"
+        recursive: true
+        attachments: true
+        metadata:
+          key: "value"
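
As an aside, a config shaped like this is straightforward to consume. Below is a minimal sketch, assuming PyYAML is installed; the load_config helper and its default path are illustrative and not part of this commit.

import yaml  # PyYAML, assumed available; the loader is not shown in this diff


def load_config(path: str = "config/config.yaml") -> dict:
    """Parse the ingestion config into a plain dict."""
    with open(path) as f:
        return yaml.safe_load(f)


config = load_config()
for collection in config["collections"]:
    # Each collection carries its own chunking and embedding settings.
    print(collection["id"], collection["chunk_size"], collection["embedding_model"])
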

scripts/constants.py

Lines changed: 9 additions & 5 deletions

@@ -7,16 +7,20 @@
 load_dotenv()
 
 # PATHS
-DIRECTORY_PATH = pathlib.Path(os.path.dirname(__file__)).parent
+DIRECTORY_PATH = pathlib.Path.cwd()
 KNOWLEDGE_REPOSITORY_PATH = DIRECTORY_PATH / "knowledge"
 SOURCE_RESPOSITORY_PATH = KNOWLEDGE_REPOSITORY_PATH / "source"
 
 # INGEST
-DEVICE = "cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu")
+DEVICE = (
+    "cuda"
+    if torch.cuda.is_available()
+    else ("mps" if torch.backends.mps.is_available() else "cpu")
+)
 
 # PGVECTOR
-PGVECTOR_USER = os.environ.get("PGVECTOR_USER", None)
-PGVECTOR_PASS = os.environ.get("PGVECTOR_PASS", None)
-PGVECTOR_DATABASE_NAME = os.environ.get("PGVECTOR_DATABASE_NAME", None)
+PGVECTOR_USER = os.environ.get("PGVECTOR_USER")
+PGVECTOR_PASS = os.environ.get("PGVECTOR_PASS")
+PGVECTOR_DATABASE_NAME = os.environ.get("PGVECTOR_DATABASE_NAME")
 PGVECTOR_HOST = os.environ.get("PGVECTOR_URI", "localhost")
 PGVECTOR_PORT = int(os.environ.get("PGVECTOR_PORT", 5432))
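
For context, the reformatted DEVICE expression is a priority cascade over the available torch backends, and dropping the explicit None defaults is safe because os.environ.get already returns None for unset variables. A standalone sketch of the same behavior, assuming torch is installed:

import os

import torch

# Same cascade as DEVICE above: prefer CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# os.environ.get returns None when a variable is unset and no default is
# passed, so the removed ", None" arguments were redundant.
assert os.environ.get("SOME_UNSET_VARIABLE_XYZ") is None
print(device)
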

scripts/ingest_data.py

Lines changed: 13 additions & 11 deletions

@@ -23,17 +23,14 @@
 logger = logging.getLogger(__name__)
 
 
-def get_embedder(embedding_model_name: str):
-    """Define embedder to convert text into vectors."""
-    model_kwargs = {"device": DEVICE}
-    embedder = HuggingFaceEmbeddings(
+def get_embedder(embedding_model_name: str) -> HuggingFaceEmbeddings:
+    """Initialize an embedder to convert text into vectors."""
+    return HuggingFaceEmbeddings(
         model_name=embedding_model_name,
-        model_kwargs=model_kwargs,
+        model_kwargs={"device": DEVICE},
         show_progress=True,
     )
 
-    return embedder
-
 
 def ingest(
     meta_lookup: dict[pathlib.Path, dict],
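
For reference, get_embedder now returns the HuggingFaceEmbeddings instance directly instead of binding it to a local first. A usage sketch, assuming the class comes from langchain_community (the import is not visible in this diff) and that the model is downloaded on first use:

from langchain_community.embeddings import HuggingFaceEmbeddings

# Mirrors get_embedder above; "cpu" stands in for the DEVICE constant.
embedder = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": "cpu"},
    show_progress=True,
)

# embed_query returns a single vector; all-MiniLM-L6-v2 yields 384 dimensions.
vector = embedder.embed_query("What departments are listed?")
print(len(vector))  # 384
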
@@ -44,7 +41,6 @@ def ingest(
     embedding_model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
     mode: str = "overwrite",
     collection_metadata: dict = {},
-    logs_folder_id: str = None,
 ):
     """Load documents into a vectorstore."""
     # Get documents
@@ -59,7 +55,9 @@
         file_name = source.stem
         document.metadata["_source"] = document.metadata["source"]
         document.metadata["source"] = file_name
-        chunks = split_document(document, extension, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+        chunks = split_document(
+            document, extension, chunk_size=chunk_size, chunk_overlap=chunk_overlap
+        )
         # Attach metadata to each chunk
         for chunk in chunks:
             path_metadata = meta_lookup.get(source, {})
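
split_document itself is not shown in this diff, so purely as an illustration of what chunk_size and chunk_overlap mean (500 and 250 in the new config): chunking is typically a sliding window in which consecutive chunks share chunk_overlap characters. A generic sketch, not the project's implementation:

def chunk_text(text: str, chunk_size: int = 500, chunk_overlap: int = 250) -> list[str]:
    """Split text into chunk_size windows, advancing by chunk_size - chunk_overlap
    each step so that neighboring chunks overlap."""
    step = chunk_size - chunk_overlap
    return [
        text[i : i + chunk_size]
        for i in range(0, max(len(text) - chunk_overlap, 1), step)
    ]


chunks = chunk_text("x" * 1000)
print(len(chunks))  # 3 windows: 0-500, 250-750, 500-1000
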
@@ -101,11 +99,15 @@
     logger.info(f"Collection {collection_name} created")
 
     # Load the documents
-    logger.info(f"Loading {len(all_documents)} embeddings to {PGVECTOR_HOST} - {PGVECTOR_DATABASE_NAME} - {collection_name}")
+    logger.info(
+        f"Loading {len(all_documents)} embeddings to {PGVECTOR_HOST} - {PGVECTOR_DATABASE_NAME} - {collection_name}"
+    )
     db.add_documents(documents=all_documents)
     logger.info(f"Successfully loaded {len(all_documents)} embeddings")
 
-    directory_source_url_chunks = [list(origin_url) + [chunks] for origin_url, chunks in origin_urls.items()]
+    directory_source_url_chunks = [
+        list(origin_url) + [chunks] for origin_url, chunks in origin_urls.items()
+    ]
     df = pd.DataFrame(directory_source_url_chunks, columns=["origin", "url", "chunks"])
     filename = f"{PGVECTOR_HOST} - {collection_name} - {datetime.now()}.csv"
     outpath = DIRECTORY_PATH / "logs" / filename
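
Finally, the directory_source_url_chunks reshaping flattens a dict keyed by (origin, url) pairs into DataFrame rows. A small self-contained sketch with hypothetical data (the example.com URLs and counts are made up):

import pandas as pd

# Hypothetical stand-in for origin_urls as accumulated during ingest:
# (origin, url) keys mapped to the number of chunks loaded from that URL.
origin_urls = {
    ("source", "https://example.com/departments/a"): 12,
    ("source", "https://example.com/departments/b"): 7,
}

# Same reshaping as the diff: each key unpacks into two columns, with the
# chunk count appended as a third.
rows = [list(origin_url) + [chunks] for origin_url, chunks in origin_urls.items()]
df = pd.DataFrame(rows, columns=["origin", "url", "chunks"])
print(df)
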
