diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index e63b6c2..e82d87a 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -1,19 +1,19 @@
-{
- "name": "My Container",
- "build": {
- "context": "..",
- "dockerfile": "../Dockerfile"
- },
- "runArgs": [
- "--gpus", "all"
- ],
- "forwardPorts": [8888],
- "customizations": {
- "vscode": {
- "extensions": [
- "ms-python.python",
- "ms-toolsai.jupyter"
- ]
- }
- }
+{
+ "name": "My Container",
+ "build": {
+ "context": "..",
+ "dockerfile": "../Dockerfile"
+ },
+ "runArgs": [
+ "--gpus", "all"
+ ],
+ "forwardPorts": [8888],
+ "customizations": {
+ "vscode": {
+ "extensions": [
+ "ms-python.python",
+ "ms-toolsai.jupyter"
+ ]
+ }
+ }
}
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 960e9b5..233b286 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,27 +1,49 @@
-# notebooks and generated files
-test_import.ipynb
-
-# byte-compiled
-__pycache__/
-
-# environments
-.venv
-venv/
-
-# unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-*.py,cover
-.hypothesis/
-.pytest_cache/
-.pylintrc
-
-# vscode
+# notebooks and generated files
+test_import.ipynb
+
+# byte-compiled
+__pycache__/
+*.py[cod]
+*$py.class
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# environments
+.venv
+venv/
+
+# unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+.pylintrc
+
+# vscode
.vscode/*.*
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index 3236538..063487e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,44 +1,44 @@
-FROM nvidia/cuda:12.2.2-devel-ubuntu22.04
-
-# Set working directory
-WORKDIR /workspace
-
-# Set non-interactive installation mode and configure timezone
-ENV DEBIAN_FRONTEND=noninteractive
-ENV TZ=UTC
-
-# Environment variables for Llama library
-ENV LLAMA_CUBLAS=1
-ENV CMAKE_ARGS=-DLLAMA_CUBLAS=on
-ENV FORCE_CMAKE=1
-
-# Install Python, build tools, compilers, and git
-RUN apt-get update && apt-get install -y \
- python3-pip \
- python3-dev \
- build-essential \
- cmake \
- libblas-dev \
- liblapack-dev \
- gfortran \
- git \
- && rm -rf /var/lib/apt/lists/*
-
-# Update pip and install wheel
-RUN python3 -m pip install --upgrade pip wheel
-
-# Install Requirements
-COPY requirements.txt .
-RUN python3 -m pip install -r requirements.txt
-
-# Special installation for llama-cpp-python with GPU support
-RUN pip install llama-cpp-python==0.2.55 --no-cache-dir --force-reinstall --verbose
-
-# Force specific numpy version
-RUN python3 -m pip install numpy==1.26.2 --no-cache-dir --force-reinstall
-
-# Expose Jupyter port
-EXPOSE 8888
-
-# Start Jupyter Notebook
+FROM nvidia/cuda:12.2.2-devel-ubuntu22.04
+
+# Set working directory
+WORKDIR /workspace
+
+# Set non-interactive installation mode and configure timezone
+ENV DEBIAN_FRONTEND=noninteractive
+ENV TZ=UTC
+
+# Environment variables for Llama library
+ENV LLAMA_CUBLAS=1
+ENV CMAKE_ARGS=-DLLAMA_CUBLAS=on
+ENV FORCE_CMAKE=1
+
+# Install Python, build tools, compilers, and git
+RUN apt-get update && apt-get install -y \
+ python3-pip \
+ python3-dev \
+ build-essential \
+ cmake \
+ libblas-dev \
+ liblapack-dev \
+ gfortran \
+ git \
+ && rm -rf /var/lib/apt/lists/*
+
+# Update pip and install wheel
+RUN python3 -m pip install --upgrade pip wheel
+
+# Install Requirements
+COPY requirements.txt .
+RUN python3 -m pip install -r requirements.txt
+
+# Special installation for llama-cpp-python with GPU support
+RUN pip install llama-cpp-python==0.2.55 --no-cache-dir --force-reinstall --verbose
+
+# Force specific numpy version
+RUN python3 -m pip install numpy==1.26.2 --no-cache-dir --force-reinstall
+
+# Expose Jupyter port
+EXPOSE 8888
+
+# Start Jupyter Notebook
CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--no-browser", "--allow-root"]
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
index 5f0ec6a..abc6d4c 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,21 +1,21 @@
-MIT License
-
-Copyright (c) 2024 hc-sc-ocdo-bdpd
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+MIT License
+
+Copyright (c) 2024 hc-sc-ocdo-bdpd
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
index e8135bf..9400812 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,223 @@
-# faiss-search
-Tool designed to create the embedding and FAISS search pipeline
+# FAISS Retrieval Pipeline
+
+
+
+## Overview
+
+This library contains tools designed to encapsulate the FAISS-embedding pipeline, giving users the ability to chunk and embed a dataset of text documents and perform queries to retrieve the most relevant documents.
+
+
+
+## Installation and Dependencies
+
+The library can be installed using
+
+```
+pip install git+https://github.com/hc-sc-ocdo-bdpd/faiss-search.git
+```
+
+Note that this project is downstream from `file-processing-tools` and uses it as a dependency.
+It also requires `llama-cpp` as an optional dependency to load embedding models from a `.gguf` file. This must be installed using Docker.
+
+
+
+# Getting Started
+
+The faiss search library currently offers two imports: `SearchDirectory` and `DatasetVariability` that encapsulate the retrieval pipeline and generate statistics about the dataset, respectively. There is also extended functionality available for working with FAISS indexes.
+
+
+
+## Search Directory
+
+The `SearchDirectory` class contains the end-to-end functionality that can create a searchable database from a directory that can retrieve documents with similar semantic meanings to user inputted queries.
+
+
+
+### File Structure
+
+The functions contained within `SearchDirectory` are designed to manipulate existing data and store it in formats that lend themselves better to search features. All of the files created will be stored in the folder specified when the `SearchDirectory` object is created. For example, running the following code creates a `SearchDirectory` object that saves any files it creates to `path/to/folder`.
+
+```python
+from faiss_search import SearchDirectory
+search = SearchDirectory("path/to/folder")
+```
+
+If there are already files contained in `path/to/folder`, such as a `.faiss` file, it will be able to read and load these files so that previous steps do not need to be recomputed.
+
+Once all steps are completed, the following files will be contained in the specified folder:
+
+| File | Generated by | Purpose |
+| ---- | ------------ | ------- |
+| `report.csv` | `report_from_directory()` | Contains text info and metadata from files in a given directory |
+| `data_chunked.csv` | `chunk_text()` | Contains the chunked text data and corresponding file paths |
+| `setup_data.json` | `chunk_text()` and `load_embedding_model()` | Contains the number of chunks and the name of the embedding model being used as well as if it's a `.gguf` model |
+| `embedding_batches/` | `embed_text()` | Contains embedding batches in the form of `.npy` files |
+| `embeddings.npy` | `embed_text()` | Contains the complete embeddings of the chunks data |
+| `index.faiss` | `create_flat_index()`, `create_ivf_flat_index()`, and `create_hnsw_index()` | Contains the embeddings in the form of a searchable FAISS index |
+
+
+
+### Generating Report
+Creates - `report.csv`
+
+If working with a directory of files, the `report_from_directory()` function generates a `report.csv` file that contains the text and metadata of any text-based files in that directory.
+
+```python
+from faiss_search import SearchDirectory
+search = SearchDirectory("path/to/folder")
+search.report_from_directory("text/documents/directory/path")
+```
+
+
+
+### Chunking Text
+Parameters - `input_file_path`, `document_path_column`, `document_text_column`, `chunk_size`, `chunk_overlap`
+
+Creates - `data_chunked.csv`, `setup_data.json`
+
+Many embedding models and LLMs have a limited context window. This means any large text files need to be broken down into chunks before being passed into these models. The `chunk_text()` method is used for this purpose.
+
+It takes a `.csv` file containing a text field and a file path field as input. The `document_path_column` and `document_text_column` parameters are used to specify the column names in the `.csv`.
+
+```python
+from faiss_search import SearchDirectory
+search = SearchDirectory("path/to/folder")
+search.chunk_text("path/to/csv/file.csv",
+ document_path_column="file path",
+ document_text_column="content")
+```
+
+Alternatively, if a `report.csv` file was already generated and is contained in the folder then no `.csv` file needs to be specified and the function will use the `report.csv` file as the input.
+
+```python
+from faiss_search import SearchDirectory
+search = SearchDirectory("path/to/folder")
+search.report_from_directory("text/documents/directory/path")
+search.chunk_text()
+```
+
+Both of these approaches will produce a chunked CSV file.
+
+
+
+### Loading Embedding Model
+Parameters - `model_name`, `gguf`
+
+Creates - `setup_data.json`
+
+The `load_embedding_model()` function is used to specify and load the embedding model that will be used in the text embedding and search steps. This function supports models that are hosted in the `sentence_transformers` library when `gguf` is set to `False` and will load a specified `.gguf` file when set to `True`.
+
+```python
+search.load_embedding_model("paraphrase-MiniLM-L3-v2")
+```
+
+
+
+### Text Embedding
+Parameters - `row_start`, `row_end`, `batch_size`
+
+Creates - `embedding_batches/`, `embeddings.npy`
+
+The `embed_text()` function takes in the `data_chunked.csv` file and outputs an embedding file called `embeddings.npy`. Because the embeddings can be a time intensive computation, the embeddings are saved in batches of a specified size to the `embedding_batches/` folder in order to save progress. Once all of the chunk embeddings are saved in the `embedding_batches` folder the embeddings are combined and saved to `embeddings.npy`.
+
+```python
+from faiss_search import SearchDirectory
+search = SearchDirectory("path/to/folder")
+search.report_from_directory("text/documents/directory/path")
+search.chunk_text()
+search.load_embedding_model("paraphrase-MiniLM-L3-v2")
+search.embed_text(batch_size=100)
+```
+
+
+
+### FAISS Index Creation
+Parameters - `embeddings`
+
+Creates - `index.faiss`
+
+There are a few different functions to create FAISS indexes adopted from the `file_processing.faiss_index` library with the same functionality. The only difference is that the FAISS index is automatically saved to the folder. If the embeddings are not specified, it will check whether `embeddings.npy` is contained in the folder and use that file.
+
+```python
+from faiss_search import SearchDirectory
+search = SearchDirectory("path/to/folder")
+search.report_from_directory("text/documents/directory/path")
+search.chunk_text()
+search.load_embedding_model("paraphrase-MiniLM-L3-v2")
+search.embed_text(batch_size=100)
+search.create_ivf_flat_index(nlist=16)
+```
+
+
+
+### Search Function
+Parameters - `query`, `k`, `args`
+
+The `search()` function takes in a query and returns the `k` closest matching chunks and corresponding file paths. The function also can take in arguments that can specify the FAISS index search hyperparameters.
+
+```python
+from faiss_search import SearchDirectory
+search = SearchDirectory("path/to/folder")
+search.report_from_directory("text/documents/directory/path")
+search.chunk_text()
+search.load_embedding_model("paraphrase-MiniLM-L3-v2")
+search.embed_text(batch_size=100)
+search.create_ivf_flat_index(nlist=16)
+search.search("What is the meaning of life, the universe, and everything?", k=3, nprobe=2)
+```
+
+
+
+## Dataset Variability
+
+The Dataset Variability import is designed to compute statistics about how similar the contents of a dataset are to one another based on the embeddings. It takes in a numpy array of embeddings as an input along with an option to normalize the embeddings and has two functions that can be called to compute the similarity.
+
+```python
+from faiss_search import DatasetVariability
+embeddings = np.load("path/to/embeddings.npy")
+variability = DatasetVariability(embeddings, normalize=True)
+print(f"mean cosine similarity: {variability.cosine_similarity_avg()}")
+print(f"mean squared L2 distance to centroid: {variability.variance()}")
+```
+
+
+
+## FAISS Indexes
+
+The FAISS index functionality is utilized by `SearchDirectory` but can also be called on its own if working directly with these indexes.
+
+The `faiss_index` import offers a collection of functions that make it easy to interface with FAISS indexes. A `faiss_index` object can be created by either:
+
+* calling one of the create index methods such as `faiss_index.create_flat_index(embeddings)` or `faiss_index.create_IVF_flat_index(embeddings, nlist=16)`.
+* loading an index from a `.faiss` file using `faiss_index.load_index("path/to/file.faiss")`.
+
+Once an index is created it can be queried. This involves providing a query vector as an input and the index will return the nearest `k` vectors contained in the index (as found by that algorithm). Consider the example below to view the functionality:
+
+```python
+import faiss_search
+index = faiss_search.faiss_index.load_index("path/to/file.faiss")
+nearest_three_vectors = index.query(query_vector, k=3)
+```
+
+For large numbers of documents, creating the index can take a while so it is often a good idea to save the file to be loaded in for future use. This can be done by specifying the file path when creating the index or by calling `save_index()`.
+
+```python
+# save the index when creating it
+index = faiss_search.faiss_index.create_flat_index(embeddings, "path/to/save.faiss")
+# save the index after creating it
+index = faiss_search.faiss_index.create_flat_index(embeddings)
+index.save_index("path/to/save.faiss")
+```
+
+The ability to create indexes is limited to a select number of common indexes. More complex indexes can still be loaded and queried like the other indexes, but they do not come with the ability to adjust hyperparameters during the query.
+
+
+
+# Limitations and Future Work
+
+* Currently the embedding models can only be loaded from `.gguf` files if used in a dockerized container. This limits the use of this functionality in a packaged environment.
+* Continuous integration for unit testing is to be added in the development pipeline.
+* This library currently relies on `file-processing-tools` as an upstream requirement. Changes to that library could impact features in `SearchDirectory`.
+* A generalized `load_embedding_model()` function should be created that allows users to pass in custom functions to perform embeddings to extend the functionality beyond just `sentence-transformers` and `.gguf` files. This would make using API embeddings possible from this library.
+* Potentially include a text cleaning process before the chunking step to remove noisy text data.
+* Create a GitHub Pages site for the documentation.
diff --git a/dist/faiss_search-0.0.0-py3-none-any.whl b/dist/faiss_search-0.0.0-py3-none-any.whl
new file mode 100644
index 0000000..aa4acf0
Binary files /dev/null and b/dist/faiss_search-0.0.0-py3-none-any.whl differ
diff --git a/dist/faiss_search-0.0.0.tar.gz b/dist/faiss_search-0.0.0.tar.gz
new file mode 100644
index 0000000..73e3e1a
Binary files /dev/null and b/dist/faiss_search-0.0.0.tar.gz differ
diff --git a/faiss_search/__init__.py b/faiss_search/__init__.py
index bedff82..e6e5255 100644
--- a/faiss_search/__init__.py
+++ b/faiss_search/__init__.py
@@ -1,3 +1,4 @@
-from .search_directory import SearchDirectory
-
-__all__ = ['SearchDirectory']
\ No newline at end of file
+from .search_directory import SearchDirectory
+from .dataset_variability import DatasetVariability
+
+__all__ = ['SearchDirectory', 'DatasetVariability']
\ No newline at end of file
diff --git a/faiss_search/dataset_variability.py b/faiss_search/dataset_variability.py
new file mode 100644
index 0000000..57b22dc
--- /dev/null
+++ b/faiss_search/dataset_variability.py
@@ -0,0 +1,42 @@
+import numpy as np
+
+class DatasetVariability:
+    def __init__(self, embedding: np.ndarray, normalize: bool = True) -> None:
+        """
+        Store the embeddings, optionally scaling each row to unit L2 norm.
+
+        :param embedding: A 2D numpy array where each row is a vector representing an embedding.
+        :param normalize: A boolean indicating whether to normalize the embeddings. Defaults to True.
+        """
+        if normalize:
+            self.embedding = self.normalize(embedding)
+        else:
+            self.embedding = embedding
+
+    def normalize(self, embedding: np.ndarray) -> np.ndarray:
+        """
+        Normalizes the embeddings to have unit norm along the rows.
+
+        :param embedding: A 2D numpy array where each row is a vector representing an embedding.
+        :return: A 2D numpy array with normalized rows; all-zero rows stay zero instead of becoming NaN.
+        """
+        norms = np.linalg.norm(embedding, axis=1, keepdims=True)
+        return embedding / np.where(norms == 0, 1.0, norms)
+
+    def variance(self) -> float:
+        """
+        Computes the sum of the variance of the embeddings across features.
+        Also equivalent to the mean squared L2 distance of the embeddings from their mean vector.
+
+        :return: A float representing the sum of variances across features.
+        """
+        return np.sum(np.var(self.embedding, axis=0))
+
+    def cosine_similarity_avg(self) -> float:
+        """
+        Calculates the mean dot product between each embedding and the mean embedding.
+        For unit-norm rows this equals the average pairwise cosine similarity (self-pairs included).
+
+        :return: A float representing the average cosine similarity of the embeddings.
+        """
+        mean_vec = np.mean(self.embedding, axis=0)
+        return np.mean(np.dot(self.embedding, mean_vec))
diff --git a/faiss_search/faiss_index/HNSW_index.py b/faiss_search/faiss_index/HNSW_index.py
index f76c1df..97e5c47 100644
--- a/faiss_search/faiss_index/HNSW_index.py
+++ b/faiss_search/faiss_index/HNSW_index.py
@@ -1,35 +1,35 @@
-import faiss
-import numpy as np
-from faiss_search.faiss_index.faiss_strategy import FAISSStrategy
-
-
-class HNSWIndex(FAISSStrategy):
- def _create_index(self, embeddings: np.ndarray, M: int,
- efConstruction: int, metric: int):
- if M is None:
- M = 64
- if efConstruction is None:
- efConstruction = 64
- if not isinstance(M, int):
- raise TypeError("M must be an int type")
- if not isinstance(efConstruction, int):
- raise TypeError("efConstruction must be an int type")
- if M < 1:
- raise ValueError(
- "M cannot be less than 1")
- if efConstruction < 1:
- raise ValueError(
- "efConstruction cannot be less than 1")
- dimension = embeddings.shape[1]
- index = faiss.IndexHNSWFlat(dimension, M, metric)
- index.hnsw.efConstruction = efConstruction
- index.add(embeddings)
- return index
-
- def query(self, xq: np.ndarray, k: int = 1, efSearch: int = None):
- if efSearch is not None:
- if efSearch < 1:
- raise ValueError(
- "efSearch cannot be less than 1")
- self.index.hnsw.efSearch = efSearch
+import faiss
+import numpy as np
+from faiss_search.faiss_index.faiss_strategy import FAISSStrategy
+
+
+class HNSWIndex(FAISSStrategy):  # strategy that builds and queries a faiss.IndexHNSWFlat
+    def _create_index(self, embeddings: np.ndarray, M: int,
+                      efConstruction: int, metric: int):
+        if M is None:  # None means "use the default of 64"
+            M = 64
+        if efConstruction is None:  # None means "use the default of 64"
+            efConstruction = 64
+        if not isinstance(M, int):
+            raise TypeError("M must be an int type")
+        if not isinstance(efConstruction, int):
+            raise TypeError("efConstruction must be an int type")
+        if M < 1:
+            raise ValueError(
+                "M cannot be less than 1")
+        if efConstruction < 1:
+            raise ValueError(
+                "efConstruction cannot be less than 1")
+        dimension = embeddings.shape[1]  # second axis of the embeddings array is the vector dimension
+        index = faiss.IndexHNSWFlat(dimension, M, metric)
+        index.hnsw.efConstruction = efConstruction  # set before the vectors are added below
+        index.add(embeddings)
+        return index
+
+    def query(self, xq: np.ndarray, k: int = 1, efSearch: int = None):
+        if efSearch is not None:  # when given, the value is stored on self.index and is not reset afterwards
+            if efSearch < 1:
+                raise ValueError(
+                    "efSearch cannot be less than 1")
+            self.index.hnsw.efSearch = efSearch
         return super().query(xq, k)
\ No newline at end of file
diff --git a/faiss_search/faiss_index/IVF_flat_index.py b/faiss_search/faiss_index/IVF_flat_index.py
index 998cce1..88c06ef 100644
--- a/faiss_search/faiss_index/IVF_flat_index.py
+++ b/faiss_search/faiss_index/IVF_flat_index.py
@@ -1,30 +1,30 @@
-import faiss
-import numpy as np
-from faiss_search.faiss_index.faiss_strategy import FAISSStrategy
-
-
-class IVFFlatIndex(FAISSStrategy):
- def _create_index(self, embeddings: np.ndarray, nlist: int, metric: int):
- dimension = embeddings.shape[1]
- if nlist is None:
- nlist = max(1, int(np.sqrt(embeddings.shape[0] / 2)))
- if not isinstance(nlist, int):
- raise TypeError("nlist must be an int type")
- if nlist < 1:
- raise ValueError("nlist cannot be less than 1")
- if nlist > embeddings.shape[0]:
- raise ValueError(
- f"nlist value of {nlist} is larger than the number of documents in the index")
- quantizer = faiss.IndexFlat(dimension, metric)
- index = faiss.IndexIVFFlat(quantizer, dimension, nlist, metric)
- index.train(embeddings)
- index.add(embeddings)
- return index
-
- def query(self, xq: np.ndarray, k: int = 1, nprobe: int = None):
- if nprobe is not None:
- if nprobe not in range(1, self.index.nlist + 1):
- raise ValueError(
- f"nprobe must be between 1 and {self.index.nlist}")
- self.index.nprobe = nprobe
+import faiss
+import numpy as np
+from faiss_search.faiss_index.faiss_strategy import FAISSStrategy
+
+
+class IVFFlatIndex(FAISSStrategy):  # strategy that builds and queries a faiss.IndexIVFFlat
+    def _create_index(self, embeddings: np.ndarray, nlist: int, metric: int):
+        dimension = embeddings.shape[1]  # second axis of the embeddings array is the vector dimension
+        if nlist is None:  # default: sqrt(number of rows / 2), but never below 1
+            nlist = max(1, int(np.sqrt(embeddings.shape[0] / 2)))
+        if not isinstance(nlist, int):
+            raise TypeError("nlist must be an int type")
+        if nlist < 1:
+            raise ValueError("nlist cannot be less than 1")
+        if nlist > embeddings.shape[0]:  # reject an nlist larger than the number of embedding rows
+            raise ValueError(
+                f"nlist value of {nlist} is larger than the number of documents in the index")
+        quantizer = faiss.IndexFlat(dimension, metric)  # flat index handed to IndexIVFFlat as its quantizer
+        index = faiss.IndexIVFFlat(quantizer, dimension, nlist, metric)
+        index.train(embeddings)  # trained on the same embeddings that are added next
+        index.add(embeddings)
+        return index
+
+    def query(self, xq: np.ndarray, k: int = 1, nprobe: int = None):
+        if nprobe is not None:  # when given, the value is stored on self.index and is not reset afterwards
+            if nprobe not in range(1, self.index.nlist + 1):  # must equal an integer in [1, nlist]
+                raise ValueError(
+                    f"nprobe must be between 1 and {self.index.nlist}")
+            self.index.nprobe = nprobe
         return super().query(xq, k)
\ No newline at end of file
diff --git a/faiss_search/faiss_index/__pycache__/__init__.cpython-310.pyc b/faiss_search/faiss_index/__pycache__/__init__.cpython-310.pyc
deleted file mode 100644
index 24f5d0f..0000000
Binary files a/faiss_search/faiss_index/__pycache__/__init__.cpython-310.pyc and /dev/null differ
diff --git a/faiss_search/faiss_index/__pycache__/faiss_strategy.cpython-310.pyc b/faiss_search/faiss_index/__pycache__/faiss_strategy.cpython-310.pyc
deleted file mode 100644
index 67e296d..0000000
Binary files a/faiss_search/faiss_index/__pycache__/faiss_strategy.cpython-310.pyc and /dev/null differ
diff --git a/faiss_search/faiss_index/__pycache__/flat_index.cpython-310.pyc b/faiss_search/faiss_index/__pycache__/flat_index.cpython-310.pyc
deleted file mode 100644
index 576f362..0000000
Binary files a/faiss_search/faiss_index/__pycache__/flat_index.cpython-310.pyc and /dev/null differ
diff --git a/faiss_search/faiss_index/__pycache__/index_creator.cpython-310.pyc b/faiss_search/faiss_index/__pycache__/index_creator.cpython-310.pyc
deleted file mode 100644
index 49664a7..0000000
Binary files a/faiss_search/faiss_index/__pycache__/index_creator.cpython-310.pyc and /dev/null differ
diff --git a/faiss_search/faiss_index/faiss_strategy.py b/faiss_search/faiss_index/faiss_strategy.py
index d9805db..db8e9dd 100644
--- a/faiss_search/faiss_index/faiss_strategy.py
+++ b/faiss_search/faiss_index/faiss_strategy.py
@@ -1,33 +1,33 @@
-import faiss
-import numpy as np
-from abc import ABC, abstractmethod
-
-
-class FAISSStrategy(ABC):
- METRICS = {
- "L2": 1,
- "IP": 0,
- }
-
- def __init__(self, *args, metric: str=None, index=None):
- if index is not None:
- self.index = index
- else:
- try:
- metric_id = self.METRICS[metric]
- except KeyError:
- metric_id = 1
- self.index = self._create_index(*args, metric_id)
-
- def save_index(self, output_path: str):
- faiss.write_index(self.index, output_path)
-
- @abstractmethod
- def _create_index(self):
- pass
-
- @abstractmethod
- def query(self, xq: np.ndarray, k: int):
- if k < 1:
- raise ValueError("k cannot be less than 1")
+import faiss
+import numpy as np
+from abc import ABC, abstractmethod
+
+
+class FAISSStrategy(ABC):  # abstract base for the index wrappers; the wrapped index lives in self.index
+    METRICS = {  # metric name -> integer id handed to _create_index
+        "L2": 1,
+        "IP": 0,
+    }
+
+    def __init__(self, *args, metric: str=None, index=None):  # wrap a given index, or build one from *args
+        if index is not None:
+            self.index = index
+        else:
+            try:
+                metric_id = self.METRICS[metric]
+            except KeyError:  # a missing or unrecognized metric name silently falls back to id 1 ("L2")
+                metric_id = 1
+            self.index = self._create_index(*args, metric_id)  # metric id is passed as the last positional argument
+
+    def save_index(self, output_path: str):  # write the wrapped index to output_path via faiss.write_index
+        faiss.write_index(self.index, output_path)
+
+    @abstractmethod
+    def _create_index(self):  # subclasses override this with their own construction parameters
+        pass
+
+    @abstractmethod  # abstract, yet carries a shared body that subclasses reach through super().query(...)
+    def query(self, xq: np.ndarray, k: int):
+        if k < 1:
+            raise ValueError("k cannot be less than 1")
         return self.index.search(xq, k)
\ No newline at end of file
diff --git a/faiss_search/faiss_index/flat_index.py b/faiss_search/faiss_index/flat_index.py
index 94d6a1d..5cb96cb 100644
--- a/faiss_search/faiss_index/flat_index.py
+++ b/faiss_search/faiss_index/flat_index.py
@@ -1,14 +1,14 @@
-import faiss
-import numpy as np
-from faiss_search.faiss_index.faiss_strategy import FAISSStrategy
-
-
-class FlatIndex(FAISSStrategy):
- def _create_index(self, embeddings: np.ndarray, metric: int):
- dimension = embeddings.shape[1]
- index = faiss.IndexFlat(dimension, metric)
- index.add(embeddings)
- return index
-
- def query(self, xq: np.ndarray, k: int = 1):
+import faiss
+import numpy as np
+from faiss_search.faiss_index.faiss_strategy import FAISSStrategy
+
+
+class FlatIndex(FAISSStrategy):  # strategy that builds and queries a faiss.IndexFlat
+    def _create_index(self, embeddings: np.ndarray, metric: int):
+        dimension = embeddings.shape[1]  # second axis of the embeddings array is the vector dimension
+        index = faiss.IndexFlat(dimension, metric)
+        index.add(embeddings)  # vectors are added directly; this strategy makes no train() call
+        return index
+
+    def query(self, xq: np.ndarray, k: int = 1):  # no extra search parameters; defers to the base query
         return super().query(xq, k)
\ No newline at end of file
diff --git a/faiss_search/faiss_index/general_index.py b/faiss_search/faiss_index/general_index.py
index 618f678..be04bb9 100644
--- a/faiss_search/faiss_index/general_index.py
+++ b/faiss_search/faiss_index/general_index.py
@@ -1,10 +1,10 @@
-import numpy as np
-from faiss_search.faiss_index.faiss_strategy import FAISSStrategy
-
-
-class GeneralIndex(FAISSStrategy):
- def _create_index(self):
- raise NotImplementedError()
-
- def query(self, xq: np.ndarray, k: int = 1):
+import numpy as np
+from faiss_search.faiss_index.faiss_strategy import FAISSStrategy
+
+
+class GeneralIndex(FAISSStrategy):  # wrapper for an already-built index; it cannot create one itself
+    def _create_index(self):  # construction from embeddings is unsupported for this wrapper
+        raise NotImplementedError()
+
+    def query(self, xq: np.ndarray, k: int = 1):  # forwards both arguments to the shared base query
-        return super().query()
\ No newline at end of file
+        return super().query(xq, k)
\ No newline at end of file
diff --git a/faiss_search/faiss_index/index_creator.py b/faiss_search/faiss_index/index_creator.py
index fac2771..e56bc49 100644
--- a/faiss_search/faiss_index/index_creator.py
+++ b/faiss_search/faiss_index/index_creator.py
@@ -1,89 +1,89 @@
-import faiss
-import numpy as np
-from faiss_search.faiss_index import flat_index
-from faiss_search.faiss_index import IVF_flat_index
-from faiss_search.faiss_index import HNSW_index
-from faiss_search.faiss_index import general_index
-
-
-def load_index(file_path: str):
- """
- Load a FAISS index from a specified file path.
-
- Args:
- file_path (str): The path to the file containing the saved FAISS index.
-
- Returns:
- An instance of the corresponding index class.
- """
- INDEXES = {
- faiss.IndexFlat: flat_index.FlatIndex,
- faiss.IndexIVFFlat: IVF_flat_index.IVFFlatIndex,
- faiss.IndexHNSWFlat: HNSW_index.HNSWIndex
- }
- index = faiss.read_index(file_path)
- if type(index) in INDEXES:
- index_class = INDEXES.get(type(index))
- else:
- index_class = general_index.GeneralIndex
- return index_class(index=index)
-
-
-def create_flat_index(embeddings: np.ndarray, file_path: str = None, metric: str = "L2"):
- """
- Create a Flat Index using the provided embeddings.
-
- Args:
- embeddings (np.ndarray): An array of embeddings to index.
- file_path (str, optional): The path to save the index. If None, the index is not saved.
- metric (str, optional): The metric used to compute distances. L2 and IP currently supported. Default is L2.
-
- Returns:
- FlatIndex: An instance of the FlatIndex containing the embeddings.
- """
- index = flat_index.FlatIndex(embeddings, metric=metric)
- if file_path is not None:
- index.save_index(file_path)
- return index
-
-
-def create_IVF_flat_index(embeddings: np.ndarray, nlist: int = None,
- file_path: str = None, metric: str = "L2"):
- """
- Create an IVF Flat Index using the provided embeddings and parameters.
-
- Args:
- embeddings (np.ndarray): An array of embeddings to index.
- nlist (int, optional): Number of clusters for the IVF index. If None, a default will be used.
- file_path (str, optional): The path to save the index. If None, the index is not saved.
- metric (str, optional): The metric used to compute distances. L2 and IP currently supported. Default is L2.
-
- Returns:
- IVFFlatIndex: An instance of the IVFFlatIndex containing the embeddings.
- """
- index = IVF_flat_index.IVFFlatIndex(embeddings, nlist, metric=metric)
- if file_path is not None:
- index.save_index(file_path)
- return index
-
-
-def create_HNSW_index(embeddings: np.ndarray, M: int = 64,
- efConstruction: int = 64, file_path: str = None,
- metric: str = "L2"):
- """
- Create an HNSW Index using the provided embeddings and parameters.
-
- Args:
- embeddings (np.ndarray): An array of embeddings to index.
- M (int, optional): Number of bi-directional links created for each new element. Default is 64.
- efConstruction (int, optional): Size of the dynamic list for the nearest neighbors during construction. Default is 64.
- file_path (str, optional): The path to save the index. If None, the index is not saved.
- metric (str, optional): The metric used to compute distances. L2 and IP currently supported. Default is L2.
-
- Returns:
- HNSWIndex: An instance of the HNSWIndex containing the embeddings.
- """
- index = HNSW_index.HNSWIndex(embeddings, M, efConstruction, metric=metric)
- if file_path is not None:
- index.save_index(file_path)
+import faiss
+import numpy as np
+from faiss_search.faiss_index import flat_index
+from faiss_search.faiss_index import IVF_flat_index
+from faiss_search.faiss_index import HNSW_index
+from faiss_search.faiss_index import general_index
+
+
+def load_index(file_path: str):
+    """
+    Load a FAISS index from a specified file path.
+
+    Args:
+        file_path (str): The path to the file containing the saved FAISS index.
+
+    Returns:
+        An instance of the matching index wrapper class, or GeneralIndex when no known FAISS type matches.
+    """
+    INDEXES = {
+        faiss.IndexFlat: flat_index.FlatIndex,
+        faiss.IndexIVFFlat: IVF_flat_index.IVFFlatIndex,
+        faiss.IndexHNSWFlat: HNSW_index.HNSWIndex
+    }
+    index = faiss.read_index(file_path)
+    # Match with isinstance: read_index may hand back a subclass (e.g. IndexFlatL2) rather than the exact base type.
+    for faiss_class, index_class in INDEXES.items():
+        if isinstance(index, faiss_class):
+            return index_class(index=index)
+    return general_index.GeneralIndex(index=index)
+
+
+def create_flat_index(embeddings: np.ndarray, file_path: str = None, metric: str = "L2"):
+ """
+ Create a Flat Index using the provided embeddings.
+
+ Args:
+ embeddings (np.ndarray): An array of embeddings to index.
+ file_path (str, optional): The path to save the index. If None, the index is not saved.
+ metric (str, optional): The metric used to compute distances. L2 and IP currently supported. Default is L2.
+
+ Returns:
+ FlatIndex: An instance of the FlatIndex containing the embeddings.
+ """
+ index = flat_index.FlatIndex(embeddings, metric=metric)
+ if file_path is not None:
+ index.save_index(file_path)
+ return index
+
+
+def create_IVF_flat_index(embeddings: np.ndarray, nlist: int = None,
+ file_path: str = None, metric: str = "L2"):
+ """
+ Create an IVF Flat Index using the provided embeddings and parameters.
+
+ Args:
+ embeddings (np.ndarray): An array of embeddings to index.
+ nlist (int, optional): Number of clusters for the IVF index. If None, a default will be used.
+ file_path (str, optional): The path to save the index. If None, the index is not saved.
+ metric (str, optional): The metric used to compute distances. L2 and IP currently supported. Default is L2.
+
+ Returns:
+ IVFFlatIndex: An instance of the IVFFlatIndex containing the embeddings.
+ """
+ index = IVF_flat_index.IVFFlatIndex(embeddings, nlist, metric=metric)
+ if file_path is not None:
+ index.save_index(file_path)
+ return index
+
+
+def create_HNSW_index(embeddings: np.ndarray, M: int = 64,
+ efConstruction: int = 64, file_path: str = None,
+ metric: str = "L2"):
+ """
+ Create an HNSW Index using the provided embeddings and parameters.
+
+ Args:
+ embeddings (np.ndarray): An array of embeddings to index.
+ M (int, optional): Number of bi-directional links created for each new element. Default is 64.
+ efConstruction (int, optional): Size of the dynamic list for the nearest neighbors during construction. Default is 64.
+ file_path (str, optional): The path to save the index. If None, the index is not saved.
+ metric (str, optional): The metric used to compute distances. L2 and IP currently supported. Default is L2.
+
+ Returns:
+ HNSWIndex: An instance of the HNSWIndex containing the embeddings.
+ """
+ index = HNSW_index.HNSWIndex(embeddings, M, efConstruction, metric=metric)
+ if file_path is not None:
+ index.save_index(file_path)
return index
\ No newline at end of file
diff --git a/faiss_search/search_directory.py b/faiss_search/search_directory.py
index 5cf0ad8..808d194 100644
--- a/faiss_search/search_directory.py
+++ b/faiss_search/search_directory.py
@@ -1,301 +1,389 @@
-import os
-import re
-import json
-import numpy as np
-import pandas as pd
-from typing import List
-from tqdm import tqdm
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from file_processing import Directory
-from .tools.errors import FileTypeError
-from .tools.errors import EncodingModelError
-from sentence_transformers import SentenceTransformer
-
-class SearchDirectory:
- def __init__(self, folder_path: str) -> None:
- """
- Initializes the SearchDirectory object with paths to data and model files.
-
- :param folder_path: Path to the folder containing data and setup files.
- """
- self.folder_path = folder_path
- # get chunking file path
- if os.path.exists(os.path.join(self.folder_path, "data_chunked.csv")):
- self.chunks_path = os.path.join(self.folder_path, "data_chunked.csv")
- else:
- self.chunks_path = None
- # get json data
- if os.path.exists(os.path.join(self.folder_path, "setup_data.json")):
- with open(os.path.join(self.folder_path, "setup_data.json"), 'r') as f:
- setup_data = json.load(f)
- self.encoding_name = setup_data['encoding_model']
- self.n_chunks = setup_data['number_of_chunks']
- else:
- self.n_chunks = None
- self.encoding_name = None
- # load the encoding model
- if self.encoding_name is not None:
- self.load_embedding_model(self.encoding_name)
- else:
- self.encoder = None
-
- def _get_text_chunks(self, text: str, chunk_size: int, chunk_overlap: int) -> List[str]:
- """
- Splits the input text into smaller chunks with specified size and overlap.
-
- :param text: The text to be split into chunks.
- :param chunk_size: Number of characters in each chunk.
- :param chunk_overlap: Number of overlapping characters between chunks.
-
- :return: A list of text chunks.
- """
- chunks = []
- splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
- for chunk in splitter.split_text(text):
- chunks.append(chunk)
- return chunks
-
- def _save_to_json(self) -> None:
- """
- Saves the encoding model name and number of chunks to a JSON file in the folder.
- """
- setup_data = {
- 'encoding_model': self.encoding_name,
- 'number_of_chunks': self.n_chunks
- }
- with open(os.path.join(self.folder_path, "setup_data.json"), 'w') as f:
- json.dump(setup_data, f, indent=4)
-
- def _embed_string(self, text: str) -> np.ndarray:
- """
- Converts a text string into a vector representation using the encoding model.
-
- :param text: The text to be embedded.
-
- :return: The embedding vector of the input text.
- """
- embedding = self.encoder.encode(text)
- return embedding
-
- def _combine_embeddings(self) -> None:
- """
- Combines individual embedding files into a single numpy array and saves it as 'embeddings.npy'.
- """
- batch_path = os.path.join(self.folder_path, "embedding_batches")
- pattern = r"\((\d+)-(\d+)\)"
-
- file_ranges = []
-
- for filename in os.listdir(batch_path):
- match = re.search(pattern, filename)
- file_ranges.append((filename, int(match.group(1)), int(match.group(2))))
-
- file_ranges.sort(key=lambda x: x[1])
-
- if file_ranges[0][1] == 0:
- start = 0
- for filename, batch_start, batch_end in file_ranges:
- emb = np.load(os.path.join(batch_path, filename))
- if batch_start == 0:
- emb_full = emb
- else:
- if start >= batch_start:
- emb_full = np.vstack((emb_full, emb[start - batch_start:]))
- start = batch_end
- if emb_full.shape[0] == self.n_chunks:
- np.save(os.path.join(self.folder_path, "embeddings.npy"), emb_full)
- print("Embeddings combined and saved to embeddings.npy")
- else:
- print("Embeddings not yet combined. The remainder of the embeddings left must be completed before they can be combined.")
-
- def _check_for_embeddings(self, embeddings: np.ndarray) -> np.ndarray:
- """
- Checks for the provided embeddings or loads them from a file if not provided.
-
- :param embeddings: Pre-existing embeddings to check.
-
- :return: The embeddings, either provided or loaded from file.
-
- :raises FileNotFoundError: If embeddings are not found and no file exists.
- """
- if embeddings is None:
- if os.path.exists(os.path.join(self.folder_path, "embeddings.npy")):
- embeddings = np.load(os.path.join(self.folder_path, "embeddings.npy"))
- else:
- raise FileNotFoundError("No embeddings found.")
- return embeddings
-
- def report_from_directory(self, directory_path: str) -> None:
- """
- Generates a report from the specified directory and saves it as 'report.csv'.
-
- :param directory_path: Path to the directory to generate the report from.
- """
- directory = Directory(directory_path)
- directory.generate_report(
- report_file = os.path.join(self.folder_path,"report.csv"),
- split_metadata=True,
- include_text=True,
- )
-
- def chunk_text(self,
- input_file_path: str = None,
- document_path_column: str = "File Path",
- document_text_column: str = "Text",
- chunk_size: int = 1024,
- chunk_overlap: int = 10) -> None:
- """
- Chunks the text data from a CSV file into smaller pieces and saves the result to 'data_chunked.csv'.
-
- :param input_file_path: Path to the CSV file containing text to chunk. If None, uses 'report.csv' in the folder.
- :param document_path_column: Column name for file paths in the CSV.
- :param document_text_column: Column name for text content in the CSV.
- :param chunk_size: Number of characters in each chunk.
- :param chunk_overlap: Number of overlapping characters between chunks.
-
- :raises FileNotFoundError: If no input file is specified and no report exists.
- :raises FileTypeError: If the input file is not a CSV.
- :raises KeyError: If specified columns are not found in the CSV.
- """
-
- # check if there is a report
- if input_file_path is None:
- if os.path.exists(os.path.join(self.folder_path, "report.csv")):
- input_file_path = os.path.join(self.folder_path, "report.csv")
- else:
- raise FileNotFoundError("No input file specified and no report provided. \
- Please provide a file path to a .csv or run 'report_from_directory'.")
-
- # load into a dataframe
- if input_file_path.lower().endswith('.csv'):
- df = pd.read_csv(input_file_path)
- else:
- raise FileTypeError(f"File path {input_file_path} is not a .csv file.")
-
- # check if the column names are valid
- if document_path_column not in df.columns:
- raise KeyError(f"'{document_path_column}' is not a column in {input_file_path}.")
- elif document_text_column not in df.columns:
- raise KeyError(f"'{document_text_column}' is not a column in {input_file_path}.")
-
- # Initialize an empty list to collect all rows
- all_new_rows = []
-
- # Get the total number of rows
- total_rows = len(df)
- print(f"Total rows (excluding header): {total_rows}")
-
- # Process each row with tqdm to show progress
- for index, row in tqdm(df.iterrows(), total=total_rows, desc="Processing rows"):
- file_path = row[document_path_column]
- content = row[document_text_column]
-
- # Get chunks for the current content
- chunks = self._get_text_chunks(content, chunk_size, chunk_overlap)
-
- # Create new rows for each chunk
- for chunk_text in chunks:
- new_row = {
- 'file_path': file_path,
- 'content': chunk_text
- }
- all_new_rows.append(new_row)
-
- # Create a new DataFrame from the collected new rows
- chunked_df = pd.DataFrame(all_new_rows)
-
- # Save the new DataFrame to a new CSV file
- chunked_df.to_csv(os.path.join(self.folder_path, 'data_chunked.csv'), index=False)
- self.chunks_path = os.path.join(self.folder_path, 'data_chunked.csv')
- self.n_chunks = len(chunked_df)
- self._save_to_json()
-
- print("Chunking complete and saved to 'data_chunked.csv'.")
-
- def load_embedding_model(self, model_name: str = "paraphrase-MiniLM-L3-v2") -> None:
- """
- Loads the specified embedding model and saves the model name to JSON.
-
- :param model_name: Name of the embedding model to load.
- """
- self.encoding_name = model_name
- self.encoder = SentenceTransformer(model_name)
- self._save_to_json()
-
- def embed_text(self, row_start: int = 0, row_end: int = None, batch_size: int = 1000) -> None:
- """
- Embeds text chunks from the 'data_chunked.csv' file into vectors and saves them in batches.
- If all batches are complete then it combines the batches and saves the embeddings to 'embeddings.npy'.
-
- :param row_start: Starting index of rows to process.
- :param row_end: Ending index of rows to process. If None, processes till the end.
- :param batch_size: Number of rows to process in each batch.
- """
- if self.chunks_path is None:
- raise FileNotFoundError(f"Error: data_chunked.csv not located in {self.folder_path}")
- if self.encoder is None:
- raise EncodingModelError("Error: no encoding model found. Run 'load_embedding_model' first.")
- else:
- os.makedirs(os.path.join(self.folder_path, "embedding_batches"), exist_ok=True)
- chunked_df = pd.read_csv(self.chunks_path)
- n_chunks = len(chunked_df)
-
- if (row_end is None) or (row_end > n_chunks):
- row_end = n_chunks
-
- # handle index error values and negative indexes
- if row_start < -n_chunks - 1:
- raise IndexError(f"Row start {row_start} is out of bounds for {n_chunks} chunks.")
- elif row_start < 0:
- row_start = n_chunks + row_start + 1
- if row_end < -n_chunks -1:
- raise IndexError(f"Row end {row_end} is out of bounds for {n_chunks} chunks.")
- elif row_end < 0:
- row_end = n_chunks + row_end +1
- if row_start >= n_chunks:
- raise IndexError(f"Start index of {row_start} is out of bounds for {n_chunks} chunks")
- if row_end <= row_start:
- raise ValueError(f"Row end ({row_end}) cannot be less than the row start ({row_start}).")
-
- batch_path = os.path.join(self.folder_path, "embedding_batches")
- pattern = r"\((\d+)-(\d+)\)"
-
- contained_ranges = []
-
- for filename in os.listdir(batch_path):
- match = re.search(pattern, filename)
- batch_start = int(match.group(1))
- batch_end = int(match.group(2))
- if (batch_start < row_end) and (batch_end > row_start):
- if batch_start < row_start:
- batch_start = row_start
- if batch_end > row_end:
- batch_end = row_end
- contained_ranges.append((batch_start, batch_end))
-
- contained_ranges.sort(key=lambda x: x[1])
-
- segments = []
- for batch_start, batch_end in contained_ranges:
- if row_start < row_end:
- if (batch_start > row_start):
- segments.append((row_start, batch_start))
- row_start = batch_end
- if row_start < row_end:
- segments.append((row_start, row_end))
-
- for start, end in segments:
- current_row = start
-
- while current_row < end:
- df = chunked_df[current_row:min(end, current_row + batch_size)]
-
- tqdm.pandas()
- embeddings = np.array(df['content'].progress_apply(self._embed_string).to_list())
-
- # Save the new DataFrame to a new CSV file
- np.save(os.path.join(self.folder_path, f"embedding_batches/embeddings ({current_row}-{min(end, current_row + batch_size)}).npy"), embeddings)
- print(f"Embedding batch complete and saved to embeddings ({current_row}-{min(end, current_row + batch_size)}).npy').")
- current_row += batch_size
-
- self._combine_embeddings()
+import os
+import re
+import json
+import numpy as np
+import pandas as pd
+from typing import List
+from tqdm import tqdm
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from file_processing import Directory
+from .tools.errors import FileTypeError
+from .tools.errors import EncodingModelError
+from faiss_search import faiss_index
+from sentence_transformers import SentenceTransformer
+
+# import docker specific requirement
+try:
+ import llama_cpp
+ llama_cpp_available = True
+except ImportError:
+ llama_cpp_available = False
+
+class SearchDirectory:
+    def __init__(self, folder_path: str) -> None:
+        """
+        Initializes the SearchDirectory object from the chunk file, setup JSON, FAISS index and encoding model found in the folder.
+
+        :param folder_path: Path to the folder containing data and setup files.
+        """
+        self.folder_path = folder_path
+        # get chunking file path
+        if os.path.exists(os.path.join(self.folder_path, "data_chunked.csv")):
+            self.chunks_path = os.path.join(self.folder_path, "data_chunked.csv")
+        else:
+            self.chunks_path = None
+        # get json data
+        if os.path.exists(os.path.join(self.folder_path, "setup_data.json")):
+            with open(os.path.join(self.folder_path, "setup_data.json"), 'r') as f:
+                setup_data = json.load(f)
+                self.encoding_name = setup_data['encoding_model']
+                self.is_gguf = setup_data.get('is_gguf', False)  # key is absent in setup files written before gguf support
+                self.n_chunks = setup_data['number_of_chunks']
+        else:
+            self.n_chunks = None
+            self.encoding_name = None
+            self.is_gguf = None
+        # get the faiss index
+        if os.path.exists(os.path.join(self.folder_path, "index.faiss")):
+            self.index = faiss_index.load_index(os.path.join(self.folder_path, "index.faiss"))
+        else:
+            self.index = None
+        # load the encoding model; default to None so the attribute exists even if loading fails,
+        # and pass the stored gguf flag so a gguf model is not reopened as a sentence-transformers model
+        self.encoder = None
+        if self.encoding_name is not None:
+            self.load_embedding_model(self.encoding_name, gguf=bool(self.is_gguf))
+
+ def _get_text_chunks(self, text: str, chunk_size: int, chunk_overlap: int) -> List[str]:
+ """
+ Splits the input text into smaller chunks with specified size and overlap.
+
+ :param text: The text to be split into chunks.
+ :param chunk_size: Number of characters in each chunk.
+ :param chunk_overlap: Number of overlapping characters between chunks.
+
+ :return: A list of text chunks.
+ """
+ chunks = []
+ splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+ for chunk in splitter.split_text(text):
+ chunks.append(chunk)
+ return chunks
+
+    def _save_to_json(self) -> None:
+        """
+        Saves the encoding model name, the gguf flag and the number of chunks to 'setup_data.json' in the folder.
+        """
+        setup_data = {
+            'encoding_model': self.encoding_name,
+            'is_gguf': self.is_gguf,
+            'number_of_chunks': self.n_chunks
+        }
+        with open(os.path.join(self.folder_path, "setup_data.json"), 'w') as f:
+            json.dump(setup_data, f, indent=4)
+
+ def _embed_string(self, text: str) -> np.ndarray:
+ """
+ Converts a text string into a vector representation using the encoding model.
+
+ :param text: The text to be embedded.
+
+ :return: The embedding vector of the input text.
+ """
+ if self.is_gguf:
+ embedding = self.encoder.create_embedding(text)['data'][0]['embedding']
+ else:
+ embedding = self.encoder.encode(text)
+ return embedding
+
+    def _combine_embeddings(self) -> None:
+        """
+        Combines the batch files in 'embedding_batches' into one numpy array and saves it as 'embeddings.npy' once the combined row count equals n_chunks.
+        """
+        batch_path = os.path.join(self.folder_path, "embedding_batches")
+        pattern = r"\((\d+)-(\d+)\)"
+
+        file_ranges = []
+        for filename in os.listdir(batch_path):
+            match = re.search(pattern, filename)
+            if match is not None:  # skip stray files that do not carry a "(start-end)" range
+                file_ranges.append((filename, int(match.group(1)), int(match.group(2))))
+        file_ranges.sort(key=lambda x: x[1])
+
+        # Stitch batches in row order; `covered` is the first row index not collected yet.
+        parts = []
+        covered = 0
+        for filename, batch_start, batch_end in file_ranges:
+            if batch_start > covered:
+                break  # gap: rows [covered, batch_start) have not been embedded yet
+            if batch_end > covered:
+                emb = np.load(os.path.join(batch_path, filename))
+                parts.append(emb[covered - batch_start:])  # drop rows an earlier batch already supplied
+                covered = batch_end
+        combined = np.vstack(parts) if parts else None
+        if combined is not None and combined.shape[0] == self.n_chunks:
+            np.save(os.path.join(self.folder_path, "embeddings.npy"), combined)
+            print("Embeddings combined and saved to embeddings.npy")
+        else:
+            print("Embeddings not yet combined. The remainder of the embeddings left must be completed before they can be combined.")
+
+ def _check_for_embeddings(self, embeddings: np.ndarray) -> np.ndarray:
+ """
+ Checks for the provided embeddings or loads them from a file if not provided.
+
+ :param embeddings: Pre-existing embeddings to check.
+
+ :return: The embeddings, either provided or loaded from file.
+
+ :raises FileNotFoundError: If embeddings are not found and no file exists.
+ """
+ if embeddings is None:
+ if os.path.exists(os.path.join(self.folder_path, "embeddings.npy")):
+ embeddings = np.load(os.path.join(self.folder_path, "embeddings.npy"))
+ else:
+ raise FileNotFoundError("No embeddings found.")
+ return embeddings
+
+ def report_from_directory(self, directory_path: str) -> None:
+ """
+ Generates a report from the specified directory and saves it as 'report.csv'.
+
+ :param directory_path: Path to the directory to generate the report from.
+ """
+ directory = Directory(directory_path)
+ directory.generate_report(
+ report_file = os.path.join(self.folder_path,"report.csv"),
+ split_metadata=True,
+ include_text=True,
+ )
+
+ def chunk_text(self,
+ input_file_path: str = None,
+ document_path_column: str = "File Path",
+ document_text_column: str = "Text",
+ chunk_size: int = 1024,
+ chunk_overlap: int = 10) -> None:
+ """
+ Chunks the text data from a CSV file into smaller pieces and saves the result to 'data_chunked.csv'.
+
+ :param input_file_path: Path to the CSV file containing text to chunk. If None, uses 'report.csv' in the folder.
+ :param document_path_column: Column name for file paths in the CSV.
+ :param document_text_column: Column name for text content in the CSV.
+ :param chunk_size: Number of characters in each chunk.
+ :param chunk_overlap: Number of overlapping characters between chunks.
+
+ :raises FileNotFoundError: If no input file is specified and no report exists.
+ :raises FileTypeError: If the input file is not a CSV.
+ :raises KeyError: If specified columns are not found in the CSV.
+ """
+
+ # check if there is a report
+ if input_file_path is None:
+ if os.path.exists(os.path.join(self.folder_path, "report.csv")):
+ input_file_path = os.path.join(self.folder_path, "report.csv")
+ else:
+ raise FileNotFoundError("No input file specified and no report provided. \
+ Please provide a file path to a .csv or run 'report_from_directory'.")
+
+ # load into a dataframe
+ if input_file_path.lower().endswith('.csv'):
+ df = pd.read_csv(input_file_path)
+ else:
+ raise FileTypeError(f"File path {input_file_path} is not a .csv file.")
+
+ # check if the column names are valid
+ if document_path_column not in df.columns:
+ raise KeyError(f"'{document_path_column}' is not a column in {input_file_path}.")
+ elif document_text_column not in df.columns:
+ raise KeyError(f"'{document_text_column}' is not a column in {input_file_path}.")
+
+ # Initialize an empty list to collect all rows
+ all_new_rows = []
+
+ # Get the total number of rows
+ total_rows = len(df)
+ print(f"Total rows (excluding header): {total_rows}")
+
+ # Process each row with tqdm to show progress
+ for _, row in tqdm(df.iterrows(), total=total_rows, desc="Processing rows"):
+ file_path = row[document_path_column]
+ content = row[document_text_column]
+
+ # Get chunks for the current content
+ chunks = self._get_text_chunks(content, chunk_size, chunk_overlap)
+
+ # Create new rows for each chunk
+ for chunk_text in chunks:
+ new_row = {
+ 'file_path': file_path,
+ 'content': chunk_text
+ }
+ all_new_rows.append(new_row)
+
+ # Create a new DataFrame from the collected new rows
+ chunked_df = pd.DataFrame(all_new_rows)
+
+ # Save the new DataFrame to a new CSV file
+ chunked_df.to_csv(os.path.join(self.folder_path, 'data_chunked.csv'), index=False)
+ self.chunks_path = os.path.join(self.folder_path, 'data_chunked.csv')
+ self.n_chunks = len(chunked_df)
+ self._save_to_json()
+
+ print("Chunking complete and saved to 'data_chunked.csv'.")
+
+    def load_embedding_model(self, model_name: str = "paraphrase-MiniLM-L3-v2", gguf: bool = False) -> None:
+        """
+        Loads the specified embedding model and saves the model name and gguf flag to JSON.
+
+        :param model_name: Name of the embedding model to load.
+            If gguf is set to true the model name will instead be the path to the gguf file.
+        :param gguf: True if a gguf model is being used; false if a sentence-transformers model is being used.
+        """
+        if gguf and not llama_cpp_available:
+            # Nothing was loaded: keep the recorded model settings and only guarantee that `encoder` exists.
+            self.encoder = getattr(self, "encoder", None)
+            print("Cannot load .gguf file. llama-cpp is not available.")
+            return
+        self.encoding_name, self.is_gguf = model_name, bool(gguf)
+        if self.is_gguf:
+            self.encoder = llama_cpp.Llama(model_path=model_name,
+                                           embedding=True,
+                                           verbose=False)
+        else:
+            self.encoder = SentenceTransformer(model_name)
+        self._save_to_json()
+
+ def embed_text(self, row_start: int = 0, row_end: int = None, batch_size: int = 1000) -> None:
+ """
+ Embeds text chunks from the 'data_chunked.csv' file into vectors and saves them in batches.
+ If all batches are complete then it combines the batches and saves the embeddings to 'embeddings.npy'.
+
+ :param row_start: Starting index of rows to process.
+ :param row_end: Ending index of rows to process. If None, processes till the end.
+ :param batch_size: Number of rows to process in each batch.
+ """
+ if self.chunks_path is None:
+ raise FileNotFoundError(f"Error: data_chunked.csv not located in {self.folder_path}")
+ if self.encoder is None:
+ raise EncodingModelError("Error: no encoding model found. Run 'load_embedding_model' first.")
+ else:
+ os.makedirs(os.path.join(self.folder_path, "embedding_batches"), exist_ok=True)
+ chunked_df = pd.read_csv(self.chunks_path)
+ n_chunks = len(chunked_df)
+
+ if (row_end is None) or (row_end > n_chunks):
+ row_end = n_chunks
+
+ # handle index error values and negative indexes
+ if row_start < -n_chunks - 1:
+ raise IndexError(f"Row start {row_start} is out of bounds for {n_chunks} chunks.")
+ elif row_start < 0:
+ row_start = n_chunks + row_start + 1
+ if row_end < -n_chunks -1:
+ raise IndexError(f"Row end {row_end} is out of bounds for {n_chunks} chunks.")
+ elif row_end < 0:
+ row_end = n_chunks + row_end +1
+ if row_start >= n_chunks:
+ raise IndexError(f"Start index of {row_start} is out of bounds for {n_chunks} chunks")
+ if row_end <= row_start:
+ raise ValueError(f"Row end ({row_end}) cannot be less than the row start ({row_start}).")
+
+ batch_path = os.path.join(self.folder_path, "embedding_batches")
+ pattern = r"\((\d+)-(\d+)\)"
+
+ contained_ranges = []
+
+ for filename in os.listdir(batch_path):
+ match = re.search(pattern, filename)
+ batch_start = int(match.group(1))
+ batch_end = int(match.group(2))
+ if (batch_start < row_end) and (batch_end > row_start):
+ if batch_start < row_start:
+ batch_start = row_start
+ if batch_end > row_end:
+ batch_end = row_end
+ contained_ranges.append((batch_start, batch_end))
+
+ contained_ranges.sort(key=lambda x: x[1])
+
+ segments = []
+ for batch_start, batch_end in contained_ranges:
+ if row_start < row_end:
+ if (batch_start > row_start):
+ segments.append((row_start, batch_start))
+ row_start = batch_end
+ if row_start < row_end:
+ segments.append((row_start, row_end))
+
+ for start, end in segments:
+ current_row = start
+
+ while current_row < end:
+ df = chunked_df[current_row:min(end, current_row + batch_size)]
+
+ tqdm.pandas()
+ embeddings = np.array(df['content'].progress_apply(self._embed_string).to_list())
+
+                # Save this batch of embeddings to its own .npy file in 'embedding_batches'
+ np.save(os.path.join(self.folder_path, f"embedding_batches/embeddings ({current_row}-{min(end, current_row + batch_size)}).npy"), embeddings)
+ print(f"Embedding batch complete and saved to embeddings ({current_row}-{min(end, current_row + batch_size)}).npy').")
+ current_row += batch_size
+
+ self._combine_embeddings()
+
+ def create_flat_index(self, embeddings: np.ndarray = None) -> None:
+ """
+ Creates a FAISS flat index from the provided embeddings and saves it to a file.
+
+ :param embeddings: The embeddings to use for creating the index. If None, the method will load embeddings from file.
+ """
+ embeddings = self._check_for_embeddings(embeddings)
+ self.index = faiss_index.create_flat_index(embeddings, file_path=os.path.join(self.folder_path, "index.faiss"))
+
+ def create_ivf_flat_index(self, embeddings: np.ndarray = None, nlist: int = None) -> None:
+ """
+ Creates a FAISS IVF flat index from the provided embeddings and saves it to a file.
+
+ :param embeddings: The embeddings to use for creating the index. If None, the method will load embeddings from file.
+ :param nlist: Number of partitions (clusters) in the IVF index.
+ """
+ embeddings = self._check_for_embeddings(embeddings)
+ self.index = faiss_index.create_IVF_flat_index(embeddings, nlist=nlist, file_path=os.path.join(self.folder_path, "index.faiss"))
+
+ def create_hnsw_index(self,
+ embeddings: np.ndarray = None,
+ M: int = 64,
+ efConstruction: int = 64) -> None:
+ """
+ Creates a FAISS HNSW index from the provided embeddings and saves it to a file.
+
+ :param embeddings: The embeddings to use for creating the index. If None, the method will load embeddings from file.
+ :param M: The number of neighbors to use in the HNSW graph.
+ :param efConstruction: The size of the dynamic list used during the construction of the HNSW graph.
+ """
+ embeddings = self._check_for_embeddings(embeddings)
+ self.index = faiss_index.create_HNSW_index(embeddings, M=M, efConstruction=efConstruction, file_path=os.path.join(self.folder_path, "index.faiss"))
+
+    def search(self, query: str, k: int = 1, *args):
+        """
+        Searches the FAISS index for the most similar chunks to the provided query based on the embeddings.
+
+        :param query: The query string to search for.
+        :param k: The number of nearest neighbors to retrieve.
+        :param args: Additional arguments passed to the FAISS query method.
+
+        :return: A DataFrame containing the most similar chunks based on the query (fewer than k rows if the index returns fewer matches).
+        :raises FileNotFoundError: If 'data_chunked.csv' or FAISS index is not found.
+        :raises EncodingModelError: If no encoding model is loaded.
+        """
+        if self.chunks_path is None:
+            raise FileNotFoundError(f"Error: data_chunked.csv not located in {self.folder_path}")
+        if self.index is None:
+            raise FileNotFoundError(f"Error: no FAISS index found in {self.folder_path}")
+        if self.encoder is None:
+            raise EncodingModelError("Error: no encoding model found. Run 'load_embedding_model' first.")
+        xq = np.expand_dims(self._embed_string(query), axis=0)
+        df = pd.read_csv(os.path.join(self.folder_path, 'data_chunked.csv'))
+        _, indexes = self.index.query(xq, k, *args)
+        hits = np.asarray(indexes[0])
+        return df.iloc[hits[hits >= 0]]  # FAISS fills missing neighbours with -1, which iloc would read as the last row
diff --git a/faiss_search/tools/errors.py b/faiss_search/tools/errors.py
index 0b0050a..473ca05 100644
--- a/faiss_search/tools/errors.py
+++ b/faiss_search/tools/errors.py
@@ -1,7 +1,7 @@
-from file_processing.tools.errors import FileProcessorError
-
-class FileTypeError(FileProcessorError):
- """Raised when the provided file is of the incorrect file type."""
-
-class EncodingModelError(FileProcessorError):
- """Raised when there is no encoding model specified."""
+from file_processing.tools.errors import FileProcessorError
+
+class FileTypeError(FileProcessorError):
+ """Raised when a provided file does not have the expected file type."""
+
+class EncodingModelError(FileProcessorError):
+ """Raised when an operation needs an encoding model but none has been loaded."""
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..2453f58
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,14 @@
+[build-system]
+requires = ["setuptools>=61.0", "wheel>=0.40.0", "build>=1.0.3"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "faiss_search"
+description = "A semantic embedding and FAISS search pipeline for document retrieval"
+readme = "README.md"
+requires-python = ">=3.10"
+classifiers = ["Programming Language :: Python :: 3"]
+dynamic = ["version", "dependencies"]
+
+[tool.setuptools.dynamic]
+dependencies = { file = ["requirements.txt"] }
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index fe36caa..b3e15b4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,9 @@
-faiss-cpu==1.7.4
-ipykernel==6.28.0
-langchain
-numpy==1.26.2
-pandas>=2.0.3
-pytest==7.4.0
-tqdm==4.66.1
-sentence-transformers==2.2.2
-git+https://github.com/hc-sc-ocdo-bdpd/file-processing-tools.git
\ No newline at end of file
+faiss-cpu==1.7.4
+ipykernel==6.28.0
+langchain
+numpy==1.26.2
+pandas>=2.0.3
+pytest==7.4.0
+tqdm==4.66.1
+sentence-transformers==2.2.2
+file-processing @ git+https://github.com/hc-sc-ocdo-bdpd/file-processing-tools.git
\ No newline at end of file
diff --git a/search_directory_demo.ipynb b/search_directory_demo.ipynb
new file mode 100644
index 0000000..054d1e8
--- /dev/null
+++ b/search_directory_demo.ipynb
@@ -0,0 +1,401 @@
+{
+ "cells": [
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Search Directory Tutorial\n",
+ "\n",
+ "This notebook demonstrates how to use the `SearchDirectory` class to create embeddings and query a set of documents using a FAISS index.\n",
+ "\n",
+ "The process can be broken down into the following steps:\n",
+ "1. Get text information from documents in a directory\n",
+ "2. Chunk the text data\n",
+ "3. Load an embedding model\n",
+ "4. Embed the chunked text\n",
+ "5. Create a FAISS index\n",
+ "6. Use a query to search the FAISS index"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "c:\\Users\\TYWILSON\\OneDrive - HC-SC PHAC-ASPC\\Documents\\GitHub\\faiss-search\\.venv\\lib\\site-packages\\pypdf\\_crypt_providers\\_cryptography.py:32: CryptographyDeprecationWarning: ARC4 has been moved to cryptography.hazmat.decrepit.ciphers.algorithms.ARC4 and will be removed from this module in 48.0.0.\n",
+ " from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4\n"
+ ]
+ }
+ ],
+ "source": [
+ "import os\n",
+ "import shutil\n",
+ "from faiss_search import SearchDirectory"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 1. Get text information from documents in a directory\n",
+ "\n",
+ "This step can be skipped if you already have a CSV containing the document names/file paths and the text data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# specify a path to save the chunking, embedding, and faiss index to\n",
+ "os.makedirs('sample_search_docs', exist_ok=True)\n",
+ "search_dir_path = \"sample_search_docs\"\n",
+ "\n",
+ "# create a SearchDirectory object\n",
+ "search = SearchDirectory(search_dir_path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Processing files: 20 files completed [00:00, 50.49 files completed/s]\n",
+ "Processing batches: 1 batches completed [00:00, 2.51 batches completed/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# specify the path with the files to extract text from\n",
+ "resource_path = \"tests/resources/sample_text_files\"\n",
+ "\n",
+ "# generate a CSV report that contains text information\n",
+ "search.report_from_directory(resource_path)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 2. Chunk the text data from the report\n",
+ "\n",
+ "You can either pass no arguments, in which case the `report.csv` generated in the previous step is used, or specify the file path of another CSV file containing text data, along with the names of its file path and text content columns."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Total rows (excluding header): 20\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Processing rows: 100%|██████████| 20/20 [00:00<00:00, 9787.20it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Chunking complete and saved to 'data_chunked.csv'.\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# generate chunks from the report\n",
+ "search.chunk_text()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# generate chunks from a CSV file\n",
+ "search.chunk_text(\"tests/resources/search_directory_test_files/report_modified.csv\",\n",
+ " \"path\",\n",
+ " \"content\")"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 3. Specify the embedding model to use"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "c:\\Users\\TYWILSON\\OneDrive - HC-SC PHAC-ASPC\\Documents\\GitHub\\faiss-search\\.venv\\lib\\site-packages\\transformers\\tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
+ " warnings.warn(\n"
+ ]
+ }
+ ],
+ "source": [
+ "search.load_embedding_model(\"paraphrase-MiniLM-L3-v2\")"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 4. Perform embeddings on the chunked text data\n",
+ "\n",
+ "By default, this will split the task into batches that are saved to store progress during long computations. This is demonstrated by specifying the `batch_size` to be 20 chunks. The embeddings can also be broken down further by specifying the start and end chunks. The function is also designed to not recompute any chunks that have already been saved.\n",
+ "\n",
+ "Once all the chunks are computed and saved, they are combined and saved to `embeddings.npy`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 20/20 [00:04<00:00, 4.62it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Embedding batch complete and saved to embeddings (0-20).npy').\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 20/20 [00:00<00:00, 38.44it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Embedding batch complete and saved to embeddings (20-40).npy').\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 20/20 [00:00<00:00, 40.23it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Embedding batch complete and saved to embeddings (40-60).npy').\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 16/16 [00:00<00:00, 38.02it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Embedding batch complete and saved to embeddings (60-76).npy').\n",
+ "Embeddings combined and saved to embeddings.npy\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "search.embed_text(row_start=0, row_end=-1, batch_size=20)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 5. Create the FAISS index\n",
+ "\n",
+ "Several types of FAISS indexes can be created, each with different hyperparameters. Creating and using FAISS indexes is demonstrated in more depth in `faiss_demo.ipynb`. This class uses the same methods as that demo but always saves the FAISS index after creating it."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "search.create_flat_index()"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 6. Query the FAISS index and find the most similar documents\n",
+ "\n",
+ "Specify a query and the number of similar chunks to return (as well as any hyperparameters depending on the FAISS index used) and this will return a data frame with the most similar chunks (according to the embedding and FAISS models used)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " file_path | \n",
+ " content | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 10 | \n",
+ " tests\\resources\\sample_text_files\\climate_chan... | \n",
+ " The earth's climate is naturally variable on a... | \n",
+ "
\n",
+ " \n",
+ " | 43 | \n",
+ " tests\\resources\\sample_text_files\\history_of_c... | \n",
+ " The Huron-Wendat of the Great Lakes Region, li... | \n",
+ "
\n",
+ " \n",
+ " | 56 | \n",
+ " tests\\resources\\sample_text_files\\origin_of_na... | \n",
+ " Origin of the name \"Canada\"\\nToday, it seems i... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " file_path \\\n",
+ "10 tests\\resources\\sample_text_files\\climate_chan... \n",
+ "43 tests\\resources\\sample_text_files\\history_of_c... \n",
+ "56 tests\\resources\\sample_text_files\\origin_of_na... \n",
+ "\n",
+ " content \n",
+ "10 The earth's climate is naturally variable on a... \n",
+ "43 The Huron-Wendat of the Great Lakes Region, li... \n",
+ "56 Origin of the name \"Canada\"\\nToday, it seems i... "
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "query = \"What is the meaning of life, the universe, and everything?\"\n",
+ "search.search(query, k=3)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Clean up created files"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "shutil.rmtree(\"sample_search_docs\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.4"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/tests/resources/sample_embedding_files/paraphrasemini-space-embeddings.npy b/tests/resources/sample_embedding_files/paraphrasemini-space-embeddings.npy
new file mode 100644
index 0000000..6fcaa65
Binary files /dev/null and b/tests/resources/sample_embedding_files/paraphrasemini-space-embeddings.npy differ
diff --git a/tests/resources/sample_text_files/CPP_disability_benefits.txt b/tests/resources/sample_text_files/CPP_disability_benefits.txt
index 45b18c9..951603f 100644
--- a/tests/resources/sample_text_files/CPP_disability_benefits.txt
+++ b/tests/resources/sample_text_files/CPP_disability_benefits.txt
@@ -1,111 +1,111 @@
-Canada Pension Plan disability benefits
-
-Overview
-The Canada Pension Plan (CPP) disability benefit is a monthly payment you can get if you:
-
-are under 65
-have made enough contributions into the CPP
-have a mental or physical disability that regularly stops you from doing any type of substantially gainful work
-have a disability that is long-term and of indefinite duration, or is likely to result in death
-Find out if you might be eligible.
-
-Two types of CPP disability benefits
-Benefit name Age Must be
-CPP disability benefit Under 65 Not receiving the CPP retirement pension
-CPP post-retirement disability benefit From 60 to 65 Already receiving the CPP retirement pension for more than 15 months or become disabled after starting to receive the retirement pension
-When you turn 65 your CPP disability benefit is automatically changed to a CPP retirement pension.
-
-Children's benefit
-If you are getting a disability benefit, your dependent children may be able to get a monthly payment.
-
-Benefit name Age Must be
-CPP Children's benefit Under 18 or between the ages of 18 and 25 and in full-time attendance at a recognized school or university The child of a person receiving a disability benefit
-Find out more about the CPP Children's benefit.
-
-When you should apply
-You have to apply for this benefit. You should apply for this benefit as soon as possible.
-
-When payments start
-Your decision letter will give you the date and amount of your first payment.
-
-How long for a decision
-It can take up to 4 months for a decision to be made. The date your application form is received may affect the date your benefit begins. Do not wait for your completed medical form before sending your completed application form.
-
-However, a decision on your application can only be made when you have sent both of the following:
-
-a complete application form (including questionnaire and signed consent form)
-a signed medical report
-For eligible disabilities, we aim to make a decision on your eligibility within 120 calendar days (4 months).
-
-If we confirm your condition is grave, we aim to process your application within 30 calendar days.
-
-If we confirm you have a terminal illness, we aim to process your application within 5 business days.
-
-Definitions
-Substantially gainful work
-Substantially gainful work is a job that pays wages equal to or greater than the maximum annual amount a person could receive as a disability pension. In 2023, this amount is $18,508.36 (before tax).
-
-Once you have earned $6,600 (before tax) in 2023, you must contact Service Canada. Your disability benefits may be impacted by your gross (before tax) earnings:
-
-if you earn below $6,600 (before tax), this alone should not affect your disability benefits
-if you earn between $6,600 and $18,508.36 (before tax), this may show that you are regularly capable of working and it may affect your disability benefits
-if you earn $18,508.36 (before tax) or more, this demonstrates you are regularly capable of working and you will likely no longer qualify for disability benefits
-You need to advise Service Canada when you reach any of these amounts and you should call when you start working. This does not necessarily mean that your benefits will stop. When reviewing your file, we will consider the factors listed above (hours, regularity, etc.). By contacting us, we can provide you with information about our work-related supports and services.
-
-Terminal illness
-A terminal medical condition is a disease state that cannot be cured or adequately treated and is reasonably expected to result in death within 6 months.
-
-Grave condition
-A grave condition is a rapidly progressive medical condition. A list of grave conditions was developed based on extensive research by ESDC. These conditions have a high probability of meeting the CPP disability eligibility criteria.
-
-Acute Lymphoblastic Leukemia
-Acute Myeloid Leukemia
-Adrenocortical Cancer
-Alzheimer's Disease: (Early onset, less than age 60)
-Amyloidosis
-Amyotrophic Lateral Sclerosis (ALS)
-Anal Cancer
-Appendiceal Cancer
-Bladder Cancer (Metastatic, Stage IV)
-Brain Cancer
-Breast Cancer (Metastatic/recurrent)
-Cervical Carcinoma
-Chronic Kidney Disease (CKD)
-Chronic Liver Disease
-Colorectal Cancer
-Endometrial Cancer
-Esophagus Cancer
-Follicular Lymphoma
-Frontotemporal Dementia (FTD)
-Gallbladder Cancer
-Huntington Disease
-Idiopathic Pulmonary Fibrosis (IPF)
-Kidney Cancer
-Liver Cancer
-Lung Cancer
-Malignant Melanoma
-Malignant Tumours of Small Intestine
-Multiple Myeloma
-Muscular Dystrophy (Adult onset)
-Ovarian Cancer
-Pancreatic Cancer
-Parkinson's Disease
-Post-inflammatory Pulmonary Fibrosis
-Primary Cerebellar Degeneration
-Progressive Polyneuropathy
-Quadriplegia and Quadriparesis
-Schizophrenia
-Stomach Cancer
-Thymus Cancer
-Uterine Sarcoma
-Vascular Dementia
-Where you can get help
-If you have general questions about the disability benefits or specific questions about your application, contact us .
-
-If your medical condition is short-term or temporary
-If your medical condition is expected to be short-term or temporary, you will not be eligible for CPP disability benefits. You may be eligible for other benefits such as Employment Insurance sickness.
-
-Other disability resources
-Canada Pension Plan disability benefit toolkit
-Other resources for people with disabilities
+Canada Pension Plan disability benefits
+
+Overview
+The Canada Pension Plan (CPP) disability benefit is a monthly payment you can get if you:
+
+are under 65
+have made enough contributions into the CPP
+have a mental or physical disability that regularly stops you from doing any type of substantially gainful work
+have a disability that is long-term and of indefinite duration, or is likely to result in death
+Find out if you might be eligible.
+
+Two types of CPP disability benefits
+Benefit name Age Must be
+CPP disability benefit Under 65 Not receiving the CPP retirement pension
+CPP post-retirement disability benefit From 60 to 65 Already receiving the CPP retirement pension for more than 15 months or become disabled after starting to receive the retirement pension
+When you turn 65 your CPP disability benefit is automatically changed to a CPP retirement pension.
+
+Children's benefit
+If you are getting a disability benefit, your dependent children may be able to get a monthly payment.
+
+Benefit name Age Must be
+CPP Children's benefit Under 18 or between the ages of 18 and 25 and in full-time attendance at a recognized school or university The child of a person receiving a disability benefit
+Find out more about the CPP Children's benefit.
+
+When you should apply
+You have to apply for this benefit. You should apply for this benefit as soon as possible.
+
+When payments start
+Your decision letter will give you the date and amount of your first payment.
+
+How long for a decision
+It can take up to 4 months for a decision to be made. The date your application form is received may affect the date your benefit begins. Do not wait for your completed medical form before sending your completed application form.
+
+However, a decision on your application can only be made when you have sent both of the following:
+
+a complete application form (including questionnaire and signed consent form)
+a signed medical report
+For eligible disabilities, we aim to make a decision on your eligibility within 120 calendar days (4 months).
+
+If we confirm your condition is grave, we aim to process your application within 30 calendar days.
+
+If we confirm you have a terminal illness, we aim to process your application within 5 business days.
+
+Definitions
+Substantially gainful work
+Substantially gainful work is a job that pays wages equal to or greater than the maximum annual amount a person could receive as a disability pension. In 2023, this amount is $18,508.36 (before tax).
+
+Once you have earned $6,600 (before tax) in 2023, you must contact Service Canada. Your disability benefits may be impacted by your gross (before tax) earnings:
+
+if you earn below $6,600 (before tax), this alone should not affect your disability benefits
+if you earn between $6,600 and $18,508.36 (before tax), this may show that you are regularly capable of working and it may affect your disability benefits
+if you earn $18,508.36 (before tax) or more, this demonstrates you are regularly capable of working and you will likely no longer qualify for disability benefits
+You need to advise Service Canada when you reach any of these amounts and you should call when you start working. This does not necessarily mean that your benefits will stop. When reviewing your file, we will consider the factors listed above (hours, regularity, etc.). By contacting us, we can provide you with information about our work-related supports and services.
+
+Terminal illness
+A terminal medical condition is a disease state that cannot be cured or adequately treated and is reasonably expected to result in death within 6 months.
+
+Grave condition
+A grave condition is a rapidly progressive medical condition. A list of grave conditions was developed based on extensive research by ESDC. These conditions have a high probability of meeting the CPP disability eligibility criteria.
+
+Acute Lymphoblastic Leukemia
+Acute Myeloid Leukemia
+Adrenocortical Cancer
+Alzheimer's Disease: (Early onset, less than age 60)
+Amyloidosis
+Amyotrophic Lateral Sclerosis (ALS)
+Anal Cancer
+Appendiceal Cancer
+Bladder Cancer (Metastatic, Stage IV)
+Brain Cancer
+Breast Cancer (Metastatic/recurrent)
+Cervical Carcinoma
+Chronic Kidney Disease (CKD)
+Chronic Liver Disease
+Colorectal Cancer
+Endometrial Cancer
+Esophagus Cancer
+Follicular Lymphoma
+Frontotemporal Dementia (FTD)
+Gallbladder Cancer
+Huntington Disease
+Idiopathic Pulmonary Fibrosis (IPF)
+Kidney Cancer
+Liver Cancer
+Lung Cancer
+Malignant Melanoma
+Malignant Tumours of Small Intestine
+Multiple Myeloma
+Muscular Dystrophy (Adult onset)
+Ovarian Cancer
+Pancreatic Cancer
+Parkinson's Disease
+Post-inflammatory Pulmonary Fibrosis
+Primary Cerebellar Degeneration
+Progressive Polyneuropathy
+Quadriplegia and Quadriparesis
+Schizophrenia
+Stomach Cancer
+Thymus Cancer
+Uterine Sarcoma
+Vascular Dementia
+Where you can get help
+If you have general questions about the disability benefits or specific questions about your application, contact us .
+
+If your medical condition is short-term or temporary
+If your medical condition is expected to be short-term or temporary, you will not be eligible for CPP disability benefits. You may be eligible for other benefits such as Employment Insurance sickness.
+
+Other disability resources
+Canada Pension Plan disability benefit toolkit
+Other resources for people with disabilities
One-time payment to persons with disabilities Status: Closed
\ No newline at end of file
diff --git a/tests/resources/sample_text_files/CPP_retirement_pension.txt b/tests/resources/sample_text_files/CPP_retirement_pension.txt
index c94a6bb..73ab066 100644
--- a/tests/resources/sample_text_files/CPP_retirement_pension.txt
+++ b/tests/resources/sample_text_files/CPP_retirement_pension.txt
@@ -1,87 +1,87 @@
-CPP Retirement pension
-
-Overview
-The Canada Pension Plan (CPP) retirement pension is a monthly, taxable benefit that replaces part of your income when you retire. If you qualify, you’ll receive the CPP retirement pension for the rest of your life. To qualify you must:
-
-be at least 60 years old
-have made at least one valid contribution to the CPP
-Valid contributions can be either from work you did in Canada, or as the result of receiving credits from a former spouse or former common-law partner at the end of the relationship.
-
-You must apply
-CPP payments are not automatic. You must apply. You should apply in advance of when you want your pension to start.
-
-Our goal is to pay your CPP retirement pension in the month of the start date you choose.
-
-Pension amount
-The amount you receive each month is based on your average earnings throughout your working life, your contributions to the CPP, and the age you decide to start your CPP retirement pension. Your contributions to the CPP are based on your earnings.
-
-The standard age to start the pension is 65. However, you can start receiving it as early as age 60 or as late as age 70.
-
-If you start receiving your pension earlier, the monthly amount you’ll receive will be smaller. If you decide to start later, you’ll receive a larger monthly amount. There’s no benefit to wait after age 70 to start receiving the pension. The maximum monthly amount you can receive is reached when you turn 70.
-
-There are different factors that can affect how much you'll receive, such as time taken off from work to care for young children. Find out more about how much you could receive. You can also work while receiving a CPP retirement pension.
-
-How long will it take to process your application
-We begin to process your application once we receive your completed application form. You should receive a notice of our decision by mail within 120 days.
-
-It could take longer to process your application if Service Canada does not have a complete application.
-
-You may also qualify for other CPP benefits
-In addition to the CPP retirement pension, you may also quality for other CPP benefits listed below. Like the CPP retirement pension, you will need to apply for these benefits (except for the Post-retirement benefit if you already receive the CPP retirement pension).
-
-Post-retirement benefit
-Disability pension
-Post-retirement disability benefit
-Survivor's pension
-Children's benefit
-Death benefit
-CPP enhancement
-As of 2019, the Canada Pension Plan (CPP) is gradually being enhanced. This means that today’s workers, the seniors of tomorrow, will have higher benefits and greater financial stability through a small increase in the amount they contribute to the CPP.
-
-The CPP enhancement only affects those who work and contribute to the CPP in 2019 or after.
-
-The enhancement adds 2 additional components to the CPP. These components are not a separate benefit, but a ‘top-up’ to the base CPP.
-
-The CPP now consists of:
-
-the base (or original CPP)
-the first additional component, which was phased in between 2019 and 2023, and
-the second additional component, which will be phased in between 2024 and 2025
-The CPP enhancement will increase the amount working Canadians receive in the:
-
-CPP retirement pension
-Post-retirement benefit
-Disability pension
-Survivor’s pension
-It will not affect eligibility for CPP benefits. Learn more about the CPP enhancement.
-
-Frequently asked questions
-What is the CPP?
-A monthly, taxable benefit that replaces part of your income when you retire. If you qualify, you’ll receive the CPP retirement pension for the rest of your life.
-
-How much is CPP at 60?
-At age 60, your CPP amount depends on your contributions and your average annual earnings.
-
-How much do you get from CPP?
-Your CPP amount depends on the age you started your pension, your contributions and your average annual earnings.
-
-How many years do you need to work to get CPP?
-Everyone is entitled to CPP regardless of how many years you have worked. How much you receive depends on your earnings as well as your contributions.
-
-Who is eligible for the Canada Pension Plan?
-To qualify for the CPP, you must be at least 60 years old and have made valid contributions.
-
-How do I apply for my Canada Pension?
-If you qualify for CPP, you can apply online. You can also mail or drop-off a completed form to a Service Canada office.
-
-Should I take my Canada pension at 60 or 65?
-Deciding when to start collecting CPP should be based on your finances, health, life expectancy and taxes. The main reason to delay CPP is that you will receive a larger benefit.
-
-When should I apply for CPP benefits?
-The standard age to start CPP is 65. You can start receiving as early as age 60 or as late as age 70. You should apply in advance of when you want your pension to start.
-
-How long does it take to receive CPP after applying?
-It takes approximately 7 to 28 days for online applications, 120 days for applications delivered by mail or in-person to a Service Canada Centre.
-
-Contact us
+CPP Retirement pension
+
+Overview
+The Canada Pension Plan (CPP) retirement pension is a monthly, taxable benefit that replaces part of your income when you retire. If you qualify, you’ll receive the CPP retirement pension for the rest of your life. To qualify you must:
+
+be at least 60 years old
+have made at least one valid contribution to the CPP
+Valid contributions can be either from work you did in Canada, or as the result of receiving credits from a former spouse or former common-law partner at the end of the relationship.
+
+You must apply
+CPP payments are not automatic. You must apply. You should apply in advance of when you want your pension to start.
+
+Our goal is to pay your CPP retirement pension in the month of the start date you choose.
+
+Pension amount
+The amount you receive each month is based on your average earnings throughout your working life, your contributions to the CPP, and the age you decide to start your CPP retirement pension. Your contributions to the CPP are based on your earnings.
+
+The standard age to start the pension is 65. However, you can start receiving it as early as age 60 or as late as age 70.
+
+If you start receiving your pension earlier, the monthly amount you’ll receive will be smaller. If you decide to start later, you’ll receive a larger monthly amount. There’s no benefit to wait after age 70 to start receiving the pension. The maximum monthly amount you can receive is reached when you turn 70.
+
+There are different factors that can affect how much you'll receive, such as time taken off from work to care for young children. Find out more about how much you could receive. You can also work while receiving a CPP retirement pension.
+
+How long will it take to process your application
+We begin to process your application once we receive your completed application form. You should receive a notice of our decision by mail within 120 days.
+
+It could take longer to process your application if Service Canada does not have a complete application.
+
+You may also qualify for other CPP benefits
+In addition to the CPP retirement pension, you may also quality for other CPP benefits listed below. Like the CPP retirement pension, you will need to apply for these benefits (except for the Post-retirement benefit if you already receive the CPP retirement pension).
+
+Post-retirement benefit
+Disability pension
+Post-retirement disability benefit
+Survivor's pension
+Children's benefit
+Death benefit
+CPP enhancement
+As of 2019, the Canada Pension Plan (CPP) is gradually being enhanced. This means that today’s workers, the seniors of tomorrow, will have higher benefits and greater financial stability through a small increase in the amount they contribute to the CPP.
+
+The CPP enhancement only affects those who work and contribute to the CPP in 2019 or after.
+
+The enhancement adds 2 additional components to the CPP. These components are not a separate benefit, but a ‘top-up’ to the base CPP.
+
+The CPP now consists of:
+
+the base (or original CPP)
+the first additional component, which was phased in between 2019 and 2023, and
+the second additional component, which will be phased in between 2024 and 2025
+The CPP enhancement will increase the amount working Canadians receive in the:
+
+CPP retirement pension
+Post-retirement benefit
+Disability pension
+Survivor’s pension
+It will not affect eligibility for CPP benefits. Learn more about the CPP enhancement.
+
+Frequently asked questions
+What is the CPP?
+A monthly, taxable benefit that replaces part of your income when you retire. If you qualify, you’ll receive the CPP retirement pension for the rest of your life.
+
+How much is CPP at 60?
+At age 60, your CPP amount depends on your contributions and your average annual earnings.
+
+How much do you get from CPP?
+Your CPP amount depends on the age you started your pension, your contributions and your average annual earnings.
+
+How many years do you need to work to get CPP?
+Everyone is entitled to CPP regardless of how many years you have worked. How much you receive depends on your earnings as well as your contributions.
+
+Who is eligible for the Canada Pension Plan?
+To qualify for the CPP, you must be at least 60 years old and have made valid contributions.
+
+How do I apply for my Canada Pension?
+If you qualify for CPP, you can apply online. You can also mail or drop-off a completed form to a Service Canada office.
+
+Should I take my Canada pension at 60 or 65?
+Deciding when to start collecting CPP should be based on your finances, health, life expectancy and taxes. The main reason to delay CPP is that you will receive a larger benefit.
+
+When should I apply for CPP benefits?
+The standard age to start CPP is 65. You can start receiving as early as age 60 or as late as age 70. You should apply in advance of when you want your pension to start.
+
+How long does it take to receive CPP after applying?
+It takes approximately 7 to 28 days for online applications, 120 days for applications delivered by mail or in-person to a Service Canada Centre.
+
+Contact us
For more information, contact the Canada Pension Plan program.
\ No newline at end of file
diff --git a/tests/resources/sample_text_files/EI_regular_benefits.txt b/tests/resources/sample_text_files/EI_regular_benefits.txt
index ed446f9..6b118b9 100644
--- a/tests/resources/sample_text_files/EI_regular_benefits.txt
+++ b/tests/resources/sample_text_files/EI_regular_benefits.txt
@@ -1,65 +1,65 @@
-EI regular benefits
-
-How much you could receive
-From: Employment and Social Development Canada
-
-On this page
-You could get up to 55% of your earnings
-You can get benefits for up to a maximum of 45 weeks
-How we calculate your weekly benefit amount
-If your net family income is $25,921 or less
-Taxes are deducted from EI payments
-You could get up to 55% of your earnings
-We can’t tell you exactly how much you’ll receive before we process your application. For most people, the basic rate for calculating Employment Insurance (EI) benefits is 55% of their average insurable weekly earnings, up to a maximum amount. As of January 1, 2023, the maximum yearly insurable earnings amount is $61,500. This means that you can receive a maximum amount of $650 per week.
-
-Insurable earnings include most of the different types of compensation from employment, such as wages, tips, bonuses and commissions. The Canada Revenue Agency determines what types of earnings are insurable.
-
-You can get benefits for up to a maximum of 45 weeks
-You can receive EI from 14 weeks up to a maximum of 45 weeks, depending on the unemployment rate in your region at the time of filing your claim and the amount of insurable hours you've accumulated in the last 52 weeks or since your last claim, whichever is shorter.
-
-Seasonal workers
-Number of weeks of EI regular benefits payable by regional rate of unemployment
-The number of weeks for which you may receive benefits doesn’t change if you move to another region after your benefit period begins.
-
-How we calculate your weekly benefit amount
-The amount of weekly benefits is calculated as follows:
-
-we calculate your total insurable earnings for the required number of best weeks (the weeks that you earned the most money, including insurable tips and commissions) based on the information you provide and/or your record(s) of employment
-we determine the divisor (number of best weeks) that corresponds to your regional rate of unemployment
-we divide your total insurable earnings for your best weeks by your required number of best weeks
-we then multiply the result by 55% to obtain the amount of your weekly benefits
-In regions of Canada with the highest rates of unemployment, we’ll calculate using the best 14 weeks. In regions of Canada with the lowest rates of unemployment, we’ll use the best 22 weeks. In other regions, the number of weeks used to calculate benefits will be somewhere between 14 and 22, depending on the unemployment rate in those regions.
-
-Calculation
-Total insurable earnings for the required number of best weeks
-divided byRequired number of best weeks
-times by55 %
-equalsWeekly EI benefit (maximum $650)
-Required number of best weeks by regional rate of unemployment
-Regional rate of unemployment Required weeks
-6% or less 22
-6.1% to 7% 21
-7.1% to 8% 20
-8.1% to 9% 19
-9.1% to 10% 18
-10.1% to 11% 17
-11.1% to 12% 16
-12.1% to 13% 15
-13.1% or more 14
-To find out the rate of unemployment in your region, visit EI Program Characteristics.
-
-Once the weekly benefit rate is established, it will remain unchanged over the life of your claim.
-
-If your net family income is $25,921 or less
-If your net family income doesn’t exceed $25,921 per year, you have children and you or your spouse receives the Canada Child Benefit, you’re considered a member of a low-income family. Therefore, you may be eligible to receive the EI family supplement.
-
-The family supplement rate is based on:
-
-your net family income up to a maximum of $25,921 per year
-the number of children in the family and their ages
-The family supplement may increase your benefit rate up to 80% of your average insurable earnings. If you and your spouse claim EI benefits at the same time, only 1 of you can receive the family supplement. It is generally better for the spouse with the lower benefit rate to receive the supplement.
-
-As your income level rises, the Family Supplement gradually decreases, so that when the maximum income of $25,921 is reached no supplement is payable.
-
-Taxes are deducted from EI payments
+EI regular benefits
+
+How much you could receive
+From: Employment and Social Development Canada
+
+On this page
+You could get up to 55% of your earnings
+You can get benefits for up to a maximum of 45 weeks
+How we calculate your weekly benefit amount
+If your net family income is $25,921 or less
+Taxes are deducted from EI payments
+You could get up to 55% of your earnings
+We can’t tell you exactly how much you’ll receive before we process your application. For most people, the basic rate for calculating Employment Insurance (EI) benefits is 55% of their average insurable weekly earnings, up to a maximum amount. As of January 1, 2023, the maximum yearly insurable earnings amount is $61,500. This means that you can receive a maximum amount of $650 per week.
+
+Insurable earnings include most of the different types of compensation from employment, such as wages, tips, bonuses and commissions. The Canada Revenue Agency determines what types of earnings are insurable.
+
+You can get benefits for up to a maximum of 45 weeks
+You can receive EI from 14 weeks up to a maximum of 45 weeks, depending on the unemployment rate in your region at the time of filing your claim and the amount of insurable hours you've accumulated in the last 52 weeks or since your last claim, whichever is shorter.
+
+Seasonal workers
+Number of weeks of EI regular benefits payable by regional rate of unemployment
+The number of weeks for which you may receive benefits doesn’t change if you move to another region after your benefit period begins.
+
+How we calculate your weekly benefit amount
+The amount of weekly benefits is calculated as follows:
+
+we calculate your total insurable earnings for the required number of best weeks (the weeks that you earned the most money, including insurable tips and commissions) based on the information you provide and/or your record(s) of employment
+we determine the divisor (number of best weeks) that corresponds to your regional rate of unemployment
+we divide your total insurable earnings for your best weeks by your required number of best weeks
+we then multiply the result by 55% to obtain the amount of your weekly benefits
+In regions of Canada with the highest rates of unemployment, we’ll calculate using the best 14 weeks. In regions of Canada with the lowest rates of unemployment, we’ll use the best 22 weeks. In other regions, the number of weeks used to calculate benefits will be somewhere between 14 and 22, depending on the unemployment rate in those regions.
+
+Calculation
+Total insurable earnings for the required number of best weeks
+divided by Required number of best weeks
+times by 55%
+equals Weekly EI benefit (maximum $650)
+Required number of best weeks by regional rate of unemployment
+Regional rate of unemployment Required weeks
+6% or less 22
+6.1% to 7% 21
+7.1% to 8% 20
+8.1% to 9% 19
+9.1% to 10% 18
+10.1% to 11% 17
+11.1% to 12% 16
+12.1% to 13% 15
+13.1% or more 14
+To find out the rate of unemployment in your region, visit EI Program Characteristics.
+
+Once the weekly benefit rate is established, it will remain unchanged over the life of your claim.
+
+If your net family income is $25,921 or less
+If your net family income doesn’t exceed $25,921 per year, you have children and you or your spouse receives the Canada Child Benefit, you’re considered a member of a low-income family. Therefore, you may be eligible to receive the EI family supplement.
+
+The family supplement rate is based on:
+
+your net family income up to a maximum of $25,921 per year
+the number of children in the family and their ages
+The family supplement may increase your benefit rate up to 80% of your average insurable earnings. If you and your spouse claim EI benefits at the same time, only 1 of you can receive the family supplement. It is generally better for the spouse with the lower benefit rate to receive the supplement.
+
+As your income level rises, the Family Supplement gradually decreases, so that when the maximum income of $25,921 is reached no supplement is payable.
+
+Taxes are deducted from EI payments
EI benefits are taxable, no matter what type of benefits you receive. Federal and provincial or territorial taxes, where applicable, will be deducted from your payment.
\ No newline at end of file
diff --git a/tests/resources/sample_text_files/aviation_safety.txt b/tests/resources/sample_text_files/aviation_safety.txt
index a0113d5..51749c9 100644
--- a/tests/resources/sample_text_files/aviation_safety.txt
+++ b/tests/resources/sample_text_files/aviation_safety.txt
@@ -1,81 +1,81 @@
-Aviation safety in Canada
-From: Transport Canada
-
-
-
-
-Canada’s skies are busy. Millions of flights and more than 150 million passengers travel our 15 million square kilometres of airspace every year.
-
-Transport Canada’s Aviation Safety Program
-We work with industry and government partners to help keep planes, people and airports safe and secure. We develop and implement policies, regulations and administer the Canadian Aviation Regulations (CARs).
-
-We set the standards for aircraft designed and used in Canada to ensure all meet certification requirements and operational airworthiness. We also oversee the licensing and training of pilots, crew members and aircraft maintenance engineers.
-
-The work we do is not only applicable to commercial air carriers and those who work in the industry but also for other aircraft such as drones, hot air balloons, gliders and helicopters. Given there are over 34, 000 privately licensed pilots, recreational aviation is a big part of Canadian aviation.
-
-In addition to our regulatory approach for improving safety, we also rely on promotional and educational tools to promote adherence to safety regulations and best practices within the aviation industry and to build awareness of safety hazards and risks.
-
-As aviation is a global industry, connecting people, goods, and communities around the world, it is vital that our work extends outside of Canada so that people can have confidence in the safety of the aviation system around the world. Canada is working with international partners and key stakeholders to enhance the level of safety and security for commercial airlines travelling in or near conflict zones.
-
-Our work behind the scenes
-
-Airport
-We make and enforce regulations to:
-
-keep take-off and landing zones free of obstacle
-require runways, taxiways and airport lighting to be built to appropriate standards
-Drones
-We regulate the use of unmanned vehicles.
-
-Wildlife
-We help minimize the risks of wildlife hits.
-
-Hangar
-We:
-
-make sure Approved Maintenance Organizations meet our regulations for performing maintenance on registered aircraft
-issue Aircraft Maintenance Engineer licenses
-set aircraft manufacturing and maintenance requirements
-Airspace
-We regulate 15 million square kilometres of airspace.
-
-Control tower
-We make sure that air traffic control activities meet our safety regulations.
-
-Air carriers
-We verify that the aviation industry complies with the regulations through oversight activities.
-
-Flight deck
-We:
-
-set training standards
-certify and monitor training schools and their programs
-administer yearly examinations
-set medical standards and procedures
-Pilot
-We:
-
-review pilots’ physical and mental fitness
-set flight and duty time requirements
-Aircraft certification
-We:
-
-establish and regulate standards for Canadian aeronautical products
-guide the industry through the certification process
-ensure aircraft designs meet internationally accepted design standards
-support the industry to take corrective action
-Cabin safety
-We regulate training standards for emergency procedures, evacuation and training.
-
-We set and enforce safety rules and procedures from pre-flight briefings to the use of seatbelts and personal electronic devices.
-
-We monitor cabin safety practices and review and approve safety features cards and passenger briefings.
-
-Inspections
-We:
-
-train and delegate inspectors who promote safety and conduct over 10,000 oversight and certifications activities per month to verify that operators are following the rules
-conduct inspections of all registered and certified aerodromes, airports and heliports
-monitor industry for compliance with aviation safety rules and standards, and the effectiveness of a company’s Safety Management System
-when necessary, our inspectors issue fines and suspensions
-
+Aviation safety in Canada
+From: Transport Canada
+
+
+
+
+Canada’s skies are busy. Millions of flights and more than 150 million passengers travel our 15 million square kilometres of airspace every year.
+
+Transport Canada’s Aviation Safety Program
+We work with industry and government partners to help keep planes, people and airports safe and secure. We develop and implement policies, regulations and administer the Canadian Aviation Regulations (CARs).
+
+We set the standards for aircraft designed and used in Canada to ensure all meet certification requirements and operational airworthiness. We also oversee the licensing and training of pilots, crew members and aircraft maintenance engineers.
+
+The work we do is not only applicable to commercial air carriers and those who work in the industry but also to other aircraft such as drones, hot air balloons, gliders and helicopters. Given there are over 34,000 privately licensed pilots, recreational aviation is a big part of Canadian aviation.
+
+In addition to our regulatory approach for improving safety, we also rely on promotional and educational tools to promote adherence to safety regulations and best practices within the aviation industry and to build awareness of safety hazards and risks.
+
+As aviation is a global industry, connecting people, goods, and communities around the world, it is vital that our work extends outside of Canada so that people can have confidence in the safety of the aviation system around the world. Canada is working with international partners and key stakeholders to enhance the level of safety and security for commercial airlines travelling in or near conflict zones.
+
+Our work behind the scenes
+
+Airport
+We make and enforce regulations to:
+
+keep take-off and landing zones free of obstacles
+require runways, taxiways and airport lighting to be built to appropriate standards
+Drones
+We regulate the use of unmanned vehicles.
+
+Wildlife
+We help minimize the risks of wildlife hits.
+
+Hangar
+We:
+
+make sure Approved Maintenance Organizations meet our regulations for performing maintenance on registered aircraft
+issue Aircraft Maintenance Engineer licenses
+set aircraft manufacturing and maintenance requirements
+Airspace
+We regulate 15 million square kilometres of airspace.
+
+Control tower
+We make sure that air traffic control activities meet our safety regulations.
+
+Air carriers
+We verify that the aviation industry complies with the regulations through oversight activities.
+
+Flight deck
+We:
+
+set training standards
+certify and monitor training schools and their programs
+administer yearly examinations
+set medical standards and procedures
+Pilot
+We:
+
+review pilots’ physical and mental fitness
+set flight and duty time requirements
+Aircraft certification
+We:
+
+establish and regulate standards for Canadian aeronautical products
+guide the industry through the certification process
+ensure aircraft designs meet internationally accepted design standards
+support the industry to take corrective action
+Cabin safety
+We regulate training standards for emergency procedures, evacuation and training.
+
+We set and enforce safety rules and procedures from pre-flight briefings to the use of seatbelts and personal electronic devices.
+
+We monitor cabin safety practices and review and approve safety features cards and passenger briefings.
+
+Inspections
+We:
+
+train and delegate inspectors who promote safety and conduct over 10,000 oversight and certifications activities per month to verify that operators are following the rules
+conduct inspections of all registered and certified aerodromes, airports and heliports
+monitor industry for compliance with aviation safety rules and standards, and the effectiveness of a company’s Safety Management System
+when necessary, our inspectors issue fines and suspensions
+
diff --git a/tests/resources/sample_text_files/canadian_constitution.txt b/tests/resources/sample_text_files/canadian_constitution.txt
index 900e855..e681a54 100644
--- a/tests/resources/sample_text_files/canadian_constitution.txt
+++ b/tests/resources/sample_text_files/canadian_constitution.txt
@@ -1,49 +1,49 @@
-The Canadian Constitution
-A constitution provides the fundamental rules and principles that govern a country. It creates many of the institutions and branches of government, and defines their powers.
-
-Did you know?
-Canada was created by an act of the Parliament of the United Kingdom called the British North America Act, 1867 (now known as the Constitution Act, 1867) uniting the British colonies of the United Province of Canada, Nova Scotia, and New Brunswick.
-The Constitution of Canada includes the Constitution Act, 1867, and the Constitution Act, 1982. It is the supreme law of Canada. It reaffirms Canada's dual legal system and also includes Aboriginal rights and treaty rights.
-
-What does our Constitution say?
-The Constitution sets out the basic principles of democratic government in Canada when it defines the powers of the three branches of government:
-
-the executive
-the legislative
-the judiciary
-Did you know?
-The Constitution was "patriated" from the United Kingdom in 1982.
-
-When Canada was created, it was a self-governing British colony. The British North America Act, 1867, codified many constitutional rules for Canada, but major changes to the Constitution could only be made by the United Kingdom Parliament. In 1982, the Charter was enacted as part of Canada's Constitution along with a set of procedures allowing the Constitution to be amended in Canada.
-
-The Queen has the executive power in Canada, but in our democratic society the Queen's powers are exercised by constitutional convention on the advice of Ministers who enjoy the confidence of the House of Commons. Together, the Prime Minister and other Ministers form the cabinet, which is responsible to Parliament for government business. Ministers are also responsible for government departments, such as the Department of Finance and the Department of Justice. When we say "the government," we are usually referring to the executive branch.
-
-Parliament is the legislative branch of the federal government. Parliament consists of the Queen (who is usually represented by the Governor General), the Senate and the House of Commons. Bills are debated and passed by the Senate and the House of Commons. The Governor General must also give royal assent to a bill in order for it to become a law. By constitutional convention, royal assent is always given to bills passed by the Senate and the House of Commons.
-
-The Department of Justice
-The Minister of Justice is responsible for the Department of Justice, which provides legal services such as drafting laws and providing legal advice to the government and its departments. The department also develops criminal law and public law, as well as policies and programs for victims, families, children and youth criminal justice. The Minister of Justice is also the Attorney General or chief law officer of Canada
-Our Constitution also includes provisions relating to the judicial branch of government, composed of judges. The judiciary must interpret and apply the law and the Constitution, and give impartial judgments in all cases, whether they involve public law, such as a criminal case, or private law, such as a dispute over a contract.
-
-The Constitution only provides for federally appointed judges. Provincial judges are appointed under provincial laws.
-
-What is a federal system?
-The Parliament of Canada and the provincial and territorial legislatures both have the authority or jurisdiction to make laws. Parliament can make laws for all of Canada, but only about matters the Constitution assigns to it. A provincial or territorial legislature can only make laws about matters within the province's borders.
-
-Did you know?
-The Constitution Act, 1867 authorized Parliament to establish a general court of appeal for Canada, as well as any additional courts to better administer the laws of Canada. It was under this authority that the Federal Courts, the Tax Court, and the Supreme Court of Canada were established.
-The federal Parliament deals mainly with issues that concern Canada as a whole: trade between provinces, national defence, criminal law, money, patents, and the postal service. It is also responsible for the three territories: Yukon, the Northwest Territories, and Nunavut. Federal law allows territories to elect councils with powers like those of the provincial legislatures.
-
-The provinces have the authority to make laws about education, property, civil rights, the administration of justice, hospitals, municipalities, and other local or private matters within the provinces.
-
-Other federal systems
-Australia and the United States also have federal systems where jurisdiction is divided between the federal government and the various states. In contrast, the United Kingdom has a unitary system where there is only one level of government.
-There are also local or municipal governments. They are created under provincial laws and can make bylaws that regulate a variety of local matters: zoning, smoking, pesticide use, parking, business regulations, and construction permits.
-
-Aboriginal peoples in Canada have different types of government. For example, First Nations can have a range of governmental powers over reserve lands under the federal Indian Act. Other Aboriginal governments, such as self-governments, exercise these powers as a result of agreements they have negotiated with the federal and provincial or territorial governments.
-
-It was only with the Canadian Charter of Rights and Freedoms that human rights in Canada were protected in the written Constitution.
-The Constitution Act includes protection for the rights of the Aboriginal peoples (Indian, Inuit, and Métis) of Canada. Section 35 of the Constitution Act recognizes and affirms Aboriginal rights, which are rights related to the historical occupancy and use of the land by Aboriginal peoples. This is to help Aboriginal peoples preserve their customs and traditions for future generations, as continuing cultural practices. Section 35 also recognizes and affirms treaty rights, which are specifically set out in agreements between the Crown and particular groups of Aboriginal people.
-
-Bijuralism
-Canada is a bijural country – that means it has both common and civil law systems. Matters of private law in Quebec are governed by the civil law, while the common law applies in the other provinces. Federal bills and regulations must respect both types of systems, and the legal concepts within these laws must be expressed in both English and French.
-
+The Canadian Constitution
+A constitution provides the fundamental rules and principles that govern a country. It creates many of the institutions and branches of government, and defines their powers.
+
+Did you know?
+Canada was created by an act of the Parliament of the United Kingdom called the British North America Act, 1867 (now known as the Constitution Act, 1867) uniting the British colonies of the United Province of Canada, Nova Scotia, and New Brunswick.
+The Constitution of Canada includes the Constitution Act, 1867, and the Constitution Act, 1982. It is the supreme law of Canada. It reaffirms Canada's dual legal system and also includes Aboriginal rights and treaty rights.
+
+What does our Constitution say?
+The Constitution sets out the basic principles of democratic government in Canada when it defines the powers of the three branches of government:
+
+the executive
+the legislative
+the judiciary
+Did you know?
+The Constitution was "patriated" from the United Kingdom in 1982.
+
+When Canada was created, it was a self-governing British colony. The British North America Act, 1867, codified many constitutional rules for Canada, but major changes to the Constitution could only be made by the United Kingdom Parliament. In 1982, the Charter was enacted as part of Canada's Constitution along with a set of procedures allowing the Constitution to be amended in Canada.
+
+The Queen has the executive power in Canada, but in our democratic society the Queen's powers are exercised by constitutional convention on the advice of Ministers who enjoy the confidence of the House of Commons. Together, the Prime Minister and other Ministers form the cabinet, which is responsible to Parliament for government business. Ministers are also responsible for government departments, such as the Department of Finance and the Department of Justice. When we say "the government," we are usually referring to the executive branch.
+
+Parliament is the legislative branch of the federal government. Parliament consists of the Queen (who is usually represented by the Governor General), the Senate and the House of Commons. Bills are debated and passed by the Senate and the House of Commons. The Governor General must also give royal assent to a bill in order for it to become a law. By constitutional convention, royal assent is always given to bills passed by the Senate and the House of Commons.
+
+The Department of Justice
+The Minister of Justice is responsible for the Department of Justice, which provides legal services such as drafting laws and providing legal advice to the government and its departments. The department also develops criminal law and public law, as well as policies and programs for victims, families, children and youth criminal justice. The Minister of Justice is also the Attorney General or chief law officer of Canada.
+Our Constitution also includes provisions relating to the judicial branch of government, composed of judges. The judiciary must interpret and apply the law and the Constitution, and give impartial judgments in all cases, whether they involve public law, such as a criminal case, or private law, such as a dispute over a contract.
+
+The Constitution only provides for federally appointed judges. Provincial judges are appointed under provincial laws.
+
+What is a federal system?
+The Parliament of Canada and the provincial and territorial legislatures both have the authority or jurisdiction to make laws. Parliament can make laws for all of Canada, but only about matters the Constitution assigns to it. A provincial or territorial legislature can only make laws about matters within the province's borders.
+
+Did you know?
+The Constitution Act, 1867 authorized Parliament to establish a general court of appeal for Canada, as well as any additional courts to better administer the laws of Canada. It was under this authority that the Federal Courts, the Tax Court, and the Supreme Court of Canada were established.
+The federal Parliament deals mainly with issues that concern Canada as a whole: trade between provinces, national defence, criminal law, money, patents, and the postal service. It is also responsible for the three territories: Yukon, the Northwest Territories, and Nunavut. Federal law allows territories to elect councils with powers like those of the provincial legislatures.
+
+The provinces have the authority to make laws about education, property, civil rights, the administration of justice, hospitals, municipalities, and other local or private matters within the provinces.
+
+Other federal systems
+Australia and the United States also have federal systems where jurisdiction is divided between the federal government and the various states. In contrast, the United Kingdom has a unitary system where there is only one level of government.
+There are also local or municipal governments. They are created under provincial laws and can make bylaws that regulate a variety of local matters: zoning, smoking, pesticide use, parking, business regulations, and construction permits.
+
+Aboriginal peoples in Canada have different types of government. For example, First Nations can have a range of governmental powers over reserve lands under the federal Indian Act. Other Aboriginal governments, such as self-governments, exercise these powers as a result of agreements they have negotiated with the federal and provincial or territorial governments.
+
+It was only with the Canadian Charter of Rights and Freedoms that human rights in Canada were protected in the written Constitution.
+The Constitution Act includes protection for the rights of the Aboriginal peoples (Indian, Inuit, and Métis) of Canada. Section 35 of the Constitution Act recognizes and affirms Aboriginal rights, which are rights related to the historical occupancy and use of the land by Aboriginal peoples. This is to help Aboriginal peoples preserve their customs and traditions for future generations, as continuing cultural practices. Section 35 also recognizes and affirms treaty rights, which are specifically set out in agreements between the Crown and particular groups of Aboriginal people.
+
+Bijuralism
+Canada is a bijural country – that means it has both common and civil law systems. Matters of private law in Quebec are governed by the civil law, while the common law applies in the other provinces. Federal bills and regulations must respect both types of systems, and the legal concepts within these laws must be expressed in both English and French.
+
diff --git a/tests/resources/sample_text_files/climate_change_causes.txt b/tests/resources/sample_text_files/climate_change_causes.txt
index d6f7b75..36f7b32 100644
--- a/tests/resources/sample_text_files/climate_change_causes.txt
+++ b/tests/resources/sample_text_files/climate_change_causes.txt
@@ -1,45 +1,45 @@
-Causes of climate change
-What is the most important cause of climate change?
-Human activity is the main cause of climate change. People burn fossil fuels and convert land from forests to agriculture. Since the beginning of the Industrial Revolution, people have burned more and more fossil fuels and changed vast areas of land from forests to farmland.
-
-Burning fossil fuels produces carbon dioxide, a greenhouse gas. It is called a greenhouse gas because it produces a “greenhouse effect”. The greenhouse effect makes the earth warmer, just as a greenhouse is warmer than its surroundings.
-
-Carbon dioxide is the main cause of human-induced climate change.
-
-It stays in the atmosphere for a very long time. Other greenhouse gases, such as nitrous oxide, stay in the atmosphere for a long time. Other substances only produce short-term effects.
-
-Not all substances produce warming. Some, like certain aerosols, can produce cooling.
-
-What are climate forcers?
-Carbon dioxide and other substances are referred to as climate forcers because they force or push the climate towards being warmer or cooler. They do this by affecting the flow of energy coming into and leaving the earth’s climate system.
-
-Small changes in the sun’s energy that reaches the earth can cause some climate change. But since the Industrial Revolution, adding greenhouse gases has been over 50 times more powerful than changes in the Sun's radiance. The additional greenhouse gases in earth’s atmosphere have had a strong warming effect on earth’s climate.
-
-Future emissions of greenhouse gases, particularly carbon dioxide, will determine how much more climate warming occurs.
-
-What can be done about climate change?
-Carbon dioxide is the main cause of human-induced global warming and associated climate change. It is a very long-lived gas, which means carbon dioxide builds up in the atmosphere with ongoing human emissions and remains in the atmosphere for centuries. Global warming can only be stopped by reducing global emissions of carbon dioxide from human fossil fuel combustion and industrial processes to zero, but even with zero emissions, the global temperature will remain essentially constant at its new warmer level. Emissions of other substances that warm the climate must also be substantially reduced. This indicates how difficult the challenge is.
-
-What is climate change?
-Climate change is a long-term shift in weather conditions identified by changes in temperature, precipitation, winds, and other indicators. Climate change can involve both changes in average conditions and changes in variability, including, for example, extreme events.
-
-The earth's climate is naturally variable on all time scales. However, its long-term state and average temperature are regulated by the balance between incoming and outgoing energy, which determines the Earth's energy balance. Any factor that causes a sustained change to the amount of incoming energy or the amount of outgoing energy can lead to climate change. Different factors operate on different time scales, and not all of those factors that have been responsible for changes in earth's climate in the distant past are relevant to contemporary climate change. Factors that cause climate change can be divided into two categories - those related to natural processes and those related to human activity. In addition to natural causes of climate change, changes internal to the climate system, such as variations.
-
-In ocean currents or atmospheric circulation, can also influence the climate for short periods of time. This natural internal climate variability is superimposed on the long-term forced climate change.
-
-Does climate change have natural causes?
-The Earth's climate can be affected by natural factors that are external to the climate system, such as changes in volcanic activity, solar output, and the Earth's orbit around the Sun. Of these, the two factors relevant on timescales of contemporary climate change are changes in volcanic activity and changes in solar radiation. In terms of the Earth's energy balance, these factors primarily influence the amount of incoming energy. Volcanic eruptions are episodic and have relatively short-term effects on climate. Changes in solar irradiance have contributed to climate trends over the past century but since the Industrial Revolution, the effect of additions of greenhouse gases to the atmosphere has been over 50 times that of changes in the Sun's output.
-
-Human causes
-Climate change can also be caused by human activities, such as the burning of fossil fuels and the conversion of land for forestry and agriculture. Since the beginning of the Industrial Revolution, these human influences on the climate system have increased substantially. In addition to other environmental impacts, these activities change the land surface and emit various substances to the atmosphere. These in turn can influence both the amount of incoming energy and the amount of outgoing energy and can have both warming and cooling effects on the climate. The dominant product of fossil fuel combustion is carbon dioxide, a greenhouse gas. The overall effect of human activities since the Industrial Revolution has been a warming effect, driven primarily by emissions of carbon dioxide and enhanced by emissions of other greenhouse gases.
-
-The build-up of greenhouse gases in the atmosphere has led to an enhancement of the natural greenhouse effect. It is this human-induced enhancement of the greenhouse effect that is of concern because ongoing emissions of greenhouse gases have the potential to warm the planet to levels that have never been experienced in the history of human civilization. Such climate change could have far-reaching and/or unpredictable environmental, social, and economic consequences.
-
-Short-lived and long-lived climate forcers
-Carbon dioxide is the main cause of human-induced climate change. It has been emitted in vast quantities from the burning of fossil fuels and it is a very long-lived gas, which means it continues to affect the climate system during its long residence time in the atmosphere. However, fossil fuel combustion, industrial processes, agriculture, and forestry-related activities emit other substances that also act as climate forcers. Some, such as nitrous oxide, are long-lived greenhouse gases like carbon dioxide, and so contribute to long-term climate change. Other substances have shorter atmospheric lifetimes because they are removed fairly quickly from the atmosphere. Therefore, their effect on the climate system is similarly short-lived. Together, these short-lived climate forcers are responsible for a significant amount of current climate forcing from anthropogenic substances. Some short-lived climate forcers have a climate warming effect (‘positive climate forcers') while others have a cooling effect (‘negative climate forcers').
-
-If atmospheric levels of short-lived climate forcers are continually replenished by ongoing emissions, these continue to exert a climate forcing. However, reducing emissions will quite quickly lead to reduced atmospheric levels of such substances. A number of short-lived climate forcers have climate warming effects and together are the most important contributors to the human enhancement of the greenhouse effect after carbon dioxide. This includes methane and tropospheric ozone – both greenhouse gases – and black carbon, a small solid particle formed from the incomplete combustion of carbon-based fuels (coal, oil and wood for example).
-
-Other short-lived climate forcers have climate cooling effects, most notably sulphate aerosols. Fossil fuel combustion emits sulphur dioxide into the atmosphere (in addition to carbon dioxide) which then combines with water vapour to form tiny droplets (aerosols) which reflect sunlight. Sulphate aerosols remain in the atmosphere for only a few days (washing out in what is referred to as acid rain), and so do not have the same long-term effect as greenhouse gases. The cooling from sulphate aerosols in the atmosphere has, however, offset some of the warming from other substances. That is, the warming we have experienced to date would have been even larger had it not been for elevated levels of sulphate aerosols in the atmosphere.
-
+Causes of climate change
+What is the most important cause of climate change?
+Human activity is the main cause of climate change. People burn fossil fuels and convert land from forests to agriculture. Since the beginning of the Industrial Revolution, people have burned more and more fossil fuels and changed vast areas of land from forests to farmland.
+
+Burning fossil fuels produces carbon dioxide, a greenhouse gas. It is called a greenhouse gas because it produces a “greenhouse effect”. The greenhouse effect makes the earth warmer, just as a greenhouse is warmer than its surroundings.
+
+Carbon dioxide is the main cause of human-induced climate change.
+
+It stays in the atmosphere for a very long time. Other greenhouse gases, such as nitrous oxide, also stay in the atmosphere for a long time. Other substances only produce short-term effects.
+
+Not all substances produce warming. Some, like certain aerosols, can produce cooling.
+
+What are climate forcers?
+Carbon dioxide and other substances are referred to as climate forcers because they force or push the climate towards being warmer or cooler. They do this by affecting the flow of energy coming into and leaving the earth’s climate system.
+
+Small changes in the sun’s energy that reaches the earth can cause some climate change. But since the Industrial Revolution, adding greenhouse gases has been over 50 times more powerful than changes in the Sun's radiance. The additional greenhouse gases in earth’s atmosphere have had a strong warming effect on earth’s climate.
+
+Future emissions of greenhouse gases, particularly carbon dioxide, will determine how much more climate warming occurs.
+
+What can be done about climate change?
+Carbon dioxide is the main cause of human-induced global warming and associated climate change. It is a very long-lived gas, which means carbon dioxide builds up in the atmosphere with ongoing human emissions and remains in the atmosphere for centuries. Global warming can only be stopped by reducing global emissions of carbon dioxide from human fossil fuel combustion and industrial processes to zero, but even with zero emissions, the global temperature will remain essentially constant at its new warmer level. Emissions of other substances that warm the climate must also be substantially reduced. This indicates how difficult the challenge is.
+
+What is climate change?
+Climate change is a long-term shift in weather conditions identified by changes in temperature, precipitation, winds, and other indicators. Climate change can involve both changes in average conditions and changes in variability, including, for example, extreme events.
+
+The earth's climate is naturally variable on all time scales. However, its long-term state and average temperature are regulated by the balance between incoming and outgoing energy, which determines the Earth's energy balance. Any factor that causes a sustained change to the amount of incoming energy or the amount of outgoing energy can lead to climate change. Different factors operate on different time scales, and not all of those factors that have been responsible for changes in earth's climate in the distant past are relevant to contemporary climate change. Factors that cause climate change can be divided into two categories - those related to natural processes and those related to human activity.
+
+In addition to natural causes of climate change, changes internal to the climate system, such as variations in ocean currents or atmospheric circulation, can also influence the climate for short periods of time. This natural internal climate variability is superimposed on the long-term forced climate change.
+
+Does climate change have natural causes?
+The Earth's climate can be affected by natural factors that are external to the climate system, such as changes in volcanic activity, solar output, and the Earth's orbit around the Sun. Of these, the two factors relevant on timescales of contemporary climate change are changes in volcanic activity and changes in solar radiation. In terms of the Earth's energy balance, these factors primarily influence the amount of incoming energy. Volcanic eruptions are episodic and have relatively short-term effects on climate. Changes in solar irradiance have contributed to climate trends over the past century but since the Industrial Revolution, the effect of additions of greenhouse gases to the atmosphere has been over 50 times that of changes in the Sun's output.
+
+Human causes
+Climate change can also be caused by human activities, such as the burning of fossil fuels and the conversion of land for forestry and agriculture. Since the beginning of the Industrial Revolution, these human influences on the climate system have increased substantially. In addition to other environmental impacts, these activities change the land surface and emit various substances to the atmosphere. These in turn can influence both the amount of incoming energy and the amount of outgoing energy and can have both warming and cooling effects on the climate. The dominant product of fossil fuel combustion is carbon dioxide, a greenhouse gas. The overall effect of human activities since the Industrial Revolution has been a warming effect, driven primarily by emissions of carbon dioxide and enhanced by emissions of other greenhouse gases.
+
+The build-up of greenhouse gases in the atmosphere has led to an enhancement of the natural greenhouse effect. It is this human-induced enhancement of the greenhouse effect that is of concern because ongoing emissions of greenhouse gases have the potential to warm the planet to levels that have never been experienced in the history of human civilization. Such climate change could have far-reaching and/or unpredictable environmental, social, and economic consequences.
+
+Short-lived and long-lived climate forcers
+Carbon dioxide is the main cause of human-induced climate change. It has been emitted in vast quantities from the burning of fossil fuels and it is a very long-lived gas, which means it continues to affect the climate system during its long residence time in the atmosphere. However, fossil fuel combustion, industrial processes, agriculture, and forestry-related activities emit other substances that also act as climate forcers. Some, such as nitrous oxide, are long-lived greenhouse gases like carbon dioxide, and so contribute to long-term climate change. Other substances have shorter atmospheric lifetimes because they are removed fairly quickly from the atmosphere. Therefore, their effect on the climate system is similarly short-lived. Together, these short-lived climate forcers are responsible for a significant amount of current climate forcing from anthropogenic substances. Some short-lived climate forcers have a climate warming effect (‘positive climate forcers’) while others have a cooling effect (‘negative climate forcers’).
+
+If atmospheric levels of short-lived climate forcers are continually replenished by ongoing emissions, these continue to exert a climate forcing. However, reducing emissions will quite quickly lead to reduced atmospheric levels of such substances. A number of short-lived climate forcers have climate warming effects and together are the most important contributors to the human enhancement of the greenhouse effect after carbon dioxide. This includes methane and tropospheric ozone – both greenhouse gases – and black carbon, a small solid particle formed from the incomplete combustion of carbon-based fuels (coal, oil and wood for example).
+
+Other short-lived climate forcers have climate cooling effects, most notably sulphate aerosols. Fossil fuel combustion emits sulphur dioxide into the atmosphere (in addition to carbon dioxide) which then combines with water vapour to form tiny droplets (aerosols) which reflect sunlight. Sulphate aerosols remain in the atmosphere for only a few days (washing out in what is referred to as acid rain), and so do not have the same long-term effect as greenhouse gases. The cooling from sulphate aerosols in the atmosphere has, however, offset some of the warming from other substances. That is, the warming we have experienced to date would have been even larger had it not been for elevated levels of sulphate aerosols in the atmosphere.
+
Learn more about the Earth's climate system.
\ No newline at end of file
diff --git a/tests/resources/sample_text_files/coronavirus_symptoms.txt b/tests/resources/sample_text_files/coronavirus_symptoms.txt
index 10a2ce0..e8b92ea 100644
--- a/tests/resources/sample_text_files/coronavirus_symptoms.txt
+++ b/tests/resources/sample_text_files/coronavirus_symptoms.txt
@@ -1,127 +1,127 @@
-COVID-19: Symptoms, treatment, what to do if you feel sick
-Current situation
-Symptoms and treatment
-Prevention and risks
-Canada's response
-Guidance documents
-Join the effort to limit the spread of COVID-19
-
-On this page
-COVID-19 symptoms
-If you have severe symptoms
-What to do if you’re sick or were exposed
-Caring for others
-Treating COVID-19
-Long-term symptoms
-COVID-19 symptoms
-Symptoms of COVID-19 can vary:
-
-from person to person
-in different age groups
-depending on the COVID-19 variant
-Some of the more commonly reported symptoms include:
-
-sore throat
-runny nose
-sneezing
-new or worsening cough
-shortness of breath or difficulty breathing
-temperature equal to or more than 38°C
-feeling feverish
-chills
-fatigue or weakness
-muscle or body aches
-new loss of smell or taste
-headache
-abdominal pain, diarrhea and vomiting
-feeling very unwell
-If you don’t feel well or if you have any symptoms, even if mild, assume you may have COVID-19. Immediately isolate at home and away from others. Check with your local public health authority for more advice, including where and how to get tested if recommended.
-
-You may be infected but not have symptoms. However, you can still spread the virus to others. You may:
-
-develop symptoms later (be pre-symptomatic)
-never develop symptoms (be asymptomatic)
-If you’ve been in contact with someone who has COVID-19, contact your local public health authority for advice on what to do next.
-
-Learn more about:
-
-Testing for COVID-19: When to get tested and testing results
-COVID-19: Contact your local public health authority
-Start of symptoms
-You may start experiencing symptoms anywhere from 1 to 14 days after exposure. Typically, symptoms appear between 3 and 7 days after exposure.
-
-Vaccination prevents severe illness
-Vaccination is one of the most effective ways to protect our families, communities and ourselves against COVID-19. Evidence indicates that the vaccines used in Canada are very effective at preventing severe illness, hospitalization and death from COVID-19.
-
-However, vaccines are not 100% effective and you may still become infected with or without symptoms.
-
-Learn more about:
-
-Vaccines for COVID-19: How to get vaccinated
-Public health measures
-When layered together, public health measures are effective in reducing the spread of COVID-19, including variants of concern.
-
-Regardless of your vaccination status, you should continue to:
-
-follow the advice of your local public health authority
-layer multiple individual public health measures to protect yourself and others
-Learn more about:
-
-COVID-19: Provincial and territorial resources
-COVID-19: Individual public health measures
-If you have severe symptoms
-Call 911 or your local emergency number if you develop severe symptoms, such as:
-
-trouble breathing or severe shortness of breath
-persistent pressure or pain in the chest
-new onset of confusion
-difficulty waking up or staying awake
-pale, grey or blue-coloured skin, lips or nail beds
-Follow instructions for safe transport if taking an ambulance or a private vehicle to a hospital or clinic.
-
-What to do if you’re sick or were exposed
-It’s important that you continue to follow the recommendations and restrictions of your local public health authority on quarantine or isolation if you:
-
-may have COVID-19 (for example, you feel sick or have been exposed)
-have tested positive for COVID-19
-If you have to quarantine or isolate, follow appropriate precautions to reduce the risk of illness spreading within your home. If you don’t have somewhere safe to isolate, contact your local public health authority for available options.
-
-Adults and children with mild COVID-19 symptoms can stay at home while recovering. You don’t need to go to the hospital if symptoms are mild.
-
-Check with your local public health authority about quarantine or isolation periods, and reporting.
-
-Choose your local public health authority:
-Alberta
-Learn more about:
-
-Testing for COVID-19: When to get tested and testing results
-Safe Voluntary Isolation Sites Program
-Caring for others
-You may be caring for someone at home who has or may have COVID-19. If so, you should follow the appropriate precautions to reduce the risk of illness spreading within your home.
-
-Adults and children with mild COVID-19 symptoms can stay at home while recovering. You don’t need to go to the hospital if symptoms are mild.
-
-Learn more about:
-
-COVID-19: What to do if you or someone in your home is sick
-Treating COVID-19
-If you’re concerned about your symptoms, consult your health care provider. They may recommend steps or medications you can take to relieve some of your symptoms, like fever and cough.
-
-Follow the advice of your health care provider, who may prescribe treatments.
-
-Learn more about:
-
-COVID-19 treatments
-Long-term symptoms
-Some people who become infected with COVID-19 may experience long-term symptoms, even after they recover from their initial infection. This is sometimes called post COVID-19 condition or long COVID. It has also been called post-acute COVID-19 syndrome (PACS) or long haul COVID.
-
-Studies are underway to further understand what causes post COVID-19 condition and how to diagnose and treat it.
-
-If you think you have this condition, talk to your health care provider about how to manage your symptoms.
-
-Learn more about:
-
-Post COVID-19 condition (long COVID)
-Related links
-Digital factsheets, printable posters and shareable videos on COVID-19 (multilingual products available)
+COVID-19: Symptoms, treatment, what to do if you feel sick
+Current situation
+Symptoms and treatment
+Prevention and risks
+Canada's response
+Guidance documents
+Join the effort to limit the spread of COVID-19
+
+On this page
+COVID-19 symptoms
+If you have severe symptoms
+What to do if you’re sick or were exposed
+Caring for others
+Treating COVID-19
+Long-term symptoms
+COVID-19 symptoms
+Symptoms of COVID-19 can vary:
+
+from person to person
+in different age groups
+depending on the COVID-19 variant
+Some of the more commonly reported symptoms include:
+
+sore throat
+runny nose
+sneezing
+new or worsening cough
+shortness of breath or difficulty breathing
+temperature equal to or more than 38°C
+feeling feverish
+chills
+fatigue or weakness
+muscle or body aches
+new loss of smell or taste
+headache
+abdominal pain, diarrhea and vomiting
+feeling very unwell
+If you don’t feel well or if you have any symptoms, even if mild, assume you may have COVID-19. Immediately isolate at home and away from others. Check with your local public health authority for more advice, including where and how to get tested if recommended.
+
+You may be infected but not have symptoms. However, you can still spread the virus to others. You may:
+
+develop symptoms later (be pre-symptomatic)
+never develop symptoms (be asymptomatic)
+If you’ve been in contact with someone who has COVID-19, contact your local public health authority for advice on what to do next.
+
+Learn more about:
+
+Testing for COVID-19: When to get tested and testing results
+COVID-19: Contact your local public health authority
+Start of symptoms
+You may start experiencing symptoms anywhere from 1 to 14 days after exposure. Typically, symptoms appear between 3 and 7 days after exposure.
+
+Vaccination prevents severe illness
+Vaccination is one of the most effective ways to protect our families, communities and ourselves against COVID-19. Evidence indicates that the vaccines used in Canada are very effective at preventing severe illness, hospitalization and death from COVID-19.
+
+However, vaccines are not 100% effective and you may still become infected with or without symptoms.
+
+Learn more about:
+
+Vaccines for COVID-19: How to get vaccinated
+Public health measures
+When layered together, public health measures are effective in reducing the spread of COVID-19, including variants of concern.
+
+Regardless of your vaccination status, you should continue to:
+
+follow the advice of your local public health authority
+layer multiple individual public health measures to protect yourself and others
+Learn more about:
+
+COVID-19: Provincial and territorial resources
+COVID-19: Individual public health measures
+If you have severe symptoms
+Call 911 or your local emergency number if you develop severe symptoms, such as:
+
+trouble breathing or severe shortness of breath
+persistent pressure or pain in the chest
+new onset of confusion
+difficulty waking up or staying awake
+pale, grey or blue-coloured skin, lips or nail beds
+Follow instructions for safe transport if taking an ambulance or a private vehicle to a hospital or clinic.
+
+What to do if you’re sick or were exposed
+It’s important that you continue to follow the recommendations and restrictions of your local public health authority on quarantine or isolation if you:
+
+may have COVID-19 (for example, you feel sick or have been exposed)
+have tested positive for COVID-19
+If you have to quarantine or isolate, follow appropriate precautions to reduce the risk of illness spreading within your home. If you don’t have somewhere safe to isolate, contact your local public health authority for available options.
+
+Adults and children with mild COVID-19 symptoms can stay at home while recovering. You don’t need to go to the hospital if symptoms are mild.
+
+Check with your local public health authority about quarantine or isolation periods, and reporting.
+
+Choose your local public health authority:
+Alberta
+Learn more about:
+
+Testing for COVID-19: When to get tested and testing results
+Safe Voluntary Isolation Sites Program
+Caring for others
+You may be caring for someone at home who has or may have COVID-19. If so, you should follow the appropriate precautions to reduce the risk of illness spreading within your home.
+
+Adults and children with mild COVID-19 symptoms can stay at home while recovering. You don’t need to go to the hospital if symptoms are mild.
+
+Learn more about:
+
+COVID-19: What to do if you or someone in your home is sick
+Treating COVID-19
+If you’re concerned about your symptoms, consult your health care provider. They may recommend steps or medications you can take to relieve some of your symptoms, like fever and cough.
+
+Follow the advice of your health care provider, who may prescribe treatments.
+
+Learn more about:
+
+COVID-19 treatments
+Long-term symptoms
+Some people who become infected with COVID-19 may experience long-term symptoms, even after they recover from their initial infection. This is sometimes called post COVID-19 condition or long COVID. It has also been called post-acute COVID-19 syndrome (PACS) or long haul COVID.
+
+Studies are underway to further understand what causes post COVID-19 condition and how to diagnose and treat it.
+
+If you think you have this condition, talk to your health care provider about how to manage your symptoms.
+
+Learn more about:
+
+Post COVID-19 condition (long COVID)
+Related links
+Digital factsheets, printable posters and shareable videos on COVID-19 (multilingual products available)
COVID-19: Social media and promotional resources for Health Canada and Public Health Agency of Canada
\ No newline at end of file
diff --git a/tests/resources/sample_text_files/documents_for_express_entry.txt b/tests/resources/sample_text_files/documents_for_express_entry.txt
index 9e6a950..535772d 100644
--- a/tests/resources/sample_text_files/documents_for_express_entry.txt
+++ b/tests/resources/sample_text_files/documents_for_express_entry.txt
@@ -1,41 +1,41 @@
-Documents for Express Entry
-You need certain documents at different stages of the Express Entry application process.
-
-Documents for your profile
-You don’t need to upload documents to submit a profile, but you may need information from some or all of these documents:
-
-a passport or travel document
-Venezuelan passport holders: You may need to complete extra steps when you create a profile or fill out your application.
-language test results
-proof of Canadian education or an educational credential assessment report for immigration purposes if
-you’re applying through the Federal Skilled Workers Program, or
-you want to get points for the education you got outside Canada
-provincial nomination (if you have one)
-written job offer from an employer in Canada (if you have one)
-proof of work experience
-certificate of qualification in a trade occupation issued by a Canadian province or territory (if you have one)
-proof of funds
-Documents for your application for permanent residence
-As of October 1, 2023, you no longer have to get an upfront medical exam before you submit your application for permanent residence through Express Entry. Find out when to get your medical exams and what documents you need to submit.
-
-If we invite you to apply, you’ll need to upload copies of the documents you used for your profile (see list above). Most applicants will also need to upload the following:
-
-police certificates
-proof of funds
-birth certificate, if you’re declaring dependent children
-Use of a Representative form (PDF, 137 KB), if you’ve hired a representative
-common-law union form (PDF, 2.22 MB), if you’ve declared your marital status as “common-law”
-marriage certificate, if you’ve declared your marital status as “married”
-divorce certificate and legal separation agreement, if you’ve declared your marital status as “divorced”
-death certificate, if you’ve declared your marital status as “widowed”
-adoption certificate, when a dependent child is listed as “adopted”
-Other documents
-You only need to submit these if we included them in your personalized document checklist. Your application may not be refused or rejected if you don’t submit these documents. They’re only used to check if you meet the requirements of the program you’re applying for.
-
-proof of relationship to a relative in Canada
-digital photos to confirm your identity
-other name to confirm aliases
-authority to release personal information to a designated individual form (PDF, 1.74 MB)
-any other documents that you feel are relevant to your application
-Next step
+Documents for Express Entry
+You need certain documents at different stages of the Express Entry application process.
+
+Documents for your profile
+You don’t need to upload documents to submit a profile, but you may need information from some or all of these documents:
+
+a passport or travel document
+Venezuelan passport holders: You may need to complete extra steps when you create a profile or fill out your application.
+language test results
+proof of Canadian education or an educational credential assessment report for immigration purposes if
+you’re applying through the Federal Skilled Workers Program, or
+you want to get points for the education you got outside Canada
+provincial nomination (if you have one)
+written job offer from an employer in Canada (if you have one)
+proof of work experience
+certificate of qualification in a trade occupation issued by a Canadian province or territory (if you have one)
+proof of funds
+Documents for your application for permanent residence
+As of October 1, 2023, you no longer have to get an upfront medical exam before you submit your application for permanent residence through Express Entry. Find out when to get your medical exams and what documents you need to submit.
+
+If we invite you to apply, you’ll need to upload copies of the documents you used for your profile (see list above). Most applicants will also need to upload the following:
+
+police certificates
+proof of funds
+birth certificate, if you’re declaring dependent children
+Use of a Representative form (PDF, 137 KB), if you’ve hired a representative
+common-law union form (PDF, 2.22 MB), if you’ve declared your marital status as “common-law”
+marriage certificate, if you’ve declared your marital status as “married”
+divorce certificate and legal separation agreement, if you’ve declared your marital status as “divorced”
+death certificate, if you’ve declared your marital status as “widowed”
+adoption certificate, when a dependent child is listed as “adopted”
+Other documents
+You only need to submit these if we included them in your personalized document checklist. Your application may not be refused or rejected if you don’t submit these documents. They’re only used to check if you meet the requirements of the program you’re applying for.
+
+proof of relationship to a relative in Canada
+digital photos to confirm your identity
+other name to confirm aliases
+authority to release personal information to a designated individual form (PDF, 1.74 MB)
+any other documents that you feel are relevant to your application
+Next step
Submit a profile
\ No newline at end of file
diff --git a/tests/resources/sample_text_files/express_entry.txt b/tests/resources/sample_text_files/express_entry.txt
index 6e9f280..4c763c1 100644
--- a/tests/resources/sample_text_files/express_entry.txt
+++ b/tests/resources/sample_text_files/express_entry.txt
@@ -1,55 +1,55 @@
-How Express Entry works
-Express Entry is an online system that we use to manage immigration applications from skilled workers.
-
-There are 3 immigration programs managed through Express Entry:
-
-Canadian Experience Class
-for skilled workers who have Canadian work experience
-work experience must have been gained in the 3 years before you apply
-Learn more
-
-Federal Skilled Worker Program
-for skilled workers with foreign work experience
-must meet criteria for education and other factors
-Learn more
-
-Federal Skilled Trades Program
-for skilled workers who are qualified in a skilled trade
-must have a valid job offer or a certificate of qualification
-Learn more
-
-If you’re eligible for one of the above programs, you can also apply through Express Entry for the Provincial Nominee Program. If you’re nominated, you’ll get extra points so you can be invited to apply quickly.
-
-Step 1: Find out if you’re eligible
-There are 2 ways to find out if you’re eligible for a program that is part of Express Entry:
-
-answer a few questions to see if you meet the minimum requirements
-read the detailed requirements for each program
-Step 2: Check your score
-If you’re eligible for one or more of the Express Entry programs and submit your profile, you’ll be ranked in the Express Entry pool using the Comprehensive Ranking System (CRS).
-
-The CRS is a points-based system we use to
-
-assess and score your profile and
-rank it in the Express Entry pool
-To be invited, you need to have a score above the minimum points score for your round of invitations.
-
-Use the CRS tool to estimate your score.
-Step 3: Get your documents ready
-If you decide to fill out a profile, you’ll need documents, such as language test results, to show you’re eligible for Express Entry.
-
-Some documents take a long time to get, so you should get them ready early.
-
-Get the documents you need
-Step 4: Fill out your profile
-Your Express Entry profile is where you give us information about yourself.
-
-If you’re eligible, we’ll accept you into our pool of candidates and give you a CRS score.
-
-Submit your profile
-Step 5: Get an invitation and apply
-We send invitations to apply to the candidates with the highest scores in the pool. If we invite you to apply, you’ll have 60 days to submit your application.
-
-We’ll process most complete applications that have all supporting documents in 6 months or less.
-
+How Express Entry works
+Express Entry is an online system that we use to manage immigration applications from skilled workers.
+
+There are 3 immigration programs managed through Express Entry:
+
+Canadian Experience Class
+for skilled workers who have Canadian work experience
+work experience must have been gained in the 3 years before you apply
+Learn more
+
+Federal Skilled Worker Program
+for skilled workers with foreign work experience
+must meet criteria for education and other factors
+Learn more
+
+Federal Skilled Trades Program
+for skilled workers who are qualified in a skilled trade
+must have a valid job offer or a certificate of qualification
+Learn more
+
+If you’re eligible for one of the above programs, you can also apply through Express Entry for the Provincial Nominee Program. If you’re nominated, you’ll get extra points so you can be invited to apply quickly.
+
+Step 1: Find out if you’re eligible
+There are 2 ways to find out if you’re eligible for a program that is part of Express Entry:
+
+answer a few questions to see if you meet the minimum requirements
+read the detailed requirements for each program
+Step 2: Check your score
+If you’re eligible for one or more of the Express Entry programs and submit your profile, you’ll be ranked in the Express Entry pool using the Comprehensive Ranking System (CRS).
+
+The CRS is a points-based system we use to
+
+assess and score your profile and
+rank it in the Express Entry pool
+To be invited, you need to have a score above the minimum points score for your round of invitations.
+
+Use the CRS tool to estimate your score.
+Step 3: Get your documents ready
+If you decide to fill out a profile, you’ll need documents, such as language test results, to show you’re eligible for Express Entry.
+
+Some documents take a long time to get, so you should get them ready early.
+
+Get the documents you need
+Step 4: Fill out your profile
+Your Express Entry profile is where you give us information about yourself.
+
+If you’re eligible, we’ll accept you into our pool of candidates and give you a CRS score.
+
+Submit your profile
+Step 5: Get an invitation and apply
+We send invitations to apply to the candidates with the highest scores in the pool. If we invite you to apply, you’ll have 60 days to submit your application.
+
+We’ll process most complete applications that have all supporting documents in 6 months or less.
+
Apply through Express Entry
\ No newline at end of file
diff --git a/tests/resources/sample_text_files/funding_culture_history_sport.txt b/tests/resources/sample_text_files/funding_culture_history_sport.txt
index 9363390..5751efb 100644
--- a/tests/resources/sample_text_files/funding_culture_history_sport.txt
+++ b/tests/resources/sample_text_files/funding_culture_history_sport.txt
@@ -1,48 +1,48 @@
-Funding - Culture, history and sport
-COVID-19: Support for the culture, heritage and sport sectors
-
-Filter items
-Showing 1 to 40 of 40 entriesShow
-100
- entries
-Program name Program description
-Athlete Assistance Program The Athlete Assistance Program supports high-performance Canadian athletes who are preparing for and participating in international sport.
-Building Communities Through Arts and Heritage Building Communities Through Arts and Heritage supports community celebrations, such as festivals, events and projects.
-Canada Arts Presentation Fund The Canada Arts Presentation Fund provides financial assistance to organizations that professionally present arts festivals or performing arts series (arts presenters) and organizations that offer support to arts presenters.
-Canada Arts Training Fund The Canada Arts Training Fund supports the training of artists with high potential through organizations that offer high-calibre training.
-Canada Book Fund The Canada Book Fund promotes a strong book industry that publishes and markets Canadian-authored books.
-Canada Cultural Investment Fund The Canada Cultural Investment Fund encourages private sector investment, partnership and sound business practices to help arts and heritage organizations be better rooted and recognized in their communities.
-Canada Cultural Spaces Fund The Canada Cultural Spaces Fund supports the improvement of physical conditions for artistic creativity and innovation.
-Canada History Fund The Canada History Fund supports the development of learning materials and activities that contribute to increasing Canadians' knowledge about Canada.
-Canada Media Fund The Canada Media Fund encourages the creation of popular, innovative Canadian content and software applications.
-Canada Music Fund The Canada Music Fund supports a wide range of musicians and entrepreneurs who create, produce and market original and diverse Canadian music.
-Canada Periodical Fund The Canada Periodical Fund provides financial assistance to Canadian print magazines, non-daily newspapers and digital periodicals.
-Canada Travelling Exhibitions Indemnification Program The Canada Travelling Exhibitions Indemnification Program assists eligible Canadian art galleries, museums, archives and libraries by assuming financial responsibility for loss or damage to objects or appurtenances in eligible travelling exhibitions.
-Canadian Conservation Institute internship programs The Canadian Conservation Institute offers paid post-graduate internships and curriculum internships that provide learning opportunities for the conservation community in Canada and abroad.
-Canadian Film or Video Production Tax Credit The Canadian Film or Video Production Tax Credit supports Canadian programming and the development of an active domestic production sector.
-Celebrate Canada Celebrate Canada provides funding for activities organized on National Indigenous Peoples Day, Saint-Jean-Baptiste Day, Canadian Multiculturalism Day and Canada Day.
-Commemorate Canada Commemorate Canada provides financial support to initiatives that commemorate Canada's significant people, places, achievements and life events.
-Court Challenges Program The Court Challenges Program provides financial support to Canadians to bring cases of national significance related to constitutional and quasi-constitutional official language rights and human rights before the courts.
-Creative Export Canada The Creative Export Canada program provides funding contributions to Canadian organizations that wish to carry out an export project.
-Destination Clic — French Enrichment Bursary Program Destination Clic is a three-week summer program for francophone students in Grades 8 and 9 who attend a French-as-a-first-language school and live outside of Quebec.
-Digital Citizen Research Program The Digital Citizen Contribution Program supports research on countering online disinformation as well as other online harms and threats to Canada’s democracy and social cohesion.
-Documentary Heritage Community Program The Documentary Heritage Community Program ensures that Canada's continuing memory is documented and accessible to current and future generations by adopting a more collaborative approach with local documentary heritage communities.
-Economic Development Initiative Learn about the funding available for activities related to business and economic development that encourage growth in Northern Ontario's Francophone communities.
-Exchanges Canada The Exchanges Canada program helps youth to enhance their knowledge and understanding of Canada, while connecting with other young Canadians.
-Explore — Second Language Bursary Program Explore is a five-week, intensive language immersion program.
-Film or Video Production Services Tax Credit The Film or Video Production Services Tax Credit promotes Canada as a location of choice for film and video productions employing Canadians.
-Funding for Commemorating the National Day for Truth and Reconciliation This program provides funding for initiatives that commemorate or increase awareness of the National Day for Truth and Reconciliation (observed on September 30) and of the history and legacy of residential schools.
-Indigenous Languages and Cultures Program The Indigenous Languages and Cultures Program promotes Indigenous languages, strengthens Indigenous cultural identity and increases Indigenous participation in Canadian society.
-Joint Initiative for Digital Citizen Research The Joint Initiative for Digital Citizen Research supports the goals of the Government of Canada's approach to protecting democracy and the Digital Citizen Initiative.
-Listen, Hear Our Voices initiative The Listen, Hear Our Voices initiative can fund Indigenous organizations to help digitize and preserve existing culture and language recordings for future generations.
-Local Journalism Initiative The Local Journalism Initiative supports the creation of original civic journalism that covers the diverse needs of underserved communities across Canada.
-Movable Cultural Property Grants Movable Cultural Property Grants help designated organizations acquire cultural property of outstanding significance and national importance to Canada.
-Multiculturalism and Anti-Racism Program The Multiculturalism and Anti-Racism Program supports the mandate of the Department of Canadian Heritage by building on Canada's strength as a diverse and inclusive society.
-Museums Assistance Program The Museums Assistance Program supports heritage institutions and workers in the preservation and presentation of heritage collections.
-National Acadian Day Funds are available to promote National Acadian Day.
-Odyssey — Language-Assistant Program Odyssey – Language-Assistant Program is a bilingual, paid work experience that provides opportunities for post-secondary students to travel to another province.
-Official Languages Support Programs Official Languages Support Programs promote French and English languages in Canadian society and enhance the vitality of English- and French-speaking communities in minority situations.
-Sport Canada Hosting Program The Sport Canada Hosting Program assists sport organizations to host the Canada Games and international sports events in Canada.
-Sport Support Program The Sport Support Program supports the development of Canadian athletes and coaches.
-Young Canada Works Young Canada Works offers a variety of summer job and internship programs to job seekers and employers.
+Funding - Culture, history and sport
+COVID-19: Support for the culture, heritage and sport sectors
+
+Filter items
+Showing 1 to 40 of 40 entries Show
+100
+ entries
+Program name Program description
+Athlete Assistance Program The Athlete Assistance Program supports high-performance Canadian athletes who are preparing for and participating in international sport.
+Building Communities Through Arts and Heritage Building Communities Through Arts and Heritage supports community celebrations, such as festivals, events and projects.
+Canada Arts Presentation Fund The Canada Arts Presentation Fund provides financial assistance to organizations that professionally present arts festivals or performing arts series (arts presenters) and organizations that offer support to arts presenters.
+Canada Arts Training Fund The Canada Arts Training Fund supports the training of artists with high potential through organizations that offer high-calibre training.
+Canada Book Fund The Canada Book Fund promotes a strong book industry that publishes and markets Canadian-authored books.
+Canada Cultural Investment Fund The Canada Cultural Investment Fund encourages private sector investment, partnership and sound business practices to help arts and heritage organizations be better rooted and recognized in their communities.
+Canada Cultural Spaces Fund The Canada Cultural Spaces Fund supports the improvement of physical conditions for artistic creativity and innovation.
+Canada History Fund The Canada History Fund supports the development of learning materials and activities that contribute to increasing Canadians' knowledge about Canada.
+Canada Media Fund The Canada Media Fund encourages the creation of popular, innovative Canadian content and software applications.
+Canada Music Fund The Canada Music Fund supports a wide range of musicians and entrepreneurs who create, produce and market original and diverse Canadian music.
+Canada Periodical Fund The Canada Periodical Fund provides financial assistance to Canadian print magazines, non-daily newspapers and digital periodicals.
+Canada Travelling Exhibitions Indemnification Program The Canada Travelling Exhibitions Indemnification Program assists eligible Canadian art galleries, museums, archives and libraries by assuming financial responsibility for loss or damage to objects or appurtenances in eligible travelling exhibitions.
+Canadian Conservation Institute internship programs The Canadian Conservation Institute offers paid post-graduate internships and curriculum internships that provide learning opportunities for the conservation community in Canada and abroad.
+Canadian Film or Video Production Tax Credit The Canadian Film or Video Production Tax Credit supports Canadian programming and the development of an active domestic production sector.
+Celebrate Canada Celebrate Canada provides funding for activities organized on National Indigenous Peoples Day, Saint-Jean-Baptiste Day, Canadian Multiculturalism Day and Canada Day.
+Commemorate Canada Commemorate Canada provides financial support to initiatives that commemorate Canada's significant people, places, achievements and life events.
+Court Challenges Program The Court Challenges Program provides financial support to Canadians to bring cases of national significance related to constitutional and quasi-constitutional official language rights and human rights before the courts.
+Creative Export Canada The Creative Export Canada program provides funding contributions to Canadian organizations that wish to carry out an export project.
+Destination Clic — French Enrichment Bursary Program Destination Clic is a three-week summer program for francophone students in Grades 8 and 9 who attend a French-as-a-first-language school and live outside of Quebec.
+Digital Citizen Research Program The Digital Citizen Contribution Program supports research on countering online disinformation as well as other online harms and threats to Canada’s democracy and social cohesion.
+Documentary Heritage Community Program The Documentary Heritage Community Program ensures that Canada's continuing memory is documented and accessible to current and future generations by adopting a more collaborative approach with local documentary heritage communities.
+Economic Development Initiative Learn about the funding available for activities related to business and economic development that encourage growth in Northern Ontario's Francophone communities.
+Exchanges Canada The Exchanges Canada program helps youth to enhance their knowledge and understanding of Canada, while connecting with other young Canadians.
+Explore — Second Language Bursary Program Explore is a five-week, intensive language immersion program.
+Film or Video Production Services Tax Credit The Film or Video Production Services Tax Credit promotes Canada as a location of choice for film and video productions employing Canadians.
+Funding for Commemorating the National Day for Truth and Reconciliation This program provides funding for initiatives that commemorate or increase awareness of the National Day for Truth and Reconciliation (observed on September 30) and of the history and legacy of residential schools.
+Indigenous Languages and Cultures Program The Indigenous Languages and Cultures Program promotes Indigenous languages, strengthens Indigenous cultural identity and increases Indigenous participation in Canadian society.
+Joint Initiative for Digital Citizen Research The Joint Initiative for Digital Citizen Research supports the goals of the Government of Canada's approach to protecting democracy and the Digital Citizen Initiative.
+Listen, Hear Our Voices initiative The Listen, Hear Our Voices initiative can fund Indigenous organizations to help digitize and preserve existing culture and language recordings for future generations.
+Local Journalism Initiative The Local Journalism Initiative supports the creation of original civic journalism that covers the diverse needs of underserved communities across Canada.
+Movable Cultural Property Grants Movable Cultural Property Grants help designated organizations acquire cultural property of outstanding significance and national importance to Canada.
+Multiculturalism and Anti-Racism Program The Multiculturalism and Anti-Racism Program supports the mandate of the Department of Canadian Heritage by building on Canada's strength as a diverse and inclusive society.
+Museums Assistance Program The Museums Assistance Program supports heritage institutions and workers in the preservation and presentation of heritage collections.
+National Acadian Day Funds are available to promote National Acadian Day.
+Odyssey — Language-Assistant Program	Odyssey — Language-Assistant Program is a bilingual, paid work experience that provides opportunities for post-secondary students to travel to another province.
+Official Languages Support Programs Official Languages Support Programs promote French and English languages in Canadian society and enhance the vitality of English- and French-speaking communities in minority situations.
+Sport Canada Hosting Program The Sport Canada Hosting Program assists sport organizations to host the Canada Games and international sports events in Canada.
+Sport Support Program The Sport Support Program supports the development of Canadian athletes and coaches.
+Young Canada Works Young Canada Works offers a variety of summer job and internship programs to job seekers and employers.
Youth Take Charge Program The Youth Take Charge Program supports youth-led projects that exemplify the ability to strengthen youth attachment to Canada.
\ No newline at end of file
diff --git a/tests/resources/sample_text_files/healthcare_system.txt b/tests/resources/sample_text_files/healthcare_system.txt
index eca09a7..8b70ea8 100644
--- a/tests/resources/sample_text_files/healthcare_system.txt
+++ b/tests/resources/sample_text_files/healthcare_system.txt
@@ -1,120 +1,120 @@
-Canada's health care system
-Learn about Canada's health care system, including Medicare, funding, accessing health care services and delivery.
-
-On this page
-About Medicare
-Federal funding for health care
-Accessing health care services
-About Medicare
-Medicare is a term that refers to Canada's publicly funded health care system. Instead of having a single national plan, we have 13 provincial and territorial health care insurance plans. Under this system, all Canadian residents have reasonable access to medically necessary hospital and physician services without paying out-of-pocket.
-
-Roles and responsibilities for health care services are shared between provincial and territorial governments and the federal government.
-
-The provincial and territorial governments are responsible for the management, organization and delivery of health care services for their residents.
-
-The federal government is responsible for:
-
-setting and administering national standards for the health care system through the Canada Health Act
-providing funding support for provincial and territorial health care services
-supporting the delivery for health care services to specific groups
-providing other health-related functions
-Canada Health Act
-Provincial and territorial health care insurance plans must meet the standards described in the Canada Health Act. This is necessary to get their full payment under the Canada Health Transfer.
-
-These standards include:
-
-public administration
-comprehensiveness
-universality
-portability
-accessibility
-Public administration
-The provincial and territorial plans must be administered and operated on a non profit basis by a public authority.
-
-Comprehensiveness
-The provincial and territorial plans must insure all medically necessary services provided by:
-
-hospitals
-physicians
-dentists, when the service must be performed in a hospital
-Medically necessary services are not defined in the Canada Health Act. The provincial and territorial health care insurance plans consult with their respective physician colleges or groups. Together, they decide which services are medically necessary for health care insurance purposes.
-
-If a service is considered medically necessary, the full cost must be covered by the public health care insurance plan.
-
-Universality
-The provincial and territorial plans must cover all residents.
-
-Portability
-The provincial and territorial plans must cover all residents when they travel within Canada. Limited coverage is also required for travel outside the country.
-
-When a resident moves to another province, they can continue to use their original health care insurance card for 3 months. This gives them enough time to register for the new plan and receive their new health insurance card.
-
-Accessibility
-The provincial and territorial plans must provide all residents reasonable access to medically necessary services. Access must be based on medical need and not the ability to pay.
-
-Federal funding for health care
-The federal government provides health care funding to the provinces and territories through the Canada Health Transfer.
-
-Provinces and territories receive additional federal funding support through other fiscal transfers.
-
-Delivering health care services to specific groups
-We provide certain direct health care services to some population groups, including:
-
-First Nations people living on reserves
-Inuit
-serving members of the Canadian Forces
-eligible veterans
-inmates in federal penitentiaries
-some groups of refugee claimants
-Other federal health-related functions
-We are responsible for the regulation of products, such as:
-
-food
-consumer products
-pharmaceuticals
-cosmetics
-chemicals
-pesticides
-medical devices
-radiation-emitting devices like cellphones
-The federal government also supports:
-
-health research
-health promotion and protection
-disease monitoring and prevention
-The government also provides tax support for health-related costs:
-
-tax credits for:
-disability
-medical expenses
-caregivers and disabled dependents
-tax rebates to public institutions for health services
-deductions for private health insurance premiums for the self-employed
-Accessing health care services
-Canadians most often turn to primary health care services as their first point of contact with the health care system.
-
-In general, primary health care:
-
-delivers first-contact health care services
-coordinates patients' health care services to support:
-continuity of care, which means receiving high quality care from diagnosis to recovery
-ease of movement across the health care system when more specialized services are needed from specialists or in hospitals
-The provinces and territories also provide supplemental coverage to certain groups of people, such as:
-
-seniors
-children
-social assistance recipients
-This helps pay for health care services that are not generally covered under the publicly funded health care system. These services include:
-
-vision care
-dental care
-prescription drugs
-ambulance services
-independent living (home care)
-Those who do not qualify for supplementary benefits under government plans pay for these services through:
-
-out-of-pocket payments
-private health insurance plans
-For more information
-Canada Health Act
+Canada's health care system
+Learn about Canada's health care system, including Medicare, funding, accessing health care services and delivery.
+
+On this page
+About Medicare
+Federal funding for health care
+Accessing health care services
+About Medicare
+Medicare is a term that refers to Canada's publicly funded health care system. Instead of having a single national plan, we have 13 provincial and territorial health care insurance plans. Under this system, all Canadian residents have reasonable access to medically necessary hospital and physician services without paying out-of-pocket.
+
+Roles and responsibilities for health care services are shared between provincial and territorial governments and the federal government.
+
+The provincial and territorial governments are responsible for the management, organization and delivery of health care services for their residents.
+
+The federal government is responsible for:
+
+setting and administering national standards for the health care system through the Canada Health Act
+providing funding support for provincial and territorial health care services
+supporting the delivery of health care services to specific groups
+providing other health-related functions
+Canada Health Act
+Provincial and territorial health care insurance plans must meet the standards described in the Canada Health Act. This is necessary to get their full payment under the Canada Health Transfer.
+
+These standards include:
+
+public administration
+comprehensiveness
+universality
+portability
+accessibility
+Public administration
+The provincial and territorial plans must be administered and operated on a non-profit basis by a public authority.
+
+Comprehensiveness
+The provincial and territorial plans must insure all medically necessary services provided by:
+
+hospitals
+physicians
+dentists, when the service must be performed in a hospital
+Medically necessary services are not defined in the Canada Health Act. The provincial and territorial health care insurance plans consult with their respective physician colleges or groups. Together, they decide which services are medically necessary for health care insurance purposes.
+
+If a service is considered medically necessary, the full cost must be covered by the public health care insurance plan.
+
+Universality
+The provincial and territorial plans must cover all residents.
+
+Portability
+The provincial and territorial plans must cover all residents when they travel within Canada. Limited coverage is also required for travel outside the country.
+
+When a resident moves to another province, they can continue to use their original health care insurance card for 3 months. This gives them enough time to register for the new plan and receive their new health insurance card.
+
+Accessibility
+The provincial and territorial plans must provide all residents reasonable access to medically necessary services. Access must be based on medical need and not the ability to pay.
+
+Federal funding for health care
+The federal government provides health care funding to the provinces and territories through the Canada Health Transfer.
+
+Provinces and territories receive additional federal funding support through other fiscal transfers.
+
+Delivering health care services to specific groups
+We provide certain direct health care services to some population groups, including:
+
+First Nations people living on reserves
+Inuit
+serving members of the Canadian Forces
+eligible veterans
+inmates in federal penitentiaries
+some groups of refugee claimants
+Other federal health-related functions
+We are responsible for the regulation of products, such as:
+
+food
+consumer products
+pharmaceuticals
+cosmetics
+chemicals
+pesticides
+medical devices
+radiation-emitting devices like cellphones
+The federal government also supports:
+
+health research
+health promotion and protection
+disease monitoring and prevention
+The government also provides tax support for health-related costs:
+
+tax credits for:
+disability
+medical expenses
+caregivers and disabled dependents
+tax rebates to public institutions for health services
+deductions for private health insurance premiums for the self-employed
+Accessing health care services
+Canadians most often turn to primary health care services as their first point of contact with the health care system.
+
+In general, primary health care:
+
+delivers first-contact health care services
+coordinates patients' health care services to support:
+continuity of care, which means receiving high quality care from diagnosis to recovery
+ease of movement across the health care system when more specialized services are needed from specialists or in hospitals
+The provinces and territories also provide supplemental coverage to certain groups of people, such as:
+
+seniors
+children
+social assistance recipients
+This helps pay for health care services that are not generally covered under the publicly funded health care system. These services include:
+
+vision care
+dental care
+prescription drugs
+ambulance services
+independent living (home care)
+Those who do not qualify for supplementary benefits under government plans pay for these services through:
+
+out-of-pocket payments
+private health insurance plans
+For more information
+Canada Health Act
Working together to improve health care in Canada
\ No newline at end of file
diff --git a/tests/resources/sample_text_files/history_of_canada.txt b/tests/resources/sample_text_files/history_of_canada.txt
index f5b1458..faab38c 100644
--- a/tests/resources/sample_text_files/history_of_canada.txt
+++ b/tests/resources/sample_text_files/history_of_canada.txt
@@ -1,42 +1,42 @@
-History of Canada
-Canadian history does not begin with the arrival of European explorers over 500 years ago; people have been living in the country that we now call Canada for thousands of years.
-
-Canada’s original inhabitants
-The new Dominion of Canada
-The Northwest Territories and Manitoba
-British Columbia, Prince Edward Island and Yukon
-Saskatchewan and Alberta
-Newfoundland and Nunavut
-Canada’s original inhabitants
-Centuries before Europeans began to settle in North America, explorers who came here found thriving First Nations and Inuit societies with their own beliefs, way of life and rich history.
-
-When the first European explorers came to Canada they found all regions occupied by native peoples they called “Indians,” thinking they had reached the East Indies. The native people lived off the land, some by hunting and gathering, others by raising crops.
-
-The Huron-Wendat of the Great Lakes Region, like the Iroquois, were farmers and hunters. The Cree and Dene of the Northwest were hunter-gatherers. The Sioux were nomadic, following the bison (buffalo) herd. The Inuit lived off Arctic wildlife. West Coast natives preserved fish by drying and smoking. Warfare was common among Aboriginal groups as they competed for land, resources and prestige.
-
-The arrival of European traders, missionaries, soldiers and colonists changed the native way of life forever. Large numbers of Aboriginals died of European diseases to which they lacked immunity. However, Aboriginals and Europeans formed strong economic, religious and military bonds in the first 200 years of coexistence which laid the foundations of Canada.
-
-The new Dominion of Canada
-Today, Canada is made up of 10 provinces and three territories.
-
-However, when the British North America Act, 1867, (now the Constitution Acts, 1867 to 1982) created the new Dominion of Canada, there were only four provinces – Ontario, Quebec, Nova Scotia and New Brunswick.
-
-The Northwest Territories and Manitoba
-The year 1870 – three years after Confederation – brought multiple historic changes to land ownership, including:
-
-Canada’s purchase of Rupert’s Land from the Hudson’s Bay Company, which had been granted a charter to the area by the British government exactly two centuries earlier. Rupert's Land spanned all land drained by rivers flowing into Hudson Bay – roughly 40 per cent of present-day Canada. The selling price was 300,000 pounds sterling.
-Britain’s transfer of the North-Western Territory to Canada. Previously, the Hudson’s Bay Company had an exclusive licence to trade in this area, which stretched west to the colony of British Columbia and north to the Arctic Circle. When it was discovered in the mid-1800s that the Prairies had enormous farming potential, the British government refused to renew the company’s licence. With the Hudson's Bay Company out of the area, Britain was free to turn it over to Canada.
-The combination of Rupert’s Land and the North-Western Territory to form the Northwest Territories, followed by the creation of the Province of Manitoba from a small part of this area.
-British Columbia, Prince Edward Island and Yukon
-Subsequent years brought more changes to Canada’s territorial boundaries:
-
-In 1871, British Columbia joined the union with the promise of a railway to link it to the rest of the country.
-In 1873, Prince Edward Island, which had previously declined an offer to join Confederation, became Canada's seventh province.
-Yukon, which had been a district of the Northwest Territories since 1895, became a separate territory in 1898.
-Saskatchewan and Alberta
-Meanwhile, Canada was opening up its west, just as its neighbour to the south had done before. Migrants from eastern Canada and immigrants from Europe and the United States began to fill the Prairies, which were still part of the Northwest Territories. Then, in 1905, the provinces of Saskatchewan and Alberta were created, completing the map of Western Canada.
-
-Newfoundland and Nunavut
-After great debate and two referenda, the people of Newfoundland voted to join Confederation in 1949, creating Canada’s tenth province.
-
+History of Canada
+Canadian history does not begin with the arrival of European explorers over 500 years ago; people have been living in the country that we now call Canada for thousands of years.
+
+Canada’s original inhabitants
+The new Dominion of Canada
+The Northwest Territories and Manitoba
+British Columbia, Prince Edward Island and Yukon
+Saskatchewan and Alberta
+Newfoundland and Nunavut
+Canada’s original inhabitants
+Centuries before Europeans began to settle in North America, explorers who came here found thriving First Nations and Inuit societies with their own beliefs, way of life and rich history.
+
+When the first European explorers came to Canada they found all regions occupied by native peoples they called “Indians,” thinking they had reached the East Indies. The native people lived off the land, some by hunting and gathering, others by raising crops.
+
+The Huron-Wendat of the Great Lakes Region, like the Iroquois, were farmers and hunters. The Cree and Dene of the Northwest were hunter-gatherers. The Sioux were nomadic, following the bison (buffalo) herd. The Inuit lived off Arctic wildlife. West Coast natives preserved fish by drying and smoking. Warfare was common among Aboriginal groups as they competed for land, resources and prestige.
+
+The arrival of European traders, missionaries, soldiers and colonists changed the native way of life forever. Large numbers of Aboriginals died of European diseases to which they lacked immunity. However, Aboriginals and Europeans formed strong economic, religious and military bonds in the first 200 years of coexistence which laid the foundations of Canada.
+
+The new Dominion of Canada
+Today, Canada is made up of 10 provinces and three territories.
+
+However, when the British North America Act, 1867, (now the Constitution Acts, 1867 to 1982) created the new Dominion of Canada, there were only four provinces – Ontario, Quebec, Nova Scotia and New Brunswick.
+
+The Northwest Territories and Manitoba
+The year 1870 – three years after Confederation – brought multiple historic changes to land ownership, including:
+
+Canada’s purchase of Rupert’s Land from the Hudson’s Bay Company, which had been granted a charter to the area by the British government exactly two centuries earlier. Rupert's Land spanned all land drained by rivers flowing into Hudson Bay – roughly 40 per cent of present-day Canada. The selling price was 300,000 pounds sterling.
+Britain’s transfer of the North-Western Territory to Canada. Previously, the Hudson’s Bay Company had an exclusive licence to trade in this area, which stretched west to the colony of British Columbia and north to the Arctic Circle. When it was discovered in the mid-1800s that the Prairies had enormous farming potential, the British government refused to renew the company’s licence. With the Hudson's Bay Company out of the area, Britain was free to turn it over to Canada.
+The combination of Rupert’s Land and the North-Western Territory to form the Northwest Territories, followed by the creation of the Province of Manitoba from a small part of this area.
+British Columbia, Prince Edward Island and Yukon
+Subsequent years brought more changes to Canada’s territorial boundaries:
+
+In 1871, British Columbia joined the union with the promise of a railway to link it to the rest of the country.
+In 1873, Prince Edward Island, which had previously declined an offer to join Confederation, became Canada's seventh province.
+Yukon, which had been a district of the Northwest Territories since 1895, became a separate territory in 1898.
+Saskatchewan and Alberta
+Meanwhile, Canada was opening up its west, just as its neighbour to the south had done before. Migrants from eastern Canada and immigrants from Europe and the United States began to fill the Prairies, which were still part of the Northwest Territories. Then, in 1905, the provinces of Saskatchewan and Alberta were created, completing the map of Western Canada.
+
+Newfoundland and Nunavut
+After great debate and two referenda, the people of Newfoundland voted to join Confederation in 1949, creating Canada’s tenth province.
+
On April 1, 1999, Nunavut was created from the eastern part of the Northwest Territories, covering 1.9 million square kilometres of Canada’s Eastern Arctic.
\ No newline at end of file
diff --git a/tests/resources/sample_text_files/how_courts_are_organized.txt b/tests/resources/sample_text_files/how_courts_are_organized.txt
index fd808ee..579dec7 100644
--- a/tests/resources/sample_text_files/how_courts_are_organized.txt
+++ b/tests/resources/sample_text_files/how_courts_are_organized.txt
@@ -1,65 +1,65 @@
-How the Courts are Organized
-Previous Page Table of Contents Next Page
-Each type of court has its own jurisdiction, which means that it has the authority to decide specific types of cases. Canada has four levels of court.
-
-Provincial and territorial (lower) courts: These courts handle most cases that come into the system. They are established by provincial and territorial governments.
-Provincial and territorial superior courts: These are courts of plenary, or complete, jurisdiction established under section 96 of the Constitution Act, 1867. They deal with more serious crimes and also hear appeals from provincial and territorial courts. The Federal Court is on the same level, but is responsible for deciding civil matters assigned to it by statute, such as immigration and patents.
-Provincial and territorial courts of appeal and the Federal Court of Appeal.
-The Supreme Court of Canada, which is the final court of appeal for Canada.
-Outline of Canada's Court System
-
-Outline of Canada's Court System
-
-Outline of Canada’s Court System – Text version
-This chart provides an overview of the hierarchy of Canada’s Court System. It is arranged as follows:
-
-Courts
-
-Supreme Court of Canada
-Court Martial Appeal Court
-Military Courts
-Provincial/Territorial Courts of Appeals
-Provincial/Territorial Superior Courts
-Provincial/Territorial Courts
-Federal Court of Appeal
-Federal Court
-Tax Court of Canada
-Administrative Boards and Tribunals
-
-Provincial/Territorial Administrative Tribunals
-Federal Administrative Tribunals
-Provincial/territorial courts
-Each province and territory has a provincial/territorial court and hears cases involving either federal or provincial/territorial laws.
-
-In Nunavut, the Nunavut Court of Justice, which is Canada’s only single-level trial court, combines the power of the superior trial court and the territorial court so that the same judge can hear all cases that arise in the territory.
-
-Provincial/territorial courts deal with:
-
-most criminal offences, except the most serious ones;
-family law matters (e.g., child support, child protection, adoption, but not divorce);
-young persons from 12 to 17 years old in conflict with the law;
-traffic and bylaw violations;
-provincial/territorial regulatory offences;
-claims involving money, up to a certain amount (set by the province or territory in question);
-small claims (civil cases that resolve private disputes involving limited sums of money); and
-all preliminary inquiries (hearings to determine whether there is enough evidence to justify a full trial in serious criminal cases).
-Some courts at this level are dedicated to particular types of offences or groups of offenders. One example is the Drug Treatment Court. The objective of these courts is to address the needs of non-violent offenders who are charged with criminal offences that were motivated by their addiction. Those who qualify are offered judicial supervision and treatment for their addiction, with the help of community support services. Some provinces and territories have also established Domestic Violence Courts, with the objective of improving justice system responses to domestic violence, providing better support to victims and survivors, and holding offenders accountable.
-
-Youth courts handle cases for young people 12 to 17 years old who are charged with an offence under federal youth justice laws. Youth courts provide protections appropriate to the age of the accused, including protecting his or her privacy. Any court at either the provincial/territorial or superior court level can be designated a youth court.
-
-Provincial/territorial superior courts
-Each province and territory has superior courts, which are courts of “inherent jurisdiction.” This means that they can hear cases in any area except when a statute or rule limits that authority. The superior courts try the most serious criminal and civil cases. These include divorce cases and cases that involve large amounts of money (the minimum is set by the province or territory in question). The jurisdiction of superior courts originally came from the first courts in England, whose authority over government actions was based on Magna Carta. Proceedings in superior courts are thus a continuation of a court process that dates right back to the beginnings of the common law system.
-
-The superior courts also act as a court of first appeal for the provincial and territorial courts that the provinces and territories maintain. Although the provinces and territories administer superior courts, the federal government appoints and pays the judges.
-
-Although there are permanent court houses and judicial centres in all of Canada’s provinces and territories, Canada’s population is scattered widely across huge expanses of land, and it may be difficult for individuals to travel to a court house to have their matter heard. In response, courts often travel "on circuit" to small or isolated areas.
-
-For example, in Nunavut, most of the communities are small and isolated from Iqaluit, the capital, so the court travels to them. The circuit court includes a judge, a clerk, a court reporter, a prosecutor, and at least one defence attorney. Interpreters are hired in the communities when possible, or travel with the circuit court when necessary. The court holds regular sessions in Iqaluit and flies to about 85 percent of all 25 communities in Nunavut, as often as every six weeks or as seldom as every two years, depending on how often it’s needed.
-
-Family courts
-In most provinces and territories, the superior court has special divisions, such as the family division. Some superior courts have established specialized family courts to deal with specific family law matters, including divorce and property claims.
-
-Several provinces (Manitoba, New Brunswick, Newfoundland and Labrador, Nova Scotia, Ontario, Prince Edward Island and Saskatchewan) use unified family courts. This allows a single court to deal with all aspects of family law, using specialized superior court judges and services. These courts encourage constructive, non-adversarial techniques to resolve issues, and provide access to support services through community organizations. These services typically include such programs as parent-education sessions, mediation, and counselling.
-
-Provincial/territorial courts of appeal
+How the Courts are Organized
+Previous Page Table of Contents Next Page
+Each type of court has its own jurisdiction, which means that it has the authority to decide specific types of cases. Canada has four levels of court.
+
+Provincial and territorial (lower) courts: These courts handle most cases that come into the system. They are established by provincial and territorial governments.
+Provincial and territorial superior courts: These are courts of plenary, or complete, jurisdiction established under section 96 of the Constitution Act, 1867. They deal with more serious crimes and also hear appeals from provincial and territorial courts. The Federal Court is on the same level, but is responsible for deciding civil matters assigned to it by statute, such as immigration and patents.
+Provincial and territorial courts of appeal and the Federal Court of Appeal.
+The Supreme Court of Canada, which is the final court of appeal for Canada.
+Outline of Canada's Court System
+
+Outline of Canada's Court System
+
+Outline of Canada’s Court System – Text version
+This chart provides an overview of the hierarchy of Canada’s Court System. It is arranged as follows:
+
+Courts
+
+Supreme Court of Canada
+Court Martial Appeal Court
+Military Courts
+Provincial/Territorial Courts of Appeals
+Provincial/Territorial Superior Courts
+Provincial/Territorial Courts
+Federal Court of Appeal
+Federal Court
+Tax Court of Canada
+Administrative Boards and Tribunals
+
+Provincial/Territorial Administrative Tribunals
+Federal Administrative Tribunals
+Provincial/territorial courts
+Each province and territory has a provincial/territorial court that hears cases involving either federal or provincial/territorial laws.
+
+In Nunavut, the Nunavut Court of Justice, which is Canada’s only single-level trial court, combines the power of the superior trial court and the territorial court so that the same judge can hear all cases that arise in the territory.
+
+Provincial/territorial courts deal with:
+
+most criminal offences, except the most serious ones;
+family law matters (e.g., child support, child protection, adoption, but not divorce);
+young persons from 12 to 17 years old in conflict with the law;
+traffic and bylaw violations;
+provincial/territorial regulatory offences;
+claims involving money, up to a certain amount (set by the province or territory in question);
+small claims (civil cases that resolve private disputes involving limited sums of money); and
+all preliminary inquiries (hearings to determine whether there is enough evidence to justify a full trial in serious criminal cases).
+Some courts at this level are dedicated to particular types of offences or groups of offenders. One example is the Drug Treatment Court. The objective of these courts is to address the needs of non-violent offenders who are charged with criminal offences that were motivated by their addiction. Those who qualify are offered judicial supervision and treatment for their addiction, with the help of community support services. Some provinces and territories have also established Domestic Violence Courts, with the objective of improving justice system responses to domestic violence, providing better support to victims and survivors, and holding offenders accountable.
+
+Youth courts handle cases for young people 12 to 17 years old who are charged with an offence under federal youth justice laws. Youth courts provide protections appropriate to the age of the accused, including protecting his or her privacy. Any court at either the provincial/territorial or superior court level can be designated a youth court.
+
+Provincial/territorial superior courts
+Each province and territory has superior courts, which are courts of “inherent jurisdiction.” This means that they can hear cases in any area except when a statute or rule limits that authority. The superior courts try the most serious criminal and civil cases. These include divorce cases and cases that involve large amounts of money (the minimum is set by the province or territory in question). The jurisdiction of superior courts originally came from the first courts in England, whose authority over government actions was based on Magna Carta. Proceedings in superior courts are thus a continuation of a court process that dates right back to the beginnings of the common law system.
+
+The superior courts also act as a court of first appeal for the provincial and territorial courts that the provinces and territories maintain. Although the provinces and territories administer superior courts, the federal government appoints and pays the judges.
+
+Although there are permanent court houses and judicial centres in all of Canada’s provinces and territories, Canada’s population is scattered widely across huge expanses of land, and it may be difficult for individuals to travel to a court house to have their matter heard. In response, courts often travel "on circuit" to small or isolated areas.
+
+For example, in Nunavut, most of the communities are small and isolated from Iqaluit, the capital, so the court travels to them. The circuit court includes a judge, a clerk, a court reporter, a prosecutor, and at least one defence attorney. Interpreters are hired in the communities when possible, or travel with the circuit court when necessary. The court holds regular sessions in Iqaluit and flies to about 85 percent of all 25 communities in Nunavut, as often as every six weeks or as seldom as every two years, depending on how often it’s needed.
+
+Family courts
+In most provinces and territories, the superior court has special divisions, such as the family division. Some superior courts have established specialized family courts to deal with specific family law matters, including divorce and property claims.
+
+Several provinces (Manitoba, New Brunswick, Newfoundland and Labrador, Nova Scotia, Ontario, Prince Edward Island and Saskatchewan) use unified family courts. This allows a single court to deal with all aspects of family law, using specialized superior court judges and services. These courts encourage constructive, non-adversarial techniques to resolve issues, and provide access to support services through community organizations. These services typically include such programs as parent-education sessions, mediation, and counselling.
+
+Provincial/territorial courts of appeal
Each province and territory also has a court of appeal. These courts hear appeals from the decisions of the superior courts and the provincial/territorial courts. These can include commercial disputes, property disputes, negligence claims, family disputes, bankruptcies, and corporate reorganizations. Appeals are usually heard by a panel of three judges. The courts of appeal also hear constitutional questions that may be raised in appeals involving individuals, governments, or governmental agencies.
\ No newline at end of file
diff --git a/tests/resources/sample_text_files/national_security_act.txt b/tests/resources/sample_text_files/national_security_act.txt
index e135860..9421e7a 100644
--- a/tests/resources/sample_text_files/national_security_act.txt
+++ b/tests/resources/sample_text_files/national_security_act.txt
@@ -1,16 +1,16 @@
-Our Security, Our Rights
-
-On June 21, 2019, an Act respecting national security matters (the National Security Act, 2017) received royal assent.
-
-This legislation modernizes and enhances Canada’s security and intelligence laws by providing our agencies with the clear constitutional and legal framework they need to do their work effectively while safeguarding Canadians’ rights and freedoms.
-
-The National Security Act, 2017 accomplishes three important objectives:
-
-Enhancing accountability and transparency
-The results of the public consultation on national security demonstrated a demand from the public for increased accountability and greater transparency on national security matters. The National Security Act, 2017 addresses this demand in a number of ways as outlined in this section.
-
-Fulfilling commitments to address former C-51
-Canadians were clear in the consultation they expect their rights and freedoms to be protected at the same time as their security. The measures outlined in this section demonstrate how the National Security Act, 2017 addresses the problematic elements of former Bill C-51.
-
-Strengthening security and protecting rights
+Our Security, Our Rights
+
+On June 21, 2019, an Act respecting national security matters (the National Security Act, 2017) received royal assent.
+
+This legislation modernizes and enhances Canada’s security and intelligence laws by providing our agencies with the clear constitutional and legal framework they need to do their work effectively while safeguarding Canadians’ rights and freedoms.
+
+The National Security Act, 2017 accomplishes three important objectives:
+
+Enhancing accountability and transparency
+The results of the public consultation on national security demonstrated a demand from the public for increased accountability and greater transparency on national security matters. The National Security Act, 2017 addresses this demand in a number of ways as outlined in this section.
+
+Fulfilling commitments to address former C-51
+Canadians were clear in the consultation they expect their rights and freedoms to be protected at the same time as their security. The measures outlined in this section demonstrate how the National Security Act, 2017 addresses the problematic elements of former Bill C-51.
+
+Strengthening security and protecting rights
As the threat environment has evolved, so too must the legislative landscape and the political environment in which it operates. The measures outlined in this section demonstrate how the National Security Act, 2017 strengthens Canada's ability to address new threats and safeguard Canadian's rights and freedoms.
\ No newline at end of file
diff --git a/tests/resources/sample_text_files/net_zero_emissions_by_2050.txt b/tests/resources/sample_text_files/net_zero_emissions_by_2050.txt
index 17a0c2e..4c830d0 100644
--- a/tests/resources/sample_text_files/net_zero_emissions_by_2050.txt
+++ b/tests/resources/sample_text_files/net_zero_emissions_by_2050.txt
@@ -1,33 +1,33 @@
-Net-zero emissions by 2050
-
-The transition to a cleaner, prosperous economy needs to be both an immediate priority and a sustained effort over the years and decades ahead. Canada must keep innovating to meet this long-term goal, strengthening and building on existing measures that fight climate change and transform the economy.
-
-To avert the worst impacts of climate change, the Government of Canada is committed to achieving net-zero emissions by 2050.
-
-This goal will require support and engagement from all parts of society, including provinces and territories, cities, Indigenous Peoples, youth, and businesses.
-
-What is Net-Zero?
-Achieving net-zero emissions means our economy either emits no greenhouse gas emissions or offsets its emissions, for example, through actions such as tree planting or employing technologies that can capture carbon before it is released into the air. This is essential to keeping the world safe and livable for our kids and grandkids.
-
-Canada has joined over 120 countries in committing to be net-zero emissions by 2050, including all other G7 nations (United Kingdom, United States, Germany, Italy, France, and Japan), A number of provinces and cities have already made net-zero-by-2050 commitments, including Guelph, Vancouver, Hamilton, Toronto, Halifax, Newfoundland and Labrador, and most recently Quebec. Prince Edward Island has also pledged to reach net-zero greenhouse gas emissions by 2040. Nova Scotia and British Columbia have put into place, or plan to put into place, provincial net-zero-by-2050 legislation.
-
-Canada’s plan to reach Net-Zero
-The Canadian Net-Zero Emissions Accountability Act, which became law on June 29, 2021, enshrines in legislation Canada’s commitment to achieve net-zero emissions by 2050. The Act ensures transparency and accountability as the government works to deliver on its targets. The Act requires public participation and independent advice to guide the Government of Canada’s efforts.
-
-2030 Emissions Reduction Plan: Clean Air, Strong Economy
-Building on the actions in Canada’s strengthened climate plan (2020), and the Pan-Canadian Framework (2016), the 2030 Emissions Reduction Plan (2022) provides a roadmap to how Canada will meet its enhanced Paris Agreement target to reduce emissions by 40-45% from 2005 levels by 2030.
-
-The Government of Canada published the country’s 2030 Emissions Reduction Plan in March 2022. The plan reflects input from provinces, territories, Indigenous Peoples, the Net-Zero Advisory Body, and interested Canadians on what is needed to reach Canada’s more ambitious climate target of 40-45% emissions reductions by 2030.
-
-Net-Zero Advisory Body
-The Net-Zero Advisory Body (NZAB) was launched in February 2021. With up to 15 members that bring together relevant experience and knowledge, the Advisory Body provides independent advice to the Minister of Environment and Climate Change that supports achieving Canada’s net-zero target.
-
-In November 2021, the Minister of Environment and Climate Change and the Minister of Natural Resources asked the Net-Zero Advisory Body to provide advice on guiding principles to inform the development of quantitative five-year targets for caps on emissions from the oil and gas sector. This advice will support the achievement of the Government’s commitment to capping and cutting emissions from the sector at the pace and scale needed to get to net zero by 2050.
-
-Net-Zero Accelerator Fund
-As part of Canada’s plan, the Government of Canada has launched the $8 billion Net-Zero Accelerator Fund to help large emitters reduce their emissions.
-
-For example, Algoma Steel Inc. is receiving up to $420 million from the Fund to retrofit its operations and phase out coal-fired steelmaking processes at its facility in Sault Ste. Marie, Ontario. This will create 500 jobs and reduce emissions by 3 million tonnes per year by 2030.
-
-The Net-Zero Challenge
+Net-zero emissions by 2050
+
+The transition to a cleaner, prosperous economy needs to be both an immediate priority and a sustained effort over the years and decades ahead. Canada must keep innovating to meet this long-term goal, strengthening and building on existing measures that fight climate change and transform the economy.
+
+To avert the worst impacts of climate change, the Government of Canada is committed to achieving net-zero emissions by 2050.
+
+This goal will require support and engagement from all parts of society, including provinces and territories, cities, Indigenous Peoples, youth, and businesses.
+
+What is Net-Zero?
+Achieving net-zero emissions means our economy either emits no greenhouse gas emissions or offsets its emissions, for example, through actions such as tree planting or employing technologies that can capture carbon before it is released into the air. This is essential to keeping the world safe and livable for our kids and grandkids.
+
+Canada has joined over 120 countries in committing to be net-zero emissions by 2050, including all other G7 nations (United Kingdom, United States, Germany, Italy, France, and Japan). A number of provinces and cities have already made net-zero-by-2050 commitments, including Guelph, Vancouver, Hamilton, Toronto, Halifax, Newfoundland and Labrador, and most recently Quebec. Prince Edward Island has also pledged to reach net-zero greenhouse gas emissions by 2040. Nova Scotia and British Columbia have put into place, or plan to put into place, provincial net-zero-by-2050 legislation.
+
+Canada’s plan to reach Net-Zero
+The Canadian Net-Zero Emissions Accountability Act, which became law on June 29, 2021, enshrines in legislation Canada’s commitment to achieve net-zero emissions by 2050. The Act ensures transparency and accountability as the government works to deliver on its targets. The Act requires public participation and independent advice to guide the Government of Canada’s efforts.
+
+2030 Emissions Reduction Plan: Clean Air, Strong Economy
+Building on the actions in Canada’s strengthened climate plan (2020), and the Pan-Canadian Framework (2016), the 2030 Emissions Reduction Plan (2022) provides a roadmap to how Canada will meet its enhanced Paris Agreement target to reduce emissions by 40-45% from 2005 levels by 2030.
+
+The Government of Canada published the country’s 2030 Emissions Reduction Plan in March 2022. The plan reflects input from provinces, territories, Indigenous Peoples, the Net-Zero Advisory Body, and interested Canadians on what is needed to reach Canada’s more ambitious climate target of 40-45% emissions reductions by 2030.
+
+Net-Zero Advisory Body
+The Net-Zero Advisory Body (NZAB) was launched in February 2021. With up to 15 members that bring together relevant experience and knowledge, the Advisory Body provides independent advice to the Minister of Environment and Climate Change that supports achieving Canada’s net-zero target.
+
+In November 2021, the Minister of Environment and Climate Change and the Minister of Natural Resources asked the Net-Zero Advisory Body to provide advice on guiding principles to inform the development of quantitative five-year targets for caps on emissions from the oil and gas sector. This advice will support the achievement of the Government’s commitment to capping and cutting emissions from the sector at the pace and scale needed to get to net zero by 2050.
+
+Net-Zero Accelerator Fund
+As part of Canada’s plan, the Government of Canada has launched the $8 billion Net-Zero Accelerator Fund to help large emitters reduce their emissions.
+
+For example, Algoma Steel Inc. is receiving up to $420 million from the Fund to retrofit its operations and phase out coal-fired steelmaking processes at its facility in Sault Ste. Marie, Ontario. This will create 500 jobs and reduce emissions by 3 million tonnes per year by 2030.
+
+The Net-Zero Challenge
Companies operating in Canada have an essential role to play in helping the country achieve net-zero emissions. The Net-Zero Challenge encourages businesses to develop and implement credible and effective plans to transition their facilities and operations to net-zero emissions by 2050. Any Canadian company or business can join the Net-Zero Challenge.
\ No newline at end of file
diff --git a/tests/resources/sample_text_files/origin_of_name_canada.txt b/tests/resources/sample_text_files/origin_of_name_canada.txt
index 6032af2..8038e1b 100644
--- a/tests/resources/sample_text_files/origin_of_name_canada.txt
+++ b/tests/resources/sample_text_files/origin_of_name_canada.txt
@@ -1,23 +1,23 @@
-Origin of the name "Canada"
-Today, it seems impossible to imagine Canada by any other name. However, there were a number of other interesting suggestions and events leading up to the formal christening of our nation in 1867.
-
-Aboriginal roots
-The naming of a nation
-Aboriginal roots
-The name “Canada” likely comes from the Huron-Iroquois word “kanata,” meaning “village” or “settlement.” In 1535, two Aboriginal youths told French explorer Jacques Cartier about the route to kanata; they were actually referring to the village of Stadacona, the site of the present-day City of Québec. For lack of another name, Cartier used the word “Canada” to describe not only the village, but the entire area controlled by its chief, Donnacona.
-
-The name was soon applied to a much larger area; maps in 1547 designated everything north of the St. Lawrence River as Canada. Cartier also called the St. Lawrence River the “rivière du Canada,” a name used until the early 1600s. By 1616, although the entire region was known as New France, the area along the great river of Canada and the Gulf of St. Lawrence was still called Canada.
-
-Soon explorers and fur traders opened up territory to the west and to the south, and the area known as Canada grew. In the early 1700s, the name referred to all French lands in what is now the American Midwest and as far south as present-day Louisiana.
-
-The first use of Canada as an official name came in 1791, when the Province of Quebec was divided into the colonies of Upper Canada and Lower Canada. In 1841, the two colonies were united under one name, the Province of Canada.
-
-Two Aboriginal youths telling Jacques Cartier about the route to the village of Stadacona, site of the present day City of Québec.
-The naming of a nation
-Leading up to the proposed confederation, a number of names were suggested for the northern half of the continent of North America, including: Albertsland, Albionora, Borealia, Britannia, Cabotia, Colonia, EfisgaFootnote1, Hochelaga, Norland, Superior, Transatlantia, TuponiaFootnote2, and Victorialand.
-
-The debate was placed in perspective by Thomas D’Arcy McGee, who declared on February 9, 1865:
-
-“I read in one newspaper not less than a dozen attempts to derive a new name. One individual chooses Tuponia and another Hochelaga as a suitable name for the new nationality. Now I ask any honourable member of this House how he would feel if he woke up some fine morning and found himself instead of a Canadian, a Tuponian or a Hochelagander.”
-
+Origin of the name "Canada"
+Today, it seems impossible to imagine Canada by any other name. However, there were a number of other interesting suggestions and events leading up to the formal christening of our nation in 1867.
+
+Aboriginal roots
+The naming of a nation
+Aboriginal roots
+The name “Canada” likely comes from the Huron-Iroquois word “kanata,” meaning “village” or “settlement.” In 1535, two Aboriginal youths told French explorer Jacques Cartier about the route to kanata; they were actually referring to the village of Stadacona, the site of the present-day City of Québec. For lack of another name, Cartier used the word “Canada” to describe not only the village, but the entire area controlled by its chief, Donnacona.
+
+The name was soon applied to a much larger area; maps in 1547 designated everything north of the St. Lawrence River as Canada. Cartier also called the St. Lawrence River the “rivière du Canada,” a name used until the early 1600s. By 1616, although the entire region was known as New France, the area along the great river of Canada and the Gulf of St. Lawrence was still called Canada.
+
+Soon explorers and fur traders opened up territory to the west and to the south, and the area known as Canada grew. In the early 1700s, the name referred to all French lands in what is now the American Midwest and as far south as present-day Louisiana.
+
+The first use of Canada as an official name came in 1791, when the Province of Quebec was divided into the colonies of Upper Canada and Lower Canada. In 1841, the two colonies were united under one name, the Province of Canada.
+
+Two Aboriginal youths telling Jacques Cartier about the route to the village of Stadacona, site of the present-day City of Québec.
+The naming of a nation
+Leading up to the proposed confederation, a number of names were suggested for the northern half of the continent of North America, including: Albertsland, Albionora, Borealia, Britannia, Cabotia, Colonia, Efisga (Footnote 1), Hochelaga, Norland, Superior, Transatlantia, Tuponia (Footnote 2), and Victorialand.
+
+The debate was placed in perspective by Thomas D’Arcy McGee, who declared on February 9, 1865:
+
+“I read in one newspaper not less than a dozen attempts to derive a new name. One individual chooses Tuponia and another Hochelaga as a suitable name for the new nationality. Now I ask any honourable member of this House how he would feel if he woke up some fine morning and found himself instead of a Canadian, a Tuponian or a Hochelagander.”
+
Fortunately for posterity, McGee’s wit and reasoning – along with common sense – prevailed, and on July 1, 1867, the provinces of Canada, Nova Scotia, and New Brunswick became “one Dominion under the name of Canada.”
\ No newline at end of file
diff --git a/tests/resources/sample_text_files/personal_income_tax.txt b/tests/resources/sample_text_files/personal_income_tax.txt
index 61951db..36b8741 100644
--- a/tests/resources/sample_text_files/personal_income_tax.txt
+++ b/tests/resources/sample_text_files/personal_income_tax.txt
@@ -1,66 +1,66 @@
-Personal income tax
-
-Get ready to do your taxes
-
-Chat with Charlie
-Open chat
-Find out what's new for the 2022 tax season and your filing and payment due dates. Begin by gathering your documents to report income and claim deductions, and choose how you want to file and send your completed tax return to the CRA.
-
-Understand your rights as a taxpayer and who should file a tax return.
-
-Steps to get ready for 2022 taxes
-Find out what's new
-What's new for 2022 taxes
-Changes to benefits, credits, and expenses for individuals and families, and updates to the 2022 income tax package.
-
-How COVID-19 benefits impact your taxes
-Reporting COVID-19 benefit amounts on your tax return, tax slip issues, and payment options.
-
-Be aware of key dates for 2022 taxes
-Filing and payment due dates for taxes, instalment payments, and any amounts you may owe
-
-Filing dates for 2022 taxes
-February 20, 2023: Earliest day to file your taxes online
-April 30, 2023 (May 1, 2023 since April 30 is a Sunday): Deadline to file your taxes
-June 15, 2023: Deadline to file your taxes if you or your spouse or common-law partner are self-employed
-Payment date for 2022 taxes
-April 30, 2023 (May 1, 2023 since April 30 is a Sunday): Deadline to pay your taxes
-Know what to report and claim
-Report your income
-What to report as income, including COVID-19 benefits, self-employment income, foreign investment income, income from assets, or other earnings.
-
-Claim deductions, credits, and expenses
-Find and claim tax deductions and credits, and get information on tax instalments and expenses.
-
-If you are a small business owner or self-employed
-Get free tax help to better understand your tax obligations by booking a virtual visit with a Liaison Officer
-
-Get your tax slips
-Understand your tax slips such as the T4 and T4A, when you will receive them, and how to get copies
-
-Keep your CRA information up-to-date
-Update your CRA information
-Change your address, marital status, name, or other personal information with the CRA. Register for direct deposit and keep it up-to-date to get your payments and refunds faster.
-
-Ways to do your taxes
-You can file your taxes online or by paper, or find other options such as having someone else complete them for you. Options include:
-
-NETFILE-certified tax software (electronic filing)
-Through a tax preparer using EFILE-certified tax software (electronic filing)
-Community volunteer tax clinic
-Paper tax return
-File my Return - Automated phone line (by invitation only)
-Send us your completed tax return
-File your tax return online or mail your completed tax return to your tax centre.
-
-Make a payment or wait for your refund
-Have a balance owing
-Pay the taxes you owe, arrange to pay over time, or see options if you cannot pay.
-
-Getting a refund
-You will get a refund if you paid more taxes than needed
-
-After you file your tax return
-Get your notice of assessment, find out the status of your refund, or make a change to your tax return
-
+Personal income tax
+
+Get ready to do your taxes
+
+Chat with Charlie
+Open chat
+Find out what's new for the 2022 tax season and your filing and payment due dates. Begin by gathering your documents to report income and claim deductions, and choose how you want to file and send your completed tax return to the CRA.
+
+Understand your rights as a taxpayer and who should file a tax return.
+
+Steps to get ready for 2022 taxes
+Find out what's new
+What's new for 2022 taxes
+Changes to benefits, credits, and expenses for individuals and families, and updates to the 2022 income tax package.
+
+How COVID-19 benefits impact your taxes
+Reporting COVID-19 benefit amounts on your tax return, tax slip issues, and payment options.
+
+Be aware of key dates for 2022 taxes
+Filing and payment due dates for taxes, instalment payments, and any amounts you may owe
+
+Filing dates for 2022 taxes
+February 20, 2023: Earliest day to file your taxes online
+April 30, 2023 (May 1, 2023 since April 30 is a Sunday): Deadline to file your taxes
+June 15, 2023: Deadline to file your taxes if you or your spouse or common-law partner are self-employed
+Payment date for 2022 taxes
+April 30, 2023 (May 1, 2023 since April 30 is a Sunday): Deadline to pay your taxes
+Know what to report and claim
+Report your income
+What to report as income, including COVID-19 benefits, self-employment income, foreign investment income, income from assets, or other earnings.
+
+Claim deductions, credits, and expenses
+Find and claim tax deductions and credits, and get information on tax instalments and expenses.
+
+If you are a small business owner or self-employed
+Get free tax help to better understand your tax obligations by booking a virtual visit with a Liaison Officer
+
+Get your tax slips
+Understand your tax slips such as the T4 and T4A, when you will receive them, and how to get copies
+
+Keep your CRA information up-to-date
+Update your CRA information
+Change your address, marital status, name, or other personal information with the CRA. Register for direct deposit and keep it up-to-date to get your payments and refunds faster.
+
+Ways to do your taxes
+You can file your taxes online or by paper, or find other options such as having someone else complete them for you. Options include:
+
+NETFILE-certified tax software (electronic filing)
+Through a tax preparer using EFILE-certified tax software (electronic filing)
+Community volunteer tax clinic
+Paper tax return
+File my Return - Automated phone line (by invitation only)
+Send us your completed tax return
+File your tax return online or mail your completed tax return to your tax centre.
+
+Make a payment or wait for your refund
+Have a balance owing
+Pay the taxes you owe, arrange to pay over time, or see options if you cannot pay.
+
+Getting a refund
+You will get a refund if you paid more taxes than needed
+
+After you file your tax return
+Get your notice of assessment, find out the status of your refund, or make a change to your tax return
+
If you need more information, you can get answers to frequently asked questions about filing your taxes.
\ No newline at end of file
diff --git a/tests/resources/sample_text_files/start_a_business.txt b/tests/resources/sample_text_files/start_a_business.txt
index 322b1f8..3fda281 100644
--- a/tests/resources/sample_text_files/start_a_business.txt
+++ b/tests/resources/sample_text_files/start_a_business.txt
@@ -1,209 +1,209 @@
-Starting a business
-Table of contents
-Before you start: The plan
-
-What is a business plan?
-Securing financing
-Choosing a business structure
-Choosing a business name
-Choosing a location
-Getting started: The essentials
-
-How to register your business name
-Incorporating your business
-Regulations, licences and permits
-Business number registration
-Taxation
-Hiring employees
-Other resources
-
-Associations
-Business organizations
-Before you start: The plan
-What is a business plan?
-A business plan is a written document that describes your business objectives and strategies, your financial forecasts and the market you are targeting. It will help you set realistic and timely goals, secure external funding, measure your success, clarify operational requirements and establish reasonable financial forecasts. Preparing your plan will help you focus on how to operate your new business and give it the best chance for success.
-
-Securing financial assistance to start your new business is directly related to the strength of your business plan. To be considered for funding from financial institutions or investors, you must demonstrate that you understand every aspect of your business, and its ability to generate profit.
-
-A business plan is more than just a document that you present to lenders and investors; it also helps you plan for the growth and progress of your business. Proper planning can help your business succeed.
-
-Read online:
-Business Plan Guide
-
-Securing financing
-Grants, contributions, subsidies and loan guarantees are available from various government sources. Use Government of Canada’s online search tool to look for programs and services that may apply to your business.
-
-Search online:
-Business Benefits Finder
-
-Choosing a business structure
-When starting your business, choose the business structure that best suits your needs. The three most common business structures are:
-
-Sole proprietorship
-General partnership
-Incorporation
-To learn more about different forms of business organization, read the following:
-Business structures: Which one is right for you?
-
-Choosing a business name
-Before registering your business, you should decide what you want your business name to be. The right name can be an effective advertising tool that can help your customers understand what your business does and which market you are targeting.
-
-Some points to consider when naming your business:
-
-Short names are easier to remember
-Descriptive names can help people understand what your company sells
-Professional names can fit the image you want to project
-Unique names ensure that the name is not already in use
-Your business name is an important part of your business identity. Choose a name that will fit your needs and suit your business image.
-
-Read online:
-Choosing a name…
-
-Choosing a location
-For most businesses, choosing an appropriate location is critical, and the address is often needed for registrations, licences and permits. Your ideal location will depend on your business needs, zoning restrictions and where your customers and competitors are. Taxes, noise and the local business environment are also important factors to consider when reviewing your options.
-
-If you are considering setting up your business in your home, make sure you know what regulations and restrictions will apply to your home-based business before you start.
-
-Read online:
-Choosing and setting up a location
-
-Getting started: The essentials
-How to register your business name
-Business name registration applies to entrepreneurs who want to register a sole proprietorship, a partnership or an operating name (trade name) for a corporation. The name of a new business must be registered if it is different than the business owner’s legal name. For information on how to set up a corporation, see the Incorporating your business section below.
-
-You can complete an optional name search and register your business in the following ways:
-
-Through ServiceOntario's website
-By mailing an application to the address indicated on the form
-The cost to register a business is $60. Your registration is valid for five years, at which time it must be renewed.
-
-Use online:
-Register your business online
-
-Incorporating your business
-A corporation is a legal entity that separates the business from its owner/operator. You can choose to incorporate federally or provincially. Each option comes with its own advantages and disadvantages.
-
-Provincial incorporation
-Incorporating your business provincially allows you to do business under a corporate name in Ontario. Corporate name protection applies in Ontario, and you can open offices/stores within the province.
-
-Contact ServiceOntario:
-1-800-361-3223
-Ontario business incorporation
-
-Federal incorporation
-If you incorporate your business federally, you can open locations within Ontario and/or in other provinces and territories across Canada. If you open offices/stores in different provinces, you will be required to register your business in those locations. Federal incorporation also provides corporate name protection across the country.
-
-Contact Corporations Canada:
-1-866-333-5556
-Steps to incorporating
-
-Professional corporations
-If you are a regulated professional (e.g. healthcare professionals, social workers, accountants) you may be able to provincially incorporate your practice as a professional corporation.
-
-Some key features of professional corporations are:
-
-Limited liability protection
-Access to external investment funding
-Advantages of corporate tax rules
-Corporate status
-Regulated professions can contact the relevant regulatory body and the Ontario Ministry of Public and Business Service Delivery:
-1-800-361-3223
-Professional corporations
-
-Personal Real Estate Corporations (PREC)
-If your are a broker or salesperson registered in Ontario to deal in real estate you may wish to explore the option offered by a Personal Real Estate Corporation (PREC) (not a professional corporation).
-
-Contact the Real Estate Council of Ontario (RECO):
-1-800-245-6910
-Personal Real Estate Corporations (PREC) and Advertising Terms
-
-Regulations, licences and permits
-Your business may need licences and permits from the federal, provincial and municipal levels of government.
-
-In addition to the information you will find in this guide, you can use BizPaL to find licences and regulations that may affect your business.
-
-Contact us:
-Permits and licences search
-
-Legal questions
-You can contact Pro Bono Ontario’s free legal advice hotline to enquire about getting help with your everyday civil legal needs (no family law, immigration or criminal law). The service is generally aimed at those who cannot afford a lawyer.
-
-Note that service is not guaranteed and you will be asked questions as part of the qualifying process, such as the amount of personal income earned by your household, your name, postal code and age range.
-
-Contact Pro Bono Ontario’s Free Legal Advice Hotline:
-1-855-255-7256
-
-Read online:
-Pro Bono Ontario - Free Legal Advice Hotline
-
-
-
-You can also contact the Law Society of Ontario's Law Society Referral Service if you have legal questions of a business nature. The service may be able to assist you in finding a lawyer or paralegal, based on your needs.
-
-Use online:
-Law Society Referral Service
-
-Business number registration
-Your Business Number is your single account number for dealing with the federal government regarding taxes, payroll, import/export and other activities. If you plan to hire employees, or if you will be importing and/or exporting products or services, you will need to get a business number.
-
-If you sell goods and services in Ontario, you may need a business number to charge and remit the Harmonized Sales Tax (HST). Speak with the Canada Revenue Agency (CRA) for more information.
-
-Contact CRA:
-1-800-959-5525
-Canada Revenue Agency – Business
-Business number (BN) registration
-
-Taxation
-Depending on your location and the type of products or services being offered, federal, provincial and/or municipal business taxes may also apply.
-
-Read online:
-Taxation guide
-E-business and selling to customers outside of Ontario
-
-Hiring employees
-It is important that you know your obligations and opportunities when it comes to hiring employees, and familiarize yourself with current labour market conditions.
-
-Some of the things you will want to consider when hiring staff are:
-
-Recruitment practices
-Payroll
-Tax returns
-Employment standards
-Read online:
-Employment regulations: Hiring
-
-Other resources
-Business organizations
-Small Business Enterprise Centres
-Visit a Small Business Enterprise Centre to speak with knowledgeable general business consultants, attend seminars and access business publications.
-
-Search online:
-Small Business Enterprise Centres
-
-Community Futures Ontario
-Access information and financing for businesses in Northern Ontario and rural areas of Southern and Eastern Ontario.
-
-Contact a CFDC:
-1-866-668-2332
-Community Futures Ontario
-
-Ministry of Economic Development, Job Creation and Trade
-Find information about Ontario's small business community, and connect to the people and resources you need to improve competitiveness and profitability.
-
-Contact the Ministry of Economic Development, Job Creation and Trade:
-1-800-268-7095
-Ministry of Economic Development, Job Creation and Trade
-
-Business Development Bank of Canada (BDC)
-Access a wide range of business counselling, training and financing programs, including workshops, seminars and business management courses. Program costs will vary.
-
-Contact BDC:
-1-877-232-2269
-Business Development Bank of Canada
-
-The Canadian Trade Commissioner Service (TCS)
-Learn how global value chains can improve competitiveness, profitability and long-term sustainability for your business.
-
-Contact TCS:
-1-888-306-9991
+Starting a business
+Table of contents
+Before you start: The plan
+
+What is a business plan?
+Securing financing
+Choosing a business structure
+Choosing a business name
+Choosing a location
+Getting started: The essentials
+
+How to register your business name
+Incorporating your business
+Regulations, licences and permits
+Business number registration
+Taxation
+Hiring employees
+Other resources
+
+Associations
+Business organizations
+Before you start: The plan
+What is a business plan?
+A business plan is a written document that describes your business objectives and strategies, your financial forecasts and the market you are targeting. It will help you set realistic and timely goals, secure external funding, measure your success, clarify operational requirements and establish reasonable financial forecasts. Preparing your plan will help you focus on how to operate your new business and give it the best chance for success.
+
+Securing financial assistance to start your new business is directly related to the strength of your business plan. To be considered for funding from financial institutions or investors, you must demonstrate that you understand every aspect of your business, and its ability to generate profit.
+
+A business plan is more than just a document that you present to lenders and investors; it also helps you plan for the growth and progress of your business. Proper planning can help your business succeed.
+
+Read online:
+Business Plan Guide
+
+Securing financing
+Grants, contributions, subsidies and loan guarantees are available from various government sources. Use Government of Canada’s online search tool to look for programs and services that may apply to your business.
+
+Search online:
+Business Benefits Finder
+
+Choosing a business structure
+When starting your business, choose the business structure that best suits your needs. The three most common business structures are:
+
+Sole proprietorship
+General partnership
+Incorporation
+To learn more about different forms of business organization, read the following:
+Business structures: Which one is right for you?
+
+Choosing a business name
+Before registering your business, you should decide what you want your business name to be. The right name can be an effective advertising tool that can help your customers understand what your business does and which market you are targeting.
+
+Some points to consider when naming your business:
+
+Short names are easier to remember
+Descriptive names can help people understand what your company sells
+Professional names can fit the image you want to project
+Unique names ensure that the name is not already in use
+Your business name is an important part of your business identity. Choose a name that will fit your needs and suit your business image.
+
+Read online:
+Choosing a name…
+
+Choosing a location
+For most businesses, choosing an appropriate location is critical, and the address is often needed for registrations, licences and permits. Your ideal location will depend on your business needs, zoning restrictions and where your customers and competitors are. Taxes, noise and the local business environment are also important factors to consider when reviewing your options.
+
+If you are considering setting up your business in your home, make sure you know what regulations and restrictions will apply to your home-based business before you start.
+
+Read online:
+Choosing and setting up a location
+
+Getting started: The essentials
+How to register your business name
+Business name registration applies to entrepreneurs who want to register a sole proprietorship, a partnership or an operating name (trade name) for a corporation. The name of a new business must be registered if it is different than the business owner’s legal name. For information on how to set up a corporation, see the Incorporating your business section below.
+
+You can complete an optional name search and register your business in the following ways:
+
+Through ServiceOntario's website
+By mailing an application to the address indicated on the form
+The cost to register a business is $60. Your registration is valid for five years, at which time it must be renewed.
+
+Use online:
+Register your business online
+
+Incorporating your business
+A corporation is a legal entity that separates the business from its owner/operator. You can choose to incorporate federally or provincially. Each option comes with its own advantages and disadvantages.
+
+Provincial incorporation
+Incorporating your business provincially allows you to do business under a corporate name in Ontario. Corporate name protection applies in Ontario, and you can open offices/stores within the province.
+
+Contact ServiceOntario:
+1-800-361-3223
+Ontario business incorporation
+
+Federal incorporation
+If you incorporate your business federally, you can open locations within Ontario and/or in other provinces and territories across Canada. If you open offices/stores in different provinces, you will be required to register your business in those locations. Federal incorporation also provides corporate name protection across the country.
+
+Contact Corporations Canada:
+1-866-333-5556
+Steps to incorporating
+
+Professional corporations
+If you are a regulated professional (e.g. healthcare professionals, social workers, accountants) you may be able to provincially incorporate your practice as a professional corporation.
+
+Some key features of professional corporations are:
+
+Limited liability protection
+Access to external investment funding
+Advantages of corporate tax rules
+Corporate status
+Regulated professions can contact the relevant regulatory body and the Ontario Ministry of Public and Business Service Delivery:
+1-800-361-3223
+Professional corporations
+
+Personal Real Estate Corporations (PREC)
+If you are a broker or salesperson registered in Ontario to deal in real estate, you may wish to explore the option offered by a Personal Real Estate Corporation (PREC) (not a professional corporation).
+
+Contact the Real Estate Council of Ontario (RECO):
+1-800-245-6910
+Personal Real Estate Corporations (PREC) and Advertising Terms
+
+Regulations, licences and permits
+Your business may need licences and permits from the federal, provincial and municipal levels of government.
+
+In addition to the information you will find in this guide, you can use BizPaL to find licences and regulations that may affect your business.
+
+Contact us:
+Permits and licences search
+
+Legal questions
+You can contact Pro Bono Ontario’s free legal advice hotline to enquire about getting help with your everyday civil legal needs (no family law, immigration or criminal law). The service is generally aimed at those who cannot afford a lawyer.
+
+Note that service is not guaranteed and you will be asked questions as part of the qualifying process, such as the amount of personal income earned by your household, your name, postal code and age range.
+
+Contact Pro Bono Ontario’s Free Legal Advice Hotline:
+1-855-255-7256
+
+Read online:
+Pro Bono Ontario - Free Legal Advice Hotline
+
+
+
+You can also contact the Law Society of Ontario's Law Society Referral Service if you have legal questions of a business nature. The service may be able to assist you in finding a lawyer or paralegal, based on your needs.
+
+Use online:
+Law Society Referral Service
+
+Business number registration
+Your Business Number is your single account number for dealing with the federal government regarding taxes, payroll, import/export and other activities. If you plan to hire employees, or if you will be importing and/or exporting products or services, you will need to get a business number.
+
+If you sell goods and services in Ontario, you may need a business number to charge and remit the Harmonized Sales Tax (HST). Speak with the Canada Revenue Agency (CRA) for more information.
+
+Contact CRA:
+1-800-959-5525
+Canada Revenue Agency – Business
+Business number (BN) registration
+
+Taxation
+Depending on your location and the type of products or services being offered, federal, provincial and/or municipal business taxes may also apply.
+
+Read online:
+Taxation guide
+E-business and selling to customers outside of Ontario
+
+Hiring employees
+It is important that you know your obligations and opportunities when it comes to hiring employees, and familiarize yourself with current labour market conditions.
+
+Some of the things you will want to consider when hiring staff are:
+
+Recruitment practices
+Payroll
+Tax returns
+Employment standards
+Read online:
+Employment regulations: Hiring
+
+Other resources
+Business organizations
+Small Business Enterprise Centres
+Visit a Small Business Enterprise Centre to speak with knowledgeable general business consultants, attend seminars and access business publications.
+
+Search online:
+Small Business Enterprise Centres
+
+Community Futures Ontario
+Access information and financing for businesses in Northern Ontario and rural areas of Southern and Eastern Ontario.
+
+Contact a CFDC:
+1-866-668-2332
+Community Futures Ontario
+
+Ministry of Economic Development, Job Creation and Trade
+Find information about Ontario's small business community, and connect to the people and resources you need to improve competitiveness and profitability.
+
+Contact the Ministry of Economic Development, Job Creation and Trade:
+1-800-268-7095
+Ministry of Economic Development, Job Creation and Trade
+
+Business Development Bank of Canada (BDC)
+Access a wide range of business counselling, training and financing programs, including workshops, seminars and business management courses. Program costs will vary.
+
+Contact BDC:
+1-877-232-2269
+Business Development Bank of Canada
+
+The Canadian Trade Commissioner Service (TCS)
+Learn how global value chains can improve competitiveness, profitability and long-term sustainability for your business.
+
+Contact TCS:
+1-888-306-9991
Canadian Trade Commissioner Service
\ No newline at end of file
diff --git a/tests/resources/sample_text_files/travel_advisories.txt b/tests/resources/sample_text_files/travel_advisories.txt
index 29af55c..33538ca 100644
--- a/tests/resources/sample_text_files/travel_advisories.txt
+++ b/tests/resources/sample_text_files/travel_advisories.txt
@@ -1,488 +1,488 @@
-Travel advice and advisories by destination
-COVID-19: travel health notice for all travellers
-
-The Government of Canada’s official source of travel information and advice, the Travel Advice and Advisories help you to make informed decisions and travel safely while you are outside Canada. Check the page for your destination often, because safety and security conditions may change. See Travel Advice and Advisories – FAQ for more information.
-
-Where are you going?
-Where do you want to go?
-Select a destination
-Legend
-Take normal security precautions
-Take normal security precautions
-
-Exercise a high degree of caution
-Exercise a high degree of caution
-
-Avoid non-essential travel
-Avoid non-essential travel
-
-Avoid all travel
-Avoid all travel
-
-For more details about the risk levels.
-
-Filter items
-Destination Risk level Last updated
- New Caledonia
-Take normal security precautions 2023-12-21 09:39:15
- Iran
-Avoid all travel 2023-12-20 18:51:41
- Comoros
-Exercise a high degree of caution 2023-12-20 16:59:15
- Chad
-Avoid non-essential travel (with regional advisories) 2023-12-20 16:54:42
- Central African Republic
-Avoid all travel 2023-12-20 16:41:10
- Latvia
-Take normal security precautions 2023-12-20 16:24:07
- Cabo Verde
-Exercise a high degree of caution 2023-12-20 15:50:46
- Cameroon
-Exercise a high degree of caution (with regional advisories) 2023-12-20 15:28:24
- Mayotte
-Exercise a high degree of caution 2023-12-20 15:27:12
- Réunion
-Take normal security precautions 2023-12-20 15:21:41
- Mali
-Avoid all travel 2023-12-20 15:17:10
- Niger
-Avoid all travel 2023-12-20 15:12:22
- Burundi
-Avoid non-essential travel (with regional advisories) 2023-12-20 15:11:14
- Zimbabwe
-Exercise a high degree of caution 2023-12-20 15:05:01
- Zambia
-Take normal security precautions (with regional advisories) 2023-12-20 14:55:21
- United States
-Take normal security precautions 2023-12-20 13:54:25
- South Africa
-Exercise a high degree of caution 2023-12-20 13:44:26
- Uganda
-Exercise a high degree of caution (with regional advisories) 2023-12-20 13:27:08
- Peru
-Exercise a high degree of caution (with regional advisories) 2023-12-20 13:21:52
- Benin
-Exercise a high degree of caution (with regional advisories) 2023-12-20 13:21:44
- Togo
-Exercise a high degree of caution (with regional advisories) 2023-12-20 13:19:25
- Tanzania
-Exercise a high degree of caution (with regional advisories) 2023-12-20 13:11:36
- Somalia
-Avoid all travel 2023-12-20 12:41:36
- Seychelles
-Take normal security precautions 2023-12-20 12:38:06
- Chile
-Exercise a high degree of caution 2023-12-20 12:31:06
- Senegal
-Exercise a high degree of caution 2023-12-20 12:15:05
- Sao Tome and Principe
-Take normal security precautions 2023-12-20 12:07:29
- Ethiopia
-Avoid non-essential travel (with regional advisories) 2023-12-20 10:49:17
- Kenya
-Exercise a high degree of caution (with regional advisories) 2023-12-20 10:49:17
- Malawi
-Exercise a high degree of caution 2023-12-20 10:49:17
- Afghanistan
-Avoid all travel 2023-12-20 09:36:50
- Albania
-Take normal security precautions 2023-12-20 09:36:50
- Algeria
-Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
- American Samoa
-Take normal security precautions 2023-12-20 09:36:50
- Andorra
-Take normal security precautions 2023-12-20 09:36:50
- Angola
-Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
- Anguilla
-Take normal security precautions 2023-12-20 09:36:50
- Antarctica
-Exercise a high degree of caution 2023-12-20 09:36:50
- Antigua and Barbuda
-Take normal security precautions 2023-12-20 09:36:50
- Argentina
-Take normal security precautions 2023-12-20 09:36:50
- Armenia
-Take normal security precautions (with regional advisories) 2023-12-20 09:36:50
- Aruba
-Take normal security precautions 2023-12-20 09:36:50
- Australia
-Take normal security precautions 2023-12-20 09:36:50
- Austria
-Take normal security precautions 2023-12-20 09:36:50
- Azerbaijan
-Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
- Azores
-Take normal security precautions 2023-12-20 09:36:50
- Bahamas
-Exercise a high degree of caution 2023-12-20 09:36:50
- Bahrain
-Exercise a high degree of caution 2023-12-20 09:36:50
- Bangladesh
-Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
- Barbados
-Take normal security precautions 2023-12-20 09:36:50
- Belarus
-Avoid all travel 2023-12-20 09:36:50
- Belgium
-Exercise a high degree of caution 2023-12-20 09:36:50
- Belize
-Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
- Bermuda
-Take normal security precautions 2023-12-20 09:36:50
- Bhutan
-Take normal security precautions 2023-12-20 09:36:50
- Bolivia
-Exercise a high degree of caution 2023-12-20 09:36:50
- Bonaire
-Take normal security precautions 2023-12-20 09:36:50
- Bosnia and Herzegovina
-Exercise a high degree of caution 2023-12-20 09:36:50
- Botswana
-Take normal security precautions 2023-12-20 09:36:50
- Brazil
-Exercise a high degree of caution 2023-12-20 09:36:50
- British Virgin Islands
-Take normal security precautions 2023-12-20 09:36:50
- Brunei
-Take normal security precautions 2023-12-20 09:36:50
- Bulgaria
-Take normal security precautions 2023-12-20 09:36:50
- Burkina Faso
-Avoid all travel (with regional advisories) 2023-12-20 09:36:50
- Cambodia
-Exercise a high degree of caution 2023-12-20 09:36:50
- Canary Islands
-Take normal security precautions 2023-12-20 09:36:50
- Cayman Islands
-Take normal security precautions 2023-12-20 09:36:50
- China
-Exercise a high degree of caution 2023-12-20 09:36:50
- Colombia
-Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
- Cook Islands
-Take normal security precautions 2023-12-20 09:36:50
- Costa Rica
-Exercise a high degree of caution 2023-12-20 09:36:50
- Côte d'Ivoire (Ivory Coast)
-Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
- Croatia
-Take normal security precautions 2023-12-20 09:36:50
- Cuba
-Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
- Curaçao
-Take normal security precautions 2023-12-20 09:36:50
- Cyprus
-Take normal security precautions (with regional advisories) 2023-12-20 09:36:50
- Czechia
-Take normal security precautions 2023-12-20 09:36:50
- Democratic Republic of Congo (Kinshasa)
-Avoid non-essential travel (with regional advisories) 2023-12-20 09:36:50
- Denmark
-Exercise a high degree of caution 2023-12-20 09:36:50
- Djibouti
-Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
- Dominica
-Take normal security precautions 2023-12-20 09:36:50
- Dominican Republic
-Exercise a high degree of caution 2023-12-20 09:36:50
- Ecuador
-Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
- Egypt
-Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
- El Salvador
-Exercise a high degree of caution 2023-12-20 09:36:50
- Equatorial Guinea
-Exercise a high degree of caution 2023-12-20 09:36:50
- Eritrea
-Avoid non-essential travel (with regional advisories) 2023-12-20 09:36:50
- Estonia
-Take normal security precautions 2023-12-20 09:36:50
- Eswatini
-Exercise a high degree of caution 2023-12-20 09:36:50
- Falkland Islands
-Take normal security precautions 2023-12-20 09:36:50
- Fiji
-Take normal security precautions 2023-12-20 09:36:50
- Finland
-Take normal security precautions 2023-12-20 09:36:50
- France
-Exercise a high degree of caution 2023-12-20 09:36:50
- French Guiana
-Take normal security precautions 2023-12-20 09:36:50
- French Polynesia
-Take normal security precautions 2023-12-20 09:36:50
- Gabon
-Exercise a high degree of caution 2023-12-20 09:36:50
- Gambia, The
-Exercise a high degree of caution 2023-12-20 09:36:50
- Georgia
-Take normal security precautions (with regional advisories) 2023-12-20 09:36:50
- Germany
-Exercise a high degree of caution 2023-12-20 09:36:50
- Ghana
-Exercise a high degree of caution 2023-12-20 09:36:50
- Gibraltar
-Take normal security precautions 2023-12-20 09:36:50
- Greece
-Take normal security precautions 2023-12-20 09:36:50
- Greenland
-Take normal security precautions 2023-12-20 09:36:50
- Grenada
-Take normal security precautions 2023-12-20 09:36:50
- Guadeloupe
-Take normal security precautions 2023-12-20 09:36:50
- Guam
-Take normal security precautions 2023-12-20 09:36:50
- Guatemala
-Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
- Guinea
-Exercise a high degree of caution 2023-12-20 09:36:50
- Guinea-Bissau
-Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
- Guyana
-Exercise a high degree of caution 2023-12-20 09:36:50
- Haiti
-Avoid all travel 2023-12-20 09:36:50
- Honduras
-Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
- Hong Kong
-Exercise a high degree of caution 2023-12-20 09:36:50
- Hungary
-Take normal security precautions 2023-12-20 09:36:50
- Iceland
-Take normal security precautions 2023-12-20 09:36:50
- India
-Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
- Indonesia
-Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
- Iraq
-Avoid all travel 2023-12-20 09:36:50
- Ireland
-Take normal security precautions 2023-12-20 09:36:50
- Israel, the West Bank and the Gaza Strip
-Avoid non-essential travel (with regional advisories) 2023-12-20 09:36:50
- Italy
-Take normal security precautions 2023-12-20 09:36:50
- Jamaica
-Exercise a high degree of caution 2023-12-20 09:36:50
- Japan
-Take normal security precautions 2023-12-20 09:36:50
- Jordan
-Exercise a high degree of caution 2023-12-20 09:36:50
- Kazakhstan
-Exercise a high degree of caution 2023-12-20 09:36:50
- Kiribati
-Take normal security precautions 2023-12-20 09:36:50
- Kosovo
-Take normal security precautions (with regional advisories) 2023-12-20 09:36:50
- Kuwait
-Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
- Kyrgyzstan
-Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
- Laos
-Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
- Lebanon
-Avoid all travel 2023-12-20 09:36:50
- Lesotho
-Exercise a high degree of caution 2023-12-20 09:36:50
- Liberia
-Exercise a high degree of caution 2023-12-20 09:36:50
- Libya
-Avoid all travel 2023-12-20 09:36:50
- Liechtenstein
-Take normal security precautions 2023-12-20 09:36:50
- Lithuania
-Take normal security precautions 2023-12-20 09:36:50
- Luxembourg
-Take normal security precautions 2023-12-20 09:36:50
- Macao
-Exercise a high degree of caution 2023-12-20 09:36:50
- Madagascar
-Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
- Malaysia
-Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
- Maldives
-Exercise a high degree of caution 2023-12-20 09:36:50
- Malta
-Take normal security precautions 2023-12-20 09:36:50
- Marshall Islands
-Take normal security precautions 2023-12-20 09:36:50
- Martinique
-Take normal security precautions 2023-12-20 09:36:50
- Mauritania
-Avoid non-essential travel (with regional advisories) 2023-12-20 09:36:50
- Mauritius
-Take normal security precautions 2023-12-20 09:36:50
- Mexico
-Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
- Micronesia (FSM)
-Take normal security precautions 2023-12-20 09:36:50
- Moldova
-Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
- Monaco
-Take normal security precautions 2023-12-20 09:36:50
- Mongolia
-Take normal security precautions 2023-12-20 09:36:50
- Montenegro
-Take normal security precautions 2023-12-20 09:36:50
- Montserrat
-Take normal security precautions 2023-12-20 09:36:50
- Morocco
-Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
- Mozambique
-Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
- Myanmar
-Avoid all travel 2023-12-20 09:36:50
- Namibia
-Exercise a high degree of caution 2023-12-20 09:36:50
- Nauru
-Take normal security precautions 2023-12-20 09:36:50
- Nepal
-Exercise a high degree of caution 2023-12-20 09:36:50
- Netherlands
-Exercise a high degree of caution 2023-12-20 09:36:50
- New Zealand
-Take normal security precautions 2023-12-20 09:36:50
- Nicaragua
-Exercise a high degree of caution 2023-12-20 09:36:50
- Nigeria
-Avoid non-essential travel (with regional advisories) 2023-12-20 09:36:50
- Niue
-Take normal security precautions 2023-12-20 09:36:50
- North Korea
-Avoid all travel 2023-12-20 09:36:50
- North Macedonia
-Take normal security precautions 2023-12-20 09:36:50
- Northern Marianas
-Take normal security precautions 2023-12-20 09:36:50
- Norway
-Take normal security precautions 2023-12-20 09:36:50
- Oman
-Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
- Pakistan
-Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
- Palau
-Take normal security precautions 2023-12-20 09:36:50
- Panama
-Take normal security precautions (with regional advisories) 2023-12-20 09:36:50
- Papua New Guinea
-Avoid non-essential travel 2023-12-20 09:36:50
- Paraguay
-Exercise a high degree of caution 2023-12-20 09:36:50
- Philippines
-Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
- Poland
-Take normal security precautions 2023-12-20 09:36:50
- Portugal
-Take normal security precautions 2023-12-20 09:36:50
- Puerto Rico
-Take normal security precautions 2023-12-20 09:36:50
- Qatar
-Take normal security precautions 2023-12-20 09:36:50
- Republic of Congo (Brazzaville)
-Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
- Romania
-Take normal security precautions 2023-12-20 09:36:50
- Russia
-Avoid all travel 2023-12-20 09:36:50
- Rwanda
-Take normal security precautions (with regional advisories) 2023-12-20 09:36:50
- Saint Kitts and Nevis
-Take normal security precautions 2023-12-20 09:36:50
- Saint Lucia
-Take normal security precautions 2023-12-20 09:36:50
- Saint Martin
-Take normal security precautions 2023-12-20 09:36:50
- Saint Vincent & the Grenadines
-Take normal security precautions 2023-12-20 09:36:50
- Saint-Barthélemy
-Take normal security precautions 2023-12-20 09:36:50
- Saint-Pierre-et-Miquelon
-Take normal security precautions 2023-12-20 09:36:50
- Samoa
-Take normal security precautions 2023-12-20 09:36:50
- San Marino
-Take normal security precautions 2023-12-20 09:36:50
- Saudi Arabia
-Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
- Serbia
-Take normal security precautions 2023-12-20 09:36:50
- Sierra Leone
-Exercise a high degree of caution 2023-12-20 09:36:50
- Singapore
-Take normal security precautions 2023-12-20 09:36:50
- Sint Maarten
-Take normal security precautions 2023-12-20 09:36:50
- Slovakia
-Take normal security precautions 2023-12-20 09:36:50
- Slovenia
-Take normal security precautions 2023-12-20 09:36:50
- Solomon Islands
-Take normal security precautions 2023-12-20 09:36:50
- South Korea
-Take normal security precautions 2023-12-20 09:36:50
- South Sudan
-Avoid all travel 2023-12-20 09:36:50
- Spain
-Exercise a high degree of caution 2023-12-20 09:36:50
- Sri Lanka
-Exercise a high degree of caution 2023-12-20 09:36:50
- Sudan
-Avoid all travel 2023-12-20 09:36:50
- Suriname
-Take normal security precautions 2023-12-20 09:36:50
- Sweden
-Exercise a high degree of caution 2023-12-20 09:36:50
- Switzerland
-Take normal security precautions 2023-12-20 09:36:50
- Syria
-Avoid all travel 2023-12-20 09:36:50
- Taiwan
-Take normal security precautions 2023-12-20 09:36:50
- Tajikistan
-Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
- Thailand
-Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
- Timor-Leste (East Timor)
-Exercise a high degree of caution 2023-12-20 09:36:50
- Tokelau
-Take normal security precautions 2023-12-20 09:36:50
- Tonga
-Take normal security precautions 2023-12-20 09:36:50
- Trinidad and Tobago
-Exercise a high degree of caution 2023-12-20 09:36:50
- Tunisia
-Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
- Türkiye
-Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
- Turkmenistan
-Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
- Turks and Caicos Islands
-Take normal security precautions 2023-12-20 09:36:50
- Tuvalu
-Take normal security precautions 2023-12-20 09:36:50
- Ukraine
-Avoid all travel 2023-12-20 09:36:50
- United Arab Emirates
-Exercise a high degree of caution 2023-12-20 09:36:50
- United Kingdom
-Exercise a high degree of caution 2023-12-20 09:36:50
- Uruguay
-Take normal security precautions (with regional advisories) 2023-12-20 09:36:50
- Uzbekistan
-Take normal security precautions (with regional advisories) 2023-12-20 09:36:50
- Vanuatu
-Take normal security precautions 2023-12-20 09:36:50
- Venezuela
-Avoid all travel 2023-12-20 09:36:50
- Vietnam
-Exercise a high degree of caution 2023-12-20 09:36:50
- Virgin Islands (U.S.)
-Take normal security precautions 2023-12-20 09:36:50
- Yemen
-Avoid all travel 2023-12-20 09:36:50
-Destination Risk level Last updated
-Travel advice from other countries
+Travel advice and advisories by destination
+COVID-19: travel health notice for all travellers
+
+The Government of Canada’s official source of travel information and advice, the Travel Advice and Advisories help you to make informed decisions and travel safely while you are outside Canada. Check the page for your destination often, because safety and security conditions may change. See Travel Advice and Advisories – FAQ for more information.
+
+Where are you going?
+Where do you want to go?
+Select a destination
+Legend
+Take normal security precautions
+Take normal security precautions
+
+Exercise a high degree of caution
+Exercise a high degree of caution
+
+Avoid non-essential travel
+Avoid non-essential travel
+
+Avoid all travel
+Avoid all travel
+
+For more details about the risk levels.
+
+Filter items
+Destination Risk level Last updated
+ New Caledonia
+Take normal security precautions 2023-12-21 09:39:15
+ Iran
+Avoid all travel 2023-12-20 18:51:41
+ Comoros
+Exercise a high degree of caution 2023-12-20 16:59:15
+ Chad
+Avoid non-essential travel (with regional advisories) 2023-12-20 16:54:42
+ Central African Republic
+Avoid all travel 2023-12-20 16:41:10
+ Latvia
+Take normal security precautions 2023-12-20 16:24:07
+ Cabo Verde
+Exercise a high degree of caution 2023-12-20 15:50:46
+ Cameroon
+Exercise a high degree of caution (with regional advisories) 2023-12-20 15:28:24
+ Mayotte
+Exercise a high degree of caution 2023-12-20 15:27:12
+ Réunion
+Take normal security precautions 2023-12-20 15:21:41
+ Mali
+Avoid all travel 2023-12-20 15:17:10
+ Niger
+Avoid all travel 2023-12-20 15:12:22
+ Burundi
+Avoid non-essential travel (with regional advisories) 2023-12-20 15:11:14
+ Zimbabwe
+Exercise a high degree of caution 2023-12-20 15:05:01
+ Zambia
+Take normal security precautions (with regional advisories) 2023-12-20 14:55:21
+ United States
+Take normal security precautions 2023-12-20 13:54:25
+ South Africa
+Exercise a high degree of caution 2023-12-20 13:44:26
+ Uganda
+Exercise a high degree of caution (with regional advisories) 2023-12-20 13:27:08
+ Peru
+Exercise a high degree of caution (with regional advisories) 2023-12-20 13:21:52
+ Benin
+Exercise a high degree of caution (with regional advisories) 2023-12-20 13:21:44
+ Togo
+Exercise a high degree of caution (with regional advisories) 2023-12-20 13:19:25
+ Tanzania
+Exercise a high degree of caution (with regional advisories) 2023-12-20 13:11:36
+ Somalia
+Avoid all travel 2023-12-20 12:41:36
+ Seychelles
+Take normal security precautions 2023-12-20 12:38:06
+ Chile
+Exercise a high degree of caution 2023-12-20 12:31:06
+ Senegal
+Exercise a high degree of caution 2023-12-20 12:15:05
+ Sao Tome and Principe
+Take normal security precautions 2023-12-20 12:07:29
+ Ethiopia
+Avoid non-essential travel (with regional advisories) 2023-12-20 10:49:17
+ Kenya
+Exercise a high degree of caution (with regional advisories) 2023-12-20 10:49:17
+ Malawi
+Exercise a high degree of caution 2023-12-20 10:49:17
+ Afghanistan
+Avoid all travel 2023-12-20 09:36:50
+ Albania
+Take normal security precautions 2023-12-20 09:36:50
+ Algeria
+Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
+ American Samoa
+Take normal security precautions 2023-12-20 09:36:50
+ Andorra
+Take normal security precautions 2023-12-20 09:36:50
+ Angola
+Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
+ Anguilla
+Take normal security precautions 2023-12-20 09:36:50
+ Antarctica
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Antigua and Barbuda
+Take normal security precautions 2023-12-20 09:36:50
+ Argentina
+Take normal security precautions 2023-12-20 09:36:50
+ Armenia
+Take normal security precautions (with regional advisories) 2023-12-20 09:36:50
+ Aruba
+Take normal security precautions 2023-12-20 09:36:50
+ Australia
+Take normal security precautions 2023-12-20 09:36:50
+ Austria
+Take normal security precautions 2023-12-20 09:36:50
+ Azerbaijan
+Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
+ Azores
+Take normal security precautions 2023-12-20 09:36:50
+ Bahamas
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Bahrain
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Bangladesh
+Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
+ Barbados
+Take normal security precautions 2023-12-20 09:36:50
+ Belarus
+Avoid all travel 2023-12-20 09:36:50
+ Belgium
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Belize
+Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
+ Bermuda
+Take normal security precautions 2023-12-20 09:36:50
+ Bhutan
+Take normal security precautions 2023-12-20 09:36:50
+ Bolivia
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Bonaire
+Take normal security precautions 2023-12-20 09:36:50
+ Bosnia and Herzegovina
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Botswana
+Take normal security precautions 2023-12-20 09:36:50
+ Brazil
+Exercise a high degree of caution 2023-12-20 09:36:50
+ British Virgin Islands
+Take normal security precautions 2023-12-20 09:36:50
+ Brunei
+Take normal security precautions 2023-12-20 09:36:50
+ Bulgaria
+Take normal security precautions 2023-12-20 09:36:50
+ Burkina Faso
+Avoid all travel (with regional advisories) 2023-12-20 09:36:50
+ Cambodia
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Canary Islands
+Take normal security precautions 2023-12-20 09:36:50
+ Cayman Islands
+Take normal security precautions 2023-12-20 09:36:50
+ China
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Colombia
+Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
+ Cook Islands
+Take normal security precautions 2023-12-20 09:36:50
+ Costa Rica
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Côte d'Ivoire (Ivory Coast)
+Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
+ Croatia
+Take normal security precautions 2023-12-20 09:36:50
+ Cuba
+Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
+ Curaçao
+Take normal security precautions 2023-12-20 09:36:50
+ Cyprus
+Take normal security precautions (with regional advisories) 2023-12-20 09:36:50
+ Czechia
+Take normal security precautions 2023-12-20 09:36:50
+ Democratic Republic of Congo (Kinshasa)
+Avoid non-essential travel (with regional advisories) 2023-12-20 09:36:50
+ Denmark
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Djibouti
+Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
+ Dominica
+Take normal security precautions 2023-12-20 09:36:50
+ Dominican Republic
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Ecuador
+Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
+ Egypt
+Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
+ El Salvador
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Equatorial Guinea
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Eritrea
+Avoid non-essential travel (with regional advisories) 2023-12-20 09:36:50
+ Estonia
+Take normal security precautions 2023-12-20 09:36:50
+ Eswatini
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Falkland Islands
+Take normal security precautions 2023-12-20 09:36:50
+ Fiji
+Take normal security precautions 2023-12-20 09:36:50
+ Finland
+Take normal security precautions 2023-12-20 09:36:50
+ France
+Exercise a high degree of caution 2023-12-20 09:36:50
+ French Guiana
+Take normal security precautions 2023-12-20 09:36:50
+ French Polynesia
+Take normal security precautions 2023-12-20 09:36:50
+ Gabon
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Gambia, The
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Georgia
+Take normal security precautions (with regional advisories) 2023-12-20 09:36:50
+ Germany
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Ghana
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Gibraltar
+Take normal security precautions 2023-12-20 09:36:50
+ Greece
+Take normal security precautions 2023-12-20 09:36:50
+ Greenland
+Take normal security precautions 2023-12-20 09:36:50
+ Grenada
+Take normal security precautions 2023-12-20 09:36:50
+ Guadeloupe
+Take normal security precautions 2023-12-20 09:36:50
+ Guam
+Take normal security precautions 2023-12-20 09:36:50
+ Guatemala
+Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
+ Guinea
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Guinea-Bissau
+Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
+ Guyana
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Haiti
+Avoid all travel 2023-12-20 09:36:50
+ Honduras
+Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
+ Hong Kong
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Hungary
+Take normal security precautions 2023-12-20 09:36:50
+ Iceland
+Take normal security precautions 2023-12-20 09:36:50
+ India
+Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
+ Indonesia
+Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
+ Iraq
+Avoid all travel 2023-12-20 09:36:50
+ Ireland
+Take normal security precautions 2023-12-20 09:36:50
+ Israel, the West Bank and the Gaza Strip
+Avoid non-essential travel (with regional advisories) 2023-12-20 09:36:50
+ Italy
+Take normal security precautions 2023-12-20 09:36:50
+ Jamaica
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Japan
+Take normal security precautions 2023-12-20 09:36:50
+ Jordan
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Kazakhstan
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Kiribati
+Take normal security precautions 2023-12-20 09:36:50
+ Kosovo
+Take normal security precautions (with regional advisories) 2023-12-20 09:36:50
+ Kuwait
+Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
+ Kyrgyzstan
+Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
+ Laos
+Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
+ Lebanon
+Avoid all travel 2023-12-20 09:36:50
+ Lesotho
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Liberia
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Libya
+Avoid all travel 2023-12-20 09:36:50
+ Liechtenstein
+Take normal security precautions 2023-12-20 09:36:50
+ Lithuania
+Take normal security precautions 2023-12-20 09:36:50
+ Luxembourg
+Take normal security precautions 2023-12-20 09:36:50
+ Macao
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Madagascar
+Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
+ Malaysia
+Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
+ Maldives
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Malta
+Take normal security precautions 2023-12-20 09:36:50
+ Marshall Islands
+Take normal security precautions 2023-12-20 09:36:50
+ Martinique
+Take normal security precautions 2023-12-20 09:36:50
+ Mauritania
+Avoid non-essential travel (with regional advisories) 2023-12-20 09:36:50
+ Mauritius
+Take normal security precautions 2023-12-20 09:36:50
+ Mexico
+Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
+ Micronesia (FSM)
+Take normal security precautions 2023-12-20 09:36:50
+ Moldova
+Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
+ Monaco
+Take normal security precautions 2023-12-20 09:36:50
+ Mongolia
+Take normal security precautions 2023-12-20 09:36:50
+ Montenegro
+Take normal security precautions 2023-12-20 09:36:50
+ Montserrat
+Take normal security precautions 2023-12-20 09:36:50
+ Morocco
+Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
+ Mozambique
+Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
+ Myanmar
+Avoid all travel 2023-12-20 09:36:50
+ Namibia
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Nauru
+Take normal security precautions 2023-12-20 09:36:50
+ Nepal
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Netherlands
+Exercise a high degree of caution 2023-12-20 09:36:50
+ New Zealand
+Take normal security precautions 2023-12-20 09:36:50
+ Nicaragua
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Nigeria
+Avoid non-essential travel (with regional advisories) 2023-12-20 09:36:50
+ Niue
+Take normal security precautions 2023-12-20 09:36:50
+ North Korea
+Avoid all travel 2023-12-20 09:36:50
+ North Macedonia
+Take normal security precautions 2023-12-20 09:36:50
+ Northern Marianas
+Take normal security precautions 2023-12-20 09:36:50
+ Norway
+Take normal security precautions 2023-12-20 09:36:50
+ Oman
+Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
+ Pakistan
+Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
+ Palau
+Take normal security precautions 2023-12-20 09:36:50
+ Panama
+Take normal security precautions (with regional advisories) 2023-12-20 09:36:50
+ Papua New Guinea
+Avoid non-essential travel 2023-12-20 09:36:50
+ Paraguay
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Philippines
+Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
+ Poland
+Take normal security precautions 2023-12-20 09:36:50
+ Portugal
+Take normal security precautions 2023-12-20 09:36:50
+ Puerto Rico
+Take normal security precautions 2023-12-20 09:36:50
+ Qatar
+Take normal security precautions 2023-12-20 09:36:50
+ Republic of Congo (Brazzaville)
+Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
+ Romania
+Take normal security precautions 2023-12-20 09:36:50
+ Russia
+Avoid all travel 2023-12-20 09:36:50
+ Rwanda
+Take normal security precautions (with regional advisories) 2023-12-20 09:36:50
+ Saint Kitts and Nevis
+Take normal security precautions 2023-12-20 09:36:50
+ Saint Lucia
+Take normal security precautions 2023-12-20 09:36:50
+ Saint Martin
+Take normal security precautions 2023-12-20 09:36:50
+ Saint Vincent & the Grenadines
+Take normal security precautions 2023-12-20 09:36:50
+ Saint-Barthélemy
+Take normal security precautions 2023-12-20 09:36:50
+ Saint-Pierre-et-Miquelon
+Take normal security precautions 2023-12-20 09:36:50
+ Samoa
+Take normal security precautions 2023-12-20 09:36:50
+ San Marino
+Take normal security precautions 2023-12-20 09:36:50
+ Saudi Arabia
+Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
+ Serbia
+Take normal security precautions 2023-12-20 09:36:50
+ Sierra Leone
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Singapore
+Take normal security precautions 2023-12-20 09:36:50
+ Sint Maarten
+Take normal security precautions 2023-12-20 09:36:50
+ Slovakia
+Take normal security precautions 2023-12-20 09:36:50
+ Slovenia
+Take normal security precautions 2023-12-20 09:36:50
+ Solomon Islands
+Take normal security precautions 2023-12-20 09:36:50
+ South Korea
+Take normal security precautions 2023-12-20 09:36:50
+ South Sudan
+Avoid all travel 2023-12-20 09:36:50
+ Spain
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Sri Lanka
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Sudan
+Avoid all travel 2023-12-20 09:36:50
+ Suriname
+Take normal security precautions 2023-12-20 09:36:50
+ Sweden
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Switzerland
+Take normal security precautions 2023-12-20 09:36:50
+ Syria
+Avoid all travel 2023-12-20 09:36:50
+ Taiwan
+Take normal security precautions 2023-12-20 09:36:50
+ Tajikistan
+Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
+ Thailand
+Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
+ Timor-Leste (East Timor)
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Tokelau
+Take normal security precautions 2023-12-20 09:36:50
+ Tonga
+Take normal security precautions 2023-12-20 09:36:50
+ Trinidad and Tobago
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Tunisia
+Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
+ Türkiye
+Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
+ Turkmenistan
+Exercise a high degree of caution (with regional advisories) 2023-12-20 09:36:50
+ Turks and Caicos Islands
+Take normal security precautions 2023-12-20 09:36:50
+ Tuvalu
+Take normal security precautions 2023-12-20 09:36:50
+ Ukraine
+Avoid all travel 2023-12-20 09:36:50
+ United Arab Emirates
+Exercise a high degree of caution 2023-12-20 09:36:50
+ United Kingdom
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Uruguay
+Take normal security precautions (with regional advisories) 2023-12-20 09:36:50
+ Uzbekistan
+Take normal security precautions (with regional advisories) 2023-12-20 09:36:50
+ Vanuatu
+Take normal security precautions 2023-12-20 09:36:50
+ Venezuela
+Avoid all travel 2023-12-20 09:36:50
+ Vietnam
+Exercise a high degree of caution 2023-12-20 09:36:50
+ Virgin Islands (U.S.)
+Take normal security precautions 2023-12-20 09:36:50
+ Yemen
+Avoid all travel 2023-12-20 09:36:50
+Destination Risk level Last updated
+Travel advice from other countries
Travel advice is also provided by the governments of Australia, New Zealand, the United Kingdom and the United States.
\ No newline at end of file
diff --git a/tests/resources/sample_text_files/visitors_to_canada.txt b/tests/resources/sample_text_files/visitors_to_canada.txt
index 8f82898..5979a9a 100644
--- a/tests/resources/sample_text_files/visitors_to_canada.txt
+++ b/tests/resources/sample_text_files/visitors_to_canada.txt
@@ -1,182 +1,182 @@
-Travellers
-Visitors to Canada
-Have proper identification
-You must carry proper identification for yourself and any children travelling with you to help confirm your legal right or authorization to enter Canada when you arrive.
-
-All visitors arriving from or transiting through the United States should visit the U.S. Customs and Border Protection website for information concerning the requirements to enter, transit through, or return to the United States.
-
-Identification requirements for U. S. citizens and permanent residents
-If you are a U.S. citizen or permanent resident, you must carry proof of citizenship such as a passport, birth certificate, a certificate of citizenship or naturalization, a U.S. Permanent Resident Card, or a Certificate of Indian Status along with photo identification. If you are a U.S. permanent resident, ensure you carry proof of your status such as a U.S. Permanent Resident Card.
-
-For members of a Trusted Traveller program
-U.S. citizens
-U.S. citizens who are members of the NEXUS program can use their membership card as proof of identification and citizenship when entering Canada by land, air or water. This applies when you are using either conventional or NEXUS-only lanes. U.S. citizens who are members of FAST may use their membership card when entering Canada by land or water only. When travelling by air, FAST cards will only be accepted as proof of identification when you are travelling to Canada from the U.S.
-
-U.S. permanent residents
-NEXUS and FAST members who are permanent residents of the U.S. must still travel with a passport and proof of permanent residence. You may be asked to present these documents to the Border Services Officer (BSO) when you arrive at the border.
-
-No matter your mode of travel, we recommend you carry a valid passport for all travel abroad, including visits to Canada from the United States. A passport may be required by your airline or other transportation authority, since it is the only universally-accepted identification document.
-
-Identification requirements for international visitors
-All international travellers must carry acceptable identification and a valid visa (if necessary) when entering Canada. A passport is recommended because it is the only reliable and universally-accepted travel and identification document for the purpose of international travel.
-
-Electronic Travel Authorization
-Visa-exempt foreign nationals need an Electronic Travel Authorization (eTA) to fly to or transit through Canada. Exceptions include U.S. citizens, U.S. Lawful Permanent Residents (USLPR) and travellers with a valid Canadian visa. Canadian citizens, including dual citizens, and Canadian permanent residents cannot apply for an eTA.
-
-Foreign nationals from select visa-required countries may also travel to or through Canada by air using an eTA, if eligible.
-
-Be prepared: Apply for an eTA before you book your flight to Canada. Most applicants get approved within minutes. However, some applications can take several days to process so don’t wait until the last minute. Get help if you have questions before, during or after you apply.
-
-Fake websites
-Travellers who apply for an eTA are advised to be cautious in all dealings with companies that claim to offer help in getting an eTA. These companies are not operating on behalf of the Government of Canada. Many have established websites that charge a fee to provide information and submit eTA applications.
-
-This Government of Canada website is the official place to apply for an eTA.
-
-Travelling with minors
-BSOs watch for missing persons, and may ask detailed questions about any minors travelling with you.
-
-Visit the Children and travel page for more information about travelling abroad with minors.
-
-What you can bring with you
-As a visitor, you can bring certain goods into Canada for your own use as personal baggage. Personal baggage includes clothing, camping and sports equipment, cameras and personal computers. This also includes your mode of transportation, including vehicles, private boats and aircraft.
-
-You must declare all goods when you arrive at the first CBSA port of entry. Our BSOs check goods you are bringing in or taking out of Canada to verify what you have declared. If you declare goods when you arrive and take them back with you when you leave, you will not have to pay any duty or taxes. These goods cannot be:
-
-used by a resident of Canada;
-used on behalf of a business based in Canada;
-given as a gift to a Canadian resident; or
-disposed of or left behind in Canada.
-The BSO may ask you to leave a security deposit for your goods. Your deposit will be refunded when you leave Canada with the goods. If this happens, you will be issued a Temporary Admission Permit. We will keep a copy and give you one for your records. When you leave Canada, bring your goods and your copy of the Temporary Admission Permit, to the BSO. You will get a receipt and your security deposit will be refunded by mail.
-
-Making your Declaration
-When arriving in Canada you must, by Canadian law, report to a BSO, answer all questions truthfully, and accurately report your goods. This means you must also report any food, plant and animal products in your possession.
-
-Have all required identification and travel documents in hand. Be ready to make a full and accurate declaration, including the amount of goods in Canadian dollars you are bringing with you. This will help us get you on your way as quickly as possible.
-
-Arriving by air or by land: If you are arriving by air or by land, follow the signs to the first checkpoint. A BSO will check your identification and other travel documents and you will answer their questions.
-
-Arriving by private boat: If you are arriving by private boat, go directly to a designated marine telephone reporting site and call the Telephone Reporting Centre (TRC) at 1-888-226-7277 to get CBSA clearance. For more information, visit the Private boaters page.
-
-CBSA Declaration Card
-The CBSA Declaration Card tells us what we need to know about you, your travels and what you are bringing into the country. CBSA Declaration Cards are given to passengers arriving by air, and are also used at some locations for travellers arriving by train, boat and bus. Bring a pen in your carry-on baggage to complete the card before you arrive.
-
-Instructions on how to complete the card are attached to the form. You can list up to four people living at the same residence on one card. If there are more than four people living at your address use one additional card for each additional group of four or fewer people. Once the cards are complete you can detach and discard the instructions. Do not fold the card.
-
-Be sure to keep the card handy along with your identification and other travel documents. You will be asked to show this card to our BSOs several times.
-
-If you have any questions about the card or Canadian regulations, ask the BSO when you arrive.
-
-Referrals for secondary services and inspections
-At any point during your interactions with our BSOs at a port of entry, you may be referred to our secondary services and inspections area.
-
-We understand that travellers may feel anxious when crossing the border. Referrals to secondary inspection are a normal part of the cross-border travel process that any visitor to Canada may experience.
-
-Why you may be referred to secondary inspection
-You may be referred to secondary inspection for a variety of reasons, for example:
-
-carrying out a random inspection;
-verifying your declaration or documentation;
-asking you more in-depth questions about yourself or inspecting your goods;
-determining your admissibility to Canada or the admissibility of the goods in your possession;
-having you pay duty and taxes;
-completing or processing paperwork to support your entry or the entry of your goods to Canada.
-All travellers are protected by the Canadian Charter of Rights and Freedoms. Referrals are not made on any discriminatory basis, such as race, nationality, religion, age or gender.
-
-What to expect from secondary inspections
-If you are referred for Secondary Services or Inspection, an officer may:
-
-ask you to provide detailed information about your plans while visiting Canada, or the time you spent abroad;
-make further enquiries, check records, or conduct research to verify your declaration;
-confirm the guardianship of children travelling with you;
-process the payment of duty and taxes;
-inspect your luggage, purse or wallet, electronics (including laptops and cell phones), your vehicle and any additional goods you are transporting;
-examine visually your pet or any animals travelling with you;
-ask you to produce evidence of the money you have available to fund your visit to Canada;
-request that you produce receipts to account for expenses you incurred or purchases made abroad; or
-count your cash or travellers cheques, in your presence.
-While most travellers we inspect comply with Canadian laws and regulations, we do encounter individuals who are intent on breaking the law and who attempt to avoid detection. That is why the officer may not always answer specific questions about a Secondary Inspection.
-
-Travelling with alcohol and tobacco
-Alcoholic beverages
-You are allowed to bring into Canada only one of the following amounts of alcohol and alcoholic beverages free of duty and taxes:
-
-Alcoholic beverages are products that exceed 0.5% alcohol by volume. Certain alcoholic and wine products that do not exceed 0.5% by volume are not considered alcoholic beverages.
-
-Product Metric Imperial Estimates
-Wine Up to1.5 litres of wine Up to 53 fluid ounces Two 750 ml bottles of wine
-Alcoholic beverages Up to 1.14 litres Up to 40 fluid ounces One large standard bottle of liquor
-Beer or ale Up to 8.5 litres Up to 287 fluid ounces Approximately 24 cans or bottles (355 ml each) of beer or ale.
-You must meet the minimum age of the province or territory where you enter Canada. Minimum ages are established by provincial or territorial authorities: 18 years for Alberta, Manitoba and Quebec and 19 years for the remaining provinces and territories.
-
-The CBSA classifies "cooler" products according to the alcoholic beverage they contain. For example, beer coolers are considered to be beer and wine coolers are considered to be wine.
-
-The quantities of alcoholic beverages you can import must be within the limit set by provincial and territorial liquor control authorities that apply where you will enter Canada. If the amount of alcohol you want to import exceeds your personal exemption, you will be required to pay the duty and taxes as well as any provincial or territorial levies that apply. Contact the appropriate provincial or territorial liquor control authority for more information before you return to Canada.
-
-You must be of legal age in the province of importation. While you are allowed to import more alcoholic beverages than the amounts listed above, you will be responsible for paying duty and taxes on the additional alcoholic beverages you are bringing into Canada.
-
-For more information on bringing alcoholic beverages to Canada, consult the Alcohol and tobacco limits page.
-
-Tobacco products
-As a visitor or a temporary resident, you may bring into Canada, free of duty and taxes, all of the following amounts of tobacco products, as long as these items are in your possession when you arrive in Canada:
-
-You can speed up your clearance by having your tobacco products available for inspection when you arrive.
-
-Whether they are stamped or unstamped, if you bring in tobacco products that exceed your personal exemption, you will be required to pay the regular duty and taxes as well as any provincial or territorial levies that apply on the excess amount.
-
-Note: You must be 18 years of age to bring tobacco products into Canada under your personal exemption.
-
-Stamped Tobacco Products – Personal exemption amounts
-If you wish to import cigarettes, manufactured tobacco and tobacco sticks duty free as part of your personal exemption, the packages must be stamped "duty paid Canada droit acquitté". You will find tobacco products sold at duty-free stores marked this way.
-
-Product Amount
-Cigarettes 200 cigarettes
-Cigars 50 cigars
-Tobacco 200 grams (7 ounces) of manufactured tobacco
-Tobacco sticks 200 tobacco sticks
-Unstamped Tobacco Products – Special duties rate
-A special duty rate applies to cigarettes, manufactured tobacco and tobacco sticks that are not stamped "duty paid Canada droit acquitté".
-
-For example, if you claim a carton of 200 cigarettes as part of your personal exemption and it is not stamped "duty paid Canada droit acquitté", you will be assessed at a special duty rate.
-
-Unstamped Tobacco Products – Import limits
-In addition to your personal exemption amounts, there are limits on the quantity of tobacco products that may be imported if it is not packaged and not stamped "duty paid Canada droit acquitté". The limit is currently five units of tobacco products. One unit of tobacco products consists of one of the following:
-
-Product Amount
-Cigarettes 200 cigarettes
-Cigars 50 cigars
-Tobacco 200 grams (7 ounces) of manufactured tobacco
-Tobacco sticks 200 tobacco sticks
-For short visits, these quantities may be limited to amounts that are appropriate in respect of the nature, purpose, and duration of the visit.
-
-For more information on bringing alcoholic beverages to Canada, consult the Alcohol and tobacco limits page.
-
-Restricted/prohibited goods
-Certain goods are restricted or prohibited in Canada. To avoid the possibility of penalties, including seizure or prosecution, make sure you have the information you need before attempting to bring items into Canada.
-
-The following are some examples of restricted or prohibited goods:
-
-Firearms and weapons: You must declare all weapons and firearms at the CBSA port of entry when you enter Canada.
-Food, plants, animals and related products: All food, plants, animals, and related products must be declared. Food can carry disease, such as E. coli. Plants and plant products can carry invasive alien species, such as the Asian Long-Horned Beetle. Animals and animal products can carry diseases, such as avian influenza and foot-and-mouth disease.
-Explosives, fireworks and ammunition: You must have written authorization and permits to bring explosives, fireworks and certain types of ammunition into Canada.
-Vehicles: Vehicles include any kind of pleasure vehicles such as passenger cars, pickup trucks, snowmobiles and motor homes, as long as you use them for non-commercial purposes. There are many requirements that apply to importing a vehicle.
-Consumer products: Certain consumer products that could pose a danger to the public (e.g., baby walkers, jequirity beans that are often found in art or bead work) are not allowed to be brought into Canada. Canadian residents should be aware of consumer products that have safety requirements in Canada. Many of these safety requirements are stricter than requirements of other countries.
-For more information consult the Restricted and Prohibited Goods page.
-
-Travelling with CAN$10,000 or more
-If you have currency or monetary instruments equal to or greater than CAN$10,000 (or the equivalent in a foreign currency) in your possession when arriving in or departing from Canada, you must report to the CBSA. Monetary instruments include items such as stocks, bonds, bank drafts, cheques, and travellers' cheques.
-
-This regulation applies to currency and monetary instruments you have on your person, in your baggage and/or in your vehicle.
-
-When you arrive in Canada with CAN$10,000 or more in your possession, you must report it on the CBSA Declaration Card (if one was provided to you), or in the verbal declaration made to a BSO.
-
-When you leave Canada by air with CAN$10,000 or more in your possession, you must report to the CBSA office within the airport, before clearing security or, if leaving by land or boat, report your intent to export to the CBSA at one of our offices.
-
-For more information, including instructions on how to report your intent to import or export currency in person, by mail, or by courier, you can consult Travelling with CAN$10,000 or more.
-
-Travelling with gifts
-If you are travelling with gifts, do not wrap them before crossing the border. If a gift is wrapped, a BSO may need to un-wrap the gift to examine the goods you are bringing into Canada.
-
-Can I enter Canada?
-Resources for visitors
-Come to Canada to settle, study or work
+Travellers
+Visitors to Canada
+Have proper identification
+You must carry proper identification for yourself and any children travelling with you to help confirm your legal right or authorization to enter Canada when you arrive.
+
+All visitors arriving from or transiting through the United States should visit the U.S. Customs and Border Protection website for information concerning the requirements to enter, transit through, or return to the United States.
+
+Identification requirements for U. S. citizens and permanent residents
+If you are a U.S. citizen or permanent resident, you must carry proof of citizenship such as a passport, birth certificate, a certificate of citizenship or naturalization, a U.S. Permanent Resident Card, or a Certificate of Indian Status along with photo identification. If you are a U.S. permanent resident, ensure you carry proof of your status such as a U.S. Permanent Resident Card.
+
+For members of a Trusted Traveller program
+U.S. citizens
+U.S. citizens who are members of the NEXUS program can use their membership card as proof of identification and citizenship when entering Canada by land, air or water. This applies when you are using either conventional or NEXUS-only lanes. U.S. citizens who are members of FAST may use their membership card when entering Canada by land or water only. When travelling by air, FAST cards will only be accepted as proof of identification when you are travelling to Canada from the U.S.
+
+U.S. permanent residents
+NEXUS and FAST members who are permanent residents of the U.S. must still travel with a passport and proof of permanent residence. You may be asked to present these documents to the Border Services Officer (BSO) when you arrive at the border.
+
+No matter your mode of travel, we recommend you carry a valid passport for all travel abroad, including visits to Canada from the United States. A passport may be required by your airline or other transportation authority, since it is the only universally-accepted identification document.
+
+Identification requirements for international visitors
+All international travellers must carry acceptable identification and a valid visa (if necessary) when entering Canada. A passport is recommended because it is the only reliable and universally-accepted travel and identification document for the purpose of international travel.
+
+Electronic Travel Authorization
+Visa-exempt foreign nationals need an Electronic Travel Authorization (eTA) to fly to or transit through Canada. Exceptions include U.S. citizens, U.S. Lawful Permanent Residents (USLPR) and travellers with a valid Canadian visa. Canadian citizens, including dual citizens, and Canadian permanent residents cannot apply for an eTA.
+
+Foreign nationals from select visa-required countries may also travel to or through Canada by air using an eTA, if eligible.
+
+Be prepared: Apply for an eTA before you book your flight to Canada. Most applicants get approved within minutes. However, some applications can take several days to process so don’t wait until the last minute. Get help if you have questions before, during or after you apply.
+
+Fake websites
+Travellers who apply for an eTA are advised to be cautious in all dealings with companies that claim to offer help in getting an eTA. These companies are not operating on behalf of the Government of Canada. Many have established websites that charge a fee to provide information and submit eTA applications.
+
+This Government of Canada website is the official place to apply for an eTA.
+
+Travelling with minors
+BSOs watch for missing persons, and may ask detailed questions about any minors travelling with you.
+
+Visit the Children and travel page for more information about travelling abroad with minors.
+
+What you can bring with you
+As a visitor, you can bring certain goods into Canada for your own use as personal baggage. Personal baggage includes clothing, camping and sports equipment, cameras and personal computers. This also includes your mode of transportation, including vehicles, private boats and aircraft.
+
+You must declare all goods when you arrive at the first CBSA port of entry. Our BSOs check goods you are bringing in or taking out of Canada to verify what you have declared. If you declare goods when you arrive and take them back with you when you leave, you will not have to pay any duty or taxes. These goods cannot be:
+
+used by a resident of Canada;
+used on behalf of a business based in Canada;
+given as a gift to a Canadian resident; or
+disposed of or left behind in Canada.
+The BSO may ask you to leave a security deposit for your goods. Your deposit will be refunded when you leave Canada with the goods. If this happens, you will be issued a Temporary Admission Permit. We will keep a copy and give you one for your records. When you leave Canada, bring your goods and your copy of the Temporary Admission Permit, to the BSO. You will get a receipt and your security deposit will be refunded by mail.
+
+Making your Declaration
+When arriving in Canada you must, by Canadian law, report to a BSO, answer all questions truthfully, and accurately report your goods. This means you must also report any food, plant and animal products in your possession.
+
+Have all required identification and travel documents in hand. Be ready to make a full and accurate declaration, including the amount of goods in Canadian dollars you are bringing with you. This will help us get you on your way as quickly as possible.
+
+Arriving by air or by land: If you are arriving by air or by land, follow the signs to the first checkpoint. A BSO will check your identification and other travel documents and you will answer their questions.
+
+Arriving by private boat: If you are arriving by private boat, go directly to a designated marine telephone reporting site and call the Telephone Reporting Centre (TRC) at 1-888-226-7277 to get CBSA clearance. For more information, visit the Private boaters page.
+
+CBSA Declaration Card
+The CBSA Declaration Card tells us what we need to know about you, your travels and what you are bringing into the country. CBSA Declaration Cards are given to passengers arriving by air, and are also used at some locations for travellers arriving by train, boat and bus. Bring a pen in your carry-on baggage to complete the card before you arrive.
+
+Instructions on how to complete the card are attached to the form. You can list up to four people living at the same residence on one card. If there are more than four people living at your address use one additional card for each additional group of four or fewer people. Once the cards are complete you can detach and discard the instructions. Do not fold the card.
+
+Be sure to keep the card handy along with your identification and other travel documents. You will be asked to show this card to our BSOs several times.
+
+If you have any questions about the card or Canadian regulations, ask the BSO when you arrive.
+
+Referrals for secondary services and inspections
+At any point during your interactions with our BSOs at a port of entry, you may be referred to our secondary services and inspections area.
+
+We understand that travellers may feel anxious when crossing the border. Referrals to secondary inspection are a normal part of the cross-border travel process that any visitor to Canada may experience.
+
+Why you may be referred to secondary inspection
+You may be referred to secondary inspection for a variety of reasons, for example:
+
+carrying out a random inspection;
+verifying your declaration or documentation;
+asking you more in-depth questions about yourself or inspecting your goods;
+determining your admissibility to Canada or the admissibility of the goods in your possession;
+having you pay duty and taxes;
+completing or processing paperwork to support your entry or the entry of your goods to Canada.
+All travellers are protected by the Canadian Charter of Rights and Freedoms. Referrals are not made on any discriminatory basis, such as race, nationality, religion, age or gender.
+
+What to expect from secondary inspections
+If you are referred for Secondary Services or Inspection, an officer may:
+
+ask you to provide detailed information about your plans while visiting Canada, or the time you spent abroad;
+make further enquiries, check records, or conduct research to verify your declaration;
+confirm the guardianship of children travelling with you;
+process the payment of duty and taxes;
+inspect your luggage, purse or wallet, electronics (including laptops and cell phones), your vehicle and any additional goods you are transporting;
+examine visually your pet or any animals travelling with you;
+ask you to produce evidence of the money you have available to fund your visit to Canada;
+request that you produce receipts to account for expenses you incurred or purchases made abroad; or
+count your cash or travellers cheques, in your presence.
+While most travellers we inspect comply with Canadian laws and regulations, we do encounter individuals who are intent on breaking the law and who attempt to avoid detection. That is why the officer may not always answer specific questions about a Secondary Inspection.
+
+Travelling with alcohol and tobacco
+Alcoholic beverages
+You are allowed to bring into Canada only one of the following amounts of alcohol and alcoholic beverages free of duty and taxes:
+
+Alcoholic beverages are products that exceed 0.5% alcohol by volume. Certain alcoholic and wine products that do not exceed 0.5% by volume are not considered alcoholic beverages.
+
+Product Metric Imperial Estimates
+Wine Up to1.5 litres of wine Up to 53 fluid ounces Two 750 ml bottles of wine
+Alcoholic beverages Up to 1.14 litres Up to 40 fluid ounces One large standard bottle of liquor
+Beer or ale Up to 8.5 litres Up to 287 fluid ounces Approximately 24 cans or bottles (355 ml each) of beer or ale.
+You must meet the minimum age of the province or territory where you enter Canada. Minimum ages are established by provincial or territorial authorities: 18 years for Alberta, Manitoba and Quebec and 19 years for the remaining provinces and territories.
+
+The CBSA classifies "cooler" products according to the alcoholic beverage they contain. For example, beer coolers are considered to be beer and wine coolers are considered to be wine.
+
+The quantities of alcoholic beverages you can import must be within the limit set by provincial and territorial liquor control authorities that apply where you will enter Canada. If the amount of alcohol you want to import exceeds your personal exemption, you will be required to pay the duty and taxes as well as any provincial or territorial levies that apply. Contact the appropriate provincial or territorial liquor control authority for more information before you return to Canada.
+
+You must be of legal age in the province of importation. While you are allowed to import more alcoholic beverages than the amounts listed above, you will be responsible for paying duty and taxes on the additional alcoholic beverages you are bringing into Canada.
+
+For more information on bringing alcoholic beverages to Canada, consult the Alcohol and tobacco limits page.
+
+Tobacco products
+As a visitor or a temporary resident, you may bring into Canada, free of duty and taxes, all of the following amounts of tobacco products, as long as these items are in your possession when you arrive in Canada:
+
+You can speed up your clearance by having your tobacco products available for inspection when you arrive.
+
+Whether they are stamped or unstamped, if you bring in tobacco products that exceed your personal exemption, you will be required to pay the regular duty and taxes as well as any provincial or territorial levies that apply on the excess amount.
+
+Note: You must be 18 years of age to bring tobacco products into Canada under your personal exemption.
+
+Stamped Tobacco Products – Personal exemption amounts
+If you wish to import cigarettes, manufactured tobacco and tobacco sticks duty free as part of your personal exemption, the packages must be stamped "duty paid Canada droit acquitté". You will find tobacco products sold at duty-free stores marked this way.
+
+Product Amount
+Cigarettes 200 cigarettes
+Cigars 50 cigars
+Tobacco 200 grams (7 ounces) of manufactured tobacco
+Tobacco sticks 200 tobacco sticks
+Unstamped Tobacco Products – Special duties rate
+A special duty rate applies to cigarettes, manufactured tobacco and tobacco sticks that are not stamped "duty paid Canada droit acquitté".
+
+For example, if you claim a carton of 200 cigarettes as part of your personal exemption and it is not stamped "duty paid Canada droit acquitté", you will be assessed at a special duty rate.
+
+Unstamped Tobacco Products – Import limits
+In addition to your personal exemption amounts, there are limits on the quantity of tobacco products that may be imported if it is not packaged and not stamped "duty paid Canada droit acquitté". The limit is currently five units of tobacco products. One unit of tobacco products consists of one of the following:
+
+Product Amount
+Cigarettes 200 cigarettes
+Cigars 50 cigars
+Tobacco 200 grams (7 ounces) of manufactured tobacco
+Tobacco sticks 200 tobacco sticks
+For short visits, these quantities may be limited to amounts that are appropriate in respect of the nature, purpose, and duration of the visit.
+
+For more information on bringing alcoholic beverages to Canada, consult the Alcohol and tobacco limits page.
+
+Restricted/prohibited goods
+Certain goods are restricted or prohibited in Canada. To avoid the possibility of penalties, including seizure or prosecution, make sure you have the information you need before attempting to bring items into Canada.
+
+The following are some examples of restricted or prohibited goods:
+
+Firearms and weapons: You must declare all weapons and firearms at the CBSA port of entry when you enter Canada.
+Food, plants, animals and related products: All food, plants, animals, and related products must be declared. Food can carry disease, such as E. coli. Plants and plant products can carry invasive alien species, such as the Asian Long-Horned Beetle. Animals and animal products can carry diseases, such as avian influenza and foot-and-mouth disease.
+Explosives, fireworks and ammunition: You must have written authorization and permits to bring explosives, fireworks and certain types of ammunition into Canada.
+Vehicles: Vehicles include any kind of pleasure vehicles such as passenger cars, pickup trucks, snowmobiles and motor homes, as long as you use them for non-commercial purposes. There are many requirements that apply to importing a vehicle.
+Consumer products: Certain consumer products that could pose a danger to the public (e.g., baby walkers, jequirity beans that are often found in art or bead work) are not allowed to be brought into Canada. Canadian residents should be aware of consumer products that have safety requirements in Canada. Many of these safety requirements are stricter than requirements of other countries.
+For more information consult the Restricted and Prohibited Goods page.
+
+Travelling with CAN$10,000 or more
+If you have currency or monetary instruments equal to or greater than CAN$10,000 (or the equivalent in a foreign currency) in your possession when arriving in or departing from Canada, you must report to the CBSA. Monetary instruments include items such as stocks, bonds, bank drafts, cheques, and travellers' cheques.
+
+This regulation applies to currency and monetary instruments you have on your person, in your baggage and/or in your vehicle.
+
+When you arrive in Canada with CAN$10,000 or more in your possession, you must report it on the CBSA Declaration Card (if one was provided to you), or in the verbal declaration made to a BSO.
+
+When you leave Canada by air with CAN$10,000 or more in your possession, you must report to the CBSA office within the airport, before clearing security or, if leaving by land or boat, report your intent to export to the CBSA at one of our offices.
+
+For more information, including instructions on how to report your intent to import or export currency in person, by mail, or by courier, you can consult Travelling with CAN$10,000 or more.
+
+Travelling with gifts
+If you are travelling with gifts, do not wrap them before crossing the border. If a gift is wrapped, a BSO may need to un-wrap the gift to examine the goods you are bringing into Canada.
+
+Can I enter Canada?
+Resources for visitors
+Come to Canada to settle, study or work
Refugees (Immigration, Refugees and Citizenship Canada website)
\ No newline at end of file
diff --git a/tests/test_dataset_variability.py b/tests/test_dataset_variability.py
new file mode 100644
index 0000000..05c6776
--- /dev/null
+++ b/tests/test_dataset_variability.py
@@ -0,0 +1,31 @@
+import pytest
+import numpy as np
+from faiss_search import DatasetVariability
+
+@pytest.fixture(scope="module")
+def embeddings():
+ return np.load("tests/resources/sample_embedding_files/paraphrasemini-space-embeddings.npy")
+
+def test_normalization(embeddings):
+    """Rows of ``embedding`` have unit L2 norm when built with the defaults."""
+    dataset = DatasetVariability(embeddings)
+    row_norms = np.linalg.norm(dataset.embedding, axis=1)
+    assert np.allclose(row_norms, np.ones(embeddings.shape[0]), atol=1e-5)
+
+def test_no_normalization(embeddings):
+    """With ``normalize=False`` the row norms must not all be close to 1."""
+    raw = DatasetVariability(embeddings, normalize=False)
+    row_norms = np.linalg.norm(raw.embedding, axis=1)
+    assert not np.allclose(row_norms, np.ones(embeddings.shape[0]), atol=1e-5)
+
+@pytest.fixture(scope="module")
+def variability(embeddings):
+ return DatasetVariability(embeddings)
+
+def test_cosine_similarity_in_range(variability):
+    """The average cosine similarity is expected to fall within [0, 1]."""
+    assert 0 <= variability.cosine_similarity_avg() <= 1
+
+def test_variance_in_range(variability):
+    """The reported variance is expected to fall within [0, 1]."""
+    assert 0 <= variability.variance() <= 1
diff --git a/tests/test_faiss_index.py b/tests/test_faiss_index.py
index 4f018c6..208fe5f 100644
--- a/tests/test_faiss_index.py
+++ b/tests/test_faiss_index.py
@@ -1,210 +1,210 @@
-import os
-import pytest
-import faiss
-import numpy as np
-from faiss_search import faiss_index
-
-test_embeddings = np.load("tests/resources/faiss_test_files/sample_embeddings.npy")
-query_vec = np.load("tests/resources/faiss_test_files/sample_query_vector.npy")
-
-metric_dict = {
- "L2": 1,
- "IP": 0
-}
-
-
-create_flat_variable_names = "embeddings, file_path, metric"
-create_flat_values = [
- (test_embeddings, None, None),
- (test_embeddings, "tests/resources/faiss_test_files/flat_index.faiss", None),
- (test_embeddings[:1,:], None, None),
- (test_embeddings, None, "L2"),
- (test_embeddings, None, "IP"),
- (test_embeddings, None, "Test")
-]
-@pytest.mark.parametrize(create_flat_variable_names, create_flat_values)
-def test_create_flat_index(embeddings, file_path, metric):
- index = faiss_index.create_flat_index(embeddings, file_path, metric)
- assert isinstance(index.index, faiss.IndexFlat)
- assert index.index.ntotal == embeddings.shape[0]
- if file_path is not None:
- assert os.path.exists(file_path)
- os.remove(file_path)
- try:
- metric_id = metric_dict[metric]
- except KeyError:
- metric_id = 1
- assert index.index.metric_type == metric_id
-
-query_flat_variable_names = "embeddings, xq, k"
-query_flat_values = [
- (test_embeddings, query_vec, 0),
- (test_embeddings, query_vec, 3),
- (test_embeddings, query_vec, test_embeddings.shape[0]),
- (test_embeddings, query_vec, test_embeddings.shape[0] + 1)
-]
-@pytest.mark.parametrize(query_flat_variable_names, query_flat_values)
-def test_flat_query(embeddings, xq, k):
- index = faiss_index.create_flat_index(embeddings)
- if k < 1:
- with pytest.raises(Exception):
- index.query(xq, k)
- else:
- D, I = index.query(xq, k)
- assert D.shape == I.shape
- assert D.shape[1] == k
-
-create_ivf_variable_names = "embeddings, nlist, file_path, metric"
-create_ivf_values = [
- (test_embeddings, None, None, None),
- (test_embeddings, None, "tests/resources/faiss_test_files/ivf_index.faiss", None),
- (test_embeddings, 1, None, None),
- (test_embeddings, 2, None, None),
- (test_embeddings, 10, "tests/resources/faiss_test_files/ivf_index.faiss", "L2"),
- (test_embeddings, test_embeddings.shape[0], None, None),
- (test_embeddings, test_embeddings.shape[0] // 2, None, None),
- (test_embeddings[:1,:], None, None, None),
- (test_embeddings, None, None, "IP"),
- (test_embeddings, None, None, "L2"),
- (test_embeddings, None, None, "Test")
-]
-@pytest.mark.parametrize(create_ivf_variable_names, create_ivf_values)
-def test_create_ivf_flat_index(embeddings, nlist, file_path, metric):
- index = faiss_index.create_IVF_flat_index(embeddings, nlist, file_path, metric)
- assert isinstance(index.index, faiss.IndexIVFFlat)
- assert index.index.ntotal == embeddings.shape[0]
- if nlist is not None:
- assert index.index.nlist == nlist
- if file_path is not None:
- assert os.path.exists(file_path)
- os.remove(file_path)
- try:
- metric_id = metric_dict[metric]
- except KeyError:
- metric_id = 1
- assert index.index.metric_type == metric_id
-
-create_ivf_error_values = [
- (test_embeddings, test_embeddings.shape[0] + 1, None, None),
- (test_embeddings, -1, None, None),
- (test_embeddings, 2.5, None, None)
-]
-@pytest.mark.parametrize(create_ivf_variable_names, create_ivf_error_values)
-def test_create_ivf_flat_index_hyperparameter_errors(embeddings, nlist, file_path, metric):
- with pytest.raises(Exception):
- faiss_index.create_IVF_flat_index(embeddings, nlist, file_path, metric)
-
-query_ivf_variable_names = "embeddings, nlist, xq, k, nprobe"
-query_ivf_values = [
- (test_embeddings, 5, query_vec, 0, None),
- (test_embeddings, 5, query_vec, 3, None),
- (test_embeddings, 5, query_vec, test_embeddings.shape[0], None),
- (test_embeddings, 5, query_vec, test_embeddings.shape[0] + 1, None),
- (test_embeddings, 5, query_vec, 1, 0),
- (test_embeddings, 3, query_vec, 1, 4),
- (test_embeddings, 3, query_vec, 2, 1),
- (test_embeddings, 3, query_vec, 5, 1),
- (test_embeddings, 3, query_vec, 1, 5),
- (test_embeddings, 3, query_vec, 4, 5),
- (test_embeddings, 3, query_vec, 5, 4),
- (test_embeddings, 1, query_vec, 3, 1),
- (test_embeddings, 3, query_vec, 3, 3),
- (test_embeddings, test_embeddings.shape[0], query_vec, 3, test_embeddings.shape[0])
-]
-@pytest.mark.parametrize(query_ivf_variable_names, query_ivf_values)
-def test_ivf_query(embeddings, nlist, xq, k, nprobe):
- index = faiss_index.create_IVF_flat_index(embeddings, nlist)
- if k < 1:
- with pytest.raises(Exception):
- index.query(xq, k, nprobe)
- elif (nprobe is not None) and (nprobe not in range(1, nlist + 1)):
- with pytest.raises(Exception):
- index.query(xq, k, nprobe)
- else:
- D, I = index.query(xq, k, nprobe)
- assert D.shape == I.shape
- assert D.shape[1] == k
- # check against the flat index
- if nlist == nprobe:
- flat = faiss_index.create_flat_index(embeddings)
- Df, If = flat.query(query_vec, k)
- assert np.array_equal(D, Df)
- assert np.array_equal(I, If)
-
-create_hnsw_variable_names = "embeddings, M, efConstruction, file_path, metric"
-create_hnsw_values = [
- (test_embeddings, None, None, None, None),
- (test_embeddings, None, None, "tests/resources/faiss_test_files/hnsw_index.faiss", None),
- (test_embeddings, 32, None, None, None),
- (test_embeddings, None, 40, None, None),
- (test_embeddings, 128, 40, None, None),
- (test_embeddings, 128, 40, "tests/resources/faiss_test_files/hnsw_index.faiss", None),
- (test_embeddings[:1,:], None, None, None, None),
- (test_embeddings, None, None, None, "IP"),
- (test_embeddings, None, None, None, "L2"),
- (test_embeddings, None, None, None, "Test")
-]
-@pytest.mark.parametrize(create_hnsw_variable_names, create_hnsw_values)
-def test_create_hnsw_index(embeddings, M, efConstruction, file_path, metric):
- index = faiss_index.create_HNSW_index(embeddings, M, efConstruction, file_path, metric)
- assert isinstance(index.index, faiss.IndexHNSWFlat)
- assert index.index.ntotal == embeddings.shape[0]
- if file_path is not None:
- assert os.path.exists(file_path)
- os.remove(file_path)
- try:
- metric_id = metric_dict[metric]
- except KeyError:
- metric_id = 1
- assert index.index.metric_type == metric_id
-
-create_hnsw_error_values = [
- (test_embeddings, -1, None, None, None),
- (test_embeddings, -1, -1, None, None),
- (test_embeddings, None, -1, None, None),
- (test_embeddings, 5.5, 10, None, None),
- (test_embeddings, 5, 7.5, None, None),
- (test_embeddings, 2.5, 2.5, None, None),
- (test_embeddings, -1, 2.5, None, None)
-]
-@pytest.mark.parametrize(create_hnsw_variable_names, create_hnsw_error_values)
-def test_create_hnsw_index_hyperparameter_errors(embeddings, M, efConstruction, file_path, metric):
- with pytest.raises(Exception):
- faiss_index.create_HNSW_index(embeddings, M, efConstruction, file_path, metric)
-
-query_hnsw_variable_names = "embeddings, M, efConstruction, xq, k, efSearch"
-query_hnsw_values = [
- (test_embeddings, 32, 32, query_vec, 0, None),
- (test_embeddings, 32, 32, query_vec, 3, None),
- (test_embeddings, 32, 32, query_vec, test_embeddings.shape[0], None),
- (test_embeddings, 32, 32, query_vec, test_embeddings.shape[0] + 1, None),
- (test_embeddings, 32, 32, query_vec, 1, 0),
- (test_embeddings, 32, 32, query_vec, 1, 16),
- (test_embeddings, 32, 32, query_vec, 2, 1),
- (test_embeddings, 32, 32, query_vec, 3, 64)
-]
-@pytest.mark.parametrize(query_hnsw_variable_names, query_hnsw_values)
-def test_hnsw_query(embeddings, M, efConstruction, xq, k, efSearch):
- index = faiss_index.create_HNSW_index(embeddings, M, efConstruction)
- if k < 1:
- with pytest.raises(Exception):
- index.query(xq, k, efSearch)
- elif (efSearch is not None) and (efSearch < 1):
- with pytest.raises(Exception):
- index.query(xq, k, efSearch)
- else:
- D, I = index.query(xq, k, efSearch)
- assert D.shape == I.shape
- assert D.shape[1] == k
-
-load_index_variable_names = "file_path, index_type"
-load_index_values = [
- ("tests/resources/faiss_test_files/flat.faiss", faiss_index.flat_index.FlatIndex),
- ("tests/resources/faiss_test_files/ivf.faiss", faiss_index.IVF_flat_index.IVFFlatIndex),
- ("tests/resources/faiss_test_files/hnsw.faiss", faiss_index.HNSW_index.HNSWIndex),
- ("tests/resources/faiss_test_files/ivfpq.faiss", faiss_index.general_index.GeneralIndex)
-]
-@pytest.mark.parametrize(load_index_variable_names, load_index_values)
-def test_load_index(file_path, index_type):
- index = faiss_index.load_index(file_path)
- assert isinstance(index, index_type)
+import os
+import pytest
+import faiss
+import numpy as np
+from faiss_search import faiss_index
+
+test_embeddings = np.load("tests/resources/faiss_test_files/sample_embeddings.npy")
+query_vec = np.load("tests/resources/faiss_test_files/sample_query_vector.npy")
+
+metric_dict = {
+ "L2": 1,
+ "IP": 0
+}
+
+
+create_flat_variable_names = "embeddings, file_path, metric"
+create_flat_values = [
+ (test_embeddings, None, None),
+ (test_embeddings, "tests/resources/faiss_test_files/flat_index.faiss", None),
+ (test_embeddings[:1,:], None, None),
+ (test_embeddings, None, "L2"),
+ (test_embeddings, None, "IP"),
+ (test_embeddings, None, "Test")
+]
+@pytest.mark.parametrize(create_flat_variable_names, create_flat_values)
+def test_create_flat_index(embeddings, file_path, metric):
+    """Build a flat index and check its type, size, saved file and metric."""
+    index = faiss_index.create_flat_index(embeddings, file_path, metric)
+    assert isinstance(index.index, faiss.IndexFlat)
+    assert index.index.ntotal == embeddings.shape[0]
+    if file_path is not None:
+        assert os.path.exists(file_path)
+        os.remove(file_path)
+    # A metric name missing from metric_dict (None included) is expected to
+    # produce metric id 1, the same id metric_dict lists for "L2".
+    expected_metric = metric_dict.get(metric, 1)
+    assert index.index.metric_type == expected_metric
+
+query_flat_variable_names = "embeddings, xq, k"
+query_flat_values = [
+ (test_embeddings, query_vec, 0),
+ (test_embeddings, query_vec, 3),
+ (test_embeddings, query_vec, test_embeddings.shape[0]),
+ (test_embeddings, query_vec, test_embeddings.shape[0] + 1)
+]
+@pytest.mark.parametrize(query_flat_variable_names, query_flat_values)
+def test_flat_query(embeddings, xq, k):
+ index = faiss_index.create_flat_index(embeddings)
+ if k < 1:
+ with pytest.raises(Exception):
+ index.query(xq, k)
+ else:
+ D, I = index.query(xq, k)
+ assert D.shape == I.shape
+ assert D.shape[1] == k
+
+create_ivf_variable_names = "embeddings, nlist, file_path, metric"
+create_ivf_values = [
+ (test_embeddings, None, None, None),
+ (test_embeddings, None, "tests/resources/faiss_test_files/ivf_index.faiss", None),
+ (test_embeddings, 1, None, None),
+ (test_embeddings, 2, None, None),
+ (test_embeddings, 10, "tests/resources/faiss_test_files/ivf_index.faiss", "L2"),
+ (test_embeddings, test_embeddings.shape[0], None, None),
+ (test_embeddings, test_embeddings.shape[0] // 2, None, None),
+ (test_embeddings[:1,:], None, None, None),
+ (test_embeddings, None, None, "IP"),
+ (test_embeddings, None, None, "L2"),
+ (test_embeddings, None, None, "Test")
+]
+@pytest.mark.parametrize(create_ivf_variable_names, create_ivf_values)
+def test_create_ivf_flat_index(embeddings, nlist, file_path, metric):
+    """Build an IVF flat index and check type, size, nlist, file and metric."""
+    index = faiss_index.create_IVF_flat_index(embeddings, nlist, file_path, metric)
+    assert isinstance(index.index, faiss.IndexIVFFlat)
+    assert index.index.ntotal == embeddings.shape[0]
+    if nlist is not None:
+        assert index.index.nlist == nlist
+    if file_path is not None:
+        assert os.path.exists(file_path)
+        os.remove(file_path)
+    # A metric name missing from metric_dict (None included) is expected to
+    # produce metric id 1, the same id metric_dict lists for "L2".
+    expected_metric = metric_dict.get(metric, 1)
+    assert index.index.metric_type == expected_metric
+
+create_ivf_error_values = [
+ (test_embeddings, test_embeddings.shape[0] + 1, None, None),
+ (test_embeddings, -1, None, None),
+ (test_embeddings, 2.5, None, None)
+]
+@pytest.mark.parametrize(create_ivf_variable_names, create_ivf_error_values)
+def test_create_ivf_flat_index_hyperparameter_errors(embeddings, nlist, file_path, metric):
+ with pytest.raises(Exception):
+ faiss_index.create_IVF_flat_index(embeddings, nlist, file_path, metric)
+
+query_ivf_variable_names = "embeddings, nlist, xq, k, nprobe"
+query_ivf_values = [
+ (test_embeddings, 5, query_vec, 0, None),
+ (test_embeddings, 5, query_vec, 3, None),
+ (test_embeddings, 5, query_vec, test_embeddings.shape[0], None),
+ (test_embeddings, 5, query_vec, test_embeddings.shape[0] + 1, None),
+ (test_embeddings, 5, query_vec, 1, 0),
+ (test_embeddings, 3, query_vec, 1, 4),
+ (test_embeddings, 3, query_vec, 2, 1),
+ (test_embeddings, 3, query_vec, 5, 1),
+ (test_embeddings, 3, query_vec, 1, 5),
+ (test_embeddings, 3, query_vec, 4, 5),
+ (test_embeddings, 3, query_vec, 5, 4),
+ (test_embeddings, 1, query_vec, 3, 1),
+ (test_embeddings, 3, query_vec, 3, 3),
+ (test_embeddings, test_embeddings.shape[0], query_vec, 3, test_embeddings.shape[0])
+]
+@pytest.mark.parametrize(query_ivf_variable_names, query_ivf_values)
+def test_ivf_query(embeddings, nlist, xq, k, nprobe):
+    """Query an IVF flat index; an invalid ``k`` or ``nprobe`` must raise.
+
+    With ``nprobe == nlist`` the result must equal a flat-index query of ``xq``."""
+    index = faiss_index.create_IVF_flat_index(embeddings, nlist)
+    bad_nprobe = (nprobe is not None) and (nprobe not in range(1, nlist + 1))
+    if k < 1 or bad_nprobe:
+        with pytest.raises(Exception):
+            index.query(xq, k, nprobe)
+        return
+    D, I = index.query(xq, k, nprobe)
+    assert D.shape == I.shape
+    assert D.shape[1] == k
+    if nlist == nprobe:
+        # Compare against the same query (xq), not the module-level query_vec.
+        Df, If = faiss_index.create_flat_index(embeddings).query(xq, k)
+        assert np.array_equal(D, Df)
+        assert np.array_equal(I, If)
+
+create_hnsw_variable_names = "embeddings, M, efConstruction, file_path, metric"
+create_hnsw_values = [
+ (test_embeddings, None, None, None, None),
+ (test_embeddings, None, None, "tests/resources/faiss_test_files/hnsw_index.faiss", None),
+ (test_embeddings, 32, None, None, None),
+ (test_embeddings, None, 40, None, None),
+ (test_embeddings, 128, 40, None, None),
+ (test_embeddings, 128, 40, "tests/resources/faiss_test_files/hnsw_index.faiss", None),
+ (test_embeddings[:1,:], None, None, None, None),
+ (test_embeddings, None, None, None, "IP"),
+ (test_embeddings, None, None, None, "L2"),
+ (test_embeddings, None, None, None, "Test")
+]
+@pytest.mark.parametrize(create_hnsw_variable_names, create_hnsw_values)
+def test_create_hnsw_index(embeddings, M, efConstruction, file_path, metric):
+    """Build an HNSW index and check its type, size, saved file and metric."""
+    index = faiss_index.create_HNSW_index(embeddings, M, efConstruction, file_path, metric)
+    assert isinstance(index.index, faiss.IndexHNSWFlat)
+    assert index.index.ntotal == embeddings.shape[0]
+    if file_path is not None:
+        assert os.path.exists(file_path)
+        os.remove(file_path)
+    # A metric name missing from metric_dict (None included) is expected to
+    # produce metric id 1, the same id metric_dict lists for "L2".
+    expected_metric = metric_dict.get(metric, 1)
+    assert index.index.metric_type == expected_metric
+
+create_hnsw_error_values = [
+ (test_embeddings, -1, None, None, None),
+ (test_embeddings, -1, -1, None, None),
+ (test_embeddings, None, -1, None, None),
+ (test_embeddings, 5.5, 10, None, None),
+ (test_embeddings, 5, 7.5, None, None),
+ (test_embeddings, 2.5, 2.5, None, None),
+ (test_embeddings, -1, 2.5, None, None)
+]
+@pytest.mark.parametrize(create_hnsw_variable_names, create_hnsw_error_values)
+def test_create_hnsw_index_hyperparameter_errors(embeddings, M, efConstruction, file_path, metric):
+ with pytest.raises(Exception):
+ faiss_index.create_HNSW_index(embeddings, M, efConstruction, file_path, metric)
+
+query_hnsw_variable_names = "embeddings, M, efConstruction, xq, k, efSearch"
+query_hnsw_values = [
+ (test_embeddings, 32, 32, query_vec, 0, None),
+ (test_embeddings, 32, 32, query_vec, 3, None),
+ (test_embeddings, 32, 32, query_vec, test_embeddings.shape[0], None),
+ (test_embeddings, 32, 32, query_vec, test_embeddings.shape[0] + 1, None),
+ (test_embeddings, 32, 32, query_vec, 1, 0),
+ (test_embeddings, 32, 32, query_vec, 1, 16),
+ (test_embeddings, 32, 32, query_vec, 2, 1),
+ (test_embeddings, 32, 32, query_vec, 3, 64)
+]
+@pytest.mark.parametrize(query_hnsw_variable_names, query_hnsw_values)
+def test_hnsw_query(embeddings, M, efConstruction, xq, k, efSearch):
+ index = faiss_index.create_HNSW_index(embeddings, M, efConstruction)
+ if k < 1:
+ with pytest.raises(Exception):
+ index.query(xq, k, efSearch)
+ elif (efSearch is not None) and (efSearch < 1):
+ with pytest.raises(Exception):
+ index.query(xq, k, efSearch)
+ else:
+ D, I = index.query(xq, k, efSearch)
+ assert D.shape == I.shape
+ assert D.shape[1] == k
+
+load_index_variable_names = "file_path, index_type"
+load_index_values = [
+ ("tests/resources/faiss_test_files/flat.faiss", faiss_index.flat_index.FlatIndex),
+ ("tests/resources/faiss_test_files/ivf.faiss", faiss_index.IVF_flat_index.IVFFlatIndex),
+ ("tests/resources/faiss_test_files/hnsw.faiss", faiss_index.HNSW_index.HNSWIndex),
+ ("tests/resources/faiss_test_files/ivfpq.faiss", faiss_index.general_index.GeneralIndex)
+]
+@pytest.mark.parametrize(load_index_variable_names, load_index_values)
+def test_load_index(file_path, index_type):
+ index = faiss_index.load_index(file_path)
+ assert isinstance(index, index_type)
diff --git a/tests/test_search_directory.py b/tests/test_search_directory.py
index 0a4bf3a..7965c1f 100644
--- a/tests/test_search_directory.py
+++ b/tests/test_search_directory.py
@@ -1,163 +1,239 @@
-import os
-import pytest
-import shutil
-import numpy as np
-from faiss_search import SearchDirectory
-
-@pytest.fixture(scope="module")
-def resource_folder():
- return "tests/resources/sample_text_files"
-
-@pytest.fixture(scope="module")
-def embedding_model():
- return "paraphrase-MiniLM-L3-v2"
-
-# Test chunking step
-
-def test_empty_directory(tmp_path):
- SearchDirectory(tmp_path)
- assert not any(os.scandir(tmp_path))
-
-def test_chunk_from_report(resource_folder, tmp_path):
- search = SearchDirectory(tmp_path)
- search.report_from_directory(resource_folder)
- assert os.path.exists(tmp_path / "report.csv")
- search.chunk_text()
- assert os.path.exists(tmp_path / "data_chunked.csv")
- assert os.path.exists(tmp_path / "setup_data.json")
-
-def test_chunk_without_file(tmp_path):
- search = SearchDirectory(tmp_path)
- with pytest.raises(Exception):
- search.chunk_text()
-
-def test_load_chunks_without_csv(tmp_path):
- search = SearchDirectory(tmp_path)
- with pytest.raises(Exception):
- search.chunk_text("tests/resources/search_directory_test_files/Test_excel_file.xlsx")
-
-def test_load_chunks_with_name_issues(tmp_path):
- search = SearchDirectory(tmp_path)
- with pytest.raises(Exception):
- search.chunk_text("tests/resources/search_directory_test_files/2021_Census_English.csv")
-
-@pytest.fixture()
-def directory_with_chunks(resource_folder, tmp_path_factory):
- file_path = tmp_path_factory.mktemp("just_chunks")
- search = SearchDirectory(file_path)
- search.report_from_directory(resource_folder)
- search.chunk_text()
- return file_path
-
-def test_load_with_chunks(directory_with_chunks):
- search = SearchDirectory(directory_with_chunks)
- assert search.n_chunks is not None
-
-def test_load_chunks_different_column_names(directory_with_chunks, tmp_path):
- search1 = SearchDirectory(tmp_path)
- search1.chunk_text("tests/resources/search_directory_test_files/report_modified.csv",
- "path",
- "content")
- search2 = SearchDirectory(directory_with_chunks)
- assert search2.n_chunks == search1.n_chunks
-
-# Test load embedding step
-
-def test_load_embedding_model(directory_with_chunks, embedding_model):
- search = SearchDirectory(directory_with_chunks)
- search.load_embedding_model(embedding_model)
- assert search.encoder is not None
- assert search.encoding_name == embedding_model
-
-@pytest.fixture(scope="module")
-def directory_with_embeding_module(resource_folder, tmp_path_factory, embedding_model):
- file_path = tmp_path_factory.mktemp("with_embedding_module")
- search = SearchDirectory(file_path)
- search.report_from_directory(resource_folder)
- search.chunk_text()
- search.load_embedding_model(embedding_model)
- return file_path
-
-def test_load_after_model_defined(directory_with_embeding_module, embedding_model):
- search = SearchDirectory(directory_with_embeding_module)
- assert search.encoder is not None
- assert search.encoding_name == embedding_model
-
-# Test embedding step
-
-def test_embedding_without_model(directory_with_chunks):
- search = SearchDirectory(directory_with_chunks)
- with pytest.raises(Exception):
- search.embed_text()
-
-variable_names = "start, end, batch, clean_files, expected_files, combined"
-values = [
- (0, None, 100, False, ["0-76"], True),
- (0, None, 100, False, ['0-76'], True),
- (10, 40, 10, True, ['0-76'], True),
- (0, None, 20, True, ["0-20", '20-40', '40-60', '60-76'], True),
- (10, None, 30, False, ["10-40", "40-70", "70-76"], False),
- (15, 25, 5, True, ["10-40", "40-70", "70-76"], False),
- (20, 25, 1, True, ["20-21", '21-22', '22-23', '23-24', '24-25'], False),
- (25, 60, 20, False, ['25-45', '45-60'], False),
- (0, None, 15, True, ['0-15', '15-25', '25-45', '45-60', '60-75', '75-76'], True),
- (25, 60, 20, False, ['25-45', '45-60'], False),
- (0, 30, 20, False, ['0-20', '20-25', '25-45', '45-60'], False),
- (50, 100, 15, True, ['0-20', '20-25', '25-45', '45-60', '60-75', '75-76'], True),
- (0, 80, 40, True, ['0-40', '40-76'], True),
- (0, -1, 100, True, ['0-76'], True),
- (-4, -1, 2, True, ['73-75', '75-76'], False),
- (-77, -1, 100, True, ['0-76'], True),
- (-77, 76, 100, True, ['0-76'], True),
- (10, -2, 100, True, ['10-75'], False),
- (-78, None, 100, True, [], False),
- (0, 0, 100, True, [], False),
- (-1, None, 100, True, [], False),
- (76, 80, 100, True, [], False),
- (75, 80, 100, True, ['75-76'], False),
- (0, -77, 100, True, [], False)
-]
-
-@pytest.mark.parametrize(variable_names, values)
-def test_embedding_creation(directory_with_embeding_module, start, end, batch, clean_files, expected_files, combined):
- search = SearchDirectory(directory_with_embeding_module)
- try:
- if len(expected_files) == 0:
- with pytest.raises(Exception):
- search.embed_text(start, end, batch)
- else:
- search.embed_text(start, end, batch)
- for file in expected_files:
- assert f"embeddings ({file}).npy" in os.listdir(directory_with_embeding_module / "embedding_batches")
- assert os.path.exists(directory_with_embeding_module / "embeddings.npy") == combined
- if combined:
- embeddings = np.load(directory_with_embeding_module / "embeddings.npy")
- assert embeddings.shape[0] == search.n_chunks
- finally:
- # remove created files
- if clean_files:
- shutil.rmtree(directory_with_embeding_module / "embedding_batches")
- if os.path.exists(directory_with_embeding_module / "embeddings.npy"):
- os.remove(directory_with_embeding_module / "embeddings.npy")
-
-def test_overlapping_embedding_files(embedding_model, tmp_path):
- search2 = SearchDirectory(tmp_path)
- search2.chunk_text("tests/resources/search_directory_test_files/report_modified.csv",
- "path",
- "content")
- search2.load_embedding_model(embedding_model)
- search2.embed_text()
- search1 = SearchDirectory("tests/resources/search_directory_test_files")
- search1.chunk_text("tests/resources/search_directory_test_files/report_modified.csv",
- "path",
- "content")
- search1.load_embedding_model(embedding_model)
- search1.embed_text(batch_size=30)
- try:
- embeddings1 = np.load("tests/resources/search_directory_test_files/embeddings.npy")
- embeddings2 = np.load(tmp_path / "embeddings.npy")
- assert np.allclose(embeddings1, embeddings2, atol=1e-5)
- finally:
- os.remove("tests/resources/search_directory_test_files/data_chunked.csv")
- os.remove("tests/resources/search_directory_test_files/setup_data.json")
- os.remove("tests/resources/search_directory_test_files/embeddings.npy")
+import os
+import pytest
+import shutil
+import numpy as np
+from faiss_search import SearchDirectory
+
+@pytest.fixture(scope="module")
+def resource_folder():
+ return "tests/resources/sample_text_files"
+
+@pytest.fixture(scope="module")
+def embedding_model():
+ return "paraphrase-MiniLM-L3-v2"
+
+# Test chunking step
+
+def test_empty_directory(tmp_path):
+ SearchDirectory(tmp_path)
+ assert not any(os.scandir(tmp_path))
+
+def test_chunk_from_report(resource_folder, tmp_path):
+ search = SearchDirectory(tmp_path)
+ search.report_from_directory(resource_folder)
+ assert os.path.exists(tmp_path / "report.csv")
+ search.chunk_text()
+ assert os.path.exists(tmp_path / "data_chunked.csv")
+ assert os.path.exists(tmp_path / "setup_data.json")
+
+def test_chunk_without_file(tmp_path):
+ search = SearchDirectory(tmp_path)
+ with pytest.raises(Exception):
+ search.chunk_text()
+
+def test_load_chunks_without_csv(tmp_path):
+ search = SearchDirectory(tmp_path)
+ with pytest.raises(Exception):
+ search.chunk_text("tests/resources/search_directory_test_files/Test_excel_file.xlsx")
+
+def test_load_chunks_with_name_issues(tmp_path):
+ search = SearchDirectory(tmp_path)
+ with pytest.raises(Exception):
+ search.chunk_text("tests/resources/search_directory_test_files/2021_Census_English.csv")
+
+@pytest.fixture()
+def directory_with_chunks(resource_folder, tmp_path_factory):
+ file_path = tmp_path_factory.mktemp("just_chunks")
+ search = SearchDirectory(file_path)
+ search.report_from_directory(resource_folder)
+ search.chunk_text()
+ return file_path
+
+def test_load_with_chunks(directory_with_chunks):
+ search = SearchDirectory(directory_with_chunks)
+ assert search.n_chunks is not None
+
+def test_load_chunks_different_column_names(directory_with_chunks, tmp_path):
+ search1 = SearchDirectory(tmp_path)
+ search1.chunk_text("tests/resources/search_directory_test_files/report_modified.csv",
+ "path",
+ "content")
+ search2 = SearchDirectory(directory_with_chunks)
+ assert search2.n_chunks == search1.n_chunks
+
+# Test load embedding step
+
+def test_load_embedding_model(directory_with_chunks, embedding_model):
+ search = SearchDirectory(directory_with_chunks)
+ search.load_embedding_model(embedding_model)
+ assert search.encoder is not None
+ assert search.encoding_name == embedding_model
+
+@pytest.fixture(scope="module")
+def directory_with_embeding_module(resource_folder, tmp_path_factory, embedding_model):
+ file_path = tmp_path_factory.mktemp("with_embedding_module")
+ search = SearchDirectory(file_path)
+ search.report_from_directory(resource_folder)
+ search.chunk_text()
+ search.load_embedding_model(embedding_model)
+ return file_path
+
+def test_load_after_model_defined(directory_with_embeding_module, embedding_model):
+ search = SearchDirectory(directory_with_embeding_module)
+ assert search.encoder is not None
+ assert search.encoding_name == embedding_model
+
+# Test embedding step
+
+def test_embedding_without_model(directory_with_chunks):
+ search = SearchDirectory(directory_with_chunks)
+ with pytest.raises(Exception):
+ search.embed_text()
+
+variable_names = "start, end, batch, clean_files, expected_files, combined"
+values = [
+ (0, None, 100, False, ["0-76"], True),
+ (0, None, 100, False, ['0-76'], True),
+ (10, 40, 10, True, ['0-76'], True),
+ (0, None, 20, True, ["0-20", '20-40', '40-60', '60-76'], True),
+ (10, None, 30, False, ["10-40", "40-70", "70-76"], False),
+ (15, 25, 5, True, ["10-40", "40-70", "70-76"], False),
+ (20, 25, 1, True, ["20-21", '21-22', '22-23', '23-24', '24-25'], False),
+ (25, 60, 20, False, ['25-45', '45-60'], False),
+ (0, None, 15, True, ['0-15', '15-25', '25-45', '45-60', '60-75', '75-76'], True),
+ (25, 60, 20, False, ['25-45', '45-60'], False),
+ (0, 30, 20, False, ['0-20', '20-25', '25-45', '45-60'], False),
+ (50, 100, 15, True, ['0-20', '20-25', '25-45', '45-60', '60-75', '75-76'], True),
+ (0, 80, 40, True, ['0-40', '40-76'], True),
+ (0, -1, 100, True, ['0-76'], True),
+ (-4, -1, 2, True, ['73-75', '75-76'], False),
+ (-77, -1, 100, True, ['0-76'], True),
+ (-77, 76, 100, True, ['0-76'], True),
+ (10, -2, 100, True, ['10-75'], False),
+ (-78, None, 100, True, [], False),
+ (0, 0, 100, True, [], False),
+ (-1, None, 100, True, [], False),
+ (76, 80, 100, True, [], False),
+ (75, 80, 100, True, ['75-76'], False),
+ (0, -77, 100, True, [], False)
+]
+
+@pytest.mark.parametrize(variable_names, values)
+def test_embedding_creation(directory_with_embeding_module, start, end, batch, clean_files, expected_files, combined):
+ search = SearchDirectory(directory_with_embeding_module)
+ try:
+ if len(expected_files) == 0:
+ with pytest.raises(Exception):
+ search.embed_text(start, end, batch)
+ else:
+ search.embed_text(start, end, batch)
+ for file in expected_files:
+ assert f"embeddings ({file}).npy" in os.listdir(directory_with_embeding_module / "embedding_batches")
+ assert os.path.exists(directory_with_embeding_module / "embeddings.npy") == combined
+ if combined:
+ embeddings = np.load(directory_with_embeding_module / "embeddings.npy")
+ assert embeddings.shape[0] == search.n_chunks
+ finally:
+ # remove created files
+ if clean_files:
+ shutil.rmtree(directory_with_embeding_module / "embedding_batches")
+ if os.path.exists(directory_with_embeding_module / "embeddings.npy"):
+ os.remove(directory_with_embeding_module / "embeddings.npy")
+
+def test_overlapping_embedding_files(embedding_model, tmp_path):
+ search2 = SearchDirectory(tmp_path)
+ search2.chunk_text("tests/resources/search_directory_test_files/report_modified.csv",
+ "path",
+ "content")
+ search2.load_embedding_model(embedding_model)
+ search2.embed_text()
+ search1 = SearchDirectory("tests/resources/search_directory_test_files")
+ search1.chunk_text("tests/resources/search_directory_test_files/report_modified.csv",
+ "path",
+ "content")
+ search1.load_embedding_model(embedding_model)
+ search1.embed_text(batch_size=30)
+ try:
+ embeddings1 = np.load("tests/resources/search_directory_test_files/embeddings.npy")
+ embeddings2 = np.load(tmp_path / "embeddings.npy")
+ assert np.allclose(embeddings1, embeddings2, atol=1e-5)
+ finally:
+ os.remove("tests/resources/search_directory_test_files/data_chunked.csv")
+ os.remove("tests/resources/search_directory_test_files/setup_data.json")
+ os.remove("tests/resources/search_directory_test_files/embeddings.npy")
+
+# Test FAISS search step
+
+def test_flat_faiss_index_creation(directory_with_embeding_module):
+ search = SearchDirectory(directory_with_embeding_module)
+ try:
+ search.embed_text()
+ search.create_flat_index()
+ assert os.path.exists(directory_with_embeding_module / "index.faiss")
+ finally:
+ shutil.rmtree(directory_with_embeding_module / "embedding_batches")
+ os.remove(directory_with_embeding_module / "embeddings.npy")
+ os.remove(directory_with_embeding_module / "index.faiss")
+
+def test_ivf_flat_faiss_index_creation(directory_with_embeding_module):
+ search = SearchDirectory(directory_with_embeding_module)
+ try:
+ search.embed_text()
+ search.create_ivf_flat_index()
+ assert os.path.exists(directory_with_embeding_module / "index.faiss")
+ finally:
+ shutil.rmtree(directory_with_embeding_module / "embedding_batches")
+ os.remove(directory_with_embeding_module / "embeddings.npy")
+ os.remove(directory_with_embeding_module / "index.faiss")
+
+def test_hnsw_faiss_index_creation(directory_with_embeding_module):
+ search = SearchDirectory(directory_with_embeding_module)
+ try:
+ search.embed_text()
+ search.create_hnsw_index()
+ assert os.path.exists(directory_with_embeding_module / "index.faiss")
+ finally:
+ shutil.rmtree(directory_with_embeding_module / "embedding_batches")
+ os.remove(directory_with_embeding_module / "embeddings.npy")
+ os.remove(directory_with_embeding_module / "index.faiss")
+
+@pytest.fixture(scope="module")
+def directory_with_faiss(resource_folder, tmp_path_factory, embedding_model):
+ file_path = tmp_path_factory.mktemp("with_faiss")
+ search = SearchDirectory(file_path)
+ search.report_from_directory(resource_folder)
+ search.chunk_text()
+ search.load_embedding_model(embedding_model)
+ search.embed_text()
+ search.create_ivf_flat_index()
+ return file_path
+
+search_variable_names = "query, k, nprobe"
+general_query = "What is the meaning of life, the universe, and everything?"
+search_values = [
+ (general_query, 3, 1),
+ (general_query, 3, 3),
+ (general_query, 1, 3)
+]
+
+@pytest.mark.parametrize(search_variable_names, search_values)
+def test_faiss_search(directory_with_faiss, query, k, nprobe):
+ search_ivf = SearchDirectory(directory_with_faiss)
+ results_ivf = search_ivf.search(query, k, nprobe)
+ assert search_ivf.index.index.nprobe == nprobe
+ assert len(results_ivf) == k
+
+def test_search_no_chunks(tmp_path):
+ search = SearchDirectory(tmp_path)
+ with pytest.raises(Exception):
+ search.search("Test")
+
+def test_search_no_embedding_model(directory_with_chunks):
+ search = SearchDirectory(directory_with_chunks)
+ with pytest.raises(Exception):
+ search.search("Test")
+
+def test_search_no_faiss(directory_with_embeding_module):
+    directory = SearchDirectory(directory_with_embeding_module)
+    with pytest.raises(Exception):  # the fixture itself never builds a FAISS index
+        directory.search("Test")