Skip to content

Commit 031a271

Browse files
fabnemEPFLleagriederMikaelKalajdzic
authored
Live retrieval API with index API and retriever API (#94)
* Added retrieving among a set of specified documents * Adapted run_retriever * simplified code in retriever * simplified run_retriever.py * misc style changes * adapted retriever to rag api * cherry picking from RAG interface is now possible by providing document_ids in the json file * Preparation for the retriever api * basic retriever api * Updated retrieval api + documentation Co-authored-by: MikaelKalajdzic <mikael@kalajdzic.ch> Co-authored-by: fabnemEPFL <fabrice.nemo@epfl.ch> * adapted retriever to effectively use the arguments minSimilarity and maxMatches * execution state shut down at the end of processing * import fixes on run_retriever.py * removed useless enumerate * retriever config and fixes with retriever_api * fixed mmore retriever * fixed default retriever_api config * fix default DB uri * fixed error 404 if deletion of file already absent * new fixes of HTTP errors * changed retriever endpoint to post * making a single FastAPI combining index-api and retriever-api * various fixes * more fixes * retrieve the original name of the file sent on the download endpoit * made one shady line of the code clearer * fixed the logic of retriever * cosmetic changes * moved utilities to utils.py * added live-retrieval in the module * cosmetic changes * removed the retriever endpoint from run_index_api * removed useless import * initializing the indexer and processors on starttime of the live retrieval * small fix * added a tag to the retriever endpoint * small changes * fixes * black and isort fixes * logic fix * cosmetic changes * fixes * pre-loading of processors * chunking documents in index_api * attempt of fix * smol fix * changed default LLM for RAG * initialize PDF processor on startup * split document id and chunk id * fixed typo * fix * changed default values * fix * ruff formatting * removed useless imports * reformatted tests * pyright compliance and reformatting * ruff compliance * ruff fixes * small fixes * renamed a function * again * manageable 
preloading of processor models * changed default model for media processor --------- Co-authored-by: leagrieder <lea@grieder.org> Co-authored-by: MikaelKalajdzic <mikael@kalajdzic.ch>
1 parent 275f692 commit 031a271

28 files changed

Lines changed: 1024 additions & 611 deletions

docs/index_api.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,7 @@ Returns the file with binary content.
173173
- File types supported:
174174

175175
```
176-
.pdf, .docx, .pptx, .md, .txt, .xlsx, .xls, .csv, .mp4, .avi, .mov, .mkv, .mp3, .wav, .aac, .eml, .html
176+
.pdf, .docx, .pptx, .md, .txt, .xlsx, .xls, .csv, .mp4, .avi, .mov, .mkv, .mp3, .wav, .aac, .eml, .html, .htm
177177
```
178178
179179

examples/rag/config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
rag:
22
llm:
33
llm_name: OpenMeditron/meditron3-8b
4-
max_new_tokens: 250
4+
max_new_tokens: 1200
55
retriever:
66
db:
77
uri: ./proc_demo.db

examples/rag/config_api.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
rag:
33
# LLM Config
44
llm:
5-
llm_name: "TinyLlama/TinyLlama-1.1B-Chat-v1.0" # "epfl-llm/meditron-70b" # "gpt-4o-mini" # Anything supported
6-
max_new_tokens: 100
5+
llm_name: Qwen/Qwen3-8B # "epfl-llm/meditron-70b" # "gpt-4o-mini" # Anything supported
6+
max_new_tokens: 1200
77
temperature: 0.8
88
# Retriever Config
99
retriever:

examples/retriever_api/config.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
db:
2+
uri: ./proc_demo.db
3+
name: my_db
4+
hybrid_search_weight: 0.5
5+
k: 5
6+
collection_name: my_docs

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ exclude = ["src/mmore/run_retriever.py"] # TODO: add back when GH CI bug is f
164164

165165
[tool.ruff.lint]
166166
select = ["E", "F", "W", "I", "N"]
167-
ignore = ["E501"] # Avoid enforcing line-length violations (`E501`)
167+
ignore = ["E501", "E402"] # Ignore line-length violations (`E501`) and module-level imports not at the top of the file (`E402`)
168168

169169
[tool.ruff.lint.per-file-ignores]
170170
"__init__.py" = ["F401"]

src/mmore/cli.py

Lines changed: 66 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from typing import Optional
2+
13
import click
24

35

@@ -97,36 +99,82 @@ def index(config_file: str, documents_path: str, collection_name: str):
9799
"-c",
98100
type=str,
99101
required=True,
100-
help="Dispatcher configuration file path.",
102+
help="Retriever configuration file path.",
101103
)
102104
@click.option(
103105
"--input-file",
104106
"-f",
105-
type=str,
106-
required=True,
107+
type=Optional[str],
108+
required=False,
109+
default=None,
107110
help="Path to the JSONL file of the input queries.",
108111
)
109112
@click.option(
110113
"--output-file",
111114
"-o",
112-
type=str,
113-
required=True,
115+
type=Optional[str],
116+
required=False,
117+
default=None,
114118
help="Path to which save the results of the retriever as a JSON.",
115119
)
116-
def retrieve(config_file: str, input_file: str, output_file: str):
120+
@click.option(
121+
"--host", type=str, default="0.0.0.0", help="Host on which the API should be run."
122+
)
123+
@click.option(
124+
"--port", type=int, default=8001, help="Port on which the API should be run."
125+
)
126+
def retrieve(
127+
config_file: str,
128+
input_file: Optional[str],
129+
output_file: Optional[str],
130+
host: str,
131+
port: int,
132+
):
117133
"""Retrieve documents for specified queries.
118134
119135
Args:
120-
config_file: path to the config file for the retriver.
136+
config_file: path to the config file for the retriever.
121137
input_file: path to the JSONL file of the input queries.
122138
output_file: path at which to save the results of the retriever as JSON.
123139
124140
Returns:
125141
126142
"""
127143
from .run_retriever import retrieve as run_retrieve
144+
from .run_retriever import run_api
145+
146+
if input_file:
147+
assert isinstance(output_file, str)
148+
run_retrieve(config_file, input_file, output_file)
149+
else:
150+
run_api(config_file, host, port)
151+
152+
153+
@main.command()
154+
@click.option(
155+
"--config-file",
156+
"-c",
157+
type=str,
158+
required=True,
159+
help="Retriever configuration file path.",
160+
)
161+
@click.option(
162+
"--host", type=str, default="0.0.0.0", help="Host on which the API should be run."
163+
)
164+
@click.option(
165+
"--port", type=int, default=8000, help="Port on which the API should be run."
166+
)
167+
def live_retrieval(config_file: str, host: str, port: int):
168+
"""API for live indexing and retrieval of documents.
169+
170+
Args:
171+
config_file: Path to the retriever configuration file.
172+
host: Host on which the API should be run.
173+
port: Port on which the API should be run.
174+
"""
175+
from .run_live_retrieval import run
128176

129-
run_retrieve(config_file, input_file, output_file)
177+
run(config_file, host, port)
130178

131179

132180
@main.command()
@@ -148,16 +196,24 @@ def rag(config_file: str):
148196

149197

150198
@main.command()
199+
@click.option(
200+
"--config-file",
201+
"-c",
202+
type=str,
203+
required=True,
204+
help="Retriever configuration file path.",
205+
)
151206
@click.option(
152207
"--host", type=str, default="0.0.0.0", help="Host on which the API should be run."
153208
)
154209
@click.option(
155210
"--port", type=int, default=8000, help="Port on which the API should be run."
156211
)
157-
def index_api(host, port):
212+
def index_api(config_file, host, port):
158213
"""Run the Index API.
159214
160215
Args:
216+
config_file: Path to the retriever configuration file.
161217
host: Host on which the API should be run.
162218
port: Port on which the API should be run.
163219
@@ -166,7 +222,7 @@ def index_api(host, port):
166222
"""
167223
from .run_index_api import run_api
168224

169-
run_api(host, port)
225+
run_api(config_file, host, port)
170226

171227

172228
@main.command()

src/mmore/index/indexer.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323

2424
@dataclass
2525
class DBConfig:
26-
uri: str = "demo.db"
26+
uri: str = "./proc_demo.db"
2727
name: str = "my_db"
2828

2929

@@ -114,7 +114,8 @@ def _create_collection_with_schema(self, collection_name: str):
114114
fields = [
115115
FieldSchema(
116116
name="id", dtype=DataType.VARCHAR, is_primary=True, max_length=128
117-
), # Add doc_id field
117+
),
118+
FieldSchema(name="document_id", dtype=DataType.VARCHAR, max_length=128),
118119
FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=65535),
119120
FieldSchema(
120121
name="dense_embedding",
@@ -192,6 +193,7 @@ def _index_documents(
192193
data = [
193194
{
194195
"id": sample.id,
196+
"document_id": sample.document_id,
195197
"text": sample.text,
196198
"dense_embedding": d,
197199
"sparse_embedding": s.reshape(1, -1),
@@ -206,7 +208,7 @@ def _index_documents(
206208
partition_name=partition_name,
207209
)
208210

209-
inserted += list(batch_inserted.values())[0]
211+
inserted += batch_inserted["insert_count"]
210212

211213
return inserted
212214

src/mmore/process/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ def get_config_path():
5050
app_dir.mkdir(parents=True, exist_ok=True)
5151
except PermissionError as e:
5252
click.echo(f"Error creating config directory: {e}", err=True)
53-
raise
53+
raise e
5454
return app_dir / "config.yaml"
5555

5656

src/mmore/process/crawler.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ def _get_metadata_jsonl_path(self, results_jsonl_path):
144144
if "metadata" in data and "file_path" in data["metadata"]:
145145
file_paths.append(data["metadata"]["file_path"])
146146
else:
147-
print(
147+
logger.error(
148148
f"Warning file_path not found in metadate (line{i} of {results_jsonl_path})"
149149
)
150150
return file_paths
@@ -224,9 +224,9 @@ def from_yaml(yaml_path: str):
224224
with open(yaml_path, "r") as file:
225225
config = yaml.safe_load(file)
226226
return CrawlerConfig.from_dict(config)
227-
except (FileNotFoundError, yaml.YAMLError):
227+
except (FileNotFoundError, yaml.YAMLError) as e:
228228
logger.error(f"[Crawler] Error processing {yaml_path}.")
229-
raise
229+
raise e
230230

231231
def to_dict(self):
232232
"""

src/mmore/process/dispatcher.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ def from_dict(config: Dict) -> "DispatcherConfig":
8585
use_fast_processors=config.get("use_fast_processors", True),
8686
distributed=config.get("distributed", False),
8787
scheduler_file=config.get("scheduler_file"),
88-
processor_config=config.get("processor"),
88+
processor_config=config.get("processor_config"),
8989
process_batch_sizes=config.get("process_batch_sizes"),
9090
batch_multiplier=config.get("batch_multiplier", 1),
9191
extract_images=config.get("extract_images", False),
@@ -100,9 +100,9 @@ def from_yaml(yaml_path: str):
100100
with open(yaml_path, "r") as file:
101101
config = yaml.safe_load(file)
102102
return DispatcherConfig.from_dict(config)
103-
except (FileNotFoundError, yaml.YAMLError):
103+
except (FileNotFoundError, yaml.YAMLError) as e:
104104
logger.error(f"[Dispatcher] Error processing file {yaml_path}")
105-
raise
105+
raise e
106106

107107
def to_dict(self) -> Dict:
108108
"""Convert the DispatcherConfig object to a dictionary."""
@@ -111,7 +111,7 @@ def to_dict(self) -> Dict:
111111
"distributed": self.distributed,
112112
"scheduler_file": self.scheduler_file,
113113
"output_path": self.output_path,
114-
"processor": self.processor_config,
114+
"processor_config": self.processor_config,
115115
"process_batch_sizes": self.process_batch_sizes,
116116
"batch_multiplier": self.batch_multiplier,
117117
"extract_images": self.extract_images,

0 commit comments

Comments
 (0)