Skip to content

Commit 46826d6

Browse files
authored
Add support for extract strategies (#146)
1 parent 0e58a73 commit 46826d6

File tree

4 files changed

+30
-4
lines changed

4 files changed

+30
-4
lines changed

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
## 4.4.6 (unreleased)
44

55

6-
- Nothing changed yet.
6+
- Add support for extract strategies on file uploads, link fields, and text fields.
77

88

99
## 4.4.5 (2025-01-16)

nuclia/lib/kb.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,7 @@ def start_tus_upload(
263263
rid: Optional[str] = None,
264264
md5: Optional[str] = None,
265265
content_type: str = "application/octet-stream",
266+
extract_strategy: Optional[str] = None,
266267
):
267268
if self.writer_session is None:
268269
raise Exception("KB not configured")
@@ -284,6 +285,8 @@ def start_tus_upload(
284285
headers["upload-metadata"] += (
285286
f",md5 {base64.b64encode(md5.encode()).decode()}"
286287
)
288+
if extract_strategy is not None:
289+
headers["x-extract-strategy"] = extract_strategy
287290

288291
response: httpx.Response = self.writer_session.post(url, headers=headers)
289292
handle_http_errors(response)
@@ -605,6 +608,7 @@ async def start_tus_upload(
605608
rid: Optional[str] = None,
606609
md5: Optional[str] = None,
607610
content_type: str = "application/octet-stream",
611+
extract_strategy: Optional[str] = None,
608612
):
609613
if self.writer_session is None:
610614
raise Exception("KB not configured")
@@ -626,6 +630,8 @@ async def start_tus_upload(
626630
headers["upload-metadata"] += (
627631
f",md5 {base64.b64encode(md5.encode()).decode()}"
628632
)
633+
if extract_strategy is not None:
634+
headers["x-extract-strategy"] = extract_strategy
629635

630636
response = await self.writer_session.post(url, headers=headers)
631637
handle_http_errors(response)

nuclia/sdk/upload.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ def file(
6464
interpretTables: Optional[bool] = False,
6565
blanklineSplitter: Optional[bool] = False,
6666
mimetype: Optional[str] = None,
67+
extract_strategy: Optional[str] = None,
6768
**kwargs,
6869
) -> Optional[str]:
6970
"""Upload a file from filesystem to a Nuclia KnowledgeBox"""
@@ -98,6 +99,7 @@ def file(
9899
filename=filename,
99100
content_type=mimetype,
100101
md5=md5_hash.hexdigest(),
102+
extract_strategy=extract_strategy,
101103
)
102104

103105
offset = 0
@@ -197,6 +199,9 @@ def text(
197199
"format": format,
198200
}
199201
}
202+
extract_strategy = kwargs.get("extract_strategy")
203+
if extract_strategy is not None:
204+
texts[field]["extract_strategy"] = extract_strategy
200205
rid, is_new_resource = self._get_or_create_resource(
201206
texts=texts,
202207
icon=icon,
@@ -226,6 +231,9 @@ def link(
226231
"css_selector": css_selector,
227232
}
228233
}
234+
extract_strategy = kwargs.get("extract_strategy")
235+
if extract_strategy is not None:
236+
links[field]["extract_strategy"] = extract_strategy
229237
kwargs["icon"] = "application/stf-link"
230238
rid, is_new_resource = self._get_or_create_resource(
231239
links=links,
@@ -248,6 +256,7 @@ def remote(
248256
field: Optional[str] = "file",
249257
interpretTables: Optional[bool] = False,
250258
blanklineSplitter: Optional[bool] = False,
259+
extract_strategy: Optional[str] = None,
251260
**kwargs,
252261
) -> str:
253262
"""Upload a remote url to a Nuclia KnowledgeBox"""
@@ -279,6 +288,7 @@ def remote(
279288
size=size,
280289
filename=filename,
281290
content_type=mimetype,
291+
extract_strategy=extract_strategy,
282292
)
283293
offset = 0
284294
for _ in tqdm(range((size // CHUNK_SIZE) + 1)):
@@ -375,6 +385,7 @@ async def file(
375385
mimetype: Optional[str] = None,
376386
interpretTables: Optional[bool] = False,
377387
blanklineSplitter: Optional[bool] = False,
388+
extract_strategy: Optional[str] = None,
378389
**kwargs,
379390
) -> str:
380391
"""Upload a file from filesystem to a Nuclia KnowledgeBox"""
@@ -407,6 +418,7 @@ async def file(
407418
filename=filename,
408419
content_type=mimetype,
409420
md5=md5_hash.hexdigest(),
421+
extract_strategy=extract_strategy,
410422
)
411423
offset = 0
412424
for _ in tqdm(range((size // CHUNK_SIZE) + 1)):
@@ -503,6 +515,9 @@ async def text(
503515
"format": format,
504516
}
505517
}
518+
extract_strategy = kwargs.get("extract_strategy")
519+
if extract_strategy is not None:
520+
texts[field]["extract_strategy"] = extract_strategy
506521
rid, is_new_resource = await self._get_or_create_resource(
507522
texts=texts,
508523
icon=icon,
@@ -530,6 +545,9 @@ async def link(
530545
"uri": uri,
531546
}
532547
}
548+
extract_strategy = kwargs.get("extract_strategy")
549+
if extract_strategy is not None:
550+
links[field]["extract_strategy"] = extract_strategy
533551
kwargs["icon"] = "application/stf-link"
534552
rid, is_new_resource = await self._get_or_create_resource(
535553
links=links,
@@ -552,6 +570,7 @@ async def remote(
552570
field: Optional[str] = "file",
553571
interpretTables: Optional[bool] = False,
554572
blanklineSplitter: Optional[bool] = False,
573+
extract_strategy: Optional[str] = None,
555574
**kwargs,
556575
) -> str:
557576
"""Upload a remote url to a Nuclia KnowledgeBox"""
@@ -578,6 +597,7 @@ async def remote(
578597
size=size,
579598
filename=filename,
580599
content_type=mimetype,
600+
extract_strategy=extract_strategy,
581601
)
582602
offset = 0
583603
with tqdm(total=(size // CHUNK_SIZE) + 1) as p_bar:

requirements.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@ requests
55
httpx
66
httpcore>=1.0.0
77
prompt_toolkit
8-
nucliadb_sdk>=6.2.1.post2735,<7
9-
nucliadb_models>=6.2.1.post2735,<7
10-
nucliadb_protos>=6.2.1.post2735,<7
8+
nucliadb_sdk>=6.2.1.post2864,<7
9+
nucliadb_models>=6.2.1.post2864,<7
10+
nucliadb_protos>=6.2.1.post2864,<7
1111
nuclia-models>=0.24.3
1212
tqdm
1313
aiofiles

0 commit comments

Comments
 (0)