Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
## 4.4.6 (unreleased)


- Nothing changed yet.
- Add support for specifying an extract strategy on file uploads, link fields, and text fields.


## 4.4.5 (2025-01-16)
Expand Down
6 changes: 6 additions & 0 deletions nuclia/lib/kb.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,7 @@ def start_tus_upload(
rid: Optional[str] = None,
md5: Optional[str] = None,
content_type: str = "application/octet-stream",
extract_strategy: Optional[str] = None,
):
if self.writer_session is None:
raise Exception("KB not configured")
Expand All @@ -284,6 +285,8 @@ def start_tus_upload(
headers["upload-metadata"] += (
f",md5 {base64.b64encode(md5.encode()).decode()}"
)
if extract_strategy is not None:
headers["x-extract-strategy"] = extract_strategy

response: httpx.Response = self.writer_session.post(url, headers=headers)
handle_http_errors(response)
Expand Down Expand Up @@ -605,6 +608,7 @@ async def start_tus_upload(
rid: Optional[str] = None,
md5: Optional[str] = None,
content_type: str = "application/octet-stream",
extract_strategy: Optional[str] = None,
):
if self.writer_session is None:
raise Exception("KB not configured")
Expand All @@ -626,6 +630,8 @@ async def start_tus_upload(
headers["upload-metadata"] += (
f",md5 {base64.b64encode(md5.encode()).decode()}"
)
if extract_strategy is not None:
headers["x-extract-strategy"] = extract_strategy

response = await self.writer_session.post(url, headers=headers)
handle_http_errors(response)
Expand Down
20 changes: 20 additions & 0 deletions nuclia/sdk/upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def file(
interpretTables: Optional[bool] = False,
blanklineSplitter: Optional[bool] = False,
mimetype: Optional[str] = None,
extract_strategy: Optional[str] = None,
**kwargs,
) -> Optional[str]:
"""Upload a file from filesystem to a Nuclia KnowledgeBox"""
Expand Down Expand Up @@ -98,6 +99,7 @@ def file(
filename=filename,
content_type=mimetype,
md5=md5_hash.hexdigest(),
extract_strategy=extract_strategy,
)

offset = 0
Expand Down Expand Up @@ -197,6 +199,9 @@ def text(
"format": format,
}
}
extract_strategy = kwargs.get("extract_strategy")
if extract_strategy is not None:
texts[field]["extract_strategy"] = extract_strategy
rid, is_new_resource = self._get_or_create_resource(
texts=texts,
icon=icon,
Expand Down Expand Up @@ -226,6 +231,9 @@ def link(
"css_selector": css_selector,
}
}
extract_strategy = kwargs.get("extract_strategy")
if extract_strategy is not None:
links[field]["extract_strategy"] = extract_strategy
kwargs["icon"] = "application/stf-link"
rid, is_new_resource = self._get_or_create_resource(
links=links,
Expand All @@ -248,6 +256,7 @@ def remote(
field: Optional[str] = "file",
interpretTables: Optional[bool] = False,
blanklineSplitter: Optional[bool] = False,
extract_strategy: Optional[str] = None,
**kwargs,
) -> str:
"""Upload a remote url to a Nuclia KnowledgeBox"""
Expand Down Expand Up @@ -279,6 +288,7 @@ def remote(
size=size,
filename=filename,
content_type=mimetype,
extract_strategy=extract_strategy,
)
offset = 0
for _ in tqdm(range((size // CHUNK_SIZE) + 1)):
Expand Down Expand Up @@ -375,6 +385,7 @@ async def file(
mimetype: Optional[str] = None,
interpretTables: Optional[bool] = False,
blanklineSplitter: Optional[bool] = False,
extract_strategy: Optional[str] = None,
**kwargs,
) -> str:
"""Upload a file from filesystem to a Nuclia KnowledgeBox"""
Expand Down Expand Up @@ -407,6 +418,7 @@ async def file(
filename=filename,
content_type=mimetype,
md5=md5_hash.hexdigest(),
extract_strategy=extract_strategy,
)
offset = 0
for _ in tqdm(range((size // CHUNK_SIZE) + 1)):
Expand Down Expand Up @@ -503,6 +515,9 @@ async def text(
"format": format,
}
}
extract_strategy = kwargs.get("extract_strategy")
if extract_strategy is not None:
texts[field]["extract_strategy"] = extract_strategy
rid, is_new_resource = await self._get_or_create_resource(
texts=texts,
icon=icon,
Expand Down Expand Up @@ -530,6 +545,9 @@ async def link(
"uri": uri,
}
}
extract_strategy = kwargs.get("extract_strategy")
if extract_strategy is not None:
links[field]["extract_strategy"] = extract_strategy
kwargs["icon"] = "application/stf-link"
rid, is_new_resource = await self._get_or_create_resource(
links=links,
Expand All @@ -552,6 +570,7 @@ async def remote(
field: Optional[str] = "file",
interpretTables: Optional[bool] = False,
blanklineSplitter: Optional[bool] = False,
extract_strategy: Optional[str] = None,
**kwargs,
) -> str:
"""Upload a remote url to a Nuclia KnowledgeBox"""
Expand All @@ -578,6 +597,7 @@ async def remote(
size=size,
filename=filename,
content_type=mimetype,
extract_strategy=extract_strategy,
)
offset = 0
with tqdm(total=(size // CHUNK_SIZE) + 1) as p_bar:
Expand Down
6 changes: 3 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ requests
httpx
httpcore>=1.0.0
prompt_toolkit
nucliadb_sdk>=6.2.1.post2735,<7
nucliadb_models>=6.2.1.post2735,<7
nucliadb_protos>=6.2.1.post2735,<7
nucliadb_sdk>=6.2.1.post2864,<7
nucliadb_models>=6.2.1.post2864,<7
nucliadb_protos>=6.2.1.post2864,<7
nuclia-models>=0.24.3
tqdm
aiofiles
Expand Down
Loading