diff --git a/CHANGELOG.md b/CHANGELOG.md index 1d73564d..e0508ba0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ ## 4.4.6 (unreleased) -- Nothing changed yet. +- Add support for extract strategy on file uploads, link and text fields. ## 4.4.5 (2025-01-16) diff --git a/nuclia/lib/kb.py b/nuclia/lib/kb.py index e8e156a0..e51af977 100644 --- a/nuclia/lib/kb.py +++ b/nuclia/lib/kb.py @@ -263,6 +263,7 @@ def start_tus_upload( rid: Optional[str] = None, md5: Optional[str] = None, content_type: str = "application/octet-stream", + extract_strategy: Optional[str] = None, ): if self.writer_session is None: raise Exception("KB not configured") @@ -284,6 +285,8 @@ def start_tus_upload( headers["upload-metadata"] += ( f",md5 {base64.b64encode(md5.encode()).decode()}" ) + if extract_strategy is not None: + headers["x-extract-strategy"] = extract_strategy response: httpx.Response = self.writer_session.post(url, headers=headers) handle_http_errors(response) @@ -605,6 +608,7 @@ async def start_tus_upload( rid: Optional[str] = None, md5: Optional[str] = None, content_type: str = "application/octet-stream", + extract_strategy: Optional[str] = None, ): if self.writer_session is None: raise Exception("KB not configured") @@ -626,6 +630,8 @@ async def start_tus_upload( headers["upload-metadata"] += ( f",md5 {base64.b64encode(md5.encode()).decode()}" ) + if extract_strategy is not None: + headers["x-extract-strategy"] = extract_strategy response = await self.writer_session.post(url, headers=headers) handle_http_errors(response) diff --git a/nuclia/sdk/upload.py b/nuclia/sdk/upload.py index 24514016..7db33518 100644 --- a/nuclia/sdk/upload.py +++ b/nuclia/sdk/upload.py @@ -64,6 +64,7 @@ def file( interpretTables: Optional[bool] = False, blanklineSplitter: Optional[bool] = False, mimetype: Optional[str] = None, + extract_strategy: Optional[str] = None, **kwargs, ) -> Optional[str]: """Upload a file from filesystem to a Nuclia KnowledgeBox""" @@ -98,6 +99,7 @@ def file( filename=filename, content_type=mimetype, md5=md5_hash.hexdigest(), + extract_strategy=extract_strategy, ) offset = 0 @@ -197,6 +199,9 @@ def text( "format": format, } } + extract_strategy = kwargs.get("extract_strategy") + if extract_strategy is not None: + texts[field]["extract_strategy"] = extract_strategy rid, is_new_resource = self._get_or_create_resource( texts=texts, icon=icon, @@ -226,6 +231,9 @@ def link( "css_selector": css_selector, } } + extract_strategy = kwargs.get("extract_strategy") + if extract_strategy is not None: + links[field]["extract_strategy"] = extract_strategy kwargs["icon"] = "application/stf-link" rid, is_new_resource = self._get_or_create_resource( links=links, @@ -248,6 +256,7 @@ def remote( field: Optional[str] = "file", interpretTables: Optional[bool] = False, blanklineSplitter: Optional[bool] = False, + extract_strategy: Optional[str] = None, **kwargs, ) -> str: """Upload a remote url to a Nuclia KnowledgeBox""" @@ -279,6 +288,7 @@ def remote( size=size, filename=filename, content_type=mimetype, + extract_strategy=extract_strategy, ) offset = 0 for _ in tqdm(range((size // CHUNK_SIZE) + 1)): @@ -375,6 +385,7 @@ async def file( mimetype: Optional[str] = None, interpretTables: Optional[bool] = False, blanklineSplitter: Optional[bool] = False, + extract_strategy: Optional[str] = None, **kwargs, ) -> str: """Upload a file from filesystem to a Nuclia KnowledgeBox""" @@ -407,6 +418,7 @@ async def file( filename=filename, content_type=mimetype, md5=md5_hash.hexdigest(), + extract_strategy=extract_strategy, ) offset = 0 for _ in tqdm(range((size // CHUNK_SIZE) + 1)): @@ -503,6 +515,9 @@ async def text( "format": format, } } + extract_strategy = kwargs.get("extract_strategy") + if extract_strategy is not None: + texts[field]["extract_strategy"] = extract_strategy rid, is_new_resource = await self._get_or_create_resource( texts=texts, icon=icon, @@ -530,6 +545,9 @@ async def link( "uri": uri, } } + extract_strategy = kwargs.get("extract_strategy") + if extract_strategy is not None: + links[field]["extract_strategy"] = extract_strategy kwargs["icon"] = "application/stf-link" rid, is_new_resource = await self._get_or_create_resource( links=links, @@ -552,6 +570,7 @@ async def remote( field: Optional[str] = "file", interpretTables: Optional[bool] = False, blanklineSplitter: Optional[bool] = False, + extract_strategy: Optional[str] = None, **kwargs, ) -> str: """Upload a remote url to a Nuclia KnowledgeBox""" @@ -578,6 +597,7 @@ async def remote( size=size, filename=filename, content_type=mimetype, + extract_strategy=extract_strategy, ) offset = 0 with tqdm(total=(size // CHUNK_SIZE) + 1) as p_bar: diff --git a/requirements.txt b/requirements.txt index fdecd074..bf9367ca 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,9 +5,9 @@ requests httpx httpcore>=1.0.0 prompt_toolkit -nucliadb_sdk>=6.2.1.post2735,<7 -nucliadb_models>=6.2.1.post2735,<7 -nucliadb_protos>=6.2.1.post2735,<7 +nucliadb_sdk>=6.2.1.post2864,<7 +nucliadb_models>=6.2.1.post2864,<7 +nucliadb_protos>=6.2.1.post2864,<7 nuclia-models>=0.24.3 tqdm aiofiles