Commit cfd4b52

bugfix/uncompress process in pipeline (#83)
1 parent f819f02 commit cfd4b52

File tree

8 files changed: +48 -37 lines changed


CHANGELOG.md

Lines changed: 10 additions & 4 deletions
@@ -1,10 +1,14 @@
-## 0.0.9-dev1
+## 0.0.9
 
 ### Enhancements
 
-**Chroma dict settings should allow string inputs
-**Move opensearch non-secret fields out of access config
-**Support string inputs for dict type model fields** Use the `BeforeValidator` support from pydantic to map a string value to a dict if that's provided.
+* **Chroma dict settings should allow string inputs**
+* **Move opensearch non-secret fields out of access config**
+* **Support string inputs for dict type model fields** Use the `BeforeValidator` support from pydantic to map a string value to a dict if that's provided.
+
+### Fixes
+
+* **Fix uncompress logic** The uncompress process wasn't being leveraged correctly in the pipeline. Updated to use the new local download path as the location where the partitioner looks for the new file.
 
 
 ## 0.0.8
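For context on the `BeforeValidator` entry above, here is a minimal sketch of the pattern, assuming a hypothetical `ChromaSettings` model (not code from this commit):

# Minimal sketch of mapping a string input to a dict with pydantic's
# BeforeValidator; the model and field names here are hypothetical.
import json
from typing import Annotated

from pydantic import BaseModel, BeforeValidator


def coerce_str_to_dict(value: object) -> object:
    # Parse a JSON string into a dict; pass any other value through untouched.
    if isinstance(value, str):
        return json.loads(value)
    return value


class ChromaSettings(BaseModel):
    settings: Annotated[dict, BeforeValidator(coerce_str_to_dict)] = {}


# Both forms validate to the same dict value:
print(ChromaSettings(settings={"host": "localhost"}).settings)
print(ChromaSettings(settings='{"host": "localhost"}').settings)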

test_e2e/src/s3-compression.sh

Lines changed: 3 additions & 2 deletions
@@ -34,8 +34,9 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
   --remote-url s3://utic-dev-tech-fixtures/small-pdf-set-w-compression/ \
   --anonymous \
   --work-dir "$WORK_DIR" \
-  --uncompress
+  --uncompress \
+  --file-glob "*.pdf,*.zip,*.tgz"
 
-"$SCRIPT_DIR"/check-num-files-output.sh 12 $OUTPUT_FOLDER_NAME
+"$SCRIPT_DIR"/check-num-files-output.sh 18 $OUTPUT_FOLDER_NAME
 
 "$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

test_e2e/test-src.sh

Lines changed: 1 addition & 0 deletions
@@ -26,6 +26,7 @@ all_tests=(
   'biomed-path.sh'
   # NOTE(yuming): The pdf-fast-reprocess test should be put after any tests that save downloaded files
   'pdf-fast-reprocess.sh'
+  's3-compression.sh'
   'salesforce.sh'
   'box.sh'
   'discord.sh'

unstructured_ingest/__version__.py

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-__version__ = "0.0.9-dev1"  # pragma: no cover
+__version__ = "0.0.9"  # pragma: no cover

unstructured_ingest/v2/interfaces/file_data.py

Lines changed: 1 addition & 0 deletions
@@ -42,6 +42,7 @@ class FileData(DataClassJsonMixin):
     metadata: FileDataSourceMetadata = field(default_factory=lambda: FileDataSourceMetadata())
     additional_metadata: dict[str, Any] = field(default_factory=dict)
     reprocess: bool = False
+    local_download_path: Optional[str] = None
 
     @classmethod
     def from_file(cls, path: str) -> "FileData":
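The new field records where the downloader actually wrote the file, so later steps can read it directly instead of inferring it from source identifiers. A simplified stand-in for the class (the real `FileData` carries more fields plus JSON serialization):

# Simplified stand-in for FileData showing the role of the new field;
# the real class also carries source metadata and (de)serialization helpers.
from dataclasses import dataclass, field
from typing import Any, Optional


@dataclass
class FileData:
    identifier: str
    additional_metadata: dict[str, Any] = field(default_factory=dict)
    reprocess: bool = False
    local_download_path: Optional[str] = None  # set by the download step


fd = FileData(identifier="abc123")
fd.local_download_path = "/tmp/downloads/report.zip"  # recorded after download
print(fd)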

unstructured_ingest/v2/pipeline/steps/download.py

Lines changed: 4 additions & 7 deletions
@@ -68,10 +68,9 @@ def should_download(self, file_data: FileData, file_data_path: str) -> bool:
     def update_file_data(
         self, file_data: FileData, file_data_path: Path, download_path: Path
     ) -> None:
+        file_data.local_download_path = str(download_path.resolve())
         file_size_bytes = download_path.stat().st_size
-        changed = False
         if not file_data.metadata.filesize_bytes and file_size_bytes:
-            changed = True
             file_data.metadata.filesize_bytes = file_size_bytes
         if (
             file_data.metadata.filesize_bytes
@@ -82,12 +81,10 @@ def update_file_data(
                 f"({file_data.metadata.filesize_bytes}) doesn't "
                 f"match size of local file: {file_size_bytes}, updating"
             )
-            changed = True
             file_data.metadata.filesize_bytes = file_size_bytes
-        if changed:
-            logger.debug(f"Updating file data with new content: {file_data.to_dict()}")
-            with file_data_path.open("w") as file:
-                json.dump(file_data.to_dict(), file, indent=2)
+        logger.debug(f"Updating file data with new content: {file_data.to_dict()}")
+        with file_data_path.open("w") as file:
+            json.dump(file_data.to_dict(), file, indent=2)
 
     async def _run_async(self, fn: Callable, file_data_path: str) -> list[DownloadStepResponse]:
         file_data = FileData.from_file(path=file_data_path)
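The step now stamps the resolved download path onto the file data and always rewrites the serialized JSON, rather than only when the file size changed, so `local_download_path` is reliably persisted for the uncompress step. A minimal sketch of that persistence pattern, with illustrative paths and payload:

# Illustrative sketch of the unconditional rewrite: after every download the
# file-data JSON is updated on disk, so local_download_path is always persisted.
import json
from pathlib import Path

file_data_path = Path("/tmp/file_data/abc123.json")  # hypothetical location
file_data = {
    "identifier": "abc123",
    "local_download_path": "/tmp/downloads/report.zip",
}

file_data_path.parent.mkdir(parents=True, exist_ok=True)
with file_data_path.open("w") as file:
    json.dump(file_data, file, indent=2)  # written every time, not just on size change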

unstructured_ingest/v2/pipeline/steps/uncompress.py

Lines changed: 3 additions & 16 deletions
@@ -1,4 +1,5 @@
 import asyncio
+from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, TypedDict
 
@@ -15,6 +16,7 @@ class UncompressStepResponse(TypedDict):
     path: str
 
 
+@dataclass
 class UncompressStep(PipelineStep):
     process: Uncompressor
     identifier: str = STEP_ID
@@ -23,21 +25,6 @@ def __post_init__(self):
         config = self.process.config.json() if self.process.config else None
         logger.info(f"Created {self.identifier} with configs: {config}")
 
-    def _run(self, path: str, file_data_path: str) -> list[UncompressStepResponse]:
-        file_data = FileData.from_file(path=file_data_path)
-        new_file_data = self.process.run(file_data=file_data)
-        responses = []
-        for new_file in new_file_data:
-            new_file_data_path = Path(file_data_path).parent / f"{new_file.identifier}.json"
-            new_file.to_file(path=str(new_file_data_path.resolve()))
-            responses.append(
-                UncompressStepResponse(
-                    path=new_file.source_identifiers.fullpath,
-                    file_data_path=str(new_file_data_path),
-                )
-            )
-        return responses
-
     async def _run_async(
         self, fn: Callable, path: str, file_data_path: str
     ) -> list[UncompressStepResponse]:
@@ -56,7 +43,7 @@ async def _run_async(
             new_file.to_file(path=str(new_file_data_path.resolve()))
             responses.append(
                 UncompressStepResponse(
-                    path=new_file.source_identifiers.fullpath,
+                    path=new_file.local_download_path,
                     file_data_path=str(new_file_data_path),
                 )
            )
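Besides dropping the synchronous `_run` duplicate in favor of the async path, `UncompressStep` gains `@dataclass`, which is what turns its annotated `process` and `identifier` attributes into real constructor fields. A minimal illustration with stand-in classes (not the repo's types):

# Why the decorator matters: on an undecorated subclass of a dataclass, an
# annotated attribute stays a plain class attribute and never reaches __init__.
from dataclasses import dataclass


@dataclass
class Base:
    identifier: str = "base"


class WithoutDecorator(Base):
    process: str = "uncompress"  # plain class attribute, not an init field


@dataclass
class WithDecorator(Base):
    process: str = "uncompress"  # real dataclass field


print(WithoutDecorator())             # WithoutDecorator(identifier='base')
print(WithDecorator(process="gzip"))  # WithDecorator(identifier='base', process='gzip')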

unstructured_ingest/v2/processes/uncompress.py

Lines changed: 25 additions & 7 deletions
@@ -3,12 +3,14 @@
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any
+from uuid import NAMESPACE_DNS, uuid5
 
 from pydantic import BaseModel
 
 from unstructured_ingest.utils.compression import TAR_FILE_EXT, ZIP_FILE_EXT, uncompress_file
-from unstructured_ingest.v2.interfaces import FileData
+from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
 from unstructured_ingest.v2.interfaces.process import BaseProcess
+from unstructured_ingest.v2.logger import logger
 
 
 class UncompressConfig(BaseModel):
@@ -23,19 +25,35 @@ def is_async(self) -> bool:
         return True
 
     def run(self, file_data: FileData, **kwargs: Any) -> list[FileData]:
-        local_filepath = Path(file_data.source_identifiers.fullpath)
+        local_filepath = Path(file_data.local_download_path)
         if local_filepath.suffix not in TAR_FILE_EXT + ZIP_FILE_EXT:
             return [file_data]
         new_path = uncompress_file(filename=str(local_filepath))
         new_files = [i for i in Path(new_path).rglob("*") if i.is_file()]
         responses = []
+        logger.debug(
+            "uncompressed {} files from original file {}: {}".format(
+                len(new_files), local_filepath, ", ".join([str(f) for f in new_files])
+            )
+        )
         for f in new_files:
             new_file_data = copy(file_data)
-            new_file_data.source_identifiers.fullpath = str(f)
-            if new_file_data.source_identifiers.rel_path:
-                new_file_data.source_identifiers.rel_path = str(f).replace(
-                    str(local_filepath.parent), ""
-                )[1:]
+            new_file_data.identifier = str(uuid5(NAMESPACE_DNS, str(f)))
+            new_file_data.local_download_path = str(f.resolve())
+            new_rel_download_path = str(f).replace(str(Path(local_filepath.parent)), "")[1:]
+            new_file_data.source_identifiers = SourceIdentifiers(
+                filename=f.name,
+                fullpath=file_data.source_identifiers.fullpath.replace(
+                    file_data.source_identifiers.filename, new_rel_download_path
+                ),
+                rel_path=(
+                    file_data.source_identifiers.rel_path.replace(
+                        file_data.source_identifiers.filename, new_rel_download_path
+                    )
+                    if file_data.source_identifiers.rel_path
+                    else None
+                ),
+            )
             responses.append(new_file_data)
         return responses
 
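A worked sketch of the new identifier and path rewriting above, using throwaway paths rather than repo code: `uuid5` makes each extracted file's identifier deterministic, and the extracted file's relative path replaces the archive's filename inside the original fullpath.

# Worked sketch of the rewriting logic with throwaway paths (not repo code).
from pathlib import Path
from uuid import NAMESPACE_DNS, uuid5

local_filepath = Path("/tmp/downloads/archive.zip")
extracted = Path("/tmp/downloads/archive/inner/doc.pdf")  # one file from the archive

# Deterministic: the same extracted path always maps to the same identifier.
identifier = str(uuid5(NAMESPACE_DNS, str(extracted)))

# Relative path of the extracted file under the archive's parent directory,
# mirroring: str(f).replace(str(Path(local_filepath.parent)), "")[1:]
new_rel_download_path = str(extracted).replace(str(local_filepath.parent), "")[1:]
print(new_rel_download_path)  # archive/inner/doc.pdf

# The original fullpath has the archive filename swapped for the relative path:
fullpath = "s3://bucket/archive.zip".replace("archive.zip", new_rel_download_path)
print(fullpath)  # s3://bucket/archive/inner/doc.pdf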
