Commit a6c09ec

Roman/dry ingest pipeline step (#3203)
### Description

The main goal of this change was to reduce the duplicate code being written in each ingest pipeline step to support both async and non-async functionality. Additional bugs found and fixed:

* Each ingest logger wasn't being instantiated correctly. Loggers are now instantiated at the beginning of a pipeline run, as soon as the verbosity level can be determined.
* The `requires_dependencies` wrapper wasn't wrapping async functions correctly. This was fixed so that `asyncio.iscoroutinefunction()` reports the wrapped functions correctly.
1 parent 29e64eb commit a6c09ec

14 files changed (+128, -154 lines)

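For context on the second fix in the description: when a decorator wraps an `async def` in a plain `def`, the result no longer registers as a coroutine function, so downstream `asyncio.iscoroutinefunction()` checks (like the ones added in the step diffs below) take the sync branch by mistake. A minimal sketch of the usual fix, with a simplified `deps` argument standing in for the real decorator's signature:

```python
import asyncio
import functools
from typing import Callable


def requires_dependencies(deps: list[str]) -> Callable:
    # Simplified illustration: the real decorator in unstructured also
    # reports which pip extras to install when an import fails.
    def decorator(func: Callable) -> Callable:
        def check_deps() -> None:
            for dep in deps:
                __import__(dep)  # raises ImportError if the dependency is missing

        if asyncio.iscoroutinefunction(func):
            # Wrap coroutine functions in an `async def` so the decorated
            # function is still detected by asyncio.iscoroutinefunction().
            @functools.wraps(func)
            async def async_wrapper(*args, **kwargs):
                check_deps()
                return await func(*args, **kwargs)

            return async_wrapper

        @functools.wraps(func)
        def sync_wrapper(*args, **kwargs):
            check_deps()
            return func(*args, **kwargs)

        return sync_wrapper

    return decorator
```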

Diff for: CHANGELOG.md (+1 -1)

```diff
@@ -1,4 +1,4 @@
-## 0.14.6-dev6
+## 0.14.6-dev7
 
 ### Enhancements
 
```

Diff for: unstructured/__version__.py (+1 -1)

```diff
@@ -1 +1 @@
-__version__ = "0.14.6-dev6"  # pragma: no cover
+__version__ = "0.14.6-dev7"  # pragma: no cover
```

Diff for: unstructured/ingest/v2/example.py (+3 -1)

```diff
@@ -24,7 +24,9 @@
 if __name__ == "__main__":
     logger.info(f"Writing all content in: {work_dir.resolve()}")
     Pipeline.from_configs(
-        context=ProcessorConfig(work_dir=str(work_dir.resolve()), tqdm=True),
+        context=ProcessorConfig(
+            work_dir=str(work_dir.resolve()), tqdm=True, reprocess=True, verbose=True
+        ),
         indexer_config=S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/small-pdf-set/"),
         downloader_config=S3DownloaderConfig(download_dir=download_path),
         source_connection_config=S3ConnectionConfig(anonymous=True),
```

Diff for: unstructured/ingest/v2/logger.py (+4 -2)

```diff
@@ -84,7 +84,8 @@ def redact_jsons(s: str) -> str:
         try:
             formatted_j = json.dumps(json.loads(j))
         except json.JSONDecodeError:
-            formatted_j = json.dumps(ast.literal_eval(j))
+            lit = ast.literal_eval(j)
+            formatted_j = json.dumps(lit)
         hidden_j = json.dumps(hide_sensitive_fields(json.loads(formatted_j)))
         s = s.replace(j, hidden_j)
     return s
@@ -112,7 +113,8 @@ def make_default_logger(level: int) -> Logger:
     handler.name = "ingest_log_handler"
     formatter = SensitiveFormatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s")
     handler.setFormatter(formatter)
-    logger.addHandler(handler)
+    if handler.name not in [h.name for h in logger.handlers]:
+        logger.addHandler(handler)
     logger.setLevel(level)
     remove_root_handlers(logger)
     return logger
```
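
With the handler-name guard above, `make_default_logger` can be called more than once per process (once when the pipeline starts, and again via `_set_log_level` in spawned workers, per the diffs below) without stacking duplicate handlers. A rough usage sketch of the behavior the guard is meant to guarantee:

```python
import logging

from unstructured.ingest.v2.logger import make_default_logger

# The second call should only update the level, not attach a second handler.
log = make_default_logger(logging.INFO)
log = make_default_logger(logging.DEBUG)

assert [h.name for h in log.handlers].count("ingest_log_handler") == 1
```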

Diff for: unstructured/ingest/v2/pipeline/interfaces.py (+12 -10)

```diff
@@ -6,13 +6,13 @@
 from functools import wraps
 from pathlib import Path
 from time import time
-from typing import Any, Optional, TypeVar
+from typing import Any, Callable, Optional, TypeVar
 
 from tqdm import tqdm
 from tqdm.asyncio import tqdm as tqdm_asyncio
 
 from unstructured.ingest.v2.interfaces import BaseProcess, ProcessorConfig
-from unstructured.ingest.v2.logger import logger
+from unstructured.ingest.v2.logger import logger, make_default_logger
 
 BaseProcessT = TypeVar("BaseProcessT", bound=BaseProcess)
 iterable_input = list[dict[str, Any]]
@@ -98,7 +98,7 @@ def _wrap_mp(self, input_kwargs: dict) -> Any:
 
     def _set_log_level(self, log_level: int):
         # Set the log level for each spawned process when using multiprocessing pool
-        logger.setLevel(log_level)
+        make_default_logger(log_level)
 
     @timed
     def __call__(self, iterable: Optional[iterable_input] = None) -> Any:
@@ -113,15 +113,16 @@ def __call__(self, iterable: Optional[iterable_input] = None) -> Any:
             return self.process_async(iterable=iterable)
         return self.process_multiprocess(iterable=iterable)
 
-    def _run(self, *args, **kwargs: Any) -> Optional[Any]:
-        raise NotImplementedError
+    def _run(self, fn: Callable, **kwargs: Any) -> Optional[Any]:
+        return asyncio.run(self.run_async(_fn=fn, **kwargs))
 
-    async def _run_async(self, *args, **kwargs: Any) -> Optional[Any]:
+    async def _run_async(self, fn: Callable, **kwargs: Any) -> Optional[Any]:
         raise NotImplementedError
 
-    def run(self, *args, **kwargs: Any) -> Optional[Any]:
+    def run(self, _fn: Optional[Callable] = None, **kwargs: Any) -> Optional[Any]:
         try:
-            return self._run(*args, **kwargs)
+            fn = _fn or self.process.run
+            return self._run(fn=fn, **kwargs)
         except Exception as e:
             logger.error(f"Exception raised while running {self.identifier}", exc_info=e)
             if "file_data_path" in kwargs:
@@ -130,9 +131,10 @@ def run(self, *args, **kwargs: Any) -> Optional[Any]:
             raise e
         return None
 
-    async def run_async(self, *args, **kwargs: Any) -> Optional[Any]:
+    async def run_async(self, _fn: Optional[Callable] = None, **kwargs: Any) -> Optional[Any]:
         try:
-            return await self._run_async(*args, **kwargs)
+            fn = _fn or self.process.run_async
+            return await self._run_async(fn=fn, **kwargs)
         except Exception as e:
             logger.error(f"Exception raised while running {self.identifier}", exc_info=e)
             if "file_data_path" in kwargs:
```

Diff for: unstructured/ingest/v2/pipeline/pipeline.py (+2 -2)

```diff
@@ -5,7 +5,7 @@
 from typing import Any, Optional, Union
 
 from unstructured.ingest.v2.interfaces import ProcessorConfig
-from unstructured.ingest.v2.logger import logger
+from unstructured.ingest.v2.logger import logger, make_default_logger
 from unstructured.ingest.v2.pipeline.steps.chunk import Chunker, ChunkStep
 from unstructured.ingest.v2.pipeline.steps.download import DownloaderT, DownloadStep
 from unstructured.ingest.v2.pipeline.steps.embed import Embedder, EmbedStep
@@ -59,7 +59,7 @@ def __post_init__(
         stager: UploadStager = None,
         uploader: Uploader = None,
     ):
-        logger.setLevel(level=logging.DEBUG if self.context.verbose else logging.INFO)
+        make_default_logger(level=logging.DEBUG if self.context.verbose else logging.INFO)
         self.indexer_step = IndexStep(process=indexer, context=self.context)
         self.downloader_step = DownloadStep(process=downloader, context=self.context)
         self.partitioner_step = PartitionStep(process=partitioner, context=self.context)
```

Diff for: unstructured/ingest/v2/pipeline/steps/chunk.py (+11 -19)

```diff
@@ -1,8 +1,9 @@
+import asyncio
 import hashlib
 import json
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Optional, TypedDict
+from typing import Callable, Optional, TypedDict
 
 from unstructured.ingest.v2.interfaces import FileData
 from unstructured.ingest.v2.logger import logger
@@ -53,32 +54,23 @@ def _save_output(self, output_filepath: str, chunked_content: list[dict]):
             logger.debug(f"Writing chunker output to: {output_filepath}")
             json.dump(chunked_content, f, indent=2)
 
-    def _run(self, path: str, file_data_path: str) -> ChunkStepResponse:
+    async def _run_async(
+        self, fn: Callable, path: str, file_data_path: str, **kwargs
+    ) -> ChunkStepResponse:
         path = Path(path)
         file_data = FileData.from_file(path=file_data_path)
         output_filepath = self.get_output_filepath(filename=path)
         if not self.should_chunk(filepath=output_filepath, file_data=file_data):
             logger.debug(f"Skipping chunking, output already exists: {output_filepath}")
             return ChunkStepResponse(file_data_path=file_data_path, path=str(output_filepath))
-        chunked_content_raw = self.process.run(elements_filepath=path)
-        self._save_output(
-            output_filepath=str(output_filepath),
-            chunked_content=elements_to_dicts(chunked_content_raw),
-        )
-        return ChunkStepResponse(file_data_path=file_data_path, path=str(output_filepath))
-
-    async def _run_async(self, path: str, file_data_path: str) -> ChunkStepResponse:
-        path = Path(path)
-        file_data = FileData.from_file(path=file_data_path)
-        output_filepath = self.get_output_filepath(filename=path)
-        if not self.should_chunk(filepath=output_filepath, file_data=file_data):
-            logger.debug(f"Skipping chunking, output already exists: {output_filepath}")
-            return ChunkStepResponse(file_data_path=file_data_path, path=str(output_filepath))
-        if semaphore := self.context.semaphore:
+        fn_kwargs = {"elements_filepath": path}
+        if not asyncio.iscoroutinefunction(fn):
+            chunked_content_raw = fn(**fn_kwargs)
+        elif semaphore := self.context.semaphore:
             async with semaphore:
-                chunked_content_raw = await self.process.run_async(elements_filepath=path)
+                chunked_content_raw = await fn(**fn_kwargs)
         else:
-            chunked_content_raw = await self.process.run_async(elements_filepath=path)
+            chunked_content_raw = await fn(**fn_kwargs)
         self._save_output(
             output_filepath=str(output_filepath),
             chunked_content=elements_to_dicts(chunked_content_raw),
```

Diff for: unstructured/ingest/v2/pipeline/steps/download.py (+21 -31)

```diff
@@ -1,7 +1,8 @@
+import asyncio
 import hashlib
 import json
 from dataclasses import dataclass
-from typing import Optional, TypedDict, TypeVar
+from typing import Callable, Optional, TypedDict, TypeVar
 
 from unstructured.ingest.v2.interfaces import FileData, download_responses
 from unstructured.ingest.v2.interfaces.downloader import Downloader
@@ -55,7 +56,7 @@ def should_download(self, file_data: FileData, file_data_path: str) -> bool:
         if self.context.re_download:
             return True
         download_path = self.process.get_download_path(file_data=file_data)
-        if not download_path.exists():
+        if not download_path or not download_path.exists():
             return True
         if (
             download_path.is_file()
@@ -69,6 +70,24 @@ def should_download(self, file_data: FileData, file_data_path: str) -> bool:
             return True
         return False
 
+    async def _run_async(self, fn: Callable, file_data_path: str) -> list[DownloadStepResponse]:
+        file_data = FileData.from_file(path=file_data_path)
+        download_path = self.process.get_download_path(file_data=file_data)
+        if not self.should_download(file_data=file_data, file_data_path=file_data_path):
+            logger.debug(f"Skipping download, file already exists locally: {download_path}")
+            return [DownloadStepResponse(file_data_path=file_data_path, path=str(download_path))]
+        fn_kwargs = {"file_data": file_data}
+        if not asyncio.iscoroutinefunction(fn):
+            download_results = fn(**fn_kwargs)
+        elif semaphore := self.context.semaphore:
+            async with semaphore:
+                download_results = await fn(**fn_kwargs)
+        else:
+            download_results = await fn(**fn_kwargs)
+        return self.create_step_results(
+            current_file_data_path=file_data_path, download_results=download_results
+        )
+
     def create_step_results(
         self, current_file_data_path: str, download_results: download_responses
     ) -> list[DownloadStepResponse]:
@@ -87,35 +106,6 @@ def create_step_results(
         )
         return download_step_results
 
-    def _run(self, file_data_path: str) -> list[DownloadStepResponse]:
-        file_data = FileData.from_file(path=file_data_path)
-        download_path = self.process.get_download_path(file_data=file_data)
-        if not self.should_download(file_data=file_data, file_data_path=file_data_path):
-            logger.debug(f"Skipping download, file already exists locally: {download_path}")
-            return [DownloadStepResponse(file_data_path=file_data_path, path=str(download_path))]
-
-        download_results = self.process.run(file_data=file_data)
-        return self.create_step_results(
-            current_file_data_path=file_data_path, download_results=download_results
-        )
-
-    async def _run_async(self, file_data_path: str) -> list[DownloadStepResponse]:
-        file_data = FileData.from_file(path=file_data_path)
-        download_path = self.process.get_download_path(file_data=file_data)
-        if download_path and not self.should_download(
-            file_data=file_data, file_data_path=file_data_path
-        ):
-            logger.debug(f"Skipping download, file already exists locally: {download_path}")
-            return [DownloadStepResponse(file_data_path=file_data_path, path=str(download_path))]
-        if semaphore := self.context.semaphore:
-            async with semaphore:
-                download_results = await self.process.run_async(file_data=file_data)
-        else:
-            download_results = await self.process.run_async(file_data=file_data)
-        return self.create_step_results(
-            current_file_data_path=file_data_path, download_results=download_results
-        )
-
     def persist_new_file_data(self, file_data: FileData) -> str:
         record_hash = self.get_hash(extras=[file_data.identifier])
         filename = f"{record_hash}.json"
```

Diff for: unstructured/ingest/v2/pipeline/steps/embed.py (+9 -20)

```diff
@@ -1,8 +1,9 @@
+import asyncio
 import hashlib
 import json
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Optional, TypedDict
+from typing import Callable, Optional, TypedDict
 
 from unstructured.ingest.v2.interfaces import FileData
 from unstructured.ingest.v2.logger import logger
@@ -53,33 +54,21 @@ def _save_output(self, output_filepath: str, embedded_content: list[dict]):
             logger.debug(f"Writing embedded output to: {output_filepath}")
             json.dump(embedded_content, f, indent=2)
 
-    def _run(self, path: str, file_data_path: str) -> EmbedStepResponse:
-        path = Path(path)
-        file_data = FileData.from_file(path=file_data_path)
-
-        output_filepath = self.get_output_filepath(filename=path)
-        if not self.should_embed(filepath=output_filepath, file_data=file_data):
-            logger.debug(f"Skipping embedding, output already exists: {output_filepath}")
-            return EmbedStepResponse(file_data_path=file_data_path, path=str(output_filepath))
-        embed_content_raw = self.process.run(elements_filepath=path)
-        self._save_output(
-            output_filepath=str(output_filepath),
-            embedded_content=elements_to_dicts(embed_content_raw),
-        )
-        return EmbedStepResponse(file_data_path=file_data_path, path=str(output_filepath))
-
-    async def _run_async(self, path: str, file_data_path: str) -> EmbedStepResponse:
+    async def _run_async(self, fn: Callable, path: str, file_data_path: str) -> EmbedStepResponse:
         path = Path(path)
         file_data = FileData.from_file(path=file_data_path)
         output_filepath = self.get_output_filepath(filename=path)
         if not self.should_embed(filepath=output_filepath, file_data=file_data):
             logger.debug(f"Skipping embedding, output already exists: {output_filepath}")
             return EmbedStepResponse(file_data_path=file_data_path, path=str(output_filepath))
-        if semaphore := self.context.semaphore:
+        fn_kwargs = {"elements_filepath": path}
+        if not asyncio.iscoroutinefunction(fn):
+            embed_content_raw = fn(**fn_kwargs)
+        elif semaphore := self.context.semaphore:
             async with semaphore:
-                embed_content_raw = await self.process.run_async(elements_filepath=path)
+                embed_content_raw = await fn(**fn_kwargs)
         else:
-            embed_content_raw = await self.process.run_async(elements_filepath=path)
+            embed_content_raw = await fn(**fn_kwargs)
 
         self._save_output(
             output_filepath=str(output_filepath),
```

Diff for: unstructured/ingest/v2/pipeline/steps/partition.py (+11 -22)

```diff
@@ -1,8 +1,9 @@
+import asyncio
 import hashlib
 import json
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Optional, TypedDict
+from typing import Callable, Optional, TypedDict
 
 from unstructured.ingest.v2.interfaces import FileData
 from unstructured.ingest.v2.logger import logger
@@ -48,35 +49,23 @@ def _save_output(self, output_filepath: str, partitioned_content: list[dict]):
             logger.debug(f"Writing partitioned output to: {output_filepath}")
             json.dump(partitioned_content, f, indent=2)
 
-    def _run(self, path: str, file_data_path: str) -> PartitionStepResponse:
+    async def _run_async(
+        self, fn: Callable, path: str, file_data_path: str
+    ) -> PartitionStepResponse:
         path = Path(path)
         file_data = FileData.from_file(path=file_data_path)
         output_filepath = self.get_output_filepath(filename=Path(file_data_path))
         if not self.should_partition(filepath=output_filepath, file_data=file_data):
             logger.debug(f"Skipping partitioning, output already exists: {output_filepath}")
             return PartitionStepResponse(file_data_path=file_data_path, path=str(output_filepath))
-        partitioned_content = self.process.run(filename=path, metadata=file_data.metadata)
-        self._save_output(
-            output_filepath=str(output_filepath), partitioned_content=partitioned_content
-        )
-        return PartitionStepResponse(file_data_path=file_data_path, path=str(output_filepath))
-
-    async def _run_async(self, path: str, file_data_path: str) -> PartitionStepResponse:
-        path = Path(path)
-        file_data = FileData.from_file(path=file_data_path)
-        output_filepath = self.get_output_filepath(filename=Path(file_data_path))
-        if not self.should_partition(filepath=output_filepath, file_data=file_data):
-            logger.debug(f"Skipping partitioning, output already exists: {output_filepath}")
-            return PartitionStepResponse(file_data_path=file_data_path, path=str(output_filepath))
-        if semaphore := self.context.semaphore:
+        fn_kwargs = {"filename": path, "metadata": file_data.metadata}
+        if not asyncio.iscoroutinefunction(fn):
+            partitioned_content = fn(**fn_kwargs)
+        elif semaphore := self.context.semaphore:
             async with semaphore:
-                partitioned_content = await self.process.run_async(
-                    filename=path, metadata=file_data.metadata
-                )
+                partitioned_content = await fn(**fn_kwargs)
         else:
-            partitioned_content = await self.process.run_async(
-                filename=path, metadata=file_data.metadata
-            )
+            partitioned_content = await fn(**fn_kwargs)
         self._save_output(
             output_filepath=str(output_filepath), partitioned_content=partitioned_content
         )
```
