|
1 | 1 | import multiprocessing as mp
|
2 | 2 | import os
|
| 3 | +import random |
| 4 | +import string |
| 5 | + |
| 6 | +import click |
| 7 | + |
3 | 8 | from unstructured.ingest.connector.s3_connector import S3Connector, SimpleS3Config
|
4 |
| -from unstructured.ingest.doc_processor.generalized import process_document |
| 9 | +from unstructured.ingest.doc_processor.generalized import initialize, process_document |
5 | 10 |
|
class MainProcess:
    """Fetch docs from a connector and process each one in a multiprocessing pool.

    Parameters
    ----------
    doc_connector : connector exposing initialize()/fetch_docs()/cleanup()
    doc_processor_fn : callable applied to each fetched doc (must be picklable)
    num_processes : int, size of the multiprocessing pool
    reprocess : bool, when False skip docs whose structured output already exists
    """

    def __init__(self, doc_connector, doc_processor_fn, num_processes, reprocess):
        # initialize the reader and writer
        self.doc_connector = doc_connector
        self.doc_processor_fn = doc_processor_fn
        self.num_processes = num_processes
        self.reprocess = reprocess

    def initialize(self):
        """Slower initialization things: check connections, load things into memory, etc."""
        initialize()

    def cleanup(self):
        """Delegate teardown to the connector (e.g. remove temp downloads)."""
        self.doc_connector.cleanup()

    def _filter_docs_with_outputs(self, docs):
        """Return only docs lacking structured output; None when nothing is left to do."""
        num_docs_all = len(docs)
        docs = [doc for doc in docs if not doc.has_output()]
        num_docs_to_process = len(docs)
        if num_docs_to_process == 0:
            print("All docs have structured outputs, nothing to do. Use --reprocess to process all.")
            return None
        elif num_docs_to_process != num_docs_all:
            print(f"Skipping processing for {num_docs_all - num_docs_to_process} docs out of "
                  f"{num_docs_all} since their structured outputs already exist, use --reprocess to "
                  "reprocess those in addition to the unprocessed ones.")
        return docs

    def run(self):
        """End-to-end pipeline: initialize, fetch, filter, process in parallel, clean up."""
        self.initialize()

        # fetch the list of lazy downloading IngestDoc obj's
        # (fetched exactly once -- an earlier revision called fetch_docs() twice,
        # discarding the first result)
        docs = self.doc_connector.fetch_docs()

        # remove docs that have already been processed
        if not self.reprocess:
            docs = self._filter_docs_with_outputs(docs)
            if not docs:
                return

        # Debugging tip: use the below line and comment out the mp.Pool loop
        # block to remain in single process
        # self.doc_processor_fn(docs[0])

        with mp.Pool(processes=self.num_processes) as pool:
            pool.map(self.doc_processor_fn, docs)

        self.cleanup()
|
38 | 62 |
|
@click.command()
@click.option('--s3-url', default="s3://utic-dev-tech-fixtures/small-pdf-set/",
              help="Prefix of s3 objects (files) to download. E.g. s3://bucket1/path/. This value may also be a single file.")
@click.option('--re-download/--no-re-download', default=False,
              help="Re-download files from s3 even if they are already present in --download-dir.")
@click.option('--download-dir',
              help="Where s3 files are downloaded to, defaults to tmp-ingest-<6 random chars>.")
@click.option('--preserve-downloads', is_flag=True, default=False,
              help="Preserve downloaded s3 files. Otherwise each file is removed after being processed successfully.")
@click.option('--structured-output-dir', default="structured-output",
              help="Where to place structured output .json files.")
@click.option('--reprocess', is_flag=True, default=False,
              help="Reprocess a downloaded file from s3 even if the relevant structured output .json file in --structured-output-dir already exists.")
@click.option('--num-processes', default=2, show_default=True,
              help="Number of parallel processes to process docs in.")
@click.option('--anonymous', is_flag=True, default=False,
              help="Connect to s3 without local AWS credentials.")
@click.option('-v', '--verbose', is_flag=True, default=False)
def main(s3_url, re_download, download_dir, preserve_downloads, structured_output_dir,
         reprocess, num_processes, anonymous, verbose):
    """Ingest files from s3 and write structured .json outputs via MainProcess."""
    if not preserve_downloads and download_dir:
        # downloads land in a user-chosen dir but are deleted after processing
        print("Warning: not preserving downloaded s3 files but --download-dir is specified")
    if not download_dir:
        # unique-ish scratch dir so parallel invocations don't collide
        download_dir = "tmp-ingest-" + "".join(
            random.choice(string.ascii_letters) for _ in range(6)
        )
    doc_connector = S3Connector(
        config=SimpleS3Config(
            download_dir=download_dir,
            s3_url=s3_url,
            output_dir=structured_output_dir,
            # --anonymous skips local AWS creds (not needed for the default public s3 url)
            anonymous=anonymous,
            re_download=re_download,
            preserve_downloads=preserve_downloads,
            verbose=verbose,
        ),
    )
    MainProcess(doc_connector=doc_connector,
                doc_processor_fn=process_document,
                num_processes=num_processes,
                reprocess=reprocess,
                ).run()
52 | 106 |
|
| 107 | + |
# Script entry point: run the click command when executed directly.
if __name__ == "__main__":
    main()
0 commit comments