Skip to content

Commit c16862e

Browse files
authored
feat: add --metadata-include and --metadata-exclude parameters to unstructured-ingest (#368)
* added metadata in/exclude params
* updated process_file
* existing tests
* remove default behavior
* changelog and ci
* line length
* import
* import
* import sorted
* import
* type
* line length
* main
* ci
* json
* dict
* type ignore
* lint
* unit tests for process_file
* lint
* type changed to Optional(str)
* ci
* line length
* added mutex check
* nit
1 parent d5a0fce commit c16862e

17 files changed

+175
-8
lines changed

Diff for: CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
### Features
88

9+
* Add `--metadata-include` and `--metadata-exclude` parameters to `unstructured-ingest`
910
* Add `clean_non_ascii_chars` to remove non-ascii characters from unicode string
1011

1112
### Fixes

Diff for: test_unstructured_ingest/test-ingest-azure.sh

+1
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
44
cd "$SCRIPT_DIR"/.. || exit 1
55

66
PYTHONPATH=. ./unstructured/ingest/main.py \
7+
--metadata-exclude filename \
78
--remote-url abfs://container1/ \
89
--azure-account-name azureunstructured1 \
910
--structured-output-dir azure-ingest-output \

Diff for: test_unstructured_ingest/test-ingest-biomed-api.sh

+1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ if [[ "$(find test_unstructured_ingest/expected-structured-output/biomed-ingest-
1010
fi
1111

1212
PYTHONPATH=. ./unstructured/ingest/main.py \
13+
--metadata-exclude filename \
1314
--biomed-api-from "2019-01-02" \
1415
--biomed-api-until "2019-01-02+00:03:10" \
1516
--structured-output-dir biomed-ingest-output-api \

Diff for: test_unstructured_ingest/test-ingest-biomed-path.sh

+1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ if [[ "$(find test_unstructured_ingest/expected-structured-output/biomed-ingest-
1010
fi
1111

1212
PYTHONPATH=. ./unstructured/ingest/main.py \
13+
--metadata-exclude filename \
1314
--biomed-path "oa_pdf/07/07/sbaa031.073.PMC7234218.pdf" \
1415
--structured-output-dir biomed-ingest-output-path \
1516
--num-processes 2 \

Diff for: test_unstructured_ingest/test-ingest-github.sh

+6-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,12 @@ if [[ "$CI" == "true" ]]; then
1212
fi
1313

1414

15-
PYTHONPATH=. ./unstructured/ingest/main.py --github-url dcneiner/Downloadify --git-file-glob '*.html,*.txt' --structured-output-dir github-downloadify-output --verbose
15+
PYTHONPATH=. ./unstructured/ingest/main.py \
16+
--metadata-exclude filename \
17+
--github-url dcneiner/Downloadify \
18+
--git-file-glob '*.html,*.txt' \
19+
--structured-output-dir github-downloadify-output \
20+
--verbose
1621

1722
if ! diff -ru test_unstructured_ingest/expected-structured-output/github-downloadify github-downloadify-output ; then
1823
echo

Diff for: test_unstructured_ingest/test-ingest-gitlab.sh

+1
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
44
cd "$SCRIPT_DIR"/.. || exit 1
55

66
PYTHONPATH=. ./unstructured/ingest/main.py \
7+
--metadata-exclude filename \
78
--gitlab-url https://gitlab.com/gitlab-com/content-sites/docsy-gitlab \
89
--git-file-glob '*.md,*.txt' \
910
--structured-output-dir gitlab-ingest-output \

Diff for: test_unstructured_ingest/test-ingest-s3.sh

+5-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,11 @@ if [[ "$(find test_unstructured_ingest/expected-structured-output/s3-small-batch
99
exit 1
1010
fi
1111

12-
PYTHONPATH=. ./unstructured/ingest/main.py --s3-url s3://utic-dev-tech-fixtures/small-pdf-set/ --s3-anonymous --structured-output-dir s3-small-batch-output
12+
# Run the ingest CLI anonymously on the small-pdf-set S3 prefix, with
# `filename` excluded from each element's metadata. Structured output goes to
# s3-small-batch-output.
# The final argument deliberately has no trailing backslash: a dangling line
# continuation would splice whatever line follows into this command.
PYTHONPATH=. ./unstructured/ingest/main.py \
    --metadata-exclude filename \
    --s3-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
    --s3-anonymous \
    --structured-output-dir s3-small-batch-output
1317

1418
if ! diff -ru test_unstructured_ingest/expected-structured-output/s3-small-batch s3-small-batch-output ; then
1519
echo

Diff for: test_unstructured_ingest/test-ingest-wikipedia.sh

+1
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
44
cd "$SCRIPT_DIR"/.. || exit 1
55

66
PYTHONPATH=. ./unstructured/ingest/main.py \
7+
--metadata-exclude filename \
78
--wikipedia-page-title "Open Source Software" \
89
--structured-output-dir wikipedia-ingest-output \
910
--num-processes 2 \

Diff for: test_unstructured_ingest/test_interfaces.py

+78
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
import os
2+
import pathlib
3+
4+
import pytest
5+
6+
from unstructured.ingest.connector.git import GitIngestDoc, SimpleGitConfig
7+
8+
DIRECTORY = pathlib.Path(__file__).parent.resolve()
9+
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "example-docs")
10+
11+
test_files = [
12+
"layout-parser-paper-fast.jpg",
13+
"layout-parser-paper-fast.pdf",
14+
]
15+
16+
17+
@pytest.mark.parametrize("filename", test_files)
def test_process_file_include_filename(filename: str):
    """With `metadata_include="filename"`, no other metadata key may remain."""
    config = SimpleGitConfig(
        download_dir=EXAMPLE_DOCS_DIRECTORY,
        metadata_include="filename",
    )
    doc = GitIngestDoc(path=filename, config=config)

    for element in doc.process_file():
        # Any key other than "filename" means the include filter leaked.
        unexpected = [key for key in element["metadata"] if key != "filename"]
        assert not unexpected
31+
32+
33+
@pytest.mark.parametrize("filename", test_files)
def test_process_file_include_filename_pagenum(filename: str):
    """With two included keys, only those two may appear in the metadata."""
    allowed = ["filename", "page_number"]
    config = SimpleGitConfig(
        download_dir=EXAMPLE_DOCS_DIRECTORY,
        metadata_include="filename,page_number",
    )
    doc = GitIngestDoc(path=filename, config=config)

    for element in doc.process_file():
        assert all(key in allowed for key in element["metadata"])
47+
48+
49+
@pytest.mark.parametrize("filename", test_files)
def test_process_file_exclude_filename(filename: str):
    """With `metadata_exclude="filename"`, that key must be absent."""
    config = SimpleGitConfig(
        download_dir=EXAMPLE_DOCS_DIRECTORY,
        metadata_exclude="filename",
    )
    doc = GitIngestDoc(path=filename, config=config)

    for element in doc.process_file():
        assert all(key != "filename" for key in element["metadata"])
63+
64+
65+
@pytest.mark.parametrize("filename", test_files)
def test_process_file_exclude_filename_pagenum(filename: str):
    """With two excluded keys, neither may appear in the metadata."""
    excluded = ["filename", "page_number"]
    config = SimpleGitConfig(
        download_dir=EXAMPLE_DOCS_DIRECTORY,
        metadata_exclude="filename,page_number",
    )
    doc = GitIngestDoc(path=filename, config=config)

    for element in doc.process_file():
        leaked = [key for key in element["metadata"] if key in excluded]
        assert not leaked

Diff for: unstructured/ingest/connector/biomed.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from datetime import datetime
66
from ftplib import FTP, error_perm
77
from pathlib import Path
8-
from typing import List, Union
8+
from typing import List, Optional, Union
99

1010
import requests
1111
from bs4 import BeautifulSoup
@@ -48,6 +48,8 @@ class SimpleBiomedConfig(BaseConnectorConfig):
4848
output_dir: str
4949
re_download: bool = False
5050
preserve_downloads: bool = False
51+
metadata_include: Optional[str] = None
52+
metadata_exclude: Optional[str] = None
5153

5254
def _validate_date_args(self, date):
5355
date_formats = ["%Y-%m-%d", "%Y-%m-%d+%H:%M:%S"]

Diff for: unstructured/ingest/connector/fsspec.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import re
44
from dataclasses import dataclass, field
55
from pathlib import Path
6-
from typing import Type
6+
from typing import Optional, Type
77

88
from unstructured.ingest.interfaces import (
99
BaseConnector,
@@ -29,6 +29,8 @@ class SimpleFsspecConfig(BaseConnectorConfig):
2929
output_dir: str
3030
preserve_downloads: bool = False
3131
re_download: bool = False
32+
metadata_include: Optional[str] = None
33+
metadata_exclude: Optional[str] = None
3234

3335
# fsspec specific options
3436
access_kwargs: dict = field(default_factory=dict)

Diff for: unstructured/ingest/connector/git.py

+2
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ class SimpleGitConfig(BaseConnectorConfig):
2626
output_dir: str
2727
preserve_downloads: bool = False
2828
re_download: bool = False
29+
metadata_include: Optional[str] = None
30+
metadata_exclude: Optional[str] = None
2931

3032
repo_path: str = field(init=False, repr=False)
3133

Diff for: unstructured/ingest/connector/google_drive.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from dataclasses import dataclass
55
from mimetypes import guess_extension
66
from pathlib import Path
7-
from typing import Dict
7+
from typing import Dict, Optional
88

99
from unstructured.file_utils.filetype import EXT_TO_FILETYPE
1010
from unstructured.file_utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES
@@ -77,6 +77,8 @@ class SimpleGoogleDriveConfig(BaseConnectorConfig):
7777
output_dir: str
7878
re_download: bool = False
7979
preserve_downloads: bool = False
80+
metadata_include: Optional[str] = None
81+
metadata_exclude: Optional[str] = None
8082

8183
recursive: bool = False
8284

Diff for: unstructured/ingest/connector/reddit.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import os
33
from dataclasses import dataclass, field
44
from pathlib import Path
5-
from typing import TYPE_CHECKING
5+
from typing import TYPE_CHECKING, Optional
66

77
from unstructured.ingest.interfaces import (
88
BaseConnector,
@@ -31,6 +31,8 @@ class SimpleRedditConfig(BaseConnectorConfig):
3131
output_dir: str
3232
preserve_downloads: bool = False
3333
re_download: bool = False
34+
metadata_include: Optional[str] = None
35+
metadata_exclude: Optional[str] = None
3436

3537
def __post_init__(self):
3638
if self.num_posts <= 0:

Diff for: unstructured/ingest/connector/wikipedia.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import os
33
from dataclasses import dataclass, field
44
from pathlib import Path
5-
from typing import TYPE_CHECKING
5+
from typing import TYPE_CHECKING, Optional
66

77
from unstructured.ingest.interfaces import (
88
BaseConnector,
@@ -26,6 +26,8 @@ class SimpleWikipediaConfig(BaseConnectorConfig):
2626
output_dir: str
2727
preserve_downloads: bool = False
2828
re_download: bool = False
29+
metadata_include: Optional[str] = None
30+
metadata_exclude: Optional[str] = None
2931

3032

3133
@dataclass

Diff for: unstructured/ingest/interfaces.py

+23-1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
through Unstructured."""
33

44
from abc import ABC, abstractmethod
5+
from typing import Optional
56

67
from unstructured.ingest.logger import logger
78
from unstructured.partition.auto import partition
@@ -47,6 +48,8 @@ class BaseConnectorConfig(ABC):
4748
# where to write structured data outputs
4849
output_dir: str
4950
re_download: bool = False
51+
metadata_include: Optional[str] = None
52+
metadata_exclude: Optional[str] = None
5053

5154

5255
class BaseIngestDoc(ABC):
@@ -58,6 +61,8 @@ class BaseIngestDoc(ABC):
5861
Crucially, it is not responsible for the actual processing of the raw document.
5962
"""
6063

64+
config: BaseConnectorConfig
65+
6166
@property
6267
@abstractmethod
6368
def filename(self):
@@ -94,7 +99,24 @@ def process_file(self):
9499
self.isd_elems_no_filename = []
95100
for elem in isd_elems:
96101
# type: ignore
97-
elem["metadata"].pop("filename", None) # type: ignore[attr-defined]
102+
# Filter this element's metadata according to the connector config.
# `metadata_include` and `metadata_exclude` are comma-separated key lists
# and may not both be set.
if (
    self.config.metadata_exclude is not None
    and self.config.metadata_include is not None
):
    raise ValueError(
        "Arguments `--metadata-include` and `--metadata-exclude` are "
        "mutually exclusive with each other.",
    )
elif self.config.metadata_exclude is not None:
    # Drop each listed key; keys not present in the metadata are ignored.
    for ex in self.config.metadata_exclude.split(","):
        elem["metadata"].pop(ex, None)  # type: ignore[attr-defined]
elif self.config.metadata_include is not None:
    in_list = self.config.metadata_include.split(",")
    # Iterate over a snapshot of the keys: popping from a dict while
    # iterating the dict itself raises
    # "RuntimeError: dictionary changed size during iteration".
    for k in list(elem["metadata"]):  # type: ignore[attr-defined]
        if k not in in_list:
            elem["metadata"].pop(k, None)  # type: ignore[attr-defined]
119+
98120
elem.pop("coordinates") # type: ignore[attr-defined]
99121
self.isd_elems_no_filename.append(elem)
100122

0 commit comments

Comments
 (0)