Skip to content

Commit 0ca154a

Browse files
Fix: MongoDB connector URI password redaction, basic unit tests for Git connector (#2268)
MongoDB connector: Issue: The [MongoDB documentation](https://www.mongodb.com/docs/manual/reference/connection-string/) states that the characters `$ : / ? # [ ] @` must be percent-encoded. A URI whose password contains any of these special characters is not redacted. Fix: This change removes the use of `unquote_plus` on the password, which allows the detected password to match the one inside the URI and be replaced successfully. Git connector: Added very basic unit tests for the repository filtering methods. Their impact is rather minimal, but they showcase a current limitation in the `is_file_type_supported` method.
1 parent e65a44e commit 0ca154a

File tree

5 files changed

+81
-3
lines changed

5 files changed

+81
-3
lines changed

Diff for: CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
### Fixes
1212

1313
* **Fix unequal row-length in HTMLTable.text_as_html.** Fixes to other aspects of partition_html() in v0.11 allowed unequal cell-counts in table rows. Make the cells in each row correspond 1:1 with cells in the original table row. This fix also removes "noise" cells resulting from HTML-formatting whitespace and eliminates the "column-shifting" of cells that previously resulted from noise-cells.
14+
* **Fix MongoDB connector URI password redaction.** The MongoDB documentation states that the characters `$ : / ? # [ ] @` must be percent-encoded. URIs whose passwords contain such special characters were not redacted.
1415

1516
## 0.11.8
1617

Diff for: test_unstructured_ingest/unit/test_common.py

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
from unstructured.ingest.cli.common import options_redactions
2+
3+
4+
def test_options_redactions():
5+
given_options = {
6+
"uri": "mongodb+srv://myDatabaseUser:D1fficultP%40ssw0rd@mongodb0.example.com/"
7+
"?authSource=admin&replicaSet=myRepl"
8+
}
9+
10+
when_options = options_redactions(options=given_options)
11+
12+
assert given_options["uri"] != when_options["uri"]
13+
assert (
14+
when_options["uri"] == "mongodb+srv://myDatabaseUser:***REDACTED***@mongodb0.example.com/"
15+
"?authSource=admin&replicaSet=myRepl"
16+
)

Diff for: test_unstructured_ingest/unit/test_connector_git.py

+61
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
from pathlib import Path
2+
3+
import pytest
4+
5+
from unstructured.ingest.connector.git import GitAccessConfig, GitSourceConnector, SimpleGitConfig
6+
7+
8+
@pytest.mark.parametrize(
9+
("given_file_path", "then_is_supported"),
10+
[
11+
(Path("src/submodule/document.md"), True),
12+
(Path("src/submodule/document.txt"), True),
13+
(Path("src/submodule/document.pdf"), True),
14+
(Path("src/submodule/document.doc"), True),
15+
(Path("src/submodule/document.docx"), True),
16+
(Path("src/submodule/document.eml"), True),
17+
(Path("src/submodule/document.html"), True),
18+
(Path("src/submodule/document.png"), True),
19+
(Path("src/submodule/document.jpg"), True),
20+
(Path("src/submodule/document.ppt"), True),
21+
(Path("src/submodule/document.pptx"), True),
22+
(Path("src/submodule/document.xml"), True),
23+
(Path("src/submodule/code.py"), False),
24+
(Path("src/submodule/Dockerfile"), False),
25+
(Path("src/submodule/Makefile"), False),
26+
(Path("src/submodule/LICENSE"), False),
27+
],
28+
)
29+
def test_connector_supports_file(given_file_path, then_is_supported):
30+
when_is_supported = GitSourceConnector.is_file_type_supported(str(given_file_path))
31+
32+
assert when_is_supported == then_is_supported
33+
34+
35+
class FakeGitSourceConnectorImpl(GitSourceConnector):
36+
def get_ingest_docs(self):
37+
pass
38+
39+
40+
@pytest.mark.parametrize(
41+
("given_file_path", "given_file_glob", "then_matches_glob"),
42+
[
43+
(Path("LICENSE"), None, True),
44+
(Path("Makefile"), ["Makefile"], True),
45+
(Path("src/my/super/module/main.py"), ["**/*.py"], True),
46+
(Path("src/my/super/module/main.pyc"), ["**/*.py"], False),
47+
],
48+
)
49+
def test_connector_does_path_match_glob(given_file_path, given_file_glob, then_matches_glob):
50+
connector_config = SimpleGitConfig(
51+
url="some_fake_url",
52+
access_config=GitAccessConfig(access_token="some_fake_token"),
53+
file_glob=given_file_glob,
54+
)
55+
connector = FakeGitSourceConnectorImpl(
56+
processor_config=None, read_config=None, connector_config=connector_config
57+
)
58+
59+
when_matches_glob = connector.does_path_match_glob(str(given_file_path))
60+
61+
assert when_matches_glob == then_matches_glob

Diff for: unstructured/ingest/connector/git.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,8 @@ def initialize(self):
8686
def check_connection(self):
8787
pass
8888

89-
def is_file_type_supported(self, path: str) -> bool:
89+
@staticmethod
90+
def is_file_type_supported(path: str) -> bool:
9091
# Workaround to ensure that auto.partition isn't fed with .yaml, .py, etc. files
9192
# TODO: What to do with no filenames? e.g. LICENSE, Makefile, etc.
9293
supported = path.endswith(

Diff for: unstructured/ingest/connector/mongodb.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import json
22
import typing as t
33
from dataclasses import dataclass, field
4-
from urllib.parse import unquote_plus
54

65
from dataclasses_json.core import Json
76

@@ -24,7 +23,7 @@
2423

2524
def parse_userinfo(userinfo: str) -> t.Tuple[str, str]:
2625
user, _, passwd = userinfo.partition(":")
27-
return unquote_plus(user), unquote_plus(passwd)
26+
return user, passwd
2827

2928

3029
def redact(uri: str, redacted_text="***REDACTED***") -> str:

0 commit comments

Comments
 (0)