-
Notifications
You must be signed in to change notification settings - Fork 904
/
Copy pathtest_basic.py
168 lines (143 loc) · 8.13 KB
/
test_basic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
"""Test suite for the `unstructured.chunking.basic` module.
That module implements the baseline chunking strategy. The baseline strategy has all behaviors
shared by all chunking strategies and no extra rules like preserving section or page boundaries.
"""
from __future__ import annotations
from typing import Any
import pytest
from test_unstructured.unit_utils import FixtureRequest, Mock, function_mock
from unstructured.chunking.basic import chunk_elements
from unstructured.documents.elements import CompositeElement, Text, Title
from unstructured.partition.docx import partition_docx
def test_it_chunks_a_document_when_basic_chunking_strategy_is_specified_on_partition_function():
    """Partitioning with `chunking_strategy="basic"` yields basic-strategy chunks.

    The chunking is requested through the partitioner call itself rather than by calling
    `chunk_elements()` directly, which exercises the chunking decorator.
    """
    # -- the chunks expected from the one-page handbook document, in document order --
    expected_chunks = [
        CompositeElement(
            "US Trustee Handbook\n\nCHAPTER 1\n\nINTRODUCTION\n\nCHAPTER 1 – INTRODUCTION"
            "\n\nA. PURPOSE"
        ),
        CompositeElement(
            "The United States Trustee appoints and supervises standing trustees and monitors and"
            " supervises cases under chapter 13 of title 11 of the United States Code. 28 U.S.C."
            " § 586(b). The Handbook, issued as part of our duties under 28 U.S.C. § 586,"
            " establishes or clarifies the position of the United States Trustee Program (Program)"
            " on the duties owed by a standing trustee to the debtors, creditors, other parties in"
            " interest, and the United States Trustee. The Handbook does not present a full and"
        ),
        CompositeElement(
            "complete statement of the law; it should not be used as a substitute for legal"
            " research and analysis. The standing trustee must be familiar with relevant"
            " provisions of the Bankruptcy Code, Federal Rules of Bankruptcy Procedure (Rules),"
            " any local bankruptcy rules, and case law. 11 U.S.C. § 321, 28 U.S.C. § 586,"
            " 28 C.F.R. § 58.6(a)(3). Standing trustees are encouraged to follow Practice Tips"
            " identified in this Handbook but these are not considered mandatory."
        ),
        CompositeElement(
            "Nothing in this Handbook should be construed to excuse the standing trustee from"
            " complying with all duties imposed by the Bankruptcy Code and Rules, local rules, and"
            " orders of the court. The standing trustee should notify the United States Trustee"
            " whenever the provision of the Handbook conflicts with the local rules or orders of"
            " the court. The standing trustee is accountable for all duties set forth in this"
            " Handbook, but need not personally perform any duty unless otherwise indicated. All"
        ),
        CompositeElement(
            "statutory references in this Handbook refer to the Bankruptcy Code, 11 U.S.C. § 101"
            " et seq., unless otherwise indicated."
        ),
        CompositeElement(
            "This Handbook does not create additional rights against the standing trustee or"
            " United States Trustee in favor of other parties.\n\nB. ROLE OF THE UNITED STATES"
            " TRUSTEE"
        ),
        CompositeElement(
            "The Bankruptcy Reform Act of 1978 removed the bankruptcy judge from the"
            " responsibilities for daytoday administration of cases. Debtors, creditors, and"
            " third parties with adverse interests to the trustee were concerned that the court,"
            " which previously appointed and supervised the trustee, would not impartially"
            " adjudicate their rights as adversaries of that trustee. To address these concerns,"
            " judicial and administrative functions within the bankruptcy system were bifurcated."
        ),
        CompositeElement(
            "Many administrative functions formerly performed by the court were placed within the"
            " Department of Justice through the creation of the Program. Among the administrative"
            " functions assigned to the United States Trustee were the appointment and supervision"
            " of chapter 13 trustees./ This Handbook is issued under the authority of the"
            " Program’s enabling statutes.\n\nC. STATUTORY DUTIES OF A STANDING TRUSTEE"
        ),
        CompositeElement(
            "The standing trustee has a fiduciary responsibility to the bankruptcy estate. The"
            " standing trustee is more than a mere disbursing agent. The standing trustee must"
            " be personally involved in the trustee operation. If the standing trustee is or"
            " becomes unable to perform the duties and responsibilities of a standing trustee,"
            " the standing trustee must immediately advise the United States Trustee."
            " 28 U.S.C. § 586(b), 28 C.F.R. § 58.4(b) referencing 28 C.F.R. § 58.3(b)."
        ),
        CompositeElement(
            "Although this Handbook is not intended to be a complete statutory reference, the"
            " standing trustee’s primary statutory duties are set forth in 11 U.S.C. § 1302, which"
            " incorporates by reference some of the duties of chapter 7 trustees found in"
            " 11 U.S.C. § 704. These duties include, but are not limited to, the"
            " following:\n\nCopyright"
        ),
    ]

    actual_chunks = partition_docx("example-docs/handbook-1p.docx", chunking_strategy="basic")

    assert actual_chunks == expected_chunks
def test_it_chunks_elements_when_the_user_already_has_them():
    """`chunk_elements()` can be called directly on elements the caller already holds.

    With `max_characters=64` the over-long text element is split into two chunks at a word
    boundary rather than in the middle of a word.
    """
    heading = Title("Introduction")
    paragraph = Text(
        # --------------------------------------------------------- 64 -v
        "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed lectus"
        " porta volutpat.",
    )

    chunks = chunk_elements([heading, paragraph], max_characters=64)

    expected_chunks = [
        CompositeElement("Introduction"),
        # -- splits on even word boundary, not mid-"rhoncus" --
        CompositeElement("Lorem ipsum dolor sit amet consectetur adipiscing elit. In"),
        CompositeElement("rhoncus ipsum sed lectus porta volutpat."),
    ]
    assert chunks == expected_chunks
def test_it_includes_original_elements_as_metadata_when_requested():
    """With `include_orig_elements=True`, each chunk records its source elements in metadata.

    Three elements chunked at `max_characters=70` are expected to form two chunks; each chunk's
    `metadata.orig_elements` should list exactly the elements combined into that chunk.
    """
    title = Title("Introduction")
    first_sentence = Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.")
    second_sentence = Text("In rhoncus ipsum sed lectus porta volutpat.")

    chunks = chunk_elements(
        [title, first_sentence, second_sentence],
        max_characters=70,
        include_orig_elements=True,
    )

    assert len(chunks) == 2
    first_chunk, second_chunk = chunks
    # -- first chunk combines the title with the first sentence --
    assert first_chunk == CompositeElement(
        "Introduction\n\nLorem ipsum dolor sit amet consectetur adipiscing elit."
    )
    assert first_chunk.metadata.orig_elements == [title, first_sentence]
    # -- second chunk holds the remaining sentence on its own --
    assert second_chunk == CompositeElement("In rhoncus ipsum sed lectus porta volutpat.")
    assert second_chunk.metadata.orig_elements == [second_sentence]
# ------------------------------------------------------------------------------------------------
# UNIT TESTS
# ------------------------------------------------------------------------------------------------
class Describe_chunk_elements:
    """Unit-test suite for `unstructured.chunking.basic.chunk_elements()` function."""

    @pytest.mark.parametrize(
        ("kwargs", "expected_value"),
        [
            ({"include_orig_elements": True}, True),
            ({"include_orig_elements": False}, False),
            ({"include_orig_elements": None}, True),
            ({}, True),
        ],
    )
    def it_supports_the_include_orig_elements_option(
        self, kwargs: dict[str, Any], expected_value: bool, _chunk_elements_: Mock
    ):
        """The `include_orig_elements` argument is accepted and reflected in the options.

        `_chunk_elements()` is mocked, so only the options object passed to it is inspected.
        """
        # -- the call itself is part of the test: it would raise if `chunk_elements()` did not
        # -- accept an "include_orig_elements" parameter.
        chunk_elements([], **kwargs)

        # -- the mocked function is expected to receive exactly two positional arguments, the
        # -- second being the options object --
        positional_args = _chunk_elements_.call_args.args
        _, opts = positional_args
        assert opts.include_orig_elements is expected_value

    # -- fixtures --------------------------------------------------------------------------------

    @pytest.fixture()
    def _chunk_elements_(self, request: FixtureRequest):  # noqa: PT005
        """Mock standing in for `unstructured.chunking.basic._chunk_elements`."""
        mocked_function = function_mock(request, "unstructured.chunking.basic._chunk_elements")
        return mocked_function