Skip to content

Commit dd243b4

Browse files
chore: pass ocr_mode in partition_pdf_or_image (#1154)
Set `ocr_mode` to `individual_blocks` for now to work around [this bug](Unstructured-IO/unstructured-inference#179). I verified this by printing the current ocr_mode in inference: the `entire_page` default is overridden. --------- Co-authored-by: ryannikolaidis <[email protected]> Co-authored-by: awalker4 <[email protected]>
1 parent 1456f06 commit dd243b4

File tree

11 files changed

+231
-173
lines changed

11 files changed

+231
-173
lines changed

Diff for: CHANGELOG.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
## 0.10.4-dev0
1+
## 0.10.4
22

33
### Enhancements
44
* Adds ability to reuse connections per process in unstructured-ingest
5+
* Pass ocr_mode in partition_pdf and set the default back to individual blocks for now
56

67
### Features
78

Diff for: test_unstructured/partition/test_pdf.py

+3
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,7 @@ def test_partition_pdf_with_model_name_env_var(
177177
filename,
178178
is_image=False,
179179
ocr_languages="eng",
180+
ocr_mode="individual_blocks",
180181
extract_tables=False,
181182
model_name="checkbox",
182183
)
@@ -197,6 +198,7 @@ def test_partition_pdf_with_model_name(
197198
filename,
198199
is_image=False,
199200
ocr_languages="eng",
201+
ocr_mode="individual_blocks",
200202
extract_tables=False,
201203
model_name="checkbox",
202204
)
@@ -402,6 +404,7 @@ def test_partition_pdf_with_dpi():
402404
filename,
403405
is_image=False,
404406
ocr_languages="eng",
407+
ocr_mode="individual_blocks",
405408
extract_tables=False,
406409
model_name=None,
407410
pdf_image_dpi=100,

Diff for: test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.pdf.json

+124-84
Large diffs are not rendered by default.

Diff for: test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json

+57-47
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,17 @@
11
[
22
{
33
"type": "Title",
4-
"element_id": "88591a76b54e47215c0827ae8838ec13",
4+
"element_id": "0c4e18d78e721c8179f3946b75b17d15",
55
"metadata": {
66
"data_source": {},
77
"filetype": "image/png",
88
"page_number": 1
99
},
10-
"text": "Instructions for Form 3115 (Rev. November 1987)"
10+
"text": "Instructions for Form 3115 (Rev. November 1987) Annlicatinn far Chance in Accounting Mathond"
1111
},
1212
{
1313
"type": "NarrativeText",
14-
"element_id": "766cf1d1243ef2cdbb0db5ad32d7f9c9",
14+
"element_id": "41f3d9c83b2b4679195c9796134fd8f5",
1515
"metadata": {
1616
"data_source": {},
1717
"filetype": "image/png",
@@ -21,7 +21,7 @@
2121
},
2222
{
2323
"type": "ListItem",
24-
"element_id": "36a565493a214d3f7e7f24794c1dc7f4",
24+
"element_id": "97968e4ba14bd2d082a70ec61ef2d9b1",
2525
"metadata": {
2626
"data_source": {},
2727
"filetype": "image/png",
@@ -111,7 +111,7 @@
111111
},
112112
{
113113
"type": "ListItem",
114-
"element_id": "59bc2945a7f606bd5078bac3bc1199d4",
114+
"element_id": "f0d2beb7f43493694a91137e8e65b5f3",
115115
"metadata": {
116116
"data_source": {},
117117
"filetype": "image/png",
@@ -121,7 +121,7 @@
121121
},
122122
{
123123
"type": "ListItem",
124-
"element_id": "5157d731aa6a97c9b166799db2295bce",
124+
"element_id": "13f2a282f705590fbe7b6ce15b08862a",
125125
"metadata": {
126126
"data_source": {},
127127
"filetype": "image/png",
@@ -141,7 +141,7 @@
141141
},
142142
{
143143
"type": "ListItem",
144-
"element_id": "34b66452ca63c465c69d849e4acf6d46",
144+
"element_id": "9820f79275e683f5afe3f2f1283de4ca",
145145
"metadata": {
146146
"data_source": {},
147147
"filetype": "image/png",
@@ -161,7 +161,7 @@
161161
},
162162
{
163163
"type": "ListItem",
164-
"element_id": "b0fa5aaff0cee8574822dd8ac6537c06",
164+
"element_id": "a98378f4a88db65dff42b7d8bd75be92",
165165
"metadata": {
166166
"data_source": {},
167167
"filetype": "image/png",
@@ -181,7 +181,7 @@
181181
},
182182
{
183183
"type": "ListItem",
184-
"element_id": "13f155c0754434406190f3cf49c82c3c",
184+
"element_id": "3cb57c50002187a715e1c5048e643c65",
185185
"metadata": {
186186
"data_source": {},
187187
"filetype": "image/png",
@@ -201,33 +201,33 @@
201201
},
202202
{
203203
"type": "ListItem",
204-
"element_id": "178d6933ed193747b1c4aa1c048e7f94",
204+
"element_id": "beeb50db70ce1aa76813cce98e46bd56",
205205
"metadata": {
206206
"data_source": {},
207207
"filetype": "image/png",
208208
"page_number": 1
209209
},
210-
"text": "for these changes."
210+
"text": "for these changes. Tb od Db bee Cl"
211211
},
212212
{
213213
"type": "NarrativeText",
214-
"element_id": "7685df2334a5f6c8c8099dea61a8f1b4",
214+
"element_id": "640a100da1a3bee6f1f134c51a2c8648",
215215
"metadata": {
216216
"data_source": {},
217217
"filetype": "image/png",
218218
"page_number": 1
219219
},
220-
"text": "Long-term contracts.—If you are required to change your method of accounting for long-term contracts under section 460, see Notice 87-61 (9/21/87), 1987-38 IRB 40, for the notification procedures that must be followed."
220+
"text": "Long-term contracts.—If you are required to change your method of accounting for long-term contracts under section 460, see Notice 87-61 (9/21/87), 1987-38 IRB 40, for the notification procedures that must be followed"
221221
},
222222
{
223223
"type": "Title",
224-
"element_id": "61ed58fa51293f429f87e8cf1896c9e4",
224+
"element_id": "a232d246e22a4f6bb8dcab62cffb2567",
225225
"metadata": {
226226
"data_source": {},
227227
"filetype": "image/png",
228228
"page_number": 1
229229
},
230-
"text": "Paperwork Reduction Act Notice"
230+
"text": "Paperwork Reduction Act Notice We ack for thic infarenatinn te marry mye the."
231231
},
232232
{
233233
"type": "Title",
@@ -241,27 +241,37 @@
241241
},
242242
{
243243
"type": "ListItem",
244-
"element_id": "5f8051f8010896bab02aaf784c04ae02",
244+
"element_id": "58f1649a32eda8b8c513e51a209666a6",
245245
"metadata": {
246246
"data_source": {},
247247
"filetype": "image/png",
248248
"page_number": 1
249249
},
250-
"text": "Individuals.—An individual desiring the change should sign the application. Ifthe application pertains to a husband and wife filing a joint Income tax return, the names of both should appear in the heading and both should sign Partnerships.—The form should be signed with the partnership name followed by the signature of one of the general partners and the words “General Partner.” Corporations, cooperatives, and insurance companies.—The form should show the name of the corporation, cooperative, or insurance Company and the signature of the president, vice president, treasurer, assistant treasurer, or chief accounting officer (such as tax officer) authorized tosign, and his or her official title. Receivers, trustees, or assignees must sign any application they are required to file, For a subsidiary corporation filing a consolidated return with its parent, the form should be signed by an officer of the parent corporation, Fiduciaries.—The-form should show the name of the estate or trust and be signed by the fiduciary, personal representative, executor, executrix, administrator, administratrx, etc’, having legal authority to'sign, and his or her ttle. Preparer other than partner, officer, etc.—The signature of the individual preparing the application should appear in the space provided on page"
250+
"text": "Signature Individuals.—An individual desiring the change should sign the application. Ifthe application pertains to a husband and wife filing a joint Income tax return, the names of both should appear in the heading and both should sign Partnerships.—The form should be signed with the partnership name followed by the signature of one of the general partners and the words “General Partner.” Corporations, cooperatives, and insurance companies.—The form should show the name of the corporation, cooperative, or insurance Company and the signature of the president, vice president, treasurer, assistant treasurer, or chief accounting officer (such as tax officer) authorized tosign, and his or her official title. Receivers, trustees, or assignees must sign any application they are required to file, For a subsidiary corporation filing a consolidated return with its parent, the form should be signed by an officer of the parent corporation, Fiduciaries.—The-form should show the name of the estate or trust and be signed by the fiduciary, personal representative, executor, executrix, administrator, administratrx, etc’, having legal authority to'sign, and his or her ttle. Preparer other than partner, officer, etc.—The signature of the individual preparing the application should appear in the space provided on page"
251+
},
252+
{
253+
"type": "ListItem",
254+
"element_id": "586e989b479e4362ebe28a6954c1427b",
255+
"metadata": {
256+
"data_source": {},
257+
"filetype": "image/png",
258+
"page_number": 1
259+
},
260+
"text": "If the individual or firm is also authorized to"
251261
},
252262
{
253263
"type": "NarrativeText",
254-
"element_id": "4660422c06dddc914ab634c5e4045dec",
264+
"element_id": "446ccb7d96fea659d50aef8a6dd670df",
255265
"metadata": {
256266
"data_source": {},
257267
"filetype": "image/png",
258268
"page_number": 1
259269
},
260-
"text": "We ask for this information to carry out the Internal Revenue laws of the United States. We need it to ensure that taxpayers are complying with these laws an¢ to allow us to figure and collect the nght amount of tax. You are required to give us this information."
270+
"text": "We ask for this information to carry out the Internal Revenue laws of the United States. We need it to ensure that taxpayers are complying with these laws an¢ to allow us to figure and collect the right amount of tax. You are required to give us this information,"
261271
},
262272
{
263273
"type": "Title",
264-
"element_id": "a1547a4ed1611eee44b15e99120fb978",
274+
"element_id": "226fa83297914d5195e002508d61fb1d",
265275
"metadata": {
266276
"data_source": {},
267277
"filetype": "image/png",
@@ -271,77 +281,77 @@
271281
},
272282
{
273283
"type": "Title",
274-
"element_id": "68a3289177b49b285e133a5267eb355f",
284+
"element_id": "f0e951e5bcb4a6070fa6672b37822348",
275285
"metadata": {
276286
"data_source": {},
277287
"filetype": "image/png",
278288
"page_number": 1
279289
},
280-
"text": "Purpose of Form"
290+
"text": "Purpose of Form Cin bce Secon te cece cget."
281291
},
282292
{
283293
"type": "NarrativeText",
284-
"element_id": "f9b8e17da7a31507773f78959378e09c",
294+
"element_id": "5e5451e052baf894b2bdad4132f6cd2f",
285295
"metadata": {
286296
"data_source": {},
287297
"filetype": "image/png",
288298
"page_number": 1
289299
},
290-
"text": "File this form to request a change in your accounting method, including the accounting treatment of any item. if you are requesting 2 change in accounting period, use Form 1128, Application for Change in Accounting Period. For more information, see Publication 538, Accounting Periods and Methods,"
300+
"text": "ee File this form to request a change in your accounting method, including the accounting treatment of any item. if you are requesting 2 change in accounting period, use Form 1128, Application for Change in Accounting Period. For more information, see Publication 538, Accounting Periods and Methods,"
291301
},
292302
{
293303
"type": "NarrativeText",
294-
"element_id": "b3859f2f29884b1d3ba0892e52859a99",
304+
"element_id": "cc1701e3ce9347e344b3df80d426bd21",
295305
"metadata": {
296306
"data_source": {},
297307
"filetype": "image/png",
298308
"page_number": 1
299309
},
300-
"text": "When filing Form 3115, taxpayers are reminded to determine if IRS has published a ruling or procedure dealing with the specific type of change since November 1987 (the current. revision date of Form 3115)"
310+
"text": "Seti aes When filing Form 3115, taxpayers are reminded to determine if IRS has published a ruling or procedure dealing with the specific type of change since November 1987 (the current. revision date of Form 3115)"
301311
},
302312
{
303313
"type": "NarrativeText",
304-
"element_id": "e5a95dc10d4071983b70898a21f11175",
314+
"element_id": "b81dc18d0f8666f9bf7400a00657dc72",
305315
"metadata": {
306316
"data_source": {},
307317
"filetype": "image/png",
308318
"page_number": 1
309319
},
310-
"text": "Generally, applicants must complete Section ‘A. In addition, complete the appropriate sections (B:1 through H) for which a change is desired."
320+
"text": "POMS SANE OPFOR DA 29). Generally, applicants must complete Section ‘A. In addition, complete the appropriate sections (B:1 through H) for which a change is desired. You must give alll relevant facts, including a"
311321
},
312322
{
313323
"type": "Title",
314-
"element_id": "5756fb398995bb6518a87637f24f426e",
324+
"element_id": "c7502aa5b000d6446f3eca882518a260",
315325
"metadata": {
316326
"data_source": {},
317327
"filetype": "image/png",
318328
"page_number": 1
319329
},
320-
"text": "Time and Place for Filing"
330+
"text": "Time and Place for Filing amarall, ammlimeete maet file snete"
321331
},
322332
{
323333
"type": "NarrativeText",
324-
"element_id": "25f830e7c39c115c9937eb9d11cfb1f2",
334+
"element_id": "8b35e7c212710b1099b675ce9394fb47",
325335
"metadata": {
326336
"data_source": {},
327337
"filetype": "image/png",
328338
"page_number": 1
329339
},
330-
"text": "State whether you desire a conference in the National Office if the Service proposes to disapprove your application"
340+
"text": "Se NB ON State whether you desire a conference in the National Office if the Service proposes to disapprove your application."
331341
},
332342
{
333343
"type": "Title",
334-
"element_id": "8b06cd6e2bf7fc15130d5d9ed7e66283",
344+
"element_id": "0a16a0fea889be77576c0fd88575554a",
335345
"metadata": {
336346
"data_source": {},
337347
"filetype": "image/png",
338348
"page_number": 1
339349
},
340-
"text": "Affiliated Groups"
350+
"text": "Affiliated Groups Tavmayare that ara mam)"
341351
},
342352
{
343353
"type": "Title",
344-
"element_id": "242a9dba10a04654d4adef9c58ff96f6",
354+
"element_id": "68b58298cabd9069c975b192a7183139",
345355
"metadata": {
346356
"data_source": {},
347357
"filetype": "image/png",
@@ -351,62 +361,62 @@
351361
},
352362
{
353363
"type": "Title",
354-
"element_id": "11c98a9cbd6a200fbc5b93fed15007ac",
364+
"element_id": "6a8881a6e87021b2362243f7df3e4b1d",
355365
"metadata": {
356366
"data_source": {},
357367
"filetype": "image/png",
358368
"page_number": 1
359369
},
360-
"text": "Uniform capitalization rules and limitation on"
370+
"text": "Uniform capitalization rules and limitation on cash method.—If you are required to char"
361371
},
362372
{
363373
"type": "Title",
364-
"element_id": "58703de56debc34a1d68e6ed6f8fd067",
374+
"element_id": "8daeb8b48fb666f1dd54e2af283d0c22",
365375
"metadata": {
366376
"data_source": {},
367377
"filetype": "image/png",
368378
"page_number": 1
369379
},
370-
"text": "Specific Instructions Section A"
380+
"text": "Specific Instructions Section A Neem Ea mama 1 !Taeahle inemes"
371381
},
372382
{
373383
"type": "Title",
374-
"element_id": "a4316c02df07840f1beb56609cb09735",
384+
"element_id": "09203a0c6955f64ca8eb52cd6ea47034",
375385
"metadata": {
376386
"data_source": {},
377387
"filetype": "image/png",
378388
"page_number": 1
379389
},
380-
"text": "Late Applications"
390+
"text": "Late Applications Me coup armlimatinm te ler"
381391
},
382392
{
383393
"type": "NarrativeText",
384-
"element_id": "39458f370b98a606db29ac6dee975e07",
394+
"element_id": "962e3f0ceb1f0b1b08a1c19adde8d962",
385395
"metadata": {
386396
"data_source": {},
387397
"filetype": "image/png",
388398
"page_number": 1
389399
},
390-
"text": "Disregard the instructions under Time and Place for Filing and Late Applications. instead, attach Form 3115 to your income tax return for the year of change; do not file it separately. Also include on a separate statement accompanying the Form 3115 the period over which the section 481(2) adjustment will be taken into account and"
400+
"text": "lethal elaine bela Disregard the instructions under Time and Place for Filing and Late Applications. instead, attach Form 3115 to your income tax return for the year of change; do not file it separately. Also include on a separate statement accompanying the Form 3115 the period over which the section 481(2) adjustment will be taken into account and the basis for that conclusion. Identify the"
391401
},
392402
{
393403
"type": "Title",
394-
"element_id": "025a65465b6fd9635316e92633b24c7e",
404+
"element_id": "bfe98eb672d95c15a11ed3e618928b4e",
395405
"metadata": {
396406
"data_source": {},
397407
"filetype": "image/png",
398408
"page_number": 1
399409
},
400-
"text": "Identifying Number"
410+
"text": "Identifying Number Ndiuidesale Am omptisoehesal"
401411
},
402412
{
403413
"type": "NarrativeText",
404-
"element_id": "9240bfa889b87dc2fb3fa746ca4eeeb4",
414+
"element_id": "87f8128b03a72c616ee1a1bb91e11c56",
405415
"metadata": {
406416
"data_source": {},
407417
"filetype": "image/png",
408418
"page_number": 1
409419
},
410-
"text": "Others.-—The employer identification number of an applicant other than an individual should be entered in this block,"
420+
"text": "—e—e—— eee Others.-—The employer identification number of an applicant other than an individual should be entered in this block,"
411421
}
412422
]

0 commit comments

Comments
 (0)