-
Notifications
You must be signed in to change notification settings - Fork 873
/
Copy pathfiletype.py
812 lines (637 loc) · 28.8 KB
/
filetype.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
"""Automatically detect file-type based on inspection of the file's contents.
Auto-detection proceeds via a sequence of strategies. The first strategy to confidently determine a
file-type returns that value. A strategy that is not applicable, either because it lacks the input
required or fails to determine a file-type, returns `None` and execution continues with the next
strategy.
`_FileTypeDetector` is the main object and implements the three strategies.
The three strategies are:
- Use MIME-type asserted by caller in the `content_type` argument.
- Guess a MIME-type using libmagic, falling back to the `filetype` package when libmagic is
unavailable.
- Map filename-extension to a `FileType` member.
A file that fails all three strategies is assigned the value `FileType.UNK`, for "unknown".
`_FileTypeDetectionContext` encapsulates the various arguments received by `detect_filetype()` and
provides values derived from them. This object is immutable and can be passed to delegates of
`_FileTypeDetector` to provide whatever context they need on the current detection instance.
`_FileTypeDetector` delegates to _differentiator_ objects like `_ZipFileDifferentiator` for
specialized discrimination and/or confirmation of ambiguous or frequently mis-identified
MIME-types. Additional differentiators are planned, one for `application/x-ole-storage`
(DOC, PPT, XLS, and MSG file-types) and perhaps others.
"""
from __future__ import annotations
import contextlib
import functools
import importlib.util
import json
import os
import re
import zipfile
from typing import IO, Callable, Iterator, Optional
import filetype as ft
from olefile import OleFileIO
from oxmsg.storage import Storage
from typing_extensions import ParamSpec
from unstructured.documents.elements import Element
from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str
from unstructured.file_utils.model import FileType
from unstructured.logger import logger
from unstructured.nlp.patterns import DICT_PATTERN, EMAIL_HEAD_RE, LIST_OF_DICTS_PATTERN
from unstructured.partition.common.common import add_element_metadata, exactly_one
from unstructured.partition.common.metadata import set_element_hierarchy
from unstructured.utils import get_call_args_applying_defaults, lazyproperty
try:
importlib.import_module("magic")
LIBMAGIC_AVAILABLE = True
except ImportError:
LIBMAGIC_AVAILABLE = False # pyright: ignore[reportConstantRedefinition]
def detect_filetype(
file_path: str | None = None,
file: IO[bytes] | None = None,
encoding: str | None = None,
content_type: str | None = None,
metadata_file_path: Optional[str] = None,
) -> FileType:
"""Determine file-type of specified file using libmagic and/or fallback methods.
One of `file_path` or `file` must be specified. A `file_path` that does not
correspond to a file on the filesystem raises `ValueError`.
Args:
content_type: MIME-type of document-source, when already known. Providing
a value for this argument disables auto-detection unless it does not map
to a FileType member or is ambiguous, in which case it is ignored.
encoding: Only used for textual file-types. When omitted, `utf-8` is
assumed. Should generally be omitted except to resolve a problem with
textual file-types like HTML.
metadata_file_path: Only used when `file` is provided and then only as a
source for a filename-extension that may be needed as a secondary
content-type indicator. Ignored with the document is specified using
`file_path`.
Returns:
A member of the `FileType` enumeration, `FileType.UNK` when the file type
could not be determined or is not supported.
Raises:
ValueError: when:
- `file_path` is specified but does not correspond to a file on the
filesystem.
- Neither `file_path` nor `file` were specified.
"""
ctx = _FileTypeDetectionContext.new(
file_path=file_path,
file=file,
encoding=encoding,
content_type=content_type,
metadata_file_path=metadata_file_path,
)
return _FileTypeDetector.file_type(ctx)
def is_json_processable(
filename: Optional[str] = None,
file: Optional[IO[bytes]] = None,
file_text: Optional[str] = None,
encoding: Optional[str] = "utf-8",
) -> bool:
"""True when file looks like a JSON array of objects.
Uses regex on a file prefix, so not entirely reliable but good enough if you already know the
file is JSON.
"""
exactly_one(filename=filename, file=file, file_text=file_text)
if file_text is None:
file_text = _FileTypeDetectionContext.new(
file_path=filename, file=file, encoding=encoding
).text_head
return re.match(LIST_OF_DICTS_PATTERN, file_text) is not None
def is_ndjson_processable(
filename: Optional[str] = None,
file: Optional[IO[bytes]] = None,
file_text: Optional[str] = None,
encoding: Optional[str] = "utf-8",
) -> bool:
"""True when file looks like a JSON array of objects.
Uses regex on a file prefix, so not entirely reliable but good enough if you already know the
file is JSON.
"""
exactly_one(filename=filename, file=file, file_text=file_text)
if file_text is None:
file_text = _FileTypeDetectionContext.new(
file_path=filename, file=file, encoding=encoding
).text_head
return re.match(DICT_PATTERN, file_text) is not None
class _FileTypeDetector:
"""Determines file type from a variety of possible inputs."""
def __init__(self, ctx: _FileTypeDetectionContext):
self._ctx = ctx
@classmethod
def file_type(cls, ctx: _FileTypeDetectionContext) -> FileType:
"""Detect file-type of document-source described by `ctx`."""
return cls(ctx)._file_type
@property
def _file_type(self) -> FileType:
"""FileType member corresponding to this document source."""
# -- An explicit content-type most commonly asserted by the client/SDK and is therefore
# -- inherently unreliable. On the other hand, binary file-types can be detected with 100%
# -- accuracy. So start with binary types and only then consider an asserted content-type,
# -- generally as a last resort.
# -- strategy 1: most binary types can be detected with 100% accuracy --
if file_type := self._known_binary_file_type:
return file_type
# -- strategy 2: use content-type asserted by caller --
if file_type := self._file_type_from_content_type:
return file_type
# -- strategy 3: guess MIME-type using libmagic and use that --
if file_type := self._file_type_from_guessed_mime_type:
return file_type
# -- strategy 4: use filename-extension, like ".docx" -> FileType.DOCX --
if file_type := self._file_type_from_file_extension:
return file_type
# -- strategy 5: give up and report FileType.UNK --
return FileType.UNK
# == STRATEGIES ============================================================
@property
def _known_binary_file_type(self) -> FileType | None:
"""Detect file-type for binary types we can positively detect."""
if file_type := _OleFileDetector.file_type(self._ctx):
return file_type
self._ctx.rule_out_cfb_content_types()
if file_type := _ZipFileDetector.file_type(self._ctx):
return file_type
self._ctx.rule_out_zip_content_types()
return None
@property
def _file_type_from_content_type(self) -> FileType | None:
"""Map passed content-type argument to a file-type, subject to certain rules."""
# -- when no content-type was asserted by caller, this strategy is not applicable --
if not self._ctx.content_type:
return None
# -- otherwise we trust the passed `content_type` as long as `FileType` recognizes it --
return FileType.from_mime_type(self._ctx.content_type)
@property
def _file_type_from_guessed_mime_type(self) -> FileType | None:
"""FileType based on auto-detection of MIME-type by libmagic.
In some cases refinements are necessary on the magic-derived MIME-types. This process
includes applying those rules, most of which are accumulated through practical experience.
"""
mime_type = self._ctx.mime_type
extension = self._ctx.extension
# -- when libmagic is not installed, the `filetype` package is used instead.
# -- `filetype.guess()` returns `None` for file-types it does not support, which
# -- unfortunately includes all the textual file-types like CSV, EML, HTML, MD, RST, RTF,
# -- TSV, and TXT. When we have no guessed MIME-type, this strategy is not applicable.
if mime_type is None:
return None
if mime_type.endswith("xml"):
return FileType.HTML if extension in (".html", ".htm") else FileType.XML
if differentiator := _TextFileDifferentiator.applies(self._ctx):
return differentiator.file_type
# -- All source-code files (e.g. *.py, *.js) are classified as plain text for the moment --
if self._ctx.has_code_mime_type:
return FileType.TXT
if mime_type.endswith("empty"):
return FileType.EMPTY
# -- if no more-specific rules apply, use the MIME-type -> FileType mapping when present --
file_type = FileType.from_mime_type(mime_type)
return file_type if file_type != FileType.UNK else None
@lazyproperty
def _file_type_from_file_extension(self) -> FileType | None:
"""Determine file-type from filename extension.
Returns `None` when no filename is available or when the extension does not map to a
supported file-type.
"""
return FileType.from_extension(self._ctx.extension)
class _FileTypeDetectionContext:
"""Provides all arguments to auto-file detection and values derived from them.
NOTE that `._content_type` is mutable via `.rule_out_*_content_types()` methods, so it should
not be assumed to be a constant value across those calls.
This keeps computation of derived values out of the file-detection code but more importantly
allows the main filetype-detector to pass the full context to any delegates without coupling
itself to which values it might need.
"""
def __init__(
self,
file_path: str | None = None,
*,
file: IO[bytes] | None = None,
encoding: str | None = None,
content_type: str | None = None,
metadata_file_path: str | None = None,
):
self._file_path_arg = file_path
self._file_arg = file
self._encoding_arg = encoding
self._content_type = content_type
self._metadata_file_path = metadata_file_path
@classmethod
def new(
cls,
*,
file_path: str | None,
file: IO[bytes] | None,
encoding: str | None,
content_type: str | None = None,
metadata_file_path: str | None = None,
) -> _FileTypeDetectionContext:
self = cls(
file_path=file_path,
file=file,
encoding=encoding,
content_type=content_type,
metadata_file_path=metadata_file_path,
)
self._validate()
return self
@property
def content_type(self) -> str | None:
"""MIME-type asserted by caller; not based on inspection of file by this process.
Would commonly occur when the file was downloaded via HTTP and a `"Content-Type:` header was
present on the response. These are often ambiguous and sometimes just wrong so get some
further verification. All lower-case when not `None`.
"""
# -- Note `._content_type` is mutable via `.invalidate_content_type()` so this cannot be a
# -- `@lazyproperty`.
return self._content_type.lower() if self._content_type else None
@lazyproperty
def encoding(self) -> str:
"""Character-set used to encode text of this file.
Relevant for textual file-types only, like HTML, TXT, JSON, etc.
"""
return format_encoding_str(self._encoding_arg or "utf-8")
@lazyproperty
def extension(self) -> str:
"""Best filename-extension we can muster, "" when there is no available source."""
# -- use metadata file-path when provided --
if file_path := self._metadata_file_path:
return os.path.splitext(file_path)[1].lower()
# -- otherwise get from file_path, or file when it has a name (path) --
with self.open() as file:
if hasattr(file, "name") and file.name:
return os.path.splitext(file.name)[1].lower()
# -- otherwise empty str means no extension, same as a path like "a/b/name-no-ext" --
return ""
@lazyproperty
def file_head(self) -> bytes:
"""The initial bytes of the file to be recognized, for use with libmagic detection."""
with self.open() as file:
return file.read(8192)
@lazyproperty
def file_path(self) -> str | None:
"""Filesystem path to file to be inspected, when provided on call.
None when the caller specified the source as a file-like object instead. Useful for user
feedback on an error, but users of context should have little use for it otherwise.
"""
if (file_path := self._file_path_arg) is None:
return None
return os.path.realpath(file_path) if os.path.islink(file_path) else file_path
@lazyproperty
def has_code_mime_type(self) -> bool:
"""True when `mime_type` plausibly indicates a programming language source-code file."""
mime_type = self.mime_type
if mime_type is None:
return False
# -- check Go separately to avoid matching other MIME type containing "go" --
if mime_type == "text/x-go":
return True
return any(
lang in mime_type
for lang in [
"c#",
"c++",
"cpp",
"csharp",
"java",
"javascript",
"php",
"python",
"ruby",
"swift",
"typescript",
]
)
@lazyproperty
def is_zipfile(self) -> bool:
"""True when file is a Zip archive."""
with self.open() as file:
return zipfile.is_zipfile(file)
@lazyproperty
def mime_type(self) -> str | None:
"""The best MIME-type we can get from `magic` (or `filetype` package).
A `str` return value is always in lower-case.
"""
file_path = self.file_path
if LIBMAGIC_AVAILABLE:
import magic
mime_type = (
magic.from_file(file_path, mime=True)
if file_path
else magic.from_buffer(self.file_head, mime=True)
)
return mime_type.lower() if mime_type else None
mime_type = ft.guess_mime(file_path) if file_path else ft.guess_mime(self.file_head)
if mime_type is None:
logger.warning(
"libmagic is unavailable but assists in filetype detection. Please consider"
" installing libmagic for better results."
)
return None
return mime_type.lower()
@contextlib.contextmanager
def open(self) -> Iterator[IO[bytes]]:
"""Encapsulates complexity of dealing with file-path or file-like-object.
Provides an `IO[bytes]` object as the "common-denominator" document source.
Must be used as a context manager using a `with` statement:
with self._file as file:
do things with file
File is guaranteed to be at read position 0 when called.
"""
if self.file_path:
with open(self.file_path, "rb") as f:
yield f
else:
file = self._file_arg
assert file is not None # -- guaranteed by `._validate()` --
file.seek(0)
yield file
def rule_out_cfb_content_types(self) -> None:
"""Invalidate content-type when a legacy MS-Office file-type is asserted.
Used before returning `None`; at that point we know the file is not one of these formats
so if the asserted `content-type` is a legacy MS-Office type we know it's wrong and should
not be used as a fallback later in the detection process.
"""
if FileType.from_mime_type(self._content_type) in (
FileType.DOC,
FileType.MSG,
FileType.PPT,
FileType.XLS,
):
self._content_type = None
def rule_out_zip_content_types(self) -> None:
"""Invalidate content-type when an MS-Office 2007+ file-type is asserted.
Used before returning `None`; at that point we know the file is not one of these formats
so if the asserted `content-type` is an MS-Office 2007+ type we know it's wrong and should
not be used as a fallback later in the detection process.
"""
if FileType.from_mime_type(self._content_type) in (
FileType.DOCX,
FileType.EPUB,
FileType.ODT,
FileType.PPTX,
FileType.XLSX,
FileType.ZIP,
):
self._content_type = None
@lazyproperty
def text_head(self) -> str:
"""The initial characters of the text file for use with text-format differentiation.
Raises:
UnicodeDecodeError if file cannot be read as text.
"""
# TODO: only attempts fallback character-set detection for file-path case, not for
# file-like object case. Seems like we should do both.
if file := self._file_arg:
file.seek(0)
content = file.read(4096)
file.seek(0)
return (
content
if isinstance(content, str)
else content.decode(encoding=self.encoding, errors="ignore")
)
file_path = self.file_path
assert file_path is not None # -- guaranteed by `._validate` --
try:
with open(file_path, encoding=self.encoding) as f:
return f.read(4096)
except UnicodeDecodeError:
encoding, _ = detect_file_encoding(filename=file_path)
with open(file_path, encoding=encoding) as f:
return f.read(4096)
def _validate(self) -> None:
"""Raise if the context is invalid."""
if self.file_path and not os.path.isfile(self.file_path):
raise FileNotFoundError(f"no such file {self._file_path_arg}")
if not self.file_path and not self._file_arg:
raise ValueError("either `file_path` or `file` argument must be provided")
class _OleFileDetector:
"""Detect and differentiate a CFB file, aka. "OLE" file.
Compound File Binary Format (CFB), aka. OLE file, is use by Microsoft for legacy MS Office
files (DOC, PPT, XLS) as well as for Outlook MSG files.
"""
def __init__(self, ctx: _FileTypeDetectionContext):
self._ctx = ctx
@classmethod
def file_type(cls, ctx: _FileTypeDetectionContext) -> FileType | None:
"""Specific file-type when file is a CFB file, `None` otherwise."""
return cls(ctx)._file_type
@property
def _file_type(self) -> FileType | None:
"""Differentiated file-type for Microsoft Compound File Binary Format (CFBF).
Returns one of:
- `FileType.DOC`
- `FileType.PPT`
- `FileType.XLS`
- `FileType.MSG`
- `None` when the file is not one of these.
"""
# -- all CFB files share common magic number, start with that --
if not self._is_ole_file:
return None
# -- check storage contents of the ole file for file-type specific stream names --
if (ole_file_type := self._ole_file_type) is not None:
return ole_file_type
return None
@lazyproperty
def _is_ole_file(self) -> bool:
"""True when file has CFB magic first 8 bytes."""
with self._ctx.open() as file:
return file.read(8) == b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"
@lazyproperty
def _ole_file_type(self) -> FileType | None:
with self._ctx.open() as f:
ole = OleFileIO(f) # pyright: ignore[reportUnknownVariableType]
root_storage = Storage.from_ole(ole) # pyright: ignore[reportUnknownMemberType]
for stream in root_storage.streams:
if stream.name == "WordDocument":
return FileType.DOC
elif stream.name == "PowerPoint Document":
return FileType.PPT
elif stream.name == "Workbook":
return FileType.XLS
elif stream.name == "__properties_version1.0":
return FileType.MSG
return None
class _TextFileDifferentiator:
"""Refine a textual file-type that may not be as specific as it could be."""
def __init__(self, ctx: _FileTypeDetectionContext):
self._ctx = ctx
@classmethod
def applies(cls, ctx: _FileTypeDetectionContext) -> _TextFileDifferentiator | None:
"""Constructs an instance, but only if this differentiator applies in `ctx`."""
mime_type = ctx.mime_type
return (
cls(ctx)
if mime_type and (mime_type == "message/rfc822" or mime_type.startswith("text"))
else None
)
@lazyproperty
def file_type(self) -> FileType:
"""Differentiated file-type for textual content.
Always produces a file-type, worst case that's `FileType.TXT` when nothing more specific
applies.
"""
extension = self._ctx.extension
if extension in [
".csv",
".eml",
".html",
".json",
".markdown",
".md",
".org",
".p7s",
".rst",
".rtf",
".tab",
".tsv",
]:
return FileType.from_extension(extension) or FileType.TXT
# NOTE(crag): for older versions of the OS libmagic package, such as is currently
# installed on the Unstructured docker image, .json files resolve to "text/plain"
# rather than "application/json". this corrects for that case.
if self._is_json:
return FileType.JSON
if self._is_csv:
return FileType.CSV
if self._is_eml:
return FileType.EML
if extension in (".text", ".txt"):
return FileType.TXT
# Safety catch
if file_type := FileType.from_mime_type(self._ctx.mime_type):
return file_type
return FileType.TXT
@lazyproperty
def _is_csv(self) -> bool:
"""True when file is plausibly in Comma Separated Values (CSV) format."""
def count_commas(text: str):
"""Counts the number of commas in a line, excluding commas in quotes."""
pattern = r"(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$),"
matches = re.findall(pattern, text)
return len(matches)
lines = self._ctx.text_head.strip().splitlines()
if len(lines) < 2:
return False
# -- check at most the first 10 lines --
lines = lines[: len(lines)] if len(lines) < 10 else lines[:10]
# -- any lines without at least one comma disqualifies the file --
if any("," not in line for line in lines):
return False
header_count = count_commas(lines[0])
return all(count_commas(line) == header_count for line in lines[1:])
@lazyproperty
def _is_eml(self) -> bool:
"""Checks if a text/plain file is actually a .eml file.
Uses a regex pattern to see if the start of the file matches the typical pattern for a .eml
file.
"""
return EMAIL_HEAD_RE.match(self._ctx.text_head) is not None
@lazyproperty
def _is_json(self) -> bool:
"""True when file is JSON collection.
A JSON file that contains only a string, number, or boolean, while valid JSON, will fail
this test since it is not partitionable.
"""
text_head = self._ctx.text_head
# -- an empty file is not JSON --
if not text_head.lstrip():
return False
# -- has to be a list or object, no string, number, or bool --
if text_head.lstrip()[0] not in "[{":
return False
try:
with self._ctx.open() as file:
json.load(file)
return True
except json.JSONDecodeError:
return False
class _ZipFileDetector:
"""Detect and differentiate a Zip-archive file."""
def __init__(self, ctx: _FileTypeDetectionContext):
self._ctx = ctx
@classmethod
def file_type(cls, ctx: _FileTypeDetectionContext) -> FileType | None:
"""Most specific file-type available when file is a Zip file, `None` otherwise.
MS-Office 2007+ files are detected with 100% accuracy. Otherwise this returns `None`, even
when we can tell it's a Zip file, so later strategies can have a crack at it. In
particular, ODT and EPUB files are Zip archives but are not detected here.
"""
return cls(ctx)._file_type
@lazyproperty
def _file_type(self) -> FileType | None:
"""Differentiated file-type for a Zip archive.
Returns `FileType.DOCX`, `FileType.PPTX`, or `FileType.XLSX` when one of those applies,
`None` otherwise.
"""
if not self._ctx.is_zipfile:
return None
with self._ctx.open() as file:
zip = zipfile.ZipFile(file)
filenames = zip.namelist()
if "word/document.xml" in filenames:
return FileType.DOCX
if "xl/workbook.xml" in filenames:
return FileType.XLSX
if "ppt/presentation.xml" in filenames:
return FileType.PPTX
# -- ODT and EPUB files place their MIME-type in `mimetype` in the archive root --
if "mimetype" in filenames:
with zip.open("mimetype") as f:
mime_type = f.read().decode("utf-8").strip()
return FileType.from_mime_type(mime_type)
return FileType.ZIP
_P = ParamSpec("_P")
def add_metadata(func: Callable[_P, list[Element]]) -> Callable[_P, list[Element]]:
@functools.wraps(func)
def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]:
elements = func(*args, **kwargs)
call_args = get_call_args_applying_defaults(func, *args, **kwargs)
if call_args.get("metadata_filename"):
call_args["filename"] = call_args.get("metadata_filename")
metadata_kwargs = {
kwarg: call_args.get(kwarg) for kwarg in ("filename", "url", "text_as_html")
}
# NOTE (yao): do not use cast here as cast(None) still is None
if not str(kwargs.get("model_name", "")).startswith("chipper"):
# NOTE(alan): Skip hierarchy if using chipper, as it should take care of that
elements = set_element_hierarchy(elements)
for element in elements:
# NOTE(robinson) - Attached files have already run through this logic
# in their own partitioning function
if element.metadata.attached_to_filename is None:
add_element_metadata(element, **metadata_kwargs)
return elements
return wrapper
def add_filetype(
filetype: FileType,
) -> Callable[[Callable[_P, list[Element]]], Callable[_P, list[Element]]]:
"""Post-process element-metadata for list[Element] from partitioning.
This decorator adds a post-processing step to a document partitioner.
- Adds `.metadata.filetype` (source-document MIME-type) metadata value
This "partial" decorator is present because `partition_image()` does not apply
`.metadata.filetype` this way since each image type has its own MIME-type (e.g. `image.jpeg`,
`image/png`, etc.).
"""
def decorator(func: Callable[_P, list[Element]]) -> Callable[_P, list[Element]]:
@functools.wraps(func)
def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]:
elements = func(*args, **kwargs)
for element in elements:
# NOTE(robinson) - Attached files have already run through this logic
# in their own partitioning function
if element.metadata.attached_to_filename is None:
add_element_metadata(element, filetype=filetype.mime_type)
return elements
return wrapper
return decorator
def add_metadata_with_filetype(
filetype: FileType,
) -> Callable[[Callable[_P, list[Element]]], Callable[_P, list[Element]]]:
"""..."""
def decorator(func: Callable[_P, list[Element]]) -> Callable[_P, list[Element]]:
return add_filetype(filetype=filetype)(add_metadata(func))
return decorator