Skip to content

Commit f890972

Browse files
authored
docs: add bricks training notebook (#211)
* added bricks notebook * more unicode quotes; isd dataframe column fix * fix remove_punctuation docs * typo fixes * put staging bricks in code
1 parent d0c6d50 commit f890972

File tree

8 files changed

+767
-8
lines changed

8 files changed

+767
-8
lines changed

Diff for: CHANGELOG.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
1-
## 0.4.7-dev3
1+
## 0.4.7-dev4
22

33
* Added the ability to pull an HTML document from a url in `partition_html`.
44
* Added the ability to get file summary info from lists of filenames and lists
55
of file contents.
66
* Added optional page break to `partition` for `.pptx`, `.pdf`, images, and `.html` files.
77
* Added `to_dict` method to document elements.
8+
* Include more unicode quotes in `replace_unicode_quotes`.
89

910
## 0.4.6
1011

Diff for: docs/source/bricks.rst

+1-4
Original file line numberDiff line numberDiff line change
@@ -587,10 +587,7 @@ Examples:
587587
from unstructured.cleaners.core import remove_punctuation
588588
589589
# Returns "A lovely quote"
590-
replace_unicode_characters("“A lovely quote!”")
591-
592-
# Returns ""
593-
replace_unicode_characters("'()[]{};:'\",.?/\\-_")
590+
remove_punctuation("“A lovely quote!”")
594591
595592
596593
``clean_prefix``

Diff for: example-docs/layout-parser-paper-fast.jpg

93 KB
Loading

Diff for: examples/training/1-Intro to Bricks.ipynb

+741
Large diffs are not rendered by default.

Diff for: test_unstructured/staging/test_base_staging.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def test_convert_to_isd_csv(output_csv_file):
4848
isd_csv_string = base.convert_to_isd_csv(elements)
4949
csv_file.write(isd_csv_string)
5050

51-
fieldnames = ["type", "text", "coordinates", "element_id"]
51+
fieldnames = ["type", "text"]
5252
with open(output_csv_file, "r") as csv_file:
5353
csv_rows = csv.DictReader(csv_file)
5454
assert all(set(row.keys()) == set(fieldnames) for row in csv_rows)

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.4.7-dev3" # pragma: no cover
1+
__version__ = "0.4.7-dev4" # pragma: no cover

Diff for: unstructured/cleaners/core.py

+20
Original file line numberDiff line numberDiff line change
@@ -55,11 +55,31 @@ def replace_unicode_quotes(text) -> str:
5555
-------
5656
\x93What a lovely quote!\x94 -> “What a lovely quote!”
5757
"""
58+
# NOTE(robinson) - We should probably make this something more sane like a regex
59+
# instead of a whole big series of replaces
5860
text = text.replace("\x91", "‘")
5961
text = text.replace("\x92", "’")
6062
text = text.replace("\x93", "“")
6163
text = text.replace("\x94", "”")
6264
text = text.replace("&apos;", "'")
65+
text = text.replace("â\x80\x99", "'")
66+
text = text.replace("â\x80“", "—")
67+
text = text.replace("â\x80”", "–")
68+
text = text.replace("â\x80˜", "‘")
69+
text = text.replace("â\x80¦", "…")
70+
text = text.replace("â\x80™", "’")
71+
text = text.replace("â\x80œ", "“")
72+
text = text.replace("â\x80?", "”")
73+
text = text.replace("â\x80ť", "”")
74+
text = text.replace("â\x80ś", "“")
75+
text = text.replace("â\x80¨", "—")
76+
text = text.replace("â\x80ł", "″")
77+
text = text.replace("â\x80Ž", "")
78+
text = text.replace("â\x80‚", "")
79+
text = text.replace("â\x80‰", "")
80+
text = text.replace("â\x80‹", "")
81+
text = text.replace("â\x80", "")
82+
text = text.replace("â\x80s'", "")
6383
return text
6484

6585

Diff for: unstructured/staging/base.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ def convert_to_isd_csv(elements: List[Text]) -> str:
3838
Returns the representation of document elements as an Initial Structured Document (ISD)
3939
in CSV Format.
4040
"""
41-
csv_fieldnames: List[str] = ["type", "text", "coordinates", "element_id"]
41+
csv_fieldnames: List[str] = ["type", "text"]
4242
rows: List[Dict[str, str]] = convert_to_isd(elements)
4343
with io.StringIO() as buffer:
4444
csv_writer = csv.DictWriter(buffer, fieldnames=csv_fieldnames)

0 commit comments

Comments
 (0)