Skip to content

Commit a5da3de

Browse files
authored
fix: ensure all text is maintained in html output (#335)
* fix: ensure all text is maintained in html pages
* add back in replace unicode quotes
* changelog and version bump
* apt-get update in ci
* white space differences in output
1 parent ed074b5 commit a5da3de

File tree

7 files changed: +70 -12 lines changed

Diff for: .github/workflows/ci.yml

+1
Original file line number | Diff line number | Diff line change
@@ -104,6 +104,7 @@ jobs:
104104
run: |
105105
source .venv/bin/activate
106106
make install-detectron2
107+
sudo apt-get update
107108
sudo apt-get install -y libmagic-dev poppler-utils tesseract-ocr libreoffice
108109
make test
109110
make check-coverage

Diff for: CHANGELOG.md

+3 -2
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
## 0.5.2-dev1
1+
## 0.5.2
22

33
### Enhancements
44

@@ -9,10 +9,11 @@ rather than a "tmp-ingest-" dir in the working directory.
99

1010
### Fixes
1111

12-
* 'setup_ubuntu.sh` no longer fails in some contexts by interpreting
12+
* 'setup_ubuntu.sh` no longer fails in some contexts by interpreting
1313
`DEBIAN_FRONTEND=noninteractive` as a command
1414
* `unstructured-ingest` no longer re-downloads files when --preserve-downloads
1515
is used without --download-dir.
16+
* Fixed an issue that was causing text to be skipped in some HTML documents.
1617

1718
## 0.5.1
1819

Diff for: example-docs/ideas-page.html

+44
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,44 @@
1+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
2+
<html><script type="text/javascript">
3+
<!--
4+
(new Image).src="https://store.yahoo.net/cgi-bin/refsd?e=http://paulgraham.com/getideas.html&h=paulgraham.com&v=1.0&dr=" + escape(document.referrer);
5+
-->
6+
</script>
7+
<head><title>How to Get New Ideas</title><!-- <META NAME="ROBOTS" CONTENT="NOODP"> -->
8+
<link rel="shortcut icon" href="http://ycombinator.com/arc/arc.png">
9+
</head><body bgcolor=ffffff background="https://sep.yimg.com/ca/I/paulgraham_2271_0" text=000000 link=000099 vlink=464646><table border=0 cellspacing=0 cellpadding=0><tr valign=top><td><map name=c04963d10de5f><area shape=rect coords="0,0,67,21" href="index.html"><area shape=rect coords="0,21,67,42" href="articles.html"><area shape=rect coords="0,42,67,63" href="http://www.amazon.com/gp/product/0596006624"><area shape=rect coords="0,63,67,84" href="books.html"><area shape=rect coords="0,84,67,105" href="http://ycombinator.com"><area shape=rect coords="0,105,67,126" href="arc.html"><area shape=rect coords="0,126,67,147" href="bel.html"><area shape=rect coords="0,147,67,168" href="lisp.html"><area shape=rect coords="0,168,67,189" href="antispam.html"><area shape=rect coords="0,189,67,210" href="kedrosky.html"><area shape=rect coords="0,210,67,231" href="faq.html"><area shape=rect coords="0,231,67,252" href="raq.html"><area shape=rect coords="0,252,67,273" href="quo.html"><area shape=rect coords="0,273,67,294" href="rss.html"><area shape=rect coords="0,294,67,315" href="bio.html"><area shape=rect coords="0,315,67,336" href="https://twitter.com/paulg"><area shape=rect coords="0,336,67,357" href="https://mas.to/@paulg"></map><img src="https://s.yimg.com/aah/paulgraham/img-20.gif" width=69 height=357 usemap=#c04963d10de5f border=0 hspace=0 vspace=0 ismap></td><td><img src="https://sep.yimg.com/ca/Img/trans_1x1.gif" height=1 width=26 border=0></td><td><a href="index.html"><img src="https://sep.yimg.com/ca/I/paulgraham_2271_3232" width=410 height=45 border=0 hspace=0 vspace=0></a><br><br><table border=0 cellspacing=0 cellpadding=0 width=435><tr valign=top><td width=435><img src="https://s.yimg.com/aah/paulgraham/how-to-get-new-ideas-1.gif" width=176 height=18 border=0 hspace=0 vspace=0 alt="How to Get New Ideas"><br><br><font size=2 face="verdana">January 2023<br><br><i>(<a href="https://twitter.com/stef/status/1617222428727586816"><u>Someone</u></a> fed my essays into GPT 
to make something that could answer
10+
questions based on them, then asked it where good ideas come from. The
11+
answer was ok, but not what I would have said. This is what I would have said.)</i><br><br>The way to get new ideas is to notice anomalies: what seems strange,
12+
or missing, or broken? You can see anomalies in everyday life (much
13+
of standup comedy is based on this), but the best place to look for
14+
them is at the frontiers of knowledge.<br><br>Knowledge grows fractally.
15+
From a distance its edges look smooth, but when you learn enough
16+
to get close to one, you'll notice it's full of gaps. These gaps
17+
will seem obvious; it will seem inexplicable that no one has tried
18+
x or wondered about y. In the best case, exploring such gaps yields
19+
whole new fractal buds.<br><br></font></td></tr></table><table border=0 cellspacing=0 cellpadding=0 width=435><tr><td><font size=2 face="verdana"><br><br><hr></font></td></tr></table></td></tr></table></body>
20+
<script type="text/javascript">
21+
csell_env = 'bf1';
22+
var storeCheckoutDomain = 'order.store.yahoo.net';
23+
</script>
24+
<script type="text/javascript">
25+
// Begin Yahoo Store Generated Code
26+
</script> <script type="text/javascript" src="https://s.turbifycdn.com/lq/ult/ylc_1.9.js" ></script> <script type="text/javascript" src="https://s.turbifycdn.com/ae/lib/smbiz/store/csell/beacon-a9518fc6e4.js" >
27+
</script>
28+
<script type="text/javascript">
29+
// Begin Yahoo Store Generated Code
30+
csell_page_data = {}; csell_page_rec_data = []; ts='TOK_STORE_ID';
31+
</script>
32+
<script type="text/javascript">
33+
// Begin Yahoo Store Generated Code
34+
function csell_GLOBAL_INIT_TAG() { var csell_token_map = {}; csell_token_map['TOK_ITEM_ID_LIST'] = 'getideas'; csell_token_map['TOK_BEACON_TYPE'] = 'prod'; csell_token_map['TOK_RAND_KEY'] = 't'; csell_token_map['TOK_SPACEID'] = '2022276099'; csell_token_map['TOK_IS_ORDERABLE'] = '2'; csell_token_map['TOK_STORE_ID'] = 'paulgraham'; csell_token_map['TOK_URL'] = ''; csell_token_map['TOK_ORDER_HOST'] = 'order.store.yahoo.net'; c = csell_page_data; var x = (typeof storeCheckoutDomain == 'string')?storeCheckoutDomain:'order.store.yahoo.net'; var t = csell_token_map; c['s'] = t['TOK_SPACEID']; c['url'] = t['TOK_URL']; c['si'] = t[ts]; c['ii'] = t['TOK_ITEM_ID_LIST']; c['bt'] = t['TOK_BEACON_TYPE']; c['rnd'] = t['TOK_RAND_KEY']; c['io'] = t['TOK_IS_ORDERABLE']; YStore.addItemUrl = 'http%s://'+x+'/'+t[ts]+'/ymix/MetaController.html?eventName.addEvent&cartDS.shoppingcart_ROW0_m_orderItemVector_ROW0_m_itemId=%s&cartDS.shoppingcart_ROW0_m_orderItemVector_ROW0_m_quantity=1&ysco_key_cs_item=1&sectionId=ysco.cart&ysco_key_store_id='+t[ts]; }
35+
</script>
36+
<script type="text/javascript">
37+
// Begin Yahoo Store Generated Code
38+
function csell_REC_VIEW_TAG() { var env = (typeof csell_env == 'string')?csell_env:'prod'; var p = csell_page_data; var a = '/sid='+p['si']+'/io='+p['io']+'/ii='+p['ii']+'/bt='+p['bt']+'-view'+'/en='+env; var r=Math.random(); YStore.CrossSellBeacon.renderBeaconWithRecData(p['url']+'/p/s='+p['s']+'/'+p['rnd']+'='+r+a); }
39+
</script>
40+
<script type="text/javascript">
41+
// Begin Yahoo Store Generated Code
42+
var csell_token_map = {}; csell_token_map['TOK_PAGE'] = 'p'; csell_token_map['TOK_WS_URL'] = 'https://paulgraham.csell.store.yahoo.net/cs/recommend?itemids=getideas&location=p'; csell_token_map['TOK_SHOW_CS_RECS'] = 'false'; csell_token_map['TOK_CURR_SYM'] = '$'; var t = csell_token_map; csell_GLOBAL_INIT_TAG(); YStore.page = t['TOK_PAGE']; YStore.currencySymbol = t['TOK_CURR_SYM']; YStore.crossSellUrl = t['TOK_WS_URL']; YStore.showCSRecs = t['TOK_SHOW_CS_RECS']; </script> <script type="text/javascript" src="https://s.turbifycdn.com/ae/store/secure/recs-1.3.2.2.js" ></script> <script type="text/javascript" >
43+
</script>
44+
</html>

Diff for: test_unstructured/partition/test_html_partition.py

+8
Original file line number | Diff line number | Diff line change
@@ -98,3 +98,11 @@ def test_partition_html_raises_with_too_many_specified():
9898

9999
with pytest.raises(ValueError):
100100
partition_html(filename=filename, text=text)
101+
102+
103+
def test_partition_html_on_ideas_page():
104+
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "ideas-page.html")
105+
elements = partition_html(filename=filename)
106+
document_text = "\n\n".join([str(el) for el in elements])
107+
assert document_text.startswith("January 2023(Someone fed my essays into GPT")
108+
assert document_text.endswith("whole new fractal buds.")

Diff for: test_unstructured_ingest/expected-structured-output/github-downloadify/test.html.json

+4 -4
Original file line number | Diff line number | Diff line change
@@ -8,8 +8,8 @@
88
}
99
},
1010
{
11-
"element_id": "4300054a3c2601f905282a7bc7199044",
12-
"text": "More info available at the \n\t\tGithub Project Page",
11+
"element_id": "d551bbfc9477547e4dce6264d8196c7b",
12+
"text": "More info available at the Github Project Page",
1313
"type": "Title",
1414
"metadata": {
1515
"page_number": 1
@@ -24,8 +24,8 @@
2424
}
2525
},
2626
{
27-
"element_id": "a309823c9d508290682a198270b84bca",
28-
"text": "File Contents\nWhatever you put in this text box will be downloaded and saved in the file. If you leave it blank, no file will be downloaded",
27+
"element_id": "43f65b1c5bd47774b25c72e2f96de300",
28+
"text": "File Contents\n\nWhatever you put in this text box will be downloaded and saved in the file. If you leave it blank, no file will be downloaded",
2929
"type": "NarrativeText",
3030
"metadata": {
3131
"page_number": 1

Diff for: unstructured/__version__.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -1 +1 @@
1-
__version__ = "0.5.2-dev1" # pragma: no cover
1+
__version__ = "0.5.2" # pragma: no cover

Diff for: unstructured/documents/html.py

+9 -5
Original file line number | Diff line number | Diff line change
@@ -97,6 +97,7 @@ def _read(self) -> List[Page]:
9797
return self._pages
9898
logger.info("Reading document ...")
9999
pages: List[Page] = []
100+
etree.strip_elements(self.document_tree, ["script"])
100101
root = _find_main(self.document_tree)
101102

102103
articles = _find_articles(root)
@@ -213,6 +214,8 @@ def _parse_tag(
213214
processing the document tree again. In the future we might want to keep descendants too,
214215
but we don't have a use for them at the moment."""
215216
ancestortags: Tuple[str, ...] = tuple(el.tag for el in tag_elem.iterancestors())[::-1]
217+
if tag_elem.tag == "script":
218+
return None
216219
text = _construct_text(tag_elem)
217220
if not text:
218221
return None
@@ -265,11 +268,12 @@ def is_narrative_tag(text: str, tag: str) -> bool:
265268
def _construct_text(tag_elem: etree.Element) -> str:
266269
"""Extracts text from a text tag element."""
267270
text = ""
268-
for item in tag_elem.iter():
269-
if item.text and item.tag != "script":
270-
text += item.text
271-
if item.tail:
272-
text += item.tail
271+
for item in tag_elem.itertext():
272+
if item:
273+
text += item
274+
275+
if tag_elem.tail:
276+
text = text + tag_elem.tail
273277

274278
text = replace_unicode_quotes(text)
275279
return text.strip()

0 commit comments

Comments
 (0)