Commit 24bb20c

Added dev branch features.
Bumped to version 0.6
1 parent 1546587 commit 24bb20c

9 files changed (+223, -173 lines)

Diff for: README (+4)

@@ -34,6 +34,9 @@ Command-line usage::
 
     python -m readability.readability -u http://pypi.python.org/pypi/readability-lxml
 
+To open resulting page in browser::
+
+    python -m readability.readability -b -u http://pypi.python.org/pypi/readability-lxml
 
 Using positive/negative keywords example::
 
@@ -56,3 +59,4 @@ Updates
 - 0.3 Added Document.encoding, positive_keywords and negative_keywords
 - 0.4 Added Videos loading and allowed more images per paragraph
 - 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and 3.4
+- 0.6 Finally a release which supports Python versions 2.6, 2.7, 3.3 and 3.4

Diff for: readability/browser.py (+20)

@@ -0,0 +1,20 @@
+def open_in_browser(html):
+    """
+    Open the HTML document in a web browser, saving it to a temporary
+    file to open it. Note that this does not delete the file after
+    use. This is mainly meant for debugging.
+    """
+    import os
+    import webbrowser
+    import tempfile
+    handle, fn = tempfile.mkstemp(suffix='.html')
+    f = os.fdopen(handle, 'wb')
+    try:
+        f.write(b"<meta charset='UTF-8' />")
+        f.write(html.encode('utf-8'))
+    finally:
+        # we leak the file itself here, but we should at least close it
+        f.close()
+    url = 'file://' + fn.replace(os.path.sep, '/')
+    webbrowser.open(url)
+    return url
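
The new helper backs the -b command-line flag documented in the README change above. A minimal usage sketch, not part of the commit (Document and summary() are readability-lxml's existing API; the sample HTML is invented)::

    from readability.readability import Document
    from readability.browser import open_in_browser

    html = "<html><body><p>Hello, readability!</p></body></html>"
    doc = Document(html)
    # Writes a temporary .html file and opens it in the default browser;
    # the temp file is intentionally left on disk for debugging.
    open_in_browser(doc.summary())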

Diff for: readability/cleaners.py (+3, -2)

@@ -2,7 +2,7 @@
 import re
 from lxml.html.clean import Cleaner
 
-bad_attrs = ['style', '[-a-z]*color', 'background[-a-z]*', 'on*']
+bad_attrs = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*', 'on*']
 single_quoted = "'[^']+'"
 double_quoted = '"[^"]+"'
 non_space = '[^ "\'>]+'
@@ -20,7 +20,8 @@ def clean_attributes(html):
     return html
 
 def normalize_spaces(s):
-    if not s: return ''
+    if not s:
+        return ''
     """replace any sequence of whitespace
     characters with a single space"""
     return ' '.join(s.split())
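
A rough illustration of the two helpers touched above, not from the commit (the input strings are invented)::

    from readability.cleaners import clean_attributes, normalize_spaces

    # Collapses any run of whitespace into a single space.
    print(normalize_spaces('one    two\n\tthree'))   # 'one two three'

    # With 'width' and 'height' added to bad_attrs, sizing attributes are
    # stripped along with inline styles, colors and on* event handlers.
    print(clean_attributes('<div width="400" style="color: red">text</div>'))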

Diff for: readability/compat/__init__.py (+5)

@@ -4,3 +4,8 @@
 It mainly exists because their are certain incompatibilities in the Python
 syntax that can only be solved by conditionally importing different functions.
 """
+import sys
+if sys.version_info[0] == 2:
+    str_ = unicode
+elif sys.version_info[0] == 3:
+    str_ = str
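
Illustrative only: str_ gives the rest of the package one name for the text type (unicode on Python 2, str on Python 3). A hypothetical helper using it::

    from readability.compat import str_

    def ensure_text(value):
        # Hypothetical: pass text through unchanged, decode bytes.
        if isinstance(value, str_):
            return value
        return value.decode('utf-8')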

Diff for: readability/debug.py (+45, -17)

@@ -1,25 +1,53 @@
-def save_to_file(text, filename):
-    f = open(filename, 'wt')
-    f.write('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
-    f.write(text.encode('utf-8'))
-    f.close()
+import re
 
-uids = {}
-def describe(node, depth=2):
+
+#FIXME: use with caution, can leak memory
+uids = {}
+uids_document = None
+
+
+def describe_node(node):
+    global uids
+    if node is None:
+        return ''
     if not hasattr(node, 'tag'):
         return "[%s]" % type(node)
     name = node.tag
-    if node.get('id', ''): name += '#'+node.get('id')
-    if node.get('class', ''):
-        name += '.' + node.get('class').replace(' ','.')
+    if node.get('id', ''):
+        name += '#' + node.get('id')
+    if node.get('class', ''):
+        name += '.' + node.get('class').replace(' ', '.')
     if name[:4] in ['div#', 'div.']:
         name = name[3:]
     if name in ['tr', 'td', 'div', 'p']:
-        if not node in uids:
-            uid = uids[node] = len(uids)+1
-        else:
-            uid = uids.get(node)
-        name += "%02d" % (uid)
-    if depth and node.getparent() is not None:
-        return name+' - '+describe(node.getparent(), depth-1)
+        uid = uids.get(node)
+        if uid is None:
+            uid = uids[node] = len(uids) + 1
+        name += "{%02d}" % uid
     return name
+
+
+def describe(node, depth=2):
+    global uids, uids_document
+    doc = node.getroottree().getroot()
+    if doc != uids_document:
+        uids = {}
+        uids_document = doc
+
+    #return repr(NodeRepr(node))
+    parent = ''
+    if depth and node.getparent() is not None:
+        parent = describe(node.getparent(), depth=depth - 1)
+    return parent + '/' + describe_node(node)
+
+
+RE_COLLAPSE_WHITESPACES = re.compile('\s+', re.U)
+
+
+def text_content(elem, length=40):
+    content = RE_COLLAPSE_WHITESPACES.sub(' ', elem.text_content().replace('\r', ''))
+    if len(content) < length:
+        return content
+    return content[:length] + '...'
+
+
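
A sketch of exercising the reworked helpers, not part of the commit (the HTML fragment is invented; exact output depends on the node's tag, id and class)::

    import lxml.html
    from readability.debug import describe, text_content

    doc = lxml.html.fromstring(
        '<div id="main"><p class="lead">Some long paragraph text here</p></div>')
    p = doc.find('.//p')
    # Builds a slash-separated ancestor path such as '/body/#main/p.lead';
    # bare tr/td/div/p nodes additionally get a {NN} counter from the
    # per-document uids cache.
    print(describe(p))
    # Collapses whitespace and truncates long text to 40 characters.
    print(text_content(p))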

Diff for: readability/encoding.py (+36, -28)

@@ -2,15 +2,34 @@
 import chardet
 import sys
 
+
+RE_CHARSET = re.compile(br'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
+RE_PRAGMA = re.compile(br'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
+RE_XML = re.compile(br'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
+
+CHARSETS = {
+    'big5': 'big5hkscs',
+    'gb2312': 'gb18030',
+    'ascii': 'utf-8',
+    'maccyrillic': 'cp1251',
+    'win1251': 'cp1251',
+    'win-1251': 'cp1251',
+    'windows-1251': 'cp1251',
+}
+
+def fix_charset(encoding):
+    """Overrides encoding when charset declaration
+    or charset determination is a subset of a larger
+    charset. Created because of issues with Chinese websites"""
+    encoding = encoding.lower()
+    return CHARSETS.get(encoding, encoding)
+
+
 def get_encoding(page):
     # Regex for XML and HTML Meta charset declaration
-    charset_re = re.compile(br'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
-    pragma_re = re.compile(br'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
-    xml_re = re.compile(br'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
-
-    declared_encodings = (charset_re.findall(page) +
-                          pragma_re.findall(page) +
-                          xml_re.findall(page))
+    declared_encodings = (RE_CHARSET.findall(page) +
+                          RE_PRAGMA.findall(page) +
+                          RE_XML.findall(page))
 
     # Try any declared encodings
     for declared_encoding in declared_encodings:
@@ -21,34 +40,23 @@ def get_encoding(page):
             # ever use non-ascii characters in the name of an encoding.
             declared_encoding = declared_encoding.decode('ascii', 'replace')
 
-            page.decode(custom_decode(declared_encoding))
-            return custom_decode(declared_encoding)
+            encoding = fix_charset(declared_encoding)
+
+            # Now let's decode the page
+            page.decode()
+            # It worked!
+            return encoding
         except UnicodeDecodeError:
             pass
 
     # Fallback to chardet if declared encodings fail
-    text = re.sub(b'</?[^>]*>\s*', b' ', page)
+    # Remove all HTML tags, and leave only text for chardet
+    text = re.sub(b'(\s*</?[^>]*>)+\s*', b' ', page).strip()
     enc = 'utf-8'
-    if not text.strip() or len(text) < 10:
+    if len(text) < 10:
         return enc # can't guess
     res = chardet.detect(text)
     enc = res['encoding'] or 'utf-8'
     #print '->', enc, "%.2f" % res['confidence']
-    enc = custom_decode(enc)
+    enc = fix_charset(enc)
     return enc
-
-def custom_decode(encoding):
-    """Overrides encoding when charset declaration
-    or charset determination is a subset of a larger
-    charset. Created because of issues with Chinese websites"""
-    encoding = encoding.lower()
-    alternates = {
-        'big5': 'big5hkscs',
-        'gb2312': 'gb18030',
-        'ascii': 'utf-8',
-        'MacCyrillic': 'cp1251',
-    }
-    if encoding in alternates:
-        return alternates[encoding]
-    else:
-        return encoding
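
A small sketch of the resulting flow, not from the commit (the byte string is invented): a declared charset is normalised through CHARSETS before the page is decoded::

    from readability.encoding import get_encoding

    page = b'<html><head><meta charset="gb2312"></head><body>hello</body></html>'
    # The declared gb2312 should be widened to the gb18030 superset.
    print(get_encoding(page))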

Diff for: readability/htmls.py (+20, -14)

@@ -5,26 +5,25 @@
 
 from .cleaners import normalize_spaces, clean_attributes
 from .encoding import get_encoding
+from .compat import str_
 
 utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
 
-if sys.version_info[0] == 2:
-    str = unicode
-
 def build_doc(page):
-    if isinstance(page, str):
-        enc = None
-        page_unicode = page
+    if isinstance(page, str_):
+        encoding = None
+        decoded_page = page
     else:
-        enc = get_encoding(page) or 'utf-8'
-        page_unicode = page.decode(enc, 'replace')
-    doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
-    return doc, enc
+        encoding = get_encoding(page) or 'utf-8'
+        decoded_page = page.decode(encoding, 'replace')
+
+    # XXX: we have to do .decode and .encode even for utf-8 pages to remove bad characters
+    doc = lxml.html.document_fromstring(decoded_page.encode('utf-8', 'replace'), parser=utf8_parser)
+    return doc, encoding
 
 def js_re(src, pattern, flags, repl):
     return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))
 
-
 def normalize_entities(cur_title):
     entities = {
         u'\u2014':'-',
@@ -58,6 +57,10 @@ def add_match(collection, text, orig):
     if text.replace('"', '') in orig.replace('"', ''):
         collection.add(text)
 
+TITLE_CSS_HEURISTICS = ['#title', '#head', '#heading', '.pageTitle',
+                        '.news_title', '.title', '.head', '.heading',
+                        '.contentheading', '.small_header_red']
+
 def shorten_title(doc):
     title = doc.find('.//title')
     if title is None or title.text is None or len(title.text) == 0:
@@ -74,7 +77,7 @@ def shorten_title(doc):
         if e.text_content():
             add_match(candidates, e.text_content(), orig)
 
-    for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title', '.title', '.head', '.heading', '.contentheading', '.small_header_red']:
+    for item in TITLE_CSS_HEURISTICS:
         for e in doc.cssselect(item):
             if e.text:
                 add_match(candidates, e.text, orig)
@@ -107,8 +110,11 @@ def shorten_title(doc):
     return title
 
 def get_body(doc):
-    [ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]
-    raw_html = str(tostring(doc.body or doc))
+    for elem in doc.xpath('.//script | .//link | .//style'):
+        elem.drop_tree()
+    # tostring() always return utf-8 encoded string
+    # FIXME: isn't better to use tounicode?
+    raw_html = str_(tostring(doc.body or doc))
     cleaned = clean_attributes(raw_html)
     try:
         #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?