-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhtml_reader.py
28 lines (22 loc) · 1.03 KB
/
html_reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# This class is used to read NHS pages and return the relevant text only
from bs4 import BeautifulSoup
import urllib2
import re
class HTMLReader(object):
    """Fetch a page and return only the text of its main content area.

    The page is requested with a browser-like ``User-Agent`` header. Text is
    taken from the ``div`` whose class attribute is exactly
    ``"main-content healthaz-content clear"``, after dropping the
    ``webZoneMiddleTop`` and ``review-dates`` divs when they are present.
    """

    def __init__(self, url=''):
        """Remember the page address and the request headers to send.

        Args:
            url: Address of the page to read; defaults to an empty string.
        """
        self.url = url
        self.headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}

    @staticmethod
    def _clean_text(text):
        """Collapse the whitespace artefacts left behind by text extraction.

        Blank-line gaps (two consecutive newlines) are removed, and runs of
        two spaces -- which is how list items come out of the extracted
        text -- are turned into ``", "``. Single spaces are left alone so
        ordinary sentences stay intact.
        """
        without_gaps = text.replace('\n\n', '')
        return without_gaps.replace('  ', ', ')

    def get_text_from_page(self):
        """Download ``self.url`` and return its cleaned main-content text.

        Returns:
            The cleaned text of the main content div, or an empty string
            when the page does not contain that div.

        Errors raised by ``urllib2`` while fetching the page are not caught
        here and propagate to the caller.
        """
        request = urllib2.Request(self.url, None, self.headers)
        response = urllib2.urlopen(request)
        try:
            html = response.read()
        finally:
            # Release the connection even if reading the body fails.
            response.close()

        # Name the parser explicitly so results do not depend on which
        # optional parsers happen to be installed.
        html_dom = BeautifulSoup(html, "html.parser")

        # get main content div
        content_div = html_dom.find("div", {"class": "main-content healthaz-content clear"})
        if content_div is None:
            return ''

        # remove irrelevant divs; look each one up only after the previous
        # one has been removed, and skip any that this page does not have
        for attrs in ({"id": "webZoneMiddleTop"}, {"class": "review-dates"}):
            unwanted = content_div.find('div', attrs)
            if unwanted is not None:
                unwanted.decompose()

        return self._clean_text(content_div.text)