-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathhtmlParser.py
79 lines (55 loc) · 2.75 KB
/
htmlParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import lxml.html as lh
from types import *
import urllib2
from urllib2 import urlopen
import logging
from datetime import date
from datetime import datetime
import csv
class htmlParser:
    """Download a page and extract the text of the first element matching a CSS selector.

    NOTE: this is Python 2 code; it relies on ``urllib2`` and on
    ``StringType`` / ``UnicodeType`` from the file-level ``from types import *``.
    """

    def __init__(self, url, tagType):
        """Remember what to fetch.

        url     -- address of the page to download.
        tagType -- CSS selector expression; only the first matching element
                   is used by ``fetchText``.
        """
        self.url = url
        self.tagType = tagType

    def removeNonAscii(self, text):
        """Return ``text`` with every character whose code point is >= 128
        replaced by a single space."""
        return ''.join(ch if ord(ch) < 128 else ' ' for ch in text)

    def fetchText(self):
        """Fetch ``self.url`` and return the ASCII-only text of the first
        element matching ``self.tagType``.

        The text of every element returned by ``cssselect('*')`` on that first
        match is right-stripped and joined with ``'. '``; ``script`` and
        ``style`` elements are skipped. The result always starts with a single
        space.

        This is best-effort: any exception during download or parsing is
        logged (with traceback) and whatever text was collected so far is
        returned, so the caller gets ``' '`` when nothing could be extracted
        (including when no element matches the selector).
        """
        # Lazy %-formatting: a non-string URL can no longer raise TypeError
        # here, before the try block has a chance to handle the failure.
        logging.debug("%s ## START htmlParser ## %s", datetime.now(), self.url)

        # Collected pieces; the leading ' ' preserves the original return
        # value for the "nothing found" and "failed early" cases.
        parts = [' ']
        try:
            req = urllib2.Request(self.url, headers={'User-Agent': "Magic Browser"})
            response = urlopen(req)
            try:
                html = response.read()
            finally:
                # Release the connection even if read() fails.
                response.close()

            # fromstring() picks document or fragment parsing depending on
            # what the markup looks like.
            root = lh.fromstring(html)
            articleElements = root.cssselect(self.tagType)
            if articleElements:
                # Using text_content() on the element would also pull in
                # script/style bodies, so walk the elements and filter instead.
                for elem in articleElements[0].cssselect('*'):
                    if elem.tag in ('script', 'style'):
                        continue
                    if isinstance(elem.text, (StringType, UnicodeType)):
                        parts.append(elem.text.rstrip() + '. ')
        except Exception:
            # Deliberate catch-all: one bad article must not stop a crawl.
            # exc_info=True records why it failed instead of discarding it,
            # and passing the URL as an argument avoids str() on unicode.
            logging.warning("Could not process article from %s", self.url,
                            exc_info=True)
        return self.removeNonAscii(''.join(parts))
# start = datetime.now()
# obj = htmlParser('http://www.winbeta.org/news/classrooms-around-the-world-invited-to-participate-in-skype-a-thon', 'article')
# text = obj.fetchText()
#
# with open("C:\Users\Jyoti.Gupta\Documents\crawler\sample.csv" , 'ab') as csvFile:
# csvWriter = csv.writer(csvFile, delimiter="," , quoting=csv.QUOTE_ALL)
#
# csvWriter.writerow([ text[:32760]])
#
# print (datetime.now()-start).total_seconds()