-
Notifications
You must be signed in to change notification settings - Fork 29
/
Copy pathcrawl_single_page.py
executable file
·56 lines (41 loc) · 2.02 KB
/
crawl_single_page.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
header = '''
#####################################################################################################################
#Program: docbao crawler
#Author: hailoc12
#Version: 1.0.0
#Date: 14/06/2019
#Repository: http://github.com/hailoc12/docbao_crawler
#File: crawl_single_page.py
#Function: demo crawl single page using firefox browser
#####################################################################################################################
'''
from lib import *

print(header)


def _print_page_title(page_url, page_html):
    """Parse *page_html* with lxml and print the page's <title> text.

    Args:
        page_url: URL the HTML was fetched from; used only in the printed message.
        page_html: HTML source returned by ``read_url_source``, or ``None``
            when the crawl failed (in which case nothing is printed).
    """
    if page_html is None:  # crawl failed; nothing to extract
        return

    html_tree = etree.HTML(page_html)  # use lxml to parse HTML to element tree
    # Defensive guard: skip the XPath query if the parser produced no tree.
    titles = html_tree.xpath("//title/text()") if html_tree is not None else []
    if not titles:
        # A page without a (non-empty) <title> must not crash the demo with IndexError.
        print("Page %s has no title" % page_url)
        return

    print("Title of page %s is: %s" % (page_url, titles[0]))


### CHANGE THIS
url = 'http://dantri.com.vn'  # crawl page

# set crawl configuration
webconfig = WebConfig()  # object contains crawl configuration for a specific website
browser = BrowserWrapper()  # wrapper to get back reference to Firefox browser created in read_url_source function

# MAIN CALL HERE !!!
try:
    # crawl without browser
    print("Demo crawl without browser")
    webconfig.set_config('use_browser', False)  # Use Firefox browser to crawl or not
    html = read_url_source(url, webconfig, browser)
    # extract data
    _print_page_title(url, html)

    # crawl with browser
    print("Demo crawl with browser")
    webconfig.set_config('use_browser', True)  # Use Firefox browser to crawl or not
    webconfig.set_config('browser_fast_load', True)  # use adblock extensions, disable css...to load page faster
    webconfig.set_config('display_browser', False)  # note: display_browser=True won't work if program is run through SSH
    html = read_url_source(url, webconfig, browser)
    # extract data
    _print_page_title(url, html)
finally:
    # quit browser to avoid memory leak: IMPORTANT !
    # Done in `finally` so the browser is released even when a crawl or parse step raises.
    browser.quit()