Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion openscraper/config/settings_corefields.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,7 @@
"HTTPCACHE_ENABLED",
"AUTOTHROTTLE_ENABLED",
"ROBOTSTXT_OBEY",
"CHROME_HEADLESS"

]
CONTRIBUTOR_EDIT_FIELDS_RADIO_TEXTS = {
Expand Down Expand Up @@ -302,7 +303,9 @@
"ROBOTSTXT_OBEY",

"BOT_NAME",
"USER_AGENT"
"USER_AGENT",

"CHROME_HEADLESS"

# "page_count" , # keep track of how many pages were crawled
]
Expand Down Expand Up @@ -375,6 +378,7 @@
"ROBOTSTXT_OBEY" : ROBOTSTXT_OBEY,
"BOT_NAME" : BOT_NAME,
"USER_AGENT" : USER_AGENT,
"CHROME_HEADLESS" : CHROME_HEADLESS,

"download_delay" : 0.5, # delay

Expand Down
2 changes: 2 additions & 0 deletions openscraper/config/settings_scrapy.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@
# 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
ROBOTSTXT_OBEY = False

CHROME_HEADLESS = True

'''
check
wget -U 'Open Scraper (+https://github.com/entrepreneur-interet-general/OpenScraper)' https://fondation.credit-cooperatif.coop/acor
Expand Down
1 change: 1 addition & 0 deletions openscraper/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -517,6 +517,7 @@ def __init__(self, mode="default"):
self.coll_spiders.update_many({'scraper_settings.HTTPCACHE_ENABLED' : {"$exists" : False}}, {"$set": {'scraper_settings.HTTPCACHE_ENABLED' : True }})
self.coll_spiders.update_many({'scraper_settings.AUTOTHROTTLE_ENABLED' : {"$exists" : False}}, {"$set": {'scraper_settings.AUTOTHROTTLE_ENABLED' : False }})
self.coll_spiders.update_many({'scraper_settings.ROBOTSTXT_OBEY' : {"$exists" : False}}, {"$set": {'scraper_settings.ROBOTSTXT_OBEY' : False }})
self.coll_spiders.update_many({'scraper_settings.CHROME_HEADLESS' : {"$exists" : False}}, {"$set": {'scraper_settings.CHROME_HEADLESS' : True }})
self.coll_spiders.update_many({'scraper_settings.BOT_NAME' : {"$exists" : False}}, {"$set": {'scraper_settings.BOT_NAME' : "OpenScraper" }})
self.coll_spiders.update_many({'scraper_settings.USER_AGENT' : {"$exists" : False}}, {"$set": {'scraper_settings.USER_AGENT' : "Open Scraper (+https://github.com/entrepreneur-interet-general/OpenScraper)" }})

Expand Down
1 change: 1 addition & 0 deletions openscraper/scraper/cis_spiders/cis_spiders/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
CHROME_HEADLESS = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
Expand Down
3 changes: 2 additions & 1 deletion openscraper/scraper/masterspider.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,6 @@
### cf : https://duo.com/decipher/driving-headless-chrome-with-python
options_selenium = webdriver.ChromeOptions()
# options.binary_location = '/usr/local/bin/chromedriver'
options_selenium.add_argument('headless')
# option.add_argument(' — incognito')
# set the window size
options_selenium.add_argument('window-size=1200x600')
Expand Down Expand Up @@ -770,6 +769,8 @@ def parse(self, response):

### specify executable path to launch webdriver-->
# cf : where chromedriver was installed when `brew install chromedriver`
if self.spider_config_flat['CHROME_HEADLESS']:
options_selenium.add_argument('headless')
self.driver = webdriver.Chrome(executable_path=chromedriver_path, chrome_options=options_selenium)
# self.driver = webdriver.Chrome(chrome_options=options_selenium)
# self.driver = webdriver.Firefox()
Expand Down