Skip to content

Commit 137d375

Browse files
committed
fix imdb scraping
IMDb now seems to require JS for the challenge on every request. This unfortunately means that we need to spin up an actual browser to make the request. I couldn't get a non-JS request to work; even in the browser (with cookies cleared), a non-JS request would fail. I'd massively prefer to use geckodriver, but on average a Chrome request (including process spin-up) takes around 2s, while Firefox takes 2-3 times as long. That's a massive delay I can't in good conscience add to every movie show request requiring a scrape. For completeness, here's the code that works with Firefox:
```ruby
options = Selenium::WebDriver::Firefox::Options.new
options.add_argument('--headless')
options.binary = '/snap/firefox/current/usr/lib/firefox/firefox'
driver = Selenium::WebDriver.for :firefox, options: options
```
1 parent 2c04a63 commit 137d375

File tree

3 files changed

+52
-19
lines changed

3 files changed

+52
-19
lines changed

www/Gemfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ gem "passwordless"
2222
gem "csv", require: false
2323
gem "good_job"
2424
gem "sparql", require: false
25+
gem "selenium-webdriver"
2526

2627
group :development, :test do
2728
gem "debug", require: "debug/prelude"
@@ -40,6 +41,5 @@ group :test do
4041
# https://github.com/minitest/minitest/issues/1040#issuecomment-3668131216
4142
gem "minitest", "< 6"
4243
gem "capybara"
43-
gem "selenium-webdriver"
4444
gem "webmock"
4545
end

www/app/models/http_grabber.rb

Lines changed: 48 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,53 @@
1-
class HttpGrabber
2-
def initialize(url)
3-
@url = url
1+
module HttpGrabber
2+
class Curl
3+
def initialize(url)
4+
@url = url
5+
end
6+
7+
def run(selector)
8+
response = ::Curl.get(@url) do |http|
9+
http.connect_timeout = 5
10+
http.timeout = 5
11+
http.follow_location = true
12+
http.headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
13+
http.headers["User-Agent"] = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:145.0) Gecko/20100101 Firefox/145.0"
14+
end
15+
page = Nokogiri::HTML(response.body_str)
16+
page.at_css(selector)&.content
17+
rescue StandardError => e
18+
Rails.logger.error("event=http_grabber_error url=\"#{@url}\" error=\"#{e.inspect}\"")
19+
nil
20+
end
421
end
522

6-
def run(selector)
7-
response = Curl.get(@url) do |http|
8-
http.connect_timeout = 5
9-
http.timeout = 5
10-
http.follow_location = true
11-
http.headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
12-
http.headers["User-Agent"] = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:145.0) Gecko/20100101 Firefox/145.0"
23+
class Selenium
24+
def initialize(url)
25+
@url = url
26+
end
27+
28+
def run(selector)
29+
options = ::Selenium::WebDriver::Options.chrome
30+
options.add_argument("--headless=new")
31+
options.add_argument("user-agent=mozilla/5.0 (x11; ubuntu; linux x86_64; rv:147.0) gecko/20100101 firefox/147.0")
32+
options.timeouts = {
33+
page_load: 5_000, # 5 seconds
34+
script: 5_000 # 5 seconds
35+
}
36+
options.page_load_strategy = :none
37+
driver = ::Selenium::WebDriver.for(:chrome, options: options)
38+
39+
driver.get(@url)
40+
41+
wait = ::Selenium::WebDriver::Wait.new(timeout: 5)
42+
element = wait.until { driver.find_element(css: 'script[type="application/ld+json"]') }
43+
content = element.attribute("innerHTML")
44+
45+
driver.quit
46+
47+
content
48+
rescue StandardError => e
49+
Rails.logger.error("event=http_grabber_error url=\"#{@url}\" error=\"#{e.inspect}\"")
50+
nil
1351
end
14-
page = Nokogiri::HTML(response.body_str)
15-
page.at_css(selector)&.content
16-
rescue StandardError => e
17-
Rails.logger.error("event=http_grabber_error url=\"#{@url}\" error=\"#{e.inspect}\"")
18-
nil
1952
end
2053
end

www/app/models/score_fetcher.rb

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def fetch_imdb
2121
url = @result.movie.imdb_url
2222
return unless url
2323

24-
if html = HttpGrabber.new(url).run('script[type="application/ld+json"]')
24+
if html = HttpGrabber::Selenium.new(url).run('script[type="application/ld+json"]')
2525
data = JSON.parse(html)
2626

2727
if score = data.dig("aggregateRating", "ratingValue")
@@ -39,7 +39,7 @@ def fetch_metacritic
3939
url = @result.movie.metacritic_url
4040
return unless url
4141

42-
if html = HttpGrabber.new(url).run('script[type="application/ld+json"]')
42+
if html = HttpGrabber::Curl.new(url).run('script[type="application/ld+json"]')
4343
data = JSON.parse(html)
4444

4545
if score = data.dig("aggregateRating", "ratingValue")
@@ -55,7 +55,7 @@ def fetch_rotten_tomatoes
5555
url = @result.movie.rotten_url
5656
return unless url
5757

58-
if html = HttpGrabber.new(url).run("#media-scorecard-json")
58+
if html = HttpGrabber::Curl.new(url).run("#media-scorecard-json")
5959
data = JSON.parse(html)
6060

6161
if critics_score = data.dig("criticsScore", "score")

0 commit comments

Comments
 (0)