fix imdb scraping

koffeinfrei · koffeinfrei · commit 137d3756e718 · 2026-03-03T21:25:39.000+01:00
IMDb now seems to require JS for the challenge on every request. This
unfortunately means that we need to spin up an actual browser to make
the request. I couldn't get a non-JS request to work; even in the
browser (with cookies cleared), a non-JS request would fail.

I'd massively prefer to use geckodriver, but on average a Chrome request
(including process spinup) takes around 2s, while FF takes 2-3 times as
much time. That's a massive overhead I can't in good conscience add to
every movie show request requiring a scrape.
For completeness, here's the code that works with FF:

```ruby
options = Selenium::WebDriver::Firefox::Options.new
options.add_argument('--headless')
options.binary = '/snap/firefox/current/usr/lib/firefox/firefox'
driver = Selenium::WebDriver.for :firefox, options: options
```
diff --git a/www/Gemfile b/www/Gemfile
@@ -22,6 +22,7 @@ gem "passwordless"
 gem "csv", require: false
 gem "good_job"
 gem "sparql", require: false
+gem "selenium-webdriver"
 
 group :development, :test do
   gem "debug", require: "debug/prelude"
@@ -40,6 +41,5 @@ group :test do
   # https://github.com/minitest/minitest/issues/1040#issuecomment-3668131216
   gem "minitest", "< 6"
   gem "capybara"
-  gem "selenium-webdriver"
   gem "webmock"
 end
diff --git a/www/app/models/http_grabber.rb b/www/app/models/http_grabber.rb
@@ -1,20 +1,65 @@
-class HttpGrabber
-  def initialize(url)
-    @url = url
+# Strategies for fetching a page and extracting the content of one element.
+# Each grabber is built with a URL and exposes `run(selector)`, which returns
+# the matched element's content, or nil when nothing matches or the fetch fails.
+module HttpGrabber
+  # Plain HTTP fetch via Curl; the response body is parsed with Nokogiri.
+  class Curl
+    def initialize(url)
+      @url = url
+    end
+
+    # Returns the text content of the first node matching the CSS `selector`,
+    # or nil if there is no match or any StandardError is raised (it is logged).
+    def run(selector)
+      response = ::Curl.get(@url) do |http|
+        http.connect_timeout = 5
+        http.timeout = 5
+        http.follow_location = true
+        http.headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
+        http.headers["User-Agent"] = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:145.0) Gecko/20100101 Firefox/145.0"
+      end
+      page = Nokogiri::HTML(response.body_str)
+      page.at_css(selector)&.content
+    rescue StandardError => e
+      Rails.logger.error("event=http_grabber_error url=\"#{@url}\" error=\"#{e.inspect}\"")
+      nil
+    end
   end
 
-  def run(selector)
-    response = Curl.get(@url) do |http|
-      http.connect_timeout = 5
-      http.timeout = 5
-      http.follow_location = true
-      http.headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
-      http.headers["User-Agent"] = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:145.0) Gecko/20100101 Firefox/145.0"
+  # Fetch through headless Chrome, for pages that require JavaScript to load.
+  class Selenium
+    def initialize(url)
+      @url = url
+    end
+
+    # Returns the innerHTML of the first element matching the CSS `selector`,
+    # or nil on any StandardError (logged), including the wait timing out.
+    def run(selector)
+      options = ::Selenium::WebDriver::Options.chrome
+      options.add_argument("--headless=new")
+      options.add_argument("user-agent=mozilla/5.0 (x11; ubuntu; linux x86_64; rv:147.0) gecko/20100101 firefox/147.0")
+      options.timeouts = {
+        page_load: 5_000, # 5 seconds
+        script: 5_000     # 5 seconds
+      }
+      # Do not block on page load; the explicit wait below polls for the element.
+      options.page_load_strategy = :none
+      driver = ::Selenium::WebDriver.for(:chrome, options: options)
+
+      begin
+        driver.get(@url)
+
+        wait = ::Selenium::WebDriver::Wait.new(timeout: 5)
+        element = wait.until { driver.find_element(css: selector) }
+        element.attribute("innerHTML")
+      ensure
+        # Always shut the browser down; otherwise a timeout or lookup failure
+        # would leave the browser session running after a failed scrape.
+        driver.quit
+      end
+    rescue StandardError => e
+      Rails.logger.error("event=http_grabber_error url=\"#{@url}\" error=\"#{e.inspect}\"")
+      nil
     end
-    page = Nokogiri::HTML(response.body_str)
-    page.at_css(selector)&.content
-  rescue StandardError => e
-    Rails.logger.error("event=http_grabber_error url=\"#{@url}\" error=\"#{e.inspect}\"")
-    nil
   end
 end
diff --git a/www/app/models/score_fetcher.rb b/www/app/models/score_fetcher.rb
@@ -21,7 +21,7 @@ def fetch_imdb
     url = @result.movie.imdb_url
     return unless url
 
-    if html = HttpGrabber.new(url).run('script[type="application/ld+json"]')
+    if html = HttpGrabber::Selenium.new(url).run('script[type="application/ld+json"]')
       data = JSON.parse(html)
 
       if score = data.dig("aggregateRating", "ratingValue")
@@ -39,7 +39,7 @@ def fetch_metacritic
     url = @result.movie.metacritic_url
     return unless url
 
-    if html = HttpGrabber.new(url).run('script[type="application/ld+json"]')
+    if html = HttpGrabber::Curl.new(url).run('script[type="application/ld+json"]')
       data = JSON.parse(html)
 
       if score = data.dig("aggregateRating", "ratingValue")
@@ -55,7 +55,7 @@ def fetch_rotten_tomatoes
     url = @result.movie.rotten_url
     return unless url
 
-    if html = HttpGrabber.new(url).run("#media-scorecard-json")
+    if html = HttpGrabber::Curl.new(url).run("#media-scorecard-json")
       data = JSON.parse(html)
 
       if critics_score = data.dig("criticsScore", "score")