geek-cookbook · pull · May 31, 2026 · May 31, 2026 · May 31, 2026 · May 31, 2026
diff --git a/backend/Cargo.lock b/backend/Cargo.lock
diff --git a/backend/Cargo.toml b/backend/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "mediafusion-api"
-version = "6.0.0-beta.12"
+version = "6.0.0-beta.13"
 edition = "2021"
 
 [lib]
@@ -64,6 +64,7 @@ sha2 = "0.11"
 
 # Compression
 flate2 = "1"
+tempfile = "3"
 
 # Random bytes for AES IV generation + sports poster selection
 rand_core = "0.10"

diff --git a/backend/migrations/0011_imdb_dataset_import.down.sql b/backend/migrations/0011_imdb_dataset_import.down.sql
@@ -0,0 +1,12 @@
+DELETE FROM cron_jobs WHERE name = 'imdb_dataset_import';
+-- Dropping each table also removes the indexes created on it by the up migration.
+DROP TABLE IF EXISTS imdb_stage_names;
+DROP TABLE IF EXISTS imdb_stage_principals;
+DROP TABLE IF EXISTS imdb_stage_episode;
+DROP TABLE IF EXISTS imdb_stage_crew;
+DROP TABLE IF EXISTS imdb_stage_akas;
+DROP TABLE IF EXISTS imdb_stage_ratings;
+DROP TABLE IF EXISTS imdb_stage_basics;
+DROP TABLE IF EXISTS imdb_import_state;
+
+-- Provider seed rows (metadata_provider, rating_provider) are intentionally left in place (shared with Python runtime).
diff --git a/backend/migrations/0011_imdb_dataset_import.up.sql b/backend/migrations/0011_imdb_dataset_import.up.sql
@@ -0,0 +1,108 @@
+-- IMDb bulk import infrastructure (additive only — no ALTER on existing tables).
+
+-- ── Provider seeds (idempotent) ─────────────────────────────────────────────
+
+INSERT INTO metadata_provider (name, display_name, is_external, is_active, priority, default_priority, created_at)
+VALUES
+    ('imdb', 'IMDb', true, true, 10, 10, now()),
+    ('tvdb', 'TVDB', true, true, 15, 15, now()),
+    ('tmdb', 'TMDB', true, true, 20, 20, now())
+ON CONFLICT (name) DO NOTHING;
+
+INSERT INTO rating_provider (name, display_name, max_rating, is_percentage, is_active, display_order)
+VALUES ('imdb', 'IMDb', 10, false, true, 10)
+ON CONFLICT (name) DO NOTHING;
+
+-- ── Import state (conditional GET + observability) ──────────────────────────
+
+CREATE TABLE IF NOT EXISTS imdb_import_state (
+    dataset       text PRIMARY KEY,
+    etag          text,
+    last_modified text,
+    last_run_at   timestamptz,
+    rows_loaded   bigint
+);
+
+-- ── UNLOGGED staging tables (all text; column order matches IMDb TSV headers) ─
+
+CREATE UNLOGGED TABLE IF NOT EXISTS imdb_stage_basics (
+    tconst          text,
+    title_type      text,
+    primary_title   text,
+    original_title  text,
+    is_adult        text,
+    start_year      text,
+    end_year        text,
+    runtime_minutes text,
+    genres          text
+);
+
+CREATE UNLOGGED TABLE IF NOT EXISTS imdb_stage_ratings (
+    tconst          text,
+    average_rating  text,
+    num_votes       text
+);
+
+CREATE UNLOGGED TABLE IF NOT EXISTS imdb_stage_akas (
+    title_id            text,
+    ordering            text,
+    title               text,
+    region              text,
+    language            text,
+    types               text,
+    attributes          text,
+    is_original_title   text
+);
+
+CREATE UNLOGGED TABLE IF NOT EXISTS imdb_stage_crew (
+    tconst      text,
+    directors   text,
+    writers     text
+);
+
+CREATE UNLOGGED TABLE IF NOT EXISTS imdb_stage_episode (
+    tconst          text,
+    parent_tconst   text,
+    season_number   text,
+    episode_number  text
+);
+
+CREATE UNLOGGED TABLE IF NOT EXISTS imdb_stage_principals (
+    tconst      text,
+    ordering    text,
+    nconst      text,
+    category    text,
+    job         text,
+    characters  text
+);
+
+CREATE UNLOGGED TABLE IF NOT EXISTS imdb_stage_names (
+    nconst              text,
+    primary_name        text,
+    birth_year          text,
+    death_year          text,
+    primary_profession  text,
+    known_for_titles    text
+);
+
+CREATE INDEX IF NOT EXISTS imdb_stage_basics_tconst_idx
+    ON imdb_stage_basics (tconst);
+CREATE INDEX IF NOT EXISTS imdb_stage_ratings_tconst_idx
+    ON imdb_stage_ratings (tconst);
+CREATE INDEX IF NOT EXISTS imdb_stage_akas_title_id_idx
+    ON imdb_stage_akas (title_id);
+CREATE INDEX IF NOT EXISTS imdb_stage_crew_tconst_idx
+    ON imdb_stage_crew (tconst);
+CREATE INDEX IF NOT EXISTS imdb_stage_episode_parent_idx
+    ON imdb_stage_episode (parent_tconst);
+CREATE INDEX IF NOT EXISTS imdb_stage_principals_tconst_idx
+    ON imdb_stage_principals (tconst);
+CREATE INDEX IF NOT EXISTS imdb_stage_principals_nconst_idx
+    ON imdb_stage_principals (nconst);
+CREATE INDEX IF NOT EXISTS imdb_stage_names_nconst_idx
+    ON imdb_stage_names (nconst);
+
+-- Weekly refresh: '0 4 * * 0' is Sunday 04:00 in standard 5-field cron (scheduler timezone not shown here — confirm). Disabled by default — enable via admin UI or UPDATE cron_jobs.
+INSERT INTO cron_jobs (name, schedule, queue, payload, enabled)
+VALUES ('imdb_dataset_import', '0 4 * * 0', 'imdb_dataset_import', '{}', false)
+ON CONFLICT (name) DO NOTHING;
diff --git a/backend/migrations/0012_imdb_scheduler_config.down.sql b/backend/migrations/0012_imdb_scheduler_config.down.sql
@@ -0,0 +1,3 @@
+UPDATE cron_jobs
+SET payload = '{}'::jsonb
+WHERE name = 'imdb_dataset_import'; -- unconditional: whatever payload this job currently holds is reset to '{}'
diff --git a/backend/migrations/0012_imdb_scheduler_config.up.sql b/backend/migrations/0012_imdb_scheduler_config.up.sql
@@ -0,0 +1,8 @@
+-- Store default IMDb import options in cron_jobs.payload for admin UI management; a payload that is no longer '{}' is left untouched.
+UPDATE cron_jobs
+SET payload = '{
+  "datasets": ["basics", "names", "ratings", "akas", "episode", "crew", "principals"],
+  "include_adult": false
+}'::jsonb
+WHERE name = 'imdb_dataset_import'
+  AND payload = '{}'::jsonb;
diff --git a/backend/migrations/0013_remove_internal_mediafusion_external_ids.down.sql b/backend/migrations/0013_remove_internal_mediafusion_external_ids.down.sql
@@ -0,0 +1 @@
+-- Irreversible cleanup: deleted internal mediafusion external_id rows are not restored.
diff --git a/backend/migrations/0013_remove_internal_mediafusion_external_ids.up.sql b/backend/migrations/0013_remove_internal_mediafusion_external_ids.up.sql
@@ -0,0 +1,17 @@
+-- Drop internal MediaFusion identifiers from media_external_id.
+--
+-- media.id is the sole internal primary key. Stremio-facing aliases like mf:123
+-- are computed at read time and must not be persisted as external provider IDs.
+--
+-- Preserves real provider rows (imdb, tmdb, tvdb, kitsu, …) and user-scoped
+-- identifiers (mf:user:…) used for IPTV/M3U imports.
+-- NOTE(review): confirm mf:user:… rows never use provider = 'mediafusion'; the next statement would delete them.
+DELETE FROM media_external_id
+WHERE provider = 'mediafusion';
+-- In LIKE a bare "_" matches any single character, so every literal underscore below is escaped.
+DELETE FROM media_external_id
+WHERE external_id LIKE 'mfm\_%' ESCAPE '\'
+   OR external_id LIKE 'mfs\_%' ESCAPE '\'
+   OR external_id LIKE 'mf\_tv\_%' ESCAPE '\'
+   OR external_id ~ '^mf:[0-9]+$'
+   OR external_id ~ '^mf[0-9]+$';
diff --git a/backend/src/bin/worker.rs b/backend/src/bin/worker.rs
@@ -12,6 +12,7 @@ use mediafusion_api::{
             cleanup::Cleanup,
             discover_prewarm::DiscoverPrewarm,
             dmm_hashlist::DmmHashlistScraper,
+            imdb_dataset_import::ImdbDatasetImport,
             integration_syncs::IntegrationSyncs,
             jackett_feed::JackettFeedScraper,
             m3u_import::M3uImport,
@@ -182,6 +183,7 @@ async fn main() {
     reg.register(Arc::new(UpdateSeeders));
     reg.register(Arc::new(UpdateTvPosters));
     reg.register(Arc::new(DiscoverPrewarm));
+    reg.register(Arc::new(ImdbDatasetImport));
     reg.register(Arc::new(Cleanup));
     reg.register(Arc::new(IntegrationSyncs));
     reg.register(Arc::new(M3uImport));

diff --git a/backend/src/config.rs b/backend/src/config.rs
@@ -57,6 +57,19 @@ pub struct AppConfig {
     pub is_scrap_from_mediafusion: bool,
     pub is_scrap_from_dmm_hashlist: bool,
     pub disable_dmm_hashlist_scraper: bool,
+    /// GitHub repo owner for DMM hashlist ingestion.
+    pub dmm_hashlist_repo_owner: String,
+    /// GitHub repo name for DMM hashlist ingestion.
+    pub dmm_hashlist_repo_name: String,
+    /// Git branch to read DMM hashlist commits from.
+    pub dmm_hashlist_branch: String,
+    /// Max new commits to process per incremental DMM hashlist run.
+    pub dmm_hashlist_commits_per_run: usize,
+    /// Max backfill commits to walk per DMM hashlist run.
+    pub dmm_hashlist_backfill_commits_per_run: usize,
+    /// Optional GitHub token for DMM hashlist fetches (`DMM_HASHLIST_GITHUB_TOKEN`, else `GITHUB_TOKEN`).
+    /// Unauthenticated requests still work against the public GitHub API with lower rate limits.
+    pub dmm_hashlist_github_token: Option<String>,
     pub is_scrap_from_public_indexers: bool,
     pub is_scrap_from_public_usenet_indexers: bool,
     pub is_scrap_from_jackett: bool,
@@ -233,6 +246,12 @@ pub struct AppConfig {
     pub tvdb_api_key: Option<String>,
     /// When false, do not call v3-cinemeta.strem.io (mirrors Python `imdb_cinemeta_fallback_enabled`).
     pub imdb_cinemeta_fallback_enabled: bool,
+    /// Base URL for IMDb non-commercial dataset files.
+    pub imdb_datasets_base_url: String,
+    /// Include adult titles when importing IMDb basics (default: false).
+    pub imdb_import_include_adult: bool,
+    /// Optional allowlist of dataset keys to import (empty = all).
+    pub imdb_import_datasets: Vec<String>,
     /// Primary metadata source for scrapers (`imdb` or `tmdb`, Python `metadata_primary_source`).
     pub metadata_primary_source: String,
     /// Ordered anime provider chain for search/fetch (`kitsu`, `anilist`).
@@ -425,6 +444,24 @@ impl AppConfig {
                 .ok().and_then(|v| v.parse().ok()).unwrap_or(false),
             disable_dmm_hashlist_scraper: env("DISABLE_DMM_HASHLIST_SCRAPER")
                 .ok().and_then(|v| v.parse().ok()).unwrap_or(false),
+            dmm_hashlist_repo_owner: env("DMM_HASHLIST_REPO_OWNER")
+                .unwrap_or_else(|_| "debridmediamanager".into()),
+            dmm_hashlist_repo_name: env("DMM_HASHLIST_REPO_NAME")
+                .unwrap_or_else(|_| "hashlists".into()),
+            dmm_hashlist_branch: env("DMM_HASHLIST_BRANCH")
+                .unwrap_or_else(|_| "main".into()),
+            dmm_hashlist_commits_per_run: env("DMM_HASHLIST_COMMITS_PER_RUN")
+                .ok()
+                .and_then(|v| v.parse().ok())
+                .unwrap_or(20),
+            dmm_hashlist_backfill_commits_per_run: env("DMM_HASHLIST_BACKFILL_COMMITS_PER_RUN")
+                .ok()
+                .and_then(|v| v.parse().ok())
+                .unwrap_or(20),
+            dmm_hashlist_github_token: env("DMM_HASHLIST_GITHUB_TOKEN")
+                .ok()
+                .filter(|s| !s.is_empty())
+                .or_else(|| env("GITHUB_TOKEN").ok().filter(|s| !s.is_empty())),
             is_scrap_from_public_indexers: env("IS_SCRAP_FROM_PUBLIC_INDEXERS")
                 .ok().and_then(|v| v.parse().ok()).unwrap_or(true),
             is_scrap_from_public_usenet_indexers: env("IS_SCRAP_FROM_PUBLIC_USENET_INDEXERS")
@@ -650,6 +687,24 @@ impl AppConfig {
                 .ok()
                 .and_then(|v| v.parse().ok())
                 .unwrap_or(true),
+            imdb_datasets_base_url: env("IMDB_DATASETS_BASE_URL")
+                .unwrap_or_else(|_| "https://datasets.imdbws.com".into())
+                .trim_end_matches('/')
+                .to_string(),
+            imdb_import_include_adult: env("IMDB_IMPORT_INCLUDE_ADULT")
+                .ok()
+                .and_then(|v| v.parse().ok())
+                .unwrap_or(false),
+            imdb_import_datasets: env("IMDB_IMPORT_DATASETS")
+                .ok()
+                .filter(|s| !s.is_empty())
+                .map(|s| {
+                    s.split(',')
+                        .map(|p| p.trim().to_ascii_lowercase())
+                        .filter(|p| !p.is_empty())
+                        .collect()
+                })
+                .unwrap_or_default(),
             metadata_primary_source: env("METADATA_PRIMARY_SOURCE")
                 .unwrap_or_else(|_| "imdb".into())
                 .to_lowercase(),

diff --git a/backend/src/db/media.rs b/backend/src/db/media.rs
@@ -10,6 +10,7 @@ pub struct MediaCandidate {
     pub media_id: MediaId,
     pub title: String,
     pub year: Option<i32>,
+    pub end_year: Option<i32>,
     pub imdb_id: Option<String>,
     pub tmdb_id: Option<String>,
     pub tvdb_id: Option<String>,
@@ -33,6 +34,7 @@ pub async fn search_media_candidates(
             m.id AS media_id,
             m.title,
             m.year,
+            EXTRACT(YEAR FROM m.end_date)::int AS end_year,
             MAX(CASE WHEN mei.provider = 'imdb' THEN mei.external_id END) AS imdb_id,
             MAX(CASE WHEN mei.provider = 'tmdb' THEN mei.external_id END) AS tmdb_id,
             MAX(CASE WHEN mei.provider = 'tvdb' THEN mei.external_id END) AS tvdb_id
@@ -41,9 +43,16 @@ pub async fn search_media_candidates(
                ON mei.media_id = m.id
               AND mei.provider IN ('imdb', 'tmdb', 'tvdb')
         WHERE m.type = $1
-          AND m.title_tsv @@ plainto_tsquery('simple', $2)
-        GROUP BY m.id, m.title, m.year
-        ORDER BY m.popularity DESC NULLS LAST
+          AND (
+              m.title_tsv @@ plainto_tsquery('simple', $2)
+              OR EXISTS (
+                  SELECT 1 FROM aka_title at
+                  WHERE at.media_id = m.id
+                    AND at.title_tsv @@ plainto_tsquery('simple', $2)
+              )
+          )
+        GROUP BY m.id, m.title, m.year, m.end_date
+        ORDER BY m.popularity DESC NULLS LAST, m.id
         LIMIT 12
         "#,
     )
@@ -57,6 +66,18 @@ pub async fn search_media_candidates(
     })
 }
 
+/// Returns every `aka_title.title` row stored for `media_id` (alternate titles, e.g. IMDb AKAs, used for metadata matching).
+pub async fn load_aka_titles(pool: &PgPool, media_id: MediaId) -> Vec<String> {
+    let fetched = sqlx::query_scalar("SELECT title FROM aka_title WHERE media_id = $1")
+        .bind(media_id)
+        .fetch_all(pool)
+        .await;
+    fetched.unwrap_or_else(|e| {
+        warn!("load_aka_titles media_id={media_id}: {e}"); // best-effort: a failed query is logged and treated as "no AKAs"
+        Vec::new()
+    })
+}
+
 /// Fetch title, year, and imdb_id for a known media_id.
 /// Returns None if the row doesn't exist.
 pub async fn get_media_meta(

diff --git a/backend/src/db/mod.rs b/backend/src/db/mod.rs
@@ -40,7 +40,8 @@ pub mod torznab;
 pub mod watch_history;
 
 pub use media::{
-    get_media_id_by_external_id, get_media_meta, resolve_media_ids, search_media_candidates,
+    get_media_id_by_external_id, get_media_meta, load_aka_titles, resolve_media_ids,
+    search_media_candidates,
 };
 pub use streams::{
     fetch_acestream_streams_bulk, fetch_http_streams_bulk, fetch_stream_playback_info,

diff --git a/backend/src/jobs/handlers/background_search.rs b/backend/src/jobs/handlers/background_search.rs
@@ -7,6 +7,7 @@ use serde::Deserialize;
 use tracing::{debug, warn};
 
 use crate::{
+    db::MediaType,
     jobs::{
         error::JobError,
         handler::{JobCtx, JobHandler},
@@ -44,9 +45,10 @@ struct MediaRow {
 async fn lookup_movie(pool: &sqlx::PgPool, id: &str) -> Option<MediaRow> {
     let parsed_id: i32 = id.parse().ok()?;
     sqlx::query_as::<_, (i32, String, Option<i32>)>(
-        "SELECT id, title, year FROM media WHERE id = $1 AND type = 'movie'",
+        "SELECT id, title, year FROM media WHERE id = $1 AND type = $2",
     )
     .bind(parsed_id)
+    .bind(MediaType::Movie)
     .fetch_optional(pool)
     .await
     .ok()?
@@ -59,9 +61,10 @@ async fn lookup_movie(pool: &sqlx::PgPool, id: &str) -> Option<MediaRow> {
 
 async fn lookup_series_media(pool: &sqlx::PgPool, media_id: i32) -> Option<MediaRow> {
     sqlx::query_as::<_, (i32, String, Option<i32>)>(
-        "SELECT id, title, year FROM media WHERE id = $1 AND type = 'series'",
+        "SELECT id, title, year FROM media WHERE id = $1 AND type = $2",
     )
     .bind(media_id)
+    .bind(MediaType::Series)
     .fetch_optional(pool)
     .await
     .ok()?
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		-- Irreversible cleanup: deleted internal mediafusion external_id rows are not restored.