# This list is human-curated. Citation counts are added separately.

papers:
+
  # ===== Foundations =====
  - id: ddpm_2020
    title: "Denoising Diffusion Probabilistic Models"
@@ -16,6 +17,12 @@ papers:
      scholar_url: "https://scholar.google.com/scholar?cluster=622631041436591387&hl=en&as_sdt=2005&sciodt=0,5"
      scholar_query: "\"Denoising Diffusion Probabilistic Models\" Ho Jain Abbeel"
    tags: ["foundations", "image"]
+    impact_type: foundational
+    why_it_matters: >
+      Introduced diffusion probabilistic models as a practical generative modeling framework,
+      showing that iterative denoising from Gaussian noise can achieve competitive image synthesis.
+      Established the core training objective and sampling procedure that form the basis of most
+      subsequent diffusion models.

  - id: ddim_2020
    title: "Denoising Diffusion Implicit Models"
@@ -29,6 +36,15 @@ papers:
      scholar_url: "https://scholar.google.com/scholar?cluster=15692403916484267912&hl=en&as_sdt=2005&sciodt=0,5"
      scholar_query: "\"Denoising Diffusion Implicit Models\""
    tags: ["foundations", "sampling"]
+    impact_type: enabling
+    relations:
+      - type: improves
+        target: ddpm_2020
+    why_it_matters: >
+      Demonstrated that diffusion models can be sampled using a non-Markovian process that
+      preserves sample quality while requiring far fewer steps. This significantly improved
+      inference efficiency without retraining and made diffusion models more practical in real
+      applications.

  - id: score_sde_2021
    title: "Score-Based Generative Modeling through Stochastic Differential Equations"
@@ -42,6 +58,17 @@ papers:
      scholar_url: "https://scholar.google.com/scholar?cluster=14592788616550656262&hl=en&as_sdt=2005&sciodt=0,5"
      scholar_query: "\"Score-Based Generative Modeling through Stochastic Differential Equations\""
    tags: ["foundations", "theory"]
+    impact_type: foundational
+    relations:
+      - type: unifies
+        target: score_matching_2019
+      - type: extends
+        target: ddpm_2020
+    why_it_matters: >
+      Unified score-based generative models and diffusion processes under a continuous-time
+      stochastic differential equation framework. Provided theoretical clarity and enabled new
+      sampling methods, connecting discrete diffusion models with broader probabilistic modeling
+      theory.

  - id: score_matching_2019
    title: "Generative Modeling by Estimating Gradients of the Data Distribution"
@@ -55,6 +82,14 @@ papers:
      scholar_url: "https://scholar.google.com/scholar?cluster=7819543055117584506&hl=en&as_sdt=2005&sciodt=0,5"
      scholar_query: "\"Generative Modeling by Estimating Gradients of the Data Distribution\""
    tags: ["foundations", "score"]
+    impact_type: foundational
+    relations:
+      - type: precedes
+        target: ddpm_2020
+    why_it_matters: >
+      Showed that generative models can be trained at scale by estimating gradients of the data
+      distribution via denoising score matching. This work laid the conceptual and
+      mathematical groundwork for later score-based diffusion models.

  # ===== Training & Objectives =====
  - id: improved_ddpm_2021
@@ -66,9 +101,17 @@ papers:
    arxiv: "https://arxiv.org/abs/2102.09672"
    pdf: "https://arxiv.org/pdf/2102.09672.pdf"
    scholar:
-      scholar_url: "https://scholar.google.com/scholar?cluster=2227179395488568184&hl=en&as_sdt=2005&sciodt=0,5"
+      scholar_url: "https://scholar.google.com/scholar?cluster=1314010070205781055&hl=en&as_sdt=2005&sciodt=0,5"
      scholar_query: "\"Improved Denoising Diffusion Probabilistic Models\""
    tags: ["training", "image"]
+    impact_type: refinement
+    relations:
+      - type: improves
+        target: ddpm_2020
+    why_it_matters: >
+      Improved diffusion training and sampling through better noise schedules,
+      parameterization, and loss weighting. These refinements significantly increased
+      sample quality and stability and became standard practice in later diffusion models.

  # ===== Guidance & Conditioning =====
  - id: classifier_guidance_2021
@@ -83,6 +126,12 @@ papers:
      scholar_url: "https://scholar.google.com/scholar?cluster=17982230494456470673&hl=en&as_sdt=2005&sciodt=0,5"
      scholar_query: "\"Diffusion Models Beat GANs on Image Synthesis\""
    tags: ["guidance", "image"]
+    impact_type: enabling
+    why_it_matters: >
+      Demonstrated state-of-the-art class-conditional image synthesis with diffusion models and
+      introduced classifier guidance as a practical way to trade off sample fidelity and diversity.
+      This helped establish diffusion as a competitive (and later dominant) paradigm over GANs
+      for high-fidelity image generation.

  - id: classifier_free_guidance_2022
    title: "Classifier-Free Diffusion Guidance"
@@ -96,6 +145,12 @@ papers:
      scholar_url: "https://scholar.google.com/scholar?cluster=9321084442049185729&hl=en&as_sdt=2005&sciodt=0,5"
      scholar_query: "\"Classifier-Free Diffusion Guidance\""
    tags: ["guidance", "conditioning"]
+    impact_type: enabling
+    why_it_matters: >
+      Proposed classifier-free guidance, enabling strong conditional generation without a
+      separately trained classifier by mixing conditional and unconditional predictions during
+      sampling. This became a default technique in text-to-image systems for boosting prompt
+      adherence with a single model.

  # ===== Latent & Scaling =====
  - id: latent_diffusion_2022
@@ -110,6 +165,12 @@ papers:
      scholar_url: "https://scholar.google.com/scholar?cluster=2427242760668866618&hl=en&as_sdt=2005&sciodt=0,5"
      scholar_query: "\"High-Resolution Image Synthesis with Latent Diffusion Models\""
    tags: ["latent", "image", "systems"]
+    impact_type: enabling
+    why_it_matters: >
+      Moved diffusion to a learned latent space to dramatically reduce computation and memory while
+      retaining high perceptual quality, enabling practical high-resolution generation on commodity
+      hardware. This design underpins many widely used text-to-image pipelines and made large-scale
+      diffusion deployment far more feasible.

  # ===== Text-to-Image =====
  - id: glide_2021
@@ -124,6 +185,12 @@ papers:
      scholar_url: "https://scholar.google.com/scholar?cluster=15472303808406531445&hl=en&as_sdt=2005&sciodt=0,5"
      scholar_query: "\"GLIDE\" text-guided diffusion models"
    tags: ["text-to-image", "editing"]
+    impact_type: enabling
+    why_it_matters: >
+      Showed that diffusion models can be effectively conditioned on text for both generation and
+      image editing, establishing core recipes for text-guided diffusion before the wave of
+      production text-to-image systems. Helped crystallize “prompted diffusion” as a
+      general-purpose approach to controllable generation.

  - id: imagen_2022
    title: "Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding"
@@ -137,6 +204,11 @@ papers:
      scholar_url: "https://scholar.google.com/scholar?cluster=2130901831690841916&hl=en&as_sdt=2005&sciodt=0,5"
      scholar_query: "\"Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding\""
    tags: ["text-to-image", "scaling"]
+    impact_type: enabling
+    why_it_matters: >
+      Demonstrated that strong text encoders and large-scale training substantially improve text
+      alignment and photorealism in diffusion-based text-to-image generation. Popularized the idea
+      that “language understanding” (not just image modeling) is a key lever for text-to-image quality.

  # ===== Acceleration =====
  - id: progressive_distillation_2022
@@ -151,3 +223,8 @@ papers:
      scholar_url: "https://scholar.google.com/scholar?cluster=5194434213555432016&hl=en&as_sdt=2005&sciodt=0,5"
      scholar_query: "\"Progressive Distillation for Fast Sampling of Diffusion Models\""
    tags: ["acceleration", "distillation"]
+    impact_type: enabling
+    why_it_matters: >
+      Introduced a practical distillation approach that repeatedly halves the number of sampling
+      steps while maintaining quality, making diffusion inference significantly faster. This work
+      is a cornerstone for later “few-step” diffusion approaches and production-oriented acceleration.