Skip to content

Commit 9b5b644

Browse files
committed
v0
1 parent ba7ef0b commit 9b5b644

21 files changed

+1372
-353
lines changed

.github/workflows/validate.yml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
name: Validate dataset
2+
3+
on:
4+
push:
5+
pull_request:
6+
7+
jobs:
8+
validate:
9+
runs-on: ubuntu-latest
10+
steps:
11+
- uses: actions/checkout@v4
12+
13+
- uses: actions/setup-python@v5
14+
with:
15+
python-version: "3.11"
16+
17+
- name: Install deps
18+
run: |
19+
python -m pip install --upgrade pip
20+
pip install -r requirements.txt
21+
22+
- name: Validate papers.yaml
23+
run: |
24+
python scripts/validate_dataset.py

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,3 +205,5 @@ cython_debug/
205205
marimo/_static/
206206
marimo/_lsp/
207207
__marimo__/
208+
209+
site/public/data/*.json

CITATION.cff

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
cff-version: 1.2.0
2+
message: "If you use Diffusion Lighthouse in your work, please cite it as below."
3+
title: "Diffusion Lighthouse"
4+
type: software
5+
license: MIT
6+
repository-code: "https://github.com/GKalliatakis/diffusion-lighthouse"
7+
abstract: >
8+
Diffusion Lighthouse is a living, citation-guided index of influential diffusion-model
9+
research papers. It combines careful human curation with best-effort citation snapshots
10+
to help researchers navigate foundational work, major methods, scaling trends, and applications.
11+
keywords:
12+
- diffusion models
13+
- generative modeling
14+
- score-based models
15+
- denoising diffusion
16+
- bibliography
17+
- research index
18+
- citations
19+
authors:
20+
- family-names: Kalliatakis
21+
given-names: Grigorios
22+
preferred-citation:
23+
type: software
24+
title: "Diffusion Lighthouse"
25+
authors:
26+
- family-names: Kalliatakis
27+
given-names: Grigorios
28+
year: 2025
29+
version: "0.1.0"
30+
url: "https://github.com/GKalliatakis/diffusion-lighthouse"

README.md

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,9 @@
2424
</p>
2525

2626

27-
**Diffusion Lighthouse** is a living, citation-guided index of the most influential research papers on **Diffusion Models**.
27+
**Diffusion Lighthouse** helps researchers, students, and practitioners navigate the fast-growing literature on **Diffusion Models** by highlighting **impactful work**, using **Google Scholar citation counts** combined with careful human curation.
2828

29-
It helps researchers, students, and practitioners navigate a fast-growing literature by highlighting **impactful work**, using **Google Scholar citation counts** combined with careful human curation.
30-
31-
> Like a lighthouse, this project does not chart every wave —
32-
> it helps you orient toward the most important signals.
29+
> Like a lighthouse, this project does not chart every wave — it helps you orient toward the most important signals.
3330
3431
---
3532

data/collections.yaml

Whitespace-only changes.

data/papers.yaml

Lines changed: 78 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
# This list is human-curated. Citation counts are added separately.
44

55
papers:
6+
67
# ===== Foundations =====
78
- id: ddpm_2020
89
title: "Denoising Diffusion Probabilistic Models"
@@ -16,6 +17,12 @@ papers:
1617
scholar_url: "https://scholar.google.com/scholar?cluster=622631041436591387&hl=en&as_sdt=2005&sciodt=0,5"
1718
scholar_query: "\"Denoising Diffusion Probabilistic Models\" Ho Jain Abbeel"
1819
tags: ["foundations", "image"]
20+
impact_type: foundational
21+
why_it_matters: >
22+
Introduced diffusion probabilistic models as a practical generative modeling framework,
23+
showing that iterative denoising from Gaussian noise can achieve competitive image synthesis.
24+
Established the core training objective and sampling procedure that form the basis of most
25+
subsequent diffusion models.
1926
2027
- id: ddim_2020
2128
title: "Denoising Diffusion Implicit Models"
@@ -29,6 +36,15 @@ papers:
2936
scholar_url: "https://scholar.google.com/scholar?cluster=15692403916484267912&hl=en&as_sdt=2005&sciodt=0,5"
3037
scholar_query: "\"Denoising Diffusion Implicit Models\""
3138
tags: ["foundations", "sampling"]
39+
impact_type: enabling
40+
relations:
41+
- type: improves
42+
target: ddpm_2020
43+
why_it_matters: >
44+
Demonstrated that diffusion models can be sampled using a non-Markovian process that
45+
preserves sample quality while requiring far fewer steps. This significantly improved
46+
inference efficiency without retraining and made diffusion models more practical in real
47+
applications.
3248
3349
- id: score_sde_2021
3450
title: "Score-Based Generative Modeling through Stochastic Differential Equations"
@@ -42,6 +58,17 @@ papers:
4258
scholar_url: "https://scholar.google.com/scholar?cluster=14592788616550656262&hl=en&as_sdt=2005&sciodt=0,5"
4359
scholar_query: "\"Score-Based Generative Modeling through Stochastic Differential Equations\""
4460
tags: ["foundations", "theory"]
61+
impact_type: foundational
62+
relations:
63+
- type: unifies
64+
target: score_matching_2019
65+
- type: extends
66+
target: ddpm_2020
67+
why_it_matters: >
68+
Unified score-based generative models and diffusion processes under a continuous-time
69+
stochastic differential equation framework. Provided theoretical clarity and enabled new
70+
sampling methods, connecting discrete diffusion models with broader probabilistic modeling
71+
theory.
4572
4673
- id: score_matching_2019
4774
title: "Generative Modeling by Estimating Gradients of the Data Distribution"
@@ -55,6 +82,14 @@ papers:
5582
scholar_url: "https://scholar.google.com/scholar?cluster=7819543055117584506&hl=en&as_sdt=2005&sciodt=0,5"
5683
scholar_query: "\"Generative Modeling by Estimating Gradients of the Data Distribution\""
5784
tags: ["foundations", "score"]
85+
impact_type: foundational
86+
relations:
87+
- type: precedes
88+
target: ddpm_2020
89+
why_it_matters: >
90+
Introduced score-based generative modeling with noise-conditional score networks, training models
91+
to estimate gradients of the data distribution across multiple noise levels and to sample via annealed Langevin dynamics. This work laid the conceptual and
92+
mathematical groundwork for later score-based diffusion models.
5893
5994
# ===== Training & Objectives =====
6095
- id: improved_ddpm_2021
@@ -66,9 +101,17 @@ papers:
66101
arxiv: "https://arxiv.org/abs/2102.09672"
67102
pdf: "https://arxiv.org/pdf/2102.09672.pdf"
68103
scholar:
69-
scholar_url: "https://scholar.google.com/scholar?cluster=2227179395488568184&hl=en&as_sdt=2005&sciodt=0,5"
104+
scholar_url: "https://scholar.google.com/scholar?cluster=1314010070205781055&hl=en&as_sdt=2005&sciodt=0,5"
70105
scholar_query: "\"Improved Denoising Diffusion Probabilistic Models\""
71106
tags: ["training", "image"]
107+
impact_type: refinement
108+
relations:
109+
- type: improves
110+
target: ddpm_2020
111+
why_it_matters: >
112+
Improved diffusion training and sampling through better noise schedules,
113+
parameterization, and loss weighting. These refinements significantly increased
114+
sample quality and stability and became standard practice in later diffusion models.
72115
73116
# ===== Guidance & Conditioning =====
74117
- id: classifier_guidance_2021
@@ -83,6 +126,12 @@ papers:
83126
scholar_url: "https://scholar.google.com/scholar?cluster=17982230494456470673&hl=en&as_sdt=2005&sciodt=0,5"
84127
scholar_query: "\"Diffusion Models Beat GANs on Image Synthesis\""
85128
tags: ["guidance", "image"]
129+
impact_type: enabling
130+
why_it_matters: >
131+
Demonstrated state-of-the-art class-conditional image synthesis with diffusion models and
132+
introduced classifier guidance as a practical way to trade off sample fidelity and diversity.
133+
This helped establish diffusion as a paradigm competitive with (and later dominant over) GANs
134+
for high-fidelity image generation.
86135
87136
- id: classifier_free_guidance_2022
88137
title: "Classifier-Free Diffusion Guidance"
@@ -96,6 +145,12 @@ papers:
96145
scholar_url: "https://scholar.google.com/scholar?cluster=9321084442049185729&hl=en&as_sdt=2005&sciodt=0,5"
97146
scholar_query: "\"Classifier-Free Diffusion Guidance\""
98147
tags: ["guidance", "conditioning"]
148+
impact_type: enabling
149+
why_it_matters: >
150+
Proposed classifier-free guidance, enabling strong conditional generation without training a
151+
separate classifier by mixing conditional and unconditional predictions during sampling.
152+
This became a default technique in text-to-image systems for boosting prompt adherence with
153+
a single model.
99154
100155
# ===== Latent & Scaling =====
101156
- id: latent_diffusion_2022
@@ -110,6 +165,12 @@ papers:
110165
scholar_url: "https://scholar.google.com/scholar?cluster=2427242760668866618&hl=en&as_sdt=2005&sciodt=0,5"
111166
scholar_query: "\"High-Resolution Image Synthesis with Latent Diffusion Models\""
112167
tags: ["latent", "image", "systems"]
168+
impact_type: enabling
169+
why_it_matters: >
170+
Moved diffusion to a learned latent space to dramatically reduce computation and memory while
171+
retaining high perceptual quality, enabling practical high-resolution generation on commodity
172+
hardware. This design underpins many widely used text-to-image pipelines and made large-scale
173+
diffusion deployment far more feasible.
113174
114175
# ===== Text-to-Image =====
115176
- id: glide_2021
@@ -124,6 +185,12 @@ papers:
124185
scholar_url: "https://scholar.google.com/scholar?cluster=15472303808406531445&hl=en&as_sdt=2005&sciodt=0,5"
125186
scholar_query: "\"GLIDE\" text-guided diffusion models"
126187
tags: ["text-to-image", "editing"]
188+
impact_type: enabling
189+
why_it_matters: >
190+
Showed that diffusion models can be effectively conditioned on text for both generation and
191+
image editing, establishing core recipes for text-guided diffusion before the big wave of
192+
production text-to-image systems. Helped crystallize “prompted diffusion” as a general-purpose
193+
controllable generation approach.
127194
128195
- id: imagen_2022
129196
title: "Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding"
@@ -137,6 +204,11 @@ papers:
137204
scholar_url: "https://scholar.google.com/scholar?cluster=2130901831690841916&hl=en&as_sdt=2005&sciodt=0,5"
138205
scholar_query: "\"Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding\""
139206
tags: ["text-to-image", "scaling"]
207+
impact_type: enabling
208+
why_it_matters: >
209+
Demonstrated that strong text encoders and large-scale training substantially improve text
210+
alignment and photorealism in diffusion-based text-to-image generation. Popularized the idea
211+
that “language understanding” (not just image modeling) is a key lever for text-to-image quality.
140212
141213
# ===== Acceleration =====
142214
- id: progressive_distillation_2022
@@ -151,3 +223,8 @@ papers:
151223
scholar_url: "https://scholar.google.com/scholar?cluster=5194434213555432016&hl=en&as_sdt=2005&sciodt=0,5"
152224
scholar_query: "\"Progressive Distillation for Fast Sampling of Diffusion Models\""
153225
tags: ["acceleration", "distillation"]
226+
impact_type: enabling
227+
why_it_matters: >
228+
Introduced a practical distillation approach that progressively reduces the number of sampling
229+
steps while maintaining quality, making diffusion inference significantly faster. This work is
230+
a cornerstone for later “few-step” diffusion approaches and production-oriented acceleration.

data/schema.json

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
{
2+
"$schema": "https://json-schema.org/draft/2020-12/schema",
3+
"$id": "https://diffusion-lighthouse.org/schema/papers.schema.json",
4+
"title": "Diffusion Lighthouse Papers Dataset",
5+
"type": "object",
6+
"required": ["papers"],
7+
"additionalProperties": false,
8+
"properties": {
9+
"papers": {
10+
"type": "array",
11+
"minItems": 1,
12+
"items": { "$ref": "#/$defs/paper" }
13+
}
14+
},
15+
"$defs": {
16+
"paper": {
17+
"type": "object",
18+
"required": [
19+
"id",
20+
"title",
21+
"year",
22+
"authors",
23+
"venue",
24+
"links",
25+
"tags",
26+
"impact_type",
27+
"why_it_matters"
28+
],
29+
"additionalProperties": false,
30+
"properties": {
31+
"id": {
32+
"type": "string",
33+
"pattern": "^[a-z0-9]+(?:_[a-z0-9]+)*$",
34+
"description": "Stable snake_case identifier (e.g., ddpm_2020)."
35+
},
36+
"title": { "type": "string", "minLength": 3 },
37+
"year": { "type": "integer", "minimum": 2010, "maximum": 2035 },
38+
"authors": {
39+
"type": "array",
40+
"minItems": 1,
41+
"items": { "type": "string", "minLength": 1 }
42+
},
43+
"venue": { "type": "string", "minLength": 2 },
44+
"links": {
45+
"type": "object",
46+
"required": ["arxiv", "pdf"],
47+
"additionalProperties": true,
48+
"properties": {
49+
"arxiv": { "type": "string", "format": "uri" },
50+
"pdf": { "type": "string", "format": "uri" },
51+
"code": { "type": "string", "format": "uri" },
52+
"project": { "type": "string", "format": "uri" }
53+
}
54+
},
55+
"scholar": {
56+
"type": "object",
57+
"required": ["scholar_url", "scholar_query"],
58+
"additionalProperties": false,
59+
"properties": {
60+
"scholar_url": { "type": "string", "format": "uri" },
61+
"scholar_query": { "type": "string", "minLength": 1 }
62+
}
63+
},
64+
"tags": {
65+
"type": "array",
66+
"minItems": 1,
67+
"items": { "type": "string", "pattern": "^[a-z0-9]+(?:-[a-z0-9]+)*$" }
68+
},
69+
"impact_type": {
70+
"type": "string",
71+
"enum": [
72+
"foundational",
73+
"enabling",
74+
"refinement",
75+
"survey",
76+
"application",
77+
"benchmark",
78+
"system"
79+
]
80+
},
81+
"relations": {
82+
"type": "array",
83+
"items": { "$ref": "#/$defs/relation" }
84+
},
85+
"why_it_matters": { "type": "string", "minLength": 20 }
86+
}
87+
},
88+
"relation": {
89+
"type": "object",
90+
"required": ["type", "target"],
91+
"additionalProperties": false,
92+
"properties": {
93+
"type": {
94+
"type": "string",
95+
"enum": ["precedes", "extends", "improves", "unifies", "uses", "inspired_by"]
96+
},
97+
"target": { "type": "string", "pattern": "^[a-z0-9]+(?:_[a-z0-9]+)*$" }
98+
}
99+
}
100+
}
101+
}

data/tags.yaml

Whitespace-only changes.

data/venues.yaml

Whitespace-only changes.

requirements.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
PyYAML==6.0.2
22
requests==2.32.3
33
beautifulsoup4==4.12.3
4-
lxml==5.3.0
4+
lxml==5.3.0
5+
pyyaml>=6.0.0
6+
jsonschema>=4.20.0

0 commit comments

Comments
 (0)