Skip to content

Commit 2038b68

Browse files
committed
Update validate_dataset
1 parent 9e55b65 commit 2038b68

File tree

1 file changed

+117
-22
lines changed

1 file changed

+117
-22
lines changed

scripts/validate_dataset.py

Lines changed: 117 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
import argparse
55
import sys
66
from pathlib import Path
7-
from typing import Any, Dict, List, Set
7+
from typing import Any, Dict, List, Set, Optional
8+
from urllib.parse import urlparse
89

910
import yaml
1011

@@ -19,6 +20,10 @@
1920
sys.exit(2)
2021

2122

23+
# -------------------------
24+
# IO
25+
# -------------------------
26+
2227
def load_yaml(path: Path) -> Dict[str, Any]:
    """Read the file at *path* as UTF-8 text and return what ``yaml.safe_load`` parses.

    The parsed value is returned unchanged; no check is made here that the
    document's top level is actually a mapping.
    """
    with path.open(mode="r", encoding="utf-8") as stream:
        parsed = yaml.safe_load(stream)
    return parsed
@@ -30,19 +35,39 @@ def load_json(path: Path) -> Dict[str, Any]:
3035
return json.load(f)
3136

3237

33-
def validate_relations_exist(papers: List[Dict[str, Any]]) -> List[str]:
34-
errors: List[str] = []
35-
ids: Set[str] = {p["id"] for p in papers}
38+
# -------------------------
39+
# Helpers
40+
# -------------------------
41+
42+
def is_url(s: Any) -> bool:
    """Return True when *s* is a non-blank string that parses as an http(s) URL.

    A value qualifies only if it is a ``str``, is not empty/whitespace-only,
    has a scheme of ``http`` or ``https``, and has a non-empty network
    location (host). Anything else -- including non-string values and
    strings that ``urlparse`` rejects -- yields False.
    """
    if not isinstance(s, str) or not s.strip():
        return False
    try:
        parsed = urlparse(s)
    except ValueError:
        # urlparse signals malformed input (e.g. an unclosed IPv6 bracket in
        # the netloc) with ValueError; treat that as "not a URL" rather than
        # masking unrelated failures behind a blanket ``except Exception``.
        return False
    return parsed.scheme in {"http", "https"} and bool(parsed.netloc)
50+
51+
52+
def is_canonical_preprint(p: Dict[str, Any]) -> bool:
    """Tell whether paper *p* is flagged as a canonical preprint.

    The ``publication_status`` value (empty string when absent) is converted
    to ``str``, stripped of surrounding whitespace and lower-cased before
    being compared with ``"canonical_preprint"``.
    """
    raw_status = p.get("publication_status", "")
    normalized = str(raw_status).strip().lower()
    return normalized == "canonical_preprint"
3654

37-
for p in papers:
38-
for rel in p.get("relations", []) or []:
39-
tgt = rel.get("target")
40-
if tgt not in ids:
41-
errors.append(f"{p['id']}: relation target '{tgt}' does not exist")
42-
if tgt == p["id"]:
43-
errors.append(f"{p['id']}: relation target cannot be self")
44-
return errors
4555

56+
def resolve_relation_target(rel: Any) -> Optional[str]:
    """Extract the target paper id from a relation entry.

    Supports the dataset + app.js conventions by probing these keys in order:
    ``target_id``, ``paper_id``, ``target``, ``id``, ``paper``. The first key
    whose value is a non-blank string wins, and that string is returned with
    surrounding whitespace removed.

    Returns None when *rel* is not a dict or no key holds a usable string.
    """
    if not isinstance(rel, dict):
        return None

    candidate_keys = ("target_id", "paper_id", "target", "id", "paper")
    for value in (rel.get(key) for key in candidate_keys):
        if not isinstance(value, str):
            continue
        cleaned = value.strip()
        if cleaned:
            return cleaned
    return None
66+
67+
68+
# -------------------------
69+
# Integrity checks
70+
# -------------------------
4671

4772
def validate_unique_ids(papers: List[Dict[str, Any]]) -> List[str]:
4873
errors: List[str] = []
@@ -55,21 +80,91 @@ def validate_unique_ids(papers: List[Dict[str, Any]]) -> List[str]:
5580
return errors
5681

5782

58-
def validate_arxiv_links(papers: List[Dict[str, Any]]) -> List[str]:
83+
def validate_relations_exist(papers: List[Dict[str, Any]]) -> List[str]:
    """Check that every relation of every paper points at another known paper.

    For each entry under a paper's ``relations`` (a missing or falsy value is
    treated as no relations) the target id is obtained through
    ``resolve_relation_target``. An error string is collected when:

    - no target can be resolved from the relation,
    - the target is not the ``id`` of any paper in *papers*, or
    - the target equals the paper's own ``id``.

    Returns the collected error messages (empty when everything is valid).
    """
    known_ids: Set[str] = set()
    for paper in papers:
        known_ids.add(paper["id"])

    problems: List[str] = []
    for paper in papers:
        paper_id = paper["id"]
        relations = paper.get("relations", []) or []
        for relation in relations:
            target = resolve_relation_target(relation)
            if not target:
                problems.append(
                    f"{paper_id}: relation missing target (expected target_id/paper_id/target)"
                )
            else:
                if target not in known_ids:
                    problems.append(f"{paper_id}: relation target '{target}' does not exist")
                if target == paper_id:
                    problems.append(f"{paper_id}: relation target cannot be self")
    return problems
99+
100+
101+
def validate_links(papers: List[Dict[str, Any]]) -> List[str]:
    """Peer-reviewed-first rules (matches the site logic).

    - If publication_status == canonical_preprint:
        * require links.arxiv to be https://arxiv.org/abs/...
        * require links.pdf to be https://arxiv.org/pdf/... .pdf

    - Otherwise:
        * do NOT require arXiv links
        * require at least one canonical (non-arXiv) URL among:
            doi, journal, proceedings, publisher, official, url, pdf
        * if links.pdf is provided, it must NOT be an arXiv PDF URL
        * if links.arxiv is provided, it must start with
          https://arxiv.org/abs/

    arXiv detection for the "Otherwise" rules is case-insensitive, so a link
    such as ``https://ArXiv.org/pdf/...`` is still recognised as arXiv.
    A ``links`` value that is present but not a mapping is reported as an
    error for that paper instead of raising.

    Returns the list of error messages (empty when every paper passes).
    """
    errors: List[str] = []

    canonical_keys = ("doi", "journal", "proceedings", "publisher", "official", "url", "pdf")

    for p in papers:
        links = p.get("links", {}) or {}
        if not isinstance(links, dict):
            # A list/str here would make every .get() below raise
            # AttributeError; surface it as a validation error instead.
            errors.append(f"{p['id']}: links must be a mapping of link names to URLs")
            continue

        arxiv = links.get("arxiv")
        pdf = links.get("pdf")

        if is_canonical_preprint(p):
            if not (isinstance(arxiv, str) and arxiv.startswith("https://arxiv.org/abs/")):
                errors.append(
                    f"{p['id']}: canonical_preprint requires links.arxiv starting with 'https://arxiv.org/abs/'"
                )
            if not (isinstance(pdf, str) and pdf.startswith("https://arxiv.org/pdf/") and pdf.endswith(".pdf")):
                errors.append(f"{p['id']}: canonical_preprint requires links.pdf as an arXiv PDF URL ending in .pdf")
            continue

        # Non-preprint: require some canonical peer-reviewed link (non-arXiv)
        found_canonical = False
        for k in canonical_keys:
            v = links.get(k)
            if not is_url(v):
                continue
            # Host names are case-insensitive, so compare on a lowered copy.
            if isinstance(v, str) and "arxiv.org" in v.lower():
                continue
            found_canonical = True
            break

        if not found_canonical:
            errors.append(
                f"{p['id']}: missing canonical peer-reviewed link (add one of: doi/journal/proceedings/publisher/official/url/pdf)"
            )

        # If pdf exists, make sure it's not an arXiv PDF for non-preprints
        if isinstance(pdf, str) and pdf.strip():
            if "arxiv.org/pdf/" in pdf.lower():
                errors.append(f"{p['id']}: links.pdf must NOT be an arXiv PDF for peer-reviewed entries")
            elif not is_url(pdf):
                errors.append(f"{p['id']}: links.pdf must be a valid URL")

        # If arxiv exists, it can be kept as provenance, but validate format if provided
        if isinstance(arxiv, str) and arxiv.strip():
            if not arxiv.startswith("https://arxiv.org/abs/"):
                errors.append(f"{p['id']}: links.arxiv must start with 'https://arxiv.org/abs/' (if provided)")

    return errors
71162

72163

164+
# -------------------------
165+
# Main
166+
# -------------------------
167+
73168
def main() -> int:
74169
ap = argparse.ArgumentParser()
75170
ap.add_argument("--data", default=str(DEFAULT_DATA))
@@ -98,7 +193,7 @@ def main() -> int:
98193
errors: List[str] = []
99194
errors += validate_unique_ids(papers)
100195
errors += validate_relations_exist(papers)
101-
errors += validate_arxiv_links(papers)
196+
errors += validate_links(papers)
102197

103198
if errors:
104199
print("Dataset validation failed:", file=sys.stderr)

0 commit comments

Comments
 (0)