44import argparse
55import sys
66from pathlib import Path
7- from typing import Any , Dict , List , Set
7+ from typing import Any , Dict , List , Set , Optional
8+ from urllib .parse import urlparse
89
910import yaml
1011
1920 sys .exit (2 )
2021
2122
23+ # -------------------------
24+ # IO
25+ # -------------------------
26+
def load_yaml(path: Path) -> Dict[str, Any]:
    """Read the UTF-8 file at ``path`` and return what ``yaml.safe_load`` parses from it."""
    with path.open("r", encoding="utf-8") as handle:
        parsed = yaml.safe_load(handle)
    return parsed
@@ -30,19 +35,39 @@ def load_json(path: Path) -> Dict[str, Any]:
3035 return json .load (f )
3136
3237
33- def validate_relations_exist (papers : List [Dict [str , Any ]]) -> List [str ]:
34- errors : List [str ] = []
35- ids : Set [str ] = {p ["id" ] for p in papers }
38+ # -------------------------
39+ # Helpers
40+ # -------------------------
41+
def is_url(s: Any) -> bool:
    """Return True when ``s`` is an absolute http(s) URL that names a host.

    Args:
        s: Candidate value; anything that is not a non-blank string is rejected.

    Returns:
        True only if the parsed scheme is ``http`` or ``https`` and the
        authority part contains an actual hostname. Authorities that carry
        only a port or only user info (``http://:80``, ``https://user@``)
        are rejected, as are strings ``urlparse`` refuses to parse.
    """
    if not isinstance(s, str) or not s.strip():
        return False
    try:
        parsed = urlparse(s)
    except ValueError:
        # urlparse signals malformed input (e.g. an unbalanced IPv6 bracket)
        # with ValueError; treat that as "not a URL" rather than crashing.
        return False
    # ``hostname`` is None when the netloc has no host component, which a
    # plain truthiness test on ``netloc`` would not catch.
    return parsed.scheme in {"http", "https"} and bool(parsed.hostname)
50+
51+
def is_canonical_preprint(p: Dict[str, Any]) -> bool:
    """Tell whether the paper's ``publication_status`` is ``canonical_preprint``.

    The stored value is converted to a string, trimmed and lower-cased before
    the comparison; a missing key is treated as an empty status.
    """
    raw_status = p.get("publication_status", "")
    normalized = str(raw_status).strip().lower()
    return normalized == "canonical_preprint"
3654
37- for p in papers :
38- for rel in p .get ("relations" , []) or []:
39- tgt = rel .get ("target" )
40- if tgt not in ids :
41- errors .append (f"{ p ['id' ]} : relation target '{ tgt } ' does not exist" )
42- if tgt == p ["id" ]:
43- errors .append (f"{ p ['id' ]} : relation target cannot be self" )
44- return errors
4555
def resolve_relation_target(rel: Any) -> Optional[str]:
    """Extract the target paper id from a relation entry.

    The keys ``target_id``, ``paper_id``, ``target``, ``id`` and ``paper`` are
    tried in that order; the first one holding a non-blank string wins and is
    returned with surrounding whitespace removed. Anything that is not a dict,
    or a dict with no usable key, yields ``None``.
    """
    if not isinstance(rel, dict):
        return None

    lookup_order = ("target_id", "paper_id", "target", "id", "paper")
    candidates = (rel.get(key) for key in lookup_order)
    trimmed = (c.strip() for c in candidates if isinstance(c, str))
    # First non-empty trimmed value, or None when every key is absent/blank.
    return next((value for value in trimmed if value), None)
66+
67+
68+ # -------------------------
69+ # Integrity checks
70+ # -------------------------
4671
4772def validate_unique_ids (papers : List [Dict [str , Any ]]) -> List [str ]:
4873 errors : List [str ] = []
@@ -55,21 +80,91 @@ def validate_unique_ids(papers: List[Dict[str, Any]]) -> List[str]:
5580 return errors
5681
5782
58- def validate_arxiv_links (papers : List [Dict [str , Any ]]) -> List [str ]:
def validate_relations_exist(papers: List[Dict[str, Any]]) -> List[str]:
    """Report relations whose target is missing, unknown, or the paper itself.

    Every entry in a paper's ``relations`` is resolved with
    ``resolve_relation_target``. An unresolvable entry produces a
    "missing target" message; a resolved target produces a message when it is
    not the id of any paper in ``papers`` and/or when it equals the owning
    paper's id.

    Returns:
        The collected messages in encounter order (empty when all is well).
    """
    known_ids: Set[str] = set()
    for paper in papers:
        known_ids.add(paper["id"])

    problems: List[str] = []
    for paper in papers:
        own_id = paper["id"]
        for relation in paper.get("relations", []) or []:
            target = resolve_relation_target(relation)
            if target:
                if target not in known_ids:
                    problems.append(f"{own_id}: relation target '{target}' does not exist")
                if target == own_id:
                    problems.append(f"{own_id}: relation target cannot be self")
            else:
                problems.append(
                    f"{own_id}: relation missing target (expected target_id/paper_id/target)"
                )
    return problems
99+
100+
def validate_links(papers: List[Dict[str, Any]]) -> List[str]:
    """Validate each paper's ``links`` under the peer-reviewed-first rules.

    - If publication_status == canonical_preprint:
        * require links.arxiv to be https://arxiv.org/abs/...
        * require links.pdf to be https://arxiv.org/pdf/... .pdf

    - Otherwise:
        * do NOT require arXiv links
        * require at least one canonical (non-arXiv) URL among:
          doi, journal, proceedings, publisher, official, url, pdf
        * if links.pdf is provided, it must be a valid URL and must NOT be
          an arXiv PDF URL
        * if links.arxiv is provided, it must start with
          https://arxiv.org/abs/

    arXiv detection for the "Otherwise" rules ignores letter case, because
    host names are case-insensitive and ``https://ArXiv.org/...`` would
    otherwise slip through as a canonical link. A ``links`` value that is not
    a mapping is reported as an error instead of raising ``AttributeError``.

    NOTE(review): these rules are meant to mirror the site logic; confirm the
    site also treats arXiv hosts case-insensitively.

    Args:
        papers: Paper records; each must carry an ``id`` and may carry
            ``links`` and ``publication_status``.

    Returns:
        Human-readable error messages in encounter order (empty when valid).
    """
    errors: List[str] = []

    canonical_keys = ("doi", "journal", "proceedings", "publisher", "official", "url", "pdf")

    for p in papers:
        links = p.get("links") or {}
        if not isinstance(links, dict):
            errors.append(f"{p['id']}: links must be a mapping")
            continue

        arxiv = links.get("arxiv")
        pdf = links.get("pdf")

        if is_canonical_preprint(p):
            if not (isinstance(arxiv, str) and arxiv.startswith("https://arxiv.org/abs/")):
                errors.append(
                    f"{p['id']}: canonical_preprint requires links.arxiv starting with 'https://arxiv.org/abs/'"
                )
            if not (
                isinstance(pdf, str)
                and pdf.startswith("https://arxiv.org/pdf/")
                and pdf.endswith(".pdf")
            ):
                errors.append(
                    f"{p['id']}: canonical_preprint requires links.pdf as an arXiv PDF URL ending in .pdf"
                )
            continue

        # Non-preprint: require some canonical peer-reviewed link (non-arXiv).
        # is_url() only accepts strings, so .lower() is safe on accepted values.
        found_canonical = False
        for k in canonical_keys:
            v = links.get(k)
            if is_url(v) and "arxiv.org" not in v.lower():
                found_canonical = True
                break

        if not found_canonical:
            errors.append(
                f"{p['id']}: missing canonical peer-reviewed link (add one of: doi/journal/proceedings/publisher/official/url/pdf)"
            )

        # If pdf exists, make sure it's not an arXiv PDF for non-preprints.
        if isinstance(pdf, str) and pdf.strip():
            if "arxiv.org/pdf/" in pdf.lower():
                errors.append(f"{p['id']}: links.pdf must NOT be an arXiv PDF for peer-reviewed entries")
            elif not is_url(pdf):
                errors.append(f"{p['id']}: links.pdf must be a valid URL")

        # An arXiv link may be kept as provenance, but its format is still checked.
        if isinstance(arxiv, str) and arxiv.strip():
            if not arxiv.startswith("https://arxiv.org/abs/"):
                errors.append(f"{p['id']}: links.arxiv must start with 'https://arxiv.org/abs/' (if provided)")

    return errors
71162
72163
164+ # -------------------------
165+ # Main
166+ # -------------------------
167+
73168def main () -> int :
74169 ap = argparse .ArgumentParser ()
75170 ap .add_argument ("--data" , default = str (DEFAULT_DATA ))
@@ -98,7 +193,7 @@ def main() -> int:
98193 errors : List [str ] = []
99194 errors += validate_unique_ids (papers )
100195 errors += validate_relations_exist (papers )
101- errors += validate_arxiv_links (papers )
196+ errors += validate_links (papers )
102197
103198 if errors :
104199 print ("Dataset validation failed:" , file = sys .stderr )
0 commit comments