Skip to content

Commit 27e12fe

Browse files
committed
Improve year extraction logic in orcidLoader (some publications had the year stored in the citation field rather than in the year field).
1 parent 4b83186 commit 27e12fe

File tree

2 files changed

+45
-7
lines changed

2 files changed

+45
-7
lines changed

src/content/loaders/orcid.ts

Lines changed: 44 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ export function orcidLoader(options: { orcid: string }): Loader {
7676
authors: z.string(),
7777
firstAuthor: z.string(),
7878
date: z.string(),
79-
year: z.number(),
79+
year: z.number().optional(),
8080
journal: z.string(),
8181
europePmc: z.string().optional(),
8282
isPreprint: z.boolean(),
@@ -148,16 +148,45 @@ export function orcidLoader(options: { orcid: string }): Loader {
148148
}
149149

150150
const journalMatch = workXml.match(/<work:journal-title>(.*?)<\/work:journal-title>/);
151-
const journal = journalMatch?.[1] || 'Unknown Journal';
151+
let journal = journalMatch?.[1] || 'Unknown Journal';
152152

153153
const yearMatch = workXml.match(/<common:year>(.*?)<\/common:year>/);
154154
const monthMatch = workXml.match(/<common:month>(.*?)<\/common:month>/);
155155
const dayMatch = workXml.match(/<common:day>(.*?)<\/common:day>/);
156156

157-
const year = yearMatch?.[1] ? parseInt(yearMatch[1]) : new Date().getFullYear();
157+
// Fallback: try to extract year from citation if structured date is missing
158+
let year = yearMatch?.[1] ? parseInt(yearMatch[1]) : null;
159+
160+
if (!year) {
161+
const citationMatch = workXml.match(/<work:citation-value>(.*?)<\/work:citation-value>/s);
162+
if (citationMatch?.[1]) {
163+
const citation = citationMatch[1];
164+
// Try to extract 4-digit year from citation (look for years like 2012, 1999, etc.)
165+
const citationYearMatch = citation.match(/\b(19|20)\d{2}\b/);
166+
if (citationYearMatch?.[0]) {
167+
year = parseInt(citationYearMatch[0]);
168+
logger.info(`Extracted year ${year} from citation for: ${title}`);
169+
170+
// Also try to extract journal from citation if not found
171+
if (journal === 'Unknown Journal') {
172+
// Look for text within <i>...</i> or &lt;i&gt;...&lt;/i&gt;
173+
const journalFromCitationMatch = citation.match(/(?:<i>|&lt;i&gt;)(.*?)(?:<\/i>|&lt;\/i&gt;)/);
174+
if (journalFromCitationMatch?.[1]) {
175+
journal = journalFromCitationMatch[1];
176+
}
177+
}
178+
}
179+
}
180+
}
181+
182+
// If no year found, log warning but continue
183+
if (!year) {
184+
logger.warn(`No year found for publication: "${title}"`);
185+
}
186+
158187
const month = monthMatch?.[1]?.padStart(2, '0') || '01';
159188
const day = dayMatch?.[1]?.padStart(2, '0') || '01';
160-
const dateString = `${year}-${month}-${day}`;
189+
const dateString = year ? `${year}-${month}-${day}` : '';
161190

162191
const doiMatch = workXml.match(/<common:external-id-type>doi<\/common:external-id-type>[\s\S]*?<common:external-id-value>(.*?)<\/common:external-id-value>/);
163192
const pmidMatch = workXml.match(/<common:external-id-type>pmid<\/common:external-id-type>[\s\S]*?<common:external-id-value>(.*?)<\/common:external-id-value>/);
@@ -213,8 +242,17 @@ export function orcidLoader(options: { orcid: string }): Loader {
213242
logger.info(`Successfully processed ${allPublications.length} publications from ORCID`);
214243

215244
allPublications.sort((a, b) => {
216-
if (a.year !== b.year) {
217-
return b.year - a.year;
245+
// Publications without year go to the end
246+
if (!a.year && !b.year) {
247+
return b.date.localeCompare(a.date);
248+
}
249+
if (!a.year) return 1;
250+
if (!b.year) return -1;
251+
252+
// Both years are defined at this point
253+
const yearDiff = b.year - a.year;
254+
if (yearDiff !== 0) {
255+
return yearDiff;
218256
}
219257
return b.date.localeCompare(a.date);
220258
});

src/types/types.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ export interface Publication {
5656
issue?: string;
5757
pages?: string;
5858
date: string;
59-
year: number;
59+
year?: number;
6060
isOpenAccess?: boolean;
6161
isPreprint?: boolean;
6262
isReview?: boolean;

0 commit comments

Comments
 (0)