Skip to content

Commit 14d93bd

Browse files
committed
feat: self-citations
1 parent d05af6e commit 14d93bd

File tree

5 files changed

+179
-44
lines changed

5 files changed

+179
-44
lines changed

paperscraper/citations/entity/paper.py

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -84,15 +84,9 @@ def get_result(self) -> Optional[PaperResult]:
8484
Returns: PaperResult if available.
8585
"""
8686
if not hasattr(self, "self_ref"):
87-
logger.warning(
88-
f"Can't get result since no referencing result for {self.input} exists. Run `.self_references` first."
89-
)
90-
return
91-
elif not hasattr(self, "self_cite"):
92-
logger.warning(
93-
f"Can't get result since no citation result for {self.input} exists. Run `.self_citations` first."
94-
)
95-
return
87+
self.self_references()
88+
if not hasattr(self, "self_cite"):
89+
self.self_citations()
9690
return PaperResult(
9791
title=self.title,
9892
**{

paperscraper/citations/entity/researcher.py

Lines changed: 96 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,19 @@
11
import asyncio
22
import os
33
from typing import Any, List, Literal, Optional, Tuple
4-
from time import sleep
54

65
from semanticscholar import SemanticScholar
76

87
from ..orcid import orcid_to_author_name
9-
from ..self_citations import CitationResult
8+
from ..self_citations import CitationResult, self_citations_paper
109
from ..self_references import ReferenceResult, self_references_paper
1110
from ..utils import author_name_to_ssaid, get_papers_for_author
1211
from .core import Entity, EntityResult
1312

1413

1514
class ResearcherResult(EntityResult):
1615
name: str
17-
ssid: int
16+
ssaid: int
1817
orcid: Optional[str] = None
1918

2019
def _ordered_items(self) -> List[Tuple[str, Any]]:
@@ -27,7 +26,7 @@ def _ordered_items(self) -> List[Tuple[str, Any]]:
2726
("num_citations", self.num_citations),
2827
("self_references", self.self_references),
2928
("self_citations", self.self_citations),
30-
("ssid", self.ssid),
29+
("ssaid", self.ssaid),
3130
("orcid", self.orcid),
3231
]
3332

@@ -46,8 +45,9 @@ def __str__(self) -> str:
4645

4746
class Researcher(Entity):
4847
name: str
49-
ssid: int
48+
ssaid: int
5049
orcid: Optional[str] = None
50+
ssids: List[int] = []
5151

5252
def __init__(self, input: str, mode: ModeType = "infer"):
5353
"""
@@ -78,22 +78,31 @@ def __init__(self, input: str, mode: ModeType = "infer"):
7878
mode = "name"
7979
if mode == "ssaid":
8080
self.name = sch.get_author(input)._name
81-
self.ssid = input
81+
self.ssaid = input
8282
elif mode == "orcid":
8383
orcid_name = orcid_to_author_name(input)
8484
self.orcid = input
85-
self.ssid, self.name = author_name_to_ssaid(orcid_name)
85+
self.ssaid, self.name = author_name_to_ssaid(orcid_name)
8686
elif mode == "name":
87-
name = input
88-
self.ssid, self.name = author_name_to_ssaid(input)
87+
self.name = input
88+
self.ssaid, self.name = author_name_to_ssaid(input)
89+
90+
self.result = ResearcherResult(
91+
name=self.name,
92+
ssaid=int(self.ssaid),
93+
orcid=self.orcid,
94+
num_citations=-1,
95+
num_references=-1,
96+
)
8997

9098
async def _self_references_async(
9199
self, verbose: bool = False
92100
) -> List[ReferenceResult]:
93101
"""Async version of self_references."""
94-
if self.ssid == '-1':
102+
if self.ssaid == "-1":
95103
return []
96-
self.ssids = await get_papers_for_author(self.ssid)
104+
if self.ssids == []:
105+
self.ssids = await get_papers_for_author(self.ssaid)
97106

98107
results: List[ReferenceResult] = await self_references_paper(
99108
self.ssids, verbose=verbose
@@ -122,36 +131,91 @@ def self_references(self, verbose: bool = False) -> ResearcherResult:
122131
reference_results = asyncio.run(self._self_references_async(verbose=verbose))
123132

124133
individual_self_references = {
125-
getattr(result, "title"): getattr(result, "self_references").get(self.name, 0.0)
134+
getattr(result, "title"): getattr(result, "self_references").get(
135+
self.name, 0.0
136+
)
126137
for result in reference_results
127138
}
128-
reference_ratio = sum(individual_self_references.values()) / max(1, len(
129-
individual_self_references
130-
))
131-
return ResearcherResult(
132-
name=self.name,
133-
ssid=int(self.ssid),
134-
orcid=self.orcid,
135-
num_references=sum(r.num_references for r in reference_results),
136-
num_citations=-1,
137-
self_references=dict(
138-
sorted(
139-
individual_self_references.items(), key=lambda x: x[1], reverse=True
140-
)
141-
),
142-
self_citations={},
143-
self_reference_ratio=round(reference_ratio, 3),
144-
self_citation_ratio=-1.0,
139+
reference_ratio = sum(individual_self_references.values()) / max(
140+
1, len(individual_self_references)
145141
)
146142

147-
def self_citations(self) -> ResearcherResult:
143+
self.result = self.result.model_copy(
144+
update={
145+
"num_references": sum(r.num_references for r in reference_results),
146+
"self_references": dict(
147+
sorted(
148+
individual_self_references.items(),
149+
key=lambda x: x[1],
150+
reverse=True,
151+
)
152+
),
153+
"self_reference_ratio": round(reference_ratio, 3),
154+
}
155+
)
156+
157+
return self.result
158+
159+
async def _self_citations_async(
160+
self, verbose: bool = False
161+
) -> List[CitationResult]:
162+
"""Async version of self_citations."""
163+
if self.ssaid == "-1":
164+
return []
165+
if self.ssids == []:
166+
self.ssids = await get_papers_for_author(self.ssaid)
167+
168+
results: List[CitationResult] = await self_citations_paper(
169+
self.ssids, verbose=verbose
170+
)
171+
# Remove papers with zero citations or that are erratum/corrigendum
172+
results = [
173+
r
174+
for r in results
175+
if r.num_citations > 0
176+
and "erratum" not in r.title.lower()
177+
and "corrigendum" not in r.title.lower()
178+
]
179+
180+
return results
181+
182+
def self_citations(self, verbose: bool = False) -> ResearcherResult:
148183
"""
149184
Sifts through all papers of a researcher and finds how often they are self-cited.
150185
"""
151-
...
186+
citation_results = asyncio.run(self._self_citations_async(verbose=verbose))
187+
individual_self_citations = {
188+
getattr(result, "title"): getattr(result, "self_citations").get(
189+
self.name, 0.0
190+
)
191+
for result in citation_results
192+
}
193+
citation_ratio = sum(individual_self_citations.values()) / max(
194+
1, len(individual_self_citations)
195+
)
196+
197+
self.result = self.result.model_copy(
198+
update={
199+
"num_citations": sum(r.num_citations for r in citation_results),
200+
"self_citations": dict(
201+
sorted(
202+
individual_self_citations.items(),
203+
key=lambda x: x[1],
204+
reverse=True,
205+
)
206+
),
207+
"self_citation_ratio": round(citation_ratio, 3),
208+
}
209+
)
210+
211+
return self.result
152212

153213
def get_result(self) -> ResearcherResult:
154214
"""
155215
Provides the result of the analysis.
156216
"""
157-
...
217+
if not hasattr(self, "self_ref"):
218+
self.self_references()
219+
if not hasattr(self, "self_cite"):
220+
self.self_citations()
221+
return self.result

paperscraper/citations/self_citations.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ class CitationResult(BaseModel):
2424
citation_score: float
2525

2626

27+
@retry_with_exponential_backoff(max_retries=14, base_delay=1.0)
2728
async def _fetch_citation_data(
2829
client: httpx.AsyncClient, suffix: str
2930
) -> Dict[str, Any]:
@@ -120,7 +121,7 @@ async def self_citations_paper(
120121
if verbose:
121122
for res in results:
122123
logger.info(
123-
f'Self-citations in "{res.ssid}": N={res.num_citations}, Score={res.citation_score}%'
124+
f'Self-citations in "{res.title}": N={res.num_citations}, Score={res.citation_score}%'
124125
)
125126
for author, pct in res.self_citations.items():
126127
logger.info(f" {author}: {pct}%")

paperscraper/citations/tests/test_self_citations.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
import logging
22
import time
3+
from typing import Dict
34

45
import pytest
56

67
from paperscraper.citations import self_citations_paper
8+
from paperscraper.citations.entity import Researcher
79
from paperscraper.citations.self_citations import CitationResult
810

911
logging.disable(logging.INFO)
@@ -69,3 +71,77 @@ def test_multiple_dois(self, dois):
6971

7072
for a, s in zip(result, sync_result):
7173
assert a == s, f"{a} vs {s}"
74+
75+
def test_researcher(self):
76+
"""
77+
Tests calculation of self-citations for all papers of an author.
78+
"""
79+
ssaid = "2328976118"
80+
researcher = Researcher(ssaid)
81+
result = researcher.self_citations(verbose=True)
82+
assert result.ssaid == int(ssaid)
83+
assert isinstance(result.name, str)
84+
assert result.name == "Kacper Wyrwal"
85+
assert isinstance(result.num_references, int)
86+
assert result.num_references == -1
87+
assert isinstance(result.num_citations, int)
88+
assert result.num_citations > 0
89+
assert isinstance(result.self_citations, Dict)
90+
for title, ratio in result.self_citations.items():
91+
assert isinstance(title, str)
92+
assert isinstance(ratio, float)
93+
assert ratio >= 0 and ratio <= 100
94+
95+
assert result.self_citation_ratio >= 0 and result.self_citation_ratio <= 100
96+
print(result)
97+
98+
def test_researcher_from_orcid(self):
99+
"""
100+
Tests calculation of self-citations for all papers of an author identified by ORCID.
101+
"""
102+
orcid = "0000-0003-4221-6988"
103+
researcher = Researcher(orcid)
104+
result = researcher.self_citations(verbose=True)
105+
assert result.orcid == orcid
106+
assert isinstance(result.name, str)
107+
assert result.name == "Juan M. Galeazzi"
108+
assert isinstance(result.num_references, int)
109+
assert result.num_references == -1
110+
assert isinstance(result.num_citations, int)
111+
assert result.num_citations > 0
112+
assert isinstance(result.self_references, Dict)
113+
for title, ratio in result.self_citations.items():
114+
assert isinstance(title, str)
115+
assert isinstance(ratio, float)
116+
assert ratio >= 0 and ratio <= 100
117+
118+
assert result.self_citation_ratio >= 0 and result.self_citation_ratio <= 100
119+
print(result)
120+
121+
def test_whole_researcher(self):
122+
ssaid = "2104445902"
123+
researcher = Researcher(ssaid)
124+
result = researcher.get_result()
125+
assert result.ssaid == int(ssaid)
126+
assert isinstance(result.name, str)
127+
assert result.name == "Aleksandros Sobczyk"
128+
assert isinstance(result.num_references, int)
129+
assert result.num_references > 0
130+
assert isinstance(result.num_citations, int)
131+
assert result.num_citations > 0
132+
assert isinstance(result.self_citations, Dict)
133+
assert isinstance(result.self_references, Dict)
134+
assert len(result.self_citations) > 5
135+
assert len(result.self_references) >= 3
136+
for title, ratio in result.self_citations.items():
137+
assert isinstance(title, str)
138+
assert isinstance(ratio, float)
139+
assert ratio >= 0 and ratio <= 100
140+
for title, ratio in result.self_references.items():
141+
assert isinstance(title, str)
142+
assert isinstance(ratio, float)
143+
assert ratio >= 0 and ratio <= 100
144+
145+
assert result.self_citation_ratio >= 0 and result.self_citation_ratio <= 100
146+
assert result.self_reference_ratio >= 0 and result.self_reference_ratio <= 100
147+
print(result)

paperscraper/citations/tests/test_self_references.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ def test_researcher(self):
8484
ssaid = "2326988211"
8585
researcher = Researcher(ssaid)
8686
result = researcher.self_references(verbose=True)
87-
assert result.ssid == int(ssaid)
87+
assert result.ssaid == int(ssaid)
8888
assert isinstance(result.name, str)
8989
assert result.name == "Patrick Soga"
9090
assert isinstance(result.num_references, int)
@@ -121,4 +121,4 @@ def test_researcher_from_orcid(self):
121121
assert ratio >= 0 and ratio <= 100
122122

123123
assert result.self_reference_ratio >= 0 and result.self_reference_ratio <= 100
124-
print(result)
124+
print(result)

0 commit comments

Comments
 (0)