Skip to content

Commit cb12ea3

Browse files
authored
Merge pull request #29 from NLeSC/load-csv
Add `load_csv` function
2 parents afda094 + c669fab commit cb12ea3

File tree

5 files changed

+393
-19
lines changed

5 files changed

+393
-19
lines changed

docs/api/sources.rst

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -17,31 +17,36 @@ CrossRef
1717
.. automodule:: litstudy
1818
:members: fetch_crossref, refine_crossref, search_crossref
1919

20-
bibtex
20+
CSV
2121
---------------
2222
.. automodule:: litstudy
23-
:members: load_bibtex
23+
:members: load_csv
2424

25-
RIS
25+
IEEE Xplore
2626
---------------
2727
.. automodule:: litstudy
28-
:members: load_ris_file
28+
:members: load_ieee_csv
2929

30+
Springer Link
31+
---------------
32+
.. automodule:: litstudy
33+
:members: load_springer_csv
3034

31-
dblp
35+
bibtex
3236
---------------
3337
.. automodule:: litstudy
34-
:members: search_dblp
38+
:members: load_bibtex
3539

36-
IEEE Xplore
40+
RIS
3741
---------------
3842
.. automodule:: litstudy
39-
:members: load_ieee_csv
43+
:members: load_ris_file
4044

41-
Springer Link
45+
46+
dblp
4247
---------------
4348
.. automodule:: litstudy
44-
:members: load_springer_csv
49+
:members: search_dblp
4550

4651
arXiv
4752
---------------

litstudy/sources/__init__.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,21 +8,23 @@
88
from .dblp import search_dblp
99
from .ris import load_ris_file
1010
from .arxiv import search_arxiv
11+
from .csv import load_csv
1112

1213
__all__ = [
13-
'refine_crossref',
1414
'fetch_crossref',
15-
'search_crossref',
16-
'refine_semanticscholar',
17-
'search_semanticscholar',
18-
'fetch_semanticscholar',
19-
'refine_scopus',
20-
'search_scopus',
2115
'fetch_scopus',
16+
'fetch_semanticscholar',
2217
'load_bibtex',
18+
'load_csv',
2319
'load_ieee_csv',
24-
'load_springer_csv',
2520
'load_ris_file',
21+
'load_springer_csv',
22+
'refine_crossref',
23+
'refine_scopus',
24+
'refine_semanticscholar',
25+
'search_arxiv',
26+
'search_crossref',
2627
'search_dblp',
27-
'search_arxiv'
28+
'search_scopus',
29+
'search_semanticscholar',
2830
]

litstudy/sources/csv.py

Lines changed: 317 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,317 @@
1+
import csv
2+
import datetime
3+
4+
from ..types import Author, Document, DocumentSet, DocumentIdentifier
5+
from ..common import robust_open, fuzzy_match
6+
7+
8+
class CsvAuthor(Author):
    """Author backed by a plain name string taken from a CSV record."""

    def __init__(self, name):
        self._author_name = name

    @property
    def name(self):
        """The author's name exactly as it appeared in the CSV file."""
        return self._author_name
15+
16+
17+
class CsvDocument(Document):
    """A ``Document`` backed by a single CSV record.

    ``record`` is a mapping of column name to cell value (as produced by
    ``csv.DictReader``). ``fields`` maps logical field names such as
    "title" or "doi" onto the actual column names of the CSV file, or
    ``None`` when no matching column was found. Both are typically
    supplied by ``load_csv``.
    """

    def __init__(self, record, fields):
        self.fields = fields
        self.record = record

        id = DocumentIdentifier(
            self.title,
            doi=self._field("doi"),
            pubmed=self._field("pubmed")
        )
        super().__init__(id)

    def __getitem__(self, key):
        return self.record[key]

    def __setitem__(self, key, value):
        self.record[key] = value

    def __iter__(self):
        return iter(self.record)

    def _field(self, field_name):
        """Return the raw value for logical field ``field_name``, or
        ``None`` if no column was mapped or the cell is empty."""
        key = self.fields[field_name]
        return self.record.get(key) or None

    @property
    def title(self):
        return self._field("title")

    @property
    def abstract(self):
        return self._field("abstract")

    @property
    def publication_source(self):
        return self._field("source")

    @property
    def language(self):
        return self._field("language")

    @property
    def publisher(self):
        return self._field("publisher")

    @property
    def citation_count(self):
        # Best effort: the cell may be missing or not a valid integer.
        try:
            return int(self._field("citation"))
        except Exception:
            return None

    @property
    def keywords(self):
        text = self._field("keywords")
        if not text:
            return None

        # Try to split on something
        for delim in ";|\t, ":
            if delim in text:
                return [t.strip() for t in text.split(delim)]

        return [text]

    @property
    def publication_date(self):
        """Parse the date field on a best-effort basis; returns a
        ``datetime.date`` or ``None`` when the text cannot be parsed."""
        text = self._field("date")
        if not text:
            return None

        # Is it a year?
        try:
            year = int(text)
            if year > 1500 and year < 2500:
                return datetime.date(year, 1, 1)
            else:
                return None
        except Exception:
            pass

        # Is it an iso date?
        try:
            return datetime.date.fromisoformat(text)
        except Exception:
            pass

        # Is it one of these formats?
        formats = [
            "%c",
            "%x",
            "%d/%m/%y",
            "%d/%m/%Y",
            "%m/%d/%y",
            "%m/%d/%Y",
            "%d.%m.%y",
            "%d.%m.%Y",
            "%Y-%m-%d",
            "%y-%m-%d",
            "%Y-%d",
            "%d-%Y",
            "%y-%d",
        ]

        for fmt in formats:
            try:
                # BUG FIX: ``datetime`` here is the module, so ``strptime``
                # lives on ``datetime.datetime``. The original call raised
                # AttributeError (silently swallowed by the except), so none
                # of these formats were ever tried. Also convert to ``date``
                # so the return type matches the branches above.
                return datetime.datetime.strptime(text, fmt).date()
            except Exception:
                pass

        # I give up, failed to parse date
        return None

    @property
    def publication_year(self):
        date = self.publication_date
        if not date:
            return None

        return date.year

    @property
    def authors(self):
        text = self._field("authors")
        if not text:
            return None

        # Split on the first delimiter that actually occurs in the text.
        for delim in [";", "|", " and ", ","]:
            if delim in text:
                names = text.split(delim)
                names = [name.strip() for name in names]
                names = [name for name in names if name]
                return [CsvAuthor(name) for name in names]

        # Just one author?
        return [CsvAuthor(text)]
153+
154+
155+
def find_field(columns, possible_names):
    """Return the first column whose name fuzzily matches one of
    ``possible_names``, optionally preceded by a generic prefix such as
    "document" or "article". Returns ``None`` when nothing matches.
    """
    prefixes = ["", "document", "article", "paper", "item", "publication"]

    # Try the candidate names in order of preference, then the columns in
    # file order, so the most specific name wins.
    for wanted in possible_names:
        for column in columns:
            if any(fuzzy_match(f"{p} {wanted}", column) for p in prefixes):
                return column

    return None
163+
164+
165+
def load_csv(
    path: str,
    dialect: "csv.Dialect" = None,
    title_field: str = None,
    authors_field: str = None,
    abstract_field: str = None,
    citation_field: str = None,
    date_field: str = None,
    source_field: str = None,
    filter=None,
) -> DocumentSet:
    """ Load an arbitrary CSV file and parse its contents as a ``DocumentSet``
    on a best effort basis.

    An attempt is made to guess the purpose of the fields of the CSV file
    based on their names. For example, the date of publication is likely
    given by a field named something like "Publication Date",
    "Year of Publication", or "Published Year". In case the field name
    cannot be determined, it is possible to explicitly set the purpose of
    field names by passing additional parameters. For example, ``date_field``
    explicitly sets the name of the date field.

    The CSV is parsed using the given ``dialect``. If no dialect is given, an
    attempt is made to guess the dialect based on the file's content.

    :param path: Name of CSV file.
    :param dialect: Used to read the CSV file.
    :param title_field: Field name for ``title``.
    :param authors_field: Field name for ``authors``.
    :param abstract_field: Field name for ``abstract``.
    :param citation_field: Field name for ``citation_count``.
    :param date_field: Field name for ``publication_date`` or
                       ``publication_year``.
    :param source_field: Field name for ``publication_source``.
    :param filter: Optional function applied to each loaded record. This
                   function can be used to, for example, add or delete
                   fields. Records for which it returns a falsy value are
                   skipped.

    Examples:
    ```
    docs = litstudy.load_csv("my_data.csv",
                             title_field="Document Title",
                             date_field="Pub Date")
    ```
    """
    with robust_open(path) as f:
        text = f.read()

        # If file is empty, exit now
        if not text:
            return DocumentSet([])

        # Guess CSV dialect
        if dialect is None:
            dialect = csv.Sniffer().sniff(text)

        # BUG FIX: rewind unconditionally. The entire file was consumed by
        # ``f.read()`` above, so without this the reader below would see an
        # empty stream whenever a dialect was passed in by the caller.
        f.seek(0)

        # Read the records.
        # BUG FIX: pass the (possibly sniffed) dialect to the reader; it was
        # previously sniffed but never used.
        records = []
        for record in csv.DictReader(f, dialect=dialect):
            if filter:
                record = filter(record)

            if record:
                records.append(record)

        # No records, exit now
        if not records:
            return DocumentSet([])

        # Get the column names
        columns = list(records[0].keys())

        # Guess the field names
        fields = dict(
            title=title_field or find_field(columns, [
                "title",
            ]),
            authors=authors_field or find_field(columns, [
                "authors",
                "author(s)",
                "author",
                "names",
                "people",
                "person",
                "persons",
            ]),
            abstract=abstract_field or find_field(columns, [
                "abstract",
                "description",
                "content",
                "text",
                "short text",
                "body",
            ]),
            citation=citation_field or find_field(columns, [
                "citation count",
                "citations count",
                "number of citations",
                "number citations",
                "cited by",
                "citations",
                "cited",
            ]),
            date=date_field or find_field(columns, [
                "pub date",
                "datum",
                "date of publication",
                "published date",
                "publishing date",
                "pub year",
                "year of publication",
                "published year",
                "publishing year",
                "date",
                "year",
            ]),
            source=source_field or find_field(columns, [
                "source title",
                "source name",
                "source",
                "conference name",
            ]),
            pubmed=find_field(columns, [
                "pubmed",
                "pubmedid",
                "pubmed id",
            ]),
            doi=find_field(columns, [
                "doi",
                "object identifier",
                "object identification",
            ]),
            keywords=find_field(columns, [
                "keywords",
                "tags",
                "categories",
                "keys",
                "indices",
                "author keywords",
                "author tags",
            ]),
            publisher=find_field(columns, [
                "publisher",
                "publisher name",
            ]),
            language=find_field(columns, [
                "language",
                "lang",
                "original language",
            ]),
        )

        docs = [CsvDocument(record, fields) for record in records]

        return DocumentSet(docs)

0 commit comments

Comments
 (0)