Skip to content

Commit 4fcb617

Browse files
authored
Merge pull request #101 from MITLibraries/TIMX-232-springshare-ids
Timx 232 springshare ids
2 parents 25649ec + 86c4ea2 commit 4fcb617

File tree

5 files changed

+101
-52
lines changed

5 files changed

+101
-52
lines changed

Pipfile.lock

+29-29
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/test_springshare.py

+4-4
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
LIBGUIDES_BLANK_OR_MISSING_OPTIONAL_FIELDS_TIMDEX = timdex.TimdexRecord(
1313
source="LibGuides",
1414
source_link="https://libguides.mit.edu/materials",
15-
timdex_record_id="libguides:materials",
15+
timdex_record_id="libguides:guides-175846",
1616
title="Materials Science & Engineering",
1717
citation="Materials Science & Engineering. libguides. "
1818
"https://libguides.mit.edu/materials",
@@ -33,7 +33,7 @@
3333
RESEARCHDATABASES_BLANK_OR_MISSING_OPTIONAL_FIELDS_TIMDEX = timdex.TimdexRecord(
3434
source="Research Databases",
3535
source_link="https://libguides.mit.edu/llba",
36-
timdex_record_id="researchdatabases:llba",
36+
timdex_record_id="researchdatabases:az-65257807",
3737
title="Linguistics and Language Behavior Abstracts (LLBA)",
3838
citation="Linguistics and Language Behavior Abstracts (LLBA). researchdatabases. "
3939
"https://libguides.mit.edu/llba",
@@ -94,7 +94,7 @@ def test_libguide_transform_with_all_fields_transforms_correctly():
9494
assert next(output_records) == timdex.TimdexRecord(
9595
source="LibGuides",
9696
source_link="https://libguides.mit.edu/materials",
97-
timdex_record_id="libguides:materials",
97+
timdex_record_id="libguides:guides-175846",
9898
title="Materials Science & Engineering",
9999
citation="Ye Li. Materials Science & Engineering. MIT Libraries. libguides. "
100100
"https://libguides.mit.edu/materials",
@@ -154,7 +154,7 @@ def test_research_databases_transform_with_all_fields_transforms_correctly():
154154
assert next(output_records) == timdex.TimdexRecord(
155155
source="Research Databases",
156156
source_link="https://libguides.mit.edu/llba",
157-
timdex_record_id="researchdatabases:llba",
157+
timdex_record_id="researchdatabases:az-65257807",
158158
title="Linguistics and Language Behavior Abstracts (LLBA)",
159159
citation="Linguistics and Language Behavior Abstracts (LLBA). "
160160
"researchdatabases. https://libguides.mit.edu/llba",

transmogrifier/sources/ead.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -445,8 +445,8 @@ def parse_mixed_value(
445445
"""
446446
if skipped_elements is None:
447447
skipped_elements = []
448-
if type(item) == NavigableString and item.strip():
448+
if isinstance(item, NavigableString) and item.strip():
449449
yield str(item.strip())
450-
elif type(item) == Tag and item.name not in skipped_elements:
450+
elif isinstance(item, Tag) and item.name not in skipped_elements:
451451
for child in item.children:
452452
yield from cls.parse_mixed_value(child, skipped_elements)

transmogrifier/sources/springshare.py

+20-14
Original file line number | Diff line number | Diff line change
@@ -80,25 +80,31 @@ def get_links(self, source_record_id: str, xml: Tag) -> Optional[List[timdex.Lin
8080
]
8181

8282
@classmethod
83-
def get_source_record_id(cls, xml: Tag) -> str:
83+
def get_source_link(
84+
cls, source_base_url: str, source_record_id: str, xml: Tag
85+
) -> str:
8486
"""
85-
Get the source record ID from a Springshare OAI DC XML record.
87+
Override for default source_link behavior.
8688
87-
Overrides metaclass get_source_record_id() method.
89+
Springshare resources contain the source link in their dc:identifier fields.
90+
However, this cannot be reliably split and combined with the source base url,
91+
as this either provides poorly formed timdex record ids OR source links that
92+
do not work.
8893
89-
The URL path of the Springshare resource is used as the source record id, which
90-
results in a timdex record id like "libguides:materials" or
91-
"researchdatabases:llba". This is preferred over the OAI-PMH identifier, a
92-
numeric value, which cannot be used to construct an accessible source link.
94+
Example libguides OAI identifier and <dc:identifier>:
95+
- oai:libguides.com:guides/175846, https://libguides.mit.edu/materials
96+
- oai:libguides.com:guides/175847, https://libguides.mit.edu/c.php?g=175847
9397
94-
Libguides example:
95-
"https://libguides.mit.edu/materials" -> "materials"
98+
Example researchdatabases OAI identifier and <dc:identifier>:
99+
- oai:libguides.com:az/65257807, https://libguides.mit.edu/llba
96100
97-
AZ (Research Database) example:
98-
"https://libguides.mit.edu/llba" -> "llba"
101+
It is preferable to split the OAI header identifier and use this as the TIMDEX
102+
record id, but then take the dc:identifier wholesale and use this for the source
103+
link.
99104
100105
Args:
101-
xml: A BeautifulSoup Tag representing a single Springshare OAI DC XML record.
106+
source_base_url: Source base URL.
107+
source_record_id: Record identifier for the source record.
108+
xml: A BeautifulSoup Tag representing a single XML record.
102109
"""
103-
104-
return str(xml.find("dc:identifier").string).split("/")[-1]
110+
return str(xml.find("dc:identifier").string)

transmogrifier/sources/transformer.py

+46-3
Original file line number | Diff line number | Diff line change
@@ -117,11 +117,16 @@ def get_required_fields(self, xml: Tag) -> dict:
117117
xml: A BeautifulSoup Tag representing a single OAI-PMH XML record.
118118
"""
119119
source_record_id = self.get_source_record_id(xml)
120+
121+
# run methods to generate required fields
122+
source_link = self.get_source_link(self.source_base_url, source_record_id, xml)
123+
timdex_record_id = self.get_timdex_record_id(self.source, source_record_id, xml)
120124
title = self.get_valid_title(source_record_id, xml)
125+
121126
return {
122127
"source": self.source_name,
123-
"source_link": self.source_base_url + source_record_id,
124-
"timdex_record_id": f"{self.source}:{source_record_id.replace('/', '-')}",
128+
"source_link": source_link,
129+
"timdex_record_id": timdex_record_id,
125130
"title": title,
126131
}
127132

@@ -180,7 +185,7 @@ def get_valid_title(cls, source_record_id: str, xml: Tag) -> str:
180185
source_record_id,
181186
all_titles,
182187
)
183-
if all_titles and type(all_titles[0]) == str:
188+
if all_titles and isinstance(all_titles[0], str):
184189
title = all_titles[0]
185190
elif all_titles and all_titles[0].string:
186191
title = all_titles[0].string
@@ -191,3 +196,41 @@ def get_valid_title(cls, source_record_id: str, xml: Tag) -> str:
191196
)
192197
title = "Title not provided"
193198
return title
199+
200+
@classmethod
201+
def get_source_link(
202+
cls, source_base_url: str, source_record_id: str, xml: Tag
203+
) -> str:
204+
"""
205+
Class method to set the source link for the item.
206+
207+
May be overridden by source subclasses if needed.
208+
209+
Default behavior is to concatenate the source base URL + source record id.
210+
211+
Args:
212+
source_base_url: Source base URL.
213+
source_record_id: Record identifier for the source record.
214+
xml: A BeautifulSoup Tag representing a single XML record.
215+
- not used by default implementation, but could be useful for subclass
216+
overrides
217+
"""
218+
return source_base_url + source_record_id
219+
220+
@classmethod
221+
def get_timdex_record_id(cls, source: str, source_record_id: str, xml: Tag) -> str:
222+
"""
223+
Class method to set the TIMDEX record id.
224+
225+
May be overridden by source subclasses if needed.
226+
227+
Default behavior is to concatenate the source name + source record id.
228+
229+
Args:
230+
source: Source name.
231+
source_record_id: Record identifier for the source record.
232+
xml: A BeautifulSoup Tag representing a single XML record.
233+
- not used by default implementation, but could be useful for subclass
234+
overrides
235+
"""
236+
return f"{source}:{source_record_id.replace('/', '-')}"

0 commit comments

Comments
 (0)