11import logging
22import time
3+ from django .core .exceptions import ValidationError
4+ from django .core .validators import URLValidator
35import requests
46
57import xmltodict
911
1012class GallicaSearch :
1113 # API defaults
12- API_ENDPOINT = "https://gallica.bnf.fr/SRU"
14+ DEFAULT_API_ENDPOINT = "https://gallica.bnf.fr/SRU"
1315 DEFAULT_MAX_RECORDS = 15 # Max: 50
1416 DEFAULT_START_RECORD = 1
1517 DEFAULT_MAX_RETRIES = 3
1618
17- def __init__ (self , max_records : int = DEFAULT_MAX_RECORDS ) -> None :
19+ def __init__ (
20+ self ,
21+ max_records : int = DEFAULT_MAX_RECORDS ,
22+ endpoint : str = DEFAULT_API_ENDPOINT ,
23+ ) -> None :
1824 self .set_max_records (max_records )
19-
25+ self . set_api_endpoint ( endpoint )
2026 self .raw_records = []
2127 self .total_records = 0
2228 self .records = {}
@@ -34,6 +40,16 @@ def set_max_records(self, max_records: int) -> None:
3440 else :
3541 self .max_records = self .DEFAULT_MAX_RECORDS
3642
def set_api_endpoint(self, endpoint: str) -> None:
    """
    Select the SRU API endpoint that this instance will query.

    The value is stored as given in self.api_endpoint; no validation is
    performed here.

    param:
    - endpoint: the base URL of the SRU on a white-label version of Gallica
    (ex: "https://nutrisco-patrimoine.lehavre.fr/SRU")
    """
    self.api_endpoint = endpoint
52+
3753 def set_slow_mode (self , slow_mode : float = 0 ) -> None :
3854 """
3955 Set the value of the slow_mode parameter (defaults to zero)
@@ -60,14 +76,14 @@ def gallica_search_retrieve(
6076 "query" : query ,
6177 }
6278
63- response = requests .get (self .API_ENDPOINT , params = payload )
79+ response = requests .get (self .api_endpoint , params = payload )
6480
6581 retries = self .DEFAULT_MAX_RETRIES
6682 if response .status_code == 500 :
6783 while retries :
6884 logging .warning (f"Error 500, retrying (retries: { retries } )" )
6985 time .sleep (5 )
70- response = requests .get (self .API_ENDPOINT , params = payload )
86+ response = requests .get (self .DEFAULT_API_ENDPOINT , params = payload )
7187 if response .status_code == 200 :
7288 break
7389 else :
@@ -90,10 +106,7 @@ def count_records(self, query: str) -> dict:
90106
91107 return {"total_records" : self .total_records }
92108
93- def get_records (
94- self ,
95- query : str ,
96- ) -> dict :
109+ def fetch_records (self , query : str ) -> None :
97110 """
98111 Retrieve the full lists of records for a query.
99112 Calls the gallica_search_retrieve recursively until all results are retrieved.
@@ -134,41 +147,97 @@ def get_records(
134147 else :
135148 logging .info ("The research returned no (new) results." )
136149
137- return {"records" : self .raw_records , "total_records" : self .total_records }
138-
def parse_records(self) -> None:
    """
    Turn every entry of self.raw_records into a Record stored in self.records.

    Each raw entry is copied into a plain dict before being handed to Record.
    Records are indexed by their ark_id, so a later record with the same
    ark_id replaces the earlier one.
    """
    for raw_record in self.raw_records:
        parsed = Record(dict(raw_record))
        self.records[parsed.ark_id] = parsed
143155
def get_records(self) -> dict:
    """
    Give access to the parsed records.

    returns:
    - the dict held in self.records (the same object, not a copy)
    """
    parsed_records = self.records
    return parsed_records
161+
144162
145163class Record :
def __init__(self, raw: dict) -> None:
    """
    Build a record from one raw entry.

    param:
    - raw: the raw record; it is kept as-is in self.raw and read by the
    other methods of the class
    """
    self.raw = raw

    # set_ark() fills self.ark_url, which the ark_id below is derived from
    self.set_ark()
    self.set_date()

    # The ark id is whatever follows the last "/" of the URL
    self.ark_id = self.ark_url.rsplit("/", 1)[-1]
    self.title = self.get_first_value("dc:title")
152170
def get_values(self, key: str) -> list:
    """
    Return the values stored under a key of the raw record, always as a list.

    A datapoint can be missing, empty, a single value or a list of values.
    This function casts everything into a list to streamline the parsing.

    param:
    - key: the field to look up in the raw record (ex: "dc:identifier")

    returns:
    - the stored list itself when the value already is a list
    - a one-element list when the value is a single datapoint
    - an empty list when the key is missing or its value is None
    """
    try:
        values = self.raw[key]
    except KeyError:
        return []

    # An empty element can be stored as None (xmltodict does this);
    # treat it like a missing key so callers can safely use len() and indexes
    if values is None:
        return []

    if isinstance(values, list):
        return values

    if isinstance(values, tuple):
        return list(values)

    # Single datapoint (usually a string): wrap it
    return [values]
164188
165- def get_ark (self ) -> str :
def get_first_value(self, key: str) -> str:
    """
    Return the first datapoint found for a key.

    param:
    - key: the field to look up, passed on to get_values

    returns:
    - the first item of the list given by get_values, or an empty string
    when that list is empty
    """
    candidates = self.get_values(key)
    return candidates[0] if len(candidates) > 0 else ""
198+
def set_date(self) -> None:
    """
    Stores the 'dc:date' value as a string in self.date.

    Rules applied to the first 'dc:date' value:
    - a missing date, "[S.d.]" or anything containing "à nos jours"
    gives an empty string
    - a full date written year-month[-day] (ex: "1898-05-12") keeps its year
    - any other value containing "-" is read as a range and only its last
    part is kept (ex: "1881-1906" gives "1906")
    - everything else is stored unchanged
    """
    # get_first_value gives "" when the record has no 'dc:date'
    date = self.get_first_value("dc:date")

    # Manage some common bad values
    if date == "[S.d.]" or "à nos jours" in date:
        date = ""

    if "-" in date:
        parts = [part.strip() for part in date.split("-")]
        year, *rest = parts

        # "YYYY-MM" or "YYYY-MM-DD" is a single date, not a range: taking
        # the last part would store the month or the day instead of the year
        is_single_date = (
            len(year) == 4
            and year.isdigit()
            and 1 <= len(rest) <= 2
            and all(len(part) == 2 and part.isdigit() for part in rest)
            and 1 <= int(rest[0]) <= 12
        )

        # If a date range is provided, only keep the latest year
        date = year if is_single_date else parts[-1]

    self.date = date
221+
def set_ark(self, ark_root="https://gallica.bnf.fr/ark") -> str:
    """
    Pick the URL of the record among its 'dc:identifier' values and store it
    in self.ark_url.

    Identifiers containing ark_root are preferred. When none contains it, any
    identifier accepted by URLValidator is used instead. In both passes the
    last matching identifier wins. self.ark_url is an empty string when
    nothing matches.

    param:
    - ark_root: the text an identifier must contain to be preferred

    returns:
    - the value stored in self.ark_url
    """
    identifiers = self.get_values("dc:identifier")

    self.ark_url = ""
    # First, check for the proper Gallica ark
    preferred = [identifier for identifier in identifiers if ark_root in identifier]
    if preferred:
        self.ark_url = preferred[-1]

    # Else, check for any URL
    if not self.ark_url:
        is_valid_url = URLValidator()
        for identifier in identifiers:
            try:
                is_valid_url(identifier)
            except ValidationError:
                continue
            self.ark_url = identifier

    return self.ark_url
173242
174243 def get_thumbnail (self ) -> str :
0 commit comments