Skip to content
This repository was archived by the owner on Nov 4, 2024. It is now read-only.

Commit 8cd39f6

Browse files
authored
Merge pull request #29 from entrepreneur-interet-general/sp9-squash
Sp9 squash
2 parents ec14f17 + 4d772c0 commit 8cd39f6

19 files changed

+3785
-56
lines changed

bnsp/admin.py

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
from django.contrib import admin
2-
from django.utils.html import format_html
3-
from django.urls import reverse
1+
from django.contrib import admin, messages
2+
from django.http.response import HttpResponseRedirect
3+
from django.utils.translation import ngettext
44

55
from francedata.services.django_admin import (
66
TimeStampModelAdmin,
@@ -9,11 +9,11 @@
99

1010
from bnsp import models
1111

12-
# Register your models here.
13-
1412

1513
@admin.register(models.Query)
1614
class QueryAdmin(TimeStampModelAdmin):
15+
change_form_template = "bnsp/admin/query_changeform.html"
16+
1717
search_fields = ("name", "query")
1818
list_display = (
1919
"__str__",
@@ -54,7 +54,35 @@ class QueryAdmin(TimeStampModelAdmin):
5454
("Métadonnées", {"fields": ["id", "created_at", "updated_at"]}),
5555
]
5656

57+
actions = ["force_run_queries"]
58+
5759
def view_documents_link(self, obj):
5860
return view_reverse_changelink(obj, "core", "bnsp_query", "document")
5961

6062
view_documents_link.short_description = "Documents"
63+
64+
# Force immediate execution of query on the admin detail view
65+
def response_change(self, request, obj):
    """
    Handle the submission of the admin change form.

    When the POST data contains "_force_run_query", the object is run
    immediately and the user is redirected to the same detail page.
    Any other submission is delegated to the parent class.
    """
    force_run_requested = "_force_run_query" in request.POST
    if not force_run_requested:
        return super().response_change(request, obj)

    obj.run()
    # "." is a relative redirect: stay on the same detail page
    return HttpResponseRedirect(".")
70+
71+
# Force immediate execution of queries on the admin list view
72+
@admin.action(description="Lancer les requêtes maintenant")
def force_run_queries(self, request, queryset):
    """
    Admin list action: run every selected query, then report how many ran.
    """
    executed = 0
    for query in queryset:
        query.run()
        executed += 1

    # ngettext picks the singular or plural wording from the count
    template = ngettext(
        "%d requête a été exécutée avec succès.",
        "%d requêtes ont été exécutées avec succès.",
        executed,
    )
    self.message_user(request, template % executed, messages.SUCCESS)
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Generated by Django 3.2.7 on 2021-09-09 14:07
2+
3+
from django.db import migrations, models
4+
5+
6+
class Migration(migrations.Migration):
    # Schema migration for the 'bnsp' app: redefines the 'name' and 'query'
    # fields of the 'query' model as unique CharFields.

    # Must be applied after the app's initial migration.
    dependencies = [
        ('bnsp', '0001_initial'),
    ]

    operations = [
        # query.name: CharField of at most 255 characters, with unique=True
        migrations.AlterField(
            model_name='query',
            name='name',
            field=models.CharField(max_length=255, unique=True, verbose_name='nom'),
        ),
        # query.query: CharField of at most 255 characters, with unique=True
        migrations.AlterField(
            model_name='query',
            name='query',
            field=models.CharField(max_length=255, unique=True, verbose_name='requête'),
        ),
    ]

bnsp/models.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from dateutil import parser as dateparser
12
from django.db import models
23
from django.utils import timezone
34
from django.utils.html import strip_tags
@@ -9,8 +10,8 @@
910

1011
class Query(TimeStampModel):
1112
# A query on the Gallica Search API
12-
name = models.CharField(max_length=255, verbose_name="nom")
13-
query = models.CharField(max_length=255, verbose_name="requête")
13+
name = models.CharField(max_length=255, verbose_name="nom", unique=True)
14+
query = models.CharField(max_length=255, verbose_name="requête", unique=True)
1415

1516
source = models.ForeignKey(
1617
Source, on_delete=models.CASCADE, verbose_name="source associée"
@@ -50,10 +51,11 @@ def run(self, since: str = "") -> None:
5051

5152
search = GallicaSearch(max_records=50)
5253
dated_query = f'({self.query}) and indexationdate > "{indexation_date}"'
53-
response = search.get_records(dated_query)
54+
search.fetch_records(dated_query)
5455

55-
if len(response):
56-
for record in search.records.values():
56+
records = search.get_records().values()
57+
if len(records):
58+
for record in records:
5759
self.create_or_update_document(record)
5860
self.last_change = now
5961

@@ -93,9 +95,12 @@ def create_or_update_document(self, record: Record) -> None:
9395
new_doc.title = strip_tags(record.title[:255])
9496

9597
# Document vintage
96-
year, _ = DataYear.objects.get_or_create(year=record.date)
97-
98-
new_doc.years.add(year)
98+
try:
99+
date = dateparser.parse(record.date).strftime("%Y")
100+
year, _ = DataYear.objects.get_or_create(year=date)
101+
new_doc.years.add(year)
102+
except dateparser._parser.ParserError:
103+
pass
99104

100105
# The description
101106
new_doc.body = ", ".join(record.get_values("dc:subject"))

bnsp/services/gallica_search_api.py

Lines changed: 88 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import logging
22
import time
3+
from django.core.exceptions import ValidationError
4+
from django.core.validators import URLValidator
35
import requests
46

57
import xmltodict
@@ -9,14 +11,18 @@
911

1012
class GallicaSearch:
1113
# API defaults
12-
API_ENDPOINT = "https://gallica.bnf.fr/SRU"
14+
DEFAULT_API_ENDPOINT = "https://gallica.bnf.fr/SRU"
1315
DEFAULT_MAX_RECORDS = 15 # Max: 50
1416
DEFAULT_START_RECORD = 1
1517
DEFAULT_MAX_RETRIES = 3
1618

17-
def __init__(self, max_records: int = DEFAULT_MAX_RECORDS) -> None:
19+
def __init__(
20+
self,
21+
max_records: int = DEFAULT_MAX_RECORDS,
22+
endpoint: str = DEFAULT_API_ENDPOINT,
23+
) -> None:
1824
self.set_max_records(max_records)
19-
25+
self.set_api_endpoint(endpoint)
2026
self.raw_records = []
2127
self.total_records = 0
2228
self.records = {}
@@ -34,6 +40,16 @@ def set_max_records(self, max_records: int) -> None:
3440
else:
3541
self.max_records = self.DEFAULT_MAX_RECORDS
3642

43+
def set_api_endpoint(self, endpoint: str) -> None:
    """
    Store the URL of the SRU API endpoint that requests are sent to.

    param:
    - endpoint: the base URL of the SRU, for instance the one of a
      white-label version of Gallica
      (ex: "https://nutrisco-patrimoine.lehavre.fr/SRU")

    The value is stored as given; no validation is done here.
    """
    self.api_endpoint = endpoint
52+
3753
def set_slow_mode(self, slow_mode: float = 0) -> None:
3854
"""
3955
Set the value of the slow_mode parameter (defaults to zero)
@@ -60,14 +76,14 @@ def gallica_search_retrieve(
6076
"query": query,
6177
}
6278

63-
response = requests.get(self.API_ENDPOINT, params=payload)
79+
response = requests.get(self.api_endpoint, params=payload)
6480

6581
retries = self.DEFAULT_MAX_RETRIES
6682
if response.status_code == 500:
6783
while retries:
6884
logging.warning(f"Error 500, retrying (retries: {retries})")
6985
time.sleep(5)
70-
response = requests.get(self.API_ENDPOINT, params=payload)
86+
response = requests.get(self.DEFAULT_API_ENDPOINT, params=payload)
7187
if response.status_code == 200:
7288
break
7389
else:
@@ -90,10 +106,7 @@ def count_records(self, query: str) -> dict:
90106

91107
return {"total_records": self.total_records}
92108

93-
def get_records(
94-
self,
95-
query: str,
96-
) -> dict:
109+
def fetch_records(self, query: str) -> None:
97110
"""
98111
Retrieve the full lists of records for a query.
99112
Calls the gallica_search_retrieve recursively until all results are retrieved.
@@ -134,41 +147,97 @@ def get_records(
134147
else:
135148
logging.info("The research returned no (new) results.")
136149

137-
return {"records": self.raw_records, "total_records": self.total_records}
138-
139150
def parse_records(self) -> None:
    """
    Turn each raw record into a Record and index it by ark id.

    Every item of self.raw_records is first copied into a plain dict,
    then wrapped in a Record stored in self.records under its ark_id.
    A record with an already known ark_id replaces the previous one.
    """
    for raw_record in self.raw_records:
        parsed = Record(dict(raw_record))
        self.records[parsed.ark_id] = parsed
143155

156+
def get_records(self) -> dict:
    """
    Return the dict of parsed records, keyed by ark id.

    This is the instance's own dict, not a copy.
    """
    return self.records
161+
144162

145163
class Record:
146164
def __init__(self, raw: dict) -> None:
    """
    Build a record from its raw dict.

    Sets the following attributes: raw, ark_url (through set_ark),
    date (through set_date), ark_id and title.
    """
    self.raw = raw

    # Both setters read the raw record and store their result on self
    self.set_ark()
    self.set_date()

    # The ark id is whatever follows the last "/" of the ark URL
    *_, self.ark_id = self.ark_url.split("/")
    self.title = self.get_first_value("dc:title")
152170

153171
def get_values(self, key: str) -> list:
    """
    A datapoint can either be a string or a list of strings.

    This function casts everything into a list to streamline the parsing.

    It also returns an empty list if the key is missing in the record, or
    if the key is present with no value (None), so that callers can
    always take the length of the result.

    param:
    - key: the name of the datapoint in the raw record (ex: "dc:subject")
    """
    values = self.raw.get(key)

    # Missing key, or key present without any value
    if values is None:
        return []

    if isinstance(values, str):
        return [values]

    return values
164188

165-
def get_ark(self) -> str:
189+
def get_first_value(self, key: str) -> str:
    """
    Return the first of the values found for the key, or an empty string
    when get_values returns nothing for it.
    """
    candidates = self.get_values(key)
    if len(candidates) == 0:
        return ""
    return candidates[0]
198+
199+
def set_date(self) -> None:
    """
    Stores the 'dc:date' value as a string in self.date

    The stored value is:
    - an empty string if the record has no 'dc:date', or if the value is
      a known bad one ("[S.d.]", or anything containing "à nos jours");
    - the whole value if it looks like an ISO date (YYYY-MM or
      YYYY-MM-DD), so that the year is not lost;
    - the part after the last "-" for any other value containing a "-"
      (a range such as "1905-1910" gives "1910");
    - the value itself otherwise.
    """
    if "dc:date" not in self.raw:
        self.date = ""
        return

    date = self.get_first_value("dc:date")

    # Manage some common bad values
    if date == "[S.d.]" or "à nos jours" in date:
        date = ""

    if "-" in date:
        parts = date.split("-")

        # An ISO-like date is not a range: splitting "1895-03-12" would
        # only keep the day ("12") and lose the year.
        is_iso_date = (
            len(parts) in (2, 3)
            and all(part.isdigit() for part in parts)
            and len(parts[0]) == 4
            and all(len(part) == 2 for part in parts[1:])
        )

        if not is_iso_date:
            # If a date range is provided, only keep the latest year
            date = parts[-1]

    self.date = date
221+
222+
def set_ark(self, ark_root="https://gallica.bnf.fr/ark") -> str:
    """
    Pick the URL of the record among its 'dc:identifier' values, store it
    in self.ark_url and return it.

    An identifier containing ark_root is preferred. If there is none, any
    identifier accepted by Django's URLValidator is used instead. In both
    cases the last matching identifier wins. The result is an empty
    string when nothing matches.
    """
    identifiers = self.get_values("dc:identifier")

    self.ark_url = ""

    # First, check for the proper Gallica ark
    for identifier in identifiers:
        if ark_root not in identifier:
            continue
        self.ark_url = identifier

    if self.ark_url:
        return self.ark_url

    # Else, check for any URL
    is_valid_url = URLValidator()
    for identifier in identifiers:
        try:
            is_valid_url(identifier)
        except ValidationError:
            continue
        self.ark_url = identifier

    return self.ark_url
173242

174243
def get_thumbnail(self) -> str:
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
{% extends 'admin/change_form.html' %}
2+
3+
{% block submit_buttons_bottom %}
4+
<div class="submit-row">
5+
<input type="submit" value="Lancer la requête maintenant" name="_force_run_query">
6+
</div>
7+
{{ block.super }}
8+
{% endblock %}

bnsp/tests.py

Lines changed: 0 additions & 20 deletions
This file was deleted.

bnsp/tests/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)