Skip to content

Commit 0982c60

Browse files
authored
XML support (#2505)
* Add support for the XML content-type in the API * Add a management command to export the ERPs for datatourisme to S3 in XML * Use freezegun to freeze time in tests * Use activity ids * Use chunks and a multipart upload to avoid RAM problems * Use a distinct retention delay when cleaning XML files
1 parent 3a9419e commit 0982c60

File tree

10 files changed

+295
-17
lines changed

10 files changed

+295
-17
lines changed

.isort.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
[settings]
2-
known_third_party = Levenshtein,admin_auto_filters,autoslug,boto3,celery,colorama,corsheaders,dateutil,deepl,dj_static,django,django_better_admin_arrayfield,django_otp,django_registration,django_summernote,environ,factory,faker,frictionless,fuzz,fuzzywuzzy,geopy,ijson,import_export,magic_profanity,modeltranslation,outscraper,pandas,phonenumbers,pytest,pytest_factoryboy,requests,rest_framework,rest_framework_api_key,rest_framework_gis,reversion,schedule,scrapfly,sentry_sdk,sib_api_v3_sdk,six,splinter,stdnum,two_factor,waffle
2+
known_third_party = Levenshtein,admin_auto_filters,autoslug,boto3,celery,colorama,corsheaders,dateutil,deepl,dj_static,django,django_better_admin_arrayfield,django_otp,django_registration,django_summernote,environ,factory,faker,freezegun,frictionless,fuzz,fuzzywuzzy,geopy,ijson,import_export,magic_profanity,modeltranslation,outscraper,pandas,phonenumbers,pytest,pytest_factoryboy,requests,rest_framework,rest_framework_api_key,rest_framework_gis,rest_framework_xml,reversion,schedule,scrapfly,sentry_sdk,sib_api_v3_sdk,six,splinter,stdnum,two_factor,waffle

.talismanrc

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,13 @@ fileignoreconfig:
33
ignore_detectors: [filecontent]
44
- filename: pnpm-lock.yaml
55
ignore_detectors: [filecontent]
6+
- filename: .isort.cfg
7+
ignore_detectors: [filecontent]
68
- filename: compte/forms.py
79
checksum: 028d3dedb8a51279d03b9c3c4ca6729a7e1cb3c5b96ce653fb6171517f5f238e
810
- filename: templates/contrib/7-commentaire.html
911
checksum: 46dbc17a97dfe7ffb619c0281c4989d02ac40a17603b696701d00132384a9dbf
1012
- filename: erp/export/static/schema.json
11-
checksum: 2e3f20cf7ed1d425786c05f1deb3521af30d80c3e2c86f55861bab785a4fc111
12-
- filename: .isort.cfg
1313
checksum: 888e09b1fe8ead28a4561b05842cd852be1ca8a3b86264aa48469514f93cfdd2
1414
- filename: templates/contrib/6-accueil.html
1515
checksum: 6a2b50478a202eb35bf49d50ff156110eeed68287d545626753b7343bd3e3074
@@ -40,8 +40,12 @@ fileignoreconfig:
4040
- filename: .env.sample
4141
checksum: 87369459b8e37ac7fbef66697f423141bb68771f33e3f33f0a178b083aa5102b
4242
- filename: core/settings.py
43-
checksum: 5e60668dd84288a34fc1193692c4312812318bfe5bbdb55ba5e5f42f6dc2b6d2
43+
checksum: 268d050e134c6268531c274bf08ab0dbe11da7901dd2a3f9e9345112125fbf21
4444
- filename: pnpm-lock.yaml
4545
checksum: bb30cc46e656283f12141506f19deceef6f4ead92d12d23260fa94b79584ece4
4646
- filename: templates/erp/includes/widget_block.html
4747
checksum: 5fb87dbb1de0ac79bd8cd9ad167c01b67ad3bfd6fad61679188fa8e996e0576e
48+
- filename: erp/management/commands/export_XML_to_s3.py
49+
checksum: 683b24613d882c234e8b04a674cb5ae6ee478bb877dc6bdfb223475d8cb2b6aa
50+
- filename: erp/management/commands/clean_S3_export_bucket.py
51+
checksum: a6049e2ff31893e3ce4a9f4f661f0af25045492c91bf600c4d2485a845770b97

api/views.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,14 @@
3232
{settings.SITE_NAME.title()} is exposing a public [API](https://en.wikipedia.org/wiki/API)
3333
allowing to programmatically query its database. This API embraces the
3434
[REST paradigm](https://en.wikipedia.org/wiki/Representational_state_transfer) as much as possible and
35-
exposes results in [JSON](https://en.wikipedia.org/wiki/JavaScript_Object_Notation) or [geoJSON](https://en.wikipedia.org/wiki/GeoJSON) format.
35+
exposes results in [JSON](https://en.wikipedia.org/wiki/JavaScript_Object_Notation), [geoJSON](https://en.wikipedia.org/wiki/GeoJSON), or [XML](https://en.wikipedia.org/wiki/XML) format.
3636
3737
The API root entry point can be accessed at
3838
[`{settings.SITE_ROOT_URL}/api/`]({settings.SITE_ROOT_URL}/api/):
3939
- An HTML view is presented when requested through a web browser,
4040
- A response of type `application/json` is returned by default.
4141
- A response of type `application/geo+json` is returned if explicitly requested by the client and if available.
42+
- A response of type `application/xml` is returned if explicitly requested by the client and if available.
4243
## Identification
4344
4445
If you want to use our API, we can provide you with a key, to attach to each request to the API via the following header:

core/settings.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,7 @@
188188
"DEFAULT_RENDERER_CLASSES": [
189189
"rest_framework.renderers.JSONRenderer",
190190
"api.renderers.GeoJSONRenderer",
191+
"rest_framework_xml.renderers.XMLRenderer",
191192
"rest_framework.renderers.BrowsableAPIRenderer",
192193
],
193194
}

erp/management/commands/clean_S3_export_bucket.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212

1313
class Command(BaseCommand):
14-
help = "Clean the S3 bucket storing search results exports."
14+
help = "Clean the S3 bucket storing search results exports, XML exports for datatourisme."
1515

1616
@monitor(monitor_slug="clean_S3_export_bucket")
1717
def handle(self, *args, **kwargs):
@@ -22,9 +22,18 @@ def handle(self, *args, **kwargs):
2222
files_to_delete = []
2323

2424
for obj in response.get("Contents", []):
25-
# NOTE: The presigned URLs are generated with a 24-hour validity duration; set it to 25 to avoid overlap.
26-
if obj["LastModified"] < now - timedelta(hours=25):
27-
files_to_delete.append({"Key": obj["Key"]})
25+
key = obj["Key"]
26+
last_modified = obj["LastModified"]
27+
self.stdout.write(f"Checking {key} last modified {last_modified}")
28+
29+
if key.endswith(".xml"):
30+
# NOTE: the presigned URLs are generated with a 7-day validity duration; set it to 8 to avoid overlap.
31+
if last_modified < now - timedelta(days=8):
32+
files_to_delete.append({"Key": key})
33+
else:
34+
# NOTE: The presigned URLs are generated with a 24-hour validity duration; set it to 25 to avoid overlap.
35+
if last_modified < now - timedelta(hours=25):
36+
files_to_delete.append({"Key": key})
2837

2938
if files_to_delete:
3039
s3.delete_objects(Bucket=bucket_name, Delete={"Objects": files_to_delete})
Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,218 @@
1+
import re
2+
from datetime import datetime, timezone
3+
4+
import boto3
5+
from django.conf import settings
6+
from django.core.management.base import BaseCommand
7+
from rest_framework.request import Request
8+
from rest_framework.test import APIRequestFactory
9+
from rest_framework_xml.renderers import XMLRenderer
10+
11+
from api.serializers import ErpSerializer
12+
from erp.models import Erp
13+
14+
CHUNK_SIZE = 1000
15+
16+
ACTIVITIES = [
17+
255,
18+
184,
19+
4,
20+
6,
21+
1,
22+
293,
23+
279,
24+
14,
25+
257,
26+
406,
27+
223,
28+
22,
29+
243,
30+
247,
31+
27,
32+
29,
33+
253,
34+
33,
35+
34,
36+
305,
37+
38,
38+
203,
39+
40,
40+
339,
41+
41,
42+
42,
43+
43,
44+
141,
45+
298,
46+
50,
47+
51,
48+
285,
49+
450,
50+
297,
51+
63,
52+
64,
53+
375,
54+
307,
55+
206,
56+
79,
57+
62,
58+
407,
59+
405,
60+
224,
61+
85,
62+
86,
63+
84,
64+
415,
65+
95,
66+
252,
67+
100,
68+
103,
69+
109,
70+
236,
71+
112,
72+
211,
73+
411,
74+
113,
75+
385,
76+
117,
77+
209,
78+
380,
79+
292,
80+
125,
81+
432,
82+
417,
83+
231,
84+
294,
85+
310,
86+
135,
87+
362,
88+
137,
89+
145,
90+
147,
91+
269,
92+
148,
93+
152,
94+
153,
95+
258,
96+
445,
97+
413,
98+
163,
99+
165,
100+
295,
101+
171,
102+
172,
103+
177,
104+
396,
105+
220,
106+
173,
107+
174,
108+
175,
109+
176,
110+
178,
111+
168,
112+
182,
113+
183,
114+
185,
115+
381,
116+
194,
117+
371,
118+
196,
119+
276,
120+
392,
121+
414,
122+
429,
123+
]
124+
125+
126+
class Command(BaseCommand):
127+
help = "Export ERP in XML format to S3"
128+
129+
def handle(self, *args, **options):
130+
now = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
131+
132+
server_name = settings.SITE_HOST
133+
factory = APIRequestFactory(SERVER_NAME=server_name)
134+
fake_request = Request(factory.get("/"))
135+
136+
qs = (
137+
Erp.objects.published()
138+
.select_related("user", "accessibilite", "activite")
139+
.filter(activite__id__in=ACTIVITIES)
140+
.order_by("id")
141+
)
142+
total = qs.count()
143+
self.stdout.write(f"{total} ERPs to export...")
144+
145+
bucket_name = settings.S3_EXPORT_BUCKET_NAME
146+
file_name = f"export_{now}_full.xml"
147+
148+
s3 = boto3.client(
149+
"s3",
150+
endpoint_url=settings.S3_EXPORT_BUCKET_ENDPOINT_URL,
151+
)
152+
153+
# Initialize the multipart upload
154+
mpu = s3.create_multipart_upload(Bucket=bucket_name, Key=file_name, ContentType="application/xml")
155+
upload_id = mpu["UploadId"]
156+
parts = []
157+
part_number = 1
158+
buffer = b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
159+
renderer = XMLRenderer()
160+
161+
try:
162+
for offset in range(0, total, CHUNK_SIZE):
163+
batch = qs[offset : offset + CHUNK_SIZE]
164+
self.stdout.write(f"Serialize {offset}-{offset + CHUNK_SIZE}/{total}...")
165+
166+
data = ErpSerializer(batch, many=True, context={"request": fake_request}).data
167+
xml_str = renderer.render(data, accepted_media_type="application/xml", renderer_context={})
168+
if isinstance(xml_str, str):
169+
xml_str = xml_str.encode("utf-8")
170+
171+
# using ErpSerializer is serializing into <root> tag, so we need to remove it, to append it to our buffer
172+
inner = re.search(rb"<root>(.*)</root>", xml_str, re.DOTALL)
173+
if inner:
174+
buffer += inner.group(1)
175+
176+
# S3 multipart requires parts >= 5MB (except the last one)
177+
if len(buffer) >= 5 * 1024 * 1024:
178+
self.stdout.write(f"Upload part {part_number} ({offset}-{offset + CHUNK_SIZE}/{total})...")
179+
response = s3.upload_part(
180+
Bucket=bucket_name,
181+
Key=file_name,
182+
PartNumber=part_number,
183+
UploadId=upload_id,
184+
Body=buffer,
185+
)
186+
parts.append({"PartNumber": part_number, "ETag": response["ETag"]})
187+
part_number += 1
188+
buffer = b""
189+
190+
# Remaining part
191+
buffer += "</root>".encode("utf-8")
192+
response = s3.upload_part(
193+
Bucket=bucket_name,
194+
Key=file_name,
195+
PartNumber=part_number,
196+
UploadId=upload_id,
197+
Body=buffer,
198+
)
199+
parts.append({"PartNumber": part_number, "ETag": response["ETag"]})
200+
201+
s3.complete_multipart_upload(
202+
Bucket=bucket_name,
203+
Key=file_name,
204+
UploadId=upload_id,
205+
MultipartUpload={"Parts": parts},
206+
)
207+
208+
except Exception as e:
209+
s3.abort_multipart_upload(Bucket=bucket_name, Key=file_name, UploadId=upload_id)
210+
raise e
211+
212+
file_url = s3.generate_presigned_url(
213+
"get_object",
214+
Params={"Bucket": bucket_name, "Key": file_name},
215+
ExpiresIn=604800,
216+
)
217+
218+
self.stdout.write(self.style.SUCCESS(f"Export terminated, link available during 7 days: {file_url}"))

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ dependencies = [
6262
"magic-profanity",
6363
"django-summernote",
6464
"pre-commit>=4.5.1",
65+
"djangorestframework-xml>=2.0.0",
6566
]
6667

6768
[dependency-groups]
@@ -82,6 +83,7 @@ dev = [
8283
"legacy-cgi",
8384
"django-deep-translator",
8485
"pytest-socket",
86+
"freezegun>=1.5.5",
8587
]
8688

8789
[build-system]

tests/api/tests.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,13 @@ def test_list_geojson(self, api_client, initial_erp):
176176
)
177177
assert response.json() == geojson_expected_for_no_results
178178

179+
def test_list_xml(self, api_client, initial_erp):
    """The ERP list endpoint answers in XML when ``?format=xml`` is requested."""
    xml_list_url = reverse("erp-list") + "?format=xml"

    xml_response = api_client.get(xml_list_url)

    assert xml_response.status_code == 200
    assert xml_response["Content-Type"] == "application/xml; charset=utf-8"
    # The body is checked as text for the element the test expects to find.
    assert "<erp>" in xml_response.content.decode("utf-8")
185+
179186
def test_list_can_show_drafts(self, api_client, initial_erp):
180187
ErpFactory(published=False)
181188

tests/erp/test_export.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from django.contrib.gis.geos import Point
1515
from django.core import management
1616
from django.core.management import call_command
17+
from freezegun import freeze_time
1718

1819
from erp.export.export import export_schema_to_csv
1920
from erp.export.generate_schema import generate_schema
@@ -218,7 +219,11 @@ def test_generate_schema(db, activite):
218219
os.remove(test_schema.name)
219220

220221

222+
CURRENT_TIME = datetime(2024, 10, 1, tzinfo=timezone.utc)
223+
224+
221225
@pytest.mark.django_db
226+
@freeze_time(CURRENT_TIME)
222227
@patch("core.mailer.BrevoMailer.send_email")
223228
@patch("boto3.client")
224229
def test_generate_csv_export(mock_boto_client, mock_send_email):
@@ -270,15 +275,9 @@ def test_generate_csv_export(mock_boto_client, mock_send_email):
270275
)
271276

272277

273-
CURRENT_TIME = datetime(2024, 10, 1, tzinfo=timezone.utc)
274-
275-
276278
@patch("boto3.client")
277-
@patch("datetime.datetime")
278-
def test_clean_s3_export_bucket(mock_datetime, mock_boto_client):
279-
mock_datetime.now.return_value = CURRENT_TIME
280-
mock_datetime.side_effect = lambda *args, **kw: datetime(*args, **kw)
281-
279+
@freeze_time(CURRENT_TIME)
280+
def test_clean_s3_export_bucket(mock_boto_client):
282281
mock_s3 = MagicMock()
283282
mock_boto_client.return_value = mock_s3
284283

0 commit comments

Comments
 (0)