Skip to content

Commit 506c849

Browse files
form: HTML sanitization and remove Source from CKEditor
1 parent dd6e0b6 commit 506c849

File tree

6 files changed

+52
-15
lines changed

6 files changed

+52
-15
lines changed

cds/modules/deposit/static/json/cds_deposit/forms/project.json

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -45,9 +45,6 @@
4545
"Replace",
4646
"-",
4747
"RemoveFormat"
48-
],
49-
[
50-
"Source"
5148
]
5249
],
5350
"disableNativeSpellChecker": false,

cds/modules/deposit/static/json/cds_deposit/forms/video.json

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -44,9 +44,6 @@
4444
"Replace",
4545
"-",
4646
"RemoveFormat"
47-
],
48-
[
49-
"Source"
5047
]
5148
],
5249
"disableNativeSpellChecker": false,

cds/modules/records/serializers/json.py

Lines changed: 9 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,7 @@
3232
has_read_record_permission,
3333
)
3434
from ..utils import HTMLTagRemover, remove_html_tags
35+
from marshmallow_utils.html import sanitize_html
3536

3637

3738
class CDSJSONSerializer(JSONSerializer):
@@ -69,9 +70,10 @@ def preprocess_record(self, pid, record, links_factory=None):
6970
)
7071

7172
# decode html entities
72-
metadata["description"] = self.html_tag_remover.unescape(
73-
metadata["description"]
74-
)
73+
description = metadata["description"]
74+
description = self.html_tag_remover.unescape(description)
75+
metadata["description"] = sanitize_html(
76+
description)
7577
if has_request_context():
7678
metadata["videos"] = [
7779
video
@@ -101,9 +103,10 @@ def preprocess_search_hit(self, pid, record_hit, links_factory=None):
101103
self.html_tag_remover, title
102104
)
103105

104-
metadata["description"] = self.html_tag_remover.unescape(
105-
metadata["description"]
106-
)
106+
description = metadata["description"]
107+
description = self.html_tag_remover.unescape(description)
108+
metadata["description"] = sanitize_html(
109+
description)
107110
except KeyError:
108111
# ignore error if keys are missing in the metadata
109112
pass

cds/modules/records/serializers/schemas/project.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@
2020

2121
from invenio_jsonschemas import current_jsonschemas
2222
from marshmallow import Schema, fields, pre_load, post_load
23+
from marshmallow_utils.fields import SanitizedHTML
2324

2425
from ....deposit.api import Project, deposit_video_resolver
2526
from .common import (
@@ -76,7 +77,7 @@ class ProjectSchema(StrictKeysSchema):
7677
_deposit = fields.Nested(ProjectDepositSchema, required=True)
7778
_cds = fields.Nested(_CDSSSchema, required=True)
7879
title = fields.Nested(TitleSchema, required=True)
79-
description = fields.Str()
80+
description = SanitizedHTML()
8081
category = fields.Str(required=True)
8182
type = fields.Str(required=True)
8283
note = fields.Str()

cds/modules/records/serializers/schemas/video.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@
2020

2121
from invenio_jsonschemas import current_jsonschemas
2222
from marshmallow import Schema, fields, pre_load, post_load
23-
23+
from marshmallow_utils.fields import SanitizedHTML
2424
from ....deposit.api import Video
2525
from ..fields.datetime import DateString
2626
from .common import (
@@ -126,7 +126,7 @@ class VideoSchema(StrictKeysSchema):
126126
contributors = fields.Nested(ContributorSchema, many=True, required=True)
127127
copyright = fields.Nested(CopyrightSchema)
128128
date = DateString(required=True)
129-
description = fields.Str(required=True)
129+
description = SanitizedHTML(required=True)
130130
doi = DOI()
131131
duration = fields.Str()
132132
external_system_identifiers = fields.Nested(

tests/unit/test_serializer.py

Lines changed: 39 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,9 @@
2929

3030
from cds.modules.deposit.api import Video
3131
from cds.modules.records.serializers.drupal import VideoDrupal
32+
from cds.modules.records.serializers.json import CDSJSONSerializer
33+
from cds.modules.records.api import CDSRecord
34+
from unittest.mock import Mock
3235
from cds.modules.records.serializers.smil import Smil
3336
from cds.modules.records.serializers.vtt import VTT
3437

@@ -149,3 +152,39 @@ def test_drupal_serializer(video_record_metadata, deposit_metadata):
149152
data = serializer.format()["entries"][0]["entry"]
150153
data = {k: data[k] for k in data if k in expected}
151154
assert data == expected
155+
156+
157+
def test_cds_json_serializer_sanitization(video_record_metadata):
158+
"""Test HTML sanitization in CDSJSONSerializer."""
159+
record = CDSRecord.create(video_record_metadata)
160+
161+
# Add malicious HTML
162+
record['description'] = '<script>alert("xss")</script>Safe content <b>bold</b>'
163+
record['title']['title'] = 'Test <script>alert("title")</script> Title <b>bold</b>'
164+
165+
# Test the serializer
166+
serializer = CDSJSONSerializer()
167+
168+
# Create a mock PID (required by the serializer)
169+
mock_pid = Mock()
170+
mock_pid.pid_value = '1'
171+
172+
# Test preprocess_record method
173+
result = serializer.preprocess_record(mock_pid, record)
174+
175+
# Check sanitization
176+
description = result['metadata']['description']
177+
assert '<script>' not in description
178+
assert '</script>' not in description
179+
assert 'Safe content' in description
180+
# Keep safe HTML tags like <b>
181+
assert '<b>bold</b>' in description
182+
183+
# Remove everything in title
184+
title = result['metadata']['title']['title']
185+
assert '<script>' not in title
186+
assert '</script>' not in title
187+
assert 'Test' in title and 'Title' in title
188+
assert '<b>' not in title
189+
190+

0 commit comments

Comments
 (0)