Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,6 @@
"Replace",
"-",
"RemoveFormat"
],
[
"Source"
]
],
"disableNativeSpellChecker": false,
Expand Down
3 changes: 0 additions & 3 deletions cds/modules/deposit/static/json/cds_deposit/forms/video.json
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,6 @@
"Replace",
"-",
"RemoveFormat"
],
[
"Source"
]
],
"disableNativeSpellChecker": false,
Expand Down
62 changes: 38 additions & 24 deletions cds/modules/records/serializers/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
has_read_record_permission,
)
from ..utils import HTMLTagRemover, remove_html_tags
from marshmallow_utils.html import sanitize_html


class CDSJSONSerializer(JSONSerializer):
Expand All @@ -46,6 +47,41 @@ def dump(self, obj, context=None):
"""Serialize object with schema."""
return self.schema_class(context=context).dump(obj)

def _sanitize_metadata(self, metadata):
    """Sanitize title, description and translations in metadata.

    The top-level metadata and every item of ``metadata["translations"]``
    receive the same treatment (see ``_sanitize_title_and_description``).
    Fields that are missing or of an unexpected type are left untouched
    instead of raising, so one malformed field does not prevent the
    remaining fields from being sanitized.

    :param metadata: dict-like record metadata; it is modified in place.
    :returns: the same ``metadata`` object that was passed in.
    """
    self._sanitize_title_and_description(metadata)

    translations = metadata.get("translations")
    # ``translations`` may be absent or null; only walk real sequences.
    if isinstance(translations, (list, tuple)):
        for translation in translations:
            if isinstance(translation, dict):
                self._sanitize_title_and_description(translation)

    return metadata

def _sanitize_title_and_description(self, entry):
    """Sanitize ``entry["title"]["title"]`` and ``entry["description"]``.

    Both values are first unescaped with ``self.html_tag_remover``. The
    title is then passed through ``remove_html_tags`` and the description
    through ``sanitize_html``.

    :param entry: dict-like object (record metadata or one translation);
        it is modified in place.
    """
    title = entry.get("title")
    # Only a nested ``{"title": "<text>"}`` structure is rewritten.
    if isinstance(title, dict) and isinstance(title.get("title"), str):
        unescaped_title = self.html_tag_remover.unescape(title["title"])
        title["title"] = remove_html_tags(
            self.html_tag_remover, unescaped_title
        )

    description = entry.get("description")
    # Non-string descriptions (e.g. null) are left as they are.
    if isinstance(description, str):
        unescaped_description = self.html_tag_remover.unescape(description)
        entry["description"] = sanitize_html(unescaped_description)

def preprocess_record(self, pid, record, links_factory=None):
"""Include ``_eos_library_path`` for single record retrievals."""
result = super(CDSJSONSerializer, self).preprocess_record(
Expand All @@ -62,16 +98,7 @@ def preprocess_record(self, pid, record, links_factory=None):

# sanitize title by unescaping and stripping html tags
try:
title = metadata["title"]["title"]
title = self.html_tag_remover.unescape(title)
metadata["title"]["title"] = remove_html_tags(
self.html_tag_remover, title
)

# decode html entities
metadata["description"] = self.html_tag_remover.unescape(
metadata["description"]
)
metadata = self._sanitize_metadata(metadata)
if has_request_context():
metadata["videos"] = [
video
Expand All @@ -93,19 +120,6 @@ def preprocess_search_hit(self, pid, record_hit, links_factory=None):

if "metadata" in result:
metadata = result["metadata"]

try:
title = metadata["title"]["title"]
title = self.html_tag_remover.unescape(title)
metadata["title"]["title"] = remove_html_tags(
self.html_tag_remover, title
)

metadata["description"] = self.html_tag_remover.unescape(
metadata["description"]
)
except KeyError:
# ignore error if keys are missing in the metadata
pass
result["metadata"] = self._sanitize_metadata(result["metadata"])

return result
3 changes: 2 additions & 1 deletion cds/modules/records/serializers/schemas/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

from marshmallow import RAISE, Schema, ValidationError, fields, validates_schema
from marshmallow.validate import Length
from marshmallow_utils.fields import SanitizedHTML

from ...api import Keyword
from ...resolver import keyword_resolver
Expand Down Expand Up @@ -140,7 +141,7 @@ class TranslationsSchema(StrictKeysSchema):
"""Translations schema."""

title = fields.Nested(TitleSchema)
description = fields.Str()
description = SanitizedHTML()
language = fields.Str()


Expand Down
3 changes: 2 additions & 1 deletion cds/modules/records/serializers/schemas/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

from invenio_jsonschemas import current_jsonschemas
from marshmallow import Schema, fields, pre_load, post_load
from marshmallow_utils.fields import SanitizedHTML

from ....deposit.api import Project, deposit_video_resolver
from .common import (
Expand Down Expand Up @@ -76,7 +77,7 @@ class ProjectSchema(StrictKeysSchema):
_deposit = fields.Nested(ProjectDepositSchema, required=True)
_cds = fields.Nested(_CDSSSchema, required=True)
title = fields.Nested(TitleSchema, required=True)
description = fields.Str()
description = SanitizedHTML()
category = fields.Str(required=True)
type = fields.Str(required=True)
note = fields.Str()
Expand Down
4 changes: 2 additions & 2 deletions cds/modules/records/serializers/schemas/video.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

from invenio_jsonschemas import current_jsonschemas
from marshmallow import Schema, fields, pre_load, post_load

from marshmallow_utils.fields import SanitizedHTML
from ....deposit.api import Video
from ..fields.datetime import DateString
from .common import (
Expand Down Expand Up @@ -126,7 +126,7 @@ class VideoSchema(StrictKeysSchema):
contributors = fields.Nested(ContributorSchema, many=True, required=True)
copyright = fields.Nested(CopyrightSchema)
date = DateString(required=True)
description = fields.Str(required=True)
description = SanitizedHTML(required=True)
doi = DOI()
duration = fields.Str()
external_system_identifiers = fields.Nested(
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ lxml_html_clean==0.4.1
Mako==1.3.8
MarkupSafe==3.0.2
marshmallow==3.23.1
marshmallow-utils==0.13.0
matplotlib-inline==0.1.7
maxminddb==2.6.2
maxminddb-geolite2==2018.703
Expand Down
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ install_requires =
invenio-sequencegenerator==1.0.0a3
requests-toolbelt>=1.0.0,<2.0.0
python-ldap>=3.4.0,<3.5.0
marshmallow-utils>=0.13.0,<1.0.0

[options.extras_require]
tests =
Expand Down
59 changes: 59 additions & 0 deletions tests/unit/test_serializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@

from cds.modules.deposit.api import Video
from cds.modules.records.serializers.drupal import VideoDrupal
from cds.modules.records.serializers.json import CDSJSONSerializer
from cds.modules.records.api import CDSRecord
from unittest.mock import Mock
from cds.modules.records.serializers.smil import Smil
from cds.modules.records.serializers.vtt import VTT

Expand Down Expand Up @@ -149,3 +152,59 @@ def test_drupal_serializer(video_record_metadata, deposit_metadata):
data = serializer.format()["entries"][0]["entry"]
data = {k: data[k] for k in data if k in expected}
assert data == expected


def test_cds_json_serializer_sanitization(video_record_metadata):
    """Check that CDSJSONSerializer sanitizes HTML in record metadata."""
    record = CDSRecord.create(video_record_metadata)

    # Inject markup (script tags plus formatting tags) into every field
    # that the serializer is expected to clean.
    record['description'] = '<script>alert("xss")</script>Safe content <b>bold</b>'
    record['title']['title'] = 'Test <script>alert("title")</script> Title <b>bold</b>'
    english = {
        'language': 'en',
        'description': '<script>alert("desc")</script>Translated <i>italic</i>',
        'title': {'title': '<b>Translated</b> <script>alert("title")</script> Title'},
    }
    french = {
        'language': 'fr',
        'description': 'Bonjour <script>alert("desc")</script> <u>underline</u>',
        'title': {'title': '<script>alert("bad")</script> Titre'},
    }
    record['translations'] = [english, french]

    # preprocess_record takes a PID argument; a Mock stands in for it.
    pid = Mock()
    pid.pid_value = '1'

    serializer = CDSJSONSerializer()
    metadata = serializer.preprocess_record(pid, record)['metadata']

    # Description: script tags are gone, text and <b> markup survive.
    description = metadata['description']
    for script_tag in ('<script>', '</script>'):
        assert script_tag not in description
    assert 'Safe content' in description
    assert '<b>bold</b>' in description

    # Title: no markup at all is kept, only the text.
    title = metadata['title']['title']
    for script_tag in ('<script>', '</script>'):
        assert script_tag not in title
    assert 'Test' in title and 'Title' in title
    assert '<b>' not in title

    # Translations get the same treatment as the top-level fields.
    for translation in metadata['translations']:
        assert '<script>' not in translation['description']
        translated_title = translation['title']['title']
        assert '<script>' not in translated_title
        assert '<b>' not in translated_title