Skip to content

Commit d0f2dcf

Browse files
committed
Improve stripping HTML
The previous method emitted too little whitespace when stripping tags, so words on different lines or on either side of a paragraph border could be mashed together.
1 parent dbaf33e commit d0f2dcf

7 files changed

+49
-7
lines changed

Gemfile

+6
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,12 @@ gem "mission_control-jobs"
3636
# Reduces boot times through caching; required in config/boot.rb
3737
gem "bootsnap", require: false
3838

39+
# Decode HTML entities
40+
gem "htmlentities"
41+
42+
# Scrub HTML (this is already a dependency of Rails; listed here for completeness' sake)
43+
gem "loofah"
44+
3945
gem "fasp_base", github: "mastodon/fasp_ruby", glob: "fasp_base/*.gemspec"
4046
gem "fasp_data_sharing", github: "mastodon/fasp_ruby", glob: "fasp_data_sharing/*.gemspec"
4147

Gemfile.lock

+3
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@ GEM
137137
globalid (1.2.1)
138138
activesupport (>= 6.1)
139139
hashdiff (1.1.2)
140+
htmlentities (4.3.4)
140141
http-2 (1.0.2)
141142
httpx (1.4.1)
142143
http-2 (>= 1.0.0)
@@ -394,8 +395,10 @@ DEPENDENCIES
394395
faker
395396
fasp_base!
396397
fasp_data_sharing!
398+
htmlentities
397399
importmap-rails
398400
jbuilder
401+
loofah
399402
mission_control-jobs
400403
pg (~> 1.1)
401404
propshaft

app/models/actor.rb

+2-5
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ class Actor < ApplicationRecord
1919
presence: { if: :discoverable? },
2020
absence: { unless: :discoverable? }
2121

22+
before_validation :set_full_text
2223
after_save :remove_unindexable_content
2324

2425
scope :discoverable, -> { where(discoverable: true) }
@@ -49,7 +50,7 @@ def update_from_json(json_object)
4950

5051
def set_full_text
5152
self.full_text = if discoverable?
52-
[ name, username, stripped_summary ].compact.join(" ")
53+
[ name, username, summary ].compact.join(" ")
5354
else
5455
nil
5556
end
@@ -58,8 +59,4 @@ def set_full_text
5859
def remove_unindexable_content
5960
content_objects.destroy_all unless indexable?
6061
end
61-
62-
def stripped_summary
63-
Rails::Html::FullSanitizer.new.sanitize(summary) if summary.present?
64-
end
6562
end

app/models/concerns/full_text_searchable_concern.rb

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ module FullTextSearchableConcern
22
extend ActiveSupport::Concern
33

44
included do
5-
before_validation :set_full_text
5+
normalizes :full_text, with: StrippedHtmlNormalizer.new
66

77
scope :search, ->(term) {
88
where("to_tsvector(#{table_name}.pg_text_search_configuration, #{table_name}.full_text) @@ to_tsquery(?)", term)

app/models/content_object.rb

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
class ContentObject < ApplicationRecord
2+
include FullTextSearchableConcern
23
include LanguageTaggableConcern
34
include RankableConcern
45

@@ -39,7 +40,7 @@ def json_to_attributes(json_object)
3940
last_edited_at: json_object["updated"] || json_object["published"],
4041
sensitive: json_object["sensitive"],
4142
language: json_object["contentMap"]&.keys&.first || "en",
42-
full_text: Rails::HTML::FullSanitizer.new.sanitize(json_object["content"]),
43+
full_text: json_object["content"],
4344
shares: json_object.dig("shares", "totalItems") || 0,
4445
likes: json_object.dig("likes", "totalItems") || 0,
4546
hashtags: hashtags.map { |name| Hashtag.find_or_initialize_by(name:) },
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
class StrippedHtmlNormalizer
2+
def call(html)
3+
return "" unless html
4+
5+
coder = HTMLEntities.new
6+
html = coder.decode(html)
7+
8+
Loofah.fragment(html).to_text(encode_special_chars: false)
9+
end
10+
end
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
require "test_helper"
2+
3+
class StrippedHtmlNormalizerTest < ActiveSupport::TestCase
4+
setup do
5+
@normalizer = StrippedHtmlNormalizer.new
6+
end
7+
8+
test "#call returns input without HTML tags" do
9+
result = @normalizer.call("<span>test</span>")
10+
11+
assert_equal "test", result
12+
end
13+
14+
test "#call returns whitespace in place of stripped tags" do
15+
result = @normalizer.call("<p>Firstname</p><p>Lastname</p>")
16+
17+
assert_match /Firstname\s+Lastname/, result
18+
end
19+
20+
test "#call decodes html entities" do
21+
result = @normalizer.call("R&amp;D")
22+
23+
assert_equal "R&D", result
24+
end
25+
end

0 commit comments

Comments
 (0)