Skip to content

Commit 7026706

Browse files
committed
Update Godot docs to include v4.2 and fix older version scraping
Godot 4.2 was released in November 2023. Since v3.5, the upstream Godot docs have used new HTML layouts and file structures. Because older versions still need to be scrapable, this commit introduces another set of versioned filters for Godot. It also makes small changes to the existing filters to handle the current website markup for the previous Godot doc versions. The @GDScript and @GlobalScope entries can't currently be browsed because of an encoding mismatch between the frontend and backend; I've identified a possible fix for that but will submit it as a separate PR.
1 parent 62e2723 commit 7026706

File tree

6 files changed

+126
-28
lines changed

6 files changed

+126
-28
lines changed

lib/docs/filters/godot/clean_html.rb

+20-3
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,13 @@ def call
66
at_css('h1').content = 'Godot Engine'
77
at_css('.admonition.note').remove
88
end
9+
css('.admonition-grid').remove
910

10-
css('ul[id].simple li:first-child:last-child').each do |node|
11+
css('p[id]').each do |node|
1112
heading = Nokogiri::XML::Node.new 'h3', doc.document
12-
heading['id'] = node.parent['id']
13+
heading['id'] = node['id']
1314
heading.children = node.children
14-
node.parent.before(heading).remove
15+
node.before(heading).remove
1516
end
1617

1718
css('h3 strong').each do |node|
@@ -20,6 +21,22 @@ def call
2021

2122
css('a.reference').remove_attr('class')
2223

24+
# flatten gdscript+C# example blocks and add language name.
25+
css('div[role="tabpanel"]').each do |node|
26+
language_label = Nokogiri::XML::Node.new 'strong', doc.document
27+
language_name = 'GDScript' if node.at_css('div.highlight-gdscript')
28+
language_name = 'C#' if node.at_css('div.highlight-csharp')
29+
language_label.content = language_name.to_s
30+
31+
node.before(language_label)
32+
node.before(node.children).remove
33+
end
34+
35+
css('div.sphinx-tabs [role="tablist"]').remove
36+
37+
# remove the remotely hosted "percent-translated" badge
38+
css('a[href^="https://hosted.weblate"]').remove if root_page?
39+
2340
doc
2441
end
2542
end

lib/docs/filters/godot/clean_html_v2.rb

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ class CleanHtmlV2Filter < Filter
44
def call
55
if root_page?
66
at_css('h1').content = 'Godot Engine'
7-
at_css('.admonition.tip').remove
7+
at_css('.admonition.caution').remove
88
end
99

1010
css('ul[id].simple li:first-child:last-child').each do |node|
lib/docs/filters/godot/clean_html_v3.rb

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
module Docs
2+
class Godot
3+
class CleanHtmlV3Filter < Filter
4+
def call
5+
if root_page?
6+
at_css('h1').content = 'Godot Engine'
7+
at_css('.admonition.caution').remove
8+
end
9+
10+
css('ul[id].simple li:first-child:last-child').each do |node|
11+
heading = Nokogiri::XML::Node.new 'h3', doc.document
12+
heading['id'] = node.parent['id']
13+
heading.children = node.children
14+
node.parent.before(heading).remove
15+
end
16+
17+
css('h3 strong').each do |node|
18+
node.before(node.children).remove
19+
end
20+
21+
css('a.reference').remove_attr('class')
22+
23+
doc
24+
end
25+
end
26+
end
27+
end

lib/docs/filters/godot/entries.rb

+4-2
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ def get_type
1111
if slug.start_with?('getting_started')
1212
# Getting started sections are different even between different minor
1313
# versions from v3 so we're programmatically generating them instead.
14-
"Getting started: " + slug.split('/')[1].tr_s('_', ' ').capitalize
14+
'Getting started: ' + slug.split('/')[1].tr_s('_', ' ').capitalize
1515
else
1616
name
1717
end
@@ -20,9 +20,10 @@ def get_type
2020
def additional_entries
2121
return [] unless slug.start_with?('classes')
2222

23-
css('.simple[id]').each_with_object [] do |node, entries|
23+
css('p[id]').each_with_object [] do |node, entries|
2424
name = node.at_css('strong').content
2525
next if name == self.name
26+
2627
name.prepend "#{self.name}."
2728
name << '()'
2829
entries << [name, node['id']] unless entries.any? { |entry| entry[0] == name }
@@ -32,6 +33,7 @@ def additional_entries
3233
def include_default_entry?
3334
return false if subpath.start_with?('getting_started') && subpath.end_with?('index.html')
3435
return false if subpath == 'classes/index.html'
36+
3537
true
3638
end
3739
end

lib/docs/filters/godot/entries_v3.rb

+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
module Docs
2+
class Godot
3+
class EntriesV3Filter < Docs::EntriesFilter
4+
def get_name
5+
name = at_css('h1').content
6+
name.remove! "\u{00B6}" # Remove the pilcrow
7+
name
8+
end
9+
10+
def get_type
11+
if slug.start_with?('getting_started')
12+
# Getting started sections are different even between different minor
13+
# versions from v3 so we're programmatically generating them instead.
14+
"Getting started: " + slug.split('/')[1].tr_s('_', ' ').capitalize
15+
else
16+
name
17+
end
18+
end
19+
20+
def additional_entries
21+
return [] unless slug.start_with?('classes')
22+
23+
css('.simple[id]').each_with_object [] do |node, entries|
24+
name = node.at_css('strong').content
25+
next if name == self.name
26+
name.prepend "#{self.name}."
27+
name << '()'
28+
entries << [name, node['id']] unless entries.any? { |entry| entry[0] == name }
29+
end
30+
end
31+
32+
def include_default_entry?
33+
return false if subpath.start_with?('getting_started') && subpath.end_with?('index.html')
34+
return false if subpath == 'classes/index.html'
35+
true
36+
end
37+
end
38+
end
39+
end

lib/docs/scrapers/godot.rb

+35-22
Original file line numberDiff line numberDiff line change
@@ -5,59 +5,72 @@ class Godot < UrlScraper
55
home: 'https://godotengine.org/',
66
code: 'https://github.com/godotengine/godot'
77
}
8+
# godot docs since 3.5 don't link everything from the index.
9+
self.initial_paths = %w[
10+
getting_started/introduction/index.html
11+
getting_started/step_by_step/index.html
12+
classes/index.html
13+
]
814

9-
options[:container] = '.document .section'
10-
15+
options[:container] = '.document > [itemprop="articleBody"]'
1116
options[:download_images] = false
12-
options[:only_patterns] = [/\Agetting_started\//, /\Aclasses\//]
17+
options[:only_patterns] = [%r{\Agetting_started/}, %r{\Aclasses/}]
18+
19+
options[:attribution] = <<-HTML
20+
&copy; 2014&ndash;present Juan Linietsky, Ariel Manzur and the Godot community<br>
21+
Licensed under the Creative Commons Attribution Unported License v3.0.
22+
HTML
1323

14-
options[:attribution] = ->(filter) do
15-
if filter.subpath.start_with?('classes')
16-
<<-HTML
17-
&copy; 2014&ndash;2022 Juan Linietsky, Ariel Manzur, Godot Engine contributors<br>
18-
Licensed under the MIT License.
19-
HTML
20-
else
21-
<<-HTML
22-
&copy; 2014&ndash;2022 Juan Linietsky, Ariel Manzur and the Godot community<br>
23-
Licensed under the Creative Commons Attribution Unported License v3.0.
24-
HTML
25-
end
24+
version '4.2' do
25+
self.release = '4.2.2'
26+
self.base_url = "https://docs.godotengine.org/en/#{self.version}/"
27+
html_filters.push 'godot/entries', 'godot/clean_html', 'sphinx/clean_html'
2628
end
2729

2830
version '3.5' do
29-
self.release = '3.5.1'
31+
self.release = '3.5.3'
3032
self.base_url = "https://docs.godotengine.org/en/#{self.version}/"
31-
options[:container] = '.document > [itemprop="articleBody"] > section[id]'
33+
34+
# godot 3.5 upstream docs are formatted like godot4
3235
html_filters.push 'godot/entries', 'godot/clean_html', 'sphinx/clean_html'
3336
end
3437

3538
version '3.4' do
3639
self.release = '3.4.5'
3740
self.base_url = "https://docs.godotengine.org/en/#{self.version}/"
41+
3842
options[:container] = '.document > [itemprop="articleBody"] > section[id]'
39-
html_filters.push 'godot/entries', 'godot/clean_html', 'sphinx/clean_html'
43+
html_filters.push 'godot/entries_v3', 'godot/clean_html_v3', 'sphinx/clean_html'
4044
end
4145

4246
version '3.3' do
4347
self.release = '3.3.0'
4448
self.base_url = "https://docs.godotengine.org/en/#{self.version}/"
45-
html_filters.push 'godot/entries', 'godot/clean_html', 'sphinx/clean_html'
49+
self.initial_paths = %w[/index.html]
50+
51+
options[:only_patterns] = [%r{\Aclasses/}]
52+
options[:container] = '.document .section'
53+
html_filters.push 'godot/entries_v3', 'godot/clean_html_v3', 'sphinx/clean_html'
4654
end
4755

4856
version '3.2' do
4957
self.release = '3.2.3'
5058
self.base_url = "https://docs.godotengine.org/en/#{self.version}/"
51-
html_filters.push 'godot/entries', 'godot/clean_html', 'sphinx/clean_html'
59+
self.initial_paths = %w[/index.html]
60+
61+
options[:only_patterns] = [%r{\Aclasses/}]
62+
options[:container] = '.document .section'
63+
html_filters.push 'godot/entries_v3', 'godot/clean_html_v3', 'sphinx/clean_html'
5264
end
5365

5466
version '2.1' do
5567
self.release = '2.1.6'
5668
self.base_url = "https://docs.godotengine.org/en/#{self.version}/"
69+
self.initial_paths = %w[/index.html]
5770

5871
options[:skip] = %w(classes/class_@global\ scope.html)
59-
options[:only_patterns] = [/\Alearning\//, /\Aclasses\//]
60-
72+
options[:only_patterns] = [%r{\Alearning/}, %r{\Aclasses/}]
73+
options[:container] = '.document .section'
6174
html_filters.push 'godot/entries_v2', 'godot/clean_html_v2', 'sphinx/clean_html'
6275
end
6376

0 commit comments

Comments
 (0)