// This file is part of go-trafilatura, Go package for extracting readable
// content, comments and metadata from a web page. Source available in
// <https://github.com/markusmobius/go-trafilatura>.
//
// Copyright (C) 2021 Markus Mobius
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code in this file is ported from <https://github.com/adbar/trafilatura>,
// which is available under the Apache 2.0 license.

package trafilatura

import (
	"fmt"
	"unicode/utf8"

	"github.com/go-shiori/dom"
	"github.com/go-shiori/go-readability"
	distiller "github.com/markusmobius/go-domdistiller"
	"github.com/markusmobius/go-trafilatura/internal/etree"
	"github.com/markusmobius/go-trafilatura/internal/selector"
	"golang.org/x/net/html"
)

type _FallbackGenerator func() (string, *html.Node)

var tagsToSanitize = sliceToMap(
	"aside", "audio", "button", "fieldset", "figure", "footer", "iframe",
	"input", "label", "link", "nav", "noindex", "noscript",
	"object", "option", "select", "source", "svg", "time",
)

// compareExternalExtraction decides whether to use our own or an external extraction based on
// a series of heuristics. The original Trafilatura uses python-readability and justext,
// while here we use go-readability and go-domdistiller. Since there are differences in
// implementation between them, we do it a bit differently compared to the original code.
//
// In the original Trafilatura, this function is named `compare_extraction`.
func compareExternalExtraction(originalDoc, extractedDoc *html.Node, opts Options) (*html.Node, string) {
	// Bypass the comparison when favoring recall and the extraction is already large enough
	extractedText := trim(etree.IterText(extractedDoc, " "))
	lenExtracted := utf8.RuneCountInString(extractedText)
	if opts.Focus == FavorRecall && lenExtracted > opts.Config.MinExtractedSize*10 {
		return extractedDoc, extractedText
	}

	// Convert URL to string for logging
	var originalUrl string
	if opts.OriginalURL != nil {
		originalUrl = opts.OriginalURL.String()
	}

	logInfo(opts, "trying external extractor for url %q", originalUrl)

	// Prior cleaning
	cleanedDoc := dom.Clone(originalDoc, true)
	if opts.Focus == FavorPrecision {
		cleanedDoc = pruneUnwantedNodes(cleanedDoc, selector.OverallDiscardedContent)
	}

	// Process each candidate
	for _, generator := range createFallbackGenerators(cleanedDoc, opts) {
		// Generate candidate, skip if empty
		candidateTitle, candidateDoc := generator()
		if candidateDoc == nil {
			continue
		}

		// Extract text from candidate
		candidateText := trim(etree.IterText(candidateDoc, " "))
		lenCandidate := utf8.RuneCountInString(candidateText)
		logInfo(opts, "comparison for %q: candidate %d vs extracted %d",
			candidateTitle, lenCandidate, lenExtracted)

		// Check if candidate is usable
		if candidateIsUsable(candidateDoc, extractedDoc, lenCandidate, lenExtracted, opts) {
			extractedDoc, lenExtracted = candidateDoc, lenCandidate
			logDebug(opts, "candidate %s is usable", candidateTitle)
		}

		if lenExtracted >= opts.Config.MinExtractedSize {
			logDebug(opts, "candidate %s is used", candidateTitle)
			break
		}
	}

	// Final cleaning
	sanitizeTree(extractedDoc, opts)
	extractedText = trim(etree.IterText(extractedDoc, " "))
	return extractedDoc, extractedText
}
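
// A minimal usage sketch (illustrative only, not part of the original file): assuming
// `page` is the parsed input document and `primary` is the node produced by the main
// extraction step, the comparison above could be driven roughly like this:
//
//	opts := Options{Focus: FavorRecall, Config: DefaultConfig()}
//	bestDoc, bestText := compareExternalExtraction(page, primary, opts)
//	_ = bestDoc
//	_ = bestText
//
// Here `page`, `primary`, and the use of a DefaultConfig helper as the config source
// are assumptions made for this sketch.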

// createFallbackGenerators prepares the list of fallback candidate generators: first the
// user-specified custom candidates, then Readability, and finally DOM Distiller.
func createFallbackGenerators(doc *html.Node, opts Options) []_FallbackGenerator {
	// Initial variables
	var generators []_FallbackGenerator
	var customCandidates []*html.Node
	var readabilityCandidate, distillerCandidate *html.Node

	if opts.FallbackCandidates != nil {
		customCandidates = opts.FallbackCandidates.Others
		distillerCandidate = opts.FallbackCandidates.Distiller
		readabilityCandidate = opts.FallbackCandidates.Readability
	}

	// First are the user-specified custom candidates.
	for i, candidate := range customCandidates {
		if candidate == nil {
			continue
		}

		generators = append(generators, func() (string, *html.Node) {
			return fmt.Sprintf("Candidate-%d", i), candidate
		})
	}

	// Next is Readability
	readabilityTitle := "Readability"
	if readabilityCandidate != nil {
		generators = append(generators, func() (string, *html.Node) {
			return readabilityTitle, readabilityCandidate
		})
	} else {
		generators = append(generators, func() (string, *html.Node) {
			result, _ := readability.FromDocument(doc, opts.OriginalURL)
			return readabilityTitle, result.Node
		})
	}

	// Last is DOM Distiller
	distillerTitle := "Dom Distiller"
	if distillerCandidate != nil {
		generators = append(generators, func() (string, *html.Node) {
			return distillerTitle, distillerCandidate
		})
	} else {
		generators = append(generators, func() (string, *html.Node) {
			clone := dom.Clone(doc, true)
			result, _ := distiller.Apply(clone, &distiller.Options{
				OriginalURL:    opts.OriginalURL,
				SkipPagination: true,
			})
			if result == nil {
				return "", nil
			}

			return distillerTitle, result.Node
		})
	}

	return generators
}
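
// Illustrative sketch (assumptions only): when the caller populates opts.FallbackCandidates,
// the generators above reuse the pre-computed nodes instead of re-running Readability and
// DOM Distiller. The names readabilityNode, distillerNode, and customNode are hypothetical,
// and the struct name FallbackCandidates is inferred from the option field used above and
// may differ in the actual package:
//
//	opts.FallbackCandidates = &FallbackCandidates{
//		Readability: readabilityNode,
//		Distiller:   distillerNode,
//		Others:      []*html.Node{customNode},
//	}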

// candidateIsUsable checks whether the fallback candidate is good enough to use as the extraction result.
func candidateIsUsable(candidateDoc, extractedDoc *html.Node, lenCandidate, lenExtracted int, opts Options) bool {
	var candidateUsable bool

	if lenCandidate == 0 || lenCandidate == lenExtracted {
		candidateUsable = false
	} else if lenExtracted == 0 && lenCandidate > 0 {
		candidateUsable = true
	} else if lenExtracted > 2*lenCandidate {
		candidateUsable = false
	} else if lenCandidate > 2*lenExtracted {
		candidateUsable = true
	} else {
		// Borderline case
		extractedHeads := dom.GetElementsByTagName(extractedDoc, "head")
		extractedTables := dom.GetElementsByTagName(extractedDoc, "table")
		extractedParagraphs := dom.GetElementsByTagName(extractedDoc, "p")
		candidateHeadings := dom.QuerySelectorAll(candidateDoc, "h2,h3,h4")

		var pTextLength int
		for _, p := range extractedParagraphs {
			pText := trim(etree.IterText(p, " "))
			pTextLength += utf8.RuneCountInString(pText)
		}

		if pTextLength == 0 && lenCandidate > opts.Config.MinExtractedSize*2 {
			candidateUsable = true
		} else if len(extractedTables) > len(extractedParagraphs) && lenCandidate > opts.Config.MinExtractedSize*2 {
			candidateUsable = true
		} else if opts.Focus == FavorRecall && len(extractedHeads) == 0 &&
			len(candidateHeadings) > 0 && lenCandidate > lenExtracted {
			candidateUsable = true
		} else {
			candidateUsable = false
		}
	}

	mustFavorRecall := lenExtracted < opts.Config.MinExtractedSize && opts.Focus == FavorRecall
	return candidateUsable || mustFavorRecall
}
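
// Worked example of the heuristics above: with lenExtracted = 300 and lenCandidate = 700,
// the candidate wins outright because 700 > 2*300. With lenExtracted = 500 and
// lenCandidate = 600, neither 2x rule triggers, so the borderline checks (no paragraph
// text, more tables than paragraphs, or missing headings under FavorRecall) decide.
// Finally, if lenExtracted is below MinExtractedSize and Focus is FavorRecall, the
// candidate is accepted regardless.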

// sanitizeTree converts and sanitizes the output from the generic
// fallback algorithm (post-processing).
func sanitizeTree(tree *html.Node, opts Options) {
	// 1. Clean
	docCleaning(tree, opts)

	subElements := dom.GetElementsByTagName(tree, "*")
	for i := len(subElements) - 1; i >= 0; i-- {
		elemTag := dom.TagName(subElements[i])
		if _, exist := tagsToSanitize[elemTag]; exist {
			subElements[i].Parent.RemoveChild(subElements[i])
		}
	}

	if !opts.IncludeLinks {
		etree.StripTags(tree, "a")
	}

	etree.StripTags(tree, "span")

	// 2. Sanitize
	var sanitizationList []string
	uniqueTags := make(map[string]struct{})
	for _, node := range dom.GetElementsByTagName(tree, "*") {
		tagName := dom.TagName(node)
		if _, exist := uniqueTags[tagName]; exist {
			continue
		}

		uniqueTags[tagName] = struct{}{}
		if _, exist := validTagCatalog[tagName]; !exist {
			sanitizationList = append(sanitizationList, tagName)
		}
	}

	if len(sanitizationList) > 0 {
		etree.StripTags(tree, sanitizationList...)
	}
}