-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathcore-options.go
199 lines (160 loc) · 7.18 KB
/
core-options.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
// This file is part of go-trafilatura, Go package for extracting readable
// content, comments and metadata from a web page. Source available in
// <https://github.com/markusmobius/go-trafilatura>.
//
// Copyright (C) 2021 Markus Mobius
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code in this file is ported from <https://github.com/adbar/trafilatura>,
// which is available under the Apache 2.0 license.
package trafilatura
import (
nurl "net/url"
"github.com/markusmobius/go-htmldate"
"golang.org/x/net/html"
)
// ExtractionFocus specify the focus of extraction.
type ExtractionFocus uint8
const (
// Balanced is the middle ground.
Balanced ExtractionFocus = iota
// FavorRecall makes the extractor extracts more text, even when unsure.
FavorRecall
// FavorPrecision makes the extractor extracts less text, but usually more precise.
FavorPrecision
)
// HtmlDateMode specify the mode of publish date extractor using HtmlDate package.
type HtmlDateMode uint8
const (
// In Default mode, HtmlDate will be run based on whether fallback is enabled or not.
// If fallback is enabled, HtmlDate will be run on `Extensive` mode. If fallback is
// disabled, HtmlDate will be run on `Fast` mode.
Default HtmlDateMode = iota
// In Fast mode, publish date will be extracted from entire document by using HtmlDate,
// but without using external DateParser package. Thank to this the date extraction is
// quite fast, but it can't detect string in non English language.
Fast
// In Extensive mode, publish date will be extracted from entire document by using
// HtmlDate, utilizing the external DateParser package. Thank to this the date
// extraction is pretty accurate and can detect foreign language, but it use a lot
// of RegEx which is slow in Go.
Extensive
// If Disabled, publish date will only extracted from metadata and not scanned from
// the entire document. Thanks to this content extraction will be fast, but the
// publish date might be missing or inaccurate. Use it if you only care about the
// content and not the publish date.
Disabled
)
// Options is the configuration for the extractor.
type Options struct {
// Config is the advanced configuration used to fine-tune the
// extraction result. Keep it nil to use the default config.
Config *Config
// OriginalURL is the original URL of the page. It might be overwritten by the
// URL found in the page metadata.
OriginalURL *nurl.URL
// TargetLanguage is an ISO 639-1 language code. When set, the extractor only
// processes web pages that use the specified language.
TargetLanguage string
// If EnableFallback is true, then whenever Trafilatura fails to extract a document,
// it will use algorithms from other packages, i.e. Readability and Dom Distiller.
// This makes the extraction result more precise, but also a bit slower.
EnableFallback bool
// FallbackCandidates is the user-specified candidates that will be checked by
// Trafilatura when EnableFallback is set to true. This is useful if the user has
// already run Readability and Dom Distiller before, or wants to provide their own
// candidates. As mentioned above, it is only used if `EnableFallback = true`.
FallbackCandidates *FallbackCandidates
// Focus specifies the extraction behavior of Trafilatura.
Focus ExtractionFocus
// ExcludeComments specifies whether to exclude comments from the extraction result.
ExcludeComments bool
// ExcludeTables specifies whether to exclude information within HTML <table> elements.
ExcludeTables bool
// IncludeImages specifies whether the extraction result will include images (experimental).
IncludeImages bool
// IncludeLinks specifies whether the extraction result will include links along with
// their targets (experimental).
IncludeLinks bool
// BlacklistedAuthors is the list of author names to be excluded from the extraction result.
BlacklistedAuthors []string
// Deduplicate specifies whether to remove duplicate segments and sections.
Deduplicate bool
// HasEssentialMetadata makes the extractor keep only documents featuring all essential
// metadata (date, title, url).
HasEssentialMetadata bool
// MaxTreeSize specifies the maximum number of elements inside a document.
// Documents that surpass this value will be discarded.
MaxTreeSize int
// EnableLog specifies whether logging should be enabled or not.
EnableLog bool
// HtmlDateMode specifies the behaviour of the external HtmlDate package that is used
// to extract the publish date from a web page.
HtmlDateMode HtmlDateMode
// HtmlDateOptions is the user-provided configuration for the external `go-htmldate`
// package that is used to look for the publish date of a web page. If this property
// is specified, `HtmlDateMode` will be ignored.
HtmlDateOptions *htmldate.Options
// HtmlDateOverride is a user-provided date that was already extracted with the
// `go-htmldate` package. If this property is specified, HtmlDate won't be run and
// this property is used as its result instead. In other words, `HtmlDateMode` and
// `HtmlDateOptions` will be ignored.
HtmlDateOverride *htmldate.Result
// PruneSelector is the CSS selector used to select the nodes to be pruned before extraction.
PruneSelector string
}
// Config holds the advanced settings used to fine-tune the extraction result.
// It can be used to specify the minimal size of the extracted content and how
// many duplicate texts are allowed. For most use cases the default config
// (see DefaultConfig) should be good enough.
type Config struct {
	// Deduplication settings.
	CacheSize             int
	MaxDuplicateCount     int
	MinDuplicateCheckSize int

	// Extraction size settings.
	MinExtractedSize        int
	MinExtractedCommentSize int
	MinOutputSize           int
	MinOutputCommentSize    int
}

// DefaultConfig returns a newly allocated Config populated with the default
// values, so callers may modify the result freely.
func DefaultConfig() *Config {
	cfg := new(Config)

	// Deduplication defaults.
	cfg.CacheSize = 4096
	cfg.MaxDuplicateCount = 2
	cfg.MinDuplicateCheckSize = 100

	// Extraction size defaults.
	cfg.MinExtractedSize = 250
	cfg.MinExtractedCommentSize = 1
	cfg.MinOutputSize = 1
	cfg.MinOutputCommentSize = 1

	return cfg
}
// FallbackCandidates allows the user to specify a list of fallback candidates,
// in particular the results of Readability and Dom Distiller.
type FallbackCandidates struct {
// Readability is the user-specified extraction result from Go-Readability
// that will be used as a fallback candidate.
Readability *html.Node
// Distiller is the user-specified extraction result from Go-DomDistiller
// that will be used as a fallback candidate.
Distiller *html.Node
// Others is the list of user-specified extraction results that will be used as
// candidates, generated manually by the user using methods other than
// Go-Readability and Go-DomDistiller.
//
// This list will be prioritized before Readability and Distiller.
//
// Make sure not to put the output of Go-Readability and Go-DomDistiller here, to
// prevent those two extractors from running twice.
Others []*html.Node
}