Skip to content

Commit 99eaa8a

Browse files
Modify default encoding detector
Replace HtmlEncodingDetector with StandardHtmlEncodingDetector. Adjust some test cases.
1 parent 0e7b475 commit 99eaa8a

File tree

3 files changed

+5
-3
lines changed

3 files changed

+5
-3
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
org.apache.tika.parser.html.charsetdetector.StandardHtmlEncodingDetector

tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
import org.apache.tika.parser.Parser;
4444
import org.apache.tika.parser.ParserDecorator;
4545
import org.apache.tika.parser.html.HtmlEncodingDetector;
46+
import org.apache.tika.parser.html.charsetdetector.StandardHtmlEncodingDetector;
4647
import org.apache.tika.parser.txt.Icu4jEncodingDetector;
4748
import org.apache.tika.parser.txt.TXTParser;
4849
import org.apache.tika.parser.txt.UniversalEncodingDetector;
@@ -56,7 +57,7 @@ public void testDefault() {
5657
assertTrue(detector instanceof CompositeEncodingDetector);
5758
List<EncodingDetector> detectors = ((CompositeEncodingDetector) detector).getDetectors();
5859
assertEquals(3, detectors.size());
59-
assertTrue(detectors.get(0) instanceof HtmlEncodingDetector);
60+
assertTrue(detectors.get(0) instanceof StandardHtmlEncodingDetector);
6061
assertTrue(detectors.get(1) instanceof UniversalEncodingDetector);
6162
assertTrue(detectors.get(2) instanceof Icu4jEncodingDetector);
6263
}

tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/resources/org/apache/tika/config/TIKA-2273-exclude-encoding-detector-default.xml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,9 @@
1919
<!-- Explicitly request default parsers -->
2020
<parsers/>
2121
<encodingDetectors>
22-
<!-- All detectors except HtmlEncodingDetector -->
22+
<!-- All detectors except StandardHtmlEncodingDetector -->
2323
<encodingDetector class="org.apache.tika.detect.DefaultEncodingDetector">
24-
<encodingDetector-exclude class="org.apache.tika.parser.html.HtmlEncodingDetector"/>
24+
<encodingDetector-exclude class="org.apache.tika.parser.html.charsetdetector.StandardHtmlEncodingDetector"/>
2525
</encodingDetector>
2626
<!-- One other detector, to check ordering -->
2727
<encodingDetector class="org.apache.tika.detect.NonDetectingEncodingDetector"/>

0 commit comments

Comments
 (0)