Skip to content

Commit b14cc7c

Browse files
committed
Make Windows-1252 detection stricter, consistent with other encodings
This should help with #841.
1 parent 43eb109 commit b14cc7c

File tree

2 files changed

+14
-2
lines changed

2 files changed

+14
-2
lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
## 0.65 (unreleased)
22

3+
### Parsing
4+
5+
File detection is now stricter with Windows-1252 (Latin 1) encoded
6+
text. Windows-1252 support was added in 0.63 and some binary files
7+
(e.g. Brotli-compressed files) were incorrectly treated as this
8+
encoding.
9+
310
## 0.64 (released 16th June 2025)
411

512
### Parsing

src/files.rs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -188,13 +188,14 @@ pub(crate) fn guess_content(bytes: &[u8]) -> ProbableFileKind {
188188
// to be valid UTF-16. Decoding these as UTF-16 leads to garbage
189189
// ("mojibake").
190190
//
191-
// To avoid this, we only try UTF-16 after we'vedone MIME type
191+
// To avoid this, we only try UTF-16 after we've done MIME type
192192
// checks for binary, and we conservatively require an explicit
193193
// byte order mark.
194194
let u16_values = u16_from_bytes(bytes);
195195
let utf16_str_result = String::from_utf16(&u16_values);
196196
match utf16_str_result {
197197
Ok(valid_utf16_string) if has_utf16_byte_order_mark(bytes) => {
198+
info!("Input file is valid UTF-16 with a byte order mark");
198199
return ProbableFileKind::Text(valid_utf16_string);
199200
}
200201
_ => {}
@@ -238,9 +239,13 @@ pub(crate) fn guess_content(bytes: &[u8]) -> ProbableFileKind {
238239
let num_null = utf16_string
239240
.chars()
240241
.take(5000)
241-
.filter(|c| *c == '\0')
242+
.filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
242243
.count();
243244
if num_null <= 1 {
245+
info!(
246+
"Input file is mostly valid Latin 1 (invalid characters: {})",
247+
num_null
248+
);
244249
return ProbableFileKind::Text(latin1_str.to_string());
245250
}
246251
}

0 commit comments

Comments
 (0)