diff options
author | Wilfred Hughes <me@wilfred.me.uk> | 2023-07-21 08:34:41 -0700 |
---|---|---|
committer | Wilfred Hughes <me@wilfred.me.uk> | 2023-07-21 08:34:41 -0700 |
commit | 4e9637c861673b15f0ed941963bf29d152509f8a (patch) | |
tree | 5ed64cd92d44f0f6a02185462ecf6e6e2d9a211e | |
parent | 1c0b3153dfdbff653e5edd18fb3ac72deb3f39ee (diff) |
Check more bytes when detecting encoding
I've observed PDF files that have sufficiently large headers that they
were detected as text, which wasn't helpful.
Also improve logging to report how many invalid bytes were found.
-rw-r--r-- | CHANGELOG.md | 4 | ||||
-rw-r--r-- | src/files.rs | 14 |
2 files changed, 14 insertions, 4 deletions
```diff
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 347bd6560..83aedcd67 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,10 @@ Updated grammar for Java.
 
 Improved parsing of qualified constructors in Haskell.
 
+Difftastic is now stricter about valid UTF-8 and UTF-16, considering
+more of the file's bytes during filetype detection. This fixes cases
+where e.g. PDF was sometimes incorrectly considered as UTF-8.
+
 ### Diffing
 
 Improved handling of delimiters ("nested sliders") in languages that
diff --git a/src/files.rs b/src/files.rs
index 71180b430..1cad43540 100644
--- a/src/files.rs
+++ b/src/files.rs
@@ -193,11 +193,14 @@ pub fn guess_content(bytes: &[u8]) -> ProbableFileKind {
     let utf8_string = String::from_utf8_lossy(bytes).to_string();
     let num_utf8_invalid = utf8_string
         .chars()
-        .take(1000)
+        .take(5000)
         .filter(|c| *c == std::char::REPLACEMENT_CHARACTER)
         .count();
     if num_utf8_invalid <= 10 {
-        info!("Input file is mostly valid UTF-8");
+        info!(
+            "Input file is mostly valid UTF-8 (invalid characters: {})",
+            num_utf8_invalid
+        );
         return ProbableFileKind::Text(utf8_string);
     }
 
@@ -206,11 +209,14 @@ pub fn guess_content(bytes: &[u8]) -> ProbableFileKind {
     let utf16_string = String::from_utf16_lossy(&u16_values);
     let num_utf16_invalid = utf16_string
         .chars()
-        .take(1000)
+        .take(5000)
         .filter(|c| *c == std::char::REPLACEMENT_CHARACTER)
         .count();
     if num_utf16_invalid <= 5 {
-        info!("Input file is mostly valid UTF-16");
+        info!(
+            "Input file is mostly valid UTF-16 (invalid characters: {})",
+            num_utf16_invalid
+        );
         return ProbableFileKind::Text(utf16_string);
     }
 
```