summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Wilfred Hughes <me@wilfred.me.uk> 2023-07-21 08:34:41 -0700
committer Wilfred Hughes <me@wilfred.me.uk> 2023-07-21 08:34:41 -0700
commit 4e9637c861673b15f0ed941963bf29d152509f8a (patch)
tree 5ed64cd92d44f0f6a02185462ecf6e6e2d9a211e
parent 1c0b3153dfdbff653e5edd18fb3ac72deb3f39ee (diff)
Check more bytes when detecting encoding
I've observed PDF files whose headers are large enough that the files were detected as text, which wasn't helpful. Also improve logging to report how many invalid characters were found.
-rw-r--r-- CHANGELOG.md 4
-rw-r--r-- src/files.rs 14
2 files changed, 14 insertions, 4 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 347bd6560..83aedcd67 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,10 @@ Updated grammar for Java.
Improved parsing of qualified constructors in Haskell.
+Difftastic is now stricter about valid UTF-8 and UTF-16, considering
+more of the file's bytes during filetype detection. This fixes cases
+where e.g. PDF was sometimes incorrectly considered as UTF-8.
+
### Diffing
Improved handling of delimiters ("nested sliders") in languages that
diff --git a/src/files.rs b/src/files.rs
index 71180b430..1cad43540 100644
--- a/src/files.rs
+++ b/src/files.rs
@@ -193,11 +193,14 @@ pub fn guess_content(bytes: &[u8]) -> ProbableFileKind {
let utf8_string = String::from_utf8_lossy(bytes).to_string();
let num_utf8_invalid = utf8_string
.chars()
- .take(1000)
+ .take(5000)
.filter(|c| *c == std::char::REPLACEMENT_CHARACTER)
.count();
if num_utf8_invalid <= 10 {
- info!("Input file is mostly valid UTF-8");
+ info!(
+ "Input file is mostly valid UTF-8 (invalid characters: {})",
+ num_utf8_invalid
+ );
return ProbableFileKind::Text(utf8_string);
}
@@ -206,11 +209,14 @@ pub fn guess_content(bytes: &[u8]) -> ProbableFileKind {
let utf16_string = String::from_utf16_lossy(&u16_values);
let num_utf16_invalid = utf16_string
.chars()
- .take(1000)
+ .take(5000)
.filter(|c| *c == std::char::REPLACEMENT_CHARACTER)
.count();
if num_utf16_invalid <= 5 {
- info!("Input file is mostly valid UTF-16");
+ info!(
+ "Input file is mostly valid UTF-16 (invalid characters: {})",
+ num_utf16_invalid
+ );
return ProbableFileKind::Text(utf16_string);
}