Check more bytes when detecting encoding

I've observed PDF files whose headers were large enough that the files were detected as text, which wasn't helpful. Also improve logging to report how many invalid characters were found.
author: Wilfred Hughes <me@wilfred.me.uk> 2023-07-21 08:34:41 -0700
committer: Wilfred Hughes <me@wilfred.me.uk> 2023-07-21 08:34:41 -0700
commit: 4e9637c861673b15f0ed941963bf29d152509f8a (patch)
tree: 5ed64cd92d44f0f6a02185462ecf6e6e2d9a211e
parent: 1c0b3153dfdbff653e5edd18fb3ac72deb3f39ee (diff)
2 files changed, 14 insertions, 4 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 347bd6560..83aedcd67 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,10 @@ Updated grammar for Java.
 
 Improved parsing of qualified constructors in Haskell.
 
+Difftastic is now stricter about valid UTF-8 and UTF-16, considering
+more of the file's bytes during filetype detection. This fixes cases
+where e.g. PDF was sometimes incorrectly considered as UTF-8.
+
 ### Diffing
 
 Improved handling of delimiters ("nested sliders") in languages that
diff --git a/src/files.rs b/src/files.rs
index 71180b430..1cad43540 100644
--- a/src/files.rs
+++ b/src/files.rs
@@ -193,11 +193,14 @@ pub fn guess_content(bytes: &[u8]) -> ProbableFileKind {
     let utf8_string = String::from_utf8_lossy(bytes).to_string();
     let num_utf8_invalid = utf8_string
         .chars()
-        .take(1000)
+        .take(5000)
         .filter(|c| *c == std::char::REPLACEMENT_CHARACTER)
         .count();
     if num_utf8_invalid <= 10 {
-        info!("Input file is mostly valid UTF-8");
+        info!(
+            "Input file is mostly valid UTF-8 (invalid characters: {})",
+            num_utf8_invalid
+        );
         return ProbableFileKind::Text(utf8_string);
     }
 
@@ -206,11 +209,14 @@ pub fn guess_content(bytes: &[u8]) -> ProbableFileKind {
     let utf16_string = String::from_utf16_lossy(&u16_values);
     let num_utf16_invalid = utf16_string
         .chars()
-        .take(1000)
+        .take(5000)
         .filter(|c| *c == std::char::REPLACEMENT_CHARACTER)
         .count();
     if num_utf16_invalid <= 5 {
-        info!("Input file is mostly valid UTF-16");
+        info!(
+            "Input file is mostly valid UTF-16 (invalid characters: {})",
+            num_utf16_invalid
+        );
         return ProbableFileKind::Text(utf16_string);
     }
author	Wilfred Hughes <me@wilfred.me.uk>	2023-07-21 08:34:41 -0700
committer	Wilfred Hughes <me@wilfred.me.uk>	2023-07-21 08:34:41 -0700
commit	4e9637c861673b15f0ed941963bf29d152509f8a (patch)
tree	5ed64cd92d44f0f6a02185462ecf6e6e2d9a211e
parent	1c0b3153dfdbff653e5edd18fb3ac72deb3f39ee (diff)