summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Dan Davison <dandavison7@gmail.com> 2022-07-02 16:30:01 -0400
committer: Dan Davison <dandavison7@gmail.com> 2022-07-02 17:07:29 -0400
commit: ab8015e4f7168c30256503aa9f26548bf0cbfe93 (patch)
tree: befc383f6da8f619a620649f43fac2bfc694d268
parent: 5f8ac5f71720171c2237f0f61df70006cc87ac08 (diff)
Don't attempt to process ANSI sequences in non-UTF8 input
Fixes #677
-rw-r--r-- src/delta.rs 20
-rw-r--r-- src/utils/mod.rs 1
-rw-r--r-- src/utils/round_char_boundary.rs 24
3 files changed, 43 insertions, 2 deletions
diff --git a/src/delta.rs b/src/delta.rs
index 27b30512..5cda5266 100644
--- a/src/delta.rs
+++ b/src/delta.rs
@@ -13,6 +13,7 @@ use crate::handlers::hunk_header::ParsedHunkHeader;
use crate::handlers::{self, merge_conflict};
use crate::paint::Painter;
use crate::style::DecorationStyle;
+use crate::utils;
#[derive(Clone, Debug, PartialEq)]
pub enum State {
@@ -181,10 +182,25 @@ impl<'a> StateMachine<'a> {
}
fn ingest_line(&mut self, raw_line_bytes: &[u8]) {
- // TODO: retain raw_line as Cow
- self.raw_line = String::from_utf8_lossy(raw_line_bytes).to_string();
+ match String::from_utf8(raw_line_bytes.to_vec()) {
+ Ok(utf8) => self.ingest_line_utf8(utf8), // valid UTF-8: continue in ingest_line_utf8
+ Err(_) => { // invalid UTF-8: convert lossily, truncate, and store directly; ingest_line_utf8 is not called on this path
+ let raw_line = String::from_utf8_lossy(raw_line_bytes);
+ let truncated_len = utils::round_char_boundary::floor_char_boundary( // byte length <= max_line_length that ends on a char boundary
+ &raw_line,
+ self.config.max_line_length,
+ );
+ self.raw_line = raw_line[..truncated_len].to_string(); // slicing is safe: truncated_len is a char boundary of raw_line
+ self.line = self.raw_line.clone(); // on this path `line` is simply the truncated raw line
+ }
+ }
+ }
+
+ fn ingest_line_utf8(&mut self, raw_line: String) {
+ self.raw_line = raw_line;
// When a file has \r\n line endings, git sometimes adds ANSI escape sequences between the
// \r and \n, in which case byte_lines does not remove the \r. Remove it now.
+ // TODO: Limit the number of characters we examine when looking for the \r?
if let Some(cr_index) = self.raw_line.rfind('\r') {
if ansi::strip_ansi_codes(&self.raw_line[cr_index + 1..]).is_empty() {
self.raw_line = format!(
diff --git a/src/utils/mod.rs b/src/utils/mod.rs
index 20adef24..9cd8cfaa 100644
--- a/src/utils/mod.rs
+++ b/src/utils/mod.rs
@@ -3,4 +3,5 @@ pub mod bat;
pub mod path;
pub mod process;
pub mod regex_replacement;
+pub mod round_char_boundary;
pub mod syntect;
diff --git a/src/utils/round_char_boundary.rs b/src/utils/round_char_boundary.rs
new file mode 100644
index 00000000..c000134d
--- /dev/null
+++ b/src/utils/round_char_boundary.rs
@@ -0,0 +1,24 @@
+// Taken from https://github.com/rust-lang/rust/pull/86497
+// TODO: Remove when this is in the version of the Rust standard library that delta is building
+// against.
+
+#[inline]
+const fn is_utf8_char_boundary(b: u8) -> bool {
+ // True unless `b` is a UTF-8 continuation byte (0x80..=0xBF). This is bit magic equivalent to: b < 128 || b >= 192
+ (b as i8) >= -0x40
+}
+
+#[inline]
+pub fn floor_char_boundary(s: &str, index: usize) -> usize { // returns the largest char-boundary byte index of `s` that is <= `index`, capped at s.len()
+ if index >= s.len() {
+ s.len()
+ } else {
+ let lower_bound = index.saturating_sub(3); // a UTF-8 char is at most 4 bytes, so only `index` and the 3 bytes before it need checking
+ let new_index = s.as_bytes()[lower_bound..=index]
+ .iter()
+ .rposition(|b| is_utf8_char_boundary(*b)); // offset, within the window, of the last byte that starts a char
+
+ // SAFETY: we know that the character boundary will be within four bytes: `s` is valid UTF-8, so the window contains a char-start byte (byte 0 is always one when lower_bound == 0) and rposition cannot return None
+ unsafe { lower_bound + new_index.unwrap_unchecked() }
+ }
+}