Improve word diffing heuristic and add another sample file

author: Wilfred Hughes <me@wilfred.me.uk> 2023-07-12 12:12:12 -0700
committer: Wilfred Hughes <me@wilfred.me.uk> 2023-07-12 12:12:32 -0700
commit: a814e01d229db717fbcd5c2b98a4daa247b07c02 (patch)
tree: 73beb2cc0d83ad2f29a00935c6eac6bd28a7f803
parent: 1d3b6836ef1ab4fd2200871c0f4a8c92583997f3 (diff)
4 files changed, 49 insertions, 16 deletions
diff --git a/sample_files/compare.expected b/sample_files/compare.expected
index bba2e0fc7..056ee8a80 100644
--- a/sample_files/compare.expected
+++ b/sample_files/compare.expected
@@ -158,7 +158,7 @@ sample_files/pascal_before.pascal sample_files/pascal_after.pascal
 dfea5599b7f5e180d0fafab326f612cc  -
 
 sample_files/perl_before.pl sample_files/perl_after.pl
-09034cdf9cc4853ba7527de6d633e9be  -
+62ed7685bdfad901d1087e8bad399d86  -
 
 sample_files/prefer_outer_before.el sample_files/prefer_outer_after.el
 de31a80dc8a06987aeff4aaa04ce3b87  -
@@ -199,6 +199,9 @@ sample_files/slow_before.rs sample_files/slow_after.rs
 sample_files/small_before.js sample_files/small_after.js
 b4300bfc0203acd8f2603b504b859dc8  -
 
+sample_files/string_subwords_before.el sample_files/string_subwords_after.el
+1154702ee8bc90407728871b94d12878  -
+
 sample_files/strings_before.el sample_files/strings_after.el
 adc1c8734906b83deff25b1567e46b56  -
 
diff --git a/sample_files/string_subwords_after.el b/sample_files/string_subwords_after.el
new file mode 100644
index 000000000..870f65428
--- /dev/null
+++ b/sample_files/string_subwords_after.el
@@ -0,0 +1,16 @@
+(format "%s: %s" (site-name) name)
+
+(defcustom deadgrep-max-buffers
+  4
+  "The maximum number of deadgrep results buffers.
+
+If the number of results buffers exceeds this value, deadgrep
+will kill results buffers. The least recently used buffers are
+killed first.
+
+To disable cleanup entirely, set this variable to nil."
+  :type '(choice
+          (number :tag "Maximum of buffers allowed")
+          (const :tag "Disable cleanup" nil))
+  :group 'deadgrep)
+
diff --git a/sample_files/string_subwords_before.el b/sample_files/string_subwords_before.el
new file mode 100644
index 000000000..940aa96ad
--- /dev/null
+++ b/sample_files/string_subwords_before.el
@@ -0,0 +1,13 @@
+(format "SoloWiki Viewing: %s" name)
+
+(defcustom deadgrep-max-buffers
+  4
+  "Deadgrep will kill the least recently used results buffer
+if there are more than this many.
+
+To disable cleanup entirely, set this variable to nil."
+  :type '(choice
+          (number :tag "Maximum of buffers allowed")
+          (const :tag "Disable cleanup" nil))
+  :group 'deadgrep)
+
diff --git a/src/parse/syntax.rs b/src/parse/syntax.rs
index a64d18764..f93992ad7 100644
--- a/src/parse/syntax.rs
+++ b/src/parse/syntax.rs
@@ -790,29 +790,30 @@ fn split_atom_words(
 /// Are there sufficient common words that we should only highlight
 /// individual changed words?
 fn has_common_words(word_diffs: &Vec<myers_diff::DiffResult<&&str>>) -> bool {
-    let mut word_count = 0;
+    let mut novel_count = 0;
+    let mut unchanged_count = 0;
+
     for word_diff in word_diffs {
         match word_diff {
             myers_diff::DiffResult::Both(word, _) => {
-                // If we have at least one long word (i.e. not just
-                // punctuation), that's sufficient.
-                if word.len() > 2 {
-                    return true;
-                }
-
-                // If we have lots of common short words, not just the
-                // beginning/end comment delimiter, that qualifies
-                // too.
-                word_count += 1;
-                if word_count > 4 {
-                    return true;
+                if **word != " " {
+                    unchanged_count += 1;
                 }
             }
-            _ => {}
+            _ => {
+                novel_count += 1;
+            }
         }
     }
 
-    false
+    // We want more than two unchanged words, because the text content
+    // includes the comment or string delimiters.
+    //
+    // A sufficiently similar set of words is when at least 50% of
+    // the words are common between the two sides. We multiply by two
+    // because non-matching words give us two novel words, whereas
+    // matched words only give us one unchanged word.
+    unchanged_count > 2 && unchanged_count * 2 >= novel_count
 }
 
 impl MatchedPos {
author	Wilfred Hughes <me@wilfred.me.uk>	2023-07-12 12:12:12 -0700
committer	Wilfred Hughes <me@wilfred.me.uk>	2023-07-12 12:12:32 -0700
commit	a814e01d229db717fbcd5c2b98a4daa247b07c02 (patch)
tree	73beb2cc0d83ad2f29a00935c6eac6bd28a7f803
parent	1d3b6836ef1ab4fd2200871c0f4a8c92583997f3 (diff)