summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDenys Séguret <cano.petrole@gmail.com>2022-10-04 20:04:59 +0200
committerGitHub <noreply@github.com>2022-10-04 20:04:59 +0200
commit3e2890739526baadb11139c4297a1bc02951fe06 (patch)
treef6588eeb965bda2401e2612ad7027506c57ed6a3
parent818d069c51efe3a1736743a06c714ae7c351818a (diff)
New escaping rules (#609)
As was noticed by @FedericoStra, escaping for regular expressions was painful. For example you had to do `/\\d` to search for a digit. This PR brings more complex escaping rules so that less escaping is necessary: - when after '/': only ' ', ':', '/' and '\' need escaping - otherwise, '&,' '|', '(', ')', '\' need escaping Fix #592
-rw-r--r--CHANGELOG.md1
-rw-r--r--Cargo.lock12
-rw-r--r--Cargo.toml4
-rw-r--r--src/command/parts.rs303
-rw-r--r--src/pattern/pattern_parts.rs18
-rw-r--r--website/docs/img/regex-antislash-d.pngbin0 -> 21063 bytes
-rw-r--r--website/docs/input.md67
7 files changed, 339 insertions, 66 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index f7a6ea1..6d71ff4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,7 @@
- status messages now displayed on toggling (for example showing hidden files)
- upgrade terminal-light to 1.0.1 for better recognition of background color on high precision color terminals
- in default configuration, ctrl-left never opens a panel to the left, as I think this was most often unwanted (one too many hits on ctrl-left). It's possible to get the old behavior by binding ctrl-left to `:panel_left` instead of the new `:panel_left_no_open` internal.
+- New escaping rules basically let you skip many `\`, especially when building regexes - Fix #592
### v1.15.0 - 2022-09-24
<a name="v1.15.0"></a>
diff --git a/Cargo.lock b/Cargo.lock
index 3e68ec4..5557878 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -113,9 +113,9 @@ checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd"
[[package]]
name = "bet"
-version = "1.0.1"
+version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "05187b4047565a2bb9aeab0c3e8740175871fd616984d816b0c8f1f6cb71125e"
+checksum = "1673d13ad9c8d4b5e3d17a38730714157d428d1a249c18dd96e77e969623ac98"
[[package]]
name = "bincode"
@@ -1090,9 +1090,9 @@ dependencies = [
[[package]]
name = "minimad"
-version = "0.9.0"
+version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cd37b2e65fbd459544194d8f52ed84027e031684335a062c708774c09d172b0b"
+checksum = "277639f0198568f70f8fe4ab88a52a67c96bca12f27ba5c17a76acdcb8b45834"
dependencies = [
"once_cell",
]
@@ -1816,9 +1816,9 @@ dependencies = [
[[package]]
name = "termimad"
-version = "0.20.2"
+version = "0.20.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c8a16d7de8d4c97a4149cc3b9d3681c5dba36011c303745bb1af19636e89ba39"
+checksum = "3977554523f42b473e5211e5fbb39e78e21e793ff6ffd6428ae3bd02dbb7e09d"
dependencies = [
"coolor",
"crossbeam",
diff --git a/Cargo.toml b/Cargo.toml
index 29f8300..7f9ff28 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -24,7 +24,7 @@ kitty-csi-check = ["xterm-query"]
ahash = { version = "0.7", features = ["serde"] }
ansi_colours = "1.0"
base64 = "0.13"
-bet = "1.0"
+bet = "1.0.2"
char_reader = "0.1"
chrono = "0.4"
clap = { version = "3.2.1", features = ["derive"] }
@@ -57,7 +57,7 @@ splitty = "1.0"
strict = "0.1.4"
syntect = { package = "syntect-no-panic", version = "4.6.1" } # see issue #485
tempfile = "3.2"
-termimad = "0.20.2"
+termimad = "0.20.3"
terminal-clipboard = { version = "0.3.1", optional = true }
terminal-light = "1.0.1"
toml = "0.5"
diff --git a/src/command/parts.rs b/src/command/parts.rs
index b355de5..6dc2559 100644
--- a/src/command/parts.rs
+++ b/src/command/parts.rs
@@ -8,7 +8,7 @@ use {
};
/// An intermediate parsed representation of the raw string
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, PartialEq)]
pub struct CommandParts {
pub raw_pattern: String, // may be empty
pub pattern: BeTree<PatternOperator, PatternParts>,
@@ -31,60 +31,77 @@ impl CommandParts {
.as_ref()
.map_or(false, |vi| !vi.is_empty())
}
- pub fn from(mut raw: String) -> Self {
+ pub fn from<S: Into<String>>(raw: S) -> Self {
+ let mut raw = raw.into();
let mut invocation_start_pos: Option<usize> = None;
- let mut escaping = false;
let mut pt = BeTree::new();
- for (pos, c) in raw.char_indices() {
- if c == '\\' {
- if escaping {
- escaping = false;
- } else {
- escaping = true;
- continue;
+ let mut chars = raw.char_indices().peekable();
+ let mut escape_cur_char = false;
+ let mut escape_next_char = false;
+ // we loop on chars and build the pattern tree until we reach an unescaped ' ' or ':'
+ while let Some((pos, cur_char)) = chars.next() {
+ let between_slashes = pt.current_atom()
+ .map_or(
+ false,
+ |pp: &PatternParts| pp.is_between_slashes(),
+ );
+ match cur_char {
+ c if escape_cur_char => {
+ // Escaping is used to prevent characters from being consumed at the
+ // composite pattern level (or, and, parens) or as the separator between
+ // the pattern and the verb. An escaped char is usable in a pattern atom.
+ pt.mutate_or_create_atom(PatternParts::default).push(c);
}
- }
- if !escaping {
- if c == ' ' || c == ':' {
+ '\\' => {
+ // Pattern escaping rules:
+ // - when after '/': only ' ', ':', '/' and '\' need escaping
+ // - otherwise, '&', '|', '!', '(', ')' need escaping too ('(' is only here for
+ // symmetry)
+ let between_slashes = match pt.current_atom() {
+ Some(pattern_parts) => pattern_parts.is_between_slashes(),
+ None => false,
+ };
+ escape_next_char = match chars.peek() {
+ None => false, // End of the string, we can't be escaping
+ Some((_, next_char)) => match (next_char, between_slashes) {
+ (' ' | ':' | '/' | '\\', _) => true,
+ ('&' | '|' | '!' | '(' | ')', false) => true,
+ _ => false,
+ }
+ };
+ if !escape_next_char {
+ // if the '\' isn't used for escaping, it's used as its char value
+ pt.mutate_or_create_atom(PatternParts::default).push('\\');
+ }
+ }
+ ' ' | ':' => { // ending the pattern part
invocation_start_pos = Some(pos);
break;
}
- if c == '/' {
+ '/' => { // starting an atom part
pt.mutate_or_create_atom(PatternParts::default).add_part();
- continue;
}
- let allow_inter_pattern_token = match pt.current_atom() {
- Some(pattern_parts) => pattern_parts.allow_inter_pattern_token(),
- None => true,
- };
- if allow_inter_pattern_token {
- match c {
- '|' if pt.accept_binary_operator() => {
- pt.push_operator(PatternOperator::Or);
- continue;
- }
- '&' if pt.accept_binary_operator() => {
- pt.push_operator(PatternOperator::And);
- continue;
- }
- '!' if pt.accept_unary_operator() => {
- pt.push_operator(PatternOperator::Not);
- continue;
- }
- '(' if pt.accept_opening_par() => {
- pt.open_par();
- continue;
- }
- ')' if pt.accept_closing_par() => {
- pt.close_par();
- continue;
- }
- _ => {}
- }
+ '|' if !between_slashes && pt.accept_binary_operator() => {
+ pt.push_operator(PatternOperator::Or);
+ }
+ '&' if !between_slashes && pt.accept_binary_operator() => {
+ pt.push_operator(PatternOperator::And);
+ }
+ '!' if !between_slashes && pt.accept_unary_operator() => {
+ pt.push_operator(PatternOperator::Not);
+ }
+ '(' if !between_slashes && pt.accept_opening_par() => {
+ pt.open_par();
+ }
+ ')' if !between_slashes && pt.accept_closing_par() => {
+ pt.close_par();
+ }
+ _ => {
+ pt.mutate_or_create_atom(PatternParts::default).push(cur_char);
}
}
- pt.mutate_or_create_atom(PatternParts::default).push(c);
- escaping = false;
+ escape_cur_char = escape_next_char;
+ escape_next_char = false;
}
let mut verb_invocation = None;
if let Some(pos) = invocation_start_pos {
@@ -123,3 +140,199 @@ impl CommandParts {
}
+#[cfg(test)]
+mod test_command_parts {
+
+ use {
+ crate::{
+ command::CommandParts,
+ pattern::*,
+ verb::VerbInvocation,
+ },
+ bet::{BeTree, Token},
+ };
+
+ fn pp(a: &[&str]) -> PatternParts {
+ a.try_into().unwrap()
+ }
+
+ fn check(
+ input: &str,
+ raw_pattern: &str,
+ mut pattern_tokens: Vec<Token<PatternOperator, PatternParts>>,
+ verb_invocation: Option<&str>,
+ ) {
+ let left = CommandParts::from(input);
+ dbg!(&left);
+ let mut pattern = BeTree::new();
+ for token in pattern_tokens.drain(..) {
+ pattern.push(token);
+ }
+ let right = CommandParts {
+ raw_pattern: raw_pattern.to_string(),
+ pattern,
+ verb_invocation: verb_invocation.map(|s| VerbInvocation::from(s)),
+ };
+ dbg!(&right);
+ assert_eq!(left, right);
+ }
+
+ #[test]
+ fn parse_empty() {
+ check(
+ "",
+ "",
+ vec![],
+ None,
+ );
+ }
+ #[test]
+ fn parse_just_semicolon() {
+ check(
+ ":",
+ "",
+ vec![],
+ Some(""),
+ );
+ }
+ #[test]
+ fn parse_no_pattern() {
+ check(
+ " cd /",
+ "",
+ vec![],
+ Some("cd /"),
+ );
+ }
+ #[test]
+ fn parse_pattern_and_invocation() {
+ check(
+ "/r cd /",
+ "/r",
+ vec![
+ Token::Atom(pp(&["", "r"])),
+ ],
+ Some("cd /"),
+ );
+ }
+ #[test]
+ fn parse_pattern_between_slashes() {
+ check(
+ r#"/&"#,
+ r#"/&"#,
+ vec![
+ Token::Atom(pp(&["", "&"])),
+ ],
+ None,
+ );
+ check(
+ r#"/&/&r/a(\w-)+/ rm"#,
+ r#"/&/&r/a(\w-)+/"#,
+ vec![
+ Token::Atom(pp(&["", "&", ""])),
+ Token::Operator(PatternOperator::And),
+ Token::Atom(pp(&["r", r#"a(\w-)+"#, ""])),
+ ],
+ Some("rm"),
+ );
+ }
+ #[test]
+ fn parse_pattern_with_space() {
+ check(
+ r#"a\ b"#,
+ r#"a\ b"#,
+ vec![
+ Token::Atom(pp(&["a b"])),
+ ],
+ None,
+ );
+ }
+ #[test]
+ fn parse_pattern_with_slash() {
+ check(
+ r#"r/a\ b\//i cd /"#,
+ r#"r/a\ b\//i"#,
+ vec![
+ Token::Atom(pp(&["r", "a b/", "i"])),
+ ],
+ Some("cd /"),
+ );
+ }
+ #[test]
+ fn parse_fuzzy_pattern_searching_parenthesis() {
+ check(
+ r#"\("#,
+ r#"\("#,
+ vec![
+ Token::Atom(pp(&["("])),
+ ],
+ None,
+ );
+ }
+ #[test]
+ fn parse_regex_pattern_searching_parenthesis() {
+ check(
+ r#"/\("#,
+ r#"/\("#,
+ vec![
+ Token::Atom(pp(&["", r#"\("#])),
+ ],
+ None,
+ );
+ }
+ #[test]
+ fn parse_composite_pattern() {
+ check(
+ "(/txt$/&!truc)&c/rex",
+ "(/txt$/&!truc)&c/rex",
+ vec![
+ Token::OpeningParenthesis,
+ Token::Atom(pp(&["", "txt$", ""])),
+ Token::Operator(PatternOperator::And),
+ Token::Operator(PatternOperator::Not),
+ Token::Atom(pp(&["truc"])),
+ Token::ClosingParenthesis,
+ Token::Operator(PatternOperator::And),
+ Token::Atom(pp(&["c", "rex"])),
+ ],
+ None
+ );
+ }
+ #[test]
+ fn parse_unclosed_composite_pattern() {
+ check(
+ r#"!/\.json$/&(c/isize/|c/i32:rm"#,
+ r#"!/\.json$/&(c/isize/|c/i32"#,
+ vec![
+ Token::Operator(PatternOperator::Not),
+ Token::Atom(pp(&["", r#"\.json$"#, ""])),
+ Token::Operator(PatternOperator::And),
+ Token::OpeningParenthesis,
+ Token::Atom(pp(&["c", "isize", ""])),
+ Token::Operator(PatternOperator::Or),
+ Token::Atom(pp(&["c", "i32"])),
+ ],
+ Some("rm"),
+ );
+ }
+ #[test]
+ fn issue_592() { // https://github.com/Canop/broot/issues/592
+ check(
+ r#"\t"#,
+ r#"\t"#,
+ vec![
+ Token::Atom(pp(&[r#"\t"#])),
+ ],
+ None,
+ );
+ check(
+ r#"r/@(\.[^.]+)+/ cp .."#,
+ r#"r/@(\.[^.]+)+/"#,
+ vec![
+ Token::Atom(pp(&["r", r#"@(\.[^.]+)+"#, ""])),
+ ],
+ Some("cp .."),
+ );
+ }
+}
+
diff --git a/src/pattern/pattern_parts.rs b/src/pattern/pattern_parts.rs
index 99680d9..19466fb 100644
--- a/src/pattern/pattern_parts.rs
+++ b/src/pattern/pattern_parts.rs
@@ -27,17 +27,29 @@ impl Default for PatternParts {
}
}
+#[cfg(test)]
+impl TryFrom<&[&str]> for PatternParts {
+ type Error = &'static str;
+ fn try_from(a: &[&str]) -> Result<Self, Self::Error> {
+ if a.is_empty() {
+ return Err("invalid empty parts array");
+ }
+ let parts = a.iter().map(|s| (*s).into()).collect();
+ Ok(Self { parts })
+ }
+}
+
impl PatternParts {
pub fn push(&mut self, c: char) {
// self.parts can't be empty, by construct
self.parts.last_mut().unwrap().push(c);
}
+ pub fn is_between_slashes(&self) -> bool {
+ self.parts.len() == 2
+ }
pub fn add_part(&mut self) {
self.parts.push(String::new());
}
- pub fn allow_inter_pattern_token(&self) -> bool {
- self.parts.len() != 2
- }
pub fn is_empty(&self) -> bool {
self.core().is_empty()
}
diff --git a/website/docs/img/regex-antislash-d.png b/website/docs/img/regex-antislash-d.png
new file mode 100644
index 0000000..4ea9aad
--- /dev/null
+++ b/website/docs/img/regex-antislash-d.png
Binary files differ
diff --git a/website/docs/input.md b/website/docs/input.md
index 8d0ac50..b609e48 100644
--- a/website/docs/input.md
+++ b/website/docs/input.md
@@ -35,32 +35,79 @@ regex name | `/[yz]{3}` or `/[yz]{3}/` | `fuzzy.rs` | search for the regular exp
regex name | `/(json|xml)$/i` | `thing.XML` | find files whose name ends in `json` or `xml`, case insensitive
regex name | `/abc/i` | `aBc.txt` | search for the regular expression `abc` with flag `i` in filenames
exact path | `ep/te\/d` or `pe/te\/d/` | `website/docs` | search for "te/d" in sub-paths from current tree root
-regex path | `rp/\\d{3}.*txt` | `dir/a256/abc.txt` | search for the `\d{3}.*txt` regex in sub-paths from current tree root
+regex path | `rp/\d{3}.*txt` | `dir/a256/abc.txt` | search for the `\d{3}.*txt` regex in sub-paths from current tree root
tokens path | `t/ab,cd` | `DCD/a256/abc.txt` | search for the "ab" and "cd" tokens in sub-paths from current tree root
exact content | `c/mask` or `c/mask/` | `umask = "1.0"` | search for the "mask" string in file contents
regex content | `rc/[abc]{5}/i` | `bAAAc` | search with a regular expression in file contents - `i` making it case insensitive
-regex content | `cr/\\bzh\\b` | `"zh":{` | search a word with a regular expression in file contents
+regex content | `cr/\bzh\b` | `"zh":{` | search a word with a regular expression in file contents
It's also possible to [redefine those mode mappings](../conf_file/#search-modes).
-To escape characters (for example the space, colon or slash) in the pattern, use a `\` (an antislash is `\\`).
-
# Combining filtering patterns
Patterns can be combined with the `!` (not), `&` (and) and `|` (or) operators, and parentheses if necessary.
-You can for example display non `json` files containing either `isize` or `i32` with
+You can for example list files whose name contains a `z` and whose content contains one too with
+
+ z&c/z
+
+To display non `json` files containing either `isize` or `i32`, type
+
+ !/\.json$/&(c/isize/|c/i32/)
+
+The last closing characters are often unnecessary when no ambiguity is possible, so you could have typed this:
+
+ !/\.json$/&(c/isize/|c/i32
- !/json$/&(c/isize/|c/i32/)
+# Escaping
-## Subtleties
+## Why escaping?
+
+Look at this input: `a|b rm`.
+
+It's for searching files whose name contains either an `a` or a `b`, then removing the selected one.
+The pattern here is `a|b`, it's a composite pattern.
+
+A space or a colon starts the verb invocation.
+So if you need one of them in your pattern, you need to escape it with `\`.
+
+For example
+
+* to search for a file whose name contains an x and a colon, you type `x\:`
+* to search for a file whose name contains a space just before a digit, you can use a regular expression: `/\ \d`
The characters you use as operators and the parenthesis can be useful in patterns too, either because you want to search for them in fuzzy patterns or in file contents, or because you write non trivial regular expressions.
-Most often you'll just type what feels natural and broot will select the interpretation which makes sense but you might be interested in a few rules:
+If you want to search for the `|` character (or a `&`, or `(`, or `)`), you can't just type it because it's used to combine elementary patterns. It needs escaping. So if you need to search for the `|` character in file names, you type `\|`.
+
+An elementary pattern which starts with a `/` can only be ended with a `/`, a space, or a colon.
+That's why you don't have to escape other characters you want to include in your elementary pattern.
+
+This lets you type this regular expression with no unnecessary escaping:
+
+ /(\d-){2}\w
+
+![regex](img/regex-antislash-d.png)
+
+Regular expression escaping rules still apply, so if you want to search with a regex for a file containing a `(`, you'll type `/\(`.
+
+## Escaping Rules
+
+The escaping character is the antislash `\`.
+
+Most often, you don't need to know more: when broot tells you it doesn't understand your pattern, it should click that your special character needs escaping and you prefix it with a `\ `.
+
+More precisely:
+
+1. After the first `/` of a pattern, only ` `, `:`, `/` and `\` need escaping.
+2. Otherwise, `&`, `|`, `!`, `(`, `)`, `\` need escaping too.
+3. When there's no ambiguity, ending characters are often unnecessary.
+
+# Performances
+
+broot interprets the left operand before the right one and doesn't interpret the second one if it's not necessary.
-* parenthesis and operators in the second pattern part (parts being separated by `/`) are part of the pattern, which explains why `/(json|xml)` is interpreted as a regular expression. If you want to do a fuzzy search for a `|` in the name of your files, you'll need to either escape it as `\|` or to have an explicit pattern mode : `nf/a|b` because `a|b` would search for files whose name contains either `a` or `b`. And to ensure an operator or closing parenthesis isn't interpreted as part of your pattern, close it with a `/`.
-* broot interprets the left operand before the right one and doesn't interpret the second one if it's not necessary. So if you want to search your whole disk for json files containing `abcd`, it will be faster to use `/json$/&c/abcd` rather than `c/abcd/&/json$/` which would look at the file name only after having scanned the content.
+So if you want to search your whole disk for json files containing `abcd`, it will be faster to use `/\.json$/&c/abcd` rather than `c/abcd/&/\.json$/` which would look at the file name only after having scanned the content.
# The verb invocation