diff options
author | Denys Séguret <cano.petrole@gmail.com> | 2022-10-04 20:04:59 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-10-04 20:04:59 +0200 |
commit | 3e2890739526baadb11139c4297a1bc02951fe06 (patch) | |
tree | f6588eeb965bda2401e2612ad7027506c57ed6a3 | |
parent | 818d069c51efe3a1736743a06c714ae7c351818a (diff) |
New escaping rules (#609)
As was noticed by @FedericoStra, escaping for regular expressions was painful. For example you had to do `/\\d` to search for a digit.
This PR brings more complex escaping rules so that less escaping is necessary:
- when after '/': only ' ', ':', '/' and '\' need escaping
- otherwise, '&', '|', '(', ')', '\' need escaping
Fix #592
-rw-r--r-- | CHANGELOG.md | 1 | ||||
-rw-r--r-- | Cargo.lock | 12 | ||||
-rw-r--r-- | Cargo.toml | 4 | ||||
-rw-r--r-- | src/command/parts.rs | 303 | ||||
-rw-r--r-- | src/pattern/pattern_parts.rs | 18 | ||||
-rw-r--r-- | website/docs/img/regex-antislash-d.png | bin | 0 -> 21063 bytes | |||
-rw-r--r-- | website/docs/input.md | 67 |
7 files changed, 339 insertions, 66 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md index f7a6ea1..6d71ff4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ - status messages now displayed on toggling (for example showing hidden files) - upgrade terminal-light to 1.0.1 for better recognition of background color on high precision color terminals - in default configuration, ctrl-left never opens a panel to the left, as I think this was most often unwanted (one too many hit on cltr-left). It's possible to get the old behavior by binding ctrl-left to `:panel_left` instead of the new `:panel_left_no_open` internal. +- New escaping rules basically let you skip many `\`, especially when building regexes - Fix #592 ### v1.15.0 - 2022-09-24 <a name="v1.15.0"></a> @@ -113,9 +113,9 @@ checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" [[package]] name = "bet" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05187b4047565a2bb9aeab0c3e8740175871fd616984d816b0c8f1f6cb71125e" +checksum = "1673d13ad9c8d4b5e3d17a38730714157d428d1a249c18dd96e77e969623ac98" [[package]] name = "bincode" @@ -1090,9 +1090,9 @@ dependencies = [ [[package]] name = "minimad" -version = "0.9.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd37b2e65fbd459544194d8f52ed84027e031684335a062c708774c09d172b0b" +checksum = "277639f0198568f70f8fe4ab88a52a67c96bca12f27ba5c17a76acdcb8b45834" dependencies = [ "once_cell", ] @@ -1816,9 +1816,9 @@ dependencies = [ [[package]] name = "termimad" -version = "0.20.2" +version = "0.20.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8a16d7de8d4c97a4149cc3b9d3681c5dba36011c303745bb1af19636e89ba39" +checksum = "3977554523f42b473e5211e5fbb39e78e21e793ff6ffd6428ae3bd02dbb7e09d" dependencies = [ "coolor", "crossbeam", @@ -24,7 +24,7 @@ kitty-csi-check = ["xterm-query"] ahash = { version = "0.7", features = ["serde"] } ansi_colours = 
"1.0" base64 = "0.13" -bet = "1.0" +bet = "1.0.2" char_reader = "0.1" chrono = "0.4" clap = { version = "3.2.1", features = ["derive"] } @@ -57,7 +57,7 @@ splitty = "1.0" strict = "0.1.4" syntect = { package = "syntect-no-panic", version = "4.6.1" } # see issue #485 tempfile = "3.2" -termimad = "0.20.2" +termimad = "0.20.3" terminal-clipboard = { version = "0.3.1", optional = true } terminal-light = "1.0.1" toml = "0.5" diff --git a/src/command/parts.rs b/src/command/parts.rs index b355de5..6dc2559 100644 --- a/src/command/parts.rs +++ b/src/command/parts.rs @@ -8,7 +8,7 @@ use { }; /// An intermediate parsed representation of the raw string -#[derive(Debug, Clone)] +#[derive(Debug, Clone, PartialEq)] pub struct CommandParts { pub raw_pattern: String, // may be empty pub pattern: BeTree<PatternOperator, PatternParts>, @@ -31,60 +31,77 @@ impl CommandParts { .as_ref() .map_or(false, |vi| !vi.is_empty()) } - pub fn from(mut raw: String) -> Self { + pub fn from<S: Into<String>>(raw: S) -> Self { + let mut raw = raw.into(); let mut invocation_start_pos: Option<usize> = None; - let mut escaping = false; let mut pt = BeTree::new(); - for (pos, c) in raw.char_indices() { - if c == '\\' { - if escaping { - escaping = false; - } else { - escaping = true; - continue; + let mut chars = raw.char_indices().peekable(); + let mut escape_cur_char = false; + let mut escape_next_char = false; + // we loop on chars and build the pattern tree until we reach an unescaped ' ' or ':' + while let Some((pos, cur_char)) = chars.next() { + let between_slashes = pt.current_atom() + .map_or( + false, + |pp: &PatternParts| pp.is_between_slashes(), + ); + match cur_char { + c if escape_cur_char => { + // Escaping is used to prevent characters from being consumed at the + // composite pattern level (or, and, parens) or as the separator between + // the pattern and the verb. An escaped char is usable in a pattern atom. 
+ pt.mutate_or_create_atom(PatternParts::default).push(c); } - } - if !escaping { - if c == ' ' || c == ':' { + '\\' => { + // Pattern escaping rules: + // - when after '/': only ' ', ':', '/' and '\' need escaping + // - otherwise, '&,' '|', '(', ')' need escaping too ('(' is only here for + // symmetry) + let between_slashes = match pt.current_atom() { + Some(pattern_parts) => pattern_parts.is_between_slashes(), + None => false, + }; + escape_next_char = match chars.peek() { + None => false, // End of the string, we can't be escaping + Some((_, next_char)) => match (next_char, between_slashes) { + (' ' | ':' | '/' | '\\', _) => true, + ('&' | '|' | '!' | '(' | ')', false) => true, + _ => false, + } + }; + if !escape_next_char { + // if the '\' isn't used for escaping, it's used as its char value + pt.mutate_or_create_atom(PatternParts::default).push('\\'); + } + } + ' ' | ':' => { // ending the pattern part invocation_start_pos = Some(pos); break; } - if c == '/' { + '/' => { // starting an atom part pt.mutate_or_create_atom(PatternParts::default).add_part(); - continue; } - let allow_inter_pattern_token = match pt.current_atom() { - Some(pattern_parts) => pattern_parts.allow_inter_pattern_token(), - None => true, - }; - if allow_inter_pattern_token { - match c { - '|' if pt.accept_binary_operator() => { - pt.push_operator(PatternOperator::Or); - continue; - } - '&' if pt.accept_binary_operator() => { - pt.push_operator(PatternOperator::And); - continue; - } - '!' if pt.accept_unary_operator() => { - pt.push_operator(PatternOperator::Not); - continue; - } - '(' if pt.accept_opening_par() => { - pt.open_par(); - continue; - } - ')' if pt.accept_closing_par() => { - pt.close_par(); - continue; - } - _ => {} - } + '|' if !between_slashes && pt.accept_binary_operator() => { + pt.push_operator(PatternOperator::Or); + } + '&' if !between_slashes && pt.accept_binary_operator() => { + pt.push_operator(PatternOperator::And); + } + '!' 
if !between_slashes && pt.accept_unary_operator() => { + pt.push_operator(PatternOperator::Not); + } + '(' if !between_slashes && pt.accept_opening_par() => { + pt.open_par(); + } + ')' if !between_slashes && pt.accept_closing_par() => { + pt.close_par(); + } + _ => { + pt.mutate_or_create_atom(PatternParts::default).push(cur_char); } } - pt.mutate_or_create_atom(PatternParts::default).push(c); - escaping = false; + escape_cur_char = escape_next_char; + escape_next_char = false; } let mut verb_invocation = None; if let Some(pos) = invocation_start_pos { @@ -123,3 +140,199 @@ impl CommandParts { } +#[cfg(test)] +mod test_command_parts { + + use { + crate::{ + command::CommandParts, + pattern::*, + verb::VerbInvocation, + }, + bet::{BeTree, Token}, + }; + + fn pp(a: &[&str]) -> PatternParts { + a.try_into().unwrap() + } + + fn check( + input: &str, + raw_pattern: &str, + mut pattern_tokens: Vec<Token<PatternOperator, PatternParts>>, + verb_invocation: Option<&str>, + ) { + let left = CommandParts::from(input); + dbg!(&left); + let mut pattern = BeTree::new(); + for token in pattern_tokens.drain(..) 
{ + pattern.push(token); + } + let right = CommandParts { + raw_pattern: raw_pattern.to_string(), + pattern, + verb_invocation: verb_invocation.map(|s| VerbInvocation::from(s)), + }; + dbg!(&right); + assert_eq!(left, right); + } + + #[test] + fn parse_empty() { + check( + "", + "", + vec![], + None, + ); + } + #[test] + fn parse_just_semicolon() { + check( + ":", + "", + vec![], + Some(""), + ); + } + #[test] + fn parse_no_pattern() { + check( + " cd /", + "", + vec![], + Some("cd /"), + ); + } + #[test] + fn parse_pattern_and_invocation() { + check( + "/r cd /", + "/r", + vec![ + Token::Atom(pp(&["", "r"])), + ], + Some("cd /"), + ); + } + #[test] + fn parse_pattern_between_slashes() { + check( + r#"/&"#, + r#"/&"#, + vec![ + Token::Atom(pp(&["", "&"])), + ], + None, + ); + check( + r#"/&/&r/a(\w-)+/ rm"#, + r#"/&/&r/a(\w-)+/"#, + vec![ + Token::Atom(pp(&["", "&", ""])), + Token::Operator(PatternOperator::And), + Token::Atom(pp(&["r", r#"a(\w-)+"#, ""])), + ], + Some("rm"), + ); + } + #[test] + fn parse_pattern_with_space() { + check( + r#"a\ b"#, + r#"a\ b"#, + vec![ + Token::Atom(pp(&["a b"])), + ], + None, + ); + } + #[test] + fn parse_pattern_with_slash() { + check( + r#"r/a\ b\//i cd /"#, + r#"r/a\ b\//i"#, + vec![ + Token::Atom(pp(&["r", "a b/", "i"])), + ], + Some("cd /"), + ); + } + #[test] + fn parse_fuzzy_pattern_searching_parenthesis() { + check( + r#"\("#, + r#"\("#, + vec![ + Token::Atom(pp(&["("])), + ], + None, + ); + } + #[test] + fn parse_regex_pattern_searching_parenthesis() { + check( + r#"/\("#, + r#"/\("#, + vec![ + Token::Atom(pp(&["", r#"\("#])), + ], + None, + ); + } + #[test] + fn parse_composite_pattern() { + check( + "(/txt$/&!truc)&c/rex", + "(/txt$/&!truc)&c/rex", + vec![ + Token::OpeningParenthesis, + Token::Atom(pp(&["", "txt$", ""])), + Token::Operator(PatternOperator::And), + Token::Operator(PatternOperator::Not), + Token::Atom(pp(&["truc"])), + Token::ClosingParenthesis, + Token::Operator(PatternOperator::And), + 
Token::Atom(pp(&["c", "rex"])), + ], + None + ); + } + #[test] + fn parse_unclosed_composite_pattern() { + check( + r#"!/\.json$/&(c/isize/|c/i32:rm"#, + r#"!/\.json$/&(c/isize/|c/i32"#, + vec![ + Token::Operator(PatternOperator::Not), + Token::Atom(pp(&["", r#"\.json$"#, ""])), + Token::Operator(PatternOperator::And), + Token::OpeningParenthesis, + Token::Atom(pp(&["c", "isize", ""])), + Token::Operator(PatternOperator::Or), + Token::Atom(pp(&["c", "i32"])), + ], + Some("rm"), + ); + } + #[test] + fn issue_592() { // https://github.com/Canop/broot/issues/592 + check( + r#"\t"#, + r#"\t"#, + vec![ + Token::Atom(pp(&[r#"\t"#])), + ], + None, + ); + check( + r#"r/@(\.[^.]+)+/ cp .."#, + r#"r/@(\.[^.]+)+/"#, + vec![ + Token::Atom(pp(&["r", r#"@(\.[^.]+)+"#, ""])), + ], + Some("cp .."), + ); + } +} + diff --git a/src/pattern/pattern_parts.rs b/src/pattern/pattern_parts.rs index 99680d9..19466fb 100644 --- a/src/pattern/pattern_parts.rs +++ b/src/pattern/pattern_parts.rs @@ -27,17 +27,29 @@ impl Default for PatternParts { } } +#[cfg(test)] +impl TryFrom<&[&str]> for PatternParts { + type Error = &'static str; + fn try_from(a: &[&str]) -> Result<Self, Self::Error> { + if a.is_empty() { + return Err("invalid empty parts array"); + } + let parts = a.iter().map(|s| (*s).into()).collect(); + Ok(Self { parts }) + } +} + impl PatternParts { pub fn push(&mut self, c: char) { // self.parts can't be empty, by construct self.parts.last_mut().unwrap().push(c); } + pub fn is_between_slashes(&self) -> bool { + self.parts.len() == 2 + } pub fn add_part(&mut self) { self.parts.push(String::new()); } - pub fn allow_inter_pattern_token(&self) -> bool { - self.parts.len() != 2 - } pub fn is_empty(&self) -> bool { self.core().is_empty() } diff --git a/website/docs/img/regex-antislash-d.png b/website/docs/img/regex-antislash-d.png Binary files differnew file mode 100644 index 0000000..4ea9aad --- /dev/null +++ b/website/docs/img/regex-antislash-d.png diff --git a/website/docs/input.md 
b/website/docs/input.md index 8d0ac50..b609e48 100644 --- a/website/docs/input.md +++ b/website/docs/input.md @@ -35,32 +35,79 @@ regex name | `/[yz]{3}` or `/[yz]{3}/` | `fuzzy.rs` | search for the regular exp regex name | `/(json|xml)$/i` | `thing.XML` | find files whose name ends in `json` or `xml`, case insensitive regex name | `/abc/i` | `aBc.txt` | search for the regular expression `abc` with flag `i` in filenames exact path | `ep/te\/d` or `pe/te\/d/` | `website/docs` | search for "te/d" in sub-paths from current tree root -regex path | `rp/\\d{3}.*txt` | `dir/a256/abc.txt` | search for the `\d{3}.*txt` regex in sub-paths from current tree root +regex path | `rp/\d{3}.*txt` | `dir/a256/abc.txt` | search for the `\d{3}.*txt` regex in sub-paths from current tree root tokens path | `t/ab,cd` | `DCD/a256/abc.txt` | search for the "ab" and "cd" tokens in sub-paths from current tree root exact content | `c/mask` or `c/mask/` | `umask = "1.0"` | search for the "mask" string in file contents regex content | `rc/[abc]{5}/i` | `bAAAc` | search with a regular expression in file contents - `i` making it case insensitive -regex content | `cr/\\bzh\\b` | `"zh":{` | search a word with a regular expression in file contents +regex content | `cr/\bzh\b` | `"zh":{` | search a word with a regular expression in file contents It's also possible to [redefine those mode mappings](../conf_file/#search-modes). -To escape characters (for example the space, colon or slash) in the pattern, use a `\` (an antislash is `\\`). - # Combining filtering patterns Patterns can be combined with the `!` (not), `&` (and) and `|` (or) operators, and parentheses if necessary. 
-You can for example display non `json` files containing either `isize` or `i32` with +You can for example list files whose name contains a `z` and whose content contains one too with + + z&c/z + +To display non `json` files containing either `isize` or `i32`, type + + !/\.json$/&(c/isize/|c/i32/) + +The last closing characters are often unnecessary when no ambiguity is possible, so you could have typed this: + + !/\.json$/&(c/isize/|c/i32 - !/json$/&(c/isize/|c/i32/) +# Escaping -## Subtleties +## Why escaping? + +Look at this input: `a|b rm`. + +It's for searching files whose name contains either an `a` or a `b`, then removing the selected one. +The pattern here is `a|b`, it's a composite pattern. + +A space or a colon starts the verb invocation. +So if you need one of them in your pattern, you need to escape it with `\`. + +For example + +* to search for a file whose name contains an x and a colon, you type `x\:` +* to search for a file whose name contains a space just before a digit, you can use a regular expression: `/\ \d` The characters you use as operators and the parenthesis can be useful in patterns too, either because you want to search for them in fuzzy patterns or in file contents, or because you write non trivial regular expressions. -Most often you'll just type what feels natural and broot will select the interpretation which makes sense but you might be interested in a few rules: +If you want to search for the `|` character (or a `&`, or `(`, or `)`), you can't just type it because it's used to combine elementary patterns. It needs escaping. So if you need to search for the `|` character in file names, you type `\|`. + +An elementary pattern which starts with a `/` can only be ended with a `/`, a space, or a colon. +That's why you don't have to escape other characters you want to include in your elementary pattern.
+ +This lets you type this regular expression with no unnecessary escaping: + + /(\d-){2}\w + +![regex](img/regex-antislash-d.png) + +Regular expression escaping rules still apply, so if you want to search with a regex for a file containing a `(`, you'll type `/\(`. + +## Escaping Rules + +The escaping character is the antislash `\`. + +Most often, you don't need to know more: when broot tells you it doesn't understand your pattern, it should click that your special character needs escaping, and you prefix it with a `\`. + +More precisely: + +1. After the first `/` of a pattern, only ` `, `:`, `/` and `\` need escaping. +2. Otherwise, `&`, `|`, `(`, `)`, `\` need escaping too. +3. When there's no ambiguity, ending characters are often unnecessary. + +# Performances + +broot interprets the left operand before the right one and doesn't interpret the second one if it's not necessary. -* parenthesis and operators in the second pattern part (parts being separated by `/`) are part of the pattern, which explains why `/(json|xml)` is interpreted as a regular expression. If you want to do a fuzzy search for a `|` in the name of your files, you'll need to either escape it as `\|` or to have an explicit pattern mode : `nf/a|b` because `a|b` would search for files whose name contains either `a` or `b`. And to ensure an operator or closing parenthesis isn't interpreted as part of your pattern, close it with a `/`. -* broot interprets the left operand before the right one and doesn't interpret the second one if it's not necessary. So if you want to search your whole disk for json files containing `abcd`, it will be faster to use `/json$/&c/abcd` rather than `c/abcd/&/json$/` which would look at the file name only after having scanned the content. +So if you want to search your whole disk for json files containing `abcd`, it will be faster to use `/\.json$/&c/abcd` rather than `c/abcd/&/\.json$/` which would look at the file name only after having scanned the content.
# The verb invocation |