Merge pull request #658 from newsboat/bugfix/652-quoted-pound-sign-treated-as-comment

Ignore # characters inside double quotes and backticks Fixes #652.
author: Alexander Batischev <eual.jp@gmail.com> 2019-10-02 16:30:18 +0300
committer: GitHub <noreply@github.com> 2019-10-02 16:30:18 +0300
commit: b3d0b966b09d1ca6cdc70e8fa7ad614041ef5bff (patch)
tree: a156bbfea2697c76b0d851f8f2957151cb4409fc
parent: c63da29f70de21f045a1e40d45bcbf332ef71a4c (diff)
parent: 6b080e68d6d255684321a8dca669fd850152c75b (diff)
7 files changed, 271 insertions, 13 deletions
diff --git a/doc/chapter-firststeps.txt b/doc/chapter-firststeps.txt
index 725ad49c..01e71a50 100644
--- a/doc/chapter-firststeps.txt
+++ b/doc/chapter-firststeps.txt
@@ -92,12 +92,12 @@ also contain comments, which start with the `#` character and go as far as the
 end of line. If you need to enter a configuration argument that contains
 spaces, use quotes (`"`) around the whole argument. It's even possible to
 integrate the output of external commands into the configuration. The text
-between two backticks (+\`+) is evaluated as shell command, and its output is
-put on its place instead. This works like backtick evaluation in
+between two backticks (+&#96;+) is evaluated as shell command, and its output
+is put on its place instead. This works like backtick evaluation in
 Bourne-compatible shells and allows users to use external information from the
-system within the configuration. Backticks can be escaped with a backslash
-(+\`+); in that case, they'll be replaced with a literal backtick in the
-configuration.
+system within the configuration. Backticks and `#` characters can be escaped
+with a backslash (e.g. `\`` and `\#`); in that case, they'll be replaced with
+literal +&#96;+ or `#` in the configuration.
 
 Searching for articles is possible in newsboat, too. Just press the "/" key,
 enter your search phrase, and the title and content of all articles are
diff --git a/include/rs_utils.h b/include/rs_utils.h
index b35468ff..cbef8e0a 100644
--- a/include/rs_utils.h
+++ b/include/rs_utils.h
@@ -82,6 +82,8 @@ unsigned int rs_newsboat_version_major();
 
 int rs_mkdir_parents(const char* path, const std::uint32_t mode);
 
+char* rs_strip_comments(const char* line);
+
 class RustString {
 private:
 	char* str;
diff --git a/rust/libnewsboat-ffi/src/utils.rs b/rust/libnewsboat-ffi/src/utils.rs
index 1da94c53..b75c94dc 100644
--- a/rust/libnewsboat-ffi/src/utils.rs
+++ b/rust/libnewsboat-ffi/src/utils.rs
@@ -489,3 +489,18 @@ pub extern "C" fn rs_program_version() -> *mut c_char {
 pub extern "C" fn rs_newsboat_version_major() -> u32 {
     abort_on_panic(|| utils::newsboat_major_version())
 }
+
+#[no_mangle]
+pub unsafe extern "C" fn rs_strip_comments(line: *const c_char) -> *mut c_char {
+    abort_on_panic(|| {
+        let line = CStr::from_ptr(line);
+        let line = line.to_str().expect("line contained invalid UTF-8");
+
+        let result = utils::strip_comments(line);
+
+        // `result` contains a subset of `line`, which is a C string. Thus, we conclude that
+        // `result` doesn't contain null bytes. Therefore, `CString::new` always returns `Some`.
+        let result = CString::new(result).unwrap();
+        result.into_raw()
+    })
+}
diff --git a/rust/libnewsboat/src/utils.rs b/rust/libnewsboat/src/utils.rs
index 91ee9cfe..d1ccca80 100644
--- a/rust/libnewsboat/src/utils.rs
+++ b/rust/libnewsboat/src/utils.rs
@@ -545,6 +545,44 @@ pub fn newsboat_major_version() -> u32 {
     env!("CARGO_PKG_VERSION_MAJOR").parse::<u32>().unwrap()
 }
 
+/// Returns the part of the string before first # character (or the whole input string if there are
+/// no # character in it). Pound characters inside double quotes and backticks are ignored.
+pub fn strip_comments(line: &str) -> &str {
+    let mut prev_was_backslash = false;
+    let mut inside_quotes = false;
+    let mut inside_backticks = false;
+
+    let mut first_pound_chr_idx = line.len();
+
+    for (idx, chr) in line.char_indices() {
+        if chr == '\\' {
+            prev_was_backslash = true;
+            continue;
+        } else if chr == '"' {
+            // If the quote is escaped or we're inside backticks, do nothing
+            if !prev_was_backslash && !inside_backticks {
+                inside_quotes = !inside_quotes;
+            }
+        } else if chr == '`' {
+            // If the backtick is escaped, do nothing
+            if !prev_was_backslash {
+                inside_backticks = !inside_backticks;
+            }
+        } else if chr == '#' {
+            if !prev_was_backslash && !inside_quotes && !inside_backticks {
+                first_pound_chr_idx = idx;
+                break;
+            }
+        }
+
+        // We call `continue` when we run into a backslash; here, we handle all the other
+        // characters, which clearly *aren't* a backslash
+        prev_was_backslash = false;
+    }
+
+    &line[0..first_pound_chr_idx]
+}
+
 #[cfg(test)]
 mod tests {
     extern crate tempfile;
@@ -827,6 +865,7 @@ mod tests {
             get_command_output("a-program-that-is-guaranteed-to-not-exists"),
             "".to_string()
         );
+        assert_eq!(get_command_output("echo c\" d e"), "".to_string());
     }
 
     #[test]
@@ -1042,4 +1081,55 @@ mod tests {
 
         assert_eq!(strnaturalcmp("aa10", "aa2"), Ordering::Greater);
     }
+
+    #[test]
+    fn t_strip_comments() {
+        // no comments in line
+        assert_eq!(strip_comments(""), "");
+        assert_eq!(strip_comments("\t\n"), "\t\n");
+        assert_eq!(strip_comments("some directive "), "some directive ");
+
+        // fully commented line
+        assert_eq!(strip_comments("#"), "");
+        assert_eq!(strip_comments("# #"), "");
+        assert_eq!(strip_comments("# comment"), "");
+
+        // partially commented line
+        assert_eq!(strip_comments("directive # comment"), "directive ");
+        assert_eq!(
+            strip_comments("directive # comment # another"),
+            "directive "
+        );
+        assert_eq!(strip_comments("directive#comment"), "directive");
+
+        // ignores # characters inside double quotes (#652)
+        let expected = r#"highlight article "[-=+#_*~]{3,}.*" green default"#;
+        let input = expected.to_owned() + "# this is a comment";
+        assert_eq!(strip_comments(&input), expected);
+
+        let expected =
+            r#"highlight all "(https?|ftp)://[\-\.,/%~_:?&=\#a-zA-Z0-9]+" blue default bold"#;
+        let input = expected.to_owned() + "#heresacomment";
+        assert_eq!(strip_comments(&input), expected);
+
+        // Escaped double quote inside double quotes is not treated as closing quote
+        let expected = r#"test "here \"goes # nothing\" etc" hehe"#;
+        let input = expected.to_owned() + "# and here is a comment";
+        assert_eq!(strip_comments(&input), expected);
+
+        // Ignores # characters inside backticks
+        let expected = r#"one `two # three` four"#;
+        let input = expected.to_owned() + "# and a comment, of course";
+        assert_eq!(strip_comments(&input), expected);
+
+        // Escaped backtick inside backticks is not treated as closing
+        let expected = r#"some `other \` tricky # test` hehe"#;
+        let input = expected.to_owned() + "#here goescomment";
+        assert_eq!(strip_comments(&input), expected);
+
+        // Ignores escaped # characters (\\#)
+        let expected = r#"one two \# three four"#;
+        let input = expected.to_owned() + "# and a comment";
+        assert_eq!(strip_comments(&input), expected);
+    }
 }
diff --git a/src/utils.cpp b/src/utils.cpp
index d7351425..9b50a492 100644
--- a/src/utils.cpp
+++ b/src/utils.cpp
@@ -78,14 +78,7 @@ void append_escapes(std::string& str, char c)
 
 std::string utils::strip_comments(const std::string& line)
 {
-	/*
-	 * This functions returns only the non-comment part of the line,
-	 * which can then be safely evaluated and tokenized.
-	 *
-	 * If no comments are present on the line, the whole line is returned.
-	 */
-	auto hash_pos = line.find_first_of("#");
-	return line.substr(0, hash_pos);
+	return RustString(rs_strip_comments(line.c_str()));
 }
 
 std::vector<std::string> utils::tokenize_quoted(const std::string& str,
diff --git a/test/configparser.cpp b/test/configparser.cpp
index 1133090d..f50625d8 100644
--- a/test/configparser.cpp
+++ b/test/configparser.cpp
@@ -59,6 +59,34 @@ TEST_CASE("evaluate_backticks replaces command in backticks with its output",
 		REQUIRE_NOTHROW(cfgparser.parse("data/config-space-backticks"));
 		REQUIRE_FALSE(keys.get_operation("s", "all") == OP_NIL);
 	}
+
+	SECTION("Unbalanced backtick does *not* start a command")
+	{
+		const auto input1 = std::string("one `echo two three");
+		REQUIRE(ConfigParser::evaluate_backticks(input1) == input1);
+
+		const auto input2 = std::string("this `echo is a` test `here");
+		const auto expected2 = std::string("this is a test `here");
+		REQUIRE(ConfigParser::evaluate_backticks(input2) == expected2);
+	}
+
+	// One might think that putting one or both backticks inside a string will
+	// "escape" them, the same way as backslash does. But it doesn't, and
+	// shouldn't: when parsing a config, we need to evaluate *all* commands
+	// there are, no matter where they're placed.
+	SECTION("Backticks inside double quotes are not ignored")
+	{
+		const auto input1 = std::string(R"#("`echo hello`")#");
+		REQUIRE(ConfigParser::evaluate_backticks(input1) == R"#("hello")#");
+
+		const auto input2 = std::string(R"#(a "b `echo c" d e` f)#");
+		// The line above asks the shell to run 'echo c" d e', which is an
+		// invalid command--the double quotes are not closed. The standard
+		// output of that command would be empty, so nothing will be inserted
+		// in place of backticks.
+		const auto expected2 = std::string(R"#(a "b  f)#");
+		REQUIRE(ConfigParser::evaluate_backticks(input2) == expected2);
+	}
 }
 
 TEST_CASE("\"unbind-key -a\" removes all key bindings", "[ConfigParser]")
diff --git a/test/utils.cpp b/test/utils.cpp
index e9c8c587..f516f65c 100644
--- a/test/utils.cpp
+++ b/test/utils.cpp
@@ -150,6 +150,62 @@ TEST_CASE("tokenize_quoted() doesn't un-escape escaped backticks", "[utils]")
 	REQUIRE(tokens[1] == "\\`foobar `bla`\\`");
 }
 
+TEST_CASE("tokenize_quoted stops tokenizing once it found a # character "
+		"(outside of double quotes)",
+		"[utils]")
+{
+	std::vector<std::string> tokens;
+
+	SECTION("A string consisting of just a comment")
+	{
+		tokens = utils::tokenize_quoted("# just a comment");
+		REQUIRE(tokens.empty());
+	}
+
+	SECTION("A string with one quoted substring")
+	{
+		tokens = utils::tokenize_quoted(R"#("a test substring" # !!!)#");
+		REQUIRE(tokens.size() == 1);
+		REQUIRE(tokens[0] == "a test substring");
+	}
+
+	SECTION("A string with two quoted substrings")
+	{
+		tokens = utils::tokenize_quoted(R"#("first sub" "snd" # comment)#");
+		REQUIRE(tokens.size() == 2);
+		REQUIRE(tokens[0] == "first sub");
+		REQUIRE(tokens[1] == "snd");
+	}
+
+	SECTION("A comment containing # character")
+	{
+		tokens = utils::tokenize_quoted(R"#(one # a comment with # char)#");
+		REQUIRE(tokens.size() == 1);
+		REQUIRE(tokens[0] == "one");
+	}
+
+	SECTION("A # character inside quoted substring is ignored")
+	{
+		tokens = utils::tokenize_quoted(R"#(this "will # be" ignored)#");
+		REQUIRE(tokens.size() == 3);
+		REQUIRE(tokens[0] == "this");
+		REQUIRE(tokens[1] == "will # be");
+		REQUIRE(tokens[2] == "ignored");
+	}
+}
+
+TEST_CASE("tokenize_quoted does not consider escaped pound sign (\\#) "
+		"a beginning of a comment",
+		"[utils]")
+{
+	const auto tokens = utils::tokenize_quoted(R"#(one \# two three # ???)#");
+	REQUIRE(tokens.size() == 4);
+	REQUIRE(tokens[0] == "one");
+	REQUIRE(tokens[1] == "\\#");
+	REQUIRE(tokens[2] == "two");
+	REQUIRE(tokens[3] == "three");
+}
+
 TEST_CASE("tokenize_nl() split a string into delimiters and fields", "[utils]")
 {
 	std::vector<std::string> tokens;
@@ -219,6 +275,79 @@ TEST_CASE(
 	}
 }
 
+TEST_CASE("strip_comments ignores escaped # characters (\\#)")
+{
+	const auto expected =
+		std::string(R"#(one two \# three four)#");
+	const auto input = expected + "# and a comment";
+	REQUIRE(utils::strip_comments(input) == expected);
+}
+
+TEST_CASE("strip_comments ignores # characters inside double quotes",
+		"[utils][issue652]")
+{
+	SECTION("Real-world cases from issue 652") {
+		const auto expected1 =
+			std::string(R"#(highlight article "[-=+#_*~]{3,}.*" green default)#");
+		const auto input1 = expected1 + "# this is a comment";
+		REQUIRE(utils::strip_comments(input1) == expected1);
+
+		const auto expected2 =
+			std::string(R"#(highlight all "(https?|ftp)://[\-\.,/%~_:?&=\#a-zA-Z0-9]+" blue default bold)#");
+		const auto input2 = expected2 + "#heresacomment";
+		REQUIRE(utils::strip_comments(input2) == expected2);
+	}
+
+	SECTION("Escaped double quote inside double quotes is not treated "
+			"as closing quote")
+	{
+		const auto expected =
+			std::string(R"#(test "here \"goes # nothing\" etc" hehe)#");
+		const auto input = expected + "# and here is a comment";
+		REQUIRE(utils::strip_comments(input) == expected);
+	}
+}
+
+TEST_CASE("strip_comments ignores # characters inside backticks", "[utils]")
+{
+	SECTION("Simple case") {
+		const auto expected = std::string(R"#(one `two # three` four)#");
+		const auto input = expected + "# and a comment, of course";
+		REQUIRE(utils::strip_comments(input) == expected);
+	}
+
+	SECTION("Escaped backtick inside backticks is not treated as closing")
+	{
+		const auto expected =
+			std::string(R"#(some `other \` tricky # test` hehe)#");
+		const auto input = expected + "#here goescomment";
+		REQUIRE(utils::strip_comments(input) == expected);
+	}
+}
+
+TEST_CASE("strip_comments is not confused by nested double quotes and backticks",
+		"[utils]")
+{
+	{
+		const auto expected = std::string(R"#("`" ... ` `"` ")#");
+		const auto input = expected + "#comment";
+		REQUIRE(utils::strip_comments(input) == expected);
+	}
+
+	{
+		const auto expected = std::string(R"#(aaa ` bbb "ccc ddd" e` dd)#");
+		const auto input = expected + "# a comment string";
+		REQUIRE(utils::strip_comments(input) == expected);
+	}
+
+	{
+        const auto expected =
+			std::string(R"#(option "this `weird " command` for value")#");
+        const auto input = expected + "#and a comment";
+        REQUIRE(utils::strip_comments(input) == expected);
+	}
+}
+
 TEST_CASE(
 	"consolidate_whitespace replaces multiple consecutive"
 	"whitespace with a single space",
@@ -249,6 +378,7 @@ TEST_CASE("get_command_output()", "[utils]")
 		"a-program-that-is-guaranteed-to-not-exists"));
 	REQUIRE(utils::get_command_output(
 			"a-program-that-is-guaranteed-to-not-exists") == "");
+	REQUIRE(utils::get_command_output("echo c\" d e") == "");
 }
 
 TEST_CASE("extract_filter()", "[utils]")
author	Alexander Batischev <eual.jp@gmail.com>	2019-10-02 16:30:18 +0300
committer	GitHub <noreply@github.com>	2019-10-02 16:30:18 +0300
commit	b3d0b966b09d1ca6cdc70e8fa7ad614041ef5bff (patch)
tree	a156bbfea2697c76b0d851f8f2957151cb4409fc
parent	c63da29f70de21f045a1e40d45bcbf332ef71a4c (diff)
parent	6b080e68d6d255684321a8dca669fd850152c75b (diff)