Do not convert feeds based on Content-Type

This reverts PRs #2214 and #2243, containing the following commits: 203bbf80206180740f24137ce0f35fcb64f2e56e Make utils::convert_text() available to C++ 5dfdc5d64d79a2c080956acf599d202b4aa2988b Convert feed data to utf-8 if charset specifies non-"utf-8" encoding c7fd68105d35b4c3c7e03c4013768272b6c3d015 retrieve_url(): Convert data to utf-8 if non-utf8 charset is specified 7eb721fdfab39d64a455ec8c6d90681864e057c7 Avoid ODR violation by defining HeaderValues in anonymous namespaces 7ba7470af1635e736f2e688fb46303e3569656f0 Get rid of raw new/delete useage in parser.cpp's handle_headers 1c3e1618f780655375b465e6a1d7ef030cb4b460 Reset HTTP headers when detecting a new response 64ad6de02d7f3fd8239772c70e964895360c7524 Update rss/parser.cpp 4b0108a1a501ac69ca969c4790990c35148fd7b0 Update rss/parser.cpp These commits led to problems with feeds that specify the same encoding both in the Content-Type and the XML tag; these were converted to UTF-8, but then the XML parser tried to convert them to UTF-8 again. We don't have a good fix for this yet. The problem is tracked in #1436.
author: Alexander Batischev <eual.jp@gmail.com> 2022-12-25 13:28:25 +0300
committer: Alexander Batischev <eual.jp@gmail.com> 2022-12-25 13:34:50 +0300
commit: fa363edd2d61e598aff0de5848ae955a5b35d3c0 (patch)
tree: 7ed0db759be7b24f650c47ef4e83b6977b36685c
parent: 521ed1fa9028797cd927e21e7e4dcc617f11a9eb (diff)
6 files changed, 20 insertions, 323 deletions
diff --git a/include/utils.h b/include/utils.h
index aeb1d821..c25afc47 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -58,9 +58,6 @@ std::string utf8_to_locale(const std::string& text);
 /// nl_langinfo(CODESET)) to UTF-8.
 std::string locale_to_utf8(const std::string& text);
 
-std::string convert_text(const std::string& text, const std::string& tocode,
-	const std::string& fromcode);
-
 std::string get_command_output(const std::string& cmd);
 std::string http_method_str(const HTTPMethod method);
 std::string link_type_str(LinkType type);
diff --git a/rss/parser.cpp b/rss/parser.cpp
index d370befc..8bae6e75 100644
--- a/rss/parser.cpp
+++ b/rss/parser.cpp
@@ -5,7 +5,6 @@
 #include <curl/curl.h>
 #include <libxml/parser.h>
 #include <libxml/tree.h>
-#include <vector>
 
 #include "config.h"
 #include "curlhandle.h"
@@ -54,74 +53,51 @@ Parser::~Parser()
 	}
 }
 
-namespace {
-
 struct HeaderValues {
 	time_t lastmodified;
 	std::string etag;
-	std::string charset;
 
 	HeaderValues()
+		: lastmodified(0)
 	{
-		reset();
-	}
-
-	void reset()
-	{
-		lastmodified = 0;
-		charset = "utf-8";
-		etag = "";
 	}
 };
 
-}
-
 static size_t handle_headers(void* ptr, size_t size, size_t nmemb, void* data)
 {
-	const auto header = std::string(reinterpret_cast<const char*>(ptr), size * nmemb);
+	char* header = new char[size * nmemb + 1];
 	HeaderValues* values = static_cast<HeaderValues*>(data);
 
-	if (header.find("HTTP/") == 0) {
-		// Reset headers if a new response is detected (there might be multiple responses per request in case of a redirect)
-		values->reset();
-	} else if (header.find("Last-Modified:") == 0) {
-		const std::string header_value = header.substr(14);
-		time_t r = curl_getdate(header_value.c_str(), nullptr);
+	memcpy(header, ptr, size * nmemb);
+	header[size * nmemb] = '\0';
+
+	if (!strncasecmp("Last-Modified:", header, 14)) {
+		time_t r = curl_getdate(header + 14, nullptr);
 		if (r == -1) {
 			LOG(Level::DEBUG,
 				"handle_headers: last-modified %s "
 				"(curl_getdate "
 				"FAILED)",
-				header_value.c_str());
+				header + 14);
 		} else {
-			values->lastmodified = r;
+			values->lastmodified =
+				curl_getdate(header + 14, nullptr);
 			LOG(Level::DEBUG,
 				"handle_headers: got last-modified %s (%" PRId64 ")",
-				header_value,
+				header + 14,
 				// On GCC, `time_t` is `long int`, which is at least 32 bits.
 				// On x86_64, it's 64 bits. Thus, this cast is either a no-op,
 				// or an up-cast which is always safe.
 				static_cast<int64_t>(values->lastmodified));
 		}
-	} else if (header.find("ETag:") == 0) {
-		values->etag = header.substr(5);
+	} else if (!strncasecmp("ETag:", header, 5)) {
+		values->etag = std::string(header + 5);
 		utils::trim(values->etag);
 		LOG(Level::DEBUG, "handle_headers: got etag %s", values->etag);
-	} else if (header.find("Content-Type:") == 0) {
-		const std::string key = "charset=";
-		const auto charset_index = header.find(key);
-		if (charset_index != std::string::npos) {
-			auto charset = header.substr(charset_index + key.size());
-			utils::trim(charset);
-			if (charset.size() >= 2 && charset[0] == '"' && charset[charset.size() - 1] == '"') {
-				charset = charset.substr(1, charset.size() - 2);
-			}
-			if (charset.size() > 0) {
-				values->charset = charset;
-			}
-		}
 	}
 
+	delete[] header;
+
 	return size * nmemb;
 }
 
@@ -270,11 +246,10 @@ Feed Parser::parse_url(const std::string& url,
 		buf);
 
 	if (buf.length() > 0) {
-		LOG(Level::DEBUG, "Parser::parse_url: converting data from %s to utf-8", hdrs.charset);
-		const auto utf8_buf = utils::convert_text(buf, "utf-8", hdrs.charset);
-
-		LOG(Level::DEBUG, "Parser::parse_url: handing over data to parse_buffer()");
-		return parse_buffer(utf8_buf, url);
+		LOG(Level::DEBUG,
+			"Parser::parse_url: handing over data to "
+			"parse_buffer()");
+		return parse_buffer(buf, url);
 	}
 
 	return Feed();
diff --git a/rust/libnewsboat-ffi/src/utils.rs b/rust/libnewsboat-ffi/src/utils.rs
index 4d0740b9..ae397842 100644
--- a/rust/libnewsboat-ffi/src/utils.rs
+++ b/rust/libnewsboat-ffi/src/utils.rs
@@ -89,7 +89,6 @@ mod bridged {
         fn translit(tocode: &str, fromcode: &str) -> String;
         fn utf8_to_locale(text: &str) -> Vec<u8>;
         fn locale_to_utf8(text: &[u8]) -> String;
-        fn convert_text(text: &[u8], tocode: &str, fromcode: &str) -> Vec<u8>;
     }
 
     extern "C++" {
diff --git a/src/cache.cpp b/src/cache.cpp
index e59c8f41..2075868c 100644
--- a/src/cache.cpp
+++ b/src/cache.cpp
@@ -54,8 +54,6 @@ void Cache::run_sql_nothrow(const std::string& query,
 	run_sql_impl(query, callback, callback_argument, false);
 }
 
-namespace {
-
 struct CbHandler {
 	CbHandler()
 		: c(-1)
@@ -79,8 +77,6 @@ struct HeaderValues {
 	std::string etag;
 };
 
-}
-
 static int count_callback(void* handler, int argc, char** argv,
 	char** /* azColName */)
 {
diff --git a/src/utils.cpp b/src/utils.cpp
index 683cc396..8e499376 100644
--- a/src/utils.cpp
+++ b/src/utils.cpp
@@ -21,7 +21,6 @@
 #include <regex>
 #include <sstream>
 #include <stfl.h>
-#include <string>
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/utsname.h>
@@ -31,7 +30,6 @@
 #include "config.h"
 #include "curlhandle.h"
 #include "htmlrenderer.h"
-#include "libnewsboat-ffi/src/utils.rs.h"
 #include "logger.h"
 #include "strprintf.h"
 
@@ -188,19 +186,6 @@ std::string utils::locale_to_utf8(const std::string& text)
 	return std::string(utils::bridged::locale_to_utf8(text_slice));
 }
 
-std::string utils::convert_text(const std::string& text, const std::string& tocode,
-	const std::string& fromcode)
-{
-	const auto text_slice =
-		rust::Slice<const unsigned char>(
-			reinterpret_cast<const unsigned char*>(text.c_str()),
-			text.length());
-
-	const auto result = utils::bridged::convert_text(text_slice, tocode, fromcode);
-
-	return std::string(reinterpret_cast<const char*>(result.data()), result.size());
-}
-
 std::string utils::get_command_output(const std::string& cmd)
 {
 	return std::string(utils::bridged::get_command_output(cmd));
@@ -268,50 +253,6 @@ std::string utils::retrieve_url(const std::string& url,
 	return retrieve_url(url, handle, cfgcont, authinfo, body, method);
 }
 
-namespace {
-
-struct HeaderValues {
-	std::string charset;
-
-	HeaderValues()
-	{
-		reset();
-	}
-
-	void reset()
-	{
-		charset = "utf-8";
-	}
-};
-
-}
-
-static size_t handle_headers(void* ptr, size_t size, size_t nmemb, void* data)
-{
-	const auto header = std::string(reinterpret_cast<const char*>(ptr), size * nmemb);
-	HeaderValues* values = static_cast<HeaderValues*>(data);
-
-	if (header.find("HTTP/") == 0) {
-		// Reset headers if a new response is detected (there might be multiple responses per request in case of a redirect)
-		values->reset();
-	} else if (header.find("Content-Type:") == 0) {
-		const std::string key = "charset=";
-		const auto charset_index = header.find(key);
-		if (charset_index != std::string::npos) {
-			auto charset = header.substr(charset_index + key.size());
-			utils::trim(charset);
-			if (charset.size() >= 2 && charset[0] == '"' && charset[charset.size() - 1] == '"') {
-				charset = charset.substr(1, charset.size() - 2);
-			}
-			if (charset.size() > 0) {
-				values->charset = charset;
-			}
-		}
-	}
-
-	return size * nmemb;
-}
-
 std::string utils::retrieve_url(const std::string& url,
 	CurlHandle& easyhandle,
 	ConfigContainer* cfgcont,
@@ -326,10 +267,6 @@ std::string utils::retrieve_url(const std::string& url,
 	curl_easy_setopt(easyhandle.ptr(), CURLOPT_WRITEFUNCTION, my_write_data);
 	curl_easy_setopt(easyhandle.ptr(), CURLOPT_WRITEDATA, &buf);
 
-	HeaderValues hdrs;
-	curl_easy_setopt(easyhandle.ptr(), CURLOPT_HEADERDATA, &hdrs);
-	curl_easy_setopt(easyhandle.ptr(), CURLOPT_HEADERFUNCTION, handle_headers);
-
 	switch (method) {
 	case HTTPMethod::GET:
 		break;
@@ -389,8 +326,7 @@ std::string utils::retrieve_url(const std::string& url,
 	// See the clobbering note above.
 	curl_easy_setopt(easyhandle.ptr(), CURLOPT_ERRORBUFFER, NULL);
 
-	LOG(Level::DEBUG, "Parser::parse_url: converting data from %s to utf-8", hdrs.charset);
-	return utils::convert_text(buf, "utf-8", hdrs.charset);
+	return buf;
 }
 
 std::string utils::run_program(const char* argv[], const std::string& input)
diff --git a/test/utils.cpp b/test/utils.cpp
index 434b8bb4..99ba2c04 100644
--- a/test/utils.cpp
+++ b/test/utils.cpp
@@ -1,7 +1,6 @@
 #include "utils.h"
 
 #include <algorithm>
-#include <cctype>
 #include <chrono>
 #include <ctype.h>
 #include <fstream>
@@ -2013,208 +2012,3 @@ TEST_CASE("translit() always returns the same value for the same inputs",
 		}
 	}
 }
-
-void verify_convert_text(const std::vector<unsigned char>& input,
-	const std::string& tocode, const std::string& fromcode,
-	const std::vector<unsigned char>& expected_output)
-{
-	const std::string input_str = std::string(reinterpret_cast<const char*>(input.data()),
-			input.size());
-	const std::string expected_str = std::string(reinterpret_cast<const char*>
-			(expected_output.data()), expected_output.size());
-
-	REQUIRE(utils::convert_text(input_str, tocode, fromcode) == expected_str);
-}
-
-TEST_CASE("convert_text() returns input string if fromcode and tocode are the same",
-	"[utils]")
-{
-	std::vector<unsigned char> input = {
-		// \x81 is not valid UTF-8
-		0x81, 0x13, 0x41,
-	};
-
-	std::vector<unsigned char> expected = {
-		0x3f, 0x13, 0x41,
-	};
-
-	verify_convert_text(input, "UTF-8", "UTF-8", expected);
-}
-
-TEST_CASE("convert_text() replaces incomplete multibyte sequences with a question mark: utf8 to utf16le",
-	"[utils]")
-{
-	std::vector<unsigned char> input = {
-		// "ой", "oops" in Russian, but the last byte is missing
-		0xd0, 0xbe, 0xd0,
-	};
-
-	std::vector<unsigned char> expected = {
-		0x3e, 0x04, 0x3f, 0x00,
-	};
-
-	verify_convert_text(input, "UTF-16LE", "UTF-8", expected);
-}
-
-TEST_CASE("convert_text() replaces incomplete multibyte sequences with a question mark: utf16le to utf8",
-	"[utils]")
-{
-	SECTION("input includes zero byte") {
-		std::vector<unsigned char> input = {
-			// "hi", but the last byte is missing
-			0x68, 0x00, 0x69,
-		};
-
-		std::vector<unsigned char> expected = {
-			0x68, 0x3f,
-		};
-
-		verify_convert_text(input, "UTF-8", "UTF-16LE", expected);
-	}
-
-	SECTION("input does not include zero byte") {
-		std::vector<unsigned char> input = {
-			// "эй", "hey" in Russian, but the last byte is missing
-			0x4d, 0x04, 0x39,
-		};
-
-		std::vector<unsigned char> expected = {
-			0xd1, 0x8d, 0x3f,
-		};
-
-		verify_convert_text(input, "UTF-8", "UTF-16LE", expected);
-	}
-}
-
-TEST_CASE("convert_text() replaces invalid multibyte sequences with a question mark: utf8 to utf16le",
-	"[utils]")
-{
-	std::vector<unsigned char> input = {
-		// "日本", "Japan", but the third byte of the first character (0xa5) is
-		// missing, making the whole first character an illegal sequence.
-		0xe6, 0x97, 0xe6, 0x9c, 0xac,
-	};
-
-	std::vector<unsigned char> expected = {
-		0x3f, 0x00, 0x3f, 0x00, 0x2c, 0x67,
-	};
-
-	verify_convert_text(input, "UTF-16LE", "UTF-8", expected);
-}
-
-TEST_CASE("convert_text() replaces invalid multibyte sequences with a question mark: utf16le to utf8",
-	"[utils]")
-{
-	std::vector<unsigned char> input = {
-		// The first two bytes here are part of a surrogate pair, i.e. they
-		// imply that the next two bytes encode additional info. However, the
-		// next two bytes are an ordinary character. This breaks the decoding
-		// process, so some things get turned into a question mark while others
-		// are decoded incorrectly.
-		0x01, 0xd8, 0xd7, 0x03,
-	};
-
-	std::vector<unsigned char> expected = {
-		0x3f, 0xed, 0x9f, 0x98, 0x3f,
-	};
-
-	verify_convert_text(input, "UTF-8", "UTF-16LE", expected);
-}
-
-TEST_CASE("convert_text() converts text between encodings: utf8 to utf16le", "[utils]")
-{
-	std::vector<unsigned char> input = {
-		// "Тестирую", "Testing" in Russian.
-		0xd0, 0xa2, 0xd0, 0xb5, 0xd1, 0x81, 0xd1, 0x82, 0xd0, 0xb8, 0xd1, 0x80, 0xd1, 0x83,
-		0xd1, 0x8e,
-	};
-
-	std::vector<unsigned char> expected = {
-		0x22, 0x04, 0x35, 0x04, 0x41, 0x04, 0x42, 0x04, 0x38, 0x04, 0x40, 0x04, 0x43, 0x04,
-		0x4e, 0x04,
-	};
-
-	verify_convert_text(input, "UTF-16LE", "UTF-8", expected);
-}
-
-TEST_CASE("convert_text() converts text between encodings: utf8 to koi8r", "[utils]")
-{
-	std::vector<unsigned char> input = {
-		// "Проверка", "Check" in Russian.
-		0xd0, 0x9f, 0xd1, 0x80, 0xd0, 0xbe, 0xd0, 0xb2, 0xd0, 0xb5, 0xd1, 0x80, 0xd0, 0xba,
-		0xd0, 0xb0,
-	};
-
-	std::vector<unsigned char> expected = {
-		0xf0, 0xd2, 0xcf, 0xd7, 0xc5, 0xd2, 0xcb, 0xc1,
-	};
-
-	verify_convert_text(input, "KOI8-R", "UTF-8", expected);
-}
-
-TEST_CASE("convert_text() converts text between encodings: utf8 to ISO-8859-1", "[utils]")
-{
-	// Some symbols in the result will be transliterated.
-
-	std::vector<unsigned char> input = {
-		// "вау °±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃ": a mix of Cyrillic (unsupported by
-		// ISO-8859-1) and ISO-8859-1 characters.
-		0xd0, 0xb2, 0xd0, 0xb0, 0xd1, 0x83, 0x20, 0xc2, 0xb0, 0xc2, 0xb1, 0xc2, 0xb2, 0xc2,
-		0xb3, 0xc2, 0xb4, 0xc2, 0xb5, 0xc2, 0xb6, 0xc2, 0xb7, 0xc2, 0xb8, 0xc2, 0xb9, 0xc2,
-		0xba, 0xc2, 0xbb, 0xc2, 0xbc, 0xc2, 0xbd, 0xc2, 0xbe, 0xc2, 0xbf, 0xc3, 0x80, 0xc3,
-		0x81, 0xc3, 0x82, 0xc3, 0x83,
-	};
-
-	const std::string input_str = std::string(reinterpret_cast<const char*>(input.data()),
-			input.size());
-
-	const auto result = utils::convert_text(input_str, "ISO-8859-1", "UTF-8");
-
-	// We can't spell out an expected result because different platforms
-	// might follow different transliteration rules.
-	REQUIRE(result != "");
-	REQUIRE(result != input_str);
-}
-
-TEST_CASE("convert_text() converts text between encodings: utf16le to utf8", "[utils]")
-{
-	std::vector<unsigned char> input = {
-		// "Успех", "Success" in Russian.
-		0xff, 0xfe, 0x23, 0x04, 0x41, 0x04, 0x3f, 0x04, 0x35, 0x04, 0x45, 0x04,
-	};
-
-	std::vector<unsigned char> expected = {
-		0xef, 0xbb, 0xbf, 0xd0, 0xa3, 0xd1, 0x81, 0xd0, 0xbf, 0xd0, 0xb5, 0xd1, 0x85,
-	};
-
-	verify_convert_text(input, "UTF-8", "UTF-16LE", expected);
-}
-
-TEST_CASE("convert_text() converts text between encodings: koi8r to utf8", "[utils]")
-{
-	std::vector<unsigned char> input = {
-		// "История", "History" in Russian.
-		0xe9, 0xd3, 0xd4, 0xcf, 0xd2, 0xc9, 0xd1,
-	};
-
-	std::vector<unsigned char> expected = {
-		0xd0, 0x98, 0xd1, 0x81, 0xd1, 0x82, 0xd0, 0xbe, 0xd1, 0x80, 0xd0, 0xb8, 0xd1, 0x8f,
-	};
-
-	verify_convert_text(input, "UTF-8", "KOI8-R", expected);
-}
-
-TEST_CASE("convert_text() converts text between encodings: ISO-8859-1 to utf8", "[utils]")
-{
-	std::vector<unsigned char> input = {
-		// "ÄÅÆÇÈÉÊËÌÍÎÏ": some umlauts and Latin letters.
-		0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
-	};
-
-	std::vector<unsigned char> expected = {
-		0xc3, 0x84, 0xc3, 0x85, 0xc3, 0x86, 0xc3, 0x87, 0xc3, 0x88, 0xc3, 0x89, 0xc3, 0x8a,
-		0xc3, 0x8b, 0xc3, 0x8c, 0xc3, 0x8d, 0xc3, 0x8e, 0xc3, 0x8f,
-	};
-
-	verify_convert_text(input, "UTF-8", "ISO-8859-1", expected);
-}
author	Alexander Batischev <eual.jp@gmail.com>	2022-12-25 13:28:25 +0300
committer	Alexander Batischev <eual.jp@gmail.com>	2022-12-25 13:34:50 +0300
commit	fa363edd2d61e598aff0de5848ae955a5b35d3c0 (patch)
tree	7ed0db759be7b24f650c47ef4e83b6977b36685c
parent	521ed1fa9028797cd927e21e7e4dcc617f11a9eb (diff)