diff options
author | Alexander Batischev <eual.jp@gmail.com> | 2022-12-25 13:28:25 +0300 |
---|---|---|
committer | Alexander Batischev <eual.jp@gmail.com> | 2022-12-25 13:34:50 +0300 |
commit | fa363edd2d61e598aff0de5848ae955a5b35d3c0 (patch) | |
tree | 7ed0db759be7b24f650c47ef4e83b6977b36685c | |
parent | 521ed1fa9028797cd927e21e7e4dcc617f11a9eb (diff) |
Do not convert feeds based on Content-Type
This reverts PRs #2214 and #2243, containing the following commits:
203bbf80206180740f24137ce0f35fcb64f2e56e
Make utils::convert_text() available to C++
5dfdc5d64d79a2c080956acf599d202b4aa2988b
Convert feed data to utf-8 if charset specifies non-"utf-8" encoding
c7fd68105d35b4c3c7e03c4013768272b6c3d015
retrieve_url(): Convert data to utf-8 if non-utf8 charset is
specified
7eb721fdfab39d64a455ec8c6d90681864e057c7
Avoid ODR violation by defining HeaderValues in anonymous namespaces
7ba7470af1635e736f2e688fb46303e3569656f0
Get rid of raw new/delete useage in parser.cpp's handle_headers
1c3e1618f780655375b465e6a1d7ef030cb4b460
Reset HTTP headers when detecting a new response
64ad6de02d7f3fd8239772c70e964895360c7524
Update rss/parser.cpp
4b0108a1a501ac69ca969c4790990c35148fd7b0
Update rss/parser.cpp
These commits led to problems with feeds that specify the same encoding
both in the Content-Type and the XML tag; these were converted to UTF-8,
but then the XML parser tried to convert them to UTF-8 again. We don't
have a good fix for this yet. The problem is tracked in #1436.
-rw-r--r-- | include/utils.h | 3 | ||||
-rw-r--r-- | rss/parser.cpp | 63 | ||||
-rw-r--r-- | rust/libnewsboat-ffi/src/utils.rs | 1 | ||||
-rw-r--r-- | src/cache.cpp | 4 | ||||
-rw-r--r-- | src/utils.cpp | 66 | ||||
-rw-r--r-- | test/utils.cpp | 206 |
6 files changed, 20 insertions, 323 deletions
diff --git a/include/utils.h b/include/utils.h index aeb1d821..c25afc47 100644 --- a/include/utils.h +++ b/include/utils.h @@ -58,9 +58,6 @@ std::string utf8_to_locale(const std::string& text); /// nl_langinfo(CODESET)) to UTF-8. std::string locale_to_utf8(const std::string& text); -std::string convert_text(const std::string& text, const std::string& tocode, - const std::string& fromcode); - std::string get_command_output(const std::string& cmd); std::string http_method_str(const HTTPMethod method); std::string link_type_str(LinkType type); diff --git a/rss/parser.cpp b/rss/parser.cpp index d370befc..8bae6e75 100644 --- a/rss/parser.cpp +++ b/rss/parser.cpp @@ -5,7 +5,6 @@ #include <curl/curl.h> #include <libxml/parser.h> #include <libxml/tree.h> -#include <vector> #include "config.h" #include "curlhandle.h" @@ -54,74 +53,51 @@ Parser::~Parser() } } -namespace { - struct HeaderValues { time_t lastmodified; std::string etag; - std::string charset; HeaderValues() + : lastmodified(0) { - reset(); - } - - void reset() - { - lastmodified = 0; - charset = "utf-8"; - etag = ""; } }; -} - static size_t handle_headers(void* ptr, size_t size, size_t nmemb, void* data) { - const auto header = std::string(reinterpret_cast<const char*>(ptr), size * nmemb); + char* header = new char[size * nmemb + 1]; HeaderValues* values = static_cast<HeaderValues*>(data); - if (header.find("HTTP/") == 0) { - // Reset headers if a new response is detected (there might be multiple responses per request in case of a redirect) - values->reset(); - } else if (header.find("Last-Modified:") == 0) { - const std::string header_value = header.substr(14); - time_t r = curl_getdate(header_value.c_str(), nullptr); + memcpy(header, ptr, size * nmemb); + header[size * nmemb] = '\0'; + + if (!strncasecmp("Last-Modified:", header, 14)) { + time_t r = curl_getdate(header + 14, nullptr); if (r == -1) { LOG(Level::DEBUG, "handle_headers: last-modified %s " "(curl_getdate " "FAILED)", - header_value.c_str()); + header + 14); } else { - values->lastmodified = r; + values->lastmodified = + curl_getdate(header + 14, nullptr); LOG(Level::DEBUG, "handle_headers: got last-modified %s (%" PRId64 ")", - header_value, + header + 14, // On GCC, `time_t` is `long int`, which is at least 32 bits. // On x86_64, it's 64 bits. Thus, this cast is either a no-op, // or an up-cast which is always safe. static_cast<int64_t>(values->lastmodified)); } - } else if (header.find("ETag:") == 0) { - values->etag = header.substr(5); + } else if (!strncasecmp("ETag:", header, 5)) { + values->etag = std::string(header + 5); utils::trim(values->etag); LOG(Level::DEBUG, "handle_headers: got etag %s", values->etag); - } else if (header.find("Content-Type:") == 0) { - const std::string key = "charset="; - const auto charset_index = header.find(key); - if (charset_index != std::string::npos) { - auto charset = header.substr(charset_index + key.size()); - utils::trim(charset); - if (charset.size() >= 2 && charset[0] == '"' && charset[charset.size() - 1] == '"') { - charset = charset.substr(1, charset.size() - 2); - } - if (charset.size() > 0) { - values->charset = charset; - } - } } + delete[] header; + return size * nmemb; } @@ -270,11 +246,10 @@ Feed Parser::parse_url(const std::string& url, buf); if (buf.length() > 0) { - LOG(Level::DEBUG, "Parser::parse_url: converting data from %s to utf-8", hdrs.charset); - const auto utf8_buf = utils::convert_text(buf, "utf-8", hdrs.charset); - - LOG(Level::DEBUG, "Parser::parse_url: handing over data to parse_buffer()"); - return parse_buffer(utf8_buf, url); + LOG(Level::DEBUG, + "Parser::parse_url: handing over data to " + "parse_buffer()"); + return parse_buffer(buf, url); } return Feed(); diff --git a/rust/libnewsboat-ffi/src/utils.rs b/rust/libnewsboat-ffi/src/utils.rs index 4d0740b9..ae397842 100644 --- a/rust/libnewsboat-ffi/src/utils.rs +++ b/rust/libnewsboat-ffi/src/utils.rs @@ -89,7 +89,6 @@ mod bridged { fn translit(tocode: &str, fromcode: &str) -> String; fn utf8_to_locale(text: &str) -> Vec<u8>; fn locale_to_utf8(text: &[u8]) -> String; - fn convert_text(text: &[u8], tocode: &str, fromcode: &str) -> Vec<u8>; } extern "C++" { diff --git a/src/cache.cpp b/src/cache.cpp index e59c8f41..2075868c 100644 --- a/src/cache.cpp +++ b/src/cache.cpp @@ -54,8 +54,6 @@ void Cache::run_sql_nothrow(const std::string& query, run_sql_impl(query, callback, callback_argument, false); } -namespace { - struct CbHandler { CbHandler() : c(-1) @@ -79,8 +77,6 @@ struct HeaderValues { std::string etag; }; -} - static int count_callback(void* handler, int argc, char** argv, char** /* azColName */) { diff --git a/src/utils.cpp b/src/utils.cpp index 683cc396..8e499376 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -21,7 +21,6 @@ #include <regex> #include <sstream> #include <stfl.h> -#include <string> #include <sys/param.h> #include <sys/types.h> #include <sys/utsname.h> @@ -31,7 +30,6 @@ #include "config.h" #include "curlhandle.h" #include "htmlrenderer.h" -#include "libnewsboat-ffi/src/utils.rs.h" #include "logger.h" #include "strprintf.h" @@ -188,19 +186,6 @@ std::string utils::locale_to_utf8(const std::string& text) return std::string(utils::bridged::locale_to_utf8(text_slice)); } -std::string utils::convert_text(const std::string& text, const std::string& tocode, - const std::string& fromcode) -{ - const auto text_slice = - rust::Slice<const unsigned char>( - reinterpret_cast<const unsigned char*>(text.c_str()), - text.length()); - - const auto result = utils::bridged::convert_text(text_slice, tocode, fromcode); - - return std::string(reinterpret_cast<const char*>(result.data()), result.size()); -} - std::string utils::get_command_output(const std::string& cmd) { return std::string(utils::bridged::get_command_output(cmd)); @@ -268,50 +253,6 @@ std::string utils::retrieve_url(const std::string& url, return retrieve_url(url, handle, cfgcont, authinfo, body, method); } -namespace { - -struct HeaderValues { - std::string charset; - - HeaderValues() - { - reset(); - } - - void reset() - { - charset = "utf-8"; - } -}; - -} - -static size_t handle_headers(void* ptr, size_t size, size_t nmemb, void* data) -{ - const auto header = std::string(reinterpret_cast<const char*>(ptr), size * nmemb); - HeaderValues* values = static_cast<HeaderValues*>(data); - - if (header.find("HTTP/") == 0) { - // Reset headers if a new response is detected (there might be multiple responses per request in case of a redirect) - values->reset(); - } else if (header.find("Content-Type:") == 0) { - const std::string key = "charset="; - const auto charset_index = header.find(key); - if (charset_index != std::string::npos) { - auto charset = header.substr(charset_index + key.size()); - utils::trim(charset); - if (charset.size() >= 2 && charset[0] == '"' && charset[charset.size() - 1] == '"') { - charset = charset.substr(1, charset.size() - 2); - } - if (charset.size() > 0) { - values->charset = charset; - } - } - } - - return size * nmemb; -} - std::string utils::retrieve_url(const std::string& url, CurlHandle& easyhandle, ConfigContainer* cfgcont, @@ -326,10 +267,6 @@ std::string utils::retrieve_url(const std::string& url, curl_easy_setopt(easyhandle.ptr(), CURLOPT_WRITEFUNCTION, my_write_data); curl_easy_setopt(easyhandle.ptr(), CURLOPT_WRITEDATA, &buf); - HeaderValues hdrs; - curl_easy_setopt(easyhandle.ptr(), CURLOPT_HEADERDATA, &hdrs); - curl_easy_setopt(easyhandle.ptr(), CURLOPT_HEADERFUNCTION, handle_headers); - switch (method) { case HTTPMethod::GET: break; @@ -389,8 +326,7 @@ std::string utils::retrieve_url(const std::string& url, // See the clobbering note above. curl_easy_setopt(easyhandle.ptr(), CURLOPT_ERRORBUFFER, NULL); - LOG(Level::DEBUG, "Parser::parse_url: converting data from %s to utf-8", hdrs.charset); - return utils::convert_text(buf, "utf-8", hdrs.charset); + return buf; } std::string utils::run_program(const char* argv[], const std::string& input) diff --git a/test/utils.cpp b/test/utils.cpp index 434b8bb4..99ba2c04 100644 --- a/test/utils.cpp +++ b/test/utils.cpp @@ -1,7 +1,6 @@ #include "utils.h" #include <algorithm> -#include <cctype> #include <chrono> #include <ctype.h> #include <fstream> @@ -2013,208 +2012,3 @@ TEST_CASE("translit() always returns the same value for the same inputs", } } } - -void verify_convert_text(const std::vector<unsigned char>& input, - const std::string& tocode, const std::string& fromcode, - const std::vector<unsigned char>& expected_output) -{ - const std::string input_str = std::string(reinterpret_cast<const char*>(input.data()), - input.size()); - const std::string expected_str = std::string(reinterpret_cast<const char*> - (expected_output.data()), expected_output.size()); - - REQUIRE(utils::convert_text(input_str, tocode, fromcode) == expected_str); -} - -TEST_CASE("convert_text() returns input string if fromcode and tocode are the same", - "[utils]") -{ - std::vector<unsigned char> input = { - // \x81 is not valid UTF-8 - 0x81, 0x13, 0x41, - }; - - std::vector<unsigned char> expected = { - 0x3f, 0x13, 0x41, - }; - - verify_convert_text(input, "UTF-8", "UTF-8", expected); -} - -TEST_CASE("convert_text() replaces incomplete multibyte sequences with a question mark: utf8 to utf16le", - "[utils]") -{ - std::vector<unsigned char> input = { - // "ой", "oops" in Russian, but the last byte is missing - 0xd0, 0xbe, 0xd0, - }; - - std::vector<unsigned char> expected = { - 0x3e, 0x04, 0x3f, 0x00, - }; - - verify_convert_text(input, "UTF-16LE", "UTF-8", expected); -} - -TEST_CASE("convert_text() replaces incomplete multibyte sequences with a question mark: utf16le to utf8", - "[utils]") -{ - SECTION("input includes zero byte") { - std::vector<unsigned char> input = { - // "hi", but the last byte is missing - 0x68, 0x00, 0x69, - }; - - std::vector<unsigned char> expected = { - 0x68, 0x3f, - }; - - verify_convert_text(input, "UTF-8", "UTF-16LE", expected); - } - - SECTION("input does not include zero byte") { - std::vector<unsigned char> input = { - // "эй", "hey" in Russian, but the last byte is missing - 0x4d, 0x04, 0x39, - }; - - std::vector<unsigned char> expected = { - 0xd1, 0x8d, 0x3f, - }; - - verify_convert_text(input, "UTF-8", "UTF-16LE", expected); - } -} - -TEST_CASE("convert_text() replaces invalid multibyte sequences with a question mark: utf8 to utf16le", - "[utils]") -{ - std::vector<unsigned char> input = { - // "日本", "Japan", but the third byte of the first character (0xa5) is - // missing, making the whole first character an illegal sequence. - 0xe6, 0x97, 0xe6, 0x9c, 0xac, - }; - - std::vector<unsigned char> expected = { - 0x3f, 0x00, 0x3f, 0x00, 0x2c, 0x67, - }; - - verify_convert_text(input, "UTF-16LE", "UTF-8", expected); -} - -TEST_CASE("convert_text() replaces invalid multibyte sequences with a question mark: utf16le to utf8", - "[utils]") -{ - std::vector<unsigned char> input = { - // The first two bytes here are part of a surrogate pair, i.e. they - // imply that the next two bytes encode additional info. However, the - // next two bytes are an ordinary character. This breaks the decoding - // process, so some things get turned into a question mark while others - // are decoded incorrectly. - 0x01, 0xd8, 0xd7, 0x03, - }; - - std::vector<unsigned char> expected = { - 0x3f, 0xed, 0x9f, 0x98, 0x3f, - }; - - verify_convert_text(input, "UTF-8", "UTF-16LE", expected); -} - -TEST_CASE("convert_text() converts text between encodings: utf8 to utf16le", "[utils]") -{ - std::vector<unsigned char> input = { - // "Тестирую", "Testing" in Russian. - 0xd0, 0xa2, 0xd0, 0xb5, 0xd1, 0x81, 0xd1, 0x82, 0xd0, 0xb8, 0xd1, 0x80, 0xd1, 0x83, - 0xd1, 0x8e, - }; - - std::vector<unsigned char> expected = { - 0x22, 0x04, 0x35, 0x04, 0x41, 0x04, 0x42, 0x04, 0x38, 0x04, 0x40, 0x04, 0x43, 0x04, - 0x4e, 0x04, - }; - - verify_convert_text(input, "UTF-16LE", "UTF-8", expected); -} - -TEST_CASE("convert_text() converts text between encodings: utf8 to koi8r", "[utils]") -{ - std::vector<unsigned char> input = { - // "Проверка", "Check" in Russian. - 0xd0, 0x9f, 0xd1, 0x80, 0xd0, 0xbe, 0xd0, 0xb2, 0xd0, 0xb5, 0xd1, 0x80, 0xd0, 0xba, - 0xd0, 0xb0, - }; - - std::vector<unsigned char> expected = { - 0xf0, 0xd2, 0xcf, 0xd7, 0xc5, 0xd2, 0xcb, 0xc1, - }; - - verify_convert_text(input, "KOI8-R", "UTF-8", expected); -} - -TEST_CASE("convert_text() converts text between encodings: utf8 to ISO-8859-1", "[utils]") -{ - // Some symbols in the result will be transliterated. - - std::vector<unsigned char> input = { - // "вау °±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃ": a mix of Cyrillic (unsupported by - // ISO-8859-1) and ISO-8859-1 characters. - 0xd0, 0xb2, 0xd0, 0xb0, 0xd1, 0x83, 0x20, 0xc2, 0xb0, 0xc2, 0xb1, 0xc2, 0xb2, 0xc2, - 0xb3, 0xc2, 0xb4, 0xc2, 0xb5, 0xc2, 0xb6, 0xc2, 0xb7, 0xc2, 0xb8, 0xc2, 0xb9, 0xc2, - 0xba, 0xc2, 0xbb, 0xc2, 0xbc, 0xc2, 0xbd, 0xc2, 0xbe, 0xc2, 0xbf, 0xc3, 0x80, 0xc3, - 0x81, 0xc3, 0x82, 0xc3, 0x83, - }; - - const std::string input_str = std::string(reinterpret_cast<const char*>(input.data()), - input.size()); - - const auto result = utils::convert_text(input_str, "ISO-8859-1", "UTF-8"); - - // We can't spell out an expected result because different platforms - // might follow different transliteration rules. - REQUIRE(result != ""); - REQUIRE(result != input_str); -} - -TEST_CASE("convert_text() converts text between encodings: utf16le to utf8", "[utils]") -{ - std::vector<unsigned char> input = { - // "Успех", "Success" in Russian. - 0xff, 0xfe, 0x23, 0x04, 0x41, 0x04, 0x3f, 0x04, 0x35, 0x04, 0x45, 0x04, - }; - - std::vector<unsigned char> expected = { - 0xef, 0xbb, 0xbf, 0xd0, 0xa3, 0xd1, 0x81, 0xd0, 0xbf, 0xd0, 0xb5, 0xd1, 0x85, - }; - - verify_convert_text(input, "UTF-8", "UTF-16LE", expected); -} - -TEST_CASE("convert_text() converts text between encodings: koi8r to utf8", "[utils]") -{ - std::vector<unsigned char> input = { - // "История", "History" in Russian. - 0xe9, 0xd3, 0xd4, 0xcf, 0xd2, 0xc9, 0xd1, - }; - - std::vector<unsigned char> expected = { - 0xd0, 0x98, 0xd1, 0x81, 0xd1, 0x82, 0xd0, 0xbe, 0xd1, 0x80, 0xd0, 0xb8, 0xd1, 0x8f, - }; - - verify_convert_text(input, "UTF-8", "KOI8-R", expected); -} - -TEST_CASE("convert_text() converts text between encodings: ISO-8859-1 to utf8", "[utils]") -{ - std::vector<unsigned char> input = { - // "ÄÅÆÇÈÉÊËÌÍÎÏ": some umlauts and Latin letters. - 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, - }; - - std::vector<unsigned char> expected = { - 0xc3, 0x84, 0xc3, 0x85, 0xc3, 0x86, 0xc3, 0x87, 0xc3, 0x88, 0xc3, 0x89, 0xc3, 0x8a, - 0xc3, 0x8b, 0xc3, 0x8c, 0xc3, 0x8d, 0xc3, 0x8e, 0xc3, 0x8f, - }; - - verify_convert_text(input, "UTF-8", "ISO-8859-1", expected); -} |