summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAlexander Batischev <eual.jp@gmail.com>2022-12-25 13:28:25 +0300
committerAlexander Batischev <eual.jp@gmail.com>2022-12-25 13:34:50 +0300
commitfa363edd2d61e598aff0de5848ae955a5b35d3c0 (patch)
tree7ed0db759be7b24f650c47ef4e83b6977b36685c
parent521ed1fa9028797cd927e21e7e4dcc617f11a9eb (diff)
Do not convert feeds based on Content-Type
This reverts PRs #2214 and #2243, containing the following commits: 203bbf80206180740f24137ce0f35fcb64f2e56e Make utils::convert_text() available to C++ 5dfdc5d64d79a2c080956acf599d202b4aa2988b Convert feed data to utf-8 if charset specifies non-"utf-8" encoding c7fd68105d35b4c3c7e03c4013768272b6c3d015 retrieve_url(): Convert data to utf-8 if non-utf8 charset is specified 7eb721fdfab39d64a455ec8c6d90681864e057c7 Avoid ODR violation by defining HeaderValues in anonymous namespaces 7ba7470af1635e736f2e688fb46303e3569656f0 Get rid of raw new/delete useage in parser.cpp's handle_headers 1c3e1618f780655375b465e6a1d7ef030cb4b460 Reset HTTP headers when detecting a new response 64ad6de02d7f3fd8239772c70e964895360c7524 Update rss/parser.cpp 4b0108a1a501ac69ca969c4790990c35148fd7b0 Update rss/parser.cpp These commits led to problems with feeds that specify the same encoding both in the Content-Type and the XML tag; these were converted to UTF-8, but then the XML parser tried to convert them to UTF-8 again. We don't have a good fix for this yet. The problem is tracked in #1436.
-rw-r--r--include/utils.h3
-rw-r--r--rss/parser.cpp63
-rw-r--r--rust/libnewsboat-ffi/src/utils.rs1
-rw-r--r--src/cache.cpp4
-rw-r--r--src/utils.cpp66
-rw-r--r--test/utils.cpp206
6 files changed, 20 insertions, 323 deletions
diff --git a/include/utils.h b/include/utils.h
index aeb1d821..c25afc47 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -58,9 +58,6 @@ std::string utf8_to_locale(const std::string& text);
/// nl_langinfo(CODESET)) to UTF-8.
std::string locale_to_utf8(const std::string& text);
-std::string convert_text(const std::string& text, const std::string& tocode,
- const std::string& fromcode);
-
std::string get_command_output(const std::string& cmd);
std::string http_method_str(const HTTPMethod method);
std::string link_type_str(LinkType type);
diff --git a/rss/parser.cpp b/rss/parser.cpp
index d370befc..8bae6e75 100644
--- a/rss/parser.cpp
+++ b/rss/parser.cpp
@@ -5,7 +5,6 @@
#include <curl/curl.h>
#include <libxml/parser.h>
#include <libxml/tree.h>
-#include <vector>
#include "config.h"
#include "curlhandle.h"
@@ -54,74 +53,51 @@ Parser::~Parser()
}
}
-namespace {
-
struct HeaderValues {
time_t lastmodified;
std::string etag;
- std::string charset;
HeaderValues()
+ : lastmodified(0)
{
- reset();
- }
-
- void reset()
- {
- lastmodified = 0;
- charset = "utf-8";
- etag = "";
}
};
-}
-
static size_t handle_headers(void* ptr, size_t size, size_t nmemb, void* data)
{
- const auto header = std::string(reinterpret_cast<const char*>(ptr), size * nmemb);
+ char* header = new char[size * nmemb + 1];
HeaderValues* values = static_cast<HeaderValues*>(data);
- if (header.find("HTTP/") == 0) {
- // Reset headers if a new response is detected (there might be multiple responses per request in case of a redirect)
- values->reset();
- } else if (header.find("Last-Modified:") == 0) {
- const std::string header_value = header.substr(14);
- time_t r = curl_getdate(header_value.c_str(), nullptr);
+ memcpy(header, ptr, size * nmemb);
+ header[size * nmemb] = '\0';
+
+ if (!strncasecmp("Last-Modified:", header, 14)) {
+ time_t r = curl_getdate(header + 14, nullptr);
if (r == -1) {
LOG(Level::DEBUG,
"handle_headers: last-modified %s "
"(curl_getdate "
"FAILED)",
- header_value.c_str());
+ header + 14);
} else {
- values->lastmodified = r;
+ values->lastmodified =
+ curl_getdate(header + 14, nullptr);
LOG(Level::DEBUG,
"handle_headers: got last-modified %s (%" PRId64 ")",
- header_value,
+ header + 14,
// On GCC, `time_t` is `long int`, which is at least 32 bits.
// On x86_64, it's 64 bits. Thus, this cast is either a no-op,
// or an up-cast which is always safe.
static_cast<int64_t>(values->lastmodified));
}
- } else if (header.find("ETag:") == 0) {
- values->etag = header.substr(5);
+ } else if (!strncasecmp("ETag:", header, 5)) {
+ values->etag = std::string(header + 5);
utils::trim(values->etag);
LOG(Level::DEBUG, "handle_headers: got etag %s", values->etag);
- } else if (header.find("Content-Type:") == 0) {
- const std::string key = "charset=";
- const auto charset_index = header.find(key);
- if (charset_index != std::string::npos) {
- auto charset = header.substr(charset_index + key.size());
- utils::trim(charset);
- if (charset.size() >= 2 && charset[0] == '"' && charset[charset.size() - 1] == '"') {
- charset = charset.substr(1, charset.size() - 2);
- }
- if (charset.size() > 0) {
- values->charset = charset;
- }
- }
}
+ delete[] header;
+
return size * nmemb;
}
@@ -270,11 +246,10 @@ Feed Parser::parse_url(const std::string& url,
buf);
if (buf.length() > 0) {
- LOG(Level::DEBUG, "Parser::parse_url: converting data from %s to utf-8", hdrs.charset);
- const auto utf8_buf = utils::convert_text(buf, "utf-8", hdrs.charset);
-
- LOG(Level::DEBUG, "Parser::parse_url: handing over data to parse_buffer()");
- return parse_buffer(utf8_buf, url);
+ LOG(Level::DEBUG,
+ "Parser::parse_url: handing over data to "
+ "parse_buffer()");
+ return parse_buffer(buf, url);
}
return Feed();
diff --git a/rust/libnewsboat-ffi/src/utils.rs b/rust/libnewsboat-ffi/src/utils.rs
index 4d0740b9..ae397842 100644
--- a/rust/libnewsboat-ffi/src/utils.rs
+++ b/rust/libnewsboat-ffi/src/utils.rs
@@ -89,7 +89,6 @@ mod bridged {
fn translit(tocode: &str, fromcode: &str) -> String;
fn utf8_to_locale(text: &str) -> Vec<u8>;
fn locale_to_utf8(text: &[u8]) -> String;
- fn convert_text(text: &[u8], tocode: &str, fromcode: &str) -> Vec<u8>;
}
extern "C++" {
diff --git a/src/cache.cpp b/src/cache.cpp
index e59c8f41..2075868c 100644
--- a/src/cache.cpp
+++ b/src/cache.cpp
@@ -54,8 +54,6 @@ void Cache::run_sql_nothrow(const std::string& query,
run_sql_impl(query, callback, callback_argument, false);
}
-namespace {
-
struct CbHandler {
CbHandler()
: c(-1)
@@ -79,8 +77,6 @@ struct HeaderValues {
std::string etag;
};
-}
-
static int count_callback(void* handler, int argc, char** argv,
char** /* azColName */)
{
diff --git a/src/utils.cpp b/src/utils.cpp
index 683cc396..8e499376 100644
--- a/src/utils.cpp
+++ b/src/utils.cpp
@@ -21,7 +21,6 @@
#include <regex>
#include <sstream>
#include <stfl.h>
-#include <string>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/utsname.h>
@@ -31,7 +30,6 @@
#include "config.h"
#include "curlhandle.h"
#include "htmlrenderer.h"
-#include "libnewsboat-ffi/src/utils.rs.h"
#include "logger.h"
#include "strprintf.h"
@@ -188,19 +186,6 @@ std::string utils::locale_to_utf8(const std::string& text)
return std::string(utils::bridged::locale_to_utf8(text_slice));
}
-std::string utils::convert_text(const std::string& text, const std::string& tocode,
- const std::string& fromcode)
-{
- const auto text_slice =
- rust::Slice<const unsigned char>(
- reinterpret_cast<const unsigned char*>(text.c_str()),
- text.length());
-
- const auto result = utils::bridged::convert_text(text_slice, tocode, fromcode);
-
- return std::string(reinterpret_cast<const char*>(result.data()), result.size());
-}
-
std::string utils::get_command_output(const std::string& cmd)
{
return std::string(utils::bridged::get_command_output(cmd));
@@ -268,50 +253,6 @@ std::string utils::retrieve_url(const std::string& url,
return retrieve_url(url, handle, cfgcont, authinfo, body, method);
}
-namespace {
-
-struct HeaderValues {
- std::string charset;
-
- HeaderValues()
- {
- reset();
- }
-
- void reset()
- {
- charset = "utf-8";
- }
-};
-
-}
-
-static size_t handle_headers(void* ptr, size_t size, size_t nmemb, void* data)
-{
- const auto header = std::string(reinterpret_cast<const char*>(ptr), size * nmemb);
- HeaderValues* values = static_cast<HeaderValues*>(data);
-
- if (header.find("HTTP/") == 0) {
- // Reset headers if a new response is detected (there might be multiple responses per request in case of a redirect)
- values->reset();
- } else if (header.find("Content-Type:") == 0) {
- const std::string key = "charset=";
- const auto charset_index = header.find(key);
- if (charset_index != std::string::npos) {
- auto charset = header.substr(charset_index + key.size());
- utils::trim(charset);
- if (charset.size() >= 2 && charset[0] == '"' && charset[charset.size() - 1] == '"') {
- charset = charset.substr(1, charset.size() - 2);
- }
- if (charset.size() > 0) {
- values->charset = charset;
- }
- }
- }
-
- return size * nmemb;
-}
-
std::string utils::retrieve_url(const std::string& url,
CurlHandle& easyhandle,
ConfigContainer* cfgcont,
@@ -326,10 +267,6 @@ std::string utils::retrieve_url(const std::string& url,
curl_easy_setopt(easyhandle.ptr(), CURLOPT_WRITEFUNCTION, my_write_data);
curl_easy_setopt(easyhandle.ptr(), CURLOPT_WRITEDATA, &buf);
- HeaderValues hdrs;
- curl_easy_setopt(easyhandle.ptr(), CURLOPT_HEADERDATA, &hdrs);
- curl_easy_setopt(easyhandle.ptr(), CURLOPT_HEADERFUNCTION, handle_headers);
-
switch (method) {
case HTTPMethod::GET:
break;
@@ -389,8 +326,7 @@ std::string utils::retrieve_url(const std::string& url,
// See the clobbering note above.
curl_easy_setopt(easyhandle.ptr(), CURLOPT_ERRORBUFFER, NULL);
- LOG(Level::DEBUG, "Parser::parse_url: converting data from %s to utf-8", hdrs.charset);
- return utils::convert_text(buf, "utf-8", hdrs.charset);
+ return buf;
}
std::string utils::run_program(const char* argv[], const std::string& input)
diff --git a/test/utils.cpp b/test/utils.cpp
index 434b8bb4..99ba2c04 100644
--- a/test/utils.cpp
+++ b/test/utils.cpp
@@ -1,7 +1,6 @@
#include "utils.h"
#include <algorithm>
-#include <cctype>
#include <chrono>
#include <ctype.h>
#include <fstream>
@@ -2013,208 +2012,3 @@ TEST_CASE("translit() always returns the same value for the same inputs",
}
}
}
-
-void verify_convert_text(const std::vector<unsigned char>& input,
- const std::string& tocode, const std::string& fromcode,
- const std::vector<unsigned char>& expected_output)
-{
- const std::string input_str = std::string(reinterpret_cast<const char*>(input.data()),
- input.size());
- const std::string expected_str = std::string(reinterpret_cast<const char*>
- (expected_output.data()), expected_output.size());
-
- REQUIRE(utils::convert_text(input_str, tocode, fromcode) == expected_str);
-}
-
-TEST_CASE("convert_text() returns input string if fromcode and tocode are the same",
- "[utils]")
-{
- std::vector<unsigned char> input = {
- // \x81 is not valid UTF-8
- 0x81, 0x13, 0x41,
- };
-
- std::vector<unsigned char> expected = {
- 0x3f, 0x13, 0x41,
- };
-
- verify_convert_text(input, "UTF-8", "UTF-8", expected);
-}
-
-TEST_CASE("convert_text() replaces incomplete multibyte sequences with a question mark: utf8 to utf16le",
- "[utils]")
-{
- std::vector<unsigned char> input = {
- // "ой", "oops" in Russian, but the last byte is missing
- 0xd0, 0xbe, 0xd0,
- };
-
- std::vector<unsigned char> expected = {
- 0x3e, 0x04, 0x3f, 0x00,
- };
-
- verify_convert_text(input, "UTF-16LE", "UTF-8", expected);
-}
-
-TEST_CASE("convert_text() replaces incomplete multibyte sequences with a question mark: utf16le to utf8",
- "[utils]")
-{
- SECTION("input includes zero byte") {
- std::vector<unsigned char> input = {
- // "hi", but the last byte is missing
- 0x68, 0x00, 0x69,
- };
-
- std::vector<unsigned char> expected = {
- 0x68, 0x3f,
- };
-
- verify_convert_text(input, "UTF-8", "UTF-16LE", expected);
- }
-
- SECTION("input does not include zero byte") {
- std::vector<unsigned char> input = {
- // "эй", "hey" in Russian, but the last byte is missing
- 0x4d, 0x04, 0x39,
- };
-
- std::vector<unsigned char> expected = {
- 0xd1, 0x8d, 0x3f,
- };
-
- verify_convert_text(input, "UTF-8", "UTF-16LE", expected);
- }
-}
-
-TEST_CASE("convert_text() replaces invalid multibyte sequences with a question mark: utf8 to utf16le",
- "[utils]")
-{
- std::vector<unsigned char> input = {
- // "日本", "Japan", but the third byte of the first character (0xa5) is
- // missing, making the whole first character an illegal sequence.
- 0xe6, 0x97, 0xe6, 0x9c, 0xac,
- };
-
- std::vector<unsigned char> expected = {
- 0x3f, 0x00, 0x3f, 0x00, 0x2c, 0x67,
- };
-
- verify_convert_text(input, "UTF-16LE", "UTF-8", expected);
-}
-
-TEST_CASE("convert_text() replaces invalid multibyte sequences with a question mark: utf16le to utf8",
- "[utils]")
-{
- std::vector<unsigned char> input = {
- // The first two bytes here are part of a surrogate pair, i.e. they
- // imply that the next two bytes encode additional info. However, the
- // next two bytes are an ordinary character. This breaks the decoding
- // process, so some things get turned into a question mark while others
- // are decoded incorrectly.
- 0x01, 0xd8, 0xd7, 0x03,
- };
-
- std::vector<unsigned char> expected = {
- 0x3f, 0xed, 0x9f, 0x98, 0x3f,
- };
-
- verify_convert_text(input, "UTF-8", "UTF-16LE", expected);
-}
-
-TEST_CASE("convert_text() converts text between encodings: utf8 to utf16le", "[utils]")
-{
- std::vector<unsigned char> input = {
- // "Тестирую", "Testing" in Russian.
- 0xd0, 0xa2, 0xd0, 0xb5, 0xd1, 0x81, 0xd1, 0x82, 0xd0, 0xb8, 0xd1, 0x80, 0xd1, 0x83,
- 0xd1, 0x8e,
- };
-
- std::vector<unsigned char> expected = {
- 0x22, 0x04, 0x35, 0x04, 0x41, 0x04, 0x42, 0x04, 0x38, 0x04, 0x40, 0x04, 0x43, 0x04,
- 0x4e, 0x04,
- };
-
- verify_convert_text(input, "UTF-16LE", "UTF-8", expected);
-}
-
-TEST_CASE("convert_text() converts text between encodings: utf8 to koi8r", "[utils]")
-{
- std::vector<unsigned char> input = {
- // "Проверка", "Check" in Russian.
- 0xd0, 0x9f, 0xd1, 0x80, 0xd0, 0xbe, 0xd0, 0xb2, 0xd0, 0xb5, 0xd1, 0x80, 0xd0, 0xba,
- 0xd0, 0xb0,
- };
-
- std::vector<unsigned char> expected = {
- 0xf0, 0xd2, 0xcf, 0xd7, 0xc5, 0xd2, 0xcb, 0xc1,
- };
-
- verify_convert_text(input, "KOI8-R", "UTF-8", expected);
-}
-
-TEST_CASE("convert_text() converts text between encodings: utf8 to ISO-8859-1", "[utils]")
-{
- // Some symbols in the result will be transliterated.
-
- std::vector<unsigned char> input = {
- // "вау °±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃ": a mix of Cyrillic (unsupported by
- // ISO-8859-1) and ISO-8859-1 characters.
- 0xd0, 0xb2, 0xd0, 0xb0, 0xd1, 0x83, 0x20, 0xc2, 0xb0, 0xc2, 0xb1, 0xc2, 0xb2, 0xc2,
- 0xb3, 0xc2, 0xb4, 0xc2, 0xb5, 0xc2, 0xb6, 0xc2, 0xb7, 0xc2, 0xb8, 0xc2, 0xb9, 0xc2,
- 0xba, 0xc2, 0xbb, 0xc2, 0xbc, 0xc2, 0xbd, 0xc2, 0xbe, 0xc2, 0xbf, 0xc3, 0x80, 0xc3,
- 0x81, 0xc3, 0x82, 0xc3, 0x83,
- };
-
- const std::string input_str = std::string(reinterpret_cast<const char*>(input.data()),
- input.size());
-
- const auto result = utils::convert_text(input_str, "ISO-8859-1", "UTF-8");
-
- // We can't spell out an expected result because different platforms
- // might follow different transliteration rules.
- REQUIRE(result != "");
- REQUIRE(result != input_str);
-}
-
-TEST_CASE("convert_text() converts text between encodings: utf16le to utf8", "[utils]")
-{
- std::vector<unsigned char> input = {
- // "Успех", "Success" in Russian.
- 0xff, 0xfe, 0x23, 0x04, 0x41, 0x04, 0x3f, 0x04, 0x35, 0x04, 0x45, 0x04,
- };
-
- std::vector<unsigned char> expected = {
- 0xef, 0xbb, 0xbf, 0xd0, 0xa3, 0xd1, 0x81, 0xd0, 0xbf, 0xd0, 0xb5, 0xd1, 0x85,
- };
-
- verify_convert_text(input, "UTF-8", "UTF-16LE", expected);
-}
-
-TEST_CASE("convert_text() converts text between encodings: koi8r to utf8", "[utils]")
-{
- std::vector<unsigned char> input = {
- // "История", "History" in Russian.
- 0xe9, 0xd3, 0xd4, 0xcf, 0xd2, 0xc9, 0xd1,
- };
-
- std::vector<unsigned char> expected = {
- 0xd0, 0x98, 0xd1, 0x81, 0xd1, 0x82, 0xd0, 0xbe, 0xd1, 0x80, 0xd0, 0xb8, 0xd1, 0x8f,
- };
-
- verify_convert_text(input, "UTF-8", "KOI8-R", expected);
-}
-
-TEST_CASE("convert_text() converts text between encodings: ISO-8859-1 to utf8", "[utils]")
-{
- std::vector<unsigned char> input = {
- // "ÄÅÆÇÈÉÊËÌÍÎÏ": some umlauts and Latin letters.
- 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
- };
-
- std::vector<unsigned char> expected = {
- 0xc3, 0x84, 0xc3, 0x85, 0xc3, 0x86, 0xc3, 0x87, 0xc3, 0x88, 0xc3, 0x89, 0xc3, 0x8a,
- 0xc3, 0x8b, 0xc3, 0x8c, 0xc3, 0x8d, 0xc3, 0x8e, 0xc3, 0x8f,
- };
-
- verify_convert_text(input, "UTF-8", "ISO-8859-1", expected);
-}