summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- app/lib/link_details_extractor.rb 15
-rw-r--r-- spec/fixtures/requests/low_confidence_latin1.txt 17
-rw-r--r-- spec/services/fetch_link_card_service_spec.rb 9
3 files changed, 36 insertions, 5 deletions
diff --git a/app/lib/link_details_extractor.rb b/app/lib/link_details_extractor.rb
index 2e49d3fb4f5..dbfdd33fcca 100644
--- a/app/lib/link_details_extractor.rb
+++ b/app/lib/link_details_extractor.rb
@@ -269,16 +269,21 @@ class LinkDetailsExtractor
end
def document
- @document ||= Nokogiri::HTML(@html, nil, encoding)
+ @document ||= detect_encoding_and_parse_document
end
- def encoding
- @encoding ||= begin
- guess = detector.detect(@html, @html_charset)
- guess&.fetch(:confidence, 0).to_i > 60 ? guess&.fetch(:encoding, nil) : nil
+ def detect_encoding_and_parse_document
+ [detect_encoding, nil, @html_charset, 'UTF-8'].uniq.each do |encoding|
+ document = Nokogiri::HTML(@html, nil, encoding)
+ return document if document.to_s.valid_encoding?
end
end
+ def detect_encoding
+ guess = detector.detect(@html, @html_charset)
+ guess&.fetch(:confidence, 0).to_i > 60 ? guess&.fetch(:encoding, nil) : nil
+ end
+
def detector
@detector ||= CharlockHolmes::EncodingDetector.new.tap do |detector|
detector.strip_tags = true
diff --git a/spec/fixtures/requests/low_confidence_latin1.txt b/spec/fixtures/requests/low_confidence_latin1.txt
new file mode 100644
index 00000000000..39c3e23d649
--- /dev/null
+++ b/spec/fixtures/requests/low_confidence_latin1.txt
@@ -0,0 +1,17 @@
+HTTP/1.1 200 OK
+server: nginx
+date: Thu, 13 Jun 2024 14:33:13 GMT
+content-type: text/html; charset=ISO-8859-1
+content-length: 158
+accept-ranges: bytes
+
+<!doctype html>
+<html lang="en">
+<head>
+ <meta charset="UTF-8">
+ <title>Tofu á l'orange</title>
+</head>
+<body>
+ <h2>Tofu á l'orange</h2>
+</body>
+</html>
diff --git a/spec/services/fetch_link_card_service_spec.rb b/spec/services/fetch_link_card_service_spec.rb
index 63ebc3b978d..239f84fde94 100644
--- a/spec/services/fetch_link_card_service_spec.rb
+++ b/spec/services/fetch_link_card_service_spec.rb
@@ -26,6 +26,7 @@ RSpec.describe FetchLinkCardService do
stub_request(:get, 'http://example.com/sjis_with_wrong_charset').to_return(request_fixture('sjis_with_wrong_charset.txt'))
stub_request(:get, 'http://example.com/koi8-r').to_return(request_fixture('koi8-r.txt'))
stub_request(:get, 'http://example.com/windows-1251').to_return(request_fixture('windows-1251.txt'))
+ stub_request(:get, 'http://example.com/low_confidence_latin1').to_return(request_fixture('low_confidence_latin1.txt'))
Rails.cache.write('oembed_endpoint:example.com', oembed_cache) if oembed_cache
@@ -148,6 +149,14 @@ RSpec.describe FetchLinkCardService do
end
end
+ context 'with a URL of a page in ISO-8859-1 encoding, that charlock_holmes cannot detect' do
+ let(:status) { Fabricate(:status, text: 'Check out http://example.com/low_confidence_latin1') }
+
+ it 'decodes the HTML' do
+ expect(status.preview_card.title).to eq("Tofu á l'orange")
+ end
+ end
+
context 'with a Japanese path URL' do
let(:status) { Fabricate(:status, text: 'テストhttp://example.com/日本語') }