summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Arun <engineerarun@gmail.com> 2024-04-30 22:23:37 +0530
committer GitHub <noreply@github.com> 2024-04-30 22:23:37 +0530
commit 83576b45849222613756c4e8995ec788037dd5db (patch)
tree 8b6afa99c3fb163c8f9fb963fc11dea167b23bc0
parent 4c2bb265d0aff43ceaad469d94242eda68eea34d (diff)
parent cfb43e365c1d8326b7b2dda1b0dddfa4aaa999e0 (diff)
Merge pull request #737 from LeXofLeviafan/fix-keywords (HEAD, master)
[jarun#734] fixed parsing of webpage keywords
-rwxr-xr-x buku 2
-rw-r--r-- tests/test_buku.py 7
2 files changed, 5 insertions, 4 deletions
diff --git a/buku b/buku
index b54ae88..df84ace 100755
--- a/buku
+++ b/buku
@@ -4061,7 +4061,7 @@ def parse_decoded_page(page):
try:
if keywords:
keys = keywords.get('content').strip().replace('\n', ' ')
- keys = re.sub(r'\s{2,}', ' ', keys)
+ keys = re.sub(r'\s{2,}', ' ', re.sub(r'\s*,\s*', ',', keys))
if is_unusual_tag(keys):
if keys not in (title, desc):
LOGDBG('keywords to description: %s', keys)
diff --git a/tests/test_buku.py b/tests/test_buku.py
index ecf1907..105006f 100644
--- a/tests/test_buku.py
+++ b/tests/test_buku.py
@@ -940,10 +940,11 @@ def test_get_data_from_page(charset, mode):
'charset': f'\n<meta charset="{charset}"/>',
'content': f'\n<meta http-equiv="content-type" content="text/html; charset={charset}"/>',
}.get(mode, '')
- body = f'<html>\n\n<head>{meta}\n<title>{title}</title>\n</head>\n<body></body>\n\n</html>\n'
+ keywords = '<meta name="keywords" content="foo, bar baz, quux"/>'
+ body = f'<html>\n\n<head>{meta}\n{keywords}\n<title>{title}</title>\n</head>\n<body></body>\n\n</html>\n'
resp = HTTPResponse(body.encode(charset), headers)
- parsed_title, desc, keywords = get_data_from_page(resp)
- assert parsed_title == title
+ parsed_title, desc, tags = get_data_from_page(resp)
+ assert (parsed_title, tags) == (title, "foo,bar baz,quux")
@pytest.mark.parametrize('tokens, valid, expected', [