diff options
author | Karl Lorey <git@karllorey.com> | 2022-07-07 18:34:12 +0200 |
---|---|---|
committer | Karl Lorey <git@karllorey.com> | 2022-07-07 18:34:12 +0200 |
commit | 6ba0a3a60e3ab85ec85e260fd70b3bc3770ec53c (patch) | |
tree | 5060de7c798692980409e5103960857883510600 | |
parent | ee10ef769591c442c91b0dcc20e2626544862aee (diff) |
Add test for nbsp issue #15
-rw-r--r-- | tests/test_html.py | 24 |
1 files changed, 15 insertions, 9 deletions
```diff
diff --git a/tests/test_html.py b/tests/test_html.py
index 25e3844..27be90d 100644
--- a/tests/test_html.py
+++ b/tests/test_html.py
@@ -1,3 +1,4 @@
+import pytest
 from mlscraper.html import get_relative_depth
 from mlscraper.html import get_root_node
 from mlscraper.html import HTMLExactTextMatch
@@ -47,6 +48,20 @@ class TestPage:
         nodes = page.find_all("/users/624900/jterrace")
         assert nodes
 
+    def test_find_all_with_text_with_noise(self):
+        html = b"<html><body><p>bla karl bla</p></body></html>"
+        page = Page(html)
+        assert all(
+            not isinstance(html_match, HTMLExactTextMatch)
+            for html_match in page.find_all("karl")
+        )
+
+    @pytest.mark.skip("no fuzzy matching yet")
+    def test_find_all_with_nbsp(self):
+        html = "<html><body><p>123 €</body></html>".encode()
+        page = Page(html)
+        assert len(page.find_all("123 €")) > 0
+
 
 def test_equality():
     # we want to make sure that equal html does not result in equality
@@ -88,15 +103,6 @@ def test_find_text_with_whitespace():
     assert all(isinstance(hm, HTMLExactTextMatch) for hm in html_matches)
 
 
-def test_find_text_with_noise():
-    html = b"<html><body><p>bla karl bla</p></body></html>"
-    page = Page(html)
-    assert all(
-        not isinstance(html_match, HTMLExactTextMatch)
-        for html_match in page.find_all("karl")
-    )
-
-
 def test_get_relative_depth():
     html = b"<html><body><p>bla karl bla</p></body></html>"
     page = Page(html)
```