summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Karl Lorey <git@karllorey.com> 2022-07-07 18:34:12 +0200
committer Karl Lorey <git@karllorey.com> 2022-07-07 18:34:12 +0200
commit 6ba0a3a60e3ab85ec85e260fd70b3bc3770ec53c (patch)
tree 5060de7c798692980409e5103960857883510600
parent ee10ef769591c442c91b0dcc20e2626544862aee (diff)
Add test for nbsp issue #15
-rw-r--r-- tests/test_html.py 24
1 files changed, 15 insertions, 9 deletions
diff --git a/tests/test_html.py b/tests/test_html.py
index 25e3844..27be90d 100644
--- a/tests/test_html.py
+++ b/tests/test_html.py
@@ -1,3 +1,4 @@
+import pytest
from mlscraper.html import get_relative_depth
from mlscraper.html import get_root_node
from mlscraper.html import HTMLExactTextMatch
@@ -47,6 +48,20 @@ class TestPage:
nodes = page.find_all("/users/624900/jterrace")
assert nodes
+ def test_find_all_with_text_with_noise(self):
+ html = b"<html><body><p>bla karl bla</p></body></html>"
+ page = Page(html)
+ assert all(
+ not isinstance(html_match, HTMLExactTextMatch)
+ for html_match in page.find_all("karl")
+ )
+
+ @pytest.mark.skip("no fuzzy matching yet")
+ def test_find_all_with_nbsp(self):
+ html = "<html><body><p>123&nbsp;€</body></html>".encode()
+ page = Page(html)
+ assert len(page.find_all("123 €")) > 0
+
def test_equality():
# we want to make sure that equal html does not result in equality
@@ -88,15 +103,6 @@ def test_find_text_with_whitespace():
assert all(isinstance(hm, HTMLExactTextMatch) for hm in html_matches)
-def test_find_text_with_noise():
- html = b"<html><body><p>bla karl bla</p></body></html>"
- page = Page(html)
- assert all(
- not isinstance(html_match, HTMLExactTextMatch)
- for html_match in page.find_all("karl")
- )
-
-
def test_get_relative_depth():
html = b"<html><body><p>bla karl bla</p></body></html>"
page = Page(html)