summary | refs | log | tree | commit | diff | stats
path: root/tests
diff options
context:
space:
mode:
author: Karl Lorey <git@karllorey.com> 2022-06-23 17:27:13 +0200
committer: Karl Lorey <git@karllorey.com> 2022-06-23 17:27:13 +0200
commit: 13a11d65d477abd347a4b24d21216985631bd446 (patch)
tree: ebe80fc10dec93227d32216e8f37c374937e805b /tests
parent: d08a69e478582ebd2be5ac3ff7c83591ef86bf8c (diff)
Fix selection of arbitrary text within nodes (for now)
Diffstat (limited to 'tests')
-rw-r--r-- tests/test_html.py 13
1 files changed, 11 insertions, 2 deletions
diff --git a/tests/test_html.py b/tests/test_html.py
index faeaef8..e976ed5 100644
--- a/tests/test_html.py
+++ b/tests/test_html.py
@@ -1,6 +1,6 @@
from bs4 import BeautifulSoup
from mlscraper.html import _get_root_of_nodes
-from mlscraper.html import HTMLTextMatch
+from mlscraper.html import HTMLExactTextMatch
from mlscraper.html import Page
from mlscraper.html import selector_matches_nodes
from mlscraper.matches import AttributeValueExtractor
@@ -98,4 +98,13 @@ def test_find_text_with_whitespace():
page = Page(html)
html_matches = page.find_all("whitespace")
assert len(html_matches) == 1
- assert isinstance(html_matches[0], HTMLTextMatch)
+ assert isinstance(html_matches[0], HTMLExactTextMatch)
+
+
+def test_find_text_with_noise():
+ html = b"<html><body><p>bla karl bla</p></body></html>"
+ page = Page(html)
+ assert all(
+ not isinstance(html_match, HTMLExactTextMatch)
+ for html_match in page.find_all("karl")
+ )