diff options
author | Karl Lorey <git@karllorey.com> | 2022-06-23 17:27:13 +0200 |
---|---|---|
committer | Karl Lorey <git@karllorey.com> | 2022-06-23 17:27:13 +0200 |
commit | 13a11d65d477abd347a4b24d21216985631bd446 (patch) | |
tree | ebe80fc10dec93227d32216e8f37c374937e805b /tests | |
parent | d08a69e478582ebd2be5ac3ff7c83591ef86bf8c (diff) |
Fix selection of arbitrary text within nodes (for now)
Diffstat (limited to 'tests')
-rw-r--r-- | tests/test_html.py | 13 |
1 files changed, 11 insertions, 2 deletions
diff --git a/tests/test_html.py b/tests/test_html.py index faeaef8..e976ed5 100644 --- a/tests/test_html.py +++ b/tests/test_html.py @@ -1,6 +1,6 @@ from bs4 import BeautifulSoup from mlscraper.html import _get_root_of_nodes -from mlscraper.html import HTMLTextMatch +from mlscraper.html import HTMLExactTextMatch from mlscraper.html import Page from mlscraper.html import selector_matches_nodes from mlscraper.matches import AttributeValueExtractor @@ -98,4 +98,13 @@ def test_find_text_with_whitespace(): page = Page(html) html_matches = page.find_all("whitespace") assert len(html_matches) == 1 - assert isinstance(html_matches[0], HTMLTextMatch) + assert isinstance(html_matches[0], HTMLExactTextMatch) + + +def test_find_text_with_noise(): + html = b"<html><body><p>bla karl bla</p></body></html>" + page = Page(html) + assert all( + not isinstance(html_match, HTMLExactTextMatch) + for html_match in page.find_all("karl") + ) |