summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Karl Lorey <git@karllorey.com> 2022-06-24 01:06:37 +0200
committer Karl Lorey <git@karllorey.com> 2022-06-24 01:06:37 +0200
commit d567ea3ecf5f1bf7e42e4d93def6052b7adc5fa2 (patch)
tree 22b98378578c1589b07ba725c873fe59c39e455f
parent c54208e15c2b303cced2698ff331962d243b6329 (diff)
Also match all parents that contain the same text
-rw-r--r-- mlscraper/html.py 9
-rw-r--r-- tests/test_html.py 6
2 files changed, 10 insertions, 5 deletions
diff --git a/mlscraper/html.py b/mlscraper/html.py
index c0eede3..72f95f0 100644
--- a/mlscraper/html.py
+++ b/mlscraper/html.py
@@ -61,13 +61,16 @@ class Node:
# text
# - since text matches including whitespace, a regex is used
- for soup_node in self.soup.find_all(
- string=re.compile(r"^\s*%s\s*$" % html.escape(item))
- ):
+ target_regex = re.compile(r"^\s*%s\s*$" % html.escape(item))
+ for soup_node in self.soup.find_all(string=target_regex):
# use parent node as found text is NaviableString and not Tag
node = self._page._get_node_for_soup(soup_node.parent)
yield HTMLExactTextMatch(node)
+ for p in node.parents:
+ if p.text.strip() == node.text.strip():
+ yield HTMLExactTextMatch(p)
+
# attributes
for soup_node in self.soup.find_all():
for attr in soup_node.attrs:
diff --git a/tests/test_html.py b/tests/test_html.py
index 7780504..ab7a6b3 100644
--- a/tests/test_html.py
+++ b/tests/test_html.py
@@ -102,8 +102,10 @@ def test_find_text_with_whitespace():
html = b"<html><body><p> whitespace \n\t </p></body></html>"
page = Page(html)
html_matches = page.find_all("whitespace")
- assert len(html_matches) == 1
- assert isinstance(html_matches[0], HTMLExactTextMatch)
+
+ # should match p, body, html (and document)
+ assert len(html_matches) == 4
+ assert all(isinstance(hm, HTMLExactTextMatch) for hm in html_matches)
def test_find_text_with_noise():