Also match all parents that contain the same text

author: Karl Lorey <git@karllorey.com> 2022-06-24 01:06:37 +0200
committer: Karl Lorey <git@karllorey.com> 2022-06-24 01:06:37 +0200
commit: d567ea3ecf5f1bf7e42e4d93def6052b7adc5fa2 (patch)
tree: 22b98378578c1589b07ba725c873fe59c39e455f
parent: c54208e15c2b303cced2698ff331962d243b6329 (diff)
2 files changed, 10 insertions, 5 deletions
diff --git a/mlscraper/html.py b/mlscraper/html.py
index c0eede3..72f95f0 100644
--- a/mlscraper/html.py
+++ b/mlscraper/html.py
@@ -61,13 +61,16 @@ class Node:
 
         # text
         # - since text matches including whitespace, a regex is used
-        for soup_node in self.soup.find_all(
-            string=re.compile(r"^\s*%s\s*$" % html.escape(item))
-        ):
+        target_regex = re.compile(r"^\s*%s\s*$" % html.escape(item))
+        for soup_node in self.soup.find_all(string=target_regex):
             # use parent node as found text is NaviableString and not Tag
             node = self._page._get_node_for_soup(soup_node.parent)
             yield HTMLExactTextMatch(node)
 
+            for p in node.parents:
+                if p.text.strip() == node.text.strip():
+                    yield HTMLExactTextMatch(p)
+
         # attributes
         for soup_node in self.soup.find_all():
             for attr in soup_node.attrs:
diff --git a/tests/test_html.py b/tests/test_html.py
index 7780504..ab7a6b3 100644
--- a/tests/test_html.py
+++ b/tests/test_html.py
@@ -102,8 +102,10 @@ def test_find_text_with_whitespace():
     html = b"<html><body><p>    whitespace  \n\t </p></body></html>"
     page = Page(html)
     html_matches = page.find_all("whitespace")
-    assert len(html_matches) == 1
-    assert isinstance(html_matches[0], HTMLExactTextMatch)
+
+    # 4 matches: <p> plus its ancestors with the same stripped text (body, html, document)
+    assert len(html_matches) == 4
+    assert all(isinstance(hm, HTMLExactTextMatch) for hm in html_matches)
 
 
 def test_find_text_with_noise():
author	Karl Lorey <git@karllorey.com>	2022-06-24 01:06:37 +0200
committer	Karl Lorey <git@karllorey.com>	2022-06-24 01:06:37 +0200
commit	d567ea3ecf5f1bf7e42e4d93def6052b7adc5fa2 (patch)
tree	22b98378578c1589b07ba725c873fe59c39e455f
parent	c54208e15c2b303cced2698ff331962d243b6329 (diff)