summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Karl Lorey <git@karllorey.com> 2022-06-23 17:27:13 +0200
committer Karl Lorey <git@karllorey.com> 2022-06-23 17:27:13 +0200
commit 13a11d65d477abd347a4b24d21216985631bd446 (patch)
tree ebe80fc10dec93227d32216e8f37c374937e805b
parent d08a69e478582ebd2be5ac3ff7c83591ef86bf8c (diff)
Fix selection of arbitrary text within nodes (for now)
-rw-r--r-- mlscraper/html.py 6
-rw-r--r-- mlscraper/matches.py 11
-rw-r--r-- tests/test_html.py 13
3 files changed, 20 insertions, 10 deletions
diff --git a/mlscraper/html.py b/mlscraper/html.py
index 5ee51be..807a35e 100644
--- a/mlscraper/html.py
+++ b/mlscraper/html.py
@@ -20,7 +20,7 @@ class HTMLMatch(ABC):
@dataclass
-class HTMLTextMatch(HTMLMatch):
+class HTMLExactTextMatch(HTMLMatch):
pass
@@ -54,11 +54,11 @@ class Node:
# text
# - since text matches including whitespace, a regex is used
for soup_node in self.soup.find_all(
- string=re.compile(r"\s*%s\s*" % html.escape(item))
+ string=re.compile(r"^\s*%s\s*$" % html.escape(item))
):
# use parent node as found text is NaviableString and not Tag
node = self._page._get_node_for_soup(soup_node.parent)
- yield HTMLTextMatch(node)
+ yield HTMLExactTextMatch(node)
# attributes
for soup_node in self.soup.find_all():
diff --git a/mlscraper/matches.py b/mlscraper/matches.py
index 1301375..c5145c1 100644
--- a/mlscraper/matches.py
+++ b/mlscraper/matches.py
@@ -9,7 +9,7 @@ from itertools import combinations
from mlscraper.html import get_relative_depth
from mlscraper.html import get_root_node
from mlscraper.html import HTMLAttributeMatch
-from mlscraper.html import HTMLTextMatch
+from mlscraper.html import HTMLExactTextMatch
from mlscraper.html import Node
@@ -151,15 +151,16 @@ def generate_all_value_matches(
logging.info(f"generating all value matches ({node=}, {item=})")
for html_match in node.find_all(item):
matched_node = html_match.node
- if isinstance(html_match, HTMLTextMatch):
+ if isinstance(html_match, HTMLExactTextMatch):
extractor = TextValueExtractor()
+ yield ValueMatch(matched_node, extractor)
elif isinstance(html_match, HTMLAttributeMatch):
extractor = AttributeValueExtractor(html_match.attr)
+ yield ValueMatch(matched_node, extractor)
else:
- raise RuntimeError(
- f"unknown match type ({html_match=}, {type(html_match)=})"
+ logging.warning(
+ f"Cannot deal with HTMLMatch type, ignoring ({html_match=}, {type(html_match)=}))"
)
- yield ValueMatch(matched_node, extractor)
def is_disjoint_match_combination(matches):
diff --git a/tests/test_html.py b/tests/test_html.py
index faeaef8..e976ed5 100644
--- a/tests/test_html.py
+++ b/tests/test_html.py
@@ -1,6 +1,6 @@
from bs4 import BeautifulSoup
from mlscraper.html import _get_root_of_nodes
-from mlscraper.html import HTMLTextMatch
+from mlscraper.html import HTMLExactTextMatch
from mlscraper.html import Page
from mlscraper.html import selector_matches_nodes
from mlscraper.matches import AttributeValueExtractor
@@ -98,4 +98,13 @@ def test_find_text_with_whitespace():
page = Page(html)
html_matches = page.find_all("whitespace")
assert len(html_matches) == 1
- assert isinstance(html_matches[0], HTMLTextMatch)
+ assert isinstance(html_matches[0], HTMLExactTextMatch)
+
+
+def test_find_text_with_noise():
+ html = b"<html><body><p>bla karl bla</p></body></html>"
+ page = Page(html)
+ assert all(
+ not isinstance(html_match, HTMLExactTextMatch)
+ for html_match in page.find_all("karl")
+ )