summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Karl Lorey <git@karllorey.com> 2022-06-21 16:59:05 +0200
committer Karl Lorey <git@karllorey.com> 2022-06-21 16:59:05 +0200
commit 9c22147fd8f7c451ed8627af8bcb8c7eab5ba8e0 (patch)
tree 8b9d2bfe8d0d8013b0a5c5781ddacbfc4ff18288
parent f58ddd5464e170d2570286f5e4d9b3d6d53efb5d (diff)
Ignore whitespace around values when searching for matches in HTML
-rw-r--r-- mlscraper/html.py 19
-rw-r--r-- mlscraper/matches.py 10
-rw-r--r-- tests/test_html.py 17
3 files changed, 30 insertions, 16 deletions
diff --git a/mlscraper/html.py b/mlscraper/html.py
index 98201a9..5ee51be 100644
--- a/mlscraper/html.py
+++ b/mlscraper/html.py
@@ -2,7 +2,9 @@
Encapsulation of html-related functionality.
BeautifulSoup should only get used here.
"""
+import html
import logging
+import re
from abc import ABC
from dataclasses import dataclass
from functools import cached_property
@@ -13,17 +15,17 @@ from bs4 import Tag
@dataclass
-class Match(ABC):
+class HTMLMatch(ABC):
node: "Node" = None
@dataclass
-class TextMatch(Match):
+class HTMLTextMatch(HTMLMatch):
pass
@dataclass
-class AttributeMatch(Match):
+class HTMLAttributeMatch(HTMLMatch):
attr: str = None
@@ -43,24 +45,27 @@ class Node:
def text(self):
return self.soup.text
- def find_all(self, item) -> list[Match]:
+ def find_all(self, item) -> list[HTMLMatch]:
return list(self._generate_find_all(item))
def _generate_find_all(self, item):
assert isinstance(item, str), "can only search for str at the moment"
# text
- for soup_node in self.soup.find_all(string=item):
+ # - since text matches including whitespace, a regex is used
+ for soup_node in self.soup.find_all(
+ string=re.compile(r"\s*%s\s*" % html.escape(item))
+ ):
# use parent node as found text is NaviableString and not Tag
node = self._page._get_node_for_soup(soup_node.parent)
- yield TextMatch(node)
+ yield HTMLTextMatch(node)
# attributes
for soup_node in self.soup.find_all():
for attr in soup_node.attrs:
if soup_node[attr] == item:
node = self._page._get_node_for_soup(soup_node)
- yield AttributeMatch(node, attr)
+ yield HTMLAttributeMatch(node, attr)
# todo implement other find methods
diff --git a/mlscraper/matches.py b/mlscraper/matches.py
index 36da4b4..1301375 100644
--- a/mlscraper/matches.py
+++ b/mlscraper/matches.py
@@ -6,11 +6,11 @@ import typing
from functools import cached_property
from itertools import combinations
-from mlscraper.html import AttributeMatch
from mlscraper.html import get_relative_depth
from mlscraper.html import get_root_node
+from mlscraper.html import HTMLAttributeMatch
+from mlscraper.html import HTMLTextMatch
from mlscraper.html import Node
-from mlscraper.html import TextMatch
class Match:
@@ -61,7 +61,7 @@ class TextValueExtractor(Extractor):
"""
def extract(self, node: Node):
- return node.soup.text
+ return node.soup.text.strip()
def __repr__(self):
return f"<{self.__class__.__name__}>"
@@ -151,9 +151,9 @@ def generate_all_value_matches(
logging.info(f"generating all value matches ({node=}, {item=})")
for html_match in node.find_all(item):
matched_node = html_match.node
- if isinstance(html_match, TextMatch):
+ if isinstance(html_match, HTMLTextMatch):
extractor = TextValueExtractor()
- elif isinstance(html_match, AttributeMatch):
+ elif isinstance(html_match, HTMLAttributeMatch):
extractor = AttributeValueExtractor(html_match.attr)
else:
raise RuntimeError(
diff --git a/tests/test_html.py b/tests/test_html.py
index 678a5f9..faeaef8 100644
--- a/tests/test_html.py
+++ b/tests/test_html.py
@@ -1,5 +1,6 @@
from bs4 import BeautifulSoup
from mlscraper.html import _get_root_of_nodes
+from mlscraper.html import HTMLTextMatch
from mlscraper.html import Page
from mlscraper.html import selector_matches_nodes
from mlscraper.matches import AttributeValueExtractor
@@ -7,7 +8,7 @@ from mlscraper.matches import AttributeValueExtractor
def test_get_root_of_nodes():
soup = BeautifulSoup(
- '<html><body><div><p id="one"></p><p><span id="two"></span></p></div></body></html>',
+ b'<html><body><div><p id="one"></p><p><span id="two"></span></p></div></body></html>',
"lxml",
)
node_1 = soup.select_one("#one")
@@ -49,13 +50,13 @@ def test_extractor_factory():
def test_equality():
# we want to make sure that equal html does not result in equality
- same_html = "<html><body><div><p></p></div></body></html>"
+ same_html = b"<html><body><div><p></p></div></body></html>"
assert Page(same_html) == Page(same_html)
assert Page(same_html) is not Page(same_html)
def test_select():
- html = "<html><body><p></p><p></p></body></html>"
+ html = b"<html><body><p></p><p></p></body></html>"
page = Page(html)
p_tag_nodes = page.select("p")
assert len(p_tag_nodes) == 2
@@ -78,7 +79,7 @@ def test_classes():
def test_selector_matches_nodes():
- html = "<html><body><p>1</p><p>2</p></body></html>"
+ html = b"<html><body><p>1</p><p>2</p></body></html>"
page = Page(html)
p_tags = page.select("p")
@@ -90,3 +91,11 @@ def test_selector_matches_nodes():
assert not selector_matches_nodes(
page, "p", p_tags_reversed
), "matches reversed order"
+
+
+def test_find_text_with_whitespace():
+ html = b"<html><body><p> whitespace \n\t </p></body></html>"
+ page = Page(html)
+ html_matches = page.find_all("whitespace")
+ assert len(html_matches) == 1
+ assert isinstance(html_matches[0], HTMLTextMatch)