summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKarl Lorey <git@karllorey.com>2022-07-06 15:02:07 +0200
committerKarl Lorey <git@karllorey.com>2022-07-06 15:02:07 +0200
commit371574d0ef6b345996375741df0b688c8487747e (patch)
treeebe41d8675046babf2a1addd80795834875ac58d
parent9f17478192977f483d4b1ebaf5e8e0835a46800d (diff)
Minor performance improvements
-rw-r--r--mlscraper/html.py24
-rw-r--r--mlscraper/matches.py8
-rw-r--r--mlscraper/selectors.py17
-rw-r--r--tests/test_html.py10
4 files changed, 37 insertions, 22 deletions
diff --git a/mlscraper/html.py b/mlscraper/html.py
index 4cca384..fc45c29 100644
--- a/mlscraper/html.py
+++ b/mlscraper/html.py
@@ -89,7 +89,7 @@ class Node:
yield HTMLExactTextMatch(node)
for p in node.ancestors:
- if p.text.strip() == node.text.strip():
+ if p.text.strip() == node.text.strip() and not isinstance(p, Page):
yield HTMLExactTextMatch(p)
# attributes
@@ -101,9 +101,14 @@ class Node:
# todo implement other find methods
- def has_parent(self, node: "Node") -> bool:
- for p in self.soup.parents:
- if p == node.soup:
+ def has_ancestor(self, node: "Node") -> bool:
+ # early return if different page
+ if self._page != node._page:
+ return False
+
+ # inline to avoid creating ancestors
+ for a in self.soup.parents:
+ if a == node.soup:
return True
return False
@@ -114,7 +119,7 @@ class Node:
"""
# don't return parent if it would be above <html> tag
if self.tag_name in ["html", "[document]"]:
- return None
+ return self._page
return self._page._get_node_for_soup(self.soup.parent)
@@ -144,9 +149,10 @@ class Node:
def html_attributes(self):
return self.soup.attrs
- def select(self, css_selector):
+ def select(self, css_selector, limit=None):
return [
- self._page._get_node_for_soup(n) for n in self.soup.select(css_selector)
+ self._page._get_node_for_soup(n)
+ for n in self.soup.select(css_selector, limit=limit)
]
def __repr__(self):
@@ -196,6 +202,10 @@ class Page(Node):
self._node_registry[soup] = Node(soup, self)
return self._node_registry[soup]
+ @property
+ def parent(self):
+ return None
+
def get_root_node(nodes: list[Node]) -> Node:
pages = [n._page for n in nodes]
diff --git a/mlscraper/matches.py b/mlscraper/matches.py
index f7eb447..ace47b3 100644
--- a/mlscraper/matches.py
+++ b/mlscraper/matches.py
@@ -29,16 +29,12 @@ class Match:
def has_overlap(self, other_match: "Match"):
assert isinstance(other_match, Match)
- # early return if different document
- if self.root.page != other_match.root.page:
- return False
-
return (
# overlap if same root node
self.root == other_match.root
# or if one is a parent of the other one
- or self.root.has_parent(other_match.root)
- or other_match.root.has_parent(self.root)
+ or self.root.has_ancestor(other_match.root)
+ or other_match.root.has_ancestor(self.root)
)
@property
diff --git a/mlscraper/selectors.py b/mlscraper/selectors.py
index fe1019a..693637d 100644
--- a/mlscraper/selectors.py
+++ b/mlscraper/selectors.py
@@ -48,20 +48,21 @@ class CssRuleSelector(Selector):
return node.select(self.css_rule)
def uniquely_selects(self, root: Node, nodes: typing.Collection[Node]):
+ # limit +1
+ # ensures mismatch if selection result starts with nodes
+ # e.g. select returns [1,2,3,...] and nodes is [1,2,3]
+ # but decreases load with many results significantly
+ limit = len(nodes) + 1
+
# directly using soups:
# - avoids creating nodes for all selects
# - increases caching effort
-
- # limit +1 ensures mismatch
- # if selection result starts with nodes
- # e.g. select returns [1,2,3,...] and nodes is [1,2,3]
- limit = len(nodes) + 1
return root.soup.select(self.css_rule, limit=limit) == [n.soup for n in nodes]
# using select
# - creates nodes for every soup object
# - leverages caching
- # return root.select(self.css_rule) == nodes
+ # return root.select(self.css_rule, limit=limit) == nodes
def __repr__(self):
return f"<{self.__class__.__name__} {self.css_rule=}>"
@@ -77,8 +78,9 @@ def generate_unique_selectors_for_nodes(
logging.info("roots is None, using pages as roots")
roots = [n.page for n in nodes]
- nodes_per_root = {r: [n for n in nodes if n.has_parent(r)] for r in set(roots)}
+ nodes_per_root = {r: [n for n in nodes if n.has_ancestor(r)] for r in set(roots)}
for selector in generate_selectors_for_nodes(nodes, roots, complexity):
+ logging.info(f"check if unique: {selector}")
if all(
selector.uniquely_selects(r, nodes_of_root)
for r, nodes_of_root in nodes_per_root.items()
@@ -180,6 +182,7 @@ def _generate_regular_node_selectors(node: Node):
yield f'{node.tag_name}[{attribute}="{value}"]'
+@functools.cache
def _get_path_selectors(node: Node, max_length: int) -> tuple[str]:
return tuple(set(_generate_path_selectors(node, max_length)))
diff --git a/tests/test_html.py b/tests/test_html.py
index a782a4c..9008cd7 100644
--- a/tests/test_html.py
+++ b/tests/test_html.py
@@ -16,10 +16,16 @@ def test_get_root_nodes():
def test_node_parents():
html = b'<html><body><div><p id="one"></p><p><span id="two"></span></p></div></body></html>'
page = Page(html)
+ assert page.select("html")[0].parent == page, "html's parent should be page"
+
+
+def test_node_ancestors():
+ html = b'<html><body><div><p id="one"></p><p><span id="two"></span></p></div></body></html>'
+ page = Page(html)
element = page.select("#one")[0]
ancestors = element.ancestors
assert ancestors[0] == element.parent, "first ancestor should be parent"
- assert ancestors[-1].tag_name == "html", "last ancestor should be html"
+ assert isinstance(ancestors[-1], Page), "last ancestor should be page"
def test_node_set():
@@ -77,7 +83,7 @@ def test_find_text_with_whitespace():
page = Page(html)
html_matches = page.find_all("whitespace")
- # should match p, body, html
+ # should match p, body, html, but not page
assert len(html_matches) == 3
assert all(isinstance(hm, HTMLExactTextMatch) for hm in html_matches)