Minor performance improvements

author: Karl Lorey <git@karllorey.com> 2022-07-06 15:02:07 +0200
committer: Karl Lorey <git@karllorey.com> 2022-07-06 15:02:07 +0200
commit: 371574d0ef6b345996375741df0b688c8487747e (patch)
tree: ebe41d8675046babf2a1addd80795834875ac58d
parent: 9f17478192977f483d4b1ebaf5e8e0835a46800d (diff)
4 files changed, 37 insertions, 22 deletions
diff --git a/mlscraper/html.py b/mlscraper/html.py
index 4cca384..fc45c29 100644
--- a/mlscraper/html.py
+++ b/mlscraper/html.py
@@ -89,7 +89,7 @@ class Node:
             yield HTMLExactTextMatch(node)
 
             for p in node.ancestors:
-                if p.text.strip() == node.text.strip():
+                if p.text.strip() == node.text.strip() and not isinstance(p, Page):
                     yield HTMLExactTextMatch(p)
 
         # attributes
@@ -101,9 +101,14 @@ class Node:
 
         # todo implement other find methods
 
-    def has_parent(self, node: "Node") -> bool:
-        for p in self.soup.parents:
-            if p == node.soup:
+    def has_ancestor(self, node: "Node") -> bool:
+        # early return if different page
+        if self._page != node._page:
+            return False
+
+        # inline to avoid creating ancestors
+        for a in self.soup.parents:
+            if a == node.soup:
                 return True
         return False
 
@@ -114,7 +119,7 @@ class Node:
         """
         # don't return parent if it would be above <html> tag
         if self.tag_name in ["html", "[document]"]:
-            return None
+            return self._page
 
         return self._page._get_node_for_soup(self.soup.parent)
 
@@ -144,9 +149,10 @@ class Node:
     def html_attributes(self):
         return self.soup.attrs
 
-    def select(self, css_selector):
+    def select(self, css_selector, limit=None):
         return [
-            self._page._get_node_for_soup(n) for n in self.soup.select(css_selector)
+            self._page._get_node_for_soup(n)
+            for n in self.soup.select(css_selector, limit=limit)
         ]
 
     def __repr__(self):
@@ -196,6 +202,10 @@ class Page(Node):
             self._node_registry[soup] = Node(soup, self)
         return self._node_registry[soup]
 
+    @property
+    def parent(self):
+        return None
+
 
 def get_root_node(nodes: list[Node]) -> Node:
     pages = [n._page for n in nodes]
diff --git a/mlscraper/matches.py b/mlscraper/matches.py
index f7eb447..ace47b3 100644
--- a/mlscraper/matches.py
+++ b/mlscraper/matches.py
@@ -29,16 +29,12 @@ class Match:
     def has_overlap(self, other_match: "Match"):
         assert isinstance(other_match, Match)
 
-        # early return if different document
-        if self.root.page != other_match.root.page:
-            return False
-
         return (
             # overlap if same root node
             self.root == other_match.root
             # or if one is a parent of the other one
-            or self.root.has_parent(other_match.root)
-            or other_match.root.has_parent(self.root)
+            or self.root.has_ancestor(other_match.root)
+            or other_match.root.has_ancestor(self.root)
         )
 
     @property
diff --git a/mlscraper/selectors.py b/mlscraper/selectors.py
index fe1019a..693637d 100644
--- a/mlscraper/selectors.py
+++ b/mlscraper/selectors.py
@@ -48,20 +48,21 @@ class CssRuleSelector(Selector):
         return node.select(self.css_rule)
 
     def uniquely_selects(self, root: Node, nodes: typing.Collection[Node]):
+        # limit +1
+        # ensures mismatch if selection result starts with nodes
+        # e.g. select returns [1,2,3,...] and nodes is [1,2,3]
+        # but decreases load with many results significantly
+        limit = len(nodes) + 1
+
         # directly using soups:
         # - avoids creating nodes for all selects
         # - increases caching effort
-
-        # limit +1 ensures mismatch
-        # if selection result starts with nodes
-        # e.g. select returns [1,2,3,...] and nodes is [1,2,3]
-        limit = len(nodes) + 1
         return root.soup.select(self.css_rule, limit=limit) == [n.soup for n in nodes]
 
         # using select
         # - creates nodes for every soup object
         # - leverages caching
-        # return root.select(self.css_rule) == nodes
+        # return root.select(self.css_rule, limit=limit) == nodes
 
     def __repr__(self):
         return f"<{self.__class__.__name__} {self.css_rule=}>"
@@ -77,8 +78,9 @@ def generate_unique_selectors_for_nodes(
         logging.info("roots is None, using pages as roots")
         roots = [n.page for n in nodes]
 
-    nodes_per_root = {r: [n for n in nodes if n.has_parent(r)] for r in set(roots)}
+    nodes_per_root = {r: [n for n in nodes if n.has_ancestor(r)] for r in set(roots)}
     for selector in generate_selectors_for_nodes(nodes, roots, complexity):
+        logging.info(f"check if unique: {selector}")
         if all(
             selector.uniquely_selects(r, nodes_of_root)
             for r, nodes_of_root in nodes_per_root.items()
@@ -180,6 +182,7 @@ def _generate_regular_node_selectors(node: Node):
             yield f'{node.tag_name}[{attribute}="{value}"]'
 
 
+@functools.cache
 def _get_path_selectors(node: Node, max_length: int) -> tuple[str]:
     return tuple(set(_generate_path_selectors(node, max_length)))
 
diff --git a/tests/test_html.py b/tests/test_html.py
index a782a4c..9008cd7 100644
--- a/tests/test_html.py
+++ b/tests/test_html.py
@@ -16,10 +16,16 @@ def test_get_root_nodes():
 def test_node_parents():
     html = b'<html><body><div><p id="one"></p><p><span id="two"></span></p></div></body></html>'
     page = Page(html)
+    assert page.select("html")[0].parent == page, "html's parent should be page"
+
+
+def test_node_ancestors():
+    html = b'<html><body><div><p id="one"></p><p><span id="two"></span></p></div></body></html>'
+    page = Page(html)
     element = page.select("#one")[0]
     ancestors = element.ancestors
     assert ancestors[0] == element.parent, "first ancestor should be parent"
-    assert ancestors[-1].tag_name == "html", "last ancestor should be html"
+    assert isinstance(ancestors[-1], Page), "last ancestor should be page"
 
 
 def test_node_set():
@@ -77,7 +83,7 @@ def test_find_text_with_whitespace():
     page = Page(html)
     html_matches = page.find_all("whitespace")
 
-    # should match p, body, html
+    # should match p, body, html, but not page
     assert len(html_matches) == 3
     assert all(isinstance(hm, HTMLExactTextMatch) for hm in html_matches)
author	Karl Lorey <git@karllorey.com>	2022-07-06 15:02:07 +0200
committer	Karl Lorey <git@karllorey.com>	2022-07-06 15:02:07 +0200
commit	371574d0ef6b345996375741df0b688c8487747e (patch)
tree	ebe41d8675046babf2a1addd80795834875ac58d
parent	9f17478192977f483d4b1ebaf5e8e0835a46800d (diff)