diff options
author | Karl Lorey <git@karllorey.com> | 2022-07-06 15:02:07 +0200 |
---|---|---|
committer | Karl Lorey <git@karllorey.com> | 2022-07-06 15:02:07 +0200 |
commit | 371574d0ef6b345996375741df0b688c8487747e (patch) | |
tree | ebe41d8675046babf2a1addd80795834875ac58d | |
parent | 9f17478192977f483d4b1ebaf5e8e0835a46800d (diff) |
Minor performance improvements
-rw-r--r-- | mlscraper/html.py | 24 | ||||
-rw-r--r-- | mlscraper/matches.py | 8 | ||||
-rw-r--r-- | mlscraper/selectors.py | 17 | ||||
-rw-r--r-- | tests/test_html.py | 10 |
4 files changed, 37 insertions, 22 deletions
diff --git a/mlscraper/html.py b/mlscraper/html.py index 4cca384..fc45c29 100644 --- a/mlscraper/html.py +++ b/mlscraper/html.py @@ -89,7 +89,7 @@ class Node: yield HTMLExactTextMatch(node) for p in node.ancestors: - if p.text.strip() == node.text.strip(): + if p.text.strip() == node.text.strip() and not isinstance(p, Page): yield HTMLExactTextMatch(p) # attributes @@ -101,9 +101,14 @@ class Node: # todo implement other find methods - def has_parent(self, node: "Node") -> bool: - for p in self.soup.parents: - if p == node.soup: + def has_ancestor(self, node: "Node") -> bool: + # early return if different page + if self._page != node._page: + return False + + # inline to avoid creating ancestors + for a in self.soup.parents: + if a == node.soup: return True return False @@ -114,7 +119,7 @@ class Node: """ # don't return parent if it would be above <html> tag if self.tag_name in ["html", "[document]"]: - return None + return self._page return self._page._get_node_for_soup(self.soup.parent) @@ -144,9 +149,10 @@ class Node: def html_attributes(self): return self.soup.attrs - def select(self, css_selector): + def select(self, css_selector, limit=None): return [ - self._page._get_node_for_soup(n) for n in self.soup.select(css_selector) + self._page._get_node_for_soup(n) + for n in self.soup.select(css_selector, limit=limit) ] def __repr__(self): @@ -196,6 +202,10 @@ class Page(Node): self._node_registry[soup] = Node(soup, self) return self._node_registry[soup] + @property + def parent(self): + return None + def get_root_node(nodes: list[Node]) -> Node: pages = [n._page for n in nodes] diff --git a/mlscraper/matches.py b/mlscraper/matches.py index f7eb447..ace47b3 100644 --- a/mlscraper/matches.py +++ b/mlscraper/matches.py @@ -29,16 +29,12 @@ class Match: def has_overlap(self, other_match: "Match"): assert isinstance(other_match, Match) - # early return if different document - if self.root.page != other_match.root.page: - return False - return ( # overlap if same root node self.root == other_match.root # or if one is a parent of the other one - or self.root.has_parent(other_match.root) - or other_match.root.has_parent(self.root) + or self.root.has_ancestor(other_match.root) + or other_match.root.has_ancestor(self.root) ) @property diff --git a/mlscraper/selectors.py b/mlscraper/selectors.py index fe1019a..693637d 100644 --- a/mlscraper/selectors.py +++ b/mlscraper/selectors.py @@ -48,20 +48,21 @@ class CssRuleSelector(Selector): return node.select(self.css_rule) def uniquely_selects(self, root: Node, nodes: typing.Collection[Node]): + # limit +1 + # ensures mismatch if selection result starts with nodes + # e.g. select returns [1,2,3,...] and nodes is [1,2,3] + # but decreases load with many results significantly + limit = len(nodes) + 1 + # directly using soups: # - avoids creating nodes for all selects # - increases caching effort - - # limit +1 ensures mismatch - # if selection result starts with nodes - # e.g. select returns [1,2,3,...] and nodes is [1,2,3] - limit = len(nodes) + 1 return root.soup.select(self.css_rule, limit=limit) == [n.soup for n in nodes] # using select # - creates nodes for every soup object # - leverages caching - # return root.select(self.css_rule) == nodes + # return root.select(self.css_rule, limit=limit) == nodes def __repr__(self): return f"<{self.__class__.__name__} {self.css_rule=}>" @@ -77,8 +78,9 @@ def generate_unique_selectors_for_nodes( logging.info("roots is None, using pages as roots") roots = [n.page for n in nodes] - nodes_per_root = {r: [n for n in nodes if n.has_parent(r)] for r in set(roots)} + nodes_per_root = {r: [n for n in nodes if n.has_ancestor(r)] for r in set(roots)} for selector in generate_selectors_for_nodes(nodes, roots, complexity): + logging.info(f"check if unique: {selector}") if all( selector.uniquely_selects(r, nodes_of_root) for r, nodes_of_root in nodes_per_root.items() @@ -180,6 +182,7 @@ def _generate_regular_node_selectors(node: Node): yield f'{node.tag_name}[{attribute}="{value}"]' +@functools.cache def _get_path_selectors(node: Node, max_length: int) -> tuple[str]: return tuple(set(_generate_path_selectors(node, max_length))) diff --git a/tests/test_html.py b/tests/test_html.py index a782a4c..9008cd7 100644 --- a/tests/test_html.py +++ b/tests/test_html.py @@ -16,10 +16,16 @@ def test_get_root_nodes(): def test_node_parents(): html = b'<html><body><div><p id="one"></p><p><span id="two"></span></p></div></body></html>' page = Page(html) + assert page.select("html")[0].parent == page, "html's parent should be page" + + +def test_node_ancestors(): + html = b'<html><body><div><p id="one"></p><p><span id="two"></span></p></div></body></html>' + page = Page(html) element = page.select("#one")[0] ancestors = element.ancestors assert ancestors[0] == element.parent, "first ancestor should be parent" - assert ancestors[-1].tag_name == "html", "last ancestor should be html" + assert isinstance(ancestors[-1], Page), "last ancestor should be page" def test_node_set(): @@ -77,7 +83,7 @@ def test_find_text_with_whitespace(): page = Page(html) html_matches = page.find_all("whitespace") - # should match p, body, html + # should match p, body, html, but not page assert len(html_matches) == 3 assert all(isinstance(hm, HTMLExactTextMatch) for hm in html_matches) |