Fix ListScraper and introduce maximum complexity parameter

author: Karl Lorey <git@karllorey.com> 2022-06-17 21:58:11 +0200
committer: Karl Lorey <git@karllorey.com> 2022-06-17 21:58:11 +0200
commit: edc327cbc52de5fc9d9cb8eb475d8007ea7337f1 (patch)
tree: fdf40ce9115c8ca5a4e014aa70b13d4d6caabadf
parent: 26e96a8e4e306bf350dc2f6d6b379d9509d18198 (diff)
8 files changed, 122 insertions, 150 deletions
diff --git a/mlscraper/html.py b/mlscraper/html.py
index c19f0db..0a6f4de 100644
--- a/mlscraper/html.py
+++ b/mlscraper/html.py
@@ -14,9 +14,6 @@ from bs4 import NavigableString
 from bs4 import Tag
 from mlscraper.util import powerset_max_length
 
-PARENT_NODE_COUNT_MAX = 2
-CSS_CLASS_COMBINATIONS_MAX = 2
-
 
 @dataclass
 class Match(ABC):
@@ -33,7 +30,7 @@ class AttributeMatch(Match):
     attr: str = None
 
 
-def _generate_css_selectors_for_node(soup: Tag):
+def _generate_css_selectors_for_node(soup: Tag, complexity: int):
     """
     Generate a selector for the given node.
     :param soup:
@@ -48,7 +45,7 @@ def _generate_css_selectors_for_node(soup: Tag):
 
     # use classes
     css_classes = soup.attrs.get("class", [])
-    for css_class_combo in powerset_max_length(css_classes, CSS_CLASS_COMBINATIONS_MAX):
+    for css_class_combo in powerset_max_length(css_classes, complexity):
         css_clases_str = "".join([f".{css_class}" for css_class in css_class_combo])
         css_selector = soup.name + css_clases_str
         yield css_selector
@@ -108,7 +105,7 @@ class Node:
                 return True
         return False
 
-    def generate_path_selectors(self):
+    def generate_path_selectors(self, complexity: int):
         """
         Generate a selector for the path to the given node.
         :return:
@@ -133,7 +130,7 @@ class Node:
         # print(parents)
 
         # loop from i=0 to i=len(parents) as we consider all parents
-        parent_node_count_max = min(len(parents), PARENT_NODE_COUNT_MAX)
+        parent_node_count_max = min(len(parents), complexity)
         for parent_node_count in range(parent_node_count_max + 1):
             logging.info(
                 "generating path selectors with %d parents" % parent_node_count
@@ -146,7 +143,8 @@ class Node:
                 # make a list of selector generators for each node in the path
                 # todo limit generated selectors -> huge product
                 selector_generators_for_each_path_node = [
-                    _generate_css_selectors_for_node(n) for n in path_sampled
+                    _generate_css_selectors_for_node(n, complexity)
+                    for n in path_sampled
                 ]
 
                 # generator that outputs selector paths
@@ -171,7 +169,7 @@ class Node:
     def __repr__(self):
         if isinstance(self.soup, NavigableString):
             return f"<{self.__class__.__name__} {self.soup[:100]=}>"
-        return f"<{self.__class__.__name__} {self.soup.name=} classes={self.soup.get('class', None)}>"
+        return f"<{self.__class__.__name__} {self.soup.name=} classes={self.soup.get('class', None)}, text={self.soup.text[:10]}...>"
 
     def __hash__(self):
         return self.soup.__hash__()
diff --git a/mlscraper/matches.py b/mlscraper/matches.py
index ca5d66a..d1a8498 100644
--- a/mlscraper/matches.py
+++ b/mlscraper/matches.py
@@ -3,12 +3,12 @@ Matches are specific elements found on a page that match a sample.
 """
 import logging
 import typing
+from functools import cached_property
 
 from mlscraper.html import AttributeMatch
 from mlscraper.html import get_root_node
 from mlscraper.html import Node
 from mlscraper.html import TextMatch
-from mlscraper.selectors import Selector
 
 
 class Match:
@@ -33,30 +33,6 @@ class Extractor:
         raise NotImplementedError()
 
 
-class Matcher:
-    """
-    Class that finds/selects nodes and extracts items from these nodes.
-    """
-
-    selector = None
-    extractor = None
-
-    def __init__(self, selector: Selector, extractor: Extractor):
-        self.selector = selector
-        self.extractor = extractor
-
-    def match_one(self, node: Node) -> Match:
-        selected_node = self.selector.select_one(node)
-        return Match(selected_node, self.extractor)
-
-    def match_all(self, node: Node) -> typing.List[Match]:
-        selected_nodes = self.selector.select_all(node)
-        return [Match(n, self.extractor) for n in selected_nodes]
-
-    def __repr__(self):
-        return f"<{self.__class__.__name__} {self.selector=} {self.extractor=}>"
-
-
 class TextValueExtractor(Extractor):
     """
     Class to extract text from a node.
@@ -102,23 +78,13 @@ class AttributeValueExtractor(Extractor):
         return isinstance(other, AttributeValueExtractor) and self.attr == other.attr
 
 
-class DictExtractor(Extractor):
-    def __init__(self, matcher_by_key: typing.Dict[str, Matcher]):
-        self.matcher_by_key = matcher_by_key
-
-    def extract(self, node: Node):
-        return {
-            key: matcher.match_one(node) for key, matcher in self.matcher_by_key.items()
-        }
-
-
 class DictMatch(Match):
     match_by_key = None
 
     def __init__(self, match_by_key: dict):
         self.match_by_key = match_by_key
 
-    @property
+    @cached_property
     def root(self) -> Node:
         match_roots = [m.root for m in self.match_by_key.values()]
         return get_root_node(match_roots)
@@ -136,7 +102,7 @@ class ListMatch(Match):
     def __repr__(self):
         return f"<{self.__class__.__name__} {self.matches=}>"
 
-    @property
+    @cached_property
     def root(self) -> Node:
         return get_root_node([m.root for m in self.matches])
 
diff --git a/mlscraper/samples.py b/mlscraper/samples.py
index f369de4..8f54be9 100644
--- a/mlscraper/samples.py
+++ b/mlscraper/samples.py
@@ -1,15 +1,10 @@
-import logging
 import typing
 from itertools import product
 
-from mlscraper.html import Node
 from mlscraper.html import Page
 from mlscraper.matches import DictMatch
 from mlscraper.matches import generate_all_value_matches
 from mlscraper.matches import ListMatch
-from mlscraper.matches import Matcher
-from mlscraper.selectors import CssRuleSelector
-from more_itertools import flatten
 
 
 class ItemStructureException(Exception):
@@ -35,6 +30,7 @@ class Sample:
 
             # generate list of combinations
             # todo filter combinations that use the same matches twice
+            # todo create combinations only in order
             match_combis = product(*matches_by_value)
 
             return [ListMatch(tuple(match_combi)) for match_combi in match_combis]
@@ -155,60 +151,3 @@ def make_training_set(pages, items):
         ts.add_sample(Sample(p, i))
 
     return ts
-
-
-def make_matcher_for_samples(
-    samples: typing.List[Sample], roots: typing.Optional[typing.List[Node]] = None
-) -> typing.Union[Matcher, None]:
-    for sample in samples:
-        # todo leverage generator or cache
-        assert sample.get_matches(), f"no matches found for {sample}"
-
-    for matcher in generate_matchers_for_samples(samples, roots):
-        return matcher
-    return None
-
-
-def generate_matchers_for_samples(
-    samples: typing.List[Sample], roots: typing.Optional[typing.List[Node]] = None
-) -> typing.Generator:
-    """
-    Generate CSS selectors that match the given samples.
-    :param samples:
-    :param roots: root nodes to search from
-    :return:
-    """
-    logging.info(f"generating matchers for samples {samples=} {roots=}")
-    if not roots:
-        roots = [s.page for s in samples]
-        logging.info("roots not set, will use samples' pages")
-
-    assert len(samples) == len(roots)
-
-    # make a list containing sets of nodes for each possible combination of matches
-    # -> enables fast searching and set ensures order
-    # todo add only matches below roots here
-    matches_per_sample = [s.get_matches() for s in samples]
-    match_combinations = list(map(set, product(*matches_per_sample)))
-    logging.info(f"match combinations: {match_combinations}")
-    node_combinations = [{m.node for m in matches} for matches in match_combinations]
-
-    for sample in samples:
-        for match in sample.get_matches():
-            for css_sel in match.root.generate_path_selectors():
-                logging.info(f"testing selector: {css_sel}")
-                matched_nodes = set(flatten(root.select(css_sel) for root in roots))
-                if matched_nodes in node_combinations:
-                    logging.info(f"{css_sel} matches one of the possible combinations")
-                    i = node_combinations.index(matched_nodes)
-                    matches = match_combinations[i]
-                    match_extractors = {m.extractor for m in matches}
-                    if len(match_extractors) == 1:
-                        logging.info(f"{css_sel} matches same extractors")
-                        selector = CssRuleSelector(css_sel)
-                        extractor = next(iter(match_extractors))
-                        yield Matcher(selector, extractor)
-                    else:
-                        logging.info(
-                            f"{css_sel} would need different extractors, ignoring: {match_extractors}"
-                        )
diff --git a/mlscraper/selectors.py b/mlscraper/selectors.py
index 8a15ad3..1ba4c55 100644
--- a/mlscraper/selectors.py
+++ b/mlscraper/selectors.py
@@ -2,9 +2,7 @@ import logging
 import typing
 
 from mlscraper.html import Node
-from mlscraper.html import Page
 from mlscraper.html import selector_matches_nodes
-from more_itertools import bucket
 
 
 class Selector:
@@ -47,8 +45,10 @@ class CssRuleSelector(Selector):
         return f"<{self.__class__.__name__} {self.css_rule=}>"
 
 
-def generate_selector_for_nodes(nodes: typing.List[Node], roots):
-    logging.info(f"trying to find selector for nodes ({nodes=}, {roots=})")
+def generate_selector_for_nodes(nodes: typing.List[Node], roots, complexity: int):
+    logging.info(
+        f"trying to find selector for nodes ({nodes=}, {roots=}, {complexity=})"
+    )
     assert nodes, "no nodes given"
 
     if roots is None:
@@ -63,7 +63,7 @@ def generate_selector_for_nodes(nodes: typing.List[Node], roots):
     selectors_seen = set()
 
     for node in nodes:
-        for sel in node.generate_path_selectors():
+        for sel in node.generate_path_selectors(complexity):
             logging.info(f"selector: {sel}")
             if sel not in selectors_seen:
                 logging.info(
diff --git a/mlscraper/training.py b/mlscraper/training.py
index 8512682..55e4aa7 100644
--- a/mlscraper/training.py
+++ b/mlscraper/training.py
@@ -32,17 +32,35 @@ def train_scraper(training_set: TrainingSet):
 
     sample_matches = [s.get_matches() for s in training_set.item.samples]
     roots = [s.page for s in training_set.item.samples]
-    for match_combination in product(*sample_matches):
-        logging.info(f"trying to train scraper for matches ({match_combination=})")
-        scraper = train_scraper_for_matches(match_combination, roots)
-        return scraper
+    match_combinations = [mc for mc in product(*sample_matches)]
+    logging.info(f"Trying {len(match_combinations)=}")
 
-
-def train_scraper_for_matches(matches, roots):
+    for complexity in range(3):
+        for match_combination in match_combinations:
+            logging.info(
+                f"progress {match_combinations.index(match_combination)/len(match_combinations)}"
+            )
+            try:
+                logging.info(
+                    f"trying to train scraper for matches ({match_combination=})"
+                )
+                scraper = train_scraper_for_matches(
+                    match_combination, roots, complexity
+                )
+                return scraper
+            except NoScraperFoundException:
+                logging.info(
+                    f"no scraper found for complexity and match_combination ({complexity=}, {match_combination=})"
+                )
+    raise NoScraperFoundException(f"did not find scraper")
+
+
+def train_scraper_for_matches(matches, roots, complexity: int):
     """
     Train a scraper that finds the given matches from the given roots.
     :param matches: the matches to scrape
     :param roots: the root elements containing the matches, e.g. pages or elements on pages
+    :param complexity: upper bound on selector complexity to try (max parent nodes in the path and max CSS classes combined per node)
     """
     found_types = set(map(type, matches))
     assert (
@@ -71,9 +89,15 @@ def train_scraper_for_matches(matches, roots):
         if all(m.node == r for m, r in zip(matches, roots)):
             # nodes are matched already, done
             return ValueScraper(PassThroughSelector(), extractor=extractor)
+        else:
+            logging.info(
+                "no early return: %s",
+                [(m.node, r, m.node == r) for m, r in zip(matches, roots)],
+            )
 
         selector = first(
-            generate_selector_for_nodes([m.node for m in matches], roots), None
+            generate_selector_for_nodes([m.node for m in matches], roots, complexity),
+            None,
         )
         if not selector:
             raise NoScraperFoundException(f"no selector found {matches=}")
@@ -90,35 +114,50 @@ def train_scraper_for_matches(matches, roots):
         # matches are the matches for the keys
         # roots are the original roots(?)
         scraper_per_key = {
-            k: train_scraper_for_matches([m.match_by_key[k] for m in matches], roots)
+            k: train_scraper_for_matches(
+                [m.match_by_key[k] for m in matches], roots, complexity
+            )
             for k in keys
         }
         return DictScraper(scraper_per_key)
     elif found_type == ListMatch:
         logging.info("training ListScraper")
         matches: typing.List[ListMatch]
+        logging.info(matches)
 
         # so we have a list of ListMatch objects
         # we have to find a selector that uniquely matches the list elements
         # todo can be one of the parents
-        match_roots = [m.root for m in matches]
-        logging.info(f"{match_roots=}")
+        # for each match, generate all the nodes of list items
+        list_item_match_and_roots = [
+            (im, r) for m, r in zip(matches, roots) for im in m.matches
+        ]
+        list_item_nodes_and_roots = [
+            (im.root, r) for im, r in list_item_match_and_roots
+        ]
+        item_nodes, item_roots = unzip(list_item_nodes_and_roots)
 
         # first selector is fine as it matches perfectly
         # no need to try other selectors
         # -> item_scraper would be the same
-        selector = first(generate_selector_for_nodes(match_roots, roots))
+        selector = first(
+            generate_selector_for_nodes(list(item_nodes), list(item_roots), complexity),
+            None,
+        )
         if selector:
-            # for all the item_matches, create a tuple
-            # that contains the item_match and the new root
-            matches_and_roots = [
-                (im, selector.select_one(r))
-                for m, r in zip(matches, roots)
-                for im in m.matches
+            logging.info(f"selector that matches list items found ({selector=})")
+            # so we have found a selector that matches the list items
+            # we now need a scraper, that scrapes each contained item
+            # todo im.root does not hold for all items, could be a parent
+            item_matches_and_item_roots = [
+                (im, im.root) for im, r in list_item_match_and_roots
             ]
-            item_matches, list_roots = unzip(matches_and_roots)
+            logging.info(
+                f"training to extract list items now ({item_matches_and_item_roots})"
+            )
+            item_matches, item_roots = unzip(item_matches_and_item_roots)
             item_scraper = train_scraper_for_matches(
-                list(item_matches), list(list_roots)
+                list(item_matches), list(item_roots), complexity
             )
             return ListScraper(selector, item_scraper)
         else:
diff --git a/tests/test_samples.py b/tests/test_samples.py
index df18d86..b6677f0 100644
--- a/tests/test_samples.py
+++ b/tests/test_samples.py
@@ -3,7 +3,6 @@ from mlscraper.html import Page
 from mlscraper.matches import DictMatch
 from mlscraper.matches import ListMatch
 from mlscraper.samples import ItemStructureException
-from mlscraper.samples import make_matcher_for_samples
 from mlscraper.samples import make_training_set
 from mlscraper.samples import Sample
 
@@ -64,17 +63,3 @@ class TestMatch:
         assert len(match.matches) == 2
         assert all(isinstance(m, DictMatch) for m in match.matches)
         print(match.root)
-
-
-def test_make_matcher_for_samples():
-    page1_html = '<html><body><p class="test">test</p><p>bla</p></body></html>'
-    page1 = Page(page1_html)
-    sample1 = Sample(page1, "test")
-
-    page2_html = '<html><body><div></div><p class="test">hallo</p></body></html>'
-    page2 = Page(page2_html)
-    sample2 = Sample(page2, "hallo")
-
-    samples = [sample1, sample2]
-    matcher = make_matcher_for_samples(samples)
-    assert matcher.selector.css_rule in ["p.test", ".test"]
diff --git a/tests/test_selectors.py b/tests/test_selectors.py
index 57a5ded..c2d204a 100644
--- a/tests/test_selectors.py
+++ b/tests/test_selectors.py
@@ -16,6 +16,6 @@ def test_generate_selector_for_nodes():
 
     nodes = [s.get_matches()[0].root for s in samples]
     print(nodes)
-    gen = generate_selector_for_nodes(nodes, None)
+    gen = generate_selector_for_nodes(nodes, None, 1)
     # todo .test is also possible
     assert ["p.test"] == [sel.css_rule for sel in gen]
diff --git a/tests/test_training.py b/tests/test_training.py
index 3e48756..ddd86fb 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -1,7 +1,12 @@
 import pytest
 from mlscraper.html import Page
+from mlscraper.matches import TextValueExtractor
 from mlscraper.samples import Sample
 from mlscraper.samples import TrainingSet
+from mlscraper.scrapers import ListScraper
+from mlscraper.scrapers import ValueScraper
+from mlscraper.selectors import CssRuleSelector
+from mlscraper.selectors import PassThroughSelector
 from mlscraper.training import train_scraper
 
 
@@ -13,10 +18,50 @@ def test_train_scraper_simple_list():
         ["a", "b", "c"],
     )
     training_set.add_sample(sample)
-    train_scraper(training_set)
+    scraper = train_scraper(training_set)
+
+    # check list scraper
+    assert isinstance(scraper, ListScraper)
+    assert isinstance(scraper.selector, CssRuleSelector)
+    assert scraper.selector.css_rule == "p"
+
+    # check item scraper
+    item_scraper = scraper.scraper
+    assert isinstance(item_scraper, ValueScraper)
+    assert isinstance(item_scraper.selector, PassThroughSelector)
+    assert isinstance(item_scraper.extractor, TextValueExtractor)
+
+
+def test_train_scraper_list_of_dicts():
+    html = b"""
+    <html>
+    <body>
+    <div><p>a</p><p>b</p></div>
+    <div><p>c</p><p>d</p></div>
+    </body>
+    </html
+    """
+    page = Page(html)
+    sample = Sample(page, [["a", "b"], ["c", "d"]])
+    training_set = TrainingSet()
+    training_set.add_sample(sample)
+    scraper = train_scraper(training_set)
+    assert isinstance(scraper, ListScraper)
+    assert isinstance(scraper.selector, CssRuleSelector)
+    assert scraper.selector.css_rule == "div"
+
+    inner_scraper = scraper.scraper
+    assert isinstance(inner_scraper, ListScraper)
+    assert isinstance(inner_scraper.selector, CssRuleSelector)
+    assert inner_scraper.selector.css_rule == "p"
+
+    value_scraper = inner_scraper.scraper
+    assert isinstance(value_scraper, ValueScraper)
+    assert isinstance(value_scraper.selector, PassThroughSelector)
+    assert isinstance(value_scraper.extractor, TextValueExtractor)
 
 
-@pytest.mark.skip("does not work yet")
+@pytest.mark.skip("takes too long")
 def test_train_scraper(stackoverflow_samples):
     training_set = TrainingSet()
     for s in stackoverflow_samples:
author	Karl Lorey <git@karllorey.com>	2022-06-17 21:58:11 +0200
committer	Karl Lorey <git@karllorey.com>	2022-06-17 21:58:11 +0200
commit	edc327cbc52de5fc9d9cb8eb475d8007ea7337f1 (patch)
tree	fdf40ce9115c8ca5a4e014aa70b13d4d6caabadf
parent	26e96a8e4e306bf350dc2f6d6b379d9509d18198 (diff)