Improved basic scraper to be fully functional

author: Karl Lorey <git@karllorey.com> 2020-09-25 18:49:45 +0200
committer: Karl Lorey <git@karllorey.com> 2020-09-25 18:49:45 +0200
commit: f0b1dbb8494fdea26f217fce59817f386691fcc0 (patch)
tree: 7964d243b9938e4be69ad5ea18949a3439bda126
parent: 1e7dbc9346fa85edcb132acad937dd7c15efb22c (diff)
4 files changed, 36 insertions, 16 deletions
diff --git a/mlscraper/parser.py b/mlscraper/parser.py
index f6d1507..3fe658a 100644
--- a/mlscraper/parser.py
+++ b/mlscraper/parser.py
@@ -1,13 +1,18 @@
 # everything related to parsing html
+import logging
+import re
 from abc import ABC
 
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, Tag
 
 
 class Page(ABC):
     def select(self, css_selector):
         raise NotImplementedError()
 
+    def find(self, needle):
+        raise NotImplementedError()
+
 
 class Node(ABC):
     pass
@@ -21,7 +26,23 @@ class SoupPage(Page):
         self._soup = soup
 
     def select(self, css_selector):
-        return [SoupNode(res) for res in self._soup.select(css_selector)]
+        try:
+            return [SoupNode(res) for res in self._soup.select(css_selector)]
+        except NotImplementedError:
+            logging.warning("ignoring selector %s: not implemented by BS4" % css_selector)
+            return []
+
+    def find(self, needle):
+        assert type(needle) == str, "can only find strings ATM"
+        text_matches = self._soup.find_all(text=re.compile(needle))
+        logging.debug("Matches for %s: %s", needle, text_matches)
+        text_parents = (ns.parent for ns in text_matches)
+        tag_matches = [p for p in text_parents if extract_soup_text(p) == needle]
+        return [SoupNode(m) for m in tag_matches]
+
+
+def extract_soup_text(tag: Tag):
+    return tag.text
 
 
 class SoupNode(Node):
@@ -40,3 +61,13 @@ class SoupNode(Node):
 def make_soup_page(html):
     soup = BeautifulSoup(html, "lxml")
     return SoupPage(soup)
+
+
+class ExtractionResult:
+    """Specific result found on a page"""
+
+    node = None
+    # extraction_method = None
+
+    def __init__(self, node: Node):
+        self.node = node
diff --git a/mlscraper/training.py b/mlscraper/training.py
index afa3e73..875ba22 100644
--- a/mlscraper/training.py
+++ b/mlscraper/training.py
@@ -1,15 +1,5 @@
 # training objects
-from mlscraper.parser import Page, Node
-
-
-class ExtractionResult:
-    """Specific result found on a page"""
-
-    node = None
-    # extraction_method = None
-
-    def __init__(self, node: Node):
-        self.node = node
+from mlscraper.parser import Page
 
 
 class MultiItemPageSample:
diff --git a/mlscraper/util.py b/mlscraper/util.py
index 7c0b1c1..8cab5c0 100644
--- a/mlscraper/util.py
+++ b/mlscraper/util.py
@@ -1,6 +1,6 @@
 import logging
 from collections import namedtuple
-from itertools import combinations, product, permutations
+from itertools import combinations, product
 from statistics import mean
 
 from bs4 import Tag
diff --git a/test/test_new.py b/test/test_new.py
index 3a4c05e..9ba5f72 100644
--- a/test/test_new.py
+++ b/test/test_new.py
@@ -1,6 +1,5 @@
 from mlscraper import RuleBasedSingleItemScraper, SingleItemPageSample
-from mlscraper.parser import make_soup_page
-from mlscraper.training import ExtractionResult
+from mlscraper.parser import make_soup_page, ExtractionResult
 
 
 def test_basic():
author	Karl Lorey <git@karllorey.com>	2020-09-25 18:49:45 +0200
committer	Karl Lorey <git@karllorey.com>	2020-09-25 18:49:45 +0200
commit	f0b1dbb8494fdea26f217fce59817f386691fcc0 (patch)
tree	7964d243b9938e4be69ad5ea18949a3439bda126
parent	1e7dbc9346fa85edcb132acad937dd7c15efb22c (diff)