summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Karl Lorey <git@karllorey.com> 2020-09-25 18:49:45 +0200
committer Karl Lorey <git@karllorey.com> 2020-09-25 18:49:45 +0200
commit f0b1dbb8494fdea26f217fce59817f386691fcc0 (patch)
tree 7964d243b9938e4be69ad5ea18949a3439bda126
parent 1e7dbc9346fa85edcb132acad937dd7c15efb22c (diff)
Improved basic scraper to be fully functional
-rw-r--r-- mlscraper/parser.py 35
-rw-r--r-- mlscraper/training.py 12
-rw-r--r-- mlscraper/util.py 2
-rw-r--r-- test/test_new.py 3
4 files changed, 36 insertions, 16 deletions
diff --git a/mlscraper/parser.py b/mlscraper/parser.py
index f6d1507..3fe658a 100644
--- a/mlscraper/parser.py
+++ b/mlscraper/parser.py
@@ -1,13 +1,18 @@
# everything related to parsing html
+import logging
+import re
from abc import ABC
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, Tag
class Page(ABC):
def select(self, css_selector):
raise NotImplementedError()
+ def find(self, needle):
+ raise NotImplementedError()
+
class Node(ABC):
pass
@@ -21,7 +26,23 @@ class SoupPage(Page):
self._soup = soup
def select(self, css_selector):
- return [SoupNode(res) for res in self._soup.select(css_selector)]
+ try:
+ return [SoupNode(res) for res in self._soup.select(css_selector)]
+ except NotImplementedError:
+ logging.warning("ignoring selector %s: not implemented by BS4" % css_selector)
+ return []
+
+ def find(self, needle):
+ assert type(needle) == str, "can only find strings ATM"
+ text_matches = self._soup.find_all(text=re.compile(needle))
+ logging.debug("Matches for %s: %s", needle, text_matches)
+ text_parents = (ns.parent for ns in text_matches)
+ tag_matches = [p for p in text_parents if extract_soup_text(p) == needle]
+ return [SoupNode(m) for m in tag_matches]
+
+
+def extract_soup_text(tag: Tag):
+ return tag.text
class SoupNode(Node):
@@ -40,3 +61,13 @@ class SoupNode(Node):
def make_soup_page(html):
soup = BeautifulSoup(html, "lxml")
return SoupPage(soup)
+
+
+class ExtractionResult:
+ """Specific result found on a page"""
+
+ node = None
+ # extraction_method = None
+
+ def __init__(self, node: Node):
+ self.node = node
diff --git a/mlscraper/training.py b/mlscraper/training.py
index afa3e73..875ba22 100644
--- a/mlscraper/training.py
+++ b/mlscraper/training.py
@@ -1,15 +1,5 @@
# training objects
-from mlscraper.parser import Page, Node
-
-
-class ExtractionResult:
- """Specific result found on a page"""
-
- node = None
- # extraction_method = None
-
- def __init__(self, node: Node):
- self.node = node
+from mlscraper.parser import Page
class MultiItemPageSample:
diff --git a/mlscraper/util.py b/mlscraper/util.py
index 7c0b1c1..8cab5c0 100644
--- a/mlscraper/util.py
+++ b/mlscraper/util.py
@@ -1,6 +1,6 @@
import logging
from collections import namedtuple
-from itertools import combinations, product, permutations
+from itertools import combinations, product
from statistics import mean
from bs4 import Tag
diff --git a/test/test_new.py b/test/test_new.py
index 3a4c05e..9ba5f72 100644
--- a/test/test_new.py
+++ b/test/test_new.py
@@ -1,6 +1,5 @@
from mlscraper import RuleBasedSingleItemScraper, SingleItemPageSample
-from mlscraper.parser import make_soup_page
-from mlscraper.training import ExtractionResult
+from mlscraper.parser import make_soup_page, ExtractionResult
def test_basic():