diff options
author | Karl Lorey <git@karllorey.com> | 2020-09-25 18:49:45 +0200 |
---|---|---|
committer | Karl Lorey <git@karllorey.com> | 2020-09-25 18:49:45 +0200 |
commit | f0b1dbb8494fdea26f217fce59817f386691fcc0 (patch) | |
tree | 7964d243b9938e4be69ad5ea18949a3439bda126 | |
parent | 1e7dbc9346fa85edcb132acad937dd7c15efb22c (diff) |
Improved basic scraper to be fully functional
-rw-r--r-- | mlscraper/parser.py | 35 | ||||
-rw-r--r-- | mlscraper/training.py | 12 | ||||
-rw-r--r-- | mlscraper/util.py | 2 | ||||
-rw-r--r-- | test/test_new.py | 3 |
4 files changed, 36 insertions, 16 deletions
```diff
diff --git a/mlscraper/parser.py b/mlscraper/parser.py
index f6d1507..3fe658a 100644
--- a/mlscraper/parser.py
+++ b/mlscraper/parser.py
@@ -1,13 +1,18 @@
 # everything related to parsing html
+import logging
+import re
 from abc import ABC
 
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, Tag
 
 
 class Page(ABC):
     def select(self, css_selector):
         raise NotImplementedError()
 
+    def find(self, needle):
+        raise NotImplementedError()
+
 
 class Node(ABC):
     pass
@@ -21,7 +26,23 @@ class SoupPage(Page):
         self._soup = soup
 
     def select(self, css_selector):
-        return [SoupNode(res) for res in self._soup.select(css_selector)]
+        try:
+            return [SoupNode(res) for res in self._soup.select(css_selector)]
+        except NotImplementedError:
+            logging.warning("ignoring selector %s: not implemented by BS4" % css_selector)
+            return []
+
+    def find(self, needle):
+        assert type(needle) == str, "can only find strings ATM"
+        text_matches = self._soup.find_all(text=re.compile(needle))
+        logging.debug("Matches for %s: %s", needle, text_matches)
+        text_parents = (ns.parent for ns in text_matches)
+        tag_matches = [p for p in text_parents if extract_soup_text(p) == needle]
+        return [SoupNode(m) for m in tag_matches]
+
+
+def extract_soup_text(tag: Tag):
+    return tag.text
 
 
 class SoupNode(Node):
@@ -40,3 +61,13 @@ class SoupNode(Node):
 def make_soup_page(html):
     soup = BeautifulSoup(html, "lxml")
     return SoupPage(soup)
+
+
+class ExtractionResult:
+    """Specific result found on a page"""
+
+    node = None
+    # extraction_method = None
+
+    def __init__(self, node: Node):
+        self.node = node
diff --git a/mlscraper/training.py b/mlscraper/training.py
index afa3e73..875ba22 100644
--- a/mlscraper/training.py
+++ b/mlscraper/training.py
@@ -1,15 +1,5 @@
 # training objects
-from mlscraper.parser import Page, Node
-
-
-class ExtractionResult:
-    """Specific result found on a page"""
-
-    node = None
-    # extraction_method = None
-
-    def __init__(self, node: Node):
-        self.node = node
+from mlscraper.parser import Page
 
 
 class MultiItemPageSample:
diff --git a/mlscraper/util.py b/mlscraper/util.py
index 7c0b1c1..8cab5c0 100644
--- a/mlscraper/util.py
+++ b/mlscraper/util.py
@@ -1,6 +1,6 @@
 import logging
 from collections import namedtuple
-from itertools import combinations, product, permutations
+from itertools import combinations, product
 from statistics import mean
 
 from bs4 import Tag
diff --git a/test/test_new.py b/test/test_new.py
index 3a4c05e..9ba5f72 100644
--- a/test/test_new.py
+++ b/test/test_new.py
@@ -1,6 +1,5 @@
 from mlscraper import RuleBasedSingleItemScraper, SingleItemPageSample
-from mlscraper.parser import make_soup_page
-from mlscraper.training import ExtractionResult
+from mlscraper.parser import make_soup_page, ExtractionResult
 
 
 def test_basic():
```