diff options
author | Karl Lorey <git@karllorey.com> | 2020-09-26 17:02:42 +0200 |
---|---|---|
committer | Karl Lorey <git@karllorey.com> | 2020-09-26 17:02:42 +0200 |
commit | 7c9c218f84a06665921350d0993c2820bec50fea (patch) | |
tree | 2e3c9cff9c1ef97cba5546db1ff5ed71eaafda65 | |
parent | 2c745d684f2b15c8c91638244240d56b7308926b (diff) |
Simplify interfaces even more (plain html only)
-rw-r--r-- | README.md | 3 | ||||
-rw-r--r-- | mlscraper/training.py | 10 | ||||
-rw-r--r-- | test/test_basic.py | 7 | ||||
-rw-r--r-- | test/test_new.py | 4 |
4 files changed, 10 insertions, 14 deletions
````diff
@@ -26,7 +26,6 @@ After you've defined the data you want to scrape, mlscraper will:
 
 ```python
 from mlscraper import MultiItemScraper
-from mlscraper.parser import make_soup_page
 from mlscraper.training import MultiItemPageSample
 
 # the items found on the training page
@@ -37,7 +36,7 @@ items = [
 ]
 
 # training the scraper with the items
-sample = MultiItemPageSample(make_soup_page(html), items)
+sample = MultiItemPageSample(html, items)
 scraper = MultiItemScraper.build([sample])
 scraper.scrape(html) # will produce the items above
 scraper.scrape(new_html) # will apply the learned rules and extract new items
diff --git a/mlscraper/training.py b/mlscraper/training.py
index 28517a4..3a8e8b4 100644
--- a/mlscraper/training.py
+++ b/mlscraper/training.py
@@ -1,5 +1,5 @@
 # training objects
-from mlscraper.parser import Page
+from mlscraper.parser import Page, make_soup_page
 
 
 class MultiItemPageSample:
@@ -8,8 +8,8 @@ class MultiItemPageSample:
     page = None
     items = None
 
-    def __init__(self, page: Page, items: list):
-        self.page = page
+    def __init__(self, html: bytes, items: list):
+        self.page = make_soup_page(html)
         self.items = items
 
 
@@ -19,8 +19,8 @@ class SingleItemPageSample:
     page = None
     item = None
 
-    def __init__(self, page: Page, item: dict):
-        self.page = page
+    def __init__(self, html: bytes, item: dict):
+        self.page = make_soup_page(html)
         self.item = item
 
     def find_nodes(self, attr):
diff --git a/test/test_basic.py b/test/test_basic.py
index 0876c91..c9a4e50 100644
--- a/test/test_basic.py
+++ b/test/test_basic.py
@@ -6,7 +6,6 @@ from mlscraper import (
     MultiItemScraper,
     SingleItemScraper,
     MultiItemPageSample,
-    make_soup_page,
     SingleItemPageSample,
 )
 
@@ -50,7 +49,7 @@ def test_multi(multi_single_result_page_html):
     ]
 
     html = multi_single_result_page_html
-    scraper = MultiItemScraper.build([MultiItemPageSample(make_soup_page(html), items)])
+    scraper = MultiItemScraper.build([MultiItemPageSample(html, items)])
     assert scraper.scrape(html) == items
 
     # optional since they're only human guesses
@@ -60,7 +59,7 @@ def test_multi(multi_single_result_page_html):
 
 def test_single(single_basic_train_html):
     data = {"name": "Peter", "description": "Cool-looking guy"}
-    samples = [SingleItemPageSample(make_soup_page(single_basic_train_html), data)]
+    samples = [SingleItemPageSample(single_basic_train_html, data)]
     scraper = SingleItemScraper.build(samples)
     result = scraper.scrape(single_basic_train_html)
     assert result == data
@@ -68,7 +67,7 @@ def test_single(single_basic_train_html):
 
 def test_single_with_whitespace(whitespace_html):
     data = {"name": "Peter", "description": "Cool-looking guy"}
-    samples = [SingleItemPageSample(make_soup_page(whitespace_html), data)]
+    samples = [SingleItemPageSample(whitespace_html, data)]
     scraper = SingleItemScraper.build(samples)
     result = scraper.scrape(
         b'<html><body><div><div class="person-name">Klaus</div></div></body></html>'
diff --git a/test/test_new.py b/test/test_new.py
index dbfea13..6efc81a 100644
--- a/test/test_new.py
+++ b/test/test_new.py
@@ -1,12 +1,10 @@
 from mlscraper import RuleBasedSingleItemScraper, SingleItemPageSample
-from mlscraper.parser import make_soup_page
 
 
 def test_basic():
     html = '<html><body><div class="parent"><p class="item">result</p></div><p class="item">not a result</p></body></html>'
-    page = make_soup_page(html)
     item = {"res": "result"}
 
-    samples = [SingleItemPageSample(page, item)]
+    samples = [SingleItemPageSample(html, item)]
     scraper = RuleBasedSingleItemScraper.build(samples)
     assert scraper.scrape(html) == item
````