summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Karl Lorey <git@karllorey.com>, 2020-09-26 17:02:42 +0200
committer: Karl Lorey <git@karllorey.com>, 2020-09-26 17:02:42 +0200
commit: 7c9c218f84a06665921350d0993c2820bec50fea (patch)
tree: 2e3c9cff9c1ef97cba5546db1ff5ed71eaafda65
parent: 2c745d684f2b15c8c91638244240d56b7308926b (diff)
Simplify interfaces even more (plain html only)
-rw-r--r-- README.md 3
-rw-r--r-- mlscraper/training.py 10
-rw-r--r-- test/test_basic.py 7
-rw-r--r-- test/test_new.py 4
4 files changed, 10 insertions, 14 deletions
diff --git a/README.md b/README.md
index 7f22ab8..f23ded4 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,6 @@ After you've defined the data you want to scrape, mlscraper will:
```python
from mlscraper import MultiItemScraper
-from mlscraper.parser import make_soup_page
from mlscraper.training import MultiItemPageSample
# the items found on the training page
@@ -37,7 +36,7 @@ items = [
]
# training the scraper with the items
-sample = MultiItemPageSample(make_soup_page(html), items)
+sample = MultiItemPageSample(html, items)
scraper = MultiItemScraper.build([sample])
scraper.scrape(html) # will produce the items above
scraper.scrape(new_html) # will apply the learned rules and extract new items
diff --git a/mlscraper/training.py b/mlscraper/training.py
index 28517a4..3a8e8b4 100644
--- a/mlscraper/training.py
+++ b/mlscraper/training.py
@@ -1,5 +1,5 @@
# training objects
-from mlscraper.parser import Page
+from mlscraper.parser import Page, make_soup_page
class MultiItemPageSample:
@@ -8,8 +8,8 @@ class MultiItemPageSample:
page = None
items = None
- def __init__(self, page: Page, items: list):
- self.page = page
+ def __init__(self, html: bytes, items: list):
+ self.page = make_soup_page(html)
self.items = items
@@ -19,8 +19,8 @@ class SingleItemPageSample:
page = None
item = None
- def __init__(self, page: Page, item: dict):
- self.page = page
+ def __init__(self, html: bytes, item: dict):
+ self.page = make_soup_page(html)
self.item = item
def find_nodes(self, attr):
diff --git a/test/test_basic.py b/test/test_basic.py
index 0876c91..c9a4e50 100644
--- a/test/test_basic.py
+++ b/test/test_basic.py
@@ -6,7 +6,6 @@ from mlscraper import (
MultiItemScraper,
SingleItemScraper,
MultiItemPageSample,
- make_soup_page,
SingleItemPageSample,
)
@@ -50,7 +49,7 @@ def test_multi(multi_single_result_page_html):
]
html = multi_single_result_page_html
- scraper = MultiItemScraper.build([MultiItemPageSample(make_soup_page(html), items)])
+ scraper = MultiItemScraper.build([MultiItemPageSample(html, items)])
assert scraper.scrape(html) == items
# optional since they're only human guesses
@@ -60,7 +59,7 @@ def test_multi(multi_single_result_page_html):
def test_single(single_basic_train_html):
data = {"name": "Peter", "description": "Cool-looking guy"}
- samples = [SingleItemPageSample(make_soup_page(single_basic_train_html), data)]
+ samples = [SingleItemPageSample(single_basic_train_html, data)]
scraper = SingleItemScraper.build(samples)
result = scraper.scrape(single_basic_train_html)
assert result == data
@@ -68,7 +67,7 @@ def test_single(single_basic_train_html):
def test_single_with_whitespace(whitespace_html):
data = {"name": "Peter", "description": "Cool-looking guy"}
- samples = [SingleItemPageSample(make_soup_page(whitespace_html), data)]
+ samples = [SingleItemPageSample(whitespace_html, data)]
scraper = SingleItemScraper.build(samples)
result = scraper.scrape(
b'<html><body><div><div class="person-name">Klaus</div></div></body></html>'
diff --git a/test/test_new.py b/test/test_new.py
index dbfea13..6efc81a 100644
--- a/test/test_new.py
+++ b/test/test_new.py
@@ -1,12 +1,10 @@
from mlscraper import RuleBasedSingleItemScraper, SingleItemPageSample
-from mlscraper.parser import make_soup_page
def test_basic():
html = '<html><body><div class="parent"><p class="item">result</p></div><p class="item">not a result</p></body></html>'
- page = make_soup_page(html)
item = {"res": "result"}
- samples = [SingleItemPageSample(page, item)]
+ samples = [SingleItemPageSample(html, item)]
scraper = RuleBasedSingleItemScraper.build(samples)
assert scraper.scrape(html) == item