Simplify interfaces even more (plain html only)

author: Karl Lorey <git@karllorey.com> 2020-09-26 17:02:42 +0200
committer: Karl Lorey <git@karllorey.com> 2020-09-26 17:02:42 +0200
commit: 7c9c218f84a06665921350d0993c2820bec50fea (patch)
tree: 2e3c9cff9c1ef97cba5546db1ff5ed71eaafda65
parent: 2c745d684f2b15c8c91638244240d56b7308926b (diff)
4 files changed, 10 insertions, 14 deletions
diff --git a/README.md b/README.md
index 7f22ab8..f23ded4 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,6 @@ After you've defined the data you want to scrape, mlscraper will:
 
 ```python
 from mlscraper import MultiItemScraper
-from mlscraper.parser import make_soup_page
 from mlscraper.training import MultiItemPageSample
 
 # the items found on the training page
@@ -37,7 +36,7 @@ items = [
 ]
 
 # training the scraper with the items
-sample = MultiItemPageSample(make_soup_page(html), items)
+sample = MultiItemPageSample(html, items)
 scraper = MultiItemScraper.build([sample])
 scraper.scrape(html)  # will produce the items above
 scraper.scrape(new_html)  # will apply the learned rules and extract new items
diff --git a/mlscraper/training.py b/mlscraper/training.py
index 28517a4..3a8e8b4 100644
--- a/mlscraper/training.py
+++ b/mlscraper/training.py
@@ -1,5 +1,5 @@
 # training objects
-from mlscraper.parser import Page
+from mlscraper.parser import Page, make_soup_page
 
 
 class MultiItemPageSample:
@@ -8,8 +8,8 @@ class MultiItemPageSample:
     page = None
     items = None
 
-    def __init__(self, page: Page, items: list):
-        self.page = page
+    def __init__(self, html: bytes, items: list):
+        self.page = make_soup_page(html)
         self.items = items
 
 
@@ -19,8 +19,8 @@ class SingleItemPageSample:
     page = None
     item = None
 
-    def __init__(self, page: Page, item: dict):
-        self.page = page
+    def __init__(self, html: bytes, item: dict):
+        self.page = make_soup_page(html)
         self.item = item
 
     def find_nodes(self, attr):
diff --git a/test/test_basic.py b/test/test_basic.py
index 0876c91..c9a4e50 100644
--- a/test/test_basic.py
+++ b/test/test_basic.py
@@ -6,7 +6,6 @@ from mlscraper import (
     MultiItemScraper,
     SingleItemScraper,
     MultiItemPageSample,
-    make_soup_page,
     SingleItemPageSample,
 )
 
@@ -50,7 +49,7 @@ def test_multi(multi_single_result_page_html):
     ]
 
     html = multi_single_result_page_html
-    scraper = MultiItemScraper.build([MultiItemPageSample(make_soup_page(html), items)])
+    scraper = MultiItemScraper.build([MultiItemPageSample(html, items)])
     assert scraper.scrape(html) == items
 
     # optional since they're only human guesses
@@ -60,7 +59,7 @@ def test_multi(multi_single_result_page_html):
 
 def test_single(single_basic_train_html):
     data = {"name": "Peter", "description": "Cool-looking guy"}
-    samples = [SingleItemPageSample(make_soup_page(single_basic_train_html), data)]
+    samples = [SingleItemPageSample(single_basic_train_html, data)]
     scraper = SingleItemScraper.build(samples)
     result = scraper.scrape(single_basic_train_html)
     assert result == data
@@ -68,7 +67,7 @@ def test_single(single_basic_train_html):
 
 def test_single_with_whitespace(whitespace_html):
     data = {"name": "Peter", "description": "Cool-looking guy"}
-    samples = [SingleItemPageSample(make_soup_page(whitespace_html), data)]
+    samples = [SingleItemPageSample(whitespace_html, data)]
     scraper = SingleItemScraper.build(samples)
     result = scraper.scrape(
         b'<html><body><div><div class="person-name">Klaus</div></div></body></html>'
diff --git a/test/test_new.py b/test/test_new.py
index dbfea13..6efc81a 100644
--- a/test/test_new.py
+++ b/test/test_new.py
@@ -1,12 +1,10 @@
 from mlscraper import RuleBasedSingleItemScraper, SingleItemPageSample
-from mlscraper.parser import make_soup_page
 
 
 def test_basic():
     html = '<html><body><div class="parent"><p class="item">result</p></div><p class="item">not a result</p></body></html>'
-    page = make_soup_page(html)
     item = {"res": "result"}
 
-    samples = [SingleItemPageSample(page, item)]
+    samples = [SingleItemPageSample(html, item)]
     scraper = RuleBasedSingleItemScraper.build(samples)
     assert scraper.scrape(html) == item
author	Karl Lorey <git@karllorey.com>	2020-09-26 17:02:42 +0200
committer	Karl Lorey <git@karllorey.com>	2020-09-26 17:02:42 +0200
commit	7c9c218f84a06665921350d0993c2820bec50fea (patch)
tree	2e3c9cff9c1ef97cba5546db1ff5ed71eaafda65
parent	2c745d684f2b15c8c91638244240d56b7308926b (diff)