summaryrefslogtreecommitdiffstats
path: root/tests/test_training.py
diff options
context:
space:
mode:
Diffstat (limited to 'tests/test_training.py')
-rw-r--r--tests/test_training.py22
1 files changed, 21 insertions, 1 deletions
diff --git a/tests/test_training.py b/tests/test_training.py
index ccd0441..f0f1703 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -1,4 +1,3 @@
-import pytest
from mlscraper.html import Page
from mlscraper.matches import TextValueExtractor
from mlscraper.samples import Sample
@@ -61,6 +60,27 @@ def test_train_scraper_list_of_dicts():
assert isinstance(value_scraper.extractor, TextValueExtractor)
+def test_train_scraper_multipage():
+ training_set = TrainingSet()
+ for items in ["ab", "cd"]:
+ html = b"""
+ <html><body>
+ <div class="target">
+ <ul><li>%s</li><li>%s</li></ul>
+ </div>
+ </body></html>
+ """ % (
+ items[0].encode(),
+ items[1].encode(),
+ )
+ training_set.add_sample(Sample(Page(html), [items[0], items[1]]))
+ scraper = train_scraper(training_set)
+ assert scraper.selector.css_rule == "li"
+ assert scraper.get(
+ Page(b"""<html><body><ul><li>first</li><li>second</li></body></html>""")
+ ) == ["first", "second"]
+
+
def test_train_scraper_stackoverflow(stackoverflow_samples):
training_set = TrainingSet()
for s in stackoverflow_samples: