diff options
author | Karl Lorey <git@karllorey.com> | 2022-06-20 11:12:00 +0200 |
---|---|---|
committer | Karl Lorey <git@karllorey.com> | 2022-06-20 11:12:00 +0200 |
commit | c6c371223d56f23ad4a588231b5e6f51bee4259c (patch) | |
tree | 45d68736dfce2c8bee9d061e19224161f3916332 /tests/test_training.py | |
parent | 3a4c3234653984768a992747ba45da5e34d3af9c (diff) |
Re-implement selector generation with a speedup >10x (branch: develop)
Diffstat (limited to 'tests/test_training.py')
-rw-r--r-- | tests/test_training.py | 22 |
1 files changed, 21 insertions, 1 deletions
```diff
diff --git a/tests/test_training.py b/tests/test_training.py
index ccd0441..f0f1703 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -1,4 +1,3 @@
-import pytest
 from mlscraper.html import Page
 from mlscraper.matches import TextValueExtractor
 from mlscraper.samples import Sample
@@ -61,6 +60,27 @@ def test_train_scraper_list_of_dicts():
     assert isinstance(value_scraper.extractor, TextValueExtractor)
 
 
+def test_train_scraper_multipage():
+    training_set = TrainingSet()
+    for items in ["ab", "cd"]:
+        html = b"""
+        <html><body>
+        <div class="target">
+        <ul><li>%s</li><li>%s</li></ul>
+        </div>
+        </body></html>
+        """ % (
+            items[0].encode(),
+            items[1].encode(),
+        )
+        training_set.add_sample(Sample(Page(html), [items[0], items[1]]))
+    scraper = train_scraper(training_set)
+    assert scraper.selector.css_rule == "li"
+    assert scraper.get(
+        Page(b"""<html><body><ul><li>first</li><li>second</li></body></html>""")
+    ) == ["first", "second"]
+
+
 def test_train_scraper_stackoverflow(stackoverflow_samples):
     training_set = TrainingSet()
     for s in stackoverflow_samples:
```