diff options
author | Karl Lorey <git@karllorey.com> | 2022-06-14 00:08:31 +0200 |
---|---|---|
committer | Karl Lorey <git@karllorey.com> | 2022-06-14 00:08:31 +0200 |
commit | c71f49d94919e34c2a60d6d005559708923c75c9 (patch) | |
tree | e8bbb239954d8c413115a508b7161aea1cc3a573 /tests | |
parent | 6eeea5d82c96fc3c0c12e3ef8f5376944f6cfda4 (diff) |
Pull stackoverflow test sample to module level
Diffstat (limited to 'tests')
-rw-r--r-- | tests/test_scrapers.py | 28 | ||||
-rw-r--r-- | tests/test_selectors.py | 11 | ||||
-rw-r--r-- | tests/test_training.py | 39 |
3 files changed, 12 insertions, 66 deletions
```diff
diff --git a/tests/test_scrapers.py b/tests/test_scrapers.py
index e341041..bc7f54c 100644
--- a/tests/test_scrapers.py
+++ b/tests/test_scrapers.py
@@ -1,5 +1,3 @@
-import pytest
-from mlscraper.samples import Sample
 from mlscraper.scrapers import DictScraper
 from mlscraper.scrapers import ListScraper
 from mlscraper.scrapers import ValueScraper
@@ -9,32 +7,6 @@ from mlscraper.util import Page
 from mlscraper.util import TextValueExtractor
 
 
-@pytest.fixture
-def stackoverflow_samples():
-    with open("tests/static/so.html") as file:
-        page = Page(file.read())
-
-    item = [
-        {
-            "user": "/users/624900/jterrace",
-            "upvotes": "20",
-            "when": "2011-06-16 19:45:11Z",
-        },
-        {
-            "user": "/users/4044167/nico-knoll",
-            "upvotes": "16",
-            "when": "2017-09-06 15:27:16Z",
-        },
-        {
-            "user": "/users/1275778/lorey",
-            "upvotes": "0",
-            "when": "2021-01-06 10:50:04Z",
-        },
-    ]
-    samples = [Sample(page, item)]
-    return samples
-
-
 class TestListOfDictScraper:
     def test_scrape(self, stackoverflow_samples):
         user_scraper = ValueScraper(
diff --git a/tests/test_selectors.py b/tests/test_selectors.py
index ac6510a..05f9bb6 100644
--- a/tests/test_selectors.py
+++ b/tests/test_selectors.py
@@ -1,6 +1,4 @@
-import pytest
 from mlscraper.samples import Sample
-from mlscraper.selectors import generate_matchers_for_samples
 from mlscraper.selectors import generate_selector_for_nodes
 from mlscraper.selectors import make_matcher_for_samples
 from mlscraper.util import Page
@@ -19,15 +17,6 @@ def test_make_matcher_for_samples():
     assert make_matcher_for_samples(samples).selector.css_rule in ["p.test", ".test"]
 
 
-@pytest.mark.skip("takes too long")
-def test_generate_css_selectors_for_samples():
-    with open("tests/static/so.html") as file:
-        page = Page(file.read())
-    samples = [Sample(page, ["20", "16", "0"])]
-    selector_first = next(generate_matchers_for_samples(samples=samples))
-    assert selector_first.endswith(".js-vote-count")
-
-
 def test_generate_selector_for_nodes():
     page1_html = '<html><body><p class="test">test</p><p>bla</p></body></html>'
     page1 = Page(page1_html)
diff --git a/tests/test_training.py b/tests/test_training.py
index 61fa62a..5a38c98 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -1,34 +1,19 @@
 import pytest
-from mlscraper.samples import make_training_set
+from mlscraper.samples import TrainingSet
 from mlscraper.training import train_scraper
-from mlscraper.util import Page
 
 
-@pytest.fixture
-def stackoverflow_training_set():
-    with open("tests/static/so.html") as file:
-        page = Page(file.read())
+@pytest.mark.skip("listscraper just returns one result instead of three")
+def test_train_scraper(stackoverflow_samples):
+    training_set = TrainingSet()
+    for s in stackoverflow_samples:
+        training_set.add_sample(s)
 
-    item = [
-        {
-            "user": "/users/624900/jterrace",
-            "upvotes": "20",
-            "when": "2011-06-16 19:45:11Z",
-        },
-        {
-            "user": "/users/4044167/nico-knoll",
-            "upvotes": "16",
-            "when": "2017-09-06 15:27:16Z",
-        },
-        {
-            "user": "/users/1275778/lorey",
-            "upvotes": "0",
-            "when": "2021-01-06 10:50:04Z",
-        },
-    ]
-    return make_training_set([page], [item])
+    scraper = train_scraper(training_set.item)
+    print(f"result scraper: {scraper}")
+    scraping_result = scraper.get(stackoverflow_samples[0].page)
+    print(f"scraping result: {scraping_result}")
 
 
-@pytest.mark.skip("takes too long")
-def test_train_scraper(stackoverflow_training_set):
-    train_scraper(stackoverflow_training_set.item)
+    scraping_sample = stackoverflow_samples[0].value
+    assert scraping_result == scraping_sample
```