summary | refs | log | tree | commit | diff | stats
path: root/tests
diff options
context:
space:
mode:
author Karl Lorey <git@karllorey.com> 2022-06-14 00:08:31 +0200
committer Karl Lorey <git@karllorey.com> 2022-06-14 00:08:31 +0200
commit c71f49d94919e34c2a60d6d005559708923c75c9 (patch)
tree e8bbb239954d8c413115a508b7161aea1cc3a573 /tests
parent 6eeea5d82c96fc3c0c12e3ef8f5376944f6cfda4 (diff)
Pull stackoverflow test sample to module level
Diffstat (limited to 'tests')
-rw-r--r-- tests/test_scrapers.py 28
-rw-r--r-- tests/test_selectors.py 11
-rw-r--r-- tests/test_training.py 39
3 files changed, 12 insertions, 66 deletions
diff --git a/tests/test_scrapers.py b/tests/test_scrapers.py
index e341041..bc7f54c 100644
--- a/tests/test_scrapers.py
+++ b/tests/test_scrapers.py
@@ -1,5 +1,3 @@
-import pytest
-from mlscraper.samples import Sample
from mlscraper.scrapers import DictScraper
from mlscraper.scrapers import ListScraper
from mlscraper.scrapers import ValueScraper
@@ -9,32 +7,6 @@ from mlscraper.util import Page
from mlscraper.util import TextValueExtractor
-@pytest.fixture
-def stackoverflow_samples():
- with open("tests/static/so.html") as file:
- page = Page(file.read())
-
- item = [
- {
- "user": "/users/624900/jterrace",
- "upvotes": "20",
- "when": "2011-06-16 19:45:11Z",
- },
- {
- "user": "/users/4044167/nico-knoll",
- "upvotes": "16",
- "when": "2017-09-06 15:27:16Z",
- },
- {
- "user": "/users/1275778/lorey",
- "upvotes": "0",
- "when": "2021-01-06 10:50:04Z",
- },
- ]
- samples = [Sample(page, item)]
- return samples
-
-
class TestListOfDictScraper:
def test_scrape(self, stackoverflow_samples):
user_scraper = ValueScraper(
diff --git a/tests/test_selectors.py b/tests/test_selectors.py
index ac6510a..05f9bb6 100644
--- a/tests/test_selectors.py
+++ b/tests/test_selectors.py
@@ -1,6 +1,4 @@
-import pytest
from mlscraper.samples import Sample
-from mlscraper.selectors import generate_matchers_for_samples
from mlscraper.selectors import generate_selector_for_nodes
from mlscraper.selectors import make_matcher_for_samples
from mlscraper.util import Page
@@ -19,15 +17,6 @@ def test_make_matcher_for_samples():
assert make_matcher_for_samples(samples).selector.css_rule in ["p.test", ".test"]
-@pytest.mark.skip("takes too long")
-def test_generate_css_selectors_for_samples():
- with open("tests/static/so.html") as file:
- page = Page(file.read())
- samples = [Sample(page, ["20", "16", "0"])]
- selector_first = next(generate_matchers_for_samples(samples=samples))
- assert selector_first.endswith(".js-vote-count")
-
-
def test_generate_selector_for_nodes():
page1_html = '<html><body><p class="test">test</p><p>bla</p></body></html>'
page1 = Page(page1_html)
diff --git a/tests/test_training.py b/tests/test_training.py
index 61fa62a..5a38c98 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -1,34 +1,19 @@
import pytest
-from mlscraper.samples import make_training_set
+from mlscraper.samples import TrainingSet
from mlscraper.training import train_scraper
-from mlscraper.util import Page
-@pytest.fixture
-def stackoverflow_training_set():
- with open("tests/static/so.html") as file:
- page = Page(file.read())
+@pytest.mark.skip("listscraper just returns one result instead of three")
+def test_train_scraper(stackoverflow_samples):
+ training_set = TrainingSet()
+ for s in stackoverflow_samples:
+ training_set.add_sample(s)
- item = [
- {
- "user": "/users/624900/jterrace",
- "upvotes": "20",
- "when": "2011-06-16 19:45:11Z",
- },
- {
- "user": "/users/4044167/nico-knoll",
- "upvotes": "16",
- "when": "2017-09-06 15:27:16Z",
- },
- {
- "user": "/users/1275778/lorey",
- "upvotes": "0",
- "when": "2021-01-06 10:50:04Z",
- },
- ]
- return make_training_set([page], [item])
+ scraper = train_scraper(training_set.item)
+ print(f"result scraper: {scraper}")
+ scraping_result = scraper.get(stackoverflow_samples[0].page)
+ print(f"scraping result: {scraping_result}")
-@pytest.mark.skip("takes too long")
-def test_train_scraper(stackoverflow_training_set):
- train_scraper(stackoverflow_training_set.item)
+ scraping_sample = stackoverflow_samples[0].value
+ assert scraping_result == scraping_sample