summary refs log tree commit diff stats
path: root/tests
diff options
context:
space:
mode:
author Karl Lorey <git@karllorey.com> 2022-06-14 21:39:13 +0200
committer Karl Lorey <git@karllorey.com> 2022-06-14 21:39:13 +0200
commit db82b63b1f280a0b59af624aa310d3b163528570 (patch)
tree 9e836dad8713623c26429da4149939f3050ec60f /tests
parent faebafbac0b01a062a93ee143fc82d052b7419c5 (diff)
I might go insane with this one
Diffstat (limited to 'tests')
-rw-r--r-- tests/conftest.py 2
-rw-r--r-- tests/test_html.py 66
-rw-r--r-- tests/test_matches.py 0
-rw-r--r-- tests/test_samples.py 24
-rw-r--r-- tests/test_scrapers.py 6
-rw-r--r-- tests/test_selectors.py 19
-rw-r--r-- tests/test_training.py 16
-rw-r--r-- tests/test_util.py 50
8 files changed, 107 insertions, 76 deletions
diff --git a/tests/conftest.py b/tests/conftest.py
index 0adcdbc..a1755a4 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,6 +1,6 @@
import pytest
+from mlscraper.html import Page
from mlscraper.samples import Sample
-from mlscraper.util import Page
@pytest.fixture(scope="module")
diff --git a/tests/test_html.py b/tests/test_html.py
new file mode 100644
index 0000000..aa8842a
--- /dev/null
+++ b/tests/test_html.py
@@ -0,0 +1,66 @@
+from bs4 import BeautifulSoup
+
+from mlscraper.html import _get_root_of_nodes, Node, Page, selector_matches_nodes
+from mlscraper.matches import AttributeValueExtractor
+
+
+def test_get_root_of_nodes():
+ soup = BeautifulSoup(
+ '<html><body><div><p id="one"></p><p><span id="two"></span></p></div></body></html>',
+ "lxml",
+ )
+ node_1 = soup.select_one("#one")
+ node_2 = soup.select_one("#two")
+ root = _get_root_of_nodes([node_1, node_2])
+ assert root == soup.select_one("div")
+
+
+class TestPage:
+ def test_select(self, stackoverflow_samples):
+ page = stackoverflow_samples[0].page
+ nodes = page.select(".answer .js-vote-count")
+ assert [n.text for n in nodes] == ["20", "16", "0"]
+
+ def test_find_all(self, stackoverflow_samples):
+ page = stackoverflow_samples[0].page
+ nodes = page.find_all("/users/624900/jterrace")
+ assert nodes
+
+
+def test_attribute_extractor():
+ html_ = b'<html><body><a href="https://karllorey.com"></a><a>no link</a></body></html>'
+ page = Page(html_)
+ extractor = AttributeValueExtractor("href")
+ a_tags = page.select('a')
+ assert extractor.extract(a_tags[0]) == "https://karllorey.com"
+ assert extractor.extract(a_tags[1]) is None
+
+
+def test_extractor_factory():
+ # we want to make sure that extractors built with the same arguments
+ # compare and hash as equal, as we need this to ensure extractor selection
+ e1 = AttributeValueExtractor("href")
+ e2 = AttributeValueExtractor("href")
+ assert len({e1, e2}) == 1
+
+
+def test_equality():
+ # we want to make sure that pages built from equal html compare equal but stay distinct objects
+ same_html = '<html><body><div><p></p></div></body></html>'
+ assert Page(same_html) == Page(same_html)
+ assert Page(same_html) is not Page(same_html)
+
+
+def test_select():
+ html = '<html><body><p></p><p></p></body></html>'
+ page = Page(html)
+ p_tag_nodes = page.select('p')
+ assert len(p_tag_nodes) == 2
+ # not used in practice
+ # assert len(set(p_tag_nodes)) == 2
+
+
+def test_selector_matches_nodes():
+ html = '<html><body><p></p><p></p></body></html>'
+ page = Page(html)
+ assert selector_matches_nodes(page, 'p', list(reversed(page.select('p'))))
diff --git a/tests/test_matches.py b/tests/test_matches.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/tests/test_matches.py
diff --git a/tests/test_samples.py b/tests/test_samples.py
index 7023082..df18d86 100644
--- a/tests/test_samples.py
+++ b/tests/test_samples.py
@@ -1,10 +1,11 @@
import pytest
+from mlscraper.html import Page
+from mlscraper.matches import DictMatch
+from mlscraper.matches import ListMatch
from mlscraper.samples import ItemStructureException
+from mlscraper.samples import make_matcher_for_samples
from mlscraper.samples import make_training_set
from mlscraper.samples import Sample
-from mlscraper.util import DictMatch
-from mlscraper.util import ListMatch
-from mlscraper.util import Page
class TestTrainingSet:
@@ -62,5 +63,18 @@ class TestMatch:
assert isinstance(match, ListMatch)
assert len(match.matches) == 2
assert all(isinstance(m, DictMatch) for m in match.matches)
- print(match.get_root())
- print(match.get_span())
+ print(match.root)
+
+
+def test_make_matcher_for_samples():
+ page1_html = '<html><body><p class="test">test</p><p>bla</p></body></html>'
+ page1 = Page(page1_html)
+ sample1 = Sample(page1, "test")
+
+ page2_html = '<html><body><div></div><p class="test">hallo</p></body></html>'
+ page2 = Page(page2_html)
+ sample2 = Sample(page2, "hallo")
+
+ samples = [sample1, sample2]
+ matcher = make_matcher_for_samples(samples)
+ assert matcher.selector.css_rule in ["p.test", ".test"]
diff --git a/tests/test_scrapers.py b/tests/test_scrapers.py
index bc7f54c..d74adfe 100644
--- a/tests/test_scrapers.py
+++ b/tests/test_scrapers.py
@@ -1,10 +1,10 @@
+from mlscraper.html import Page
+from mlscraper.matches import AttributeValueExtractor
+from mlscraper.matches import TextValueExtractor
from mlscraper.scrapers import DictScraper
from mlscraper.scrapers import ListScraper
from mlscraper.scrapers import ValueScraper
from mlscraper.selectors import CssRuleSelector
-from mlscraper.util import AttributeValueExtractor
-from mlscraper.util import Page
-from mlscraper.util import TextValueExtractor
class TestListOfDictScraper:
diff --git a/tests/test_selectors.py b/tests/test_selectors.py
index 05f9bb6..57a5ded 100644
--- a/tests/test_selectors.py
+++ b/tests/test_selectors.py
@@ -1,20 +1,6 @@
+from mlscraper.html import Page
from mlscraper.samples import Sample
from mlscraper.selectors import generate_selector_for_nodes
-from mlscraper.selectors import make_matcher_for_samples
-from mlscraper.util import Page
-
-
-def test_make_matcher_for_samples():
- page1_html = '<html><body><p class="test">test</p><p>bla</p></body></html>'
- page1 = Page(page1_html)
- sample1 = Sample(page1, "test")
-
- page2_html = '<html><body><div></div><p class="test">hallo</p></body></html>'
- page2 = Page(page2_html)
- sample2 = Sample(page2, "hallo")
-
- samples = [sample1, sample2]
- assert make_matcher_for_samples(samples).selector.css_rule in ["p.test", ".test"]
def test_generate_selector_for_nodes():
@@ -28,7 +14,8 @@ def test_generate_selector_for_nodes():
samples = [sample1, sample2]
- nodes = [s.get_matches()[0].get_root() for s in samples]
+ nodes = [s.get_matches()[0].root for s in samples]
+ print(nodes)
gen = generate_selector_for_nodes(nodes, None)
# todo .test is also possible
assert ["p.test"] == [sel.css_rule for sel in gen]
diff --git a/tests/test_training.py b/tests/test_training.py
index 5a38c98..73a8771 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -1,9 +1,22 @@
import pytest
+from mlscraper.html import Page
+from mlscraper.samples import Sample
from mlscraper.samples import TrainingSet
from mlscraper.training import train_scraper
-@pytest.mark.skip("listscraper just returns one result instead of three")
+def test_train_scraper_simple_list():
+ training_set = TrainingSet()
+ training_set.add_sample(
+ Sample(
+ Page(b"<html><body><p>a<p><i>noise</i><p>b</p><p>c</p></body></html>"),
+ ["a", "b", "c"],
+ )
+ )
+ train_scraper(training_set.item)
+
+
+@pytest.mark.skip("fucking fails")
def test_train_scraper(stackoverflow_samples):
training_set = TrainingSet()
for s in stackoverflow_samples:
@@ -11,6 +24,7 @@ def test_train_scraper(stackoverflow_samples):
scraper = train_scraper(training_set.item)
print(f"result scraper: {scraper}")
+ print(f"selector for list items: {scraper.selector}")
scraping_result = scraper.get(stackoverflow_samples[0].page)
print(f"scraping result: {scraping_result}")
diff --git a/tests/test_util.py b/tests/test_util.py
index bef4758..e69de29 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -1,50 +0,0 @@
-from bs4 import BeautifulSoup
-from mlscraper.util import _get_root_of_nodes
-from mlscraper.util import AttributeValueExtractor
-from mlscraper.util import get_attribute_extractor
-from mlscraper.util import Node
-from mlscraper.util import Page
-
-
-class TestPage:
- def test_something(self, stackoverflow_samples):
- page = stackoverflow_samples[0].page
- nodes = page.select(".answer .js-vote-count")
- assert [n.text for n in nodes] == ["20", "16", "0"]
-
- def test_find_all(self, stackoverflow_samples):
- page = stackoverflow_samples[0].page
- nodes = page.find_all("/users/624900/jterrace")
- assert nodes
-
-
-def test_attribute_extractor():
- soup = BeautifulSoup(
- '<html><body><a href="http://karllorey.com"></a><a>no link</a></body></html>',
- "lxml",
- )
- ue = AttributeValueExtractor("href")
- a_tags = soup.find_all("a")
- assert ue.extract(Node(a_tags[0])) == "http://karllorey.com"
- assert ue.extract(Node(a_tags[1])) is None
-
-
-def test_extractor_factory():
- # we want to make sure that each extractor exists only once
- # as we need this to ensure extractor selection
- e1 = get_attribute_extractor("href")
- e2 = get_attribute_extractor("href")
- assert (
- e1 is e2
- ), "extractor factory returns different instances for the same extractor"
-
-
-def test_get_root_of_nodes():
- soup = BeautifulSoup(
- '<html><body><div><p id="one"></p><p><span id="two"></span></p></div></body></html>',
- "lxml",
- )
- node_1 = soup.select_one("#one")
- node_2 = soup.select_one("#two")
- root = _get_root_of_nodes([node_1, node_2])
- assert root == soup.select_one("div")