diff options
author | Karl Lorey <git@karllorey.com> | 2022-06-14 21:39:13 +0200 |
---|---|---|
committer | Karl Lorey <git@karllorey.com> | 2022-06-14 21:39:13 +0200 |
commit | db82b63b1f280a0b59af624aa310d3b163528570 (patch) | |
tree | 9e836dad8713623c26429da4149939f3050ec60f /tests | |
parent | faebafbac0b01a062a93ee143fc82d052b7419c5 (diff) |
I might go insane with this one
Diffstat (limited to 'tests')
-rw-r--r-- | tests/conftest.py | 2 | ||||
-rw-r--r-- | tests/test_html.py | 66 | ||||
-rw-r--r-- | tests/test_matches.py | 0 | ||||
-rw-r--r-- | tests/test_samples.py | 24 | ||||
-rw-r--r-- | tests/test_scrapers.py | 6 | ||||
-rw-r--r-- | tests/test_selectors.py | 19 | ||||
-rw-r--r-- | tests/test_training.py | 16 | ||||
-rw-r--r-- | tests/test_util.py | 50 |
8 files changed, 107 insertions, 76 deletions
diff --git a/tests/conftest.py b/tests/conftest.py index 0adcdbc..a1755a4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,6 @@ import pytest +from mlscraper.html import Page from mlscraper.samples import Sample -from mlscraper.util import Page @pytest.fixture(scope="module") diff --git a/tests/test_html.py b/tests/test_html.py new file mode 100644 index 0000000..aa8842a --- /dev/null +++ b/tests/test_html.py @@ -0,0 +1,66 @@ +from bs4 import BeautifulSoup + +from mlscraper.html import _get_root_of_nodes, Node, Page, selector_matches_nodes +from mlscraper.matches import AttributeValueExtractor + + +def test_get_root_of_nodes(): + soup = BeautifulSoup( + '<html><body><div><p id="one"></p><p><span id="two"></span></p></div></body></html>', + "lxml", + ) + node_1 = soup.select_one("#one") + node_2 = soup.select_one("#two") + root = _get_root_of_nodes([node_1, node_2]) + assert root == soup.select_one("div") + + +class TestPage: + def test_select(self, stackoverflow_samples): + page = stackoverflow_samples[0].page + nodes = page.select(".answer .js-vote-count") + assert [n.text for n in nodes] == ["20", "16", "0"] + + def test_find_all(self, stackoverflow_samples): + page = stackoverflow_samples[0].page + nodes = page.find_all("/users/624900/jterrace") + assert nodes + + +def test_attribute_extractor(): + html_ = b'<html><body><a href="https://karllorey.com"></a><a>no link</a></body></html>' + page = Page(html_) + extractor = AttributeValueExtractor("href") + a_tags = page.select('a') + assert extractor.extract(a_tags[0]) == "https://karllorey.com" + assert extractor.extract(a_tags[1]) is None + + +def test_extractor_factory(): + # we want to make sure that each extractor exists only once + # as we need this to ensure extractor selection + e1 = AttributeValueExtractor("href") + e2 = AttributeValueExtractor("href") + assert len({e1, e2}) == 1 + + +def test_equality(): + # we want to make sure that equal html does not result in equality + same_html = '<html><body><div><p></p></div></body></html>' + assert Page(same_html) == Page(same_html) + assert Page(same_html) is not Page(same_html) + + +def test_select(): + html = '<html><body><p></p><p></p></body></html>' + page = Page(html) + p_tag_nodes = page.select('p') + assert len(p_tag_nodes) == 2 + # not used in practice + # assert len(set(p_tag_nodes)) == 2 + + +def test_selector_matches_nodes(): + html = '<html><body><p></p><p></p></body></html>' + page = Page(html) + assert selector_matches_nodes(page, 'p', list(reversed(page.select('p')))) diff --git a/tests/test_matches.py b/tests/test_matches.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/tests/test_matches.py diff --git a/tests/test_samples.py b/tests/test_samples.py index 7023082..df18d86 100644 --- a/tests/test_samples.py +++ b/tests/test_samples.py @@ -1,10 +1,11 @@ import pytest +from mlscraper.html import Page +from mlscraper.matches import DictMatch +from mlscraper.matches import ListMatch from mlscraper.samples import ItemStructureException +from mlscraper.samples import make_matcher_for_samples from mlscraper.samples import make_training_set from mlscraper.samples import Sample -from mlscraper.util import DictMatch -from mlscraper.util import ListMatch -from mlscraper.util import Page class TestTrainingSet: @@ -62,5 +63,18 @@ class TestMatch: assert isinstance(match, ListMatch) assert len(match.matches) == 2 assert all(isinstance(m, DictMatch) for m in match.matches) - print(match.get_root()) - print(match.get_span()) + print(match.root) + + +def test_make_matcher_for_samples(): + page1_html = '<html><body><p class="test">test</p><p>bla</p></body></html>' + page1 = Page(page1_html) + sample1 = Sample(page1, "test") + + page2_html = '<html><body><div></div><p class="test">hallo</p></body></html>' + page2 = Page(page2_html) + sample2 = Sample(page2, "hallo") + + samples = [sample1, sample2] + matcher = make_matcher_for_samples(samples) + assert matcher.selector.css_rule in ["p.test", ".test"] diff --git a/tests/test_scrapers.py b/tests/test_scrapers.py index bc7f54c..d74adfe 100644 --- a/tests/test_scrapers.py +++ b/tests/test_scrapers.py @@ -1,10 +1,10 @@ +from mlscraper.html import Page +from mlscraper.matches import AttributeValueExtractor +from mlscraper.matches import TextValueExtractor from mlscraper.scrapers import DictScraper from mlscraper.scrapers import ListScraper from mlscraper.scrapers import ValueScraper from mlscraper.selectors import CssRuleSelector -from mlscraper.util import AttributeValueExtractor -from mlscraper.util import Page -from mlscraper.util import TextValueExtractor class TestListOfDictScraper: diff --git a/tests/test_selectors.py b/tests/test_selectors.py index 05f9bb6..57a5ded 100644 --- a/tests/test_selectors.py +++ b/tests/test_selectors.py @@ -1,20 +1,6 @@ +from mlscraper.html import Page from mlscraper.samples import Sample from mlscraper.selectors import generate_selector_for_nodes -from mlscraper.selectors import make_matcher_for_samples -from mlscraper.util import Page - - -def test_make_matcher_for_samples(): - page1_html = '<html><body><p class="test">test</p><p>bla</p></body></html>' - page1 = Page(page1_html) - sample1 = Sample(page1, "test") - - page2_html = '<html><body><div></div><p class="test">hallo</p></body></html>' - page2 = Page(page2_html) - sample2 = Sample(page2, "hallo") - - samples = [sample1, sample2] - assert make_matcher_for_samples(samples).selector.css_rule in ["p.test", ".test"] def test_generate_selector_for_nodes(): @@ -28,7 +14,8 @@ def test_generate_selector_for_nodes(): samples = [sample1, sample2] - nodes = [s.get_matches()[0].get_root() for s in samples] + nodes = [s.get_matches()[0].root for s in samples] + print(nodes) gen = generate_selector_for_nodes(nodes, None) # todo .test is also possible assert ["p.test"] == [sel.css_rule for sel in gen] diff --git a/tests/test_training.py b/tests/test_training.py index 5a38c98..73a8771 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -1,9 +1,22 @@ import pytest +from mlscraper.html import Page +from mlscraper.samples import Sample from mlscraper.samples import TrainingSet from mlscraper.training import train_scraper -@pytest.mark.skip("listscraper just returns one result instead of three") +def test_train_scraper_simple_list(): + training_set = TrainingSet() + training_set.add_sample( + Sample( + Page(b"<html><body><p>a<p><i>noise</i><p>b</p><p>c</p></body></html>"), + ["a", "b", "c"], + ) + ) + train_scraper(training_set.item) + + +@pytest.mark.skip("fucking fails") def test_train_scraper(stackoverflow_samples): training_set = TrainingSet() for s in stackoverflow_samples: @@ -11,6 +24,7 @@ def test_train_scraper(stackoverflow_samples): scraper = train_scraper(training_set.item) print(f"result scraper: {scraper}") + print(f"selector for list items: {scraper.selector}") scraping_result = scraper.get(stackoverflow_samples[0].page) print(f"scraping result: {scraping_result}") diff --git a/tests/test_util.py b/tests/test_util.py index bef4758..e69de29 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -1,50 +0,0 @@ -from bs4 import BeautifulSoup -from mlscraper.util import _get_root_of_nodes -from mlscraper.util import AttributeValueExtractor -from mlscraper.util import get_attribute_extractor -from mlscraper.util import Node -from mlscraper.util import Page - - -class TestPage: - def test_something(self, stackoverflow_samples): - page = stackoverflow_samples[0].page - nodes = page.select(".answer .js-vote-count") - assert [n.text for n in nodes] == ["20", "16", "0"] - - def test_find_all(self, stackoverflow_samples): - page = stackoverflow_samples[0].page - nodes = page.find_all("/users/624900/jterrace") - assert nodes - - -def test_attribute_extractor(): - soup = BeautifulSoup( - '<html><body><a href="http://karllorey.com"></a><a>no link</a></body></html>', - "lxml", - ) - ue = AttributeValueExtractor("href") - a_tags = soup.find_all("a") - assert ue.extract(Node(a_tags[0])) == "http://karllorey.com" - assert ue.extract(Node(a_tags[1])) is None - - -def test_extractor_factory(): - # we want to make sure that each extractor exists only once - # as we need this to ensure extractor selection - e1 = get_attribute_extractor("href") - e2 = get_attribute_extractor("href") - assert ( - e1 is e2 - ), "extractor factory returns different instances for the same extractor" - - -def test_get_root_of_nodes(): - soup = BeautifulSoup( - '<html><body><div><p id="one"></p><p><span id="two"></span></p></div></body></html>', - "lxml", - ) - node_1 = soup.select_one("#one") - node_2 = soup.select_one("#two") - root = _get_root_of_nodes([node_1, node_2]) - assert root == soup.select_one("div") |