diff options
author | Karl Lorey <git@karllorey.com> | 2022-06-21 16:59:47 +0200 |
---|---|---|
committer | Karl Lorey <git@karllorey.com> | 2022-06-21 16:59:47 +0200 |
commit | 823304fd13444f35920a660e72e1a3cbbf853039 (patch) | |
tree | 368ddd5c7a5ec8c9a542d6def96751c13626f1d2 | |
parent | 9c22147fd8f7c451ed8627af8bcb8c7eab5ba8e0 (diff) |
Update README and add examples folder
-rw-r--r-- | README.rst | 11 | ||||
-rw-r--r-- | examples/quotes_to_scrape.py | 36 |
2 files changed, 44 insertions, 3 deletions
@@ -50,10 +50,15 @@ After you've defined the data you want to scrape, mlscraper will:
 Getting started
 ---------------
 
-Install the latest version of mlscraper via :code:`pip install git+https://github.com/lorey/mlscraper#egg=mlscraper`.
+mlscraper is currently short before version 1.0.
+If you want to check the new release, use :code:`pip install --pre mlscraper` to test the release candidate.
+You can also install the latest (unstable) development version of mlscraper
+via :code:`pip install git+https://github.com/lorey/mlscraper#egg=mlscraper`,
+e.g. to check new features or to see if a bug has been fixed already.
 Please note that until the 1.0 release :code:`pip install mlscraper` will return an outdated 0.* version.
-In both cases, you can then import it via `mlscraper`.
-Check the tests for usage until detailed documentation arrives.
+Check the examples_ directory for usage examples until further documentation arrives.
+
+.. _examples: examples/
 
 -----------
 Development
diff --git a/examples/quotes_to_scrape.py b/examples/quotes_to_scrape.py
new file mode 100644
index 0000000..b87acae
--- /dev/null
+++ b/examples/quotes_to_scrape.py
@@ -0,0 +1,36 @@
+import logging
+
+import requests
+from mlscraper.html import Page
+from mlscraper.samples import Sample, TrainingSet
+from mlscraper.training import train_scraper
+
+
+def main():
+    """
+    This example shows you how to build a scraper for authors on quotes.toscrape.com
+    """
+
+    # fetch the page to train
+    einstein_url = 'http://quotes.toscrape.com/author/Albert-Einstein/'
+    resp = requests.get(einstein_url)
+    assert resp.status_code == 200
+
+    # create a sample for Albert Einstein
+    training_set = TrainingSet()
+    page = Page(resp.content)
+    sample = Sample(page, {'name': 'Albert Einstein', 'born': 'March 14, 1879'})
+    training_set.add_sample(sample)
+
+    # train the scraper with the created training set
+    scraper = train_scraper(training_set)
+
+    # scrape another page
+    resp = requests.get('http://quotes.toscrape.com/author/J-K-Rowling')
+    result = scraper.get(Page(resp.content))
+    print(result)
+
+
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.INFO)
+    main()