summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKarl Lorey <git@karllorey.com>2022-06-21 16:59:47 +0200
committerKarl Lorey <git@karllorey.com>2022-06-21 16:59:47 +0200
commit823304fd13444f35920a660e72e1a3cbbf853039 (patch)
tree368ddd5c7a5ec8c9a542d6def96751c13626f1d2
parent9c22147fd8f7c451ed8627af8bcb8c7eab5ba8e0 (diff)
Update README and add examples folder
-rw-r--r--README.rst11
-rw-r--r--examples/quotes_to_scrape.py36
2 files changed, 44 insertions, 3 deletions
diff --git a/README.rst b/README.rst
index 1fc9cb1..387627f 100644
--- a/README.rst
+++ b/README.rst
@@ -50,10 +50,15 @@ After you've defined the data you want to scrape, mlscraper will:
Getting started
---------------
-Install the latest version of mlscraper via :code:`pip install git+https://github.com/lorey/mlscraper#egg=mlscraper`.
+mlscraper is currently approaching version 1.0.
+If you want to check the new release, use :code:`pip install --pre mlscraper` to test the release candidate.
+You can also install the latest (unstable) development version of mlscraper
+via :code:`pip install git+https://github.com/lorey/mlscraper#egg=mlscraper`,
+e.g. to check new features or to see if a bug has been fixed already.
Please note that until the 1.0 release :code:`pip install mlscraper` will return an outdated 0.* version.
-In both cases, you can then import it via `mlscraper`.
-Check the tests for usage until detailed documentation arrives.
+Check the examples_ directory for usage examples until further documentation arrives.
+
+.. _examples: examples/
-----------
Development
diff --git a/examples/quotes_to_scrape.py b/examples/quotes_to_scrape.py
new file mode 100644
index 0000000..b87acae
--- /dev/null
+++ b/examples/quotes_to_scrape.py
@@ -0,0 +1,36 @@
+import logging
+
+import requests
+from mlscraper.html import Page
+from mlscraper.samples import Sample, TrainingSet
+from mlscraper.training import train_scraper
+
+
+def main():
+ """
+ This example shows you how to build a scraper for authors on quotes.toscrape.com
+ """
+
+ # fetch the page to train
+ einstein_url = 'http://quotes.toscrape.com/author/Albert-Einstein/'
+ resp = requests.get(einstein_url)
+ assert resp.status_code == 200
+
+ # create a sample for Albert Einstein
+ training_set = TrainingSet()
+ page = Page(resp.content)
+ sample = Sample(page, {'name': 'Albert Einstein', 'born': 'March 14, 1879'})
+ training_set.add_sample(sample)
+
+ # train the scraper with the created training set
+ scraper = train_scraper(training_set)
+
+ # scrape another page
+ resp = requests.get('http://quotes.toscrape.com/author/J-K-Rowling')
+ result = scraper.get(Page(resp.content))
+ print(result)
+
+
+if __name__ == '__main__':  # run the example only when executed as a script, not on import
+ logging.basicConfig(level=logging.INFO)  # set the root logger to INFO before running the example
+ main()