author     Karl Lorey <git@karllorey.com>    2020-09-27 15:33:42 +0200
committer  Karl Lorey <git@karllorey.com>    2020-09-27 15:33:42 +0200
commit     fa9741426b99fe9444529b9d813d77474c5d8cbf (patch)
tree       012d3f7b44daebfdfaa2c1346ac4539f47e5d93d
parent     41624b9d5b99232112dc450791bc736740b54844 (diff)
Update examples with stackoverflow scraper
-rw-r--r--  README.rst                |   6 +++++-
-rw-r--r--  examples/ml_prototype.py  | 112 ----------
-rw-r--r--  examples/stackoverflow.py |  40 ++++++
3 files changed, 45 insertions(+), 113 deletions(-)
diff --git a/README.rst b/README.rst
--- a/README.rst
+++ b/README.rst
@@ -28,7 +28,7 @@ I've been wondering for a long time why there's no Open Source solution that doe
 So here's my attempt at creating a python library to enable automatic scraping.
 
 All you have to do is define some examples of scraped data.
-`autoscraper` will figure out everything else and return clean data.
+`mlscraper` will figure out everything else and return clean data.
 
 Currently, this is a proof of concept with a simplistic solution.
@@ -60,6 +60,10 @@ After you've defined the data you want to scrape, mlscraper will:
     scraper.scrape(html) # will produce the items above
     scraper.scrape(new_html) # will apply the learned rules and extract new items
 
+You can find working scrapers in the `examples folder`_.
+
+.. _`examples folder`: examples/
+
 ---------------
 Getting started
 ---------------
diff --git a/examples/ml_prototype.py b/examples/ml_prototype.py
deleted file mode 100644
index f070943..0000000
--- a/examples/ml_prototype.py
+++ /dev/null
@@ -1,112 +0,0 @@
-# this is just a prototype for classifier-based element detection
-# - using actual machine learning, not heuristics
-# - detecting elements defined by icons, e.g. a dollar icon follow by a price
-import codecs
-import logging
-import re
-from random import sample
-
-import pandas as pd
-import requests
-from bs4 import BeautifulSoup, NavigableString
-from sklearn import tree
-from sklearn.base import TransformerMixin
-from sklearn.pipeline import Pipeline
-from sklearn.tree import DecisionTreeClassifier
-
-
-class Preprocessing(TransformerMixin):
-    def __init__(self, element):
-        self.element = element
-
-    def fit(self, X, y):
-        # todo define classes to extract here, doing it in transform will cause bugs
-        return self
-
-    def transform(self, X, y=None, **fit_params):
-        df = pd.DataFrame({"node": X})
-        df["has_content"] = df["node"].apply(
-            lambda n: type(n) is NavigableString and len(str(n).strip()) > 0
-        )
-        df["is_same_type"] = df["node"].apply(lambda n: type(n) == type(self.element))
-        sibling_prev = self.element.previous_sibling
-        sibling_prev_classes = sibling_prev.attrs.get("class")
-
-        df["prev_sibling"] = df["node"].apply(lambda n: n.previous_sibling)
-
-        for css_class in sibling_prev_classes:
-            df["sibling_prev_has_class_{}".format(css_class)] = df[
-                "prev_sibling"
-            ].apply(
-                lambda n: css_class in n.attrs.get("class", [])
-                if hasattr(n, "attrs")
-                else False
-            )
-
-        features = [c for c in df.columns if c.startswith("sibling")] + ["is_same_type"]
-        return df[features]
-
-
-def main():
-    # obfuscated the url a little :)
-    url_rot = "uggcf://jjj.fgnegonfr.qr/betnavmngvba/jrygraznpure-tzou/"
-    url = codecs.decode(url_rot, "rot_13")
-
-    resp = requests.get(url)
-    assert resp.status_code == 200
-    soup = BeautifulSoup(resp.content, "lxml")
-
-    print("Found {} nodes".format(len(list(soup.descendants))))
-
-    # find the element to crawl on the site
-    elements_in_site = soup.find_all(text=re.compile(r"\s*2017\s*"))
-    element = elements_in_site[0]  # type: NavigableString
-
-    print("ELEMENT")
-    print(element)
-
-    pipeline = Pipeline(
-        [
-            ("preprocessing", Preprocessing(element)),
-            # ("clf", SVC(class_weight="balanced", probability=True)),
-            ("clf", DecisionTreeClassifier(class_weight="balanced")),
-        ]
-    )
-
-    # under-sample negative and over-sample positive samples
-    sample_count = 1000
-    oversampling_factor = 10
-
-    samples = sample(list(soup.descendants), sample_count)
-    X = samples + [element] * oversampling_factor
-    y = [n == element for n in samples] + [True] * oversampling_factor
-
-    # train the pipeline
-    pipeline = pipeline.fit(X, y)
-
-    # print the tree if it's a decision tree
-    # -> select last pipeline step
-    if type(pipeline.steps[-1][1]) is DecisionTreeClassifier:
-        print("TREE")
-        clf_tree = pipeline.steps[1][1]
-        print(tree.export_text(clf_tree))
-
-    # perform sanity check: element we're looking for should get matched
-    is_element_matched = pipeline.predict([element])[0]
-    print("Element is {}".format(is_element_matched))
-    if not is_element_matched:
-        logging.warning("Element is not matched, classifier is probably broken")
-
-    # print out samples for debugging
-    for x, y_ in zip(X, y):
-        is_match = pipeline.predict([x])[0]
-        if is_match:
-            print(x)
-            print("Class: " + str(y_))
-            print("Prediction: " + str(is_match))
-            print("Proba: " + str(pipeline.predict_proba([x])))
-            print()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/examples/stackoverflow.py b/examples/stackoverflow.py
new file mode 100644
index 0000000..45bb78e
--- /dev/null
+++ b/examples/stackoverflow.py
@@ -0,0 +1,40 @@
+import logging
+
+import requests
+
+from mlscraper import SingleItemPageSample, RuleBasedSingleItemScraper
+
+
+def main():
+    items = {
+        "https://stackoverflow.com/questions/11227809/why-is-processing-a-sorted-array-faster-than-processing-an-unsorted-array": {
+            "title": "Why is processing a sorted array faster than processing an unsorted array?"
+        },
+        "https://stackoverflow.com/questions/927358/how-do-i-undo-the-most-recent-local-commits-in-git": {
+            "title": "How do I undo the most recent local commits in Git?"
+        },
+        "https://stackoverflow.com/questions/231767/what-does-the-yield-keyword-do": {
+            "title": "What does the “yield” keyword do?"
+        },
+    }
+
+    results = {url: requests.get(url) for url in items.keys()}
+
+    # train scraper
+    samples = [
+        SingleItemPageSample(results[url].content, items[url]) for url in items.keys()
+    ]
+    scraper = RuleBasedSingleItemScraper.build(samples)
+
+    print("Scraping new question")
+    html = requests.get(
+        "https://stackoverflow.com/questions/2003505/how-do-i-delete-a-git-branch-locally-and-remotely"
+    ).content
+    result = scraper.scrape(html)
+
+    print("Result: %s" % result)
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
+    main()
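Usage note: the pattern introduced in examples/stackoverflow.py (label a few pages, build a scraper, apply it to an unseen page) can be factored into small reusable helpers. The sketch below is illustrative only and not part of this commit; train_scraper and scrape_url are hypothetical names, while SingleItemPageSample, RuleBasedSingleItemScraper.build, and scraper.scrape are used exactly as in the new example above.

import requests

from mlscraper import RuleBasedSingleItemScraper, SingleItemPageSample


def train_scraper(url_to_item):
    # hypothetical helper, not in this commit: fetch each labeled training page once
    htmls = {url: requests.get(url).content for url in url_to_item}
    # pair each page's html with the attribute values expected on that page
    samples = [
        SingleItemPageSample(htmls[url], item) for url, item in url_to_item.items()
    ]
    # derive extraction rules that reproduce every sample
    return RuleBasedSingleItemScraper.build(samples)


def scrape_url(scraper, url):
    # hypothetical helper: apply the learned rules to a page the scraper has not seen
    return scraper.scrape(requests.get(url).content)

Called with the same items dict as in main() above, train_scraper returns a scraper that scrape_url can then point at any similar question page.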