author     Karl Lorey <git@karllorey.com>    2020-09-27 15:33:42 +0200
committer  Karl Lorey <git@karllorey.com>    2020-09-27 15:33:42 +0200
commit     fa9741426b99fe9444529b9d813d77474c5d8cbf (patch)
tree       012d3f7b44daebfdfaa2c1346ac4539f47e5d93d
parent     41624b9d5b99232112dc450791bc736740b54844 (diff)
Update examples with stackoverflow scraper
-rw-r--r--  README.rst                |   6 +++++-
-rw-r--r--  examples/ml_prototype.py  | 112 ----------
-rw-r--r--  examples/stackoverflow.py |  40 ++++++
3 files changed, 45 insertions(+), 113 deletions(-)
diff --git a/README.rst b/README.rst
--- a/README.rst
+++ b/README.rst
@@ -28,7 +28,7 @@ I've been wondering for a long time why there's no Open Source solution that doe
 So here's my attempt at creating a python library to enable automatic scraping.
 
 All you have to do is define some examples of scraped data.
-`autoscraper` will figure out everything else and return clean data.
+`mlscraper` will figure out everything else and return clean data.
 
 Currently, this is a proof of concept with a simplistic solution.
@@ -60,6 +60,10 @@ After you've defined the data you want to scrape, mlscraper will:
     scraper.scrape(html) # will produce the items above
     scraper.scrape(new_html) # will apply the learned rules and extract new items
 
+You can find working scrapers in the `examples folder`_.
+
+.. _`examples folder`: examples/
+
 ---------------
 Getting started
 ---------------
diff --git a/examples/ml_prototype.py b/examples/ml_prototype.py
deleted file mode 100644
index f070943..0000000
--- a/examples/ml_prototype.py
+++ /dev/null
@@ -1,112 +0,0 @@
-# this is just a prototype for classifier-based element detection
-# - using actual machine learning, not heuristics
-# - detecting elements defined by icons, e.g. a dollar icon follow by a price
-import codecs
-import logging
-import re
-from random import sample
-
-import pandas as pd
-import requests
-from bs4 import BeautifulSoup, NavigableString
-from sklearn import tree
-from sklearn.base import TransformerMixin
-from sklearn.pipeline import Pipeline
-from sklearn.tree import DecisionTreeClassifier
-
-
-class Preprocessing(TransformerMixin):
-    def __init__(self, element):
-        self.element = element
-
-    def fit(self, X, y):
-        # todo define classes to extract here, doing it in transform will cause bugs
-        return self
-
-    def transform(self, X, y=None, **fit_params):
-        df = pd.DataFrame({"node": X})
-        df["has_content"] = df["node"].apply(
-            lambda n: type(n) is NavigableString and len(str(n).strip()) > 0
-        )
-        df["is_same_type"] = df["node"].apply(lambda n: type(n) == type(self.element))
-        sibling_prev = self.element.previous_sibling
-        sibling_prev_classes = sibling_prev.attrs.get("class")
-
-        df["prev_sibling"] = df["node"].apply(lambda n: n.previous_sibling)
-
-        for css_class in sibling_prev_classes:
-            df["sibling_prev_has_class_{}".format(css_class)] = df[
-                "prev_sibling"
-            ].apply(
-                lambda n: css_class in n.attrs.get("class", [])
-                if hasattr(n, "attrs")
-                else False
-            )
-
-        features = [c for c in df.columns if c.startswith("sibling")] + ["is_same_type"]
-        return df[features]
-
-
-def main():
-    # obfuscated the url a little :)
-    url_rot = "uggcf://jjj.fgnegonfr.qr/betnavmngvba/jrygraznpure-tzou/"
-    url = codecs.decode(url_rot, "rot_13")
-
-    resp = requests.get(url)
-    assert resp.status_code == 200
-    soup = BeautifulSoup(resp.content, "lxml")
-
-    print("Found {} nodes".format(len(list(soup.descendants))))
-
-    # find the element to crawl on the site
-    elements_in_site = soup.find_all(text=re.compile(r"\s*2017\s*"))
-    element = elements_in_site[0]  # type: NavigableString
-
-    print("ELEMENT")
-    print(element)
-
-    pipeline = Pipeline(
-        [
-            ("preprocessing", Preprocessing(element)),
-            # ("clf", SVC(class_weight="balanced", probability=True)),
-            ("clf", DecisionTreeClassifier(class_weight="balanced")),
-        ]
-    )
-
-    # under-sample negative and over-sample positive samples
-    sample_count = 1000
-    oversampling_factor = 10
-
-    samples = sample(list(soup.descendants), sample_count)
-    X = samples + [element] * oversampling_factor
-    y = [n == element for n in samples] + [True] * oversampling_factor
-
-    # train the pipeline
-    pipeline = pipeline.fit(X, y)
-
-    # print the tree if it's a decision tree
-    # -> select last pipeline step
-    if type(pipeline.steps[-1][1]) is DecisionTreeClassifier:
-        print("TREE")
-        clf_tree = pipeline.steps[1][1]
-        print(tree.export_text(clf_tree))
-
-    # perform sanity check: element we're looking for should get matched
-    is_element_matched = pipeline.predict([element])[0]
-    print("Element is {}".format(is_element_matched))
-    if not is_element_matched:
-        logging.warning("Element is not matched, classifier is probably broken")
-
-    # print out samples for debugging
-    for x, y_ in zip(X, y):
-        is_match = pipeline.predict([x])[0]
-        if is_match:
-            print(x)
-            print("Class: " + str(y_))
-            print("Prediction: " + str(is_match))
-            print("Proba: " + str(pipeline.predict_proba([x])))
-            print()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/examples/stackoverflow.py b/examples/stackoverflow.py
new file mode 100644
index 0000000..45bb78e
--- /dev/null
+++ b/examples/stackoverflow.py
@@ -0,0 +1,40 @@
+import logging
+
+import requests
+
+from mlscraper import SingleItemPageSample, RuleBasedSingleItemScraper
+
+
+def main():
+    items = {
+        "https://stackoverflow.com/questions/11227809/why-is-processing-a-sorted-array-faster-than-processing-an-unsorted-array": {
+            "title": "Why is processing a sorted array faster than processing an unsorted array?"
+        },
+        "https://stackoverflow.com/questions/927358/how-do-i-undo-the-most-recent-local-commits-in-git": {
+            "title": "How do I undo the most recent local commits in Git?"
+        },
+        "https://stackoverflow.com/questions/231767/what-does-the-yield-keyword-do": {
+            "title": "What does the “yield” keyword do?"
+        },
+    }
+
+    results = {url: requests.get(url) for url in items.keys()}
+
+    # train scraper
+    samples = [
+        SingleItemPageSample(results[url].content, items[url]) for url in items.keys()
+    ]
+    scraper = RuleBasedSingleItemScraper.build(samples)
+
+    print("Scraping new question")
+    html = requests.get(
+        "https://stackoverflow.com/questions/2003505/how-do-i-delete-a-git-branch-locally-and-remotely"
+    ).content
+    result = scraper.scrape(html)
+
+    print("Result: %s" % result)
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
+    main()
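Usage note: the pattern introduced in examples/stackoverflow.py (label a few pages, build a scraper, apply it to an unseen page) can be factored into small reusable helpers. The sketch below is illustrative only and not part of this commit; train_scraper and scrape_url are hypothetical names, while SingleItemPageSample, RuleBasedSingleItemScraper.build, and scraper.scrape are used exactly as in the new example above.

import requests

from mlscraper import RuleBasedSingleItemScraper, SingleItemPageSample


def train_scraper(url_to_item):
    # hypothetical helper, not in this commit: fetch each labeled training page once
    htmls = {url: requests.get(url).content for url in url_to_item}
    # pair each page's html with the attribute values expected on that page
    samples = [
        SingleItemPageSample(htmls[url], item) for url, item in url_to_item.items()
    ]
    # derive extraction rules that reproduce every sample
    return RuleBasedSingleItemScraper.build(samples)


def scrape_url(scraper, url):
    # hypothetical helper: apply the learned rules to a page the scraper has not seen
    return scraper.scrape(requests.get(url).content)

Called with the same items dict as in main() above, train_scraper returns a scraper that scrape_url can then point at any similar question page.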