Let requests and urllib3 do our retries for us

Use a session configured with various retry parameters and a special whitelisting retry class do our retries for us. This offloads all kinds of error handling we've not even thought of yet to urllib3 while still staying highly customizable. This also gives us a nice backoff algorithm when retrying which we only need to parametrize. While at it, unroll the __get() method into the one user of POST (submit()) to make it more readable.
author: Michael Weiser <michael.weiser@gmx.de> 2019-05-07 17:54:34 +0000
committer: Michael Weiser <michael.weiser@gmx.de> 2019-05-09 08:14:56 +0000
commit: 652a0515c9de74bd058b6942857e241237007ad8 (patch)
tree: c0d1abd986008e2bdc2bfdab39ce689ae81288c4
parent: 82b6931f20ef6d0ce0a975299ad8e4a439aa92f6 (diff)
1 files changed, 97 insertions, 39 deletions
diff --git a/peekaboo/toolbox/cuckoo.py b/peekaboo/toolbox/cuckoo.py
index b1df4b7..bb6ffb3 100644
--- a/peekaboo/toolbox/cuckoo.py
+++ b/peekaboo/toolbox/cuckoo.py
@@ -24,6 +24,8 @@
 ###############################################################################
 
 
+from future.builtins import super  # pylint: disable=wrong-import-order
+
 import re
 import os
 import locale
@@ -31,11 +33,12 @@ import logging
 import json
 import subprocess
 import random
+import requests
+import urllib3.util.retry
 
 from time import sleep
 from threading import RLock
 
-import requests
 from twisted.internet import protocol, reactor, process
 
 from peekaboo.exceptions import CuckooSubmitFailedException
@@ -243,48 +246,87 @@ class CuckooEmbed(Cuckoo):
         (currently) intentionally a no-op. """
         pass
 
+
+class WhitelistRetry(urllib3.util.retry.Retry):
+    """ A Retry class which has a status code whitelist, allowing to retry all
+    requests not whitelisted in a hard-core, catch-all manner. """
+    def __init__(self, status_whitelist=None, **kwargs):
+        super().__init__(**kwargs)
+        self.status_whitelist = status_whitelist or set()
+
+    def is_retry(self, method, status_code, has_retry_after=False):
+        """ Override Retry's is_retry to introduce our status whitelist logic.
+        """
+        # we retry all methods so no check if method is retryable here
+
+        if self.status_whitelist and status_code not in self.status_whitelist:
+            return True
+
+        return super().is_retry(method, status_code, has_retry_after)
+
+
 class CuckooApi(Cuckoo):
     """ Interfaces with a Cuckoo installation via its REST API. """
-    def __init__(self, job_queue, url="http://localhost:8090", poll_interval=5):
+    def __init__(self, job_queue, url="http://localhost:8090", poll_interval=5,
+                 retries=5, backoff=0.5):
         Cuckoo.__init__(self, job_queue)
         self.url = url
         self.poll_interval = poll_interval
+
+        # urrlib3 backoff formula:
+        # <backoff factor> * (2 ^ (<retry count so far> - 1))
+        # with second try intentionally having no sleep,
+        # e.g. with retry count==5 and backoff factor==0.5:
+        # try 1: fail, sleep(0.5*2^(1-1)==0.5*2^0==0.5*1==0.5->intentionally
+        #   overridden to 0)
+        # try 2: fail, sleep(0.5*2^(2-1)==0.5*2^1==1)
+        # try 3: fail, sleep(0.5*2^(3-1)==0.5*2^2==2)
+        # try 4: fail, sleep(0.5*2^(4-1)==0.5*2^3==4)
+        # try 5: fail, abort, sleep would've been 8 before try 6
+        #
+        # Also, use method_whitelist=False to enable POST and other methods for
+        # retry which aren't by default because they're not considered
+        # idempotent. We assume that with the REST API a request either
+        # succeeds or fails without residual effects, making them atomic and
+        # idempotent.
+        #
+        # And finally we retry everything but a 200 response, which admittedly
+        # is a bit hard-core but serves our purposes for now.
+        retry_config = WhitelistRetry(total=retries,
+                                      backoff_factor=backoff,
+                                      method_whitelist=False,
+                                      status_whitelist=set([200]))
+        retry_adapter = requests.adapters.HTTPAdapter(max_retries=retry_config)
+        self.session = requests.session()
+        self.session.mount('http://', retry_adapter)
+        self.session.mount('https://', retry_adapter)
+
         self.reported = self.__status()["tasks"]["reported"]
         logger.info("Connection to Cuckoo seems to work, %i reported tasks seen", self.reported)
-    
-    def __get(self, url, method="get", files=""):
-        r = ""
-        logger.debug("Requesting %s, method %s" % (url, method))
-        
-        # try 3 times to get a successfull response
-        for retry in range(0, 3):
-            try:
-                if method == "get":
-                    r = requests.get("%s/%s" % (self.url, url))
-                elif method == "post":
-                    r = requests.post("%s/%s" % (self.url, url), files=files)
-                else:
-                    break
-                if r.status_code != 200:
-                    continue
-                else:
-                    return r.json()
-            except requests.exceptions.Timeout as e:
-                # Maybe set up for a retry, or continue in a retry loop
-                print(e)
-                if e and retry >= 2:
-                    raise e
-            except requests.exceptions.TooManyRedirects as e:
-                # Tell the user their URL was bad and try a different one
-                print(e)
-                if e and retry >= 2:
-                    raise e
-            except requests.exceptions.RequestException as e:
-                # catastrophic error. bail.
-                print(e)
-                if e and retry >= 2:
-                    raise e
-        return None
+
+    def __get(self, path):
+        request_url = "%s/%s" % (self.url, path)
+        logger.debug("Getting %s", request_url)
+
+        try:
+            response = self.session.get(request_url)
+        # all requests exceptions are derived from RequestsException, including
+        # RetryError, TooManyRedirects and Timeout
+        except requests.exceptions.RequestException as error:
+            logger.error('Request to REST API failed: %s', error)
+            return None
+
+        # no check for status code here since we retry all but 200
+        # responses and raise an exception if retries fail
+        try:
+            json_resp = response.json()
+        except ValueError as error:
+            logger.error(
+                'Invalid JSON in response when getting %s: %s',
+                request_url, error)
+            return None
+
+        return json_resp
     
     def __status(self):
         return self.__get("cuckoo/status")
@@ -293,14 +335,30 @@ class CuckooApi(Cuckoo):
         path = sample.submit_path
         filename = os.path.basename(path)
         files = {"file": (filename, open(path, 'rb'))}
-        response = self.__get("tasks/create/file", method="post", files=files)
+        logger.debug("Creating Cuckoo task with content from %s and "
+                     "filename %s", path, filename)
+
+        try:
+            response = self.session.post(
+                "%s/tasks/create/file" % self.url, files=files)
+        except requests.exceptions.RequestException as error:
+            raise CuckooSubmitFailedException(
+                'Error creating Cuckoo task: %s' % error)
         
-        task_id = response["task_id"]
+        try:
+            json_resp = response.json()
+        except ValueError as error:
+            raise CuckooSubmitFailedException(
+                'Invalid JSON in response when creating Cuckoo task: %s'
+                % error)
+
+        task_id = json_resp["task_id"]
         if task_id > 0:
             self.register_running_job(task_id, sample)
             return task_id
+
         raise CuckooSubmitFailedException(
-            'Unable to extract job ID from given string %s' % response)
+            'Unable to extract job ID from response %s' % json_resp)
 
     def get_report(self, job_id):
         logger.debug("Report from Cuckoo API requested, job_id = %d" % job_id)
author	Michael Weiser <michael.weiser@gmx.de>	2019-05-07 17:54:34 +0000
committer	Michael Weiser <michael.weiser@gmx.de>	2019-05-09 08:14:56 +0000
commit	652a0515c9de74bd058b6942857e241237007ad8 (patch)
tree	c0d1abd986008e2bdc2bfdab39ce689ae81288c4
parent	82b6931f20ef6d0ce0a975299ad8e4a439aa92f6 (diff)