summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Sebastian Deiss <sebastian.deiss@atos.net> 2018-06-28 11:24:32 +0200
committer: Sebastian Deiss <sebastian.deiss@atos.net> 2018-06-28 11:24:32 +0200
commit: 257d0acf559584f2bbeac173ecfd789dbd2e3a8f (patch)
tree: fa61ef698c1f94025edbffdc7b7fa80c1a15b799
parent: eb8033e109ff61627e02309499c2478b43e96389 (diff)
Improve MIME type determination by using multiple sources
-rw-r--r-- peekaboo/ruleset/rules.py 4
-rw-r--r-- peekaboo/sample.py 52
-rw-r--r-- peekaboo/toolbox/files.py 12
-rw-r--r-- test.py 2
4 files changed, 43 insertions, 27 deletions
diff --git a/peekaboo/ruleset/rules.py b/peekaboo/ruleset/rules.py
index adf154e..607a48c 100644
--- a/peekaboo/ruleset/rules.py
+++ b/peekaboo/ruleset/rules.py
@@ -80,7 +80,7 @@ def file_type_on_whitelist(config, s):
whitelist = config['file_type_on_whitelist']['whitelist']
- if s.mimetype in whitelist:
+ if set(s.mimetypes).issubset(set(whitelist)):
return RuleResult(position,
result=Result.ignored,
reason="Dateityp ist auf Whitelist",
@@ -99,7 +99,7 @@ def file_type_on_greylist(config, s):
greylist = config['file_type_on_greylist']['greylist']
- if s.mimetype in greylist:
+ if set(s.mimetypes).issubset(set(greylist)):
return RuleResult(position,
result=Result.unknown,
reason="Dateityp ist auf der Liste der zu analysiserenden Typen",
diff --git a/peekaboo/sample.py b/peekaboo/sample.py
index 68209d0..1a8b623 100644
--- a/peekaboo/sample.py
+++ b/peekaboo/sample.py
@@ -34,7 +34,8 @@ from peekaboo.config import get_config
from peekaboo.exceptions import CuckooReportPendingException, \
CuckooAnalysisFailedException
from peekaboo.toolbox.sampletools import SampleMetaInfo, ConnectionMap, next_job_hash
-from peekaboo.toolbox.files import chown2me, guess_mime_type_from_file_contents
+from peekaboo.toolbox.files import chown2me, guess_mime_type_from_file_contents, \
+ guess_mime_type_from_filename
from peekaboo.toolbox.ms_office import has_office_macros
from peekaboo.toolbox.cuckoo import submit_to_cuckoo
import peekaboo.ruleset as ruleset
@@ -298,14 +299,14 @@ class Sample(object):
return self.get_attr('file_extension')
@property
- def mimetype(self):
+ def mimetypes(self):
"""
Can not be cached (hard to determine if known/complete).
determine mime on original p[0-9]* file
later result will be "inode/symlink"
"""
- mime_type = None
+ mime_types = []
smime = {
'p7s': [
@@ -321,32 +322,35 @@ class Sample(object):
declared_mt = self.__meta_info.get_mime_type()
if declared_mt is not None:
logger.debug('Sample declared as "%s"' % declared_mt)
- mime_type = declared_mt
+ mime_types.append(declared_mt)
except Exception as e:
logger.exception(e)
+ declared_mt = None
if self.meta_info_loaded:
- logger.error('Cannot get mime type from meta info although meta info is loaded.')
-
- detected_mime_type = guess_mime_type_from_file_contents(self.__path)
- if detected_mime_type != mime_type:
- logger.debug(
- 'Detected MIME type does not match declared MIME Type: declared: %s, detected: %s.'
- % (mime_type, detected_mime_type)
- )
- # check if the sample is an smime signature (smime.p7s)
- # If so, don't overwrite the MIME type since we do not want to analyse S/MIME signatures.
- try:
- declared_filename = self.get_attr('meta_info_name_declared')
- except KeyError:
- declared_filename = self.__filename
- if declared_filename == 'smime.p7s' and mime_type in smime['p7s']:
- logger.info('Using declared MIME type over detected one for S/MIME signatures.')
- else:
- logger.debug('Overwriting declared MIME Type with "%s"' % detected_mime_type)
- mime_type = detected_mime_type
+ logger.error('Cannot get MIME type from meta info although meta info is loaded.')
+
+ try:
+ declared_filename = self.get_attr('meta_info_name_declared')
+ except KeyError:
+ declared_filename = self.__filename
+
+ content_based_mime_type = guess_mime_type_from_file_contents(self.__path)
+ if content_based_mime_type is not None and content_based_mime_type not in mime_types:
+ mime_types.append(content_based_mime_type)
+
+ name_based_mime_type = guess_mime_type_from_filename(declared_filename)
+ if name_based_mime_type is not None and name_based_mime_type not in mime_types:
+ mime_types.append(name_based_mime_type)
+
+ logger.debug('Determined MIME Types: %s' % mime_types)
+ # check if the sample is an S/MIME signature (smime.p7s)
+ # If so, don't overwrite the MIME type since we do not want to analyse S/MIME signatures.
+ if declared_filename == 'smime.p7s' and declared_mt in smime['p7s']:
+ logger.info('S/MIME signature detected. Using declared MIME type over detected ones.')
+ mime_types = [declared_mt]
if not self.has_attr('mimetypes'):
- self.set_attr('mimetypes', mime_type)
+ self.set_attr('mimetypes', mime_types)
return self.get_attr('mimetypes')
diff --git a/peekaboo/toolbox/files.py b/peekaboo/toolbox/files.py
index 4896474..1bb6471 100644
--- a/peekaboo/toolbox/files.py
+++ b/peekaboo/toolbox/files.py
@@ -27,6 +27,7 @@
import logging
import subprocess
import magic
+import mimetypes
from peekaboo.config import get_config
@@ -51,3 +52,14 @@ def guess_mime_type_from_file_contents(file_path):
mt = magic.from_file(file_path, mime=True)
if mt:
return mt
+
+
+def guess_mime_type_from_filename(file_path):
+ """ Guess the type of a file based on its filename or URL. """
+ if not mimetypes.inited:
+ mimetypes.init()
+ mimetypes.add_type('application/javascript', '.jse')
+
+ mt = mimetypes.guess_type(file_path)[0]
+ if mt:
+ return mt
diff --git a/test.py b/test.py
index 55efe63..5eb0123 100644
--- a/test.py
+++ b/test.py
@@ -171,7 +171,7 @@ class TestSample(unittest.TestCase):
def test_sample_attributes(self):
self.assertEqual(self.sample.get_filename(), 'test.py')
self.assertEqual(self.sample.file_extension, 'py')
- self.assertTrue(self.sample.mimetype, 'text/x-python')
+ self.assertTrue(set(['text/x-python']).issubset(set(self.sample.mimetypes)))
self.assertIsNotNone(self.sample.sha256sum)
self.assertEqual(self.sample.job_id, -1)
self.assertEqual(self.sample.get_result(), Result.unchecked)