summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Binh Le <lebinh.it@gmail.com> 2014-03-31 00:23:23 +0700
committer Binh Le <lebinh.it@gmail.com> 2014-03-31 00:23:23 +0700
commit e210ef67a35d17031527088244dc3954f9d97294 (patch)
tree 0905783cd503597a6b37b0ff7a034352616c7539
parent 5d15a6a3a19072e8f38213809f5d7694e317bb99 (diff)
Better parsing of nginx config file for access_log and log_format directives with pyparsing. Properly handle multiple access logs.
-rw-r--r-- ngxtop/config_parser.py 145
-rwxr-xr-x ngxtop/ngxtop.py 103
-rw-r--r-- ngxtop/utils.py 19
-rw-r--r-- setup.py 2
-rw-r--r-- tests/test_config_parser.py 60
5 files changed, 240 insertions, 89 deletions
diff --git a/ngxtop/config_parser.py b/ngxtop/config_parser.py
new file mode 100644
index 0000000..82a2d2a
--- /dev/null
+++ b/ngxtop/config_parser.py
@@ -0,0 +1,145 @@
+"""
+Nginx config parser and pattern builder.
+"""
+import os
+import re
+import subprocess
+
+from pyparsing import Literal, Word, ZeroOrMore, OneOrMore, Group, \
+ printables, quotedString, pythonStyleComment, removeQuotes
+
+from utils import choose_one, error_exit
+
+
# Characters that are escaped with a backslash before a log format string is
# turned into a regular expression (see build_pattern).
REGEX_SPECIAL_CHARS = r'([\.\*\+\?\|\(\)\{\}\[\]])'
# A variable reference such as ``$remote_addr``; group 1 is the variable name.
REGEX_LOG_FORMAT_VARIABLE = r'\$([a-z0-9\_]+)'
# Format string substituted whenever the format name is 'combined'.
LOG_FORMAT_COMBINED = '$remote_addr - $remote_user [$time_local] ' \
                      '"$request" $status $body_bytes_sent ' \
                      '"$http_referer" "$http_user_agent"'

# common parser elements
semicolon = Literal(';').suppress()
# nginx string parameter can contain any character except: { ; " '
parameter = Word(''.join(c for c in printables if c not in set('{;"\'')))
# which can also be quoted
# setParseAction() attaches the action to the element it is called on, so work
# on a copy: calling it directly on pyparsing's module-level ``quotedString``
# would change that shared object for every other user in the process.
parameter = parameter | quotedString.copy().setParseAction(removeQuotes)
+
+
def detect_config_path():
    """
    Work out where the nginx configuration file lives.

    Runs ``nginx -V`` and inspects what it writes to stderr: a ``--conf-path=``
    option wins, otherwise a ``--prefix=`` option is combined with
    ``/conf/nginx.conf``, otherwise ``/etc/nginx/nginx.conf`` is assumed.
    Calls ``error_exit`` when the nginx executable cannot be started.
    :return: detected nginx configuration file path
    """
    try:
        nginx = subprocess.Popen(['nginx', '-V'], stderr=subprocess.PIPE)
    except OSError:
        error_exit('Access log file or format was not set and nginx config file cannot be detected. ' +
                   'Perhaps nginx is not in your PATH?')

    # only stderr is piped, so the first element of the pair carries nothing
    _, raw_err = nginx.communicate()
    build_info = raw_err.decode('utf-8')

    explicit_path = re.search(r'--conf-path=(\S*)', build_info)
    if explicit_path is not None:
        return explicit_path.group(1)

    install_prefix = re.search(r'--prefix=(\S*)', build_info)
    if install_prefix is None:
        return '/etc/nginx/nginx.conf'
    return install_prefix.group(1) + '/conf/nginx.conf'
+
+
def get_access_logs(config):
    """
    Parse config for access_log directives.

    Directives are skipped when they have no parameter at all, when the first
    parameter is ``off`` or when it starts with ``syslog:``. The second
    parameter is taken as the format name unless it contains ``=`` (an option
    such as ``gzip=1``); without a format name, 'combined' is reported.
    :param config: nginx configuration text
    :return: iterator over ('path', 'format name') tuple of found directives
    """
    access_log = Literal("access_log") + ZeroOrMore(parameter) + semicolon
    access_log.ignore(pythonStyleComment)

    for directive in access_log.searchString(config).asList():
        # directive is ['access_log', <path>, <format or option>, ...]; the
        # grammar also accepts a bare ``access_log;`` which has no path, so
        # guard the index instead of raising IndexError on it
        if len(directive) < 2:
            continue

        path = directive[1]
        if path == 'off' or path.startswith('syslog:'):
            # nothing to process here
            continue

        format_name = 'combined'
        if len(directive) > 2 and '=' not in directive[2]:
            format_name = directive[2]

        yield path, format_name
+
+
def get_log_formats(config):
    """
    Find every log_format directive in an nginx configuration.

    The format may be written as several consecutive string parameters; they
    are concatenated, without a separator, into one format string.
    :param config: nginx configuration text
    :return: iterator over ('format name', 'format string') tuple of found directives
    """
    # grammar: log_format <name> <one or more string pieces> ;
    grammar = Literal('log_format') + parameter + Group(OneOrMore(parameter)) + semicolon
    grammar.ignore(pythonStyleComment)

    # each match is [keyword, name, [piece, piece, ...]]
    for _keyword, name, pieces in grammar.searchString(config).asList():
        yield name, ''.join(pieces)
+
+
def detect_log_config(arguments):
    """
    Detect access log config (path and format) of nginx. Offer user to select if multiple access logs are detected.

    The configuration file is ``arguments['--config']`` or, when that is None,
    the path returned by ``detect_config_path()``. ``error_exit`` is called
    when the file does not exist, when it declares no usable access log, or
    when the chosen log names a format that is neither 'combined' nor declared
    by a log_format directive.
    :param arguments: mapping that provides the '--config' key
    :return: path and format of detected / selected access log
    """
    config = arguments['--config']
    if config is None:
        config = detect_config_path()
    if not os.path.exists(config):
        error_exit('Nginx config file not found: %s' % config)

    with open(config) as f:
        config_str = f.read()
    access_logs = dict(get_access_logs(config_str))
    if not access_logs:
        error_exit('Access log file is not provided and ngxtop cannot detect it from your config file (%s).' % config)

    log_formats = dict(get_log_formats(config_str))
    if len(access_logs) == 1:
        # dict views cannot be indexed on Python 3, so take the only item
        # through an iterator instead of ``items()[0]``
        log_path, format_name = next(iter(access_logs.items()))
    else:
        # multiple access logs configured, offer to select one
        print('Multiple access logs detected in configuration:')
        # hand over a real list: choose_one indexes into its choices
        log_path = choose_one(list(access_logs), 'Select access log file to process: ')
        format_name = access_logs[log_path]

    # 'combined' is predefined and never appears in a log_format directive, so
    # it has to be resolved here for both the single-log and the selected-log
    # case before consulting the parsed formats
    if format_name == 'combined':
        return log_path, LOG_FORMAT_COMBINED
    if format_name not in log_formats:
        error_exit('Incorrect format name set in config for access log file "%s"' % log_path)
    return log_path, log_formats[format_name]
+
+
def build_pattern(log_format):
    """
    Turn a log format string into a compiled regular expression.

    Characters matched by REGEX_SPECIAL_CHARS are backslash-escaped, then every
    ``$variable`` becomes a named group ``(?P<variable>.*)``. The literal name
    'combined' stands for LOG_FORMAT_COMBINED.
    :param log_format: format string to parse
    :return: regular expression to parse given format
    """
    source = LOG_FORMAT_COMBINED if log_format == 'combined' else log_format
    # escape first, so the group syntax inserted below is left untouched
    escaped = re.sub(REGEX_SPECIAL_CHARS, r'\\\1', source)
    with_groups = re.sub(REGEX_LOG_FORMAT_VARIABLE, '(?P<\\1>.*)', escaped)
    return re.compile(with_groups)
+
+
def extract_variables(log_format):
    """
    List the variable names referenced by a log format string.

    The literal name 'combined' stands for LOG_FORMAT_COMBINED. Names are
    produced in order of appearance, without the leading ``$``.
    :param log_format: format string to extract
    :return: iterator over all variables in given format string
    """
    source = LOG_FORMAT_COMBINED if log_format == 'combined' else log_format
    for found in re.finditer(REGEX_LOG_FORMAT_VARIABLE, source):
        # group 1 is the variable name
        yield found.group(1)
+
diff --git a/ngxtop/ngxtop.py b/ngxtop/ngxtop.py
index 6b13679..f1487a5 100755
--- a/ngxtop/ngxtop.py
+++ b/ngxtop/ngxtop.py
@@ -8,7 +8,7 @@ Usage:
Options:
-l <file>, --access-log <file> access log file to parse.
- -f <format>, --log-format <format> log format as specify in log_format directive.
+ -f <format>, --log-format <format> log format as specify in log_format directive. [default: combined]
--no-follow ngxtop default behavior is to ignore current lines in log
and only watch for new lines as they are written to the access log.
Use this flag to tell ngxtop to process the current content of the access log instead.
@@ -57,9 +57,8 @@ import atexit
from contextlib import closing
import curses
import logging
-import re
+import os
import sqlite3
-import subprocess
import time
import sys
import signal
@@ -72,12 +71,9 @@ except ImportError:
from docopt import docopt
import tabulate
+from config_parser import detect_log_config, detect_config_path, extract_variables, build_pattern
+from utils import error_exit
-REGEX_SPECIAL_CHARS = r'([\.\*\+\?\|\(\)\{\}\[\]])'
-REGEX_LOG_FORMAT_VARIABLE = r'\$([a-z0-9\_]+)'
-LOG_FORMAT_COMBINED = '$remote_addr - $remote_user [$time_local] ' \
- '"$request" $status $body_bytes_sent ' \
- '"$http_referer" "$http_user_agent"'
DEFAULT_QUERIES = [
('Summary:',
@@ -111,76 +107,6 @@ DEFAULT_QUERIES = [
DEFAULT_FIELDS = set(['status_type', 'bytes_sent'])
-# ====================
-# Nginx utilities
-# ====================
-def get_nginx_conf_path():
- """
- Get nginx conf path based on `nginx -V` output
- """
- proc = subprocess.Popen(['nginx', '-V'], stderr=subprocess.PIPE)
- stdout, stderr = proc.communicate()
-
- version_output = stderr.decode('utf-8')
- conf_path_match = re.search(r'--conf-path=(\S*)', version_output)
- if conf_path_match is not None:
- return conf_path_match.group(1)
-
- prefix_match = re.search(r'--prefix=(\S*)', version_output)
- if prefix_match is not None:
- return prefix_match.group(1) + '/conf/nginx.conf'
- return '/etc/nginx/nginx.conf'
-
-
-def extract_nginx_conf(path, log_file=None, log_format=None):
- """
- *experimental* read nginx conf file to extract access log file location and format.
- TODO: rewrite this method to:
- - match all access_log directive to get all possible log files
- - for each log file search the correct log_format
- - if more than one log file, offer user to choose which one
- """
- with open(path) as conf_file:
- conf = conf_file.read()
-
- log_format_directive = re.search(r'log_format\s+(\S+)\s+(.*?);', conf, flags=re.DOTALL)
- log_format_name = log_format_directive.group(1) if log_format_directive else 'combined'
- log_format = log_format_directive.group(2) if log_format_directive else 'combined'
-
- # take care of log format in multiple line
- # only most common case, which encapsulate log format in single quote is handled
- if '\n' in log_format:
- log_format = ''.join(line.strip() for line in log_format.split('\n'))
- if log_format.startswith("'"):
- log_format = log_format.replace("'", "")
-
- access_log_directive = re.search(r'access_log\s+(\S+)\s+%s' % log_format_name, conf)
- # Use the log file from config only when not supplied with --access-log option,
- # else it is overwritten everytime.
- if not log_file:
- log_file = access_log_directive.group(1) if access_log_directive else '/var/log/nginx/access.log'
-
- return log_file, log_format
-
-
-def build_pattern(log_format):
- """
- Take an nginx's log format string and return the required regexp pattern to parse the access log
- """
- if log_format == 'combined':
- return build_pattern(LOG_FORMAT_COMBINED)
- pattern = re.sub(REGEX_SPECIAL_CHARS, r'\\\1', log_format)
- pattern = re.sub(REGEX_LOG_FORMAT_VARIABLE, '(?P<\\1>.*)', pattern)
- return re.compile(pattern)
-
-
-def extract_variables(log_format):
- if log_format == 'combined':
- log_format = LOG_FORMAT_COMBINED
- for match in re.findall(REGEX_LOG_FORMAT_VARIABLE, log_format):
- yield match
-
-
# ======================
# generator utilities
# ======================
@@ -204,8 +130,11 @@ def map_field(field, func, dict_sequence):
set the result as new value for that key.
"""
for item in dict_sequence:
- item[field] = func(item.get(field, None))
- yield item
+ try:
+ item[field] = func(item.get(field, None))
+ yield item
+ except ValueError:
+ pass
def add_field(field, func, dict_sequence):
@@ -405,18 +334,16 @@ def setup_reporter(processor, arguments):
def process(arguments):
access_log = arguments['--access-log']
log_format = arguments['--log-format']
- if access_log is None or log_format is None:
- config = arguments['--config']
- if config is None:
- config = get_nginx_conf_path()
- access_log, log_format = extract_nginx_conf(config, access_log)
- else:
- config = None
+ if access_log is None:
+ access_log, log_format = detect_log_config(arguments)
+
logging.info('access_log: %s', access_log)
logging.info('log_format: %s', log_format)
+ if not os.path.exists(access_log):
+ error_exit('access log file "%s" does not exist' % access_log)
if arguments['info']:
- print('configuration file:\n ', config)
+ print('nginx configuration file:\n ', detect_config_path())
print('access log file:\n ', access_log)
print('access log format:\n ', log_format)
print('available variables:\n ', ', '.join(sorted(extract_variables(log_format))))
diff --git a/ngxtop/utils.py b/ngxtop/utils.py
new file mode 100644
index 0000000..ef61072
--- /dev/null
+++ b/ngxtop/utils.py
@@ -0,0 +1,19 @@
+import sys
+
+
def choose_one(choices, prompt):
    """
    Print a numbered menu and keep asking until a valid number is entered.

    :param choices: iterable of options to offer; it is copied into a list, so
        dict views and generators are accepted as well as sequences
    :param prompt: text shown each time input is requested
    :return: the chosen element of ``choices``
    :raises ValueError: if ``choices`` is empty, since no answer could ever be
        valid and the prompt would repeat forever
    """
    # materialise once: the options are counted and indexed below
    choices = list(choices)
    if not choices:
        raise ValueError('no choices to select from')

    # raw_input only exists on Python 2; input is its Python 3 counterpart
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    for idx, choice in enumerate(choices):
        print('%d. %s' % (idx + 1, choice))

    selected = None
    while selected is None or not 1 <= selected <= len(choices):
        try:
            selected = int(read_line(prompt))
        except ValueError:
            # not a number, ask again
            selected = None
    return choices[selected - 1]
+
+
def error_exit(msg, status=1):
    """
    Report an error on stderr and terminate the program.

    :param msg: text written after the ``Error: `` prefix
    :param status: value passed to ``sys.exit``
    """
    report = 'Error: %s\n' % msg
    sys.stderr.write(report)
    sys.exit(status)
diff --git a/setup.py b/setup.py
index d7e6c35..8f86be9 100644
--- a/setup.py
+++ b/setup.py
@@ -27,7 +27,7 @@ setup(
keywords='cli monitoring nginx system',
packages=['ngxtop'],
- install_requires=['docopt', 'tabulate'],
+ install_requires=['docopt', 'tabulate', 'pyparsing'],
entry_points={
'console_scripts': [
diff --git a/tests/test_config_parser.py b/tests/test_config_parser.py
new file mode 100644
index 0000000..3b6504b
--- /dev/null
+++ b/tests/test_config_parser.py
@@ -0,0 +1,60 @@
+from ngxtop import config_parser
+
+
def test_get_log_formats():
    """get_log_formats finds a multi-line format and a format with a quoted name."""
    config = '''
 http {
 # ubuntu default, log_format on multiple lines
 log_format main '$remote_addr - $remote_user [$time_local] "$request" '
 "$status $body_bytes_sent '$http_referer' "
 '"$http_user_agent" "$http_x_forwarded_for"';

 # name can also be quoted, and format don't always have to
 log_format 'te st' $remote_addr;
 }
 '''
    formats = dict(config_parser.get_log_formats(config))
    assert 'main' in formats
    # single quotes nested inside a double-quoted piece must be kept
    assert "'$http_referer'" in formats['main']
    assert 'te st' in formats
+
+
def test_get_access_logs_no_format():
    """access_log directives without a format name are reported as 'combined'."""
    config = '''
 http {
 # ubuntu default
 access_log /var/log/nginx/access.log;

 # syslog is a valid access log, but we can't follow it
 access_log syslog:server=address combined;

 # commented
 # access_log commented;

 server {
 location / {
 # has parameter with default format
 access_log /path/to/log gzip=1;
 }
 }
 }
 '''
    logs = dict(config_parser.get_access_logs(config))
    # the syslog target and the commented-out directive are not counted
    assert len(logs) == 2
    assert logs['/var/log/nginx/access.log'] == 'combined'
    assert logs['/path/to/log'] == 'combined'
+
+
def test_access_logs_with_format_name():
    """An explicit format name, plain or quoted, is returned for each access_log."""
    config = '''
 http {
 access_log /path/to/main.log main gzip=5 buffer=32k flush=1m;
 server {
 access_log /path/to/test.log 'te st';
 }
 }
 '''
    logs = dict(config_parser.get_access_logs(config))
    assert len(logs) == 2
    assert logs['/path/to/main.log'] == 'main'
    assert logs['/path/to/test.log'] == 'te st'