diff options
author | Darik Gamble <darik.gamble.spam@gmail.com> | 2015-11-01 17:25:59 -0500 |
---|---|---|
committer | Darik Gamble <darik.gamble.spam@gmail.com> | 2015-11-08 15:54:15 -0500 |
commit | 9c97d35606862737bb5bd65ecf696c5d6653ebef (patch) | |
tree | 76ea4353b837854111e7e4e0ac28302888090d5e | |
parent | f7aef6ecacd0dc4c1343d3aab534c12aa50cd51d (diff) |
New package prioritization and class PrevalenceCounter
-rw-r--r-- | pgcli/packages/prioritization.py | 53 | ||||
-rw-r--r-- | tests/test_prioritization.py | 20 |
2 files changed, 73 insertions, 0 deletions
diff --git a/pgcli/packages/prioritization.py b/pgcli/packages/prioritization.py new file mode 100644 index 00000000..eb01b310 --- /dev/null +++ b/pgcli/packages/prioritization.py @@ -0,0 +1,53 @@ +import re +import sqlparse +from sqlparse.tokens import Name +from collections import defaultdict +from .pgliterals.main import get_literals + + +white_space_regex = re.compile('\\s+', re.MULTILINE) + + +def _compile_regex(keyword): + # Surround the keyword with word boundaries and replace interior whitespace + # with whitespace wildcards + pattern = '\\b' + re.sub(white_space_regex, '\\s+', keyword) + '\\b' + return re.compile(pattern, re.MULTILINE | re.IGNORECASE) + +keywords = get_literals('keywords') +keyword_regexs = dict((kw, _compile_regex(kw)) for kw in keywords) + + +class PrevalenceCounter(object): + def __init__(self): + self.keyword_counts = defaultdict(int) + self.name_counts = defaultdict(int) + + def update(self, text): + self.update_keywords(text) + self.update_names(text) + + def update_names(self, text): + for parsed in sqlparse.parse(text): + for token in parsed.flatten(): + if token.ttype in Name: + self.name_counts[token.value] += 1 + + def clear_names(self): + self.name_counts = defaultdict(int) + + def update_keywords(self, text): + # Count keywords. 
Can't rely on sqlparse for this, because it's + # database agnostic + for keyword, regex in keyword_regexs.items(): + for _ in regex.finditer(text): + self.keyword_counts[keyword] += 1 + + def keyword_count(self, keyword): + return self.keyword_counts[keyword] + + def name_count(self, name): + return self.name_counts[name] + + + diff --git a/tests/test_prioritization.py b/tests/test_prioritization.py new file mode 100644 index 00000000..3046456e --- /dev/null +++ b/tests/test_prioritization.py @@ -0,0 +1,20 @@ +from pgcli.packages.prioritization import PrevalenceCounter + + +def test_prevalence_counter(): + counter = PrevalenceCounter() + sql = '''SELECT * FROM foo WHERE bar GROUP BY baz; + select * from foo; + SELECT * FROM foo WHERE bar GROUP + BY baz''' + counter.update(sql) + + keywords = ['SELECT', 'FROM', 'GROUP BY'] + expected = [3, 3, 2] + kw_counts = [counter.keyword_count(x) for x in keywords] + assert kw_counts == expected + assert counter.keyword_count('NOSUCHKEYWORD') == 0 + + names = ['foo', 'bar', 'baz'] + name_counts = [counter.name_count(x) for x in names] + assert name_counts == [3, 2, 2] |