New package prioritization and class PrevalenceCounter

author: Darik Gamble <darik.gamble.spam@gmail.com> 2015-11-01 17:25:59 -0500
committer: Darik Gamble <darik.gamble.spam@gmail.com> 2015-11-08 15:54:15 -0500
commit: 9c97d35606862737bb5bd65ecf696c5d6653ebef (patch)
tree: 76ea4353b837854111e7e4e0ac28302888090d5e
parent: f7aef6ecacd0dc4c1343d3aab534c12aa50cd51d (diff)
2 files changed, 73 insertions, 0 deletions
diff --git a/pgcli/packages/prioritization.py b/pgcli/packages/prioritization.py
new file mode 100644
index 00000000..eb01b310
--- /dev/null
+++ b/pgcli/packages/prioritization.py
@@ -0,0 +1,53 @@
+import re
+import sqlparse
+from sqlparse.tokens import Name
+from collections import defaultdict
+from .pgliterals.main import get_literals
+
+
+white_space_regex = re.compile('\\s+', re.MULTILINE)  # a run of whitespace
+
+
+def _compile_regex(keyword):
+    # Word-bound the keyword and let each interior whitespace run match \s+.
+    # The repl must be r'\\s+': a bare '\s' is a bad repl escape on Python 3.7+.
+    pattern = '\\b' + white_space_regex.sub(r'\\s+', keyword) + '\\b'
+    return re.compile(pattern, re.MULTILINE | re.IGNORECASE)
+
+keywords = get_literals('keywords')
+keyword_regexs = dict((kw, _compile_regex(kw)) for kw in keywords)  # kw -> regex
+
+
+class PrevalenceCounter(object):  # tallies keyword/name occurrences in SQL text
+    def __init__(self):
+        self.keyword_counts = defaultdict(int)  # keyword -> regex match count
+        self.name_counts = defaultdict(int)  # Name token value -> token count
+
+    def update(self, text):  # fold one chunk of SQL text into both tallies
+        self.update_keywords(text)
+        self.update_names(text)
+
+    def update_names(self, text):
+        for parsed in sqlparse.parse(text):
+            for token in parsed.flatten():
+                if token.ttype in Name:
+                    self.name_counts[token.value] += 1
+
+    def clear_names(self):  # resets name_counts only; keyword_counts is kept
+        self.name_counts = defaultdict(int)
+
+    def update_keywords(self, text):
+        # Count keywords. Can't rely on sqlparse for this, because it's
+        # database agnostic
+        for keyword, regex in keyword_regexs.items():
+            for _ in regex.finditer(text):
+                self.keyword_counts[keyword] += 1
+
+    def keyword_count(self, keyword):
+        return self.keyword_counts.get(keyword, 0)  # .get: no insert on lookup
+
+    def name_count(self, name):
+        return self.name_counts.get(name, 0)  # .get: no insert on lookup
+
+
+
diff --git a/tests/test_prioritization.py b/tests/test_prioritization.py
new file mode 100644
index 00000000..3046456e
--- /dev/null
+++ b/tests/test_prioritization.py
@@ -0,0 +1,20 @@
+from pgcli.packages.prioritization import PrevalenceCounter
+
+
+def test_prevalence_counter():  # keyword and name tallies after one update()
+    counter = PrevalenceCounter()
+    sql = '''SELECT * FROM foo WHERE bar GROUP BY baz;
+             select * from foo;
+             SELECT * FROM foo WHERE bar GROUP
+             BY baz'''
+    counter.update(sql)  # one call feeds both the keyword and name counters
+
+    keywords = ['SELECT', 'FROM', 'GROUP BY']
+    expected = [3, 3, 2]  # case-insensitive; 'GROUP BY' may span a line break
+    kw_counts = [counter.keyword_count(x) for x in keywords]
+    assert kw_counts == expected
+    assert counter.keyword_count('NOSUCHKEYWORD') == 0  # unseen keyword -> 0
+
+    names = ['foo', 'bar', 'baz']
+    name_counts = [counter.name_count(x) for x in names]
+    assert name_counts == [3, 2, 2]  # foo x3, bar x2, baz x2 in the SQL above
author	Darik Gamble <darik.gamble.spam@gmail.com>	2015-11-01 17:25:59 -0500
committer	Darik Gamble <darik.gamble.spam@gmail.com>	2015-11-08 15:54:15 -0500
commit	9c97d35606862737bb5bd65ecf696c5d6653ebef (patch)
tree	76ea4353b837854111e7e4e0ac28302888090d5e
parent	f7aef6ecacd0dc4c1343d3aab534c12aa50cd51d (diff)