summaryrefslogtreecommitdiffstats
path: root/bin
diff options
context:
space:
mode:
authorHarel Ben-Attia <harelba@gmail.com>2014-12-10 19:45:44 -0500
committerHarel Ben-Attia <harelba@gmail.com>2014-12-10 19:47:59 -0500
commit2147ccc0174a62ada6e0381f27e08ec08a3ba897 (patch)
treef3c0e6416621129d6b5c1eae3ebbaf537ff0038f /bin
parent9e31efe6bbf6ca19979482afc801d69caf343bc4 (diff)
Multiple queries on CLI + Reuse of loaded data + Full code refactoring (full python API) + RPM compiles q to binary
Included changes: pick 8b4342d Refactored q to expose objects, and added a python package wrapper pick b0d28b1 Internalized exception handling into the q main object pick faf3904 Added reuse of loaded data + stdin injection to module API + tests pick 08cf2bb Allow different input params for each loaded file + loading file manually + fixed modeling of query encoding + tests pick 3a7fffd Separation of populate phases pick 92d0bb6 Provide table structure as part of responses + merge _populate method versions pick d04a24c Multiple queries on CLI + load data from string API + finer details in table structure response + tests pick d90c392 modified non existent file error msg + fixed response bug in table_structure when there are errors pick 5908a75 Fixed bug in str of the response pick a9dcc74 New RPM creation which matches commit hash to version, and compiles q pick 90fd18a Fixed permissions for new RPM.
Diffstat (limited to 'bin')
-rwxr-xr-xbin/__init__.py2
-rwxr-xr-xbin/q1208
l---------bin/qtextasdata.py1
3 files changed, 815 insertions, 396 deletions
diff --git a/bin/__init__.py b/bin/__init__.py
new file mode 100755
index 0000000..cf529d7
--- /dev/null
+++ b/bin/__init__.py
@@ -0,0 +1,2 @@
+#!/usr/bin/env python
+
diff --git a/bin/q b/bin/q
index 75c52b9..a77f576 100755
--- a/bin/q
+++ b/bin/q
@@ -29,6 +29,8 @@
#
q_version = "1.5.0" # not released yet
+__all__ = [ 'QTextAsData' ]
+
import os
import sys
import sqlite3
@@ -44,6 +46,8 @@ from ConfigParser import ConfigParser
import traceback
import csv
import hashlib
+import uuid
+import cStringIO
DEBUG = False
@@ -58,121 +62,6 @@ def get_stdout_encoding(encoding_override=None):
SHOW_SQL = False
-p = ConfigParser()
-p.read([os.path.expanduser('~/.qrc'), '.qrc'])
-
-
-def get_option_with_default(p, option_type, option, default):
- if not p.has_option('options', option):
- return default
- if option_type == 'boolean':
- return p.getboolean('options', option)
- elif option_type == 'int':
- return p.getint('options', option)
- elif option_type == 'string':
- return p.get('options', option)
- elif option_type == 'escaped_string':
- return p.get('options', option).decode('string-escape')
- else:
- raise Exception("Unknown option type")
-
-default_beautify = get_option_with_default(p, 'boolean', 'beautify', False)
-default_gzipped = get_option_with_default(p, 'boolean', 'gzipped', False)
-default_delimiter = get_option_with_default(
- p, 'escaped_string', 'delimiter', None)
-default_output_delimiter = get_option_with_default(
- p, 'escaped_string', 'output_delimiter', None)
-default_skip_header = get_option_with_default(p, 'int', 'skip_header', 0)
-default_formatting = get_option_with_default(p, 'string', 'formatting', None)
-default_encoding = get_option_with_default(p, 'string', 'encoding', 'UTF-8')
-default_output_encoding = get_option_with_default(p, 'string', 'encoding', None)
-default_query_encoding = get_option_with_default(p, 'string', 'query_encoding', locale.getpreferredencoding())
-default_output_header = get_option_with_default(p, 'string', 'output_header', False)
-
-parser = OptionParser(usage="""
- q allows performing SQL-like statements on tabular text data.
-
- Its purpose is to bring SQL expressive power to manipulating text data using the Linux command line.
-
- Basic usage is q "<sql like query>" where table names are just regular file names (Use - to read from standard input)
- When the input contains a header row, use -H, and column names will be set according to the header row content. If there isn't a header row, then columns will automatically be named c1..cN.
-
- Column types are detected automatically. Use -A in order to see the column name/type analysis.
-
- Delimiter can be set using the -d (or -t) option. Output delimiter can be set using -D
-
- All sqlite3 SQL constructs are supported.
-
- Examples:
-
- Example 1: ls -ltrd * | q "select c1,count(1) from - group by c1"
- This example would print a count of each unique permission string in the current folder.
-
- Example 2: seq 1 1000 | q "select avg(c1),sum(c1) from -"
- This example would provide the average and the sum of the numbers in the range 1 to 1000
-
- Example 3: sudo find /tmp -ls | q "select c5,c6,sum(c7)/1024.0/1024 as total from - group by c5,c6 order by total desc"
- This example will output the total size in MB per user+group in the /tmp subtree
-
-
- See the help or https://github.com/harelba/q/ for more details.
-""")
-
-#-----------------------------------------------
-parser.add_option("-v", "--version", dest="version", default=False, action="store_true",
- help="Print version")
-#-----------------------------------------------
-input_data_option_group = OptionGroup(parser,"Input Data Options")
-input_data_option_group.add_option("-H", "--skip-header", dest="skip_header", default=default_skip_header, action="store_true",
- help="Skip header row. This has been changed from earlier version - Only one header row is supported, and the header row is used for column naming")
-input_data_option_group.add_option("-d", "--delimiter", dest="delimiter", default=default_delimiter,
- help="Field delimiter. If none specified, then space is used as the delimiter.")
-input_data_option_group.add_option("-t", "--tab-delimited", dest="tab_delimited", default=False, action="store_true",
- help="Same as -d <tab>. Just a shorthand for handling standard tab delimited file You can use $'\\t' if you want (this is how Linux expects to provide tabs in the command line")
-input_data_option_group.add_option("-e", "--encoding", dest="encoding", default=default_encoding,
- help="Input file encoding. Defaults to UTF-8. set to none for not setting any encoding - faster, but at your own risk...")
-input_data_option_group.add_option("-z", "--gzipped", dest="gzipped", default=default_gzipped, action="store_true",
- help="Data is gzipped. Useful for reading from stdin. For files, .gz means automatic gunzipping")
-input_data_option_group.add_option("-A", "--analyze-only", dest="analyze_only", action='store_true',
- help="Analyze sample input and provide information about data types")
-input_data_option_group.add_option("-m", "--mode", dest="mode", default="relaxed",
- help="Data parsing mode. fluffy, relaxed and strict. In strict mode, the -c column-count parameter must be supplied as well")
-input_data_option_group.add_option("-c", "--column-count", dest="column_count", default=None,
- help="Specific column count when using relaxed or strict mode")
-input_data_option_group.add_option("-k", "--keep-leading-whitespace", dest="keep_leading_whitespace_in_values", default=False, action="store_true",
- help="Keep leading whitespace in values. Default behavior strips leading whitespace off values, in order to provide out-of-the-box usability for simple use cases. If you need to preserve whitespace, use this flag.")
-input_data_option_group.add_option("--disable-double-double-quoting", dest="disable_double_double_quoting", default=True, action="store_false",
- help="Disable support for double double-quoting for escaping the double quote character. By default, you can use \"\" inside double quoted fields to escape double quotes. Mainly for backward compatibility.")
-input_data_option_group.add_option("--disable-escaped-double-quoting", dest="disable_escaped_double_quoting", default=True, action="store_false",
- help="Disable support for escaped double-quoting for escaping the double quote character. By default, you can use \\\" inside double quoted fields to escape double quotes. Mainly for backward compatibility.")
-input_data_option_group.add_option("-w","--input-quoting-mode",dest="input_quoting_mode",default="minimal",
- help="Input quoting mode. Possible values are all, minimal and none. Note the slightly misleading parameter name, and see the matching -W parameter for output quoting.")
-parser.add_option_group(input_data_option_group)
-#-----------------------------------------------
-output_data_option_group = OptionGroup(parser,"Output Options")
-output_data_option_group.add_option("-D", "--output-delimiter", dest="output_delimiter", default=default_output_delimiter,
- help="Field delimiter for output. If none specified, then the -d delimiter is used if present, or space if no delimiter is specified")
-output_data_option_group.add_option("-T", "--tab-delimited-output", dest="tab_delimited_output", default=False, action="store_true",
- help="Same as -D <tab>. Just a shorthand for outputing tab delimited output. You can use -D $'\\t' if you want.")
-output_data_option_group.add_option("-O", "--output-header", dest="output_header", default=default_output_header, action="store_true",help="Output header line. Output column-names are determined from the query itself. Use column aliases in order to set your column names in the query. For example, 'select name FirstName,value1/value2 MyCalculation from ...'. This can be used even if there was no header in the input.")
-output_data_option_group.add_option("-b", "--beautify", dest="beautify", default=default_beautify, action="store_true",
- help="Beautify output according to actual values. Might be slow...")
-output_data_option_group.add_option("-f", "--formatting", dest="formatting", default=default_formatting,
- help="Output-level formatting, in the format X=fmt,Y=fmt etc, where X,Y are output column numbers (e.g. 1 for first SELECT column etc.")
-output_data_option_group.add_option("-E", "--output-encoding", dest="output_encoding", default=default_output_encoding,
- help="Output encoding. Defaults to 'none', leading to selecting the system/terminal encoding")
-output_data_option_group.add_option("-W","--output-quoting-mode",dest="output_quoting_mode",default="minimal",
- help="Output quoting mode. Possible values are all, minimal, nonnumeric and none. Note the slightly misleading parameter name, and see the matching -w parameter for input quoting.")
-parser.add_option_group(output_data_option_group)
-#-----------------------------------------------
-query_option_group = OptionGroup(parser,"Query Related Options")
-query_option_group.add_option("-q", "--query-filename", dest="query_filename", default=None,
- help="Read query from the provided filename instead of the command line, possibly using the provided query encoding (using -Q).")
-query_option_group.add_option("-Q", "--query-encoding", dest="query_encoding", default=default_query_encoding,
- help="query text encoding. Experimental. Please send your feedback on this")
-parser.add_option_group(query_option_group)
-#-----------------------------------------------
-
def sha1(data):
if not isinstance(data,str) and not isinstance(data,unicode):
return hashlib.sha1(str(data)).hexdigest()
@@ -307,12 +196,25 @@ class BadHeaderException(Exception):
def __str(self):
return repr(self.msg)
+class EncodedQueryException(Exception):
+
+ def __init__(self, msg):
+ self.msg = msg
+
+ def __str(self):
+ return repr(self.msg)
+
class CannotUnzipStdInException(Exception):
def __init__(self):
pass
+class UnprovidedStdInException(Exception):
+
+ def __init__(self):
+ pass
+
class EmptyDataException(Exception):
def __init__(self):
@@ -336,6 +238,19 @@ class ColumnCountMismatchException(Exception):
def __str(self):
return repr(self.msg)
+class StrictModeColumnCountMismatchException(Exception):
+
+ def __init__(self,expected_col_count,actual_col_count):
+ self.expected_col_count = expected_col_count
+ self.actual_col_count = actual_col_count
+
+class FluffyModeColumnCountMismatchException(Exception):
+
+ def __init__(self,expected_col_count,actual_col_count):
+ self.expected_col_count = expected_col_count
+ self.actual_col_count = actual_col_count
+
+
# Simplistic Sql "parsing" class... We'll eventually require a real SQL parser which will provide us with a parse tree
#
# A "qtable" is a filename which behaves like an SQL table...
@@ -688,10 +603,34 @@ def normalized_filename(filename):
else:
return filename
+class TableCreatorState(object):
+ NEW = 'NEW'
+ INITIALIZED = 'INITIALIZED'
+ ANALYZED = 'ANALYZED'
+ FULLY_READ = 'FULLY_READ'
+
+class MaterializedFileState(object):
+ def __init__(self,filename,f,encoding,dialect,is_stdin):
+ self.filename = filename
+ self.lines_read = 0
+ self.f = f
+ self.encoding = encoding
+ self.dialect = dialect
+ self.is_stdin = is_stdin
+
+ def read_file_using_csv(self):
+ csv_reader = encoded_csv_reader(self.encoding, self.f, dialect=self.dialect)
+ for col_vals in csv_reader:
+ self.lines_read += 1
+ yield col_vals
+
+ def close(self):
+ if self.f != sys.stdin:
+ self.f.close()
class TableCreator(object):
- def __init__(self, db, filenames_str, line_splitter, skip_header=False, gzipped=False, encoding='UTF-8', mode='fluffy', expected_column_count=None, input_delimiter=None):
+ def __init__(self, db, filenames_str, line_splitter, skip_header=False, gzipped=False, encoding='UTF-8', mode='fluffy', expected_column_count=None, input_delimiter=None,stdin_file=None,stdin_filename='-'):
self.db = db
self.filenames_str = filenames_str
self.skip_header = skip_header
@@ -702,6 +641,9 @@ class TableCreator(object):
self.mode = mode
self.expected_column_count = expected_column_count
self.input_delimiter = input_delimiter
+ self.stdin_file = stdin_file
+ self.stdin_filename = stdin_filename
+
self.column_inferer = TableColumnInferer(
mode, expected_column_count, input_delimiter, skip_header)
@@ -716,45 +658,109 @@ class TableCreator(object):
# so column inferer can do its work before this information is needed
self.numeric_column_indices = None
- def get_table_name(self):
- return self.table_name
+ self.materialized_file_list = self.materialize_file_list()
+ self.materialized_file_dict = {}
+
+ self.state = TableCreatorState.NEW
+
+ def materialize_file_list(self):
+ materialized_file_list = []
- def populate(self, analyze_only=False):
# Get the list of filenames
filenames = self.filenames_str.split("+")
+
# for each filename (or pattern)
for fileglob in filenames:
# Allow either stdin or a glob match
- if fileglob == '-':
- files_to_go_over = ['-']
+ if fileglob == self.stdin_filename:
+ materialized_file_list.append(self.stdin_filename)
else:
- files_to_go_over = glob.glob(fileglob)
-
- # If there are no files to go over,
- if len(files_to_go_over) == 0:
- raise FileNotFoundException(
- "File %s has not been found" % fileglob)
-
- # For each match
- for filename in files_to_go_over:
- self.current_filename = filename
- self.lines_read = 0
-
- # Check if it's standard input or a file
- if filename == '-':
- f = sys.stdin
- if self.gzipped:
- raise CannotUnzipStdInException()
- else:
- if self.gzipped or filename.endswith('.gz'):
- f = gzip.GzipFile(fileobj=file(filename,'rb'))
- else:
- f = file(filename,'rb')
+ materialized_file_list += glob.glob(fileglob)
+
+ # If there are no files to go over,
+ if len(materialized_file_list) == 0:
+ raise FileNotFoundException(
+ "No files matching '%s' have been found" % self.filenames_str)
+
+ return materialized_file_list
+
+ def get_table_name(self):
+ return self.table_name
+
+ def open_file(self,filename):
+ # Check if it's standard input or a file
+ if filename == self.stdin_filename:
+ if self.stdin_file is None:
+ raise UnprovidedStdInException()
+ f = self.stdin_file
+ if self.gzipped:
+ raise CannotUnzipStdInException()
+ else:
+ if self.gzipped or filename.endswith('.gz'):
+ f = gzip.GzipFile(fileobj=file(filename,'rb'))
+ else:
+ f = file(filename,'rb')
+ return f
+
+ def _pre_populate(self,dialect):
+ # For each match
+ for filename in self.materialized_file_list:
+ if filename in self.materialized_file_dict.keys():
+ continue
+
+ f = self.open_file(filename)
+
+ is_stdin = filename == self.stdin_filename
- self.read_file_using_csv(f, analyze_only)
- if not self.table_created:
- self.column_inferer.force_analysis()
- self._do_create_table()
+ mfs = MaterializedFileState(filename,f,self.encoding,dialect,is_stdin)
+ self.materialized_file_dict[filename] = mfs
+
+ def _populate(self,dialect,stop_after_analysis=False):
+ # For each match
+ for filename in self.materialized_file_list:
+ mfs = self.materialized_file_dict[filename]
+
+ try:
+ try:
+ for col_vals in mfs.read_file_using_csv():
+ self._insert_row(col_vals)
+ if stop_after_analysis and self.column_inferer.inferred:
+ return
+ if mfs.lines_read == 0 or (mfs.lines_read == 1 and self.skip_header):
+ raise EmptyDataException()
+ except StrictModeColumnCountMismatchException,e:
+ raise ColumnCountMismatchException(
+ 'Strict mode - Expected %s columns instead of %s columns in file %s row %s. Either use relaxed/fluffy modes or check your delimiter' % (
+ e.expected_col_count, e.actual_col_count, normalized_filename(mfs.filename), mfs.lines_read))
+ except FluffyModeColumnCountMismatchException,e:
+ raise ColumnCountMismatchException(
+ 'Deprecated fluffy mode - Too many columns in file %s row %s (%s fields instead of %s fields). Consider moving to either relaxed or strict mode' % (
+ normalized_filename(mfs.filename), mfs.lines_read, e.actual_col_count, e.expected_col_count))
+ finally:
+ if not stop_after_analysis:
+ mfs.close()
+ self._flush_inserts()
+
+ if not self.table_created:
+ self.column_inferer.force_analysis()
+ self._do_create_table()
+
+ def populate(self,dialect,stop_after_analysis=False):
+ if self.state == TableCreatorState.NEW:
+ self._pre_populate(dialect)
+ self.state = TableCreatorState.INITIALIZED
+
+ if self.state == TableCreatorState.INITIALIZED:
+ self._populate(dialect,stop_after_analysis=True)
+ self.state = TableCreatorState.ANALYZED
+
+ if stop_after_analysis:
+ return
+
+ if self.state == TableCreatorState.ANALYZED:
+ self._populate(dialect,stop_after_analysis=False)
+ self.state = TableCreatorState.FULLY_READ
+ return
def _flush_pre_creation_rows(self):
for i, col_vals in enumerate(self.pre_creation_rows):
@@ -765,21 +771,6 @@ class TableCreator(object):
self._flush_inserts()
self.pre_creation_rows = []
- def read_file_using_csv(self, f, analyze_only):
- csv_reader = encoded_csv_reader(self.encoding, f, dialect='q')
- try:
- for col_vals in csv_reader:
- self.lines_read += 1
- self._insert_row(col_vals)
- if analyze_only and self.column_inferer.inferred:
- return
- if self.lines_read == 0 or (self.lines_read == 1 and self.skip_header):
- raise EmptyDataException()
- finally:
- if f != sys.stdin:
- f.close()
- self._flush_inserts()
-
def _insert_row(self, col_vals):
# If table has not been created yet
if not self.table_created:
@@ -823,8 +814,7 @@ class TableCreator(object):
actual_col_count = len(col_vals)
if self.mode == 'strict':
if actual_col_count != expected_col_count:
- raise ColumnCountMismatchException('Strict mode - Expected %s columns instead of %s columns in file %s row %s. Either use relaxed/fluffy modes or check your delimiter' % (
- expected_col_count, actual_col_count, normalized_filename(self.current_filename), self.lines_read))
+ raise StrictModeColumnCountMismatchException(expected_col_count,actual_col_count)
return col_vals
# in all non strict mode, we add dummy data to missing columns
@@ -845,8 +835,7 @@ class TableCreator(object):
if self.mode == 'fluffy':
if actual_col_count > expected_col_count:
- raise ColumnCountMismatchException('Deprecated fluffy mode - Too many columns in file %s row %s (%s fields instead of %s fields). Consider moving to either relaxed or strict mode' % (
- normalized_filename(self.current_filename), self.lines_read, actual_col_count, expected_col_count))
+ raise FluffyModeColumnCountMismatchException(expected_col_count,actual_col_count)
return col_vals
raise Exception("Unidentified parsing mode %s" % self.mode)
@@ -872,11 +861,12 @@ class TableCreator(object):
if not self.table_created:
return
- insert_row_stmt = self.db.generate_insert_row(
- self.table_name, self.buffered_inserts[0][0])
- params = [col_vals for col_names, col_vals in self.buffered_inserts]
+ if len(self.buffered_inserts) > 0:
+ insert_row_stmt = self.db.generate_insert_row(
+ self.table_name, self.buffered_inserts[0][0])
+ params = [col_vals for col_names, col_vals in self.buffered_inserts]
- self.db.update_many(insert_row_stmt, params)
+ self.db.update_many(insert_row_stmt, params)
# print self.db.execute_and_fetch(self.db.generate_end_transaction())
self.buffered_inserts = []
@@ -928,82 +918,317 @@ def print_credentials():
print >>sys.stderr,"http://harelba.github.io/q/"
print >>sys.stderr
-(options, args) = parser.parse_args()
+class QWarning(object):
+ def __init__(self,exception,msg):
+ self.exception = exception
+ self.msg = msg
-if options.version:
- print_credentials()
- sys.exit(0)
+class QError(object):
+ def __init__(self,exception,msg,errorcode):
+ self.exception = exception
+ self.msg = msg
+ self.errorcode = errorcode
+ self.traceback = traceback.format_exc()
-if len(args) > 1:
- print >>sys.stderr,"Must provide query as one parameter, enclosed in quotes, or through a file with the -f parameter"
- sys.exit(1)
+class QDataLoad(object):
+ def __init__(self,filename,start_time,end_time):
+ self.filename = filename
+ self.start_time = start_time
+ self.end_time = end_time
-if len(args) == 0 and options.query_filename is None:
- print_credentials()
- print >>sys.stderr,"Must provide a query in the command line, or through the a file with the -f parameter"
- sys.exit(1)
+ def duration(self):
+ return self.end_time - self.start_time
-if options.query_filename is not None:
- if len(args) != 0:
- print >>sys.stderr,"Can't provide both a query file and a query on the command line"
- sys.exit(1)
- try:
- f = file(options.query_filename)
- query_str = f.read()
- f.close()
- except:
- print >>sys.stderr,"Could not read query from file %s" % options.query_filename
- sys.exit(1)
-else:
- query_str = args[0]
+ def __str__(self):
+ return "DataLoad<'%s' at %s (took %4.3f seconds)>" % (self.filename,self.start_time,self.duration())
+ __repr__ = __str__
-if options.query_encoding is not None and options.query_encoding != 'none':
- try:
- query_str = query_str.decode(options.query_encoding)
- except:
- print >>sys.stderr,"Could not decode query using the provided query encoding (%s)" % options.query_encoding
- sys.exit(3)
+class QMaterializedFile(object):
+ def __init__(self,filename,is_stdin):
+ self.filename = filename
+ self.is_stdin = is_stdin
-query_str = query_str.strip()
+ def __str__(self):
+ return "QMaterializedFile<filename=%s,is_stdin=%s>" % (self.filename,self.is_stdin)
+ __repr__ = __str__
-if len(query_str) == 0:
- print >>sys.stderr,"Query cannot be empty"
- sys.exit(1)
+class QTableStructure(object):
+ def __init__(self,filenames_str,materialized_files,column_names,column_types):
+ self.filenames_str = filenames_str
+ self.materialized_files = materialized_files
+ self.column_names = column_names
+ self.column_types = column_types
+
+ def __str__(self):
+ return "QTableStructure<filenames_str=%s,materialized_file_count=%s,column_names=%s,column_types=%s>" % (
+ self.filenames_str,len(self.materialized_files.keys()),self.column_names,self.column_types)
+ __repr__ = __str__
+
+class QMetadata(object):
+ def __init__(self,table_structures=[],output_column_name_list=None,data_loads=[]):
+ self.table_structures = table_structures
+ self.output_column_name_list = output_column_name_list
+ self.data_loads = data_loads
+
+ def __str__(self):
+ return "QMetadata<table_count=%s,output_column_name_list=%s,data_load_count=%s" % (
+ len(self.table_structures),self.output_column_name_list,len(self.data_loads))
+ __repr__ = __str__
+
+class QOutput(object):
+ def __init__(self,data=None,metadata=None,warnings=[],error=None):
+ self.data = data
+ self.metadata = metadata
+
+ self.warnings = warnings
+ self.error = error
+ if error is None:
+ self.status = 'ok'
+ else:
+ self.status = 'error'
+
+ def __str__(self):
+ s = []
+ s.append('status=%s' % self.status)
+ if self.error is not None:
+ s.append("error=%s" % self.error.msg)
+ if len(self.warnings) > 0:
+ s.append("warning_count=%s" % len(self.warnings))
+ if self.data is not None:
+ s.append("row_count=%s" % len(self.data))
+ else:
+ s.append("row_count=None")
+ if self.metadata is not None:
+ s.append("metadata=<%s>" % self.metadata)
+ else:
+ s.append("metadata=None")
+ return "QOutput<%s>" % ",".join(s)
+ __repr__ = __str__
+
+class QInputParams(object):
+ def __init__(self,skip_header=False,
+ delimiter=' ',input_encoding='UTF-8',gzipped_input=False,parsing_mode='relaxed',
+ expected_column_count=None,keep_leading_whitespace_in_values=False,
+ disable_double_double_quoting=False,disable_escaped_double_quoting=False,
+ input_quoting_mode='minimal',stdin_file=None,stdin_filename='-'):
+ self.skip_header = skip_header
+ self.delimiter = delimiter
+ self.input_encoding = input_encoding
+ self.gzipped_input = gzipped_input
+ self.parsing_mode = parsing_mode
+ self.expected_column_count = expected_column_count
+ self.keep_leading_whitespace_in_values = keep_leading_whitespace_in_values
+ self.disable_double_double_quoting = disable_double_double_quoting
+ self.disable_escaped_double_quoting = disable_escaped_double_quoting
+ self.input_quoting_mode = input_quoting_mode
+
+ def merged_with(self,input_params):
+ params = QInputParams(**self.__dict__)
+ if input_params is not None:
+ params.__dict__.update(**input_params.__dict__)
+ return params
+
+ def __str__(self):
+ return "QInputParams<%s>" % str(self.__dict__)
+
+ def __repr__(self):
+ return "QInputParams(...)"
+
+class QTextAsData(object):
+ def __init__(self,default_input_params=QInputParams()):
+ self.default_input_params = default_input_params
+
+ self.table_creators = {}
+
+ # Create DB object
+ self.db = Sqlite3DB()
+
+
+ input_quoting_modes = { 'minimal' : csv.QUOTE_MINIMAL,
+ 'all' : csv.QUOTE_ALL,
+ # nonnumeric is not supported for input quoting modes, since we determine the data types
+ # ourselves instead of letting the csv module try to identify the types
+ 'none' : csv.QUOTE_NONE }
-if options.mode not in ['fluffy', 'relaxed', 'strict']:
- print >>sys.stderr, "Parsing mode can be one of fluffy, relaxed or strict"
- sys.exit(13)
+ def determine_proper_dialect(self,input_params):
-output_encoding = get_stdout_encoding(options.output_encoding)
-try:
- STDOUT = codecs.getwriter(output_encoding)(sys.stdout)
-except:
- print >>sys.stderr,"Could not create output stream using output encoding %s" % (output_encoding)
- sys.exit(200)
+ input_quoting_mode_csv_numeral = QTextAsData.input_quoting_modes[input_params.input_quoting_mode]
-# Create DB object
-db = Sqlite3DB()
+ if input_params.keep_leading_whitespace_in_values:
+ skip_initial_space = False
+ else:
+ skip_initial_space = True
+
+ dialect = {'skipinitialspace': skip_initial_space,
+ 'delimiter': input_params.delimiter, 'quotechar': '"' }
+ dialect['quoting'] = input_quoting_mode_csv_numeral
+ dialect['doublequote'] = input_params.disable_double_double_quoting
+
+ if input_params.disable_escaped_double_quoting:
+ dialect['escapechar'] = '\\'
+
+ return dialect
+
+ def get_dialect_id(self,filename):
+ return 'q_dialect_%s' % filename
-# Create SQL statment
-sql_object = Sql('%s' % query_str)
+ def _load_data(self,filename,input_params=QInputParams(),stdin_file=None,stdin_filename='-',stop_after_analysis=False):
+ start_time = time.time()
+
+ q_dialect = self.determine_proper_dialect(input_params)
+ dialect_id = self.get_dialect_id(filename)
+ csv.register_dialect(dialect_id, **q_dialect)
-# If the user flagged for a tab-delimited file then set the delimiter to tab
-if options.tab_delimited:
- options.delimiter = '\t'
+ # Create a line splitter
+ line_splitter = LineSplitter(input_params.delimiter, input_params.expected_column_count)
-if options.tab_delimited_output:
- options.output_delimiter = '\t'
+ # reuse already loaded data, except for stdin file data (stdin file data will always
+ # be reloaded and overwritten)
+ if filename in self.table_creators.keys() and filename != stdin_filename:
+ return None
+
+ # Create the matching database table and populate it
+ table_creator = TableCreator(
+ self.db, filename, line_splitter, input_params.skip_header, input_params.gzipped_input, input_params.input_encoding,
+ mode=input_params.parsing_mode, expected_column_count=input_params.expected_column_count,
+ input_delimiter=input_params.delimiter,stdin_file = stdin_file,stdin_filename = stdin_filename)
-if options.delimiter is None:
- options.delimiter = ' '
-elif len(options.delimiter) != 1:
- print >>sys.stderr, "Delimiter must be one character only"
- sys.exit(5)
+ table_creator.populate(dialect_id,stop_after_analysis)
-if options.keep_leading_whitespace_in_values:
- skip_initial_space = False
-else:
- skip_initial_space = True
+ self.table_creators[filename] = table_creator
+
+ return QDataLoad(filename,start_time,time.time())
+
+ def load_data(self,filename,input_params=QInputParams(),stop_after_analysis=False):
+ self._load_data(filename,input_params,stop_after_analysis=stop_after_analysis)
+
+ def load_data_from_string(self,filename,str_data,input_params=QInputParams(),stop_after_analysis=False):
+ sf = cStringIO.StringIO(str_data)
+ try:
+ self._load_data(filename,input_params,stdin_file=sf,stdin_filename=filename,stop_after_analysis=stop_after_analysis)
+ finally:
+ if sf is not None:
+ sf.close()
+
+ def _ensure_data_is_loaded(self,sql_object,input_params,stdin_file,stdin_filename='-',stop_after_analysis=False):
+ data_loads = []
+
+ # Get each "table name" which is actually the file name
+ for filename in sql_object.qtable_names:
+ data_load = self._load_data(filename,input_params,stdin_file=stdin_file,stdin_filename=stdin_filename,stop_after_analysis=stop_after_analysis)
+ if data_load is not None:
+ data_loads.append(data_load)
+
+ return data_loads
+
+ def materialize_sql_object(self,sql_object):
+ for filename in sql_object.qtable_names:
+ sql_object.set_effective_table_name(filename,self.table_creators[filename].table_name)
+
+ def _execute(self,query_str,input_params=None,stdin_file=None,stdin_filename='-',stop_after_analysis=False):
+ warnings = []
+ error = None
+ data_loads = []
+ table_structures = []
+
+ db_results_obj = None
+
+ effective_input_params = self.default_input_params.merged_with(input_params)
+
+ if type(query_str) != unicode:
+ try:
+                # Heuristic attempt to auto convert the query to unicode before failing
+ query_str = query_str.decode('utf-8')
+ except:
+ error = QError(EncodedQueryException(),"Query should be in unicode. Please make sure to provide a unicode literal string or decode it using proper the character encoding.",91)
+ return QOutput(error = error)
+
+ # Create SQL statment
+ sql_object = Sql('%s' % query_str)
+
+ try:
+ data_loads += self._ensure_data_is_loaded(sql_object,effective_input_params,stdin_file=stdin_file,stdin_filename=stdin_filename,stop_after_analysis=stop_after_analysis)
+
+ table_structures = self._create_table_structures_list()
+
+ self.materialize_sql_object(sql_object)
+
+ # Execute the query and fetch the data
+ db_results_obj = sql_object.execute_and_fetch(self.db)
+
+ return QOutput(
+ data = db_results_obj.results,
+ metadata = QMetadata(
+ table_structures=table_structures,
+ output_column_name_list=db_results_obj.query_column_names,
+ data_loads=data_loads),
+ warnings = warnings,
+ error = error)
+
+ except EmptyDataException,e:
+ warnings.append(QWarning(e,"Warning - data is empty"))
+ except FileNotFoundException, e:
+ error = QError(e,e.msg,30)
+ except sqlite3.OperationalError, e:
+ msg = str(e)
+ error = QError(e,"query error: %s" % msg,1)
+ if "no such column" in msg and effective_input_params.skip_header:
+ warnings.append(QWarning(e,'Warning - There seems to be a "no such column" error, and -H (header line) exists. Please make sure that you are using the column names from the header line and not the default (cXX) column names'))
+ except ColumnCountMismatchException, e:
+ error = QError(e,e.msg,2)
+ except (UnicodeDecodeError, UnicodeError), e:
+ error = QError(e,"Cannot decode data. Try to change the encoding by setting it using the -e parameter. Error:%s" % e,3)
+ except BadHeaderException, e:
+ error = QError(e,"Bad header row: %s" % e.msg,35)
+ except CannotUnzipStdInException,e:
+ error = QError(e,"Cannot decompress standard input. Pipe the input through zcat in order to decompress.",36)
+ except UnprovidedStdInException,e:
+ error = QError(e,"Standard Input must be provided in order to use it as a table",61)
+ except CouldNotConvertStringToNumericValueException,e:
+ error = QError(e,"Could not convert string to a numeric value. Did you use `-w nonnumeric` with unquoted string values? Error: %s" % e.msg,58)
+ except CouldNotParseInputException,e:
+ error = QError(e,"Could not parse the input. Please make sure to set the proper -w input-wrapping parameter for your input, and that you use the proper input encoding (-e). Error: %s" % e.msg,59)
+ except KeyboardInterrupt,e:
+ warnings.append(QWarning(e,"Interrupted"))
+ except Exception, e:
+ error = QError(e,repr(e),199)
+
+ return QOutput(warnings = warnings,error = error , metadata=QMetadata(table_structures=table_structures,data_loads = data_loads))
+
+ def execute(self,query_str,input_params=None,stdin_file=None,stdin_filename='-'):
+ return self._execute(query_str,input_params,stdin_file,stdin_filename,stop_after_analysis=False)
+
+ def unload(self):
+
+ for filename,table_creator in self.table_creators.iteritems():
+ try:
+ table_creator.drop_table()
+ except:
+ # Support no-table select queries
+ pass
+ self.table_creators = {}
+
+ def _create_materialized_files(self,table_creator):