diff options
-rw-r--r-- | .gitignore | 1 | ||||
-rwxr-xr-x | bin/__init__.py | 2 | ||||
-rwxr-xr-x | bin/q | 1208 | ||||
l--------- | bin/qtextasdata.py | 1 | ||||
-rwxr-xr-x | dist/create-rpm | 49 | ||||
-rw-r--r-- | dist/pre-requisites | 5 | ||||
-rw-r--r-- | dist/q-text-as-data.spec.template | 33 | ||||
-rwxr-xr-x | test/test-suite | 457 |
8 files changed, 1304 insertions, 452 deletions
@@ -1,3 +1,4 @@ build q.spec q.1 +*.pyc diff --git a/bin/__init__.py b/bin/__init__.py new file mode 100755 index 0000000..cf529d7 --- /dev/null +++ b/bin/__init__.py @@ -0,0 +1,2 @@ +#!/usr/bin/env python + @@ -29,6 +29,8 @@ # q_version = "1.5.0" # not released yet +__all__ = [ 'QTextAsData' ] + import os import sys import sqlite3 @@ -44,6 +46,8 @@ from ConfigParser import ConfigParser import traceback import csv import hashlib +import uuid +import cStringIO DEBUG = False @@ -58,121 +62,6 @@ def get_stdout_encoding(encoding_override=None): SHOW_SQL = False -p = ConfigParser() -p.read([os.path.expanduser('~/.qrc'), '.qrc']) - - -def get_option_with_default(p, option_type, option, default): - if not p.has_option('options', option): - return default - if option_type == 'boolean': - return p.getboolean('options', option) - elif option_type == 'int': - return p.getint('options', option) - elif option_type == 'string': - return p.get('options', option) - elif option_type == 'escaped_string': - return p.get('options', option).decode('string-escape') - else: - raise Exception("Unknown option type") - -default_beautify = get_option_with_default(p, 'boolean', 'beautify', False) -default_gzipped = get_option_with_default(p, 'boolean', 'gzipped', False) -default_delimiter = get_option_with_default( - p, 'escaped_string', 'delimiter', None) -default_output_delimiter = get_option_with_default( - p, 'escaped_string', 'output_delimiter', None) -default_skip_header = get_option_with_default(p, 'int', 'skip_header', 0) -default_formatting = get_option_with_default(p, 'string', 'formatting', None) -default_encoding = get_option_with_default(p, 'string', 'encoding', 'UTF-8') -default_output_encoding = get_option_with_default(p, 'string', 'encoding', None) -default_query_encoding = get_option_with_default(p, 'string', 'query_encoding', locale.getpreferredencoding()) -default_output_header = get_option_with_default(p, 'string', 'output_header', False) - -parser = OptionParser(usage=""" - q allows performing SQL-like statements on tabular text data. - - Its purpose is to bring SQL expressive power to manipulating text data using the Linux command line. - - Basic usage is q "<sql like query>" where table names are just regular file names (Use - to read from standard input) - When the input contains a header row, use -H, and column names will be set according to the header row content. If there isn't a header row, then columns will automatically be named c1..cN. - - Column types are detected automatically. Use -A in order to see the column name/type analysis. - - Delimiter can be set using the -d (or -t) option. Output delimiter can be set using -D - - All sqlite3 SQL constructs are supported. - - Examples: - - Example 1: ls -ltrd * | q "select c1,count(1) from - group by c1" - This example would print a count of each unique permission string in the current folder. - - Example 2: seq 1 1000 | q "select avg(c1),sum(c1) from -" - This example would provide the average and the sum of the numbers in the range 1 to 1000 - - Example 3: sudo find /tmp -ls | q "select c5,c6,sum(c7)/1024.0/1024 as total from - group by c5,c6 order by total desc" - This example will output the total size in MB per user+group in the /tmp subtree - - - See the help or https://github.com/harelba/q/ for more details. -""") - -#----------------------------------------------- -parser.add_option("-v", "--version", dest="version", default=False, action="store_true", - help="Print version") -#----------------------------------------------- -input_data_option_group = OptionGroup(parser,"Input Data Options") -input_data_option_group.add_option("-H", "--skip-header", dest="skip_header", default=default_skip_header, action="store_true", - help="Skip header row. This has been changed from earlier version - Only one header row is supported, and the header row is used for column naming") -input_data_option_group.add_option("-d", "--delimiter", dest="delimiter", default=default_delimiter, - help="Field delimiter. If none specified, then space is used as the delimiter.") -input_data_option_group.add_option("-t", "--tab-delimited", dest="tab_delimited", default=False, action="store_true", - help="Same as -d <tab>. Just a shorthand for handling standard tab delimited file You can use $'\\t' if you want (this is how Linux expects to provide tabs in the command line") -input_data_option_group.add_option("-e", "--encoding", dest="encoding", default=default_encoding, - help="Input file encoding. Defaults to UTF-8. set to none for not setting any encoding - faster, but at your own risk...") -input_data_option_group.add_option("-z", "--gzipped", dest="gzipped", default=default_gzipped, action="store_true", - help="Data is gzipped. Useful for reading from stdin. For files, .gz means automatic gunzipping") -input_data_option_group.add_option("-A", "--analyze-only", dest="analyze_only", action='store_true', - help="Analyze sample input and provide information about data types") -input_data_option_group.add_option("-m", "--mode", dest="mode", default="relaxed", - help="Data parsing mode. fluffy, relaxed and strict. In strict mode, the -c column-count parameter must be supplied as well") -input_data_option_group.add_option("-c", "--column-count", dest="column_count", default=None, - help="Specific column count when using relaxed or strict mode") -input_data_option_group.add_option("-k", "--keep-leading-whitespace", dest="keep_leading_whitespace_in_values", default=False, action="store_true", - help="Keep leading whitespace in values. Default behavior strips leading whitespace off values, in order to provide out-of-the-box usability for simple use cases. If you need to preserve whitespace, use this flag.") -input_data_option_group.add_option("--disable-double-double-quoting", dest="disable_double_double_quoting", default=True, action="store_false", - help="Disable support for double double-quoting for escaping the double quote character. By default, you can use \"\" inside double quoted fields to escape double quotes. Mainly for backward compatibility.") -input_data_option_group.add_option("--disable-escaped-double-quoting", dest="disable_escaped_double_quoting", default=True, action="store_false", - help="Disable support for escaped double-quoting for escaping the double quote character. By default, you can use \\\" inside double quoted fields to escape double quotes. Mainly for backward compatibility.") -input_data_option_group.add_option("-w","--input-quoting-mode",dest="input_quoting_mode",default="minimal", - help="Input quoting mode. Possible values are all, minimal and none. Note the slightly misleading parameter name, and see the matching -W parameter for output quoting.") -parser.add_option_group(input_data_option_group) -#----------------------------------------------- -output_data_option_group = OptionGroup(parser,"Output Options") -output_data_option_group.add_option("-D", "--output-delimiter", dest="output_delimiter", default=default_output_delimiter, - help="Field delimiter for output. If none specified, then the -d delimiter is used if present, or space if no delimiter is specified") -output_data_option_group.add_option("-T", "--tab-delimited-output", dest="tab_delimited_output", default=False, action="store_true", - help="Same as -D <tab>. Just a shorthand for outputing tab delimited output. You can use -D $'\\t' if you want.") -output_data_option_group.add_option("-O", "--output-header", dest="output_header", default=default_output_header, action="store_true",help="Output header line. Output column-names are determined from the query itself. Use column aliases in order to set your column names in the query. For example, 'select name FirstName,value1/value2 MyCalculation from ...'. This can be used even if there was no header in the input.") -output_data_option_group.add_option("-b", "--beautify", dest="beautify", default=default_beautify, action="store_true", - help="Beautify output according to actual values. Might be slow...") -output_data_option_group.add_option("-f", "--formatting", dest="formatting", default=default_formatting, - help="Output-level formatting, in the format X=fmt,Y=fmt etc, where X,Y are output column numbers (e.g. 1 for first SELECT column etc.") -output_data_option_group.add_option("-E", "--output-encoding", dest="output_encoding", default=default_output_encoding, - help="Output encoding. Defaults to 'none', leading to selecting the system/terminal encoding") -output_data_option_group.add_option("-W","--output-quoting-mode",dest="output_quoting_mode",default="minimal", - help="Output quoting mode. Possible values are all, minimal, nonnumeric and none. Note the slightly misleading parameter name, and see the matching -w parameter for input quoting.") -parser.add_option_group(output_data_option_group) -#----------------------------------------------- -query_option_group = OptionGroup(parser,"Query Related Options") -query_option_group.add_option("-q", "--query-filename", dest="query_filename", default=None, - help="Read query from the provided filename instead of the command line, possibly using the provided query encoding (using -Q).") -query_option_group.add_option("-Q", "--query-encoding", dest="query_encoding", default=default_query_encoding, - help="query text encoding. Experimental. Please send your feedback on this") -parser.add_option_group(query_option_group) -#----------------------------------------------- - def sha1(data): if not isinstance(data,str) and not isinstance(data,unicode): return hashlib.sha1(str(data)).hexdigest() @@ -307,12 +196,25 @@ class BadHeaderException(Exception): def __str(self): return repr(self.msg) +class EncodedQueryException(Exception): + + def __init__(self, msg): + self.msg = msg + + def __str(self): + return repr(self.msg) + class CannotUnzipStdInException(Exception): def __init__(self): pass +class UnprovidedStdInException(Exception): + + def __init__(self): + pass + class EmptyDataException(Exception): def __init__(self): @@ -336,6 +238,19 @@ class ColumnCountMismatchException(Exception): def __str(self): return repr(self.msg) +class StrictModeColumnCountMismatchException(Exception): + + def __init__(self,expected_col_count,actual_col_count): + self.expected_col_count = expected_col_count + self.actual_col_count = actual_col_count + +class FluffyModeColumnCountMismatchException(Exception): + + def __init__(self,expected_col_count,actual_col_count): + self.expected_col_count = expected_col_count + self.actual_col_count = actual_col_count + + # Simplistic Sql "parsing" class... We'll eventually require a real SQL parser which will provide us with a parse tree # # A "qtable" is a filename which behaves like an SQL table... @@ -688,10 +603,34 @@ def normalized_filename(filename): else: return filename +class TableCreatorState(object): + NEW = 'NEW' + INITIALIZED = 'INITIALIZED' + ANALYZED = 'ANALYZED' + FULLY_READ = 'FULLY_READ' + +class MaterializedFileState(object): + def __init__(self,filename,f,encoding,dialect,is_stdin): + self.filename = filename + self.lines_read = 0 + self.f = f + self.encoding = encoding + self.dialect = dialect + self.is_stdin = is_stdin + + def read_file_using_csv(self): + csv_reader = encoded_csv_reader(self.encoding, self.f, dialect=self.dialect) + for col_vals in csv_reader: + self.lines_read += 1 + yield col_vals + + def close(self): + if self.f != sys.stdin: + self.f.close() class TableCreator(object): - def __init__(self, db, filenames_str, line_splitter, skip_header=False, gzipped=False, encoding='UTF-8', mode='fluffy', expected_column_count=None, input_delimiter=None): + def __init__(self, db, filenames_str, line_splitter, skip_header=False, gzipped=False, encoding='UTF-8', mode='fluffy', expected_column_count=None, input_delimiter=None,stdin_file=None,stdin_filename='-'): self.db = db self.filenames_str = filenames_str self.skip_header = skip_header @@ -702,6 +641,9 @@ class TableCreator(object): self.mode = mode self.expected_column_count = expected_column_count self.input_delimiter = input_delimiter + self.stdin_file = stdin_file + self.stdin_filename = stdin_filename + self.column_inferer = TableColumnInferer( mode, expected_column_count, input_delimiter, skip_header) @@ -716,45 +658,109 @@ class TableCreator(object): # so column inferer can do its work before this information is needed self.numeric_column_indices = None - def get_table_name(self): - return self.table_name + self.materialized_file_list = self.materialize_file_list() + self.materialized_file_dict = {} + + self.state = TableCreatorState.NEW + + def materialize_file_list(self): + materialized_file_list = [] - def populate(self, analyze_only=False): # Get the list of filenames filenames = self.filenames_str.split("+") + # for each filename (or pattern) for fileglob in filenames: # Allow either stdin or a glob match - if fileglob == '-': - files_to_go_over = ['-'] + if fileglob == self.stdin_filename: + materialized_file_list.append(self.stdin_filename) else: - files_to_go_over = glob.glob(fileglob) - - # If there are no files to go over, - if len(files_to_go_over) == 0: - raise FileNotFoundException( - "File %s has not been found" % fileglob) - - # For each match - for filename in files_to_go_over: - self.current_filename = filename - self.lines_read = 0 - - # Check if it's standard input or a file - if filename == '-': - f = sys.stdin - if self.gzipped: - raise CannotUnzipStdInException() - else: - if self.gzipped or filename.endswith('.gz'): - f = gzip.GzipFile(fileobj=file(filename,'rb')) - else: - f = file(filename,'rb') + materialized_file_list += glob.glob(fileglob) + + # If there are no files to go over, + if len(materialized_file_list) == 0: + raise FileNotFoundException( + "No files matching '%s' have been found" % self.filenames_str) + + return materialized_file_list + + def get_table_name(self): + return self.table_name + + def open_file(self,filename): + # Check if it's standard input or a file + if filename == self.stdin_filename: + if self.stdin_file is None: + raise UnprovidedStdInException() + f = self.stdin_file + if self.gzipped: + raise CannotUnzipStdInException() + else: + if self.gzipped or filename.endswith('.gz'): + f = gzip.GzipFile(fileobj=file(filename,'rb')) + else: + f = file(filename,'rb') + return f + + def _pre_populate(self,dialect): + # For each match + for filename in self.materialized_file_list: + if filename in self.materialized_file_dict.keys(): + continue + + f = self.open_file(filename) + + is_stdin = filename == self.stdin_filename - self.read_file_using_csv(f, analyze_only) - if not self.table_created: - self.column_inferer.force_analysis() - self._do_create_table() + mfs = MaterializedFileState(filename,f,self.encoding,dialect,is_stdin) + self.materialized_file_dict[filename] = mfs + + def _populate(self,dialect,stop_after_analysis=False): + # For each match + for filename in self.materialized_file_list: + mfs = self.materialized_file_dict[filename] + + try: + try: + for col_vals in mfs.read_file_using_csv(): + self._insert_row(col_vals) + if stop_after_analysis and self.column_inferer.inferred: + return + if mfs.lines_read == 0 or (mfs.lines_read == 1 and self.skip_header): + raise EmptyDataException() + except StrictModeColumnCountMismatchException,e: + raise ColumnCountMismatchException( + 'Strict mode - Expected %s columns instead of %s columns in file %s row %s. Either use relaxed/fluffy modes or check your delimiter' % ( + e.expected_col_count, e.actual_col_count, normalized_filename(mfs.filename), mfs.lines_read)) + except FluffyModeColumnCountMismatchException,e: + raise ColumnCountMismatchException( + 'Deprecated fluffy mode - Too many columns in file %s row %s (%s fields instead of %s fields). Consider moving to either relaxed or strict mode' % ( + normalized_filename(mfs.filename), mfs.lines_read, e.actual_col_count, e.expected_col_count)) + finally: + if not stop_after_analysis: + mfs.close() + self._flush_inserts() + + if not self.table_created: + self.column_inferer.force_analysis() + self._do_create_table() + + def populate(self,dialect,stop_after_analysis=False): + if self.state == TableCreatorState.NEW: + self._pre_populate(dialect) + self.state = TableCreatorState.INITIALIZED + + if self.state == TableCreatorState.INITIALIZED: + self._populate(dialect,stop_after_analysis=True) + self.state = TableCreatorState.ANALYZED + + if stop_after_analysis: + return + + if self.state == TableCreatorState.ANALYZED: + self._populate(dialect,stop_after_analysis=False) + self.state = TableCreatorState.FULLY_READ + return def _flush_pre_creation_rows(self): for i, col_vals in enumerate(self.pre_creation_rows): @@ -765,21 +771,6 @@ class TableCreator(object): self._flush_inserts() self.pre_creation_rows = [] - def read_file_using_csv(self, f, analyze_only): - csv_reader = encoded_csv_reader(self.encoding, f, dialect='q') - try: - for col_vals in csv_reader: - self.lines_read += 1 - self._insert_row(col_vals) - if analyze_only and self.column_inferer.inferred: - return - if self.lines_read == 0 or (self.lines_read == 1 and self.skip_header): - raise EmptyDataException() - finally: - if f != sys.stdin: - f.close() - self._flush_inserts() - def _insert_row(self, col_vals): # If table has not been created yet if not self.table_created: @@ -823,8 +814,7 @@ class TableCreator(object): actual_col_count = len(col_vals) if self.mode == 'strict': if actual_col_count != expected_col_count: - raise ColumnCountMismatchException('Strict mode - Expected %s columns instead of %s columns in file %s row %s. Either use relaxed/fluffy modes or check your delimiter' % ( - expected_col_count, actual_col_count, normalized_filename(self.current_filename), self.lines_read)) + raise StrictModeColumnCountMismatchException(expected_col_count,actual_col_count) return col_vals # in all non strict mode, we add dummy data to missing columns @@ -845,8 +835,7 @@ class TableCreator(object): if self.mode == 'fluffy': if actual_col_count > expected_col_count: - raise ColumnCountMismatchException('Deprecated fluffy mode - Too many columns in file %s row %s (%s fields instead of %s fields). Consider moving to either relaxed or strict mode' % ( - normalized_filename(self.current_filename), self.lines_read, actual_col_count, expected_col_count)) + raise FluffyModeColumnCountMismatchException(expected_col_count,actual_col_count) return col_vals raise Exception("Unidentified parsing mode %s" % self.mode) @@ -872,11 +861,12 @@ class TableCreator(object): if not self.table_created: return - insert_row_stmt = self.db.generate_insert_row( - self.table_name, self.buffered_inserts[0][0]) - params = [col_vals for col_names, col_vals in self.buffered_inserts] + if len(self.buffered_inserts) > 0: + insert_row_stmt = self.db.generate_insert_row( + self.table_name, self.buffered_inserts[0][0]) + params = [col_vals for col_names, col_vals in self.buffered_inserts] - self.db.update_many(insert_row_stmt, params) + self.db.update_many(insert_row_stmt, params) # print self.db.execute_and_fetch(self.db.generate_end_transaction()) self.buffered_inserts = [] @@ -928,82 +918,317 @@ def print_credentials(): print >>sys.stderr,"http://harelba.github.io/q/" print >>sys.stderr -(options, args) = parser.parse_args() +class QWarning(object): + def __init__(self,exception,msg): + self.exception = exception + self.msg = msg -if options.version: - print_credentials() - sys.exit(0) +class QError(object): + def __init__(self,exception,msg,errorcode): + self.exception = exception + self.msg = msg + self.errorcode = errorcode + self.traceback = traceback.format_exc() -if len(args) > 1: - print >>sys.stderr,"Must provide query as one parameter, enclosed in quotes, or through a file with the -f parameter" - sys.exit(1) +class QDataLoad(object): + def __init__(self,filename,start_time,end_time): + self.filename = filename + self.start_time = start_time + self.end_time = end_time -if len(args) == 0 and options.query_filename is None: - print_credentials() - print >>sys.stderr,"Must provide a query in the command line, or through the a file with the -f parameter" - sys.exit(1) + def duration(self): + return self.end_time - self.start_time -if options.query_filename is not None: - if len(args) != 0: - print >>sys.stderr,"Can't provide both a query file and a query on the command line" - sys.exit(1) - try: - f = file(options.query_filename) - query_str = f.read() - f.close() - except: - print >>sys.stderr,"Could not read query from file %s" % options.query_filename - sys.exit(1) -else: - query_str = args[0] + def __str__(self): + return "DataLoad<'%s' at %s (took %4.3f seconds)>" % (self.filename,self.start_time,self.duration()) + __repr__ = __str__ -if options.query_encoding is not None and options.query_encoding != 'none': - try: - query_str = query_str.decode(options.query_encoding) - except: - print >>sys.stderr,"Could not decode query using the provided query encoding (%s)" % options.query_encoding - sys.exit(3) +class QMaterializedFile(object): + def __init__(self,filename,is_stdin): + self.filename = filename + self.is_stdin = is_stdin -query_str = query_str.strip() + def __str__(self): + return "QMaterializedFile<filename=%s,is_stdin=%s>" % (self.filename,self.is_stdin) + __repr__ = __str__ -if len(query_str) == 0: - print >>sys.stderr,"Query cannot be empty" - sys.exit(1) +class QTableStructure(object): + def __init__(self,filenames_str,materialized_files,column_names,column_types): + self.filenames_str = filenames_str + self.materialized_files = materialized_files + self.column_names = column_names + self.column_types = column_types + + def __str__(self): + return "QTableStructure<filenames_str=%s,materialized_file_count=%s,column_names=%s,column_types=%s>" % ( + self.filenames_str,len(self.materialized_files.keys()),self.column_names,self.column_types) + __repr__ = __str__ + +class QMetadata(object): + def __init__(self,table_structures=[],output_column_name_list=None,data_loads=[]): + self.table_structures = table_structures + self.output_column_name_list = output_column_name_list + self.data_loads = data_loads + + def __str__(self): + return "QMetadata<table_count=%s,output_column_name_list=%s,data_load_count=%s" % ( + len(self.table_structures),self.output_column_name_list,len(self.data_loads)) + __repr__ = __str__ + +class QOutput(object): + def __init__(self,data=None,metadata=None,warnings=[],error=None): + self.data = data + self.metadata = metadata + + self.warnings = warnings + self.error = error + if error is None: + self.status = 'ok' + else: + self.status = 'error' + + def __str__(self): + s = [] + s.append('status=%s' % self.status) + if self.error is not None: + s.append("error=%s" % self.error.msg) + if len(self.warnings) > 0: + s.append("warning_count=%s" % len(self.warnings)) + if self.data is not None: + s.append("row_count=%s" % len(self.data)) + else: + s.append("row_count=None") + if self.metadata is not None: + s.append("metadata=<%s>" % self.metadata) + else: + s.append("metadata=None") + return "QOutput<%s>" % ",".join(s) + __repr__ = __str__ + +class QInputParams(object): + def __init__(self,skip_header=False, + delimiter=' ',input_encoding='UTF-8',gzipped_input=False,parsing_mode='relaxed', + expected_column_count=None,keep_leading_whitespace_in_values=False, + disable_double_double_quoting=False,disable_escaped_double_quoting=False, + input_quoting_mode='minimal',stdin_file=None,stdin_filename='-'): + self.skip_header = skip_header + self.delimiter = delimiter + self.input_encoding = input_encoding + self.gzipped_input = gzipped_input + self.parsing_mode = parsing_mode + self.expected_column_count = expected_column_count + self.keep_leading_whitespace_in_values = keep_leading_whitespace_in_values + self.disable_double_double_quoting = disable_double_double_quoting + self.disable_escaped_double_quoting = disable_escaped_double_quoting + self.input_quoting_mode = input_quoting_mode + + def merged_with(self,input_params): + params = QInputParams(**self.__dict__) + if input_params is not None: + params.__dict__.update(**input_params.__dict__) + return params + + def __str__(self): + return "QInputParams<%s>" % str(self.__dict__) + + def __repr__(self): + return "QInputParams(...)" + +class QTextAsData(object): + def __init__(self,default_input_params=QInputParams()): + self.default_input_params = default_input_params + + self.table_creators = {} + + # Create DB object + self.db = Sqlite3DB() + + + input_quoting_modes = { 'minimal' : csv.QUOTE_MINIMAL, + 'all' : csv.QUOTE_ALL, + # nonnumeric is not supported for input quoting modes, since we determine the data types + # ourselves instead of letting the csv module try to identify the types + 'none' : csv.QUOTE_NONE } -if options.mode not in ['fluffy', 'relaxed', 'strict']: - print >>sys.stderr, "Parsing mode can be one of fluffy, relaxed or strict" - sys.exit(13) + def determine_proper_dialect(self,input_params): -output_encoding = get_stdout_encoding(options.output_encoding) -try: - STDOUT = codecs.getwriter(output_encoding)(sys.stdout) -except: - print >>sys.stderr,"Could not create output stream using output encoding %s" % (output_encoding) - sys.exit(200) + input_quoting_mode_csv_numeral = QTextAsData.input_quoting_modes[input_params.input_quoting_mode] -# Create DB object -db = Sqlite3DB() + if input_params.keep_leading_whitespace_in_values: + skip_initial_space = False + else: + skip_initial_space = True + + dialect = {'skipinitialspace': skip_initial_space, + 'delimiter': input_params.delimiter, 'quotechar': '"' } + dialect['quoting'] = input_quoting_mode_csv_numeral + dialect['doublequote'] = input_params.disable_double_double_quoting + + if input_params.disable_escaped_double_quoting: + dialect['escapechar'] = '\\' + + return dialect + + def get_dialect_id(self,filename): + return 'q_dialect_%s' % filename -# Create SQL statment -sql_object = Sql('%s' % query_str) + def _load_data(self,filename,input_params=QInputParams(),stdin_file=None,stdin_filename='-',stop_after_analysis=False): + start_time = time.time() + + q_dialect = self.determine_proper_dialect(input_params) + dialect_id = self.get_dialect_id(filename) + csv.register_dialect(dialect_id, **q_dialect) -# If the user flagged for a tab-delimited file then set the delimiter to tab -if options.tab_delimited: - options.delimiter = '\t' + # Create a line splitter + line_splitter = LineSplitter(input_params.delimiter, input_params.expected_column_count) -if options.tab_delimited_output: - options.output_delimiter = '\t' + # reuse already loaded data, except for stdin file data (stdin file data will always + # be reloaded and overwritten) + if filename in self.table_creators.keys() and filename != stdin_filename: + return None + + # Create the matching database table and populate it + table_creator = TableCreator( + self.db, filename, line_splitter, input_params.skip_header, input_params.gzipped_input, input_params.input_encoding, + mode=input_params.parsing_mode, expected_column_count=input_params.expected_column_count, + input_delimiter=input_params.delimiter,stdin_file = stdin_file,stdin_filename = stdin_filename) -if options.delimiter is None: - options.delimiter = ' ' -elif len(options.delimiter) != 1: - print >>sys.stderr, "Delimiter must be one character only" - sys.exit(5) + table_creator.populate(dialect_id,stop_after_analysis) -if options.keep_leading_whitespace_in_values: - skip_initial_space = False -else: - skip_initial_space = True + self.table_creators[filename] = table_creator + + return QDataLoad(filename,start_time,time.time()) + + def load_data(self,filename,input_params=QInputParams(),stop_after_analysis=False): + self._load_data(filename,input_params,stop_after_analysis=stop_after_analysis) + + def load_data_from_string(self,filename,str_data,input_params=QInputParams(),stop_after_analysis=False): + sf = cStringIO.StringIO(str_data) + try: + self._load_data(filename,input_params,stdin_file=sf,stdin_filename=filename,stop_after_analysis=stop_after_analysis) + finally: + if sf is not None: + sf.close() + + def _ensure_data_is_loaded(self,sql_object,input_params,stdin_file,stdin_filename='-',stop_after_analysis=False): + data_loads = [] + + # Get each "table name" which is actually the file name + for filename in sql_object.qtable_names: + data_load = self._load_data(filename,input_params,stdin_file=stdin_file,stdin_filename=stdin_filename,stop_after_analysis=stop_after_analysis) + if data_load is not None: + data_loads.append(data_load) + + return data_loads + + def materialize_sql_object(self,sql_object): + for filename in sql_object.qtable_names: + sql_object.set_effective_table_name(filename,self.table_creators[filename].table_name) + + def _execute(self,query_str,input_params=None,stdin_file=None,stdin_filename='-',stop_after_analysis=False): + warnings = [] + error = None + data_loads = [] + table_structures = [] + + db_results_obj = None + + effective_input_params = self.default_input_params.merged_with(input_params) + + if type(query_str) != unicode: + try: + # Hueristic attempt to auto convert the query to unicode before failing + query_str = query_str.decode('utf-8') + except: + error = QError(EncodedQueryException(),"Query should be in unicode. Please make sure to provide a unicode literal string or decode it using proper the character encoding.",91) + return QOutput(error = error) + + # Create SQL statment + sql_object = Sql('%s' % query_str) + + try: + data_loads += self._ensure_data_is_loaded(sql_object,effective_input_params,stdin_file=stdin_file,stdin_filename=stdin_filename,stop_after_analysis=stop_after_analysis) + + table_structures = self._create_table_structures_list() + + self.materialize_sql_object(sql_object) + + # Execute the query and fetch the data + db_results_obj = sql_object.execute_and_fetch(self.db) + + return QOutput( + data = db_results_obj.results, + metadata = QMetadata( + table_structures=table_structures, + output_column_name_list=db_results_obj.query_column_names, + data_loads=data_loads), + warnings = warnings, + error = error) + + except EmptyDataException,e: + warnings.append(QWarning(e,"Warning - data is empty")) + except FileNotFoundException, e: + error = QError(e,e.msg,30) + except sqlite3.OperationalError, e: + msg = str(e) + error = QError(e,"query error: %s" % msg,1) + if "no such column" in msg and effective_input_params.skip_header: + warnings.append(QWarning(e,'Warning - There seems to be a "no such column" error, and -H (header line) exists. Please make sure that you are using the column names from the header line and not the default (cXX) column names')) + except ColumnCountMismatchException, e: + error = QError(e,e.msg,2) + except (UnicodeDecodeError, UnicodeError), e: + error = QError(e,"Cannot decode data. Try to change the encoding by setting it using the -e parameter. Error:%s" % e,3) + except BadHeaderException, e: + error = QError(e,"Bad header row: %s" % e.msg,35) + except CannotUnzipStdInException,e: + error = QError(e,"Cannot decompress standard input. Pipe the input through zcat in order to decompress.",36) + except UnprovidedStdInException,e: + error = QError(e,"Standard Input must be provided in order to use it as a table",61) + except CouldNotConvertStringToNumericValueException,e: + error = QError(e,"Could not convert string to a numeric value. Did you use `-w nonnumeric` with unquoted string values? Error: %s" % e.msg,58) + except CouldNotParseInputException,e: + error = QError(e,"Could not parse the input. Please make sure to set the proper -w input-wrapping parameter for your input, and that you use the proper input encoding (-e). Error: %s" % e.msg,59) + except KeyboardInterrupt,e: + warnings.append(QWarning(e,"Interrupted")) + except Exception, e: + error = QError(e,repr(e),199) + + return QOutput(warnings = warnings,error = error , metadata=QMetadata(table_structures=table_structures,data_loads = data_loads)) + + def execute(self,query_str,input_params=None,stdin_file=None,stdin_filename='-'): + return self._execute(query_str,input_params,stdin_file,stdin_filename,stop_after_analysis=False) + + def unload(self): + + for filename,table_creator in self.table_creators.iteritems(): + try: + table_creator.drop_table() + except: + # Support no-table select queries + pass + self.table_creators = {} + + def _create_materialized_files(self,table_creator): + d = table_creator.materialized_file_dict + m = {} + for filename,mfs in d.iteritems(): + m[filename] = QMaterializedFile(filename,mfs.is_stdin) + return m < |