summaryrefslogtreecommitdiffstats
path: root/bin
diff options
context:
space:
mode:
authorHarel Ben-Attia <harelba@gmail.com>2014-11-08 09:47:08 -0500
committerHarel Ben-Attia <harelba@gmail.com>2014-11-08 09:47:08 -0500
commitd569c7248b00eee1814f9180c85cc577eaf9a62c (patch)
tree58a81364a83e027618519c34045d00edd24b0fec /bin
parent5043992fc46ebe1555a37fc6cc88c44cb90e688c (diff)
Added full control of input/output quoting of data + tests
Supported quoting types are none, miminal, non numeric and all
Diffstat (limited to 'bin')
-rwxr-xr-xbin/q116
1 files changed, 94 insertions, 22 deletions
diff --git a/bin/q b/bin/q
index a7213d8..4f61b37 100755
--- a/bin/q
+++ b/bin/q
@@ -145,6 +145,8 @@ input_data_option_group.add_option("--disable-double-double-quoting", dest="disa
help="Disable support for double double-quoting for escaping the double quote character. By default, you can use \"\" inside double quoted fields to escape double quotes. Mainly for backward compatibility.")
input_data_option_group.add_option("--disable-escaped-double-quoting", dest="disable_escaped_double_quoting", default=True, action="store_false",
help="Disable support for escaped double-quoting for escaping the double quote character. By default, you can use \\\" inside double quoted fields to escape double quotes. Mainly for backward compatibility.")
+input_data_option_group.add_option("-w","--input-quoting-mode",dest="input_quoting_mode",default="minimal",
+ help="Input quoting mode. Possible values are all, minimal and none. Note the slightly misleading parameter name, and see the matching -W parameter for output quoting.")
parser.add_option_group(input_data_option_group)
#-----------------------------------------------
output_data_option_group = OptionGroup(parser,"Output Options")
@@ -159,7 +161,8 @@ output_data_option_group.add_option("-f", "--formatting", dest="formatting", def
help="Output-level formatting, in the format X=fmt,Y=fmt etc, where X,Y are output column numbers (e.g. 1 for first SELECT column etc.")
output_data_option_group.add_option("-E", "--output-encoding", dest="output_encoding", default=default_output_encoding,
help="Output encoding. Defaults to 'none', leading to selecting the system/terminal encoding")
-# -M will be added here for supporting output quoting mode in the future
+output_data_option_group.add_option("-W","--output-quoting-mode",dest="output_quoting_mode",default="minimal",
+ help="Output quoting mode. Possible values are all, minimal, nunnumeric and none. Note the slightly misleading parameter name, and see the matching -w parameter for input quoting.")
parser.add_option_group(output_data_option_group)
#-----------------------------------------------
query_option_group = OptionGroup(parser,"Query Related Options")
@@ -280,6 +283,21 @@ class Sqlite3DB(object):
def drop_table(self, table_name):
return self.execute_and_fetch(self.generate_drop_table(table_name))
+class CouldNotConvertStringToNumericValueException(Exception):
+
+ def __init__(self, msg):
+ self.msg = msg
+
+ def __str(self):
+ return repr(self.msg)
+
+class CouldNotParseInputException(Exception):
+
+ def __init__(self, msg):
+ self.msg = msg
+
+ def __str(self):
+ return repr(self.msg)
class BadHeaderException(Exception):
@@ -650,14 +668,19 @@ class TableColumnInferer(object):
def encoded_csv_reader(encoding, f, dialect, **kwargs):
- csv_reader = csv.reader(f, dialect, **kwargs)
- if encoding is not None and encoding != 'none':
- for row in csv_reader:
- yield [unicode(x, encoding) for x in row]
- else:
- for row in csv_reader:
- yield row
-
+ try:
+ csv_reader = csv.reader(f, dialect, **kwargs)
+ if encoding is not None and encoding != 'none':
+ for row in csv_reader:
+ yield [unicode(x, encoding) for x in row]
+ else:
+ for row in csv_reader:
+ yield row
+ except ValueError,e:
+ if e.message is not None and e.message.startswith('could not convert string to'):
+ raise CouldNotConvertStringToNumericValueException(e.message)
+ else:
+ raise CouldNotParseInputException(str(e))
def normalized_filename(filename):
if filename == '-':
@@ -719,15 +742,14 @@ class TableCreator(object):
# Check if it's standard input or a file
if filename == '-':
- codec_info = codecs.lookup(self.encoding)
- f = codecs.StreamReaderWriter(sys.stdin,codec_info.streamreader,codec_info.streamwriter,None)
+ f = sys.stdin
if self.gzipped:
raise CannotUnzipStdInException()
else:
if self.gzipped or filename.endswith('.gz'):
f = gzip.GzipFile(fileobj=file(filename,'rb'))
else:
- f = codecs.open(filename, 'rb',encoding=self.encoding)
+ f = file(filename,'rb')
self.read_file_using_csv(f, analyze_only)
if not self.table_created:
@@ -889,13 +911,13 @@ class TableCreator(object):
self.db.drop_table(self.table_name)
-def determine_max_col_lengths(m):
+def determine_max_col_lengths(m,output_field_quoting_func,output_delimiter):
if len(m) == 0:
return []
max_lengths = [0 for x in xrange(0, len(m[0]))]
for row_index in xrange(0, len(m)):
for col_index in xrange(0, len(m[0])):
- new_len = len(unicode(m[row_index][col_index]))
+ new_len = len(unicode(output_field_quoting_func(output_delimiter,m[row_index][col_index])))
if new_len > max_lengths[col_index]:
max_lengths[col_index] = new_len
return max_lengths
@@ -983,10 +1005,53 @@ if options.keep_leading_whitespace_in_values:
else:
skip_initial_space = True
-q_dialect = {'skipinitialspace': skip_initial_space, 'quoting': 0,
+def quote_none_func(output_delimiter,v):
+ return v
+
+def quote_minimal_func(output_delimiter,v):
+ if v is None:
+ return v
+ if type(v) == str or type(v) == unicode and output_delimiter in v:
+ return '"%s"' % (v)
+ return v;
+
+def quote_nonnumeric_func(output_delimiter,v):
+ if v is None:
+ return v
+ if type(v) == str or type(v) == unicode:
+ return '"%s"' % (v)
+ return v;
+
+def quote_all_func(output_delimiter,v):
+ return '"%s"' % (v)
+
+input_quoting_modes = { 'minimal' : csv.QUOTE_MINIMAL,
+ 'all' : csv.QUOTE_ALL,
+ # nonnumeric is not supported for input quoting modes, since we determine the data types
+ # ourselves instead of letting the csv module try to identify the types
+ 'none' : csv.QUOTE_NONE }
+
+output_quoting_modes = { 'minimal' : quote_minimal_func,
+ 'all' : quote_all_func,
+ 'nonnumeric' : quote_nonnumeric_func,
+ 'none' : quote_none_func }
+
+if options.input_quoting_mode not in input_quoting_modes.keys():
+ print >>sys.stderr,"Input quoting mode can only be one of %s. It cannot be set to '%s'" % (",".join(input_quoting_modes.keys()),options.input_quoting_mode)
+ sys.exit(55)
+
+if options.output_quoting_mode not in output_quoting_modes.keys():
+ print >>sys.stderr,"Output quoting mode can only be one of %s. It cannot be set to '%s'" % (",".join(output_quoting_modes.keys()),options.input_quoting_mode)
+ sys.exit(56)
+
+input_quoting_mode_csv_numeral = input_quoting_modes[options.input_quoting_mode]
+output_field_quoting_func = output_quoting_modes[options.output_quoting_mode]
+
+q_dialect = {'skipinitialspace': skip_initial_space,
'delimiter': options.delimiter, 'quotechar': '"' }
-q_dialect['doublequote'] = options.disable_double_double_quoting;
+q_dialect['quoting'] = input_quoting_mode_csv_numeral
+q_dialect['doublequote'] = options.disable_double_double_quoting
if options.disable_escaped_double_quoting:
q_dialect['escapechar'] = '\\'
@@ -1010,6 +1075,7 @@ if options.encoding != 'none':
print >>sys.stderr, "Encoding %s could not be found" % options.encoding
sys.exit(10)
+
try:
table_creators = []
# Get each "table name" which is actually the file name
@@ -1064,15 +1130,17 @@ except BadHeaderException, e:
except CannotUnzipStdInException,e:
print >>sys.stderr,"Cannot decompress standard input. Pipe the input through zcat in order to decompress."
sys.exit(36)
+except CouldNotConvertStringToNumericValueException,e:
+ print >>sys.stderr,"Could not convert string to a numeric value. Did you use `-w nonnumeric` with unquoted string values? Error: %s" % e.msg
+ sys.exit(58)
+except CouldNotParseInputException,e:
+ print >>sys.stderr,"Could not parse the input. Please make sure to set the proper -w input-wrapping parameter for your input, and that you use the proper input encoding (-e). Error: %s" % e.msg
+ sys.exit(59)
+
except KeyboardInterrupt:
print >>sys.stderr, "Interrupted"
sys.exit(0)
-
-# If the user requested beautifying the output
-if options.beautify:
- max_lengths = determine_max_col_lengths(m)
-
if options.output_delimiter:
# If output delimiter is specified, then we use it
output_delimiter = options.output_delimiter
@@ -1087,6 +1155,10 @@ else:
# (since no input delimiter means any whitespace)
output_delimiter = " "
+# If the user requested beautifying the output
+if options.beautify:
+ max_lengths = determine_max_col_lengths(m,output_field_quoting_func,output_delimiter)
+
if options.formatting:
formatting_dict = dict(
[(x.split("=")[0], x.split("=")[1]) for x in options.formatting.split(",")])
@@ -1108,7 +1180,7 @@ try:
fmt_str = "%s"
if col is not None:
- row_str.append(fmt_str % col)
+ row_str.append(fmt_str % output_field_quoting_func(output_delimiter,col))
else:
row_str.append(fmt_str % "")