2 files changed, 333 insertions, 31 deletions
diff --git a/bin/q b/bin/q
index a7213d8..4f61b37 100755
--- a/bin/q
+++ b/bin/q
@@ -145,6 +145,8 @@ input_data_option_group.add_option("--disable-double-double-quoting", dest="disa
                   help="Disable support for double double-quoting for escaping the double quote character. By default, you can use \"\" inside double quoted fields to escape double quotes. Mainly for backward compatibility.")
 input_data_option_group.add_option("--disable-escaped-double-quoting", dest="disable_escaped_double_quoting", default=True, action="store_false",
                   help="Disable support for escaped double-quoting for escaping the double quote character. By default, you can use \\\" inside double quoted fields to escape double quotes. Mainly for backward compatibility.")
+input_data_option_group.add_option("-w","--input-quoting-mode",dest="input_quoting_mode",default="minimal",
+                  help="Input quoting mode. Possible values are all, minimal and none. Note the slightly misleading parameter name, and see the matching -W parameter for output quoting.")
 parser.add_option_group(input_data_option_group)
 #-----------------------------------------------
 output_data_option_group = OptionGroup(parser,"Output Options") 
@@ -159,7 +161,8 @@ output_data_option_group.add_option("-f", "--formatting", dest="formatting", def
                   help="Output-level formatting, in the format X=fmt,Y=fmt etc, where X,Y are output column numbers (e.g. 1 for first SELECT column etc.")
 output_data_option_group.add_option("-E", "--output-encoding", dest="output_encoding", default=default_output_encoding,
                   help="Output encoding. Defaults to 'none', leading to selecting the system/terminal encoding")
-# -M will be added here for supporting output quoting mode in the future
+output_data_option_group.add_option("-W","--output-quoting-mode",dest="output_quoting_mode",default="minimal",
+                  help="Output quoting mode. Possible values are all, minimal, nunnumeric and none. Note the slightly misleading parameter name, and see the matching -w parameter for input quoting.")
 parser.add_option_group(output_data_option_group)
 #-----------------------------------------------
 query_option_group = OptionGroup(parser,"Query Related Options") 
@@ -280,6 +283,21 @@ class Sqlite3DB(object):
     def drop_table(self, table_name):
         return self.execute_and_fetch(self.generate_drop_table(table_name))
 
+class CouldNotConvertStringToNumericValueException(Exception):
+
+    def __init__(self, msg):
+        self.msg = msg
+
+    def __str(self):
+        return repr(self.msg)
+
+class CouldNotParseInputException(Exception):
+
+    def __init__(self, msg):
+        self.msg = msg
+
+    def __str(self):
+        return repr(self.msg)
 
 class BadHeaderException(Exception):
 
@@ -650,14 +668,19 @@ class TableColumnInferer(object):
 
 
 def encoded_csv_reader(encoding, f, dialect, **kwargs):
-    csv_reader = csv.reader(f, dialect, **kwargs)
-    if encoding is not None and encoding != 'none':
-        for row in csv_reader:
-            yield [unicode(x, encoding) for x in row]
-    else:
-        for row in csv_reader:
-            yield row
-
+    try:
+        csv_reader = csv.reader(f, dialect, **kwargs)
+        if encoding is not None and encoding != 'none':
+            for row in csv_reader:
+                yield [unicode(x, encoding) for x in row]
+        else:
+            for row in csv_reader:
+                yield row
+    except ValueError,e:
+        if e.message is not None and e.message.startswith('could not convert string to'):
+            raise CouldNotConvertStringToNumericValueException(e.message)
+        else:
+            raise CouldNotParseInputException(str(e))
 
 def normalized_filename(filename):
     if filename == '-':
@@ -719,15 +742,14 @@ class TableCreator(object):
 
                 # Check if it's standard input or a file
                 if filename == '-':
-                    codec_info = codecs.lookup(self.encoding)
-                    f  = codecs.StreamReaderWriter(sys.stdin,codec_info.streamreader,codec_info.streamwriter,None)
+                    f = sys.stdin
                     if self.gzipped:
                         raise CannotUnzipStdInException()
                 else:
                     if self.gzipped or filename.endswith('.gz'):
                         f = gzip.GzipFile(fileobj=file(filename,'rb'))    
                     else:
-                        f = codecs.open(filename, 'rb',encoding=self.encoding)
+                        f = file(filename,'rb')
 
                 self.read_file_using_csv(f, analyze_only)
                 if not self.table_created:
@@ -889,13 +911,13 @@ class TableCreator(object):
             self.db.drop_table(self.table_name)
 
 
-def determine_max_col_lengths(m):
+def determine_max_col_lengths(m,output_field_quoting_func,output_delimiter):
     if len(m) == 0:
         return []
     max_lengths = [0 for x in xrange(0, len(m[0]))]
     for row_index in xrange(0, len(m)):
         for col_index in xrange(0, len(m[0])):
-            new_len = len(unicode(m[row_index][col_index]))
+            new_len = len(unicode(output_field_quoting_func(output_delimiter,m[row_index][col_index])))
             if new_len > max_lengths[col_index]:
                 max_lengths[col_index] = new_len
     return max_lengths
@@ -983,10 +1005,53 @@ if options.keep_leading_whitespace_in_values:
 else:
     skip_initial_space = True
 
-q_dialect = {'skipinitialspace': skip_initial_space, 'quoting': 0,
+def quote_none_func(output_delimiter,v):
+    return v
+
+def quote_minimal_func(output_delimiter,v):
+    if v is None:
+        return v
+    if type(v) == str or type(v) == unicode and output_delimiter in v:
+        return '"%s"' % (v)
+    return v;
+
+def quote_nonnumeric_func(output_delimiter,v):
+    if v is None:
+        return v
+    if type(v) == str or type(v) == unicode:
+        return '"%s"' % (v)
+    return v;
+
+def quote_all_func(output_delimiter,v):
+    return '"%s"' % (v)
+
+input_quoting_modes = {   'minimal' : csv.QUOTE_MINIMAL,
+                    'all' : csv.QUOTE_ALL,
+                    # nonnumeric is not supported for input quoting modes, since we determine the data types 
+                    # ourselves instead of letting the csv module try to identify the types
+                    'none' : csv.QUOTE_NONE }
+
+output_quoting_modes = {   'minimal' : quote_minimal_func,
+                    'all' : quote_all_func,
+                    'nonnumeric' : quote_nonnumeric_func,
+                    'none' : quote_none_func }
+
+if options.input_quoting_mode not in input_quoting_modes.keys():
+    print >>sys.stderr,"Input quoting mode can only be one of %s. It cannot be set to '%s'" % (",".join(input_quoting_modes.keys()),options.input_quoting_mode)
+    sys.exit(55)
+
+if options.output_quoting_mode not in output_quoting_modes.keys():
+    print >>sys.stderr,"Output quoting mode can only be one of %s. It cannot be set to '%s'" % (",".join(output_quoting_modes.keys()),options.input_quoting_mode)
+    sys.exit(56)
+
+input_quoting_mode_csv_numeral = input_quoting_modes[options.input_quoting_mode]
+output_field_quoting_func = output_quoting_modes[options.output_quoting_mode]
+
+q_dialect = {'skipinitialspace': skip_initial_space,
              'delimiter': options.delimiter, 'quotechar': '"' }
 
-q_dialect['doublequote'] = options.disable_double_double_quoting;
+q_dialect['quoting'] = input_quoting_mode_csv_numeral
+q_dialect['doublequote'] = options.disable_double_double_quoting
 
 if options.disable_escaped_double_quoting:
     q_dialect['escapechar'] = '\\'
@@ -1010,6 +1075,7 @@ if options.encoding != 'none':
         print >>sys.stderr, "Encoding %s could not be found" % options.encoding
         sys.exit(10)
 
+
 try:
     table_creators = []
     # Get each "table name" which is actually the file name
@@ -1064,15 +1130,17 @@ except BadHeaderException, e:
 except CannotUnzipStdInException,e:
     print >>sys.stderr,"Cannot decompress standard input. Pipe the input through zcat in order to decompress."
     sys.exit(36)
+except CouldNotConvertStringToNumericValueException,e:
+    print >>sys.stderr,"Could not convert string to a numeric value. Did you use `-w nonnumeric` with unquoted string values? Error: %s" % e.msg
+    sys.exit(58)
+except CouldNotParseInputException,e:
+    print >>sys.stderr,"Could not parse the input. Please make sure to set the proper -w input-wrapping parameter for your input, and that you use the proper input encoding (-e). Error: %s" % e.msg
+    sys.exit(59)
+
 except KeyboardInterrupt:
     print >>sys.stderr, "Interrupted"
     sys.exit(0)
 
-
-# If the user requested beautifying the output
-if options.beautify:
-    max_lengths = determine_max_col_lengths(m)
-
 if options.output_delimiter:
     # If output delimiter is specified, then we use it
     output_delimiter = options.output_delimiter
@@ -1087,6 +1155,10 @@ else:
         # (since no input delimiter means any whitespace)
         output_delimiter = " "
 
+# If the user requested beautifying the output
+if options.beautify:
+    max_lengths = determine_max_col_lengths(m,output_field_quoting_func,output_delimiter)
+
 if options.formatting:
     formatting_dict = dict(
         [(x.split("=")[0], x.split("=")[1]) for x in options.formatting.split(",")])
@@ -1108,7 +1180,7 @@ try:
                     fmt_str = "%s"
 
             if col is not None:
-                row_str.append(fmt_str % col)
+                row_str.append(fmt_str % output_field_quoting_func(output_delimiter,col))
             else:
                 row_str.append(fmt_str % "")
 
diff --git a/test/test-suite b/test/test-suite
index b167fdf..373d766 100755
--- a/test/test-suite
+++ b/test/test-suite
@@ -92,6 +92,8 @@ combined_quoted_data = '''regular_double_quoted double_double_quoted escaped_dou
 "this is a quoted value" "this is a quoted value with ""double double quotes""" "this is a quoted value with \\"escaped double quotes\\""
 '''
 
+sample_quoted_data2 = '"quoted data" 23\nunquoted-data 54'
+
 one_column_data = '''data without commas 1
 data without commas 2
 '''
@@ -823,6 +825,234 @@ class BasicTests(AbstractQTestCase):
 
         self.cleanup(tmp_data_file)
 
+    def test_none_input_quoting_mode_in_relaxed_mode(self):
+        tmp_data_file = self.create_file_with_data(sample_quoted_data2)
+
+        cmd = '../bin/q -d " " -m relaxed -D , -w none "select * from %s"' % tmp_data_file.name
+        retcode, o, e = run_command(cmd)
+
+        self.assertEquals(retcode,0)
+        self.assertEquals(len(e),0)
+        self.assertEquals(len(o),2)
+
+        self.assertEquals(o[0],'"quoted,data",23')
+        self.assertEquals(o[1],'unquoted-data,54,')
+
+        self.cleanup(tmp_data_file)
+
+    def test_none_input_quoting_mode_in_strict_mode(self):
+        tmp_data_file = self.create_file_with_data(sample_quoted_data2)
+
+        cmd = '../bin/q -d " " -m strict -D , -w none "select * from %s"' % tmp_data_file.name
+        retcode, o, e = run_command(cmd)
+
+        self.assertNotEquals(retcode,0)
+        self.assertEquals(len(e),1)
+        self.assertEquals(len(o),0)
+
+        self.assertTrue(e[0].startswith('Strict mode. Column Count is expected to identical'))
+
+        self.cleanup(tmp_data_file)
+
+    def test_minimal_input_quoting_mode(self):
+        tmp_data_file = self.create_file_with_data(sample_quoted_data2)
+
+        cmd = '../bin/q -d " " -D , -w minimal "select * from %s"' % tmp_data_file.name
+        retcode, o, e = run_command(cmd)
+
+        self.assertEquals(retcode,0)
+        self.assertEquals(len(e),0)
+        self.assertEquals(len(o),2)
+
+        self.assertEquals(o[0],'quoted data,23')
+        self.assertEquals(o[1],'unquoted-data,54')
+
+        self.cleanup(tmp_data_file)
+
+    def test_all_input_quoting_mode(self):
+        tmp_data_file = self.create_file_with_data(sample_quoted_data2)
+
+        cmd = '../bin/q -d " " -D , -w all "select * from %s"' % tmp_data_file.name
+        retcode, o, e = run_command(cmd)
+
+        self.assertEquals(retcode,0)
+        self.assertEquals(len(e),0)
+        self.assertEquals(len(o),2)
+
+        self.assertEquals(o[0],'quoted data,23')
+        self.assertEquals(o[1],'unquoted-data,54')
+
+        self.cleanup(tmp_data_file)
+
+    def test_incorrect_input_quoting_mode(self):
+        tmp_data_file = self.create_file_with_data(sample_quoted_data2)
+
+        cmd = '../bin/q -d " " -D , -w unknown_wrapping_mode "select * from %s"' % tmp_data_file.name
+        retcode, o, e = run_command(cmd)
+
+        self.assertNotEquals(retcode,0)
+        self.assertEquals(len(e),1)
+        self.assertEquals(len(o),0)
+
+        self.assertTrue(e[0].startswith('Input quoting mode can only be one of all,none,minimal'))
+        self.assertTrue('unknown_wrapping_mode' in e[0])
+
+        self.cleanup(tmp_data_file)
+
+    def test_none_output_quoting_mode(self):
+        tmp_data_file = self.create_file_with_data(sample_quoted_data2)
+
+        cmd = '../bin/q -d " " -D , -w all -W none "select * from %s"' % tmp_data_file.name
+        retcode, o, e = run_command(cmd)
+
+        self.assertEquals(retcode,0)
+        self.assertEquals(len(e),0)
+        self.assertEquals(len(o),2)
+
+        self.assertEquals(o[0],'quoted data,23')
+        self.assertEquals(o[1],'unquoted-data,54')
+
+        self.cleanup(tmp_data_file)
+
+    def test_minimal_output_quoting_mode__without_need_to_quote_in_output(self):
+        tmp_data_file = self.create_file_with_data(sample_quoted_data2)
+
+        cmd = '../bin/q -d " " -D , -w all -W minimal "select * from %s"' % tmp_data_file.name
+        retcode, o, e = run_command(cmd)
+
+        self.assertEquals(retcode,0)
+        self.assertEquals(len(e),0)
+        self.assertEquals(len(o),2)
+
+        self.assertEquals(o[0],'quoted data,23')
+        self.assertEquals(o[1],'unquoted-data,54')
+
+        self.cleanup(tmp_data_file)
+
+    def test_minimal_output_quoting_mode__with_need_to_quote_in_output(self):
+        tmp_data_file = self.create_file_with_data(sample_quoted_data2)
+
+        # output delimiter is set to space, so the output will contain it
+        cmd = '../bin/q -d " " -D " " -w all -W minimal "select * from %s"' % tmp_data_file.name
+        retcode, o, e = run_command(cmd)
+
+        self.assertEquals(retcode,0)
+        self.assertEquals(len(e),0)
+        self.assertEquals(len(o),2)
+
+        self.assertEquals(o[0],'"quoted data" 23')
+        self.assertEquals(o[1],'unquoted-data 54')
+
+        self.cleanup(tmp_data_file)
+
+    def test_nonnumeric_output_quoting_mode(self):
+        tmp_data_file = self.create_file_with_data(sample_quoted_data2)
+
+        cmd = '../bin/q -d " " -D , -w all -W nonnumeric "select * from %s"' % tmp_data_file.name
+        retcode, o, e = run_command(cmd)
+
+        self.assertEquals(retcode,0)
+        self.assertEquals(len(e),0)
+        self.assertEquals(len(o),2)
+
+        self.assertEquals(o[0],'"quoted data",23')
+        self.assertEquals(o[1],'"unquoted-data",54')
+
+        self.cleanup(tmp_data_file)
+
+    def test_all_output_quoting_mode(self):
+        tmp_data_file = self.create_file_with_data(sample_quoted_data2)
+
+        cmd = '../bin/q -d " " -D , -w all -W all "select * from %s"' % tmp_data_file.name
+        retcode, o, e = run_command(cmd)
+
+        self.assertEquals(retcode,0)
+        self.assertEquals(len(e),0)
+        self.assertEquals(len(o),2)
+
+        self.assertEquals(o[0],'"quoted data","23"')
+        self.assertEquals(o[1],'"unquoted-data","54"')
+
+        self.cleanup(tmp_data_file)
+
+    def _internal_test_consistency_of_chaining_output_to_input(self,input_data,input_wrapping_mode,output_wrapping_mode):
+
+        tmp_data_file = self.create_file_with_data(input_data)
+
+        basic_cmd = '../bin/q -w %s -W %s "select * from -"' % (input_wrapping_mode,output_wrapping_mode)
+        chained_cmd = 'cat %s | %s | %s | %s' % (tmp_data_file.name,basic_cmd,basic_cmd,basic_cmd)
+
+        retcode, o, e = run_command(chained_cmd)
+
+        self.assertEquals(retcode,0)
+        self.assertEquals(len(e),0)
+        self.assertEquals(len(o),2)
+
+        self.assertEquals("\n".join(o),input_data)
+
+        self.cleanup(tmp_data_file)
+
+    def test_consistency_of_chaining_minimal_wrapping_to_minimal_wrapping(self):
+        input_data = '"quoted data" 23\nunquoted-data 54'
+        self._internal_test_consistency_of_chaining_output_to_input(input_data,'minimal','minimal')
+
+    def test_consistency_of_chaining_all_wrapping_to_all_wrapping(self):
+        input_data = '"quoted data" "23"\n"unquoted-data" "54"'
+        self._internal_test_consistency_of_chaining_output_to_input(input_data,'all','all')
+
+    def test_utf8_with_bom_encoding(self):
+        return 
+        # DOES NOT PASS due to csv python module issue http://bugs.python.org/issue7185 . Need to find a workaround.
+        # Trying to wrap the file object passed to the csv module using a proper codec fails because of csv module internals.
+
+        utf_8_data_with_bom = '\xef\xbb\xbf"typeid","limit","apcost","date","checkpointId"\n"1","2","5","1,2,3,4,5,6,7","3000,3001,3002"\n"2","2","5","1,2,3,4,5,6,7","3003,3004,3005"\n'
+        tmp_data_file = self.create_file_with_data(utf_8_data_with_bom,encoding=None)
+
+        cmd = '../bin/q -d , -H -O -e utf-8-sig "select * from %s"' % tmp_data_file.name
+        retcode, o, e = run_command(cmd)
+
+        self.assertEquals(retcode,0)
+        self.assertEquals(len(e),0)
+        self.assertEquals(len(o),3)
+
+        self.assertEquals(o[0],'typeid,limit,apcost,date,checkpointId')
+        self.assertEquals(o[1],'1,2,5,"1,2,3,4,5,6,7","3000,3001,3002"')
+        self.assertEquals(o[2],'2,2,5,"1,2,3,4,5,6,7","3003,3004,3005"')
+
+        self.cleanup(tmp_data_file)
+
+    def test_input_field_quoting_and_data_types_with_encoding(self):
+        # Checks combination of minimal input field quoting, with special characters that need to be decoded - 
+        # Both content and proper data types are verified
+        data = '111,22.22,"testing text with special characters - citt\xc3\xa0 ",http://somekindofurl.com,12.13.14.15,12.1\n'
+        tmp_data_file = self.create_file_with_data(data,encoding='none')
+
+        cmd = '../bin/q -d , "select * from %s"' % tmp_data_file.name
+        retcode, o, e = run_command(cmd)
+
+        self.assertEquals(retcode,0)
+        self.assertEquals(len(e),0)
+        self.assertEquals(len(o),1)
+
+        self.assertEquals(o[0].decode('utf-8'),u'111,22.22,testing text with special characters - citt\xe0 ,http://somekindofurl.com,12.13.14.15,12.1')
+
+        cmd = '../bin/q -d , "select * from %s" -A' % tmp_data_file.name
+        retcode, o, e = run_command(cmd)
+
+        self.assertEquals(retcode,0)
+        self.assertEquals(len(e),0)
+        self.assertEquals(len(o),7)
+
+        self.assertTrue(o[0].startswith('Table for file'))
+        self.assertEquals(o[1].strip(),'`c1` - int')
+        self.assertEquals(o[2].strip(),'`c2` - float')
+        self.assertEquals(o[3].strip(),'`c3` - text')
+        self.assertEquals(o[4].strip(),'`c4` - text')
+        self.assertEquals(o[5].strip(),'`c5` - text')
+        self.assertEquals(o[6].strip(),'`c6` - float')
+
+        self.cleanup(tmp_data_file)
+
     def test_multiline_double_double_quoted_values_in_quoted_data(self):
         tmp_data_file = self.create_file_with_data(sample_quoted_data)
         
@@ -865,7 +1095,7 @@ class BasicTests(AbstractQTestCase):
 
         tmp_data_file = self.create_file_with_data(double_double_quoted_data)
      
-        cmd = '../bin/q -d " " --disable-double-double-quoting "select c2 from %s"' % tmp_data_file.name
+        cmd = '../bin/q -d " " --disable-double-double-quoting "select c2 from %s" -W none' % tmp_data_file.name
         retcode, o, e = run_command(cmd)
 
         self.assertEquals(retcode,0)
@@ -875,7 +1105,7 @@ class BasicTests(AbstractQTestCase):
         self.assertEquals(o[0],'double_double_quoted')
         self.assertEquals(o[1],'this is a quoted value with "double')
 
-        cmd = '../bin/q -d " " --disable-double-double-quoting "select c3 from %s"' % tmp_data_file.name
+        cmd = '../bin/q -d " " --disable-double-double-quoting "select c3 from %s" -W none' % tmp_data_file.name
         retcode, o, e = run_command(cmd)
 
         self.assertEquals(retcode,0)
@@ -885,7 +1115,7 @@ class BasicTests(AbstractQTestCase):
         self.assertEquals(o[0],'')
         self.assertEquals(o[1],'double')
 
-        cmd = '../bin/q -d " " --disable-double-double-quoting "select c4 from %s"' % tmp_data_file.name
+        cmd = '../bin/q -d " " --disable-double-double-quoting "select c4 from %s" -W none' % tmp_data_file.name
         retcode, o, e = run_command(cmd)
 
         self.assertEquals(retcode,0)
@@ -902,8 +1132,8 @@ class BasicTests(AbstractQTestCase):
         # this flag will be removed completely in the future
 
         tmp_data_file = self.create_file_with_data(escaped_double_quoted_data)
-     
-        cmd = '../bin/q -d " " --disable-escaped-double-quoting "select c2 from %s"' % tmp_data_file.name
+        
+        cmd = '../bin/q -d " " --disable-escaped-double-quoting "select c2 from %s" -W none' % tmp_data_file.name
         retcode, o, e = run_command(cmd)
 
         self.assertEquals(retcode,0)
@@ -913,7 +1143,7 @@ class BasicTests(AbstractQTestCase):
         self.assertEquals(o[0],'escaped_double_quoted')
         self.assertEquals(o[1],'this is a quoted value with \\escaped')
 
-        cmd = '../bin/q -d " " --disable-escaped-double-quoting "select c3 from %s"' % tmp_data_file.name
+        cmd = '../bin/q -d " " --disable-escaped-double-quoting "select c3 from %s" -W none' % tmp_data_file.name
         retcode, o, e = run_command(cmd)
 
         self.assertEquals(retcode,0)
@@ -923,7 +1153,7 @@ class BasicTests(AbstractQTestCase):
         self.assertEquals(o[0],'')
         self.assertEquals(o[1],'double')
 
-        cmd = '../bin/q -d " " --disable-escaped-double-quoting "select c4 from %s"' % tmp_data_file.name
+        cmd = '../bin/q -d " " --disable-escaped-double-quoting "select c4 from %s" -W none' % tmp_data_file.name
         retcode, o, e = run_command(cmd)
 
         self.assertEquals(retcode,0)
@@ -1166,8 +1396,8 @@ class ParsingModeTests(AbstractQTestCase):
         self.assertEquals(len(o), 9)
         self.assertEquals(len(e), 0)
 
-        expected_output = ["/selinux", "/mnt", "/srv", "/lost+found", "/initrd.img.old -> /boot/initrd.img-3.8.0-19-generic",
-                           "/cdrom", "/home", "/vmlinuz -> boot/vmlinuz-3.8.0-19-generic", "/initrd.img -> boot/initrd.img-3.8.0-19-generic"]
+        expected_output = ["/selinux", "/mnt", "/srv", "/lost+found", '"/initrd.img.old -> /boot/initrd.img-3.8.0-19-generic"',
+                           "/cdrom", "/home", '"/vmlinuz -> boot/vmlinuz-3.8.0-19-generic"', '"/initrd.img -> boot/initrd.img-3.8.0-19-generic"']
 
         self.assertEquals(o, expected_output)