Added control over max field size + info when failing on this (tags: 1.6.1, v1.6.0-release-test)

author: Harel Ben-Attia <harelba@gmail.com> 2017-04-06 18:58:20 +0300
committer: Harel Ben-Attia <harelba@gmail.com> 2017-04-06 18:58:20 +0300
commit: 94bae328b5e083a2e1fbe87b9d40062efd97971a (patch)
tree: d543b254177909dfe9b9550f0c5f16aa28b24188
parent: 24ab831958743618a71d837981cf62722e97fcee (diff)
3 files changed, 136 insertions, 21 deletions
diff --git a/.gitignore b/.gitignore
index 9e318f1..76b8d88 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,4 @@ setup.exe
 win_output
 win_build
 packages
+.idea/
diff --git a/bin/q b/bin/q
index a3b10d3..82d8c10 100755
--- a/bin/q
+++ b/bin/q
@@ -50,8 +50,6 @@ import uuid
 import cStringIO
 import math
 
-csv.field_size_limit(sys.maxsize)
-
 DEBUG = False
 
 def get_stdout_encoding(encoding_override=None):
@@ -210,6 +208,14 @@ class CouldNotConvertStringToNumericValueException(Exception):
     def __str(self):
         return repr(self.msg)
 
+class ColumnMaxLengthLimitExceededException(Exception):
+    """Raised when an input field is longer than the configured csv field size limit."""
+    def __init__(self, msg):
+        self.msg = msg  # human-readable description; handlers read it via e.msg
+
+    def __str__(self):  # was `__str`: name-mangled inside the class, so str() never used it
+        return repr(self.msg)
+
 class CouldNotParseInputException(Exception):
 
     def __init__(self, msg):
@@ -636,6 +642,11 @@ def encoded_csv_reader(encoding, f, dialect, **kwargs):
             raise CouldNotConvertStringToNumericValueException(e.message)
         else:
             raise CouldNotParseInputException(str(e))
+    except Exception,e:
+        if str(e).startswith("field larger than field limit"):
+            raise ColumnMaxLengthLimitExceededException(str(e))
+        else:
+            raise
 
 def normalized_filename(filename):
     if filename == '-':
@@ -671,9 +682,13 @@ class MaterializedFileState(object):
             except Exception,e:
                 raise Exception('Tried to skip BOM for "utf-8-sig" encoding and failed. Error message is ' + str(e))
         csv_reader = encoded_csv_reader(self.encoding, self.f, dialect=self.dialect)
-        for col_vals in csv_reader:
-            self.lines_read += 1
-            yield col_vals
+        try:
+            for col_vals in csv_reader:
+                self.lines_read += 1
+                yield col_vals
+        except ColumnMaxLengthLimitExceededException,e:
+            msg = "Column length is larger than the maximum. Offending file is '%s' - Line is %s, counting from 1 (encoding %s). The line number is the raw line number of the file, ignoring whether there's a header or not" % (self.filename,self.lines_read + 1,self.encoding)
+            raise ColumnMaxLengthLimitExceededException(msg)
 
     def close(self):
         if self.f != sys.stdin:
@@ -1079,7 +1094,8 @@ class QInputParams(object):
             expected_column_count=None,keep_leading_whitespace_in_values=False,
             disable_double_double_quoting=False,disable_escaped_double_quoting=False,
             disable_column_type_detection=False,
-            input_quoting_mode='minimal',stdin_file=None,stdin_filename='-'):
+            input_quoting_mode='minimal',stdin_file=None,stdin_filename='-',
+            max_column_length_limit=131072):
         self.skip_header = skip_header
         self.delimiter = delimiter
         self.input_encoding = input_encoding
@@ -1091,6 +1107,7 @@ class QInputParams(object):
         self.disable_escaped_double_quoting = disable_escaped_double_quoting
         self.input_quoting_mode = input_quoting_mode
         self.disable_column_type_detection = disable_column_type_detection
+        self.max_column_length_limit = max_column_length_limit
 
     def merged_with(self,input_params):
         params = QInputParams(**self.__dict__)
@@ -1113,7 +1130,6 @@ class QTextAsData(object):
         # Create DB object
         self.db = Sqlite3DB()
 
-
     input_quoting_modes = {   'minimal' : csv.QUOTE_MINIMAL,
                         'all' : csv.QUOTE_ALL,
                         # nonnumeric is not supported for input quoting modes, since we determine the data types 
@@ -1149,6 +1165,8 @@ class QTextAsData(object):
         dialect_id = self.get_dialect_id(filename)
         csv.register_dialect(dialect_id, **q_dialect)
 
+        csv.field_size_limit(input_params.max_column_length_limit)
+
         # Create a line splitter
         line_splitter = LineSplitter(input_params.delimiter, input_params.expected_column_count)
 
@@ -1261,6 +1279,8 @@ class QTextAsData(object):
             error = QError(e,"Could not convert string to a numeric value. Did you use `-w nonnumeric` with unquoted string values? Error: %s" % e.msg,58)
         except CouldNotParseInputException,e:
             error = QError(e,"Could not parse the input. Please make sure to set the proper -w input-wrapping parameter for your input, and that you use the proper input encoding (-e). Error: %s" % e.msg,59)
+        except ColumnMaxLengthLimitExceededException,e:
+            error = QError(e,e.msg,31)
         except KeyboardInterrupt,e:
             warnings.append(QWarning(e,"Interrupted"))
         except Exception, e:
@@ -1560,6 +1580,8 @@ def run_standalone():
                       help="Don't detect column types - All columns will be treated as text columns")
     input_data_option_group.add_option("-w","--input-quoting-mode",dest="input_quoting_mode",default="minimal",
                       help="Input quoting mode. Possible values are all, minimal and none. Note the slightly misleading parameter name, and see the matching -W parameter for output quoting.")
+    input_data_option_group.add_option("-M","--max-column-length-limit",dest="max_column_length_limit",default=131072,
+                      help="Sets the maximum column length.")
     parser.add_option_group(input_data_option_group)
     #-----------------------------------------------
     output_data_option_group = OptionGroup(parser,"Output Options") 
@@ -1683,6 +1705,14 @@ def run_standalone():
             # (since no input delimiter means any whitespace)
             options.output_delimiter = " "
 
+    try:  # validate -M/--max-column-length-limit before it is handed to QInputParams
+        max_column_length_limit = int(options.max_column_length_limit)
+        if max_column_length_limit < 1:
+            raise ValueError()  # zero/negative limits are reported by the same handler below
+    except (TypeError, ValueError):
+        print >> sys.stderr, "Max column length limit must be a positive integer (%s)" % options.max_column_length_limit  # use the raw option: the local is unbound when int() itself fails
+        sys.exit(31)
+
     default_input_params = QInputParams(skip_header=options.skip_header,
         delimiter=options.delimiter,
         input_encoding=options.encoding,
@@ -1693,7 +1723,8 @@ def run_standalone():
         disable_double_double_quoting=options.disable_double_double_quoting,
         disable_escaped_double_quoting=options.disable_escaped_double_quoting,
         input_quoting_mode=options.input_quoting_mode,
-        disable_column_type_detection=options.disable_column_type_detection)
+        disable_column_type_detection=options.disable_column_type_detection,
+        max_column_length_limit=max_column_length_limit)
     q_engine = QTextAsData(default_input_params=default_input_params)
 
     output_params = QOutputParams(
diff --git a/test/test-suite b/test/test-suite
index 717e9c8..a0f6fcc 100755
--- a/test/test-suite
+++ b/test/test-suite
@@ -801,7 +801,7 @@ class BasicTests(AbstractQTestCase):
 
     def test_non_quoted_values_in_quoted_data(self):
         tmp_data_file = self.create_file_with_data(sample_quoted_data)
-        
+
         cmd = '../bin/q -d " " "select c1 from %s"' % tmp_data_file.name
         retcode, o, e = run_command(cmd)
 
@@ -819,7 +819,7 @@ class BasicTests(AbstractQTestCase):
 
     def test_regular_quoted_values_in_quoted_data(self):
         tmp_data_file = self.create_file_with_data(sample_quoted_data)
-        
+
         cmd = '../bin/q -d " " "select c2 from %s"' % tmp_data_file.name
         retcode, o, e = run_command(cmd)
 
@@ -836,7 +836,7 @@ class BasicTests(AbstractQTestCase):
 
     def test_double_double_quoted_values_in_quoted_data(self):
         tmp_data_file = self.create_file_with_data(sample_quoted_data)
-        
+
         cmd = '../bin/q -d " " "select c3 from %s"' % tmp_data_file.name
         retcode, o, e = run_command(cmd)
 
@@ -853,7 +853,7 @@ class BasicTests(AbstractQTestCase):
 
     def test_escaped_double_quoted_values_in_quoted_data(self):
         tmp_data_file = self.create_file_with_data(sample_quoted_data)
-        
+
         cmd = '../bin/q -d " " "select c4 from %s"' % tmp_data_file.name
         retcode, o, e = run_command(cmd)
 
@@ -880,7 +880,7 @@ class BasicTests(AbstractQTestCase):
 
         self.assertEquals(o[0],'"quoted,data",23')
         self.assertEquals(o[1],'unquoted-data,54,')
-	
+
         self.cleanup(tmp_data_file)
 
     def test_none_input_quoting_mode_in_strict_mode(self):
@@ -1061,7 +1061,7 @@ class BasicTests(AbstractQTestCase):
         self.cleanup(tmp_data_file)
 
     def test_input_field_quoting_and_data_types_with_encoding(self):
-        # Checks combination of minimal input field quoting, with special characters that need to be decoded - 
+        # Checks combination of minimal input field quoting, with special characters that need to be decoded -
         # Both content and proper data types are verified
         data = '111,22.22,"testing text with special characters - citt\xc3\xa0 ",http://somekindofurl.com,12.13.14.15,12.1\n'
         tmp_data_file = self.create_file_with_data(data,encoding='none')
@@ -1094,7 +1094,7 @@ class BasicTests(AbstractQTestCase):
 
     def test_multiline_double_double_quoted_values_in_quoted_data(self):
         tmp_data_file = self.create_file_with_data(sample_quoted_data)
-        
+
         # FIXME Need to convert \0a to proper encoding suitable for the person running the tests.
         cmd = '../bin/q -d " " "select replace(c5,X\'0A\',\'::\') from %s"' % tmp_data_file.name
         retcode, o, e = run_command(cmd)
@@ -1112,7 +1112,7 @@ class BasicTests(AbstractQTestCase):
 
     def test_multiline_escaped_double_quoted_values_in_quoted_data(self):
         tmp_data_file = self.create_file_with_data(sample_quoted_data)
-        
+
         # FIXME Need to convert \0a to proper encoding suitable for the person running the tests.
         cmd = '../bin/q -d " " "select replace(c6,X\'0A\',\'::\') from %s"' % tmp_data_file.name
         retcode, o, e = run_command(cmd)
@@ -1129,11 +1129,11 @@ class BasicTests(AbstractQTestCase):
         self.cleanup(tmp_data_file)
 
     def test_disable_double_double_quoted_data_flag__values(self):
-        # This test (and flag) is meant to verify backward comptibility only. It is possible that 
+        # This test (and flag) is meant to verify backward comptibility only. It is possible that
         # this flag will be removed completely in the future
 
         tmp_data_file = self.create_file_with_data(double_double_quoted_data)
-     
+
         cmd = '../bin/q -d " " --disable-double-double-quoting "select c2 from %s" -W none' % tmp_data_file.name
         retcode, o, e = run_command(cmd)
 
@@ -1167,11 +1167,11 @@ class BasicTests(AbstractQTestCase):
         self.cleanup(tmp_data_file)
 
     def test_disable_escaped_double_quoted_data_flag__values(self):
-        # This test (and flag) is meant to verify backward comptibility only. It is possible that 
+        # This test (and flag) is meant to verify backward comptibility only. It is possible that
         # this flag will be removed completely in the future
 
         tmp_data_file = self.create_file_with_data(escaped_double_quoted_data)
-        
+
         cmd = '../bin/q -d " " --disable-escaped-double-quoting "select c2 from %s" -W none' % tmp_data_file.name
         retcode, o, e = run_command(cmd)
 
@@ -1205,7 +1205,7 @@ class BasicTests(AbstractQTestCase):
         self.cleanup(tmp_data_file)
 
     def test_combined_quoted_data_flags__number_of_columns_detected(self):
-        # This test (and flags) is meant to verify backward comptibility only. It is possible that 
+        # This test (and flags) is meant to verify backward comptibility only. It is possible that
         # these flags will be removed completely in the future
         tmp_data_file = self.create_file_with_data(combined_quoted_data)
 
@@ -1258,6 +1258,86 @@ class BasicTests(AbstractQTestCase):
 
         self.assertEquals(e[0],"No files matching 'non-existent-file' have been found")
 
+    def test_default_column_max_length_parameter__short_enough(self):  # a field below the default limit must be accepted
+        huge_text = "x" * 131000  # shorter than the 131072 default of -M/--max-column-length-limit
+
+        file_data = "a,b,c\n1,%s,3\n" % huge_text  # header row plus one data row; the long value sits in column b
+
+        tmpfile = self.create_file_with_data(file_data)
+
+        cmd = '../bin/q -H -d , "select a from %s"' % tmpfile.name  # no -M passed, so the default limit applies
+        retcode, o, e = run_command(cmd)  # presumably o / e are the stdout / stderr lines - confirm against run_command
+
+        self.assertEquals(retcode, 0)
+        self.assertEquals(len(o), 1)
+        self.assertEquals(len(e), 0)
+
+        self.assertEquals(o[0],'1')  # column a of the single data row
+
+        self.cleanup(tmpfile)
+
+    def test_default_column_max_length_parameter__too_long(self):  # a field above the default limit must fail with exit code 31
+        huge_text = "x" * 132000  # longer than the 131072 default of -M/--max-column-length-limit
+
+        file_data = "a,b,c\n1,%s,3\n" % huge_text  # header row plus one data row; the oversized value sits in column b
+
+        tmpfile = self.create_file_with_data(file_data)
+
+        cmd = '../bin/q -H -d , "select a from %s"' % tmpfile.name  # no -M passed, so the default limit applies
+        retcode, o, e = run_command(cmd)  # presumably o / e are the stdout / stderr lines - confirm against run_command
+
+        self.assertEquals(retcode, 31)  # 31 is the code q attaches to ColumnMaxLengthLimitExceededException
+        self.assertEquals(len(o), 0)
+        self.assertEquals(len(e), 1)
+
+        self.assertTrue(e[0].startswith("Column length is larger than the maximum"))
+        self.assertTrue(("Offending file is '%s'" % tmpfile.name) in e[0])
+        self.assertTrue('Line is 2' in e[0])  # raw file line number: the header is line 1, the offending row is line 2
+
+        self.cleanup(tmpfile)
+
+    def test_column_max_length_parameter(self):  # the same file must fail with -M 3 and succeed with -M 300
+        file_data = "a,b,c\nvery-long-text,2,3\n"
+        tmpfile = self.create_file_with_data(file_data)
+
+        cmd = '../bin/q -H -d , -M 3 "select a from %s"' % tmpfile.name  # limit of 3 is shorter than 'very-long-text'
+        retcode, o, e = run_command(cmd)  # presumably o / e are the stdout / stderr lines - confirm against run_command
+
+        self.assertEquals(retcode, 31)  # 31 is the code q attaches to ColumnMaxLengthLimitExceededException
+        self.assertEquals(len(o), 0)
+        self.assertEquals(len(e), 1)
+
+        self.assertTrue(e[0].startswith("Column length is larger than the maximum"))
+        self.assertTrue(("Offending file is '%s'" % tmpfile.name) in e[0])
+        self.assertTrue('Line is 2' in e[0])  # raw file line number: the header is line 1, the offending row is line 2
+
+        cmd2 = '../bin/q -H -d , -M 300 -H "select a from %s"' % tmpfile.name  # NOTE(review): -H is passed twice here; the second one looks redundant
+        retcode2, o2, e2 = run_command(cmd2)
+
+        self.assertEquals(retcode2, 0)
+        self.assertEquals(len(o2), 1)
+        self.assertEquals(len(e2), 0)
+
+        self.assertEquals(o2[0],'very-long-text')  # with a limit of 300 the value is returned intact
+
+        self.cleanup(tmpfile)
+
+    def test_invalid_column_max_length_parameter(self):  # a non-positive -M value must be rejected before any query runs
+        file_data = "a,b,c\nvery-long-text,2,3\n"
+        tmpfile = self.create_file_with_data(file_data)
+
+        cmd = '../bin/q -H -d , -M 0 "select a from %s"' % tmpfile.name  # 0 fails the `< 1` check in run_standalone
+        retcode, o, e = run_command(cmd)  # presumably o / e are the stdout / stderr lines - confirm against run_command
+
+        self.assertEquals(retcode, 31)
+        self.assertEquals(len(o), 0)
+        self.assertEquals(len(e), 1)
+
+        self.assertTrue(e[0].startswith('Max column length limit must be a positive integer'))  # NOTE(review): only -M 0 is exercised; a non-numeric value is not covered here
+
+
+        self.cleanup(tmpfile)
+
 class ParsingModeTests(AbstractQTestCase):
 
     def test_strict_mode_column_count_mismatch_error(self):
@@ -1580,6 +1660,7 @@ class ParsingModeTests(AbstractQTestCase):
         self.cleanup(tmpfile)
 
 
+
 class FormattingTests(AbstractQTestCase):
 
     def test_column_formatting(self):
@@ -1607,6 +1688,7 @@ class FormattingTests(AbstractQTestCase):
         self.assertEquals(o[1], '55.000 5.500')
 
 
+
 class SqlTests(AbstractQTestCase):
 
     def test_find_example(self):
@@ -1761,6 +1843,7 @@ class SqlTests(AbstractQTestCase):
 
         self.cleanup(tmpfile)
 
+
 class BasicModuleTests(AbstractQTestCase):
 
     def test_simple_query(self):
author	Harel Ben-Attia <harelba@gmail.com>	2017-04-06 18:58:20 +0300
committer	Harel Ben-Attia <harelba@gmail.com>	2017-04-06 18:58:20 +0300
commit	94bae328b5e083a2e1fbe87b9d40062efd97971a (patch)
tree	d543b254177909dfe9b9550f0c5f16aa28b24188
parent	24ab831958743618a71d837981cf62722e97fcee (diff)