summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Harel Ben-Attia <harelba@gmail.com> 2017-04-06 18:58:20 +0300
committer Harel Ben-Attia <harelba@gmail.com> 2017-04-06 18:58:20 +0300
commit 94bae328b5e083a2e1fbe87b9d40062efd97971a (patch)
tree d543b254177909dfe9b9550f0c5f16aa28b24188
parent 24ab831958743618a71d837981cf62722e97fcee (diff)
Added control over max field size + info when failing on this (refs: 1.6.1, v1.6.0-release-test)
-rw-r--r-- .gitignore 1
-rwxr-xr-x bin/q 47
-rwxr-xr-x test/test-suite 109
3 files changed, 136 insertions, 21 deletions
diff --git a/.gitignore b/.gitignore
index 9e318f1..76b8d88 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,4 @@ setup.exe
win_output
win_build
packages
+.idea/
diff --git a/bin/q b/bin/q
index a3b10d3..82d8c10 100755
--- a/bin/q
+++ b/bin/q
@@ -50,8 +50,6 @@ import uuid
import cStringIO
import math
-csv.field_size_limit(sys.maxsize)
-
DEBUG = False
def get_stdout_encoding(encoding_override=None):
@@ -210,6 +208,14 @@ class CouldNotConvertStringToNumericValueException(Exception):
def __str(self):
return repr(self.msg)
+class ColumnMaxLengthLimitExceededException(Exception):
+
+ def __init__(self, msg):
+ self.msg = msg
+
+ def __str(self):
+ return repr(self.msg)
+
class CouldNotParseInputException(Exception):
def __init__(self, msg):
@@ -636,6 +642,11 @@ def encoded_csv_reader(encoding, f, dialect, **kwargs):
raise CouldNotConvertStringToNumericValueException(e.message)
else:
raise CouldNotParseInputException(str(e))
+ except Exception,e:
+ if str(e).startswith("field larger than field limit"):
+ raise ColumnMaxLengthLimitExceededException(str(e))
+ else:
+ raise
def normalized_filename(filename):
if filename == '-':
@@ -671,9 +682,13 @@ class MaterializedFileState(object):
except Exception,e:
raise Exception('Tried to skip BOM for "utf-8-sig" encoding and failed. Error message is ' + str(e))
csv_reader = encoded_csv_reader(self.encoding, self.f, dialect=self.dialect)
- for col_vals in csv_reader:
- self.lines_read += 1
- yield col_vals
+ try:
+ for col_vals in csv_reader:
+ self.lines_read += 1
+ yield col_vals
+ except ColumnMaxLengthLimitExceededException,e:
+ msg = "Column length is larger than the maximum. Offending file is '%s' - Line is %s, counting from 1 (encoding %s). The line number is the raw line number of the file, ignoring whether there's a header or not" % (self.filename,self.lines_read + 1,self.encoding)
+ raise ColumnMaxLengthLimitExceededException(msg)
def close(self):
if self.f != sys.stdin:
@@ -1079,7 +1094,8 @@ class QInputParams(object):
expected_column_count=None,keep_leading_whitespace_in_values=False,
disable_double_double_quoting=False,disable_escaped_double_quoting=False,
disable_column_type_detection=False,
- input_quoting_mode='minimal',stdin_file=None,stdin_filename='-'):
+ input_quoting_mode='minimal',stdin_file=None,stdin_filename='-',
+ max_column_length_limit=131072):
self.skip_header = skip_header
self.delimiter = delimiter
self.input_encoding = input_encoding
@@ -1091,6 +1107,7 @@ class QInputParams(object):
self.disable_escaped_double_quoting = disable_escaped_double_quoting
self.input_quoting_mode = input_quoting_mode
self.disable_column_type_detection = disable_column_type_detection
+ self.max_column_length_limit = max_column_length_limit
def merged_with(self,input_params):
params = QInputParams(**self.__dict__)
@@ -1113,7 +1130,6 @@ class QTextAsData(object):
# Create DB object
self.db = Sqlite3DB()
-
input_quoting_modes = { 'minimal' : csv.QUOTE_MINIMAL,
'all' : csv.QUOTE_ALL,
# nonnumeric is not supported for input quoting modes, since we determine the data types
@@ -1149,6 +1165,8 @@ class QTextAsData(object):
dialect_id = self.get_dialect_id(filename)
csv.register_dialect(dialect_id, **q_dialect)
+ csv.field_size_limit(input_params.max_column_length_limit)
+
# Create a line splitter
line_splitter = LineSplitter(input_params.delimiter, input_params.expected_column_count)
@@ -1261,6 +1279,8 @@ class QTextAsData(object):
error = QError(e,"Could not convert string to a numeric value. Did you use `-w nonnumeric` with unquoted string values? Error: %s" % e.msg,58)
except CouldNotParseInputException,e:
error = QError(e,"Could not parse the input. Please make sure to set the proper -w input-wrapping parameter for your input, and that you use the proper input encoding (-e). Error: %s" % e.msg,59)
+ except ColumnMaxLengthLimitExceededException,e:
+ error = QError(e,e.msg,31)
except KeyboardInterrupt,e:
warnings.append(QWarning(e,"Interrupted"))
except Exception, e:
@@ -1560,6 +1580,8 @@ def run_standalone():
help="Don't detect column types - All columns will be treated as text columns")
input_data_option_group.add_option("-w","--input-quoting-mode",dest="input_quoting_mode",default="minimal",
help="Input quoting mode. Possible values are all, minimal and none. Note the slightly misleading parameter name, and see the matching -W parameter for output quoting.")
+ input_data_option_group.add_option("-M","--max-column-length-limit",dest="max_column_length_limit",default=131072,
+ help="Sets the maximum column length.")
parser.add_option_group(input_data_option_group)
#-----------------------------------------------
output_data_option_group = OptionGroup(parser,"Output Options")
@@ -1683,6 +1705,14 @@ def run_standalone():
# (since no input delimiter means any whitespace)
options.output_delimiter = " "
+ try:
+ max_column_length_limit = int(options.max_column_length_limit)
+ if max_column_length_limit < 1:
+ raise Exception()
+ except:
+ print >> sys.stderr, "Max column length limit must be a positive integer (%s)" % max_column_length_limit
+ sys.exit(31)
+
default_input_params = QInputParams(skip_header=options.skip_header,
delimiter=options.delimiter,
input_encoding=options.encoding,
@@ -1693,7 +1723,8 @@ def run_standalone():
disable_double_double_quoting=options.disable_double_double_quoting,
disable_escaped_double_quoting=options.disable_escaped_double_quoting,
input_quoting_mode=options.input_quoting_mode,
- disable_column_type_detection=options.disable_column_type_detection)
+ disable_column_type_detection=options.disable_column_type_detection,
+ max_column_length_limit=max_column_length_limit)
q_engine = QTextAsData(default_input_params=default_input_params)
output_params = QOutputParams(
diff --git a/test/test-suite b/test/test-suite
index 717e9c8..a0f6fcc 100755
--- a/test/test-suite
+++ b/test/test-suite
@@ -801,7 +801,7 @@ class BasicTests(AbstractQTestCase):
def test_non_quoted_values_in_quoted_data(self):
tmp_data_file = self.create_file_with_data(sample_quoted_data)
-
+
cmd = '../bin/q -d " " "select c1 from %s"' % tmp_data_file.name
retcode, o, e = run_command(cmd)
@@ -819,7 +819,7 @@ class BasicTests(AbstractQTestCase):
def test_regular_quoted_values_in_quoted_data(self):
tmp_data_file = self.create_file_with_data(sample_quoted_data)
-
+
cmd = '../bin/q -d " " "select c2 from %s"' % tmp_data_file.name
retcode, o, e = run_command(cmd)
@@ -836,7 +836,7 @@ class BasicTests(AbstractQTestCase):
def test_double_double_quoted_values_in_quoted_data(self):
tmp_data_file = self.create_file_with_data(sample_quoted_data)
-
+
cmd = '../bin/q -d " " "select c3 from %s"' % tmp_data_file.name
retcode, o, e = run_command(cmd)
@@ -853,7 +853,7 @@ class BasicTests(AbstractQTestCase):
def test_escaped_double_quoted_values_in_quoted_data(self):
tmp_data_file = self.create_file_with_data(sample_quoted_data)
-
+
cmd = '../bin/q -d " " "select c4 from %s"' % tmp_data_file.name
retcode, o, e = run_command(cmd)
@@ -880,7 +880,7 @@ class BasicTests(AbstractQTestCase):
self.assertEquals(o[0],'"quoted,data",23')
self.assertEquals(o[1],'unquoted-data,54,')
-
+
self.cleanup(tmp_data_file)
def test_none_input_quoting_mode_in_strict_mode(self):
@@ -1061,7 +1061,7 @@ class BasicTests(AbstractQTestCase):
self.cleanup(tmp_data_file)
def test_input_field_quoting_and_data_types_with_encoding(self):
- # Checks combination of minimal input field quoting, with special characters that need to be decoded -
+ # Checks combination of minimal input field quoting, with special characters that need to be decoded -
# Both content and proper data types are verified
data = '111,22.22,"testing text with special characters - citt\xc3\xa0 ",http://somekindofurl.com,12.13.14.15,12.1\n'
tmp_data_file = self.create_file_with_data(data,encoding='none')
@@ -1094,7 +1094,7 @@ class BasicTests(AbstractQTestCase):
def test_multiline_double_double_quoted_values_in_quoted_data(self):
tmp_data_file = self.create_file_with_data(sample_quoted_data)
-
+
# FIXME Need to convert \0a to proper encoding suitable for the person running the tests.
cmd = '../bin/q -d " " "select replace(c5,X\'0A\',\'::\') from %s"' % tmp_data_file.name
retcode, o, e = run_command(cmd)
@@ -1112,7 +1112,7 @@ class BasicTests(AbstractQTestCase):
def test_multiline_escaped_double_quoted_values_in_quoted_data(self):
tmp_data_file = self.create_file_with_data(sample_quoted_data)
-
+
# FIXME Need to convert \0a to proper encoding suitable for the person running the tests.
cmd = '../bin/q -d " " "select replace(c6,X\'0A\',\'::\') from %s"' % tmp_data_file.name
retcode, o, e = run_command(cmd)
@@ -1129,11 +1129,11 @@ class BasicTests(AbstractQTestCase):
self.cleanup(tmp_data_file)
def test_disable_double_double_quoted_data_flag__values(self):
- # This test (and flag) is meant to verify backward comptibility only. It is possible that
+ # This test (and flag) is meant to verify backward comptibility only. It is possible that
# this flag will be removed completely in the future
tmp_data_file = self.create_file_with_data(double_double_quoted_data)
-
+
cmd = '../bin/q -d " " --disable-double-double-quoting "select c2 from %s" -W none' % tmp_data_file.name
retcode, o, e = run_command(cmd)
@@ -1167,11 +1167,11 @@ class BasicTests(AbstractQTestCase):
self.cleanup(tmp_data_file)
def test_disable_escaped_double_quoted_data_flag__values(self):
- # This test (and flag) is meant to verify backward comptibility only. It is possible that
+ # This test (and flag) is meant to verify backward comptibility only. It is possible that
# this flag will be removed completely in the future
tmp_data_file = self.create_file_with_data(escaped_double_quoted_data)
-
+
cmd = '../bin/q -d " " --disable-escaped-double-quoting "select c2 from %s" -W none' % tmp_data_file.name
retcode, o, e = run_command(cmd)
@@ -1205,7 +1205,7 @@ class BasicTests(AbstractQTestCase):
self.cleanup(tmp_data_file)
def test_combined_quoted_data_flags__number_of_columns_detected(self):
- # This test (and flags) is meant to verify backward comptibility only. It is possible that
+ # This test (and flags) is meant to verify backward comptibility only. It is possible that
# these flags will be removed completely in the future
tmp_data_file = self.create_file_with_data(combined_quoted_data)
@@ -1258,6 +1258,86 @@ class BasicTests(AbstractQTestCase):
self.assertEquals(e[0],"No files matching 'non-existent-file' have been found")
+ def test_default_column_max_length_parameter__short_enough(self):
+ huge_text = "x" * 131000
+
+ file_data = "a,b,c\n1,%s,3\n" % huge_text
+
+ tmpfile = self.create_file_with_data(file_data)
+
+ cmd = '../bin/q -H -d , "select a from %s"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEquals(retcode, 0)
+ self.assertEquals(len(o), 1)
+ self.assertEquals(len(e), 0)
+
+ self.assertEquals(o[0],'1')
+
+ self.cleanup(tmpfile)
+
+ def test_default_column_max_length_parameter__too_long(self):
+ huge_text = "x" * 132000
+
+ file_data = "a,b,c\n1,%s,3\n" % huge_text
+
+ tmpfile = self.create_file_with_data(file_data)
+
+ cmd = '../bin/q -H -d , "select a from %s"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEquals(retcode, 31)
+ self.assertEquals(len(o), 0)
+ self.assertEquals(len(e), 1)
+
+ self.assertTrue(e[0].startswith("Column length is larger than the maximum"))
+ self.assertTrue(("Offending file is '%s'" % tmpfile.name) in e[0])
+ self.assertTrue('Line is 2' in e[0])
+
+ self.cleanup(tmpfile)
+
+ def test_column_max_length_parameter(self):
+ file_data = "a,b,c\nvery-long-text,2,3\n"
+ tmpfile = self.create_file_with_data(file_data)
+
+ cmd = '../bin/q -H -d , -M 3 "select a from %s"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEquals(retcode, 31)
+ self.assertEquals(len(o), 0)
+ self.assertEquals(len(e), 1)
+
+ self.assertTrue(e[0].startswith("Column length is larger than the maximum"))
+ self.assertTrue(("Offending file is '%s'" % tmpfile.name) in e[0])
+ self.assertTrue('Line is 2' in e[0])
+
+ cmd2 = '../bin/q -H -d , -M 300 -H "select a from %s"' % tmpfile.name
+ retcode2, o2, e2 = run_command(cmd2)
+
+ self.assertEquals(retcode2, 0)
+ self.assertEquals(len(o2), 1)
+ self.assertEquals(len(e2), 0)
+
+ self.assertEquals(o2[0],'very-long-text')
+
+ self.cleanup(tmpfile)
+
+ def test_invalid_column_max_length_parameter(self):
+ file_data = "a,b,c\nvery-long-text,2,3\n"
+ tmpfile = self.create_file_with_data(file_data)
+
+ cmd = '../bin/q -H -d , -M 0 "select a from %s"' % tmpfile.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEquals(retcode, 31)
+ self.assertEquals(len(o), 0)
+ self.assertEquals(len(e), 1)
+
+ self.assertTrue(e[0].startswith('Max column length limit must be a positive integer'))
+
+
+ self.cleanup(tmpfile)
+
class ParsingModeTests(AbstractQTestCase):
def test_strict_mode_column_count_mismatch_error(self):
@@ -1580,6 +1660,7 @@ class ParsingModeTests(AbstractQTestCase):
self.cleanup(tmpfile)
+
class FormattingTests(AbstractQTestCase):
def test_column_formatting(self):
@@ -1607,6 +1688,7 @@ class FormattingTests(AbstractQTestCase):
self.assertEquals(o[1], '55.000 5.500')
+
class SqlTests(AbstractQTestCase):
def test_find_example(self):
@@ -1761,6 +1843,7 @@ class SqlTests(AbstractQTestCase):
self.cleanup(tmpfile)
+
class BasicModuleTests(AbstractQTestCase):
def test_simple_query(self):