diff options
author | Harel Ben-Attia <harelba@gmail.com> | 2016-04-02 15:56:17 +0300 |
---|---|---|
committer | Harel Ben-Attia <harelba@gmail.com> | 2016-04-02 15:56:17 +0300 |
commit | cd8bc6f6f7159071089f3067b39085ab34257a0a (patch) | |
tree | 9e68709cc808392c7adc2cfde0c7d569e9a90ea5 | |
parent | 0aa96f216956391ec498dd4204a2f043ac1f2ba6 (diff) |
Added a flag to disable automatic column type detection
-rwxr-xr-x | bin/q | 21 |
-rwxr-xr-x | test/test-suite | 69 |
2 files changed, 85 insertions, 5 deletions
@@ -396,7 +396,7 @@ class LineSplitter(object): class TableColumnInferer(object): - def __init__(self, mode, expected_column_count, input_delimiter, skip_header=False): + def __init__(self, mode, expected_column_count, input_delimiter, skip_header=False,disable_column_type_detection=False): self.inferred = False self.mode = mode self.rows = [] @@ -404,6 +404,7 @@ class TableColumnInferer(object): self.header_row = None self.expected_column_count = expected_column_count self.input_delimiter = input_delimiter + self.disable_column_type_detection = disable_column_type_detection def analyze(self, col_vals): if self.inferred: @@ -427,6 +428,9 @@ class TableColumnInferer(object): self.do_analysis() def determine_type_of_value(self, value): + if self.disable_column_type_detection: + return str + if value is not None: value = value.strip() if value == '' or value is None: @@ -677,7 +681,8 @@ class MaterializedFileState(object): class TableCreator(object): - def __init__(self, db, filenames_str, line_splitter, skip_header=False, gzipped=False, encoding='UTF-8', mode='fluffy', expected_column_count=None, input_delimiter=None,stdin_file=None,stdin_filename='-'): + def __init__(self, db, filenames_str, line_splitter, skip_header=False, gzipped=False, encoding='UTF-8', mode='fluffy', expected_column_count=None, input_delimiter=None,disable_column_type_detection=False, + stdin_file=None,stdin_filename='-'): self.db = db self.filenames_str = filenames_str self.skip_header = skip_header @@ -692,7 +697,7 @@ class TableCreator(object): self.stdin_filename = stdin_filename self.column_inferer = TableColumnInferer( - mode, expected_column_count, input_delimiter, skip_header) + mode, expected_column_count, input_delimiter, skip_header,disable_column_type_detection) # Filled only after table population since we're inferring the table # creation data @@ -1073,6 +1078,7 @@ class QInputParams(object): delimiter=' ',input_encoding='UTF-8',gzipped_input=False,parsing_mode='relaxed', 
expected_column_count=None,keep_leading_whitespace_in_values=False, disable_double_double_quoting=False,disable_escaped_double_quoting=False, + disable_column_type_detection=False, input_quoting_mode='minimal',stdin_file=None,stdin_filename='-'): self.skip_header = skip_header self.delimiter = delimiter @@ -1084,6 +1090,7 @@ class QInputParams(object): self.disable_double_double_quoting = disable_double_double_quoting self.disable_escaped_double_quoting = disable_escaped_double_quoting self.input_quoting_mode = input_quoting_mode + self.disable_column_type_detection = disable_column_type_detection def merged_with(self,input_params): params = QInputParams(**self.__dict__) @@ -1154,7 +1161,8 @@ class QTextAsData(object): table_creator = TableCreator( self.db, filename, line_splitter, input_params.skip_header, input_params.gzipped_input, input_params.input_encoding, mode=input_params.parsing_mode, expected_column_count=input_params.expected_column_count, - input_delimiter=input_params.delimiter,stdin_file = stdin_file,stdin_filename = stdin_filename) + input_delimiter=input_params.delimiter,disable_column_type_detection=input_params.disable_column_type_detection, + stdin_file = stdin_file,stdin_filename = stdin_filename) table_creator.populate(dialect_id,stop_after_analysis) @@ -1542,6 +1550,8 @@ def run_standalone(): help="Disable support for double double-quoting for escaping the double quote character. By default, you can use \"\" inside double quoted fields to escape double quotes. Mainly for backward compatibility.") input_data_option_group.add_option("--disable-escaped-double-quoting", dest="disable_escaped_double_quoting", default=True, action="store_false", help="Disable support for escaped double-quoting for escaping the double quote character. By default, you can use \\\" inside double quoted fields to escape double quotes. 
Mainly for backward compatibility.") + input_data_option_group.add_option("--disable-column-type-detection", dest="disable_column_type_detection", default=False, action="store_true", + help="Don't detect column types - All columns will be text columns") input_data_option_group.add_option("-w","--input-quoting-mode",dest="input_quoting_mode",default="minimal", help="Input quoting mode. Possible values are all, minimal and none. Note the slightly misleading parameter name, and see the matching -W parameter for output quoting.") parser.add_option_group(input_data_option_group) @@ -1676,7 +1686,8 @@ def run_standalone(): keep_leading_whitespace_in_values=options.keep_leading_whitespace_in_values, disable_double_double_quoting=options.disable_double_double_quoting, disable_escaped_double_quoting=options.disable_escaped_double_quoting, - input_quoting_mode=options.input_quoting_mode) + input_quoting_mode=options.input_quoting_mode, + disable_column_type_detection=options.disable_column_type_detection) q_engine = QTextAsData(default_input_params=default_input_params) output_params = QOutputParams( diff --git a/test/test-suite b/test/test-suite index 65652aa..e041e0d 100755 --- a/test/test-suite +++ b/test/test-suite @@ -1694,6 +1694,75 @@ class SqlTests(AbstractQTestCase): self.cleanup(tmpfile2) + def test_disable_column_type_detection(self): + tmpfile = self.create_file_with_data('''regular_text,text_with_digits1,text_with_digits2,float_number +"regular text 1",67,"67",12.3 +"regular text 2",067,"067",22.3 +"regular text 3",123,"123",33.4 +"regular text 4",-123,"-123",0122.2 +''') + + # Check original column type detection + cmd = '../bin/q -A -d , -H "select * from %s"' % (tmpfile.name) + + retcode, o, e = run_command(cmd) + + self.assertEquals(retcode, 0) + self.assertEquals(len(e), 0) + self.assertEquals(len(o), 5) + + + self.assertEquals(o[0],'Table for file: %s' % tmpfile.name) + self.assertEquals(o[1],' `regular_text` - text') + self.assertEquals(o[2],' 
`text_with_digits1` - int') + self.assertEquals(o[3],' `text_with_digits2` - int') + self.assertEquals(o[4],' `float_number` - float') + + # Check column types detected when actual detection is disabled + cmd = '../bin/q -A -d , -H --disable-column-type-detection "select * from %s"' % (tmpfile.name) + + retcode, o, e = run_command(cmd) + + self.assertEquals(retcode, 0) + self.assertEquals(len(e), 0) + self.assertEquals(len(o), 5) + + self.assertEquals(o[0],'Table for file: %s' % tmpfile.name) + self.assertEquals(o[1],' `regular_text` - text') + self.assertEquals(o[2],' `text_with_digits1` - text') + self.assertEquals(o[3],' `text_with_digits2` - text') + self.assertEquals(o[4],' `float_number` - text') + + # Get actual data with regular detection + cmd = '../bin/q -d , -H "select * from %s"' % (tmpfile.name) + + retcode, o, e = run_command(cmd) + + self.assertEquals(retcode, 0) + self.assertEquals(len(e), 0) + self.assertEquals(len(o), 4) + + self.assertEquals(o[0],"regular text 1,67,67,12.3"); + self.assertEquals(o[1],"regular text 2,67,67,22.3"); + self.assertEquals(o[2],"regular text 3,123,123,33.4"); + self.assertEquals(o[3],"regular text 4,-123,-123,122.2"); + + # Get actual data without detection + cmd = '../bin/q -d , -H --disable-column-type-detection "select * from %s"' % (tmpfile.name) + + retcode, o, e = run_command(cmd) + + self.assertEquals(retcode, 0) + self.assertEquals(len(e), 0) + self.assertEquals(len(o), 4) + + self.assertEquals(o[0],"regular text 1,67,67,12.3"); + self.assertEquals(o[1],"regular text 2,067,067,22.3"); + self.assertEquals(o[2],"regular text 3,123,123,33.4"); + self.assertEquals(o[3],"regular text 4,-123,-123,0122.2"); + + self.cleanup(tmpfile) + class BasicModuleTests(AbstractQTestCase): def test_simple_query(self): |