summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Harel Ben-Attia <harelba@gmail.com> 2017-04-22 11:49:09 +0300
committer Harel Ben-Attia <harelba@gmail.com> 2017-04-22 11:49:09 +0300
commit a81a493bcee35177d5fbdb17e72209723bf91bb1 (patch)
tree 39f223fb19f9f0b6c9c308710156573f651d506c
parent 544abf0629d587b287cf387b1d7590931f9fe4d9 (diff)
Added flag for basic support of universal newlines
-rwxr-xr-x bin/q 33
-rwxr-xr-x test/test-suite 39
2 files changed, 67 insertions, 5 deletions
diff --git a/bin/q b/bin/q
index bd3175f..b794ef3 100755
--- a/bin/q
+++ b/bin/q
@@ -246,6 +246,11 @@ class CannotUnzipStdInException(Exception):
def __init__(self):
pass
+class UniversalNewlinesExistException(Exception):
+
+ def __init__(self):
+ pass
+
class UnprovidedStdInException(Exception):
def __init__(self):
@@ -645,6 +650,8 @@ def encoded_csv_reader(encoding, f, dialect, **kwargs):
except Exception,e:
if str(e).startswith("field larger than field limit"):
raise ColumnMaxLengthLimitExceededException(str(e))
+ elif 'universal-newline' in str(e):
+ raise UniversalNewlinesExistException()
else:
raise
@@ -689,6 +696,9 @@ class MaterializedFileState(object):
except ColumnMaxLengthLimitExceededException,e:
msg = "Column length is larger than the maximum. Offending file is '%s' - Line is %s, counting from 1 (encoding %s). The line number is the raw line number of the file, ignoring whether there's a header or not" % (self.filename,self.lines_read + 1,self.encoding)
raise ColumnMaxLengthLimitExceededException(msg)
+ except UniversalNewlinesExistException,e2:
+ # No need to translate the exception, but we want it to be explicitly defined here for clarity
+ raise UniversalNewlinesExistException()
def close(self):
if self.f != sys.stdin:
@@ -696,7 +706,7 @@ class MaterializedFileState(object):
class TableCreator(object):
- def __init__(self, db, filenames_str, line_splitter, skip_header=False, gzipped=False, encoding='UTF-8', mode='fluffy', expected_column_count=None, input_delimiter=None,disable_column_type_detection=False,
+ def __init__(self, db, filenames_str, line_splitter, skip_header=False, gzipped=False, with_universal_newlines=False, encoding='UTF-8', mode='fluffy', expected_column_count=None, input_delimiter=None,disable_column_type_detection=False,
stdin_file=None,stdin_filename='-'):
self.db = db
self.filenames_str = filenames_str
@@ -710,6 +720,7 @@ class TableCreator(object):
self.input_delimiter = input_delimiter
self.stdin_file = stdin_file
self.stdin_filename = stdin_filename
+ self.with_universal_newlines = with_universal_newlines
self.column_inferer = TableColumnInferer(
mode, expected_column_count, input_delimiter, skip_header,disable_column_type_detection)
@@ -755,6 +766,8 @@ class TableCreator(object):
return self.table_name
def open_file(self,filename):
+ # TODO Support universal newlines for gzipped and stdin data as well
+
# Check if it's standard input or a file
if filename == self.stdin_filename:
if self.stdin_file is None:
@@ -764,9 +777,13 @@ class TableCreator(object):
raise CannotUnzipStdInException()
else:
if self.gzipped or filename.endswith('.gz'):
- f = gzip.GzipFile(fileobj=file(filename,'rb'))
+ f = gzip.GzipFile(fileobj=file(filename,'rb'))
else:
- f = file(filename,'rb')
+ if self.with_universal_newlines:
+ file_opening_mode = 'rbU'
+ else:
+ file_opening_mode = 'rb'
+ f = file(filename,file_opening_mode)
return f
def _pre_populate(self,dialect):
@@ -1090,7 +1107,7 @@ class QOutput(object):
class QInputParams(object):
def __init__(self,skip_header=False,
- delimiter=' ',input_encoding='UTF-8',gzipped_input=False,parsing_mode='relaxed',
+ delimiter=' ',input_encoding='UTF-8',gzipped_input=False,with_universal_newlines=False,parsing_mode='relaxed',
expected_column_count=None,keep_leading_whitespace_in_values=False,
disable_double_double_quoting=False,disable_escaped_double_quoting=False,
disable_column_type_detection=False,
@@ -1100,6 +1117,7 @@ class QInputParams(object):
self.delimiter = delimiter
self.input_encoding = input_encoding
self.gzipped_input = gzipped_input
+ self.with_universal_newlines = with_universal_newlines
self.parsing_mode = parsing_mode
self.expected_column_count = expected_column_count
self.keep_leading_whitespace_in_values = keep_leading_whitespace_in_values
@@ -1177,7 +1195,7 @@ class QTextAsData(object):
# Create the matching database table and populate it
table_creator = TableCreator(
- self.db, filename, line_splitter, input_params.skip_header, input_params.gzipped_input, input_params.input_encoding,
+ self.db, filename, line_splitter, input_params.skip_header, input_params.gzipped_input, input_params.with_universal_newlines,input_params.input_encoding,
mode=input_params.parsing_mode, expected_column_count=input_params.expected_column_count,
input_delimiter=input_params.delimiter,disable_column_type_detection=input_params.disable_column_type_detection,
stdin_file = stdin_file,stdin_filename = stdin_filename)
@@ -1273,6 +1291,8 @@ class QTextAsData(object):
error = QError(e,"Bad header row: %s" % e.msg,35)
except CannotUnzipStdInException,e:
error = QError(e,"Cannot decompress standard input. Pipe the input through zcat in order to decompress.",36)
+ except UniversalNewlinesExistException,e:
+ error = QError(e,"Data contains universal newlines. Run q with -U to use universal newlines. Please note that q still doesn't support universal newlines for .gz files or for stdin. Route the data through a regular file to use -U.",103)
except UnprovidedStdInException,e:
error = QError(e,"Standard Input must be provided in order to use it as a table",61)
except CouldNotConvertStringToNumericValueException,e:
@@ -1582,6 +1602,8 @@ def run_standalone():
help="Input quoting mode. Possible values are all, minimal and none. Note the slightly misleading parameter name, and see the matching -W parameter for output quoting.")
input_data_option_group.add_option("-M","--max-column-length-limit",dest="max_column_length_limit",default=131072,
help="Sets the maximum column length.")
+ input_data_option_group.add_option("-U","--with-universal-newlines",dest="with_universal_newlines",default=False,action="store_true",
+ help="Expect universal newlines in the data. Limitation: -U works only with regular files for now, stdin or .gz files are not supported yet.")
parser.add_option_group(input_data_option_group)
#-----------------------------------------------
output_data_option_group = OptionGroup(parser,"Output Options")
@@ -1717,6 +1739,7 @@ def run_standalone():
delimiter=options.delimiter,
input_encoding=options.encoding,
gzipped_input=options.gzipped,
+ with_universal_newlines=options.with_universal_newlines,
parsing_mode=options.mode,
expected_column_count=expected_column_count,
keep_leading_whitespace_in_values=options.keep_leading_whitespace_in_values,
diff --git a/test/test-suite b/test/test-suite
index a0f6fcc..c6d9344 100755
--- a/test/test-suite
+++ b/test/test-suite
@@ -1687,6 +1687,45 @@ class FormattingTests(AbstractQTestCase):
self.assertEquals(o[0], 'mysum myavg')
self.assertEquals(o[1], '55.000 5.500')
+ def test_failure_to_parse_universal_newlines_without_explicit_flag(self):
+ data = 'permalink,company,numEmps,category,city,state,fundedDate,raisedAmt,raisedCurrency,round\rlifelock,LifeLock,,web,Tempe,AZ,1-May-07,6850000,USD,b\rlifelock,LifeLock,,web,Tempe,AZ,1-Oct-06,6000000,USD,a\rlifelock,LifeLock,,web,Tempe,AZ,1-Jan-08,25000000,USD,c\rmycityfaces,MyCityFaces,7,web,Scottsdale,AZ,1-Jan-08,50000,USD,seed\rflypaper,Flypaper,,web,Phoenix,AZ,1-Feb-08,3000000,USD,a\rinfusionsoft,Infusionsoft,105,software,Gilbert,AZ,1-Oct-07,9000000,USD,a'
+ tmp_data_file = self.create_file_with_data(data)
+
+ cmd = '../bin/q -d , -H "select * from %s"' % tmp_data_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertNotEquals(retcode, 0)
+ self.assertEquals(len(e), 1)
+ self.assertEquals(len(o), 0)
+
+ self.assertTrue(e[0].startswith('Data contains universal newlines'))
+
+ self.cleanup(tmp_data_file)
+
+ def test_universal_newlines_parsing_flag(self):
+ expected_output = [['lifelock', 'LifeLock', '', 'web', 'Tempe', 'AZ', '1-May-07', '6850000', 'USD', 'b'],
+ ['lifelock', 'LifeLock', '', 'web', 'Tempe', 'AZ', '1-Oct-06', '6000000', 'USD', 'a'],
+ ['lifelock', 'LifeLock', '', 'web', 'Tempe', 'AZ', '1-Jan-08', '25000000', 'USD', 'c'],
+ ['mycityfaces', 'MyCityFaces', '7', 'web', 'Scottsdale', 'AZ', '1-Jan-08', '50000', 'USD', 'seed'],
+ ['flypaper', 'Flypaper', '', 'web', 'Phoenix', 'AZ', '1-Feb-08', '3000000', 'USD', 'a'],
+ ['infusionsoft', 'Infusionsoft', '105', 'software', 'Gilbert', 'AZ', '1-Oct-07', '9000000', 'USD', 'a']]
+
+ data = 'permalink,company,numEmps,category,city,state,fundedDate,raisedAmt,raisedCurrency,round\rlifelock,LifeLock,,web,Tempe,AZ,1-May-07,6850000,USD,b\rlifelock,LifeLock,,web,Tempe,AZ,1-Oct-06,6000000,USD,a\rlifelock,LifeLock,,web,Tempe,AZ,1-Jan-08,25000000,USD,c\rmycityfaces,MyCityFaces,7,web,Scottsdale,AZ,1-Jan-08,50000,USD,seed\rflypaper,Flypaper,,web,Phoenix,AZ,1-Feb-08,3000000,USD,a\rinfusionsoft,Infusionsoft,105,software,Gilbert,AZ,1-Oct-07,9000000,USD,a'
+ tmp_data_file = self.create_file_with_data(data)
+
+ cmd = '../bin/q -d , -H -U "select permalink,company,numEmps,category,city,state,fundedDate,raisedAmt,raisedCurrency,round from %s"' % tmp_data_file.name
+ retcode, o, e = run_command(cmd)
+
+ self.assertEquals(retcode,0)
+ self.assertEquals(len(e), 0)
+ self.assertEquals(len(o), 6)
+
+ actual_output = map(lambda row: row.split(","),o)
+
+ self.assertEquals(actual_output,expected_output)
+
+ self.cleanup(tmp_data_file)
+
class SqlTests(AbstractQTestCase):