diff options
author | Harel Ben-Attia <harelba@gmail.com> | 2017-04-06 19:11:07 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-04-06 19:11:07 +0300 |
commit | f8bbceafb991f5580ea89266f298ae10865bcf42 (patch) | |
tree | d543b254177909dfe9b9550f0c5f16aa28b24188 | |
parent | cfe2d047810cc54821bc3832e2076d373c643d0a (diff) | |
parent | 94bae328b5e083a2e1fbe87b9d40062efd97971a (diff) |
Merge pull request #143 from harelba/v1.6.0-release-test1.6.2
V1.6.2 stuff
-rw-r--r-- | .gitignore | 2 | ||||
-rwxr-xr-x | bin/q | 53 | ||||
-rwxr-xr-x | build-deb-builder-container | 10 | ||||
-rwxr-xr-x | build-rpm-builder-container | 10 | ||||
-rwxr-xr-x | dist/create-rpm | 20 | ||||
-rw-r--r-- | dist/deb-builder-Dockerfile | 8 | ||||
-rw-r--r-- | dist/q-text-as-data.spec.template | 10 | ||||
-rw-r--r-- | dist/rpm-builder-Dockerfile | 12 | ||||
-rwxr-xr-x | package-release | 35 | ||||
-rwxr-xr-x | test/test-suite | 109 |
10 files changed, 234 insertions, 35 deletions
@@ -8,3 +8,5 @@ rpm_build_area setup.exe win_output win_build +packages +.idea/ @@ -27,7 +27,7 @@ # # Run with --help for command line details # -q_version = "1.6.0notreleasedyet" +q_version = "1.6.0" __all__ = [ 'QTextAsData' ] @@ -50,8 +50,6 @@ import uuid import cStringIO import math -csv.field_size_limit(sys.maxsize) - DEBUG = False def get_stdout_encoding(encoding_override=None): @@ -210,6 +208,14 @@ class CouldNotConvertStringToNumericValueException(Exception): def __str(self): return repr(self.msg) +class ColumnMaxLengthLimitExceededException(Exception): + + def __init__(self, msg): + self.msg = msg + + def __str(self): + return repr(self.msg) + class CouldNotParseInputException(Exception): def __init__(self, msg): @@ -636,6 +642,11 @@ def encoded_csv_reader(encoding, f, dialect, **kwargs): raise CouldNotConvertStringToNumericValueException(e.message) else: raise CouldNotParseInputException(str(e)) + except Exception,e: + if str(e).startswith("field larger than field limit"): + raise ColumnMaxLengthLimitExceededException(str(e)) + else: + raise def normalized_filename(filename): if filename == '-': @@ -671,9 +682,13 @@ class MaterializedFileState(object): except Exception,e: raise Exception('Tried to skip BOM for "utf-8-sig" encoding and failed. Error message is ' + str(e)) csv_reader = encoded_csv_reader(self.encoding, self.f, dialect=self.dialect) - for col_vals in csv_reader: - self.lines_read += 1 - yield col_vals + try: + for col_vals in csv_reader: + self.lines_read += 1 + yield col_vals + except ColumnMaxLengthLimitExceededException,e: + msg = "Column length is larger than the maximum. Offending file is '%s' - Line is %s, counting from 1 (encoding %s). The line number is the raw line number of the file, ignoring whether there's a header or not" % (self.filename,self.lines_read + 1,self.encoding) + raise ColumnMaxLengthLimitExceededException(msg) def close(self): if self.f != sys.stdin: @@ -1079,7 +1094,8 @@ class QInputParams(object): expected_column_count=None,keep_leading_whitespace_in_values=False, disable_double_double_quoting=False,disable_escaped_double_quoting=False, disable_column_type_detection=False, - input_quoting_mode='minimal',stdin_file=None,stdin_filename='-'): + input_quoting_mode='minimal',stdin_file=None,stdin_filename='-', + max_column_length_limit=131072): self.skip_header = skip_header self.delimiter = delimiter self.input_encoding = input_encoding @@ -1091,6 +1107,7 @@ class QInputParams(object): self.disable_escaped_double_quoting = disable_escaped_double_quoting self.input_quoting_mode = input_quoting_mode self.disable_column_type_detection = disable_column_type_detection + self.max_column_length_limit = max_column_length_limit def merged_with(self,input_params): params = QInputParams(**self.__dict__) @@ -1113,7 +1130,6 @@ class QTextAsData(object): # Create DB object self.db = Sqlite3DB() - input_quoting_modes = { 'minimal' : csv.QUOTE_MINIMAL, 'all' : csv.QUOTE_ALL, # nonnumeric is not supported for input quoting modes, since we determine the data types @@ -1149,6 +1165,8 @@ class QTextAsData(object): dialect_id = self.get_dialect_id(filename) csv.register_dialect(dialect_id, **q_dialect) + csv.field_size_limit(input_params.max_column_length_limit) + # Create a line splitter line_splitter = LineSplitter(input_params.delimiter, input_params.expected_column_count) @@ -1261,6 +1279,8 @@ class QTextAsData(object): error = QError(e,"Could not convert string to a numeric value. Did you use `-w nonnumeric` with unquoted string values? Error: %s" % e.msg,58) except CouldNotParseInputException,e: error = QError(e,"Could not parse the input. Please make sure to set the proper -w input-wrapping parameter for your input, and that you use the proper input encoding (-e). Error: %s" % e.msg,59) + except ColumnMaxLengthLimitExceededException,e: + error = QError(e,e.msg,31) except KeyboardInterrupt,e: warnings.append(QWarning(e,"Interrupted")) except Exception, e: @@ -1530,6 +1550,8 @@ def run_standalone(): #----------------------------------------------- parser.add_option("-v", "--version", dest="version", default=False, action="store_true", help="Print version") + parser.add_option("-V", "--verbose", dest="verbose", default=False, action="store_true", + help="Print debug info in case of problems") #----------------------------------------------- input_data_option_group = OptionGroup(parser,"Input Data Options") input_data_option_group.add_option("-H", "--skip-header", dest="skip_header", default=default_skip_header, action="store_true", @@ -1558,6 +1580,8 @@ def run_standalone(): help="Don't detect column types - All columns will be treated as text columns") input_data_option_group.add_option("-w","--input-quoting-mode",dest="input_quoting_mode",default="minimal", help="Input quoting mode. Possible values are all, minimal and none. Note the slightly misleading parameter name, and see the matching -W parameter for output quoting.") + input_data_option_group.add_option("-M","--max-column-length-limit",dest="max_column_length_limit",default=131072, + help="Sets the maximum column length.") parser.add_option_group(input_data_option_group) #----------------------------------------------- output_data_option_group = OptionGroup(parser,"Output Options") @@ -1681,6 +1705,14 @@ def run_standalone(): # (since no input delimiter means any whitespace) options.output_delimiter = " " + try: + max_column_length_limit = int(options.max_column_length_limit) + if max_column_length_limit < 1: + raise Exception() + except: + print >> sys.stderr, "Max column length limit must be a positive integer (%s)" % max_column_length_limit + sys.exit(31) + default_input_params = QInputParams(skip_header=options.skip_header, delimiter=options.delimiter, input_encoding=options.encoding, @@ -1691,7 +1723,8 @@ def run_standalone(): disable_double_double_quoting=options.disable_double_double_quoting, disable_escaped_double_quoting=options.disable_escaped_double_quoting, input_quoting_mode=options.input_quoting_mode, - disable_column_type_detection=options.disable_column_type_detection) + disable_column_type_detection=options.disable_column_type_detection, + max_column_length_limit=max_column_length_limit) q_engine = QTextAsData(default_input_params=default_input_params) output_params = QOutputParams( @@ -1700,7 +1733,7 @@ def run_standalone(): output_quoting_mode=options.output_quoting_mode, formatting=options.formatting, output_header=options.output_header) - q_output_printer = QOutputPrinter(output_params,show_tracebacks=DEBUG) + q_output_printer = QOutputPrinter(output_params,show_tracebacks=options.verbose) for query_str in query_strs: if options.analyze_only: diff --git a/build-deb-builder-container b/build-deb-builder-container new file mode 100755 index 0000000..abd021a --- /dev/null +++ b/build-deb-builder-container @@ -0,0 +1,10 @@ +#!/bin/bash + +if [ $# -ne 1 ]; +then + echo "Usage: $(basename $0) <version-tag>" + exit 1 +fi +VERSION_TAG="$1" + +docker build -f dist/deb-builder-Dockerfile -t q-text-as-data-deb-builder:${VERSION_TAG} . diff --git a/build-rpm-builder-container b/build-rpm-builder-container new file mode 100755 index 0000000..4788f19 --- /dev/null +++ b/build-rpm-builder-container @@ -0,0 +1,10 @@ +#!/bin/bash + +if [ $# -ne 1 ]; +then + echo "Usage: $(basename $0) <version-tag>" + exit 1 +fi +VERSION_TAG="$1" + +docker build -f dist/rpm-builder-Dockerfile -t q-text-as-data-rpm-builder:${VERSION_TAG} . diff --git a/dist/create-rpm b/dist/create-rpm index a841db5..7d86661 100755 --- a/dist/create-rpm +++ b/dist/create-rpm @@ -5,9 +5,9 @@ # # -if [ $# -ne 2 ]; +if [ $# -ne 1 ]; then - echo 'create-rpm <commit-hash> <version>' + echo 'create-rpm <version>' exit 1 fi @@ -26,9 +26,7 @@ mkdir -p ${rpm_build_area}/{SOURCES,SPECS,BUILD,RPMS,SRPMS,BUILDROOT} echo RPM build area is in ${rpm_build_area} -COMMIT_HASH=$1 -SHORT_HASH=${COMMIT_HASH:0:7} -VERSION=$2 +VERSION=$1 REAL_PACKAGE_NAME=q RPM_PACKAGE_NAME=q-text-as-data @@ -40,11 +38,15 @@ then exit 1 fi -rm -vf ${rpm_build_area}/SOURCES/q-${COMMIT_HASH}.tar.gz - -curl -o ${rpm_build_area}/SOURCES/q-${COMMIT_HASH}.tar.gz -L -R "https://github.com/harelba/q/tarball/${COMMIT_HASH}" +curl -o ${rpm_build_area}/SOURCES/q.tar.gz -L -R "https://github.com/harelba/q/tarball/${VERSION}" +mkdir -p ${rpm_build_area}/SOURCES +pushd ${rpm_build_area}/SOURCES >/dev/null +tar xvzf ./q.tar.gz --strip-components=1 +rm -vf ./q.tar.gz +popd >/dev/null +find ${rpm_build_area}/ -ls -cat ${RPM_PACKAGE_NAME}.spec.template | sed "s/VERSION_PLACEHOLDER/$VERSION/g" | sed "s/COMMIT_HASH_PLACEHOLDER/${COMMIT_HASH}/g" | sed "s/SHORT_HASH_PLACEHOLDER/${SHORT_HASH}/g" > ${rpm_build_area}/SPECS/${RPM_PACKAGE_NAME}.spec +cat ${RPM_PACKAGE_NAME}.spec.template | sed "s/VERSION_PLACEHOLDER/$VERSION/g" > ${rpm_build_area}/SPECS/${RPM_PACKAGE_NAME}.spec rpmbuild -v --define "_topdir ${rpm_build_area}" -ba ${rpm_build_area}/SPECS/${RPM_PACKAGE_NAME}.spec diff --git a/dist/deb-builder-Dockerfile b/dist/deb-builder-Dockerfile new file mode 100644 index 0000000..7ff7b08 --- /dev/null +++ b/dist/deb-builder-Dockerfile @@ -0,0 +1,8 @@ + +FROM ubuntu:12.04 + +RUN apt-get update && apt-get install -y alien + +ENTRYPOINT "/bin/bash" + + diff --git a/dist/q-text-as-data.spec.template b/dist/q-text-as-data.spec.template index 4f271e6..ad3d0c2 100644 --- a/dist/q-text-as-data.spec.template +++ b/dist/q-text-as-data.spec.template @@ -10,7 +10,6 @@ Summary: q - Text as Data Group: Applications/Text License: GPLv3 URL: https://github.com/harelba/q -Source: q-COMMIT_HASH_PLACEHOLDER.tar.gz BuildArch: noarch %description @@ -18,10 +17,12 @@ q allows to perform SQL-like statements on tabular text data. %prep -%setup -qn harelba-q-SHORT_HASH_PLACEHOLDER +cd %{_topdir}/BUILD +cp -vrf %{_topdir}/SOURCES/* %{_topdir}/BUILD/ +chmod -Rf a+rX,u+w,g-w,o-w %{_topdir}/BUILD/ %build -ls -ltr +cd %{_topdir}/BUILD ronn doc/USAGE.markdown %install @@ -43,6 +44,9 @@ gzip ${RPM_BUILD_ROOT}%{_mandir}/man1/q.1 %doc %_mandir/man1/q.1.gz %changelog +*Wed Apr 05 2017 Harel Ben-Attia <harelba@gmail.com> 1.6.0-1 +- Moved RPM building to be dockerized +- Removed the need for providing commit hashes *Fri Dec 12 2014 Harel Ben-Attia <harelba@gmail.com> 1.5.0-1 - Moved stuff from create-rpm script into the rpm spec itself *Sat Jun 14 2014 Harel Ben-Attia <harelba@gmail.com> 1.4.0-1 diff --git a/dist/rpm-builder-Dockerfile b/dist/rpm-builder-Dockerfile new file mode 100644 index 0000000..dafcd1e --- /dev/null +++ b/dist/rpm-builder-Dockerfile @@ -0,0 +1,12 @@ + +FROM centos:centos6 + +RUN yum install -y which curl gcc make rpm rpm-build + +RUN curl -sSL https://get.rvm.io | bash + +RUN /bin/bash -l -c "rvm install 2.4.1" && /bin/bash -l -c "gem install ronn" + +ENTRYPOINT "/bin/bash" + + diff --git a/package-release b/package-release new file mode 100755 index 0000000..0bbe604 --- /dev/null +++ b/package-release @@ -0,0 +1,35 @@ +#!/bin/bash + +set -e + +base_folder=$(dirname $0) +pushd ${base_folder} >/dev/null + +if [ $# -ne 1 ]; +then + echo "Usage: $(dirname $0) <git-tag>" + echo + echo "Note that the git tag must be pushed to github before doing this." + exit 1 +fi +TAG="$1" + +d=`pwd` +cid1=`docker run -i -d -v ${d}:/q q-text-as-data-rpm-builder:0.1` +cid2=`docker run -i -d -v ${d}:/q q-text-as-data-deb-builder:0.1` + +function kill_container { + tmp=`docker kill ${cid1} ${cid2}` +} +trap kill_container EXIT + +rm -rvf ${base_folder}/packages +mkdir -p ${base_folder}/packages + +sleep 1 +docker exec -it ${cid1} /bin/bash -i -c "/q/dist/create-rpm ${TAG}" + +docker cp ${cid1}:/q/dist/rpm_build_area/RPMS/noarch/q-text-as-data-${TAG}-1.el6.noarch.rpm ${base_folder}/packages/q-text-as-data-${TAG}-1.noarch.rpm + +docker exec -it ${cid2} /bin/bash -i -c "cd /q/packages && alien ./q-text-as-data-${TAG}-1.noarch.rpm" + diff --git a/test/test-suite b/test/test-suite index 717e9c8..a0f6fcc 100755 --- a/test/test-suite +++ b/test/test-suite @@ -801,7 +801,7 @@ class BasicTests(AbstractQTestCase): def test_non_quoted_values_in_quoted_data(self): tmp_data_file = self.create_file_with_data(sample_quoted_data) - + cmd = '../bin/q -d " " "select c1 from %s"' % tmp_data_file.name retcode, o, e = run_command(cmd) @@ -819,7 +819,7 @@ class BasicTests(AbstractQTestCase): def test_regular_quoted_values_in_quoted_data(self): tmp_data_file = self.create_file_with_data(sample_quoted_data) - + cmd = '../bin/q -d " " "select c2 from %s"' % tmp_data_file.name retcode, o, e = run_command(cmd) @@ -836,7 +836,7 @@ class BasicTests(AbstractQTestCase): def test_double_double_quoted_values_in_quoted_data(self): tmp_data_file = self.create_file_with_data(sample_quoted_data) - + cmd = '../bin/q -d " " "select c3 from %s"' % tmp_data_file.name retcode, o, e = run_command(cmd) @@ -853,7 +853,7 @@ class BasicTests(AbstractQTestCase): def test_escaped_double_quoted_values_in_quoted_data(self): tmp_data_file = self.create_file_with_data(sample_quoted_data) - + cmd = '../bin/q -d " " "select c4 from %s"' % tmp_data_file.name retcode, o, e = run_command(cmd) @@ -880,7 +880,7 @@ class BasicTests(AbstractQTestCase): self.assertEquals(o[0],'"quoted,data",23') self.assertEquals(o[1],'unquoted-data,54,') - + self.cleanup(tmp_data_file) def test_none_input_quoting_mode_in_strict_mode(self): @@ -1061,7 +1061,7 @@ class BasicTests(AbstractQTestCase): self.cleanup(tmp_data_file) def test_input_field_quoting_and_data_types_with_encoding(self): - # Checks combination of minimal input field quoting, with special characters that need to be decoded - + # Checks combination of minimal input field quoting, with special characters that need to be decoded - # Both content and proper data types are verified data = '111,22.22,"testing text with special characters - citt\xc3\xa0 ",http://somekindofurl.com,12.13.14.15,12.1\n' tmp_data_file = self.create_file_with_data(data,encoding='none') @@ -1094,7 +1094,7 @@ class BasicTests(AbstractQTestCase): def test_multiline_double_double_quoted_values_in_quoted_data(self): tmp_data_file = self.create_file_with_data(sample_quoted_data) - + # FIXME Need to convert \0a to proper encoding suitable for the person running the tests. cmd = '../bin/q -d " " "select replace(c5,X\'0A\',\'::\') from %s"' % tmp_data_file.name retcode, o, e = run_command(cmd) @@ -1112,7 +1112,7 @@ class BasicTests(AbstractQTestCase): def test_multiline_escaped_double_quoted_values_in_quoted_data(self): tmp_data_file = self.create_file_with_data(sample_quoted_data) - + # FIXME Need to convert \0a to proper encoding suitable for the person running the tests. cmd = '../bin/q -d " " "select replace(c6,X\'0A\',\'::\') from %s"' % tmp_data_file.name retcode, o, e = run_command(cmd) @@ -1129,11 +1129,11 @@ class BasicTests(AbstractQTestCase): self.cleanup(tmp_data_file) def test_disable_double_double_quoted_data_flag__values(self): - # This test (and flag) is meant to verify backward comptibility only. It is possible that + # This test (and flag) is meant to verify backward comptibility only. It is possible that # this flag will be removed completely in the future tmp_data_file = self.create_file_with_data(double_double_quoted_data) - + cmd = '../bin/q -d " " --disable-double-double-quoting "select c2 from %s" -W none' % tmp_data_file.name retcode, o, e = run_command(cmd) @@ -1167,11 +1167,11 @@ class BasicTests(AbstractQTestCase): self.cleanup(tmp_data_file) def test_disable_escaped_double_quoted_data_flag__values(self): - # This test (and flag) is meant to verify backward comptibility only. It is possible that + # This test (and flag) is meant to verify backward comptibility only. It is possible that # this flag will be removed completely in the future tmp_data_file = self.create_file_with_data(escaped_double_quoted_data) - + cmd = '../bin/q -d " " --disable-escaped-double-quoting "select c2 from %s" -W none' % tmp_data_file.name retcode, o, e = run_command(cmd) @@ -1205,7 +1205,7 @@ class BasicTests(AbstractQTestCase): self.cleanup(tmp_data_file) def test_combined_quoted_data_flags__number_of_columns_detected(self): - # This test (and flags) is meant to verify backward comptibility only. It is possible that + # This test (and flags) is meant to verify backward comptibility only. It is possible that # these flags will be removed completely in the future tmp_data_file = self.create_file_with_data(combined_quoted_data) @@ -1258,6 +1258,86 @@ class BasicTests(AbstractQTestCase): self.assertEquals(e[0],"No files matching 'non-existent-file' have been found") + def test_default_column_max_length_parameter__short_enough(self): + huge_text = "x" * 131000 + + file_data = "a,b,c\n1,%s,3\n" % huge_text + + tmpfile = self.create_file_with_data(file_data) + + cmd = '../bin/q -H -d , "select a from %s"' % tmpfile.name + retcode, o, e = run_command(cmd) + + self.assertEquals(retcode, 0) + self.assertEquals(len(o), 1) + self.assertEquals(len(e), 0) + + self.assertEquals(o[0],'1') + + self.cleanup(tmpfile) + + def test_default_column_max_length_parameter__too_long(self): + huge_text = "x" * 132000 + + file_data = "a,b,c\n1,%s,3\n" % huge_text + + tmpfile = self.create_file_with_data(file_data) + + cmd = '../bin/q -H -d , "select a from %s"' % tmpfile.name + retcode, o, e = run_command(cmd) + + self.assertEquals(retcode, 31) + self.assertEquals(len(o), 0) + self.assertEquals(len(e), 1) + + self.assertTrue(e[0].startswith("Column length is larger than the maximum")) + self.assertTrue(("Offending file is '%s'" % tmpfile.name) in e[0]) + self.assertTrue('Line is 2' in e[0]) + + self.cleanup(tmpfile) + + def test_column_max_length_parameter(self): + file_data = "a,b,c\nvery-long-text,2,3\n" + tmpfile = self.create_file_with_data(file_data) + + cmd = '../bin/q -H -d , -M 3 "select a from %s"' % tmpfile.name + retcode, o, e = run_command(cmd) + + self.assertEquals(retcode, 31) + self.assertEquals(len(o), 0) + self.assertEquals(len(e), 1) + + self.assertTrue(e[0].startswith("Column length is larger than the maximum")) + self.assertTrue(("Offending file is '%s'" % tmpfile.name) in e[0]) + self.assertTrue('Line is 2' in e[0]) + + cmd2 = '../bin/q -H -d , -M 300 -H "select a from %s"' % tmpfile.name + retcode2, o2, e2 = run_command(cmd2) + + self.assertEquals(retcode2, 0) + self.assertEquals(len(o2), 1) + self.assertEquals(len(e2), 0) + + self.assertEquals(o2[0],'very-long-text') + + self.cleanup(tmpfile) + + def test_invalid_column_max_length_parameter(self): + file_data = "a,b,c\nvery-long-text,2,3\n" + tmpfile = self.create_file_with_data(file_data) + + cmd = '../bin/q -H -d , -M 0 "select a from %s"' % tmpfile.name + retcode, o, e = run_command(cmd) + + self.assertEquals(retcode, 31) + self.assertEquals(len(o), 0) + self.assertEquals(len(e), 1) + + self.assertTrue(e[0].startswith('Max column length limit must be a positive integer')) + + + self.cleanup(tmpfile) + class ParsingModeTests(AbstractQTestCase): def test_strict_mode_column_count_mismatch_error(self): @@ -1580,6 +1660,7 @@ class ParsingModeTests(AbstractQTestCase): self.cleanup(tmpfile) + class FormattingTests(AbstractQTestCase): def test_column_formatting(self): @@ -1607,6 +1688,7 @@ class FormattingTests(AbstractQTestCase): self.assertEquals(o[1], '55.000 5.500') + class SqlTests(AbstractQTestCase): def test_find_example(self): @@ -1761,6 +1843,7 @@ class SqlTests(AbstractQTestCase): self.cleanup(tmpfile) + class BasicModuleTests(AbstractQTestCase): def test_simple_query(self): |