From c14ac80e07cf8c433cf74e026b6ed5e030afb155 Mon Sep 17 00:00:00 2001 From: Harel Ben-Attia Date: Fri, 26 Nov 2021 15:13:09 +0200 Subject: Release 3.1.3 (#286) Some minor fixes from beta. Also removed the one-column warning. Still with manual version bump - Hopefully all works well in terms of the package versioning. I'll automate the version bumps later on. --- .github/workflows/build-and-package.yaml | 44 ++-- .github/workflows/q.rb.brew-formula-template | 3 +- QSQL-NOTES.md | 73 +----- README.markdown | 76 +++--- benchmark-config.sh | 2 +- bin/q.py | 11 +- mkdocs/docs/about.md | 2 +- mkdocs/docs/index.md | 367 +++++++++++++++++++++------ pyoxidizer.bzl | 2 +- setup.py | 2 +- test/test_suite.py | 67 +++-- 11 files changed, 424 insertions(+), 225 deletions(-) diff --git a/.github/workflows/build-and-package.yaml b/.github/workflows/build-and-package.yaml index b8cc4b0..08970ee 100644 --- a/.github/workflows/build-and-package.yaml +++ b/.github/workflows/build-and-package.yaml @@ -139,12 +139,12 @@ jobs: gem install fpm cp dist/fpm-config ~/.fpm - fpm -s dir -t deb --deb-use-file-permissions -p packages/linux/q-text-as-data-3.1.1-beta-1.x86_64.deb --version 3.1.1-beta ./linux-q=/usr/bin/q USAGE.gz=/usr/share/man/man1/q.1.gz + fpm -s dir -t deb --deb-use-file-permissions -p packages/linux/q-text-as-data-3.1.3-1.x86_64.deb --version 3.1.3 ./linux-q=/usr/bin/q USAGE.gz=/usr/share/man/man1/q.1.gz - name: Upload DEB Package uses: actions/upload-artifact@v1.0.0 with: - name: q-text-as-data-3.1.1-beta-1.x86_64.deb - path: packages/linux/q-text-as-data-3.1.1-beta-1.x86_64.deb + name: q-text-as-data-3.1.3-1.x86_64.deb + path: packages/linux/q-text-as-data-3.1.3-1.x86_64.deb test-deb-packaging: runs-on: ubuntu-18.04 @@ -155,7 +155,7 @@ jobs: - name: Download DEB uses: actions/download-artifact@v2 with: - name: q-text-as-data-3.1.1-beta-1.x86_64.deb + name: q-text-as-data-3.1.3-1.x86_64.deb - name: Install Python for Testing uses: actions/setup-python@v2 with: @@ -167,7 
+167,7 @@ jobs: pip3 install -r test-requirements.txt - name: Test DEB Package Installation - run: ./dist/test-using-deb.sh ./q-text-as-data-3.1.1-beta-1.x86_64.deb + run: ./dist/test-using-deb.sh ./q-text-as-data-3.1.3-1.x86_64.deb package-linux-rpm: needs: [test-linux, create-man] @@ -199,12 +199,12 @@ jobs: gem install fpm cp dist/fpm-config ~/.fpm - fpm -s dir -t rpm --rpm-use-file-permissions -p packages/linux/q-text-as-data-3.1.1-beta.x86_64.rpm --version 3.1.1-beta ./linux-q=/usr/bin/q USAGE.gz=/usr/share/man/man1/q.1.gz + fpm -s dir -t rpm --rpm-use-file-permissions -p packages/linux/q-text-as-data-3.1.3.x86_64.rpm --version 3.1.3 ./linux-q=/usr/bin/q USAGE.gz=/usr/share/man/man1/q.1.gz - name: Upload RPM Package uses: actions/upload-artifact@v1.0.0 with: - name: q-text-as-data-3.1.1-beta.x86_64.rpm - path: packages/linux/q-text-as-data-3.1.1-beta.x86_64.rpm + name: q-text-as-data-3.1.3.x86_64.rpm + path: packages/linux/q-text-as-data-3.1.3.x86_64.rpm test-rpm-packaging: runs-on: ubuntu-18.04 @@ -215,9 +215,9 @@ jobs: - name: Download RPM uses: actions/download-artifact@v2 with: - name: q-text-as-data-3.1.1-beta.x86_64.rpm + name: q-text-as-data-3.1.3.x86_64.rpm - name: Retest using RPM - run: ./dist/test-using-rpm.sh ./q-text-as-data-3.1.1-beta.x86_64.rpm + run: ./dist/test-using-rpm.sh ./q-text-as-data-3.1.3.x86_64.rpm build-mac: runs-on: macos-11 @@ -308,7 +308,7 @@ jobs: export BRANCH_NAME=master # TODO temp, since template rendering action doesn't work in mac - cat .github/workflows/q.rb.brew-formula-template | sed 's/{{ .Q_VERSION }}/3.1.1-beta/g' | sed "s/{{ .Q_BRANCH_NAME }}/${BRANCH_NAME}/g" > ./brew/q.rb + cat .github/workflows/q.rb.brew-formula-template | sed 's/{{ .Q_VERSION }}/3.1.3/g' | sed "s/{{ .Q_BRANCH_NAME }}/${BRANCH_NAME}/g" > ./brew/q.rb echo "Resulting formula:" cat ./brew/q.rb @@ -322,8 +322,8 @@ jobs: - name: Upload Executable uses: actions/upload-artifact@v1.0.0 with: - name: q--3.1.1-beta_1.big_sur.bottle.tar.gz - path: 
./q--3.1.1-beta_1.big_sur.bottle.tar.gz + name: q--3.1.3_1.big_sur.bottle.tar.gz + path: ./q--3.1.3_1.big_sur.bottle.tar.gz # TODO auto-create PR to main homebrew-core # git clone https://github.com/harelba/homebrew-core.git @@ -340,7 +340,7 @@ jobs: - name: Download q bottle uses: actions/download-artifact@v2 with: - name: q--3.1.1-beta_1.big_sur.bottle.tar.gz + name: q--3.1.3_1.big_sur.bottle.tar.gz - name: Test the created bottle run: | set -x -e @@ -349,7 +349,7 @@ jobs: WD=$(pwd) pushd /usr/local/Cellar - tar xvfz ${WD}/q--3.1.1-beta_1.big_sur.bottle.tar.gz + tar xvfz ${WD}/q--3.1.3_1.big_sur.bottle.tar.gz popd brew link q @@ -459,17 +459,17 @@ jobs: # TODO Windows versions do not support the -beta postfix - export Q_MSI=./build/x86_64-pc-windows-msvc/release/msi_installer/q-text-as-data-3.1.1.msi + export Q_MSI=./build/x86_64-pc-windows-msvc/release/msi_installer/q-text-as-data-3.1.3.msi chmod 755 $Q_MSI mkdir -p packages/windows/ - cp $Q_MSI packages/windows/q-text-as-data-3.1.1.msi + cp $Q_MSI packages/windows/q-text-as-data-3.1.3.msi - name: Upload Windows MSI uses: actions/upload-artifact@v1.0.0 with: - name: q-text-as-data-3.1.1.msi - path: packages/windows/q-text-as-data-3.1.1.msi + name: q-text-as-data-3.1.3.msi + path: packages/windows/q-text-as-data-3.1.3.msi test-windows-packaging: needs: package-windows @@ -480,12 +480,12 @@ jobs: - name: Download Windows Package uses: actions/download-artifact@v2 with: - name: q-text-as-data-3.1.1.msi + name: q-text-as-data-3.1.3.msi - name: Test Install of MSI continue-on-error: true shell: powershell run: | - $process = Start-Process msiexec.exe -ArgumentList "/i q-text-as-data-3.1.1.msi -l* msi-install.log /norestart /quiet" -PassThru -Wait + $process = Start-Process msiexec.exe -ArgumentList "/i q-text-as-data-3.1.3.msi -l* msi-install.log /norestart /quiet" -PassThru -Wait $process.ExitCode gc msi-install.log @@ -494,7 +494,7 @@ jobs: continue-on-error: true shell: powershell run: | - $process = Start-Process 
msiexec.exe -ArgumentList "/u q-text-as-data-3.1.1.msi /norestart /quiet" -PassThru -Wait + $process = Start-Process msiexec.exe -ArgumentList "/u q-text-as-data-3.1.3.msi /norestart /quiet" -PassThru -Wait $process.ExitCode exit $process.ExitCode diff --git a/.github/workflows/q.rb.brew-formula-template b/.github/workflows/q.rb.brew-formula-template index 15b77d0..c860a0c 100644 --- a/.github/workflows/q.rb.brew-formula-template +++ b/.github/workflows/q.rb.brew-formula-template @@ -5,8 +5,7 @@ class Q < Formula desc "Run SQL directly on CSV or TSV files" homepage "https://harelba.github.io/q/" # Using branch name for pre-releases, for tagged releases this would be the version tag, and not "version" part will be needed - url "https://github.com/harelba/q/archive/{{ .Q_BRANCH_NAME }}.tar.gz" - version "{{ .Q_VERSION }}" + url "https://github.com/harelba/q/archive/3.1.3.tar.gz" # Removed for now, until everything is finalized # sha256 "0844aed6658d0347a299b84bee978c88724d45093e8cbd7b05506ecc0b93c98c" diff --git a/QSQL-NOTES.md b/QSQL-NOTES.md index dc67d8a..5f61f83 100644 --- a/QSQL-NOTES.md +++ b/QSQL-NOTES.md @@ -1,57 +1,22 @@ -# New beta version 3.1.1-beta is available -Installation instructions [at the end of this document](QSQL-NOTES.md#installation-of-the-new-beta-release) - -Contains a lot of major changes, see sections below for details. - -## Basic Example of using the caching -```bash -# Prepare some data -$ seq 1 1000000 > myfile.csv - -# read from the resulting file (-c 1 just prevents the warning of having one column only) -$ time q -c 1 "select sum(c1),count(*) from myfile.csv" -500000500000 1000000 -q -c 1 "select sum(c1),count(*) from myfile.csv" 4.02s user 0.06s system 99% cpu 4.108 total - -# Running with `-C readwrite` auto-creates a cache file if there is none. The cache filename would be myfile.csv.qsql. 
The query runs as usual -$ time q -c 1 "select sum(c1),count(*) from myfile.csv" -C readwrite -time q -c 1 "select sum(c1),count(*) from myfile.csv" -C readwrite -500000500000 1000000 -q -c 1 "select sum(c1),count(*) from myfile.csv" -C readwrite 3.96s user 0.08s system 99% cpu 4.057 total - -# Now run with `-C read`. The query will run from the cache file and not the original. As the file gets bigger, the difference will be much more noticable -$ time q -c 1 "select sum(c1),count(*) from myfile.csv" -C read -500000500000 1000000 -q -c 1 "select sum(c1),count(*) from myfile.csv" -C read 0.17s user 0.05s system 94% cpu 0.229 total - -# Now let's try another query on that file. Notice the short query duration. The cache is being used for any query that uses this file, and queries on multiple files that contain caches will reuse the cache as well. -$ time q -c 1 "select avg(c1) from myfile.csv" -C read -500000.5 -q -c 1 "select avg(c1) from myfile.csv" -C read 0.16s user 0.05s system 99% cpu 0.217 total - -# You can also query the qsql file directly -$ time q -c 1 "select sum(c1),count(*) from myfile.csv.qsql" -500000500000 1000000 -q -c 1 "select sum(c1),count(*) from myfile.csv.qsql" 0.17s user 0.05s system 95% cpu 0.226 total - -# Now let's delete the original csv file -$ rm -vf myfile.csv - -# Running another query directly on the qsql file just works -$ q -c 1 "select sum(c1),count(*) from myfile.csv.qsql" -500000500000 1000000 -q -c 1 "select sum(c1),count(*) from myfile.csv.qsql" 0.17s user 0.04s system 94% cpu 0.226 total - -# See the `.qrc` section below if you want to set the default `-C` (`--caching-mode`) to something other than `none` (the default) -``` +## Major changes and additions in the new `3.x` version +This is the list of new/changed functionality in this version. Large changes, please make sure to read the details if you're already using q. 
+ +* **Automatic Immutable Caching** - Automatic caching of data files (into `.qsql` files), with huge speedups for medium/large files. Enabled through `-C readwrite` or `-C read` +* **Direct querying of standard sqlite databases** - Just use it as a table name in the query. Format is `select ... from :::`, or just `` if the database contains only one table. Multiple separate sqlite databases are fully supported in the same query. +* **Direct querying of the `qsql` cache files** - The user can query directly from the `qsql` files, removing the need for the original files. Just use `select ... from .qsql`. Please wait until the non-beta version is out before thinking about deleting any of your original files... +* **Revamped `.qrc` mechanism** - allows opting-in to caching without specifying it in every query. By default, caching is **disabled**, for backward compatibility and for finding usability issues. +* **Save-to-db is now reusable for queries** - `--save-db-to-disk` option (`-S`) has been enhanced to match the new capabilities. You can query the resulting file directly through q, using the method mentioned above (it's just a standard sqlite database). +* **Only python3 is supported from now on** - Shouldn't be an issue, since q is a self-contained binary executable which has its own python embedded in it. Internally, q is now packaged with Python 3.8. After everything cools down, I'll probably bump this to 3.9/3.10. +* **Minimal Linux Version Bumped** - Works with CentOS 8, Ubuntu 18.04+, Debian 10+. Currently only for x86_64. Depends on glibc version 2.25+. Haven't tested it on other architectures. Issuing other architectures will be possible later on +* **Completely revamped binary packaging** - Using [pyoxidizer](https://github.com/indygreg/PyOxidizer) -The following sections provide the details of each of the new functionality in this major version. +The following sections provide the details of each of the new functionalities in this major version. 
## Automatic caching of data files Speeding up subsequent reads from the same file by several orders of magnitude by automatically creating an immutable cache file for each tabular text file. -For example, reading a 0.9GB file with 1M rows and 100 columns without caching takes ~50 seconds. When the cache exists, querying the same file will take less than 1 second. Obviously, the cache can be used in order to perform any query and not just the original query that was used for creating the cache. +For example, reading a 0.9GB file with 1M rows and 100 columns without caching takes ~50 seconds. When the cache exists, querying the same file will take around ~1-2 seconds. Obviously, the cache can be used in order to perform any query and not just the original query that was used for creating the cache. When caching is enabled, the cache is created on the first read of a file, and used automatically when reading it in other queries. A separate cache is being created for each file that is being used, allowing reuse in multiple use-cases. For example, if two csv files each have their own cache file from previous queries, then running a query that JOINs these two files would use the caches as well (without loading the data into memory), speeding it up considerably. @@ -139,15 +104,3 @@ Removed the dual py2/py3 support. Since q is packaged as a self-contained execut Users which for some reason still use q's main source code file directly and use python 2 would need to stay with the latest 2.0.19 release. In some next version, q's code structure is going to change significantly anyway in order to become a standard python module, so using the main source code file directly would not be possible. If you are such a user, and this decision hurts you considerably, please ping me. - - -# Installation of the new beta release -For now, only Linux RPM, DEB, Mac OSX and Windows are supported. 
Packages for additional Linux Distros will be added later (it should be rather easy now, due to the use of `fpm`). - -The beta OSX version is not in `brew` yet, you'll need to take the `macos-q` executable, put it in your filesystem and `chmod +x` it. - -Note: For some reason showing the q manual (`man q`) does not work for Debian, even though it's packaged in the DEB file. I'll get around to fixing it later. If you have any thoughts about this, please drop me a line. - -Download the relevant files directly from [The Beta Release Assets](https://github.com/harelba/q/releases/tag/v3.1.1-beta). - - diff --git a/README.markdown b/README.markdown index b0f20d2..aacdfb6 100644 --- a/README.markdown +++ b/README.markdown @@ -1,66 +1,50 @@ -[![Build Status](https://travis-ci.org/harelba/q.svg?branch=master)](https://travis-ci.org/harelba/q) +[![Build and Package](https://github.com/harelba/q/workflows/BuildAndPackage/badge.svg?branch=master)](https://github.com/harelba/q/actions?query=branch%3Amaster) -# q - Text as Data -q is a command line tool that allows direct execution of SQL-like queries on CSVs/TSVs (and any other tabular text files). - -q treats ordinary files as database tables, and supports all SQL constructs, such as `WHERE`, `GROUP BY`, `JOIN`s, etc. It supports automatic column name and type detection, and q provides full support for multiple character encodings. - -q's web site is [http://harelba.github.io/q/](http://harelba.github.io/q/) or [https://q.textasdata.wiki](https://q.textasdata.wiki) It contains everything you need to download and use q immediately. - -## New beta version `3.1.1-beta` is available -Full Details [here](QSQL-NOTES.md), and an example of the caching is in [here](QSQL-NOTES.md#basic-example-of-using-the-caching). - -This is the list of new/changed functionality in this version, large changes, please make sure to read it and the details link as well. 
- -* **Automatic Immutable Caching** - Automatic caching of data files (into `.qsql` files), with huge speedups for medium/large files. Enabled through `-C readwrite` or `-C read` -* **Direct querying of standard sqlite databases** - Just use it as a table name in the query. Format is `select ... from :::`, or just `` if the database contains only one table. Multiple separate sqlite databases are fully supported in the same query. -* **Direct querying of the `qsql` cache files** - The user can query directly from the `qsql` files, removing the need for the original files. Just use `select ... from .qsql`. Please wait until the non-beta version is out before thinking about deleting any of your original files... -* **Revamped `.qrc` mechanism** - allows opting-in to caching without specifying it in every query. By default, caching is **disabled**, for backward compatibility and for finding usability issues. -* **Save-to-db is now reusable for queries** - `--save-db-to-disk` option (`-S`) has been enhanced to match the new capabilities. You can query the resulting file directly through q, using the method mentioned above (it's just a standard sqlite database). -* **Only python3 is supported from now on** - Shouldn't be an issue, since q is a self-contained binary executable which has its own python embedded in it. Internally, q is now packaged with Python 3.8. After everything cools down, I'll probably bump this to 3.9/3.10. -* **Minimal Linux Version Bumped** - Works with CentOS 8, Ubuntu 18.04+, Debian 10+. Currently only for x86_64. Depends on glibc version 2.25+. Haven't tested it on other architectures. 
Issuing other architectures will be possible later on -Full details on the changes and the new usage is in [here](QSQL-NOTES.md) - -The version is still in early testing, for two reasons: - -* Completely new build and packaging flow - Using [pyoxidizer](https://github.com/indygreg/PyOxidizer) -* It's a very large change in functionality, which might surface issues, new and backward compatibility ones +# q - Text as Data +q's purpose is to bring SQL expressive power to the Linux command line and to provide easy access to text as actual data. -**Please don't use it for production, until the final non-beta version is out** +q allows the following: -If you're testing it out, I'd be more than happy to get any feedback. Please write all your feedback in [this issue](https://github.com/harelba/q/issues/281), instead of opening separate issues. That would really help me with managing this. +* Performing SQL-like statements directly on tabular text data, auto-caching the data in order to accelerate additional querying on the same file. +* Performing SQL statements directly on multi-file sqlite3 databases, without having to merge them or load them into memory -## Installation. -**This will currently install the latest standard version `2.0.19`. See below if you want to download the `3.1.1-beta` version** +The following table shows the impact of using caching: -The current production version `2.0.19` installation is extremely simple. 
+| Rows | Columns | File Size | Query time without caching | Query time with caching | Speed Improvement | +|:---------:|:-------:|:---------:|:--------------------------:|:-----------------------:|:-----------------:| +| 5,000,000 | 100 | 4.8GB | 4 minutes, 47 seconds | 1.92 seconds | x149 | +| 1,000,000 | 100 | 983MB | 50.9 seconds | 0.461 seconds | x110 | +| 1,000,000 | 50 | 477MB | 27.1 seconds | 0.272 seconds | x99 | +| 100,000 | 100 | 99MB | 5.2 seconds | 0.141 seconds | x36 | +| 100,000 | 50 | 48MB | 2.7 seconds | 0.105 seconds | x25 | -Instructions for all OSs are [here](http://harelba.github.io/q/#installation). +Notice that for the current version, caching is **not enabled** by default, since the caches take disk space. Use `-C readwrite` or `-C read` to enable it for a query, or add `caching_mode` to `.qrc` to set a new default. + +q's web site is [http://harelba.github.io/q/](http://harelba.github.io/q/) or [https://q.textasdata.wiki](https://q.textasdata.wiki) It contains everything you need to download and use q immediately. -### Installation of the new beta release -For now, only Linux RPM, DEB, Mac OSX and Windows are supported. Packages for additional Linux Distros will be added later (it should be rather easy now, due to the use of `fpm`). -The beta OSX version is not in `brew` yet, you'll need to take the `macos-q` executable, put it in your filesystem and `chmod +x` it. +## Usage Examples +q treats ordinary files as database tables, and supports all SQL constructs, such as `WHERE`, `GROUP BY`, `JOIN`s, etc. It supports automatic column name and type detection, and provides full support for multiple character encodings. -Note: For some reason showing the q manual (`man q`) does not work for Debian, even though it's packaged in the DEB file. I'll get around to fixing it later. If you have any thoughts about this, please drop me a line. 
+Here are some example commands to get the idea: -Download the relevant files directly from [The Beta Release Assets](https://github.com/harelba/q/releases/tag/v3.1.1-beta). +```bash +$ q "SELECT COUNT(*) FROM ./clicks_file.csv WHERE c3 > 32.3" -## Examples +$ ps -ef | q -H "SELECT UID, COUNT(*) cnt FROM - GROUP BY UID ORDER BY cnt DESC LIMIT 3" +$ q "select count(*) from some_db.sqlite3:::albums a left join another_db.sqlite3:::tracks t on (a.album_id = t.album_id)" ``` -q "SELECT COUNT(*) FROM ./clicks_file.csv WHERE c3 > 32.3" -ps -ef | q -H "SELECT UID, COUNT(*) cnt FROM - GROUP BY UID ORDER BY cnt DESC LIMIT 3" -``` +Detailed examples are in [here](http://harelba.github.io/q/#examples) -Go [here](http://harelba.github.io/q/#examples) for more examples. +## Installation. +**New Major Version `3.1.3` is out with a lot of significant additions.** -## Benchmark -I have created a preliminary benchmark comparing q's speed between python2, python3, and comparing both to textql and octosql. +Instructions for all OSs are [here](http://harelba.github.io/q/#installation). -Your input about the validity of the benchmark and about the results would be greatly appreciated. More details are [here](test/BENCHMARK.md). +The previous version `2.0.19` Can still be downloaded from [here](https://github.com/harelba/q/releases/tag/2.0.19) ## Contact Any feedback/suggestions/complaints regarding this tool would be much appreciated. Contributions are most welcome as well, of course. 
@@ -71,5 +55,5 @@ Twitter [@harelba](https://twitter.com/harelba) Email [harelba@gmail.com](mailto:harelba@gmail.com) -q on twitter: #qtextasdata +q on twitter: [#qtextasdata](https://twitter.com/hashtag/qtextasdata?src=hashtag_click) diff --git a/benchmark-config.sh b/benchmark-config.sh index 8606b92..b293d9a 100644 --- a/benchmark-config.sh +++ b/benchmark-config.sh @@ -1,3 +1,3 @@ #!/bin/bash -BENCHMARK_PYTHON_VERSIONS=(3.6.4 3.8.5) +BENCHMARK_PYTHON_VERSIONS=(3.8.5) diff --git a/bin/q.py b/bin/q.py index 716e783..f36cb8f 100755 --- a/bin/q.py +++ b/bin/q.py @@ -35,7 +35,7 @@ from collections import OrderedDict from sqlite3.dbapi2 import OperationalError from uuid import uuid4 -q_version = '3.1.1-beta' +q_version = '3.1.3' #__all__ = [ 'QTextAsData' ] @@ -337,7 +337,7 @@ class Sqlite3DB(object): QCATALOG_TABLE_NAME = '_qcatalog' NUMERIC_COLUMN_TYPES = {int, long, float} - PYTHON_TO_SQLITE_TYPE_NAMES = { str: 'TEXT', int: 'INT', long : 'INT' , float: 'FLOAT', None: 'TEXT' } + PYTHON_TO_SQLITE_TYPE_NAMES = { str: 'TEXT', int: 'INT', long : 'INT' , float: 'REAL', None: 'TEXT' } def __str__(self): @@ -1025,6 +1025,9 @@ class TableColumnInferer(object): # return it return type_list_without_nulls[0] else: + # If there are only two types, one float an one int, then choose a float type + if len(set(type_list_without_nulls)) == 2 and float in type_list_without_nulls and int in type_list_without_nulls: + return float return str def do_analysis(self): @@ -1035,8 +1038,8 @@ class TableColumnInferer(object): else: raise Exception('Unknown parsing mode %s' % self.mode) - if self.column_count == 1 and self.expected_column_count != 1: - print("Warning: column count is one - did you provide the correct delimiter?", file=sys.stderr) + if self.column_count == 1 and self.expected_column_count != 1 and self.expected_column_count is not None: + print(f"Warning: column count is one (expected column count is {self.expected_column_count} - did you provide the correct delimiter?", 
file=sys.stderr) self.infer_column_types() self.infer_column_names() diff --git a/mkdocs/docs/about.md b/mkdocs/docs/about.md index 49ec15b..9da1294 100644 --- a/mkdocs/docs/about.md +++ b/mkdocs/docs/about.md @@ -6,5 +6,5 @@ ### Email [harelba@gmail.com](mailto:harelba@gmail.com) -### Chinese translation [jinzhencheng@outlook.com](jinzhencheng@outlook.com) +### Chinese translation [jinzhencheng@outlook.com](mailto:jinzhencheng@outlook.com) diff --git a/mkdocs/docs/index.md b/mkdocs/docs/index.md index 98e31c6..29ab2e7 100644 --- a/mkdocs/docs/index.md +++ b/mkdocs/docs/index.md @@ -5,19 +5,56 @@ ## Overview -q is a command line tool that allows direct execution of SQL-like queries on CSVs/TSVs (and any other tabular text files). +q's purpose is to bring SQL expressive power to the Linux command line by providing easy access to text as actual data. -q treats ordinary files as database tables, and supports all SQL constructs, such as WHERE, GROUP BY, JOINs etc. It supports automatic column name and column type detection, and provides full support for multiple encodings. 
+q allows the following: -``` bash -q "SELECT COUNT(*) FROM ./clicks_file.csv WHERE c3 > 32.3" -``` +* Performing SQL-like statements directly on tabular text data, auto-caching the data in order to accelerate additional querying on the same file +* Performing SQL statements directly on multi-file sqlite3 databases, without having to merge them or load them into memory -``` bash -ps -ef | q -H "SELECT UID,COUNT(*) cnt FROM - GROUP BY UID ORDER BY cnt DESC LIMIT 3" +The following table shows the impact of using caching: + +| Rows | Columns | File Size | Query time without caching | Query time with caching | Speed Improvement | +|:---------:|:-------:|:---------:|:--------------------------:|:-----------------------:|:-----------------:| +| 5,000,000 | 100 | 4.8GB | 4 minutes, 47 seconds | 1.92 seconds | x149 | +| 1,000,000 | 100 | 983MB | 50.9 seconds | 0.461 seconds | x110 | +| 1,000,000 | 50 | 477MB | 27.1 seconds | 0.272 seconds | x99 | +| 100,000 | 100 | 99MB | 5.2 seconds | 0.141 seconds | x36 | +| 100,000 | 50 | 48MB | 2.7 seconds | 0.105 seconds | x25 | + +Notice that for the current version, caching is **not enabled** by default, since the caches take disk space. Use `-C readwrite` or `-C read` to enable it for a query, or add `caching_mode` to `.qrc` to set a new default. + +q treats ordinary files as database tables, and supports all SQL constructs, such as `WHERE`, `GROUP BY`, `JOIN`s, etc. It supports automatic column name and type detection, and provides full support for multiple character encodings. + +The new features - autocaching, direct querying of sqlite database and the use of `~/.qrc` file are described in detail in [here](https://github.com/harelba/q/blob/master/QSQL-NOTES.md). 
+ +## Basic Usage + +```bash + q + +Example Execution for a delimited file: + + q "select * from myfile.csv" + +Example Execution for an sqlite3 database: + + q "select * from mydatabase.sqlite:::my_table_name" + + or + + q "select * from mydatabase.sqlite" + + if the database file contains only one table + +Auto-caching of delimited files can be activated through `-C readwrite` +(writes new caches if needed) or `-C read` (only reads existing cache files) + +Setting the default caching mode (`-C`) can be done by +writing a `~/.qrc` file. See docs for more info. ``` -Look at some examples [here](#examples), or just download the tool using the links in the [installation](#installation) below and play with it. +Download the tool using the links in the [installation](#installation) below and play with it. | | | |:--------------------------------------:|:-----------------------------------------------:| @@ -32,99 +69,262 @@ Look at some examples [here](#examples), or just download the tool using the lin | Format | Instructions | Comments | :---|:---|:---| -|[OSX](https://github.com/harelba/q/releases/download/2.0.19/q-x86_64-Darwin)|run `brew install q`|man page is not available for this release yet. Use `q --help` for now|| -|[RPM Package](https://github.com/harelba/q/releases/download/2.0.19/q-text-as-data-2.0.19-1.x86_64.rpm)| run `rpm -ivh ` or `rpm -U ` if you already have an older version of q.| A man page is available for this release. Just enter `man q`.| -|[DEB Package](https://github.com/harelba/q/releases/download/2.0.19/q-text-as-data_2.0.19-2_amd64.deb)| Run `sudo dpkg -i `|A man page is available for this release. Just enter `man q`.| -|[Windows Installer](https://github.com/harelba/q/releases/download/2.0.19/q-AMD64-Windows-installer.exe)|Run the installer executable and hit next next next... 
q.exe will be added to the PATH so you can access it everywhere.|Windows doesn't update the PATH retroactively for open windows, so you'll need to open a new cmd window after the installation is done.| -|[tar.gz](https://github.com/harelba/q/archive/2.0.19.tar.gz)|Full source file tree for latest stable version. Note that q.py cannot be used directly anymore, as it requires python dependencies|| -|[zip](https://github.com/harelba/q/archive/2.0.19.zip)|Full source file tree for the latest stable version. Note that q.py cannot be used directly anymore, as it requires python dependencies|| +|[OSX](https://github.com/harelba/q/releases/download/v3.1.3/macos-q)|`brew install` will install the previous `2.0.19` for now, until homebrew approves the new version. In the mean time, you can download the new version executable from the link, `chmod +x` it and then run. You might need to run it the first time from Finder using Right-Click -> Open, and then click the Open button. After the first time, it will run from the command line without any issues. |A man page is available, just run `man q`|| +|[RPM Package](https://github.com/harelba/q/releases/download/v3.1.3/q-text-as-data-3.1.3.x86_64.rpm)| run `rpm -ivh ` or `rpm -U ` if you already have an older version of q.| A man page is available for this release. Just enter `man q`.| +|[DEB Package](https://github.com/harelba/q/releases/download/v3.1.3/q-text-as-data-3.1.3-1.x86_64.deb)| Run `sudo dpkg -i `|A man page is available for this release. Just enter `man q`. Some installations don't install the man page properly for some reason. I'll fix this soon| +|[Windows Installer](https://github.com/harelba/q/releases/download/v3.1.3/q-text-as-data-3.1.3.msi)|Run the installer executable and hit next next next... 
q.exe will be added to the PATH so you can access it everywhere.|Windows doesn't update the PATH retroactively for open windows, so you'll need to open a new `cmd`/`bash` window after the installation is done.| +|[Source tar.gz](https://github.com/harelba/q/archive/refs/tags/v3.1.3.tar.gz)|Full source file tree for latest stable version. Note that q.py cannot be used directly anymore, as it requires python dependencies|| +|[Source zip](https://github.com/harelba/q/archive/refs/tags/v3.1.3.zip)|Full source file tree for the latest stable version. Note that q.py cannot be used directly anymore, as it requires python dependencies|| -**Older versions can be downloaded [here](https://github.com/harelba/packages-for-q). Please let me know if you plan on using an older version, and why - I know of no reason to use any of them.** +I will add packages for additional Linux Distributions if there's demand for it. If you're interested in another Linux distribution, please ping me. It's relatively easy to add new ones with the new packaging flow. + +The previous version `2.0.19` can be downloaded directly from [here](https://github.com/harelba/q/releases/tag/2.0.19). Please let me know if for some reason the new version is not suitable for your needs, and you're planning on using the previous one. ## Requirements -As of version `2.0.9`, there's no need for any external dependency. Python itself (3.7), and any needed libraries are self-contained inside the installation, isolated from the rest of your system. +q is packaged as a compiled standalone-executable that has no dependencies, not even python itself. This was done by using the awesome [pyoxidizer](https://github.com/indygreg/PyOxidizer) project. -## Usage -``` bash -q "" +## Examples - Simplest execution is `q "SELECT * FROM myfile"` which prints the entire file. +This section shows example flows that highlight the main features. For more basic examples, see [here](#getting-started-examples). 
+ +### Basic Examples: + +```bash +# Prepare some data +$ seq 1 1000000 > myfile.csv + +# Query it +$ q "select sum(c1),count(*) from myfile.csv where c1 % 3 = 0" +166666833333 333333 + +# Use q to query from stdin +$ ps -ef | q -b -H "SELECT UID, COUNT(*) cnt FROM - GROUP BY UID ORDER BY cnt DESC LIMIT 3" +501 288 +0 115 +270 17 ``` -q allows performing SQL-like statements on tabular text data. Its purpose is to bring SQL expressive power to the Linux command line and to provide easy access to text as actual data. +### Auto-caching Examples -Query should be an SQL-like query which contains *filenames instead of table names* (or - for stdin). The query itself should be provided as one parameter to the tool (i.e. enclosed in quotes). Multiple files can be used as one table by either writing them as `filename1+filename2+...` or by using shell wildcards (e.g. `my_files*.csv`). +```bash +# (time command output has been shortened for brevity) -Use `-H` to signify that the input contains a header line. Column names will be detected automatically in that case, and can be used in the query. If this option is not provided, columns will be named cX, starting with 1 (e.g. `q "SELECT c3,c8 from ..."`). +# Prepare some data +$ seq 1 1000000 > myfile.csv -Use `-d` to specify the input delimiter. +# Read from the resulting file +$ time q "select sum(c1),count(*) from myfile.csv" +500000500000 1000000 +total_time=4.108 seconds -Column types are auto detected by the tool, no casting is needed. Note that there's a flag `--as-text` which forces all columns to be treated as text columns. +# Running with `-C readwrite` auto-creates a cache file if there is none. The cache filename would be myfile.csv.qsql. The query runs as usual +$ time q "select sum(c1),count(*) from myfile.csv" -C readwrite +500000500000 1000000 +total_time=4.057 seconds -Please note that column names that include spaces need to be used in the query with back-ticks, as per the sqlite standard. 
+# Now run with `-C read`. The query will run from the cache file and not the original. As the file gets bigger, the difference will be much more noticeable +$ time q "select sum(c1),count(*) from myfile.csv" -C read +500000500000 1000000 +total_time=0.229 seconds -Query/Input/Output encodings are fully supported (and q tries to provide out-of-the-box usability in that area). Please use `-e`,`-E` and `-Q` to control encoding if needed. +# Now let's try another query on that file. Notice the short query duration. The cache is being used for any query that uses this file, and queries on multiple files that contain caches will reuse the cache as well. +$ time q "select avg(c1) from myfile.csv" -C read +500000.5 +total_time=0.217 seconds -All sqlite3 SQL constructs are supported, including joins across files (use an alias for each table). Take a look at the [limitations](#limitations) section below for some rarely-used use cases which are not fully supported. +# You can also query the qsql file directly, as it's just a standard sqlite3 DB file (see next section for q's support of reading directly from sqlite DBs) +$ time q "select sum(c1),count(*) from myfile.csv.qsql" +500000500000 1000000 +total_time=0.226 seconds -### Query -Each parameter that q gets is a full SQL query. All queries are executed one after another, outputing the results to standard output. Note that data loading is done only once, so when passing multiple queries on the same command-line, only the first one will take a long time. The rest will starting running almost instantanously, since all the data will already have been loaded. Remeber to double-quote each of the queries - Each parameter is a full SQL query. +# Now let's delete the original csv file (be careful when deleting original data) +$ rm -vf myfile.csv -Any standard SQL expression, condition (both WHERE and HAVING), GROUP BY, ORDER BY etc. are allowed. 
+# Running another query directly on the qsql file just works +$ time q "select sum(c1),count(*) from myfile.csv.qsql" +500000500000 1000000 +total_time=0.226 seconds -JOINs are supported and Subqueries are supported in the WHERE clause, but unfortunately not in the FROM clause for now. Use table aliases when performing JOINs. +# See the `.qrc` section below if you want to set the default `-C` (`--caching-mode`) to something other than `none` (the default) +``` -The SQL syntax itself is sqlite's syntax. For details look at http://www.sqlite.org/lang.html or search the net for examples. +### Direct sqlite Querying Examples + +```bash +# Download example sqlite3 database from https://www.sqlitetutorial.net/sqlite-sample-database/ and unzip it. The resulting file will be chinook.db +$ curl -L https://www.sqlitetutorial.net/wp-content/uploads/2018/03/chinook.zip | tar -xvf - + +# Now we can query the database directly, specifying the name of the table in the query (:::) +$ q "select count(*) from chinook.db:::albums" +347 + +# Let's take the top 5 longest tracks of album id 34. The -b option just beautifies the output, and -O tells q to output the column names as headers +$ q "select * from chinook.db:::tracks where albumid = '34' order by milliseconds desc limit 5" -b -O +TrackId Name AlbumId MediaTypeId GenreId Composer Milliseconds Bytes UnitPrice +407 "Só Tinha De Ser Com Você" 34 1 7 Vários 389642 13085596 0.99 +398 "Only A Dream In Rio" 34 1 7 Vários 371356 12192989 0.99 +393 "Tarde Em Itapoã" 34 1 7 Vários 313704 10344491 0.99 +401 "Momentos Que Marcam" 34 1 7 Vários 280137 9313740 0.99 +391 "Garota De Ipanema" 34 1 7 Vários 279536 9141343 0.99 + +# Let's now copy the chinook database to another file, as if it's just another different database +$ cp chinook.db another_db.db + +# Now we can run a join query between the two databases. 
They could have been any two different databases, using the copy of chinook is just for simplicity +# Let's get the top-5 longest albums, using albums from the first database and tracks from the second database. The track times are converted to seconds, and rounded to two digits after the decimal point. +$ q -b -O "select a.title,round(sum(t.milliseconds)/1000.0/60,2) total_album_time_seconds from chinook.db:::albums a left join another_db.db:::tracks t on (a.albumid = t.albumid) group by a.albumid order by total_album_time_seconds desc limit 5" +Title total_album_time_seconds +"Lost, Season 3" 1177.76 +"Battlestar Galactica (Classic), Season 1" 1170.23 +"Lost, Season 1" 1080.92 +"Lost, Season 2" 1054.83 +"Heroes, Season 1" 996.34 +``` -NOTE: Full type detection is implemented, so there is no need for any casting or anything. +### Analysis Examples + +```bash +# Let's create a simple CSV file without a header. Make sure to copy only the three lines, press enter, and +# then press Ctrl-D to exit so the file will be written. +$ cat > some-data-without-header.csv +harel,1,2 +ben,3,4 +attia,5,6 + + +# Let's run q on it with -A, to see the detected structure of the file. `-d ,` sets the delimiter to a comma +$ q -d , "select * from some-data-without-header.csv" -A +Table: /Users/harelben-attia/dev/harelba/q/some-data-without-header.csv + Sources: + source_type: file source: /Users/harelben-attia/dev/harelba/q/some-data-without-header.csv + Fields: + `c1` - text + `c2` - int + `c3` - int + +# Now let's create another simple CSV file, this time with a header (-H tells q to expect a header in the file) +$ cat > some-data.csv +planet_id,name,diameter_km,length_of_day_hours +1000,Earth,12756,24 +2000,Mars,6792,24.7 +3000,Jupiter,142984,9.9 + + +# Let's run q with -A to see the analysis results. 
+$ q -b -O -H -d , "select * from some-data.csv" -A +Table: /Users/harelben-attia/dev/harelba/q/some-data.csv + Sources: + source_type: file source: /Users/harelben-attia/dev/harelba/q/some-data.csv + Fields: + `planet_id` - int + `name` - text + `diameter_km` - int + `length_of_day_hours` - real + +# Let's run it with `-C readwrite` so a cache will be created +$ q -b -O -H -d , "select * from some-data.csv" -C readwrite +planet_id,name ,diameter_km,length_of_day_hours +1000 ,Earth ,12756 ,24.0 +2000 ,Mars ,6792 ,24.7 +3000 ,Jupiter,142984 ,9.9 + +# Running another query that uses some-data.csv with -A will now show that a qsql exists for that file. The source-type +# will be "file-with-unused-qsql". The qsql cache is not being used, since by default, q does not activate caching +# so backward compatibility is maintained +$ q -b -O -H -d , "select * from some-data.csv" -A +Table: /Users/harelben-attia/dev/harelba/q/some-data.csv + Sources: + source_type: file-with-unused-qsql source: /Users/harelben-attia/dev/harelba/q/some-data.csv + Fields: + `planet_id` - int + `name` - text + `diameter_km` - int + `length_of_day_hours` - real + +# Now let's run another query, this time with `-C read`, telling q to use the qsql caches. This time source-type will +# be "qsql-file-with-original", and the cache will be used when querying: +$ q -b -O -H -d , "select * from some-data.csv" -A -C read +Table: /Users/harelben-attia/dev/harelba/q/some-data.csv + Sources: + source_type: qsql-file-with-original source: /Users/harelben-attia/dev/harelba/q/some-data.csv.qsql + Fields: + `planet_id` - int + `name` - text + `diameter_km` - int + `length_of_day_hours` - real + +# Let's now read directly from the qsql file. Notice the change in the table name inside the query. `-C read` is not needed +# here. 
The source-type will be "qsql-file" +$ q -b -O -H -d , "select * from some-data.csv.qsql" -A +Table: /Users/harelben-attia/dev/harelba/q/some-data.csv.qsql + Sources: + source_type: qsql-file source: /Users/harelben-attia/dev/harelba/q/some-data.csv.qsql + Fields: + `planet_id` - int + `name` - text + `diameter_km` - int + `length_of_day_hours` - real +``` -NOTE2: When using the `-O` output header option, use column name aliases if you want to control the output column names. For example, `q -O -H "select count(*) cnt,sum(*) as mysum from -"` would output `cnt` and `mysum` as the output header column names. +### Usage +Query should be an SQL-like query which contains filenames instead of table names (or - for stdin). The query itself should be provided as one parameter to the tool (i.e. enclosed in quotes). -### Flags +All sqlite3 SQL constructs are supported, including joins across files (use an alias for each table). Take a look at the [limitations](#limitations) section below for some rarely-used use cases which are not fully supported. -``` bash -Usage: - q allows performing SQL-like statements on tabular text data. +q gets a full SQL query as a parameter. Remember to double-quote the query. - Its purpose is to bring SQL expressive power to manipulating text data using the Linux command line. +Historically, q supports multiple queries on the same command-line, loading each data file only once, even if it is used by multiple queries on the same q invocation. This is still supported. However, due to the new automatic-caching capabilities, this is not really required. Activate caching, and a cache file will be automatically created for each file. q will use the cache behind the scenes in order to speed up queries. The speed up is extremely significant, so consider using caching for large files. 
- Basic usage is q "" where table names are just regular file names (Use - to read from standard input) - When the input contains a header row, use -H, and column names will be set according to the header row content. If there isn't a header row, then columns will automatically be named c1..cN. +The following filename types are supported: - Column types are detected automatically. Use -A in order to see the column name/type analysis. +* **Delimited-file filenames** - including relative/absolute paths. E.g. `./my_folder/my_file.csv` or `/var/tmp/my_file.csv` +* **sqlite3 database filenames** + * **With Multiple Tables** - Add an additional `:::` for accessing a specific table. For example `mydatabase.sqlite3:::users_table`. + * **With One Table Only** - Just specify the database filename, no need for a table name postfix. For example `my_single_table_database.sqlite`. +* **`.qsql` cache files** - q can auto-generate cache files for delimited files, and they can be queried directly as a table, since they contain only one table, as they are essentially standard sqlite databases - Delimiter can be set using the -d (or -t) option. Output delimiter can be set using -D +Use `-H` to signify that the input contains a header line. Column names will be detected automatically in that case, and can be used in the query. If this option is not provided, columns will be named cX, starting with 1 (e.g. `q "SELECT c3,c8 from ..."`). - All sqlite3 SQL constructs are supported. +Use `-d` to specify the input delimiter. - Examples: +Column types are auto detected by the tool, no casting is needed. Note that there's a flag `--as-text` which forces all columns to be treated as text columns. - Example 1: ls -ltrd * | q "select c1,count(1) from - group by c1" - This example would print a count of each unique permission string in the current folder. +Please note that column names that include spaces need to be used in the query with back-ticks, as per the sqlite standard. 
Make sure to use single-quotes around the query, so bash/zsh won't interpret the backticks. - Example 2: seq 1 1000 | q "select avg(c1),sum(c1) from -" - This example would provide the average and the sum of the numbers in the range 1 to 1000 +Query/Input/Output encodings are fully supported (and q tries to provide out-of-the-box usability in that area). Please use `-e`,`-E` and `-Q` to control encoding if needed. - Example 3: sudo find /tmp -ls | q "select c5,c6,sum(c7)/1024.0/1024 as total from - group by c5,c6 order by total desc" - This example will output the total size in MB per user+group in the /tmp subtree +JOINs are supported and Subqueries are supported in the WHERE clause, but unfortunately not in the FROM clause for now. Use table aliases when performing JOINs. +The SQL syntax itself is sqlite's syntax. For details look at http://www.sqlite.org/lang.html or search the net for examples. - See the help or https://github.com/harelba/q/ for more details. - +NOTE: When using the `-O` output header option, use column name aliases if you want to control the output column names. For example, `q -O -H "select count(*) cnt,sum(*) as mysum from -"` would output `cnt` and `mysum` as the output header column names. +``` bash Options: -h, --help show this help message and exit -v, --version Print version -V, --verbose Print debug info in case of problems -S SAVE_DB_TO_DISK_FILENAME, --save-db-to-disk=SAVE_DB_TO_DISK_FILENAME Save database to an sqlite database file - --save-db-to-disk-method=SAVE_DB_TO_DISK_METHOD - Method to use to save db to disk. 'standard' does not - require any deps, 'fast' currenty requires manually - running `pip install sqlitebck` on your python - installation. Once packing issues are solved, the fast - method will be the default. + -C CACHING_MODE, --caching-mode=CACHING_MODE + Choose the autocaching mode (none/read/readwrite). + Autocaches files to disk db so further queries will be + faster. 
Caching is done to a side-file with the same + name of the table, but with an added extension .qsql + --dump-defaults Dump all default values for parameters and exit. Can + be used in order to make sure .qrc file content is + being read properly. + --max-attached-sqlite-databases=MAX_ATTACHED_SQLITE_DATABASES + Set the maximum number of concurrently-attached sqlite + dbs. This is a compile time definition of sqlite. q's + performance will slow down once this limit is reached + for a query, since it will perform table copies in + order to avoid that limit. + --overwrite-qsql=OVERWRITE_QSQL + When used, qsql files (both caches and store-to-db) + will be overwritten if they already exist. Use with + care. Input Data Options: -H, --skip-header Skip header row. This has been changed from earlier @@ -227,14 +427,32 @@ Options: feedback on this ``` -## Examples -The `-H` flag in the examples below signifies that the file has a header row which is used for naming columns. +### Setting the default values for parameters +It's possible to set default values for parameters which are used often by configuring them in the file `~/.qrc`. + +The file format is as follows: +```bash +[options] += +``` -The `-t` flag is just a shortcut for saying that the file is a tab-separated file (any delimiter is supported - Use the `-d` flag). +It's possible to generate a default `.qrc` file by running `q --dump-defaults` and write the output into the `.qrc` file. -Queries are given using upper case for clarity, but actual query keywords such as SELECT and WHERE are not really case sensitive. +One valuable use-case for this could be setting the caching-mode to `read`. This will make q automatically use generated `.qsql` cache files if they exist. Whenever you want a cache file to be generated, just use `-C readwrite` and a `.qsql` file will be generated if it doesn't exist. -Example List: + +## Getting Started Examples +This section shows some more basic examples of simple SQL constructs. 
+ +For some more complex use-cases, see the [examples](#examples) at the beginning of the documentation. + +NOTES: + +* The `-H` flag in the examples below signifies that the file has a header row which is used for naming columns. +* The `-t` flag is just a shortcut for saying that the file is a tab-separated file (any delimiter is supported - Use the `-d` flag). +* Queries are given using upper case for clarity, but actual query keywords such as SELECT and WHERE are not really case sensitive. + +Basic Example List: * [Example 1 - COUNT DISTINCT values of specific field (uuid of clicks data)](#example-1) * [Example 2 - Filter numeric data, controlling ORDERing and LIMITing output](#example-2) @@ -345,24 +563,32 @@ You can see that the ppp filename appears twice, each time matched to one of the Column name detection is supported for JOIN scenarios as well. Just specify `-H` in the command line and make sure that the source files contain the header rows. ## Implementation -The current implementation is written in Python using an in-memory database, in order to prevent the need for external dependencies. The implementation itself supports SELECT statements, including JOINs (Subqueries are supported only in the WHERE clause for now). If you want to do further analysis on the data, you can use the `--save-db-to-disk` option to write the resulting tables to an sqlite database file, and then use `seqlite3` in order to perform queries on the data separately from q itself. +Behind the scenes q creates a "virtual" sqlite3 database that does not contain data of its own, but attaches to multiple other databases as follows: + +* When reading delimited files or data from `stdin`, it will analyze the data and construct an in-memory "adhoc database" that contains it. 
This adhoc database will be attached to the virtual database +* When a delimited file has a `.qsql` cache, it will attach to that file directly, without having to read it into memory +* When querying a standard sqlite3 file, it will be attached to the virtual database as well, without reading it into memory. sqlite3 files are auto-detected, no need for any special filename extension + +The user query will be executed directly on the virtual database, using the attached databases. -Please note that there is currently no checks and bounds on data size - It's up to the user to make sure things don't get too big. +sqlite3 itself has a limit on the number of attached databases (usually 10). q will automatically attach databases until that limit is reached, and will then load any additional tables into the adhoc in-memory database. Please make sure to read the [limitations](#limitations) section as well. ## Development ### Tests -The code includes a test suite runnable through `test/test-all`. If you're planning on sending a pull request, I'd appreciate if you could make sure that it doesn't fail. +The code includes a test suite runnable through `run-tests.sh`. By default, it uses the python source code for running the tests. However, it is possible to provide a path to an actual executable to the tests using the `Q_EXECUTABLE` env var. This is actually being used during the build and packaging process, in order to test the resulting binary. ## Limitations Here's the list of known limitations. Please contact me if you have a use case that needs any of those missing capabilities. +* Common Table Expressions (CTE) are not supported for now. Will be implemented soon - See [here](https://github.com/harelba/q/issues/67) and [here](https://github.com/harelba/q/issues/124) for details. * `FROM ` is not supported -* Common Table Expressions (CTE) are not supported * Spaces in file names are not supported. 
Use stdin for piping the data into q, or rename the file * Some rare cases of subqueries are not supported yet. +* Queries with more than 10 different sqlite3 databases will load some data into memory +* up to 500 tables are supported in a single query ## Rationale Have you ever stared at a text file on the screen, hoping it would have been a database so you could ask anything you want about it? I had that feeling many times, and I've finally understood that it's not the database that I want. It's the language - SQL. @@ -381,7 +607,6 @@ This tool has been designed with general Linux/Unix design principles in mind. I ## Future -* Expose python as a python module - Mostly implemented. Requires some internal API changes with regard to handling stdin before exposing it. -* Allow to use a distributed backend for scaling the computations +* Expose python as a python module - Planned as a goal after the new version `3.x` is out diff --git a/pyoxidizer.bzl b/pyoxidizer.bzl index 6ccf0db..22ef5bf 100644 --- a/pyoxidizer.bzl +++ b/pyoxidizer.bzl @@ -61,7 +61,7 @@ def make_msi(exe): # The name of your application. "q-text-as-data", # The version of your application. - "3.1.1", + "3.1.3", # The author/manufacturer of your application. 
"Harel Ben-Attia" ) diff --git a/setup.py b/setup.py index d0c5d48..10b1e9e 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup import setuptools -q_version = '3.1.1-beta' +q_version = '3.1.3' with open("README.markdown", "r", encoding="utf-8") as fh: long_description = fh.read() diff --git a/test/test_suite.py b/test/test_suite.py index f722e8f..8ec21ab 100755 --- a/test/test_suite.py +++ b/test/test_suite.py @@ -1243,11 +1243,10 @@ class BasicTests(AbstractQTestCase): 'seq 1 10 | ' + Q_EXECUTABLE + ' "select sum(c1),avg(c1) from -"') self.assertTrue(retcode == 0) self.assertTrue(len(o) == 1) - self.assertTrue(len(e) == 1) + self.assertTrue(len(e) == 0) s = sum(range(1, 11)) self.assertTrue(o[0] == six.b('%s %s' % (s, s / 10.0))) - self.assertTrue(one_column_warning(e)) def test_select_one_column(self): tmpfile = self.create_file_with_data(sample_data_no_header) @@ -2180,11 +2179,10 @@ class GzippingTests(AbstractQTestCase): retcode, o, e = run_command(cmd) self.assertTrue(retcode == 0) self.assertTrue(len(o) == 1) - self.assertTrue(len(e) == 1) + self.assertTrue(len(e) == 0) s = sum(range(1, 11)) self.assertTrue(o[0] == six.b('%s %s' % (s, s / 10.0))) - self.assertTrue(one_column_warning(e)) self.cleanup(tmpfile) @@ -2199,12 +2197,10 @@ class DelimiterTests(AbstractQTestCase): self.assertNotEqual(retcode, 0) self.assertEqual(len(o), 0) - self.assertEqual(len(e), 3) + self.assertEqual(len(e), 2) - self.assertTrue(e[0].startswith( - six.b("Warning: column count is one - did you provide the correct delimiter"))) - self.assertTrue(e[1].startswith(six.b("Bad header row"))) - self.assertTrue(six.b("Column name cannot contain commas") in e[2]) + self.assertTrue(e[0].startswith(six.b("Bad header row"))) + self.assertTrue(six.b("Column name cannot contain commas") in e[1]) self.cleanup(tmpfile) @@ -2405,6 +2401,46 @@ class AnalysisTests(AbstractQTestCase): self.cleanup(tmpfile) + def test_column_analysis_with_mixed_ints_and_floats(self): + 
tmpfile = self.create_file_with_data(six.b("""planet_id,name,diameter_km,length_of_day_hours\n1000,Earth,12756,24\n2000,Mars,6792,24.7\n3000,Jupiter,142984,9.9""")) + + cmd = Q_EXECUTABLE + ' -d , -H "select * from %s" -A' % tmpfile.name + retcode, o, e = run_command(cmd) + + self.assertEqual(retcode, 0) + self.assertEqual(len(o),8) + self.assertEqual(len(e),0) + self.assertEqual(o[0], six.b('Table: %s' % tmpfile.name)) + self.assertEqual(o[1],six.b(' Sources:')) + self.assertEqual(o[2],six.b(' source_type: file source: %s' % tmpfile.name)) + self.assertEqual(o[3],six.b(' Fields:')) + self.assertEqual(o[4], six.b(' `planet_id` - int')) + self.assertEqual(o[5], six.b(' `name` - text')) + self.assertEqual(o[6], six.b(' `diameter_km` - int')) + self.assertEqual(o[7], six.b(' `length_of_day_hours` - real')) + + self.cleanup(tmpfile) + + def test_column_analysis_with_mixed_ints_and_floats_and_nulls(self): + tmpfile = self.create_file_with_data(six.b("""planet_id,name,diameter_km,length_of_day_hours\n1000,Earth,12756,24\n2000,Mars,6792,24.7\n2500,Venus,,\n3000,Jupiter,142984,9.9""")) + + cmd = Q_EXECUTABLE + ' -d , -H "select * from %s" -A' % tmpfile.name + retcode, o, e = run_command(cmd) + + self.assertEqual(retcode, 0) + self.assertEqual(len(o),8) + self.assertEqual(len(e),0) + self.assertEqual(o[0], six.b('Table: %s' % tmpfile.name)) + self.assertEqual(o[1],six.b(' Sources:')) + self.assertEqual(o[2],six.b(' source_type: file source: %s' % tmpfile.name)) + self.assertEqual(o[3],six.b(' Fields:')) + self.assertEqual(o[4], six.b(' `planet_id` - int')) + self.assertEqual(o[5], six.b(' `name` - text')) + self.assertEqual(o[6], six.b(' `diameter_km` - int')) + self.assertEqual(o[7], six.b(' `length_of_day_hours` - real')) + + self.cleanup(tmpfile) + def test_column_analysis_no_header(self): tmpfile = self.create_file_with_data(sample_data_no_header) @@ -2806,11 +2842,11 @@ class QuotingTests(AbstractQTestCase): self.assertEqual(o[2],six.b(' source_type: file source: %s' % 
tmp_data_file.name)) self.assertEqual(o[3],six.b(' Fields:')) self.assertEqual(o[4],six.b(' `c1` - int')) - self.assertEqual(o[5],six.b(' `c2` - float')) + self.assertEqual(o[5],six.b(' `c2` - real')) self.assertEqual(o[6],six.b(' `c3` - text')) self.assertEqual(o[7],six.b(' `c4` - text')) self.assertEqual(o[8],six.b(' `c5` - text')) - self.assertEqual(o[9],six.b(' `c6` - float')) + self.assertEqual(o[9],six.b(' `c6` - real')) self.cleanup(tmp_data_file) @@ -4618,10 +4654,9 @@ class ParsingModeTests(AbstractQTestCase): retcode, o, e = run_command(cmd) self.assertEqual(retcode, 0) - self.assertEqual(len(e), 1) + self.assertEqual(len(e), 0) self.assertEqual(len(o),2) - self.assertEqual(e[0],six.b("Warning: column count is one - did you provide the correct delimiter?")) self.assertEqual(o[0],six.b('data without commas 1')) self.assertEqual(o[1],six.b('data without commas 2')) @@ -4946,7 +4981,7 @@ class SqlTests(AbstractQTestCase): self.assertEqual(o[4], six.b(' `regular_text` - text')) self.assertEqual(o[5], six.b(' `text_with_digits1` - int')) self.assertEqual(o[6], six.b(' `text_with_digits2` - int')) - self.assertEqual(o[7], six.b(' `float_number` - float')) + self.assertEqual(o[7], six.b(' `float_number` - real')) # Check column types detected when actual detection is disabled cmd = Q_EXECUTABLE + ' -A -d , -H --as-text "select * from %s"' % (tmpfile.name) @@ -5439,7 +5474,7 @@ class BasicModuleTests(AbstractQTestCase): table_structure = metadata.table_structures['my_data'] self.assertEqual(table_structure.column_names,['column1','column2','column3']) - self.assertEqual(table_structure.sqlite_column_types,['text','float','text']) + self.assertEqual(table_structure.sqlite_column_types,['text','real','text']) self.assertEqual(table_structure.python_column_types,[str,float,str]) self.assertEqual(table_structure.qtable_name, 'my_data') self.assertEqual(table_structure.source_type, 'data-stream') @@ -5475,7 +5510,7 @@ class BasicMod