summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHarel Ben-Attia <harelba@gmail.com>2021-11-26 15:13:09 +0200
committerGitHub <noreply@github.com>2021-11-26 15:13:09 +0200
commitc14ac80e07cf8c433cf74e026b6ed5e030afb155 (patch)
tree070a46f4aa191353e874d0f9b0cbf5c3a3296d6b
parentf6c0299605c1f598c0fc08dcb029fcebd61b7052 (diff)
Release 3.1.3 (#286)
Some minor fixes from beta. Also removed the one-column warning. Still with manual version bump - Hopefully all works well in terms of the package versioning. I'll automate the version bumps later on.
-rw-r--r--.github/workflows/build-and-package.yaml44
-rw-r--r--.github/workflows/q.rb.brew-formula-template3
-rw-r--r--QSQL-NOTES.md73
-rw-r--r--README.markdown76
-rw-r--r--benchmark-config.sh2
-rwxr-xr-xbin/q.py11
-rw-r--r--mkdocs/docs/about.md2
-rw-r--r--mkdocs/docs/index.md367
-rw-r--r--pyoxidizer.bzl2
-rw-r--r--setup.py2
-rwxr-xr-xtest/test_suite.py67
11 files changed, 424 insertions, 225 deletions
diff --git a/.github/workflows/build-and-package.yaml b/.github/workflows/build-and-package.yaml
index b8cc4b0..08970ee 100644
--- a/.github/workflows/build-and-package.yaml
+++ b/.github/workflows/build-and-package.yaml
@@ -139,12 +139,12 @@ jobs:
gem install fpm
cp dist/fpm-config ~/.fpm
- fpm -s dir -t deb --deb-use-file-permissions -p packages/linux/q-text-as-data-3.1.1-beta-1.x86_64.deb --version 3.1.1-beta ./linux-q=/usr/bin/q USAGE.gz=/usr/share/man/man1/q.1.gz
+ fpm -s dir -t deb --deb-use-file-permissions -p packages/linux/q-text-as-data-3.1.3-1.x86_64.deb --version 3.1.3 ./linux-q=/usr/bin/q USAGE.gz=/usr/share/man/man1/q.1.gz
- name: Upload DEB Package
uses: actions/upload-artifact@v1.0.0
with:
- name: q-text-as-data-3.1.1-beta-1.x86_64.deb
- path: packages/linux/q-text-as-data-3.1.1-beta-1.x86_64.deb
+ name: q-text-as-data-3.1.3-1.x86_64.deb
+ path: packages/linux/q-text-as-data-3.1.3-1.x86_64.deb
test-deb-packaging:
runs-on: ubuntu-18.04
@@ -155,7 +155,7 @@ jobs:
- name: Download DEB
uses: actions/download-artifact@v2
with:
- name: q-text-as-data-3.1.1-beta-1.x86_64.deb
+ name: q-text-as-data-3.1.3-1.x86_64.deb
- name: Install Python for Testing
uses: actions/setup-python@v2
with:
@@ -167,7 +167,7 @@ jobs:
pip3 install -r test-requirements.txt
- name: Test DEB Package Installation
- run: ./dist/test-using-deb.sh ./q-text-as-data-3.1.1-beta-1.x86_64.deb
+ run: ./dist/test-using-deb.sh ./q-text-as-data-3.1.3-1.x86_64.deb
package-linux-rpm:
needs: [test-linux, create-man]
@@ -199,12 +199,12 @@ jobs:
gem install fpm
cp dist/fpm-config ~/.fpm
- fpm -s dir -t rpm --rpm-use-file-permissions -p packages/linux/q-text-as-data-3.1.1-beta.x86_64.rpm --version 3.1.1-beta ./linux-q=/usr/bin/q USAGE.gz=/usr/share/man/man1/q.1.gz
+ fpm -s dir -t rpm --rpm-use-file-permissions -p packages/linux/q-text-as-data-3.1.3.x86_64.rpm --version 3.1.3 ./linux-q=/usr/bin/q USAGE.gz=/usr/share/man/man1/q.1.gz
- name: Upload RPM Package
uses: actions/upload-artifact@v1.0.0
with:
- name: q-text-as-data-3.1.1-beta.x86_64.rpm
- path: packages/linux/q-text-as-data-3.1.1-beta.x86_64.rpm
+ name: q-text-as-data-3.1.3.x86_64.rpm
+ path: packages/linux/q-text-as-data-3.1.3.x86_64.rpm
test-rpm-packaging:
runs-on: ubuntu-18.04
@@ -215,9 +215,9 @@ jobs:
- name: Download RPM
uses: actions/download-artifact@v2
with:
- name: q-text-as-data-3.1.1-beta.x86_64.rpm
+ name: q-text-as-data-3.1.3.x86_64.rpm
- name: Retest using RPM
- run: ./dist/test-using-rpm.sh ./q-text-as-data-3.1.1-beta.x86_64.rpm
+ run: ./dist/test-using-rpm.sh ./q-text-as-data-3.1.3.x86_64.rpm
build-mac:
runs-on: macos-11
@@ -308,7 +308,7 @@ jobs:
export BRANCH_NAME=master
# TODO temp, since template rendering action doesn't work in mac
- cat .github/workflows/q.rb.brew-formula-template | sed 's/{{ .Q_VERSION }}/3.1.1-beta/g' | sed "s/{{ .Q_BRANCH_NAME }}/${BRANCH_NAME}/g" > ./brew/q.rb
+ cat .github/workflows/q.rb.brew-formula-template | sed 's/{{ .Q_VERSION }}/3.1.3/g' | sed "s/{{ .Q_BRANCH_NAME }}/${BRANCH_NAME}/g" > ./brew/q.rb
echo "Resulting formula:"
cat ./brew/q.rb
@@ -322,8 +322,8 @@ jobs:
- name: Upload Executable
uses: actions/upload-artifact@v1.0.0
with:
- name: q--3.1.1-beta_1.big_sur.bottle.tar.gz
- path: ./q--3.1.1-beta_1.big_sur.bottle.tar.gz
+ name: q--3.1.3_1.big_sur.bottle.tar.gz
+ path: ./q--3.1.3_1.big_sur.bottle.tar.gz
# TODO auto-create PR to main homebrew-core
# git clone https://github.com/harelba/homebrew-core.git
@@ -340,7 +340,7 @@ jobs:
- name: Download q bottle
uses: actions/download-artifact@v2
with:
- name: q--3.1.1-beta_1.big_sur.bottle.tar.gz
+ name: q--3.1.3_1.big_sur.bottle.tar.gz
- name: Test the created bottle
run: |
set -x -e
@@ -349,7 +349,7 @@ jobs:
WD=$(pwd)
pushd /usr/local/Cellar
- tar xvfz ${WD}/q--3.1.1-beta_1.big_sur.bottle.tar.gz
+ tar xvfz ${WD}/q--3.1.3_1.big_sur.bottle.tar.gz
popd
brew link q
@@ -459,17 +459,17 @@ jobs:
# TODO Windows versions do not support the -beta postfix
- export Q_MSI=./build/x86_64-pc-windows-msvc/release/msi_installer/q-text-as-data-3.1.1.msi
+ export Q_MSI=./build/x86_64-pc-windows-msvc/release/msi_installer/q-text-as-data-3.1.3.msi
chmod 755 $Q_MSI
mkdir -p packages/windows/
- cp $Q_MSI packages/windows/q-text-as-data-3.1.1.msi
+ cp $Q_MSI packages/windows/q-text-as-data-3.1.3.msi
- name: Upload Windows MSI
uses: actions/upload-artifact@v1.0.0
with:
- name: q-text-as-data-3.1.1.msi
- path: packages/windows/q-text-as-data-3.1.1.msi
+ name: q-text-as-data-3.1.3.msi
+ path: packages/windows/q-text-as-data-3.1.3.msi
test-windows-packaging:
needs: package-windows
@@ -480,12 +480,12 @@ jobs:
- name: Download Windows Package
uses: actions/download-artifact@v2
with:
- name: q-text-as-data-3.1.1.msi
+ name: q-text-as-data-3.1.3.msi
- name: Test Install of MSI
continue-on-error: true
shell: powershell
run: |
- $process = Start-Process msiexec.exe -ArgumentList "/i q-text-as-data-3.1.1.msi -l* msi-install.log /norestart /quiet" -PassThru -Wait
+ $process = Start-Process msiexec.exe -ArgumentList "/i q-text-as-data-3.1.3.msi -l* msi-install.log /norestart /quiet" -PassThru -Wait
$process.ExitCode
gc msi-install.log
@@ -494,7 +494,7 @@ jobs:
continue-on-error: true
shell: powershell
run: |
- $process = Start-Process msiexec.exe -ArgumentList "/u q-text-as-data-3.1.1.msi /norestart /quiet" -PassThru -Wait
+ $process = Start-Process msiexec.exe -ArgumentList "/u q-text-as-data-3.1.3.msi /norestart /quiet" -PassThru -Wait
$process.ExitCode
exit $process.ExitCode
diff --git a/.github/workflows/q.rb.brew-formula-template b/.github/workflows/q.rb.brew-formula-template
index 15b77d0..c860a0c 100644
--- a/.github/workflows/q.rb.brew-formula-template
+++ b/.github/workflows/q.rb.brew-formula-template
@@ -5,8 +5,7 @@ class Q < Formula
desc "Run SQL directly on CSV or TSV files"
homepage "https://harelba.github.io/q/"
# Using branch name for pre-releases, for tagged releases this would be the version tag, and not "version" part will be needed
- url "https://github.com/harelba/q/archive/{{ .Q_BRANCH_NAME }}.tar.gz"
- version "{{ .Q_VERSION }}"
+ url "https://github.com/harelba/q/archive/3.1.3.tar.gz"
# Removed for now, until everything is finalized
# sha256 "0844aed6658d0347a299b84bee978c88724d45093e8cbd7b05506ecc0b93c98c"
diff --git a/QSQL-NOTES.md b/QSQL-NOTES.md
index dc67d8a..5f61f83 100644
--- a/QSQL-NOTES.md
+++ b/QSQL-NOTES.md
@@ -1,57 +1,22 @@
-# New beta version 3.1.1-beta is available
-Installation instructions [at the end of this document](QSQL-NOTES.md#installation-of-the-new-beta-release)
-
-Contains a lot of major changes, see sections below for details.
-
-## Basic Example of using the caching
-```bash
-# Prepare some data
-$ seq 1 1000000 > myfile.csv
-
-# read from the resulting file (-c 1 just prevents the warning of having one column only)
-$ time q -c 1 "select sum(c1),count(*) from myfile.csv"
-500000500000 1000000
-q -c 1 "select sum(c1),count(*) from myfile.csv" 4.02s user 0.06s system 99% cpu 4.108 total
-
-# Running with `-C readwrite` auto-creates a cache file if there is none. The cache filename would be myfile.csv.qsql. The query runs as usual
-$ time q -c 1 "select sum(c1),count(*) from myfile.csv" -C readwrite
-time q -c 1 "select sum(c1),count(*) from myfile.csv" -C readwrite
-500000500000 1000000
-q -c 1 "select sum(c1),count(*) from myfile.csv" -C readwrite 3.96s user 0.08s system 99% cpu 4.057 total
-
-# Now run with `-C read`. The query will run from the cache file and not the original. As the file gets bigger, the difference will be much more noticable
-$ time q -c 1 "select sum(c1),count(*) from myfile.csv" -C read
-500000500000 1000000
-q -c 1 "select sum(c1),count(*) from myfile.csv" -C read 0.17s user 0.05s system 94% cpu 0.229 total
-
-# Now let's try another query on that file. Notice the short query duration. The cache is being used for any query that uses this file, and queries on multiple files that contain caches will reuse the cache as well.
-$ time q -c 1 "select avg(c1) from myfile.csv" -C read
-500000.5
-q -c 1 "select avg(c1) from myfile.csv" -C read 0.16s user 0.05s system 99% cpu 0.217 total
-
-# You can also query the qsql file directly
-$ time q -c 1 "select sum(c1),count(*) from myfile.csv.qsql"
-500000500000 1000000
-q -c 1 "select sum(c1),count(*) from myfile.csv.qsql" 0.17s user 0.05s system 95% cpu 0.226 total
-
-# Now let's delete the original csv file
-$ rm -vf myfile.csv
-
-# Running another query directly on the qsql file just works
-$ q -c 1 "select sum(c1),count(*) from myfile.csv.qsql"
-500000500000 1000000
-q -c 1 "select sum(c1),count(*) from myfile.csv.qsql" 0.17s user 0.04s system 94% cpu 0.226 total
-
-# See the `.qrc` section below if you want to set the default `-C` (`--caching-mode`) to something other than `none` (the default)
-```
+## Major changes and additions in the new `3.x` version
+This is the list of new/changed functionality in this version. Large changes, please make sure to read the details if you're already using q.
+
+* **Automatic Immutable Caching** - Automatic caching of data files (into `<my-csv-filename>.qsql` files), with huge speedups for medium/large files. Enabled through `-C readwrite` or `-C read`
+* **Direct querying of standard sqlite databases** - Just use it as a table name in the query. Format is `select ... from <sqlitedb_filename>:::<table_name>`, or just `<sqlitedb_filename>` if the database contains only one table. Multiple separate sqlite databases are fully supported in the same query.
+* **Direct querying of the `qsql` cache files** - The user can query directly from the `qsql` files, removing the need for the original files. Just use `select ... from <my-csv-filename>.qsql`. Please wait until the non-beta version is out before thinking about deleting any of your original files...
+* **Revamped `.qrc` mechanism** - allows opting-in to caching without specifying it in every query. By default, caching is **disabled**, for backward compatibility and for finding usability issues.
+* **Save-to-db is now reusable for queries** - `--save-db-to-disk` option (`-S`) has been enhanced to match the new capabilities. You can query the resulting file directly through q, using the method mentioned above (it's just a standard sqlite database).
+* **Only python3 is supported from now on** - Shouldn't be an issue, since q is a self-contained binary executable which has its own python embedded in it. Internally, q is now packaged with Python 3.8. After everything cools down, I'll probably bump this to 3.9/3.10.
+* **Minimal Linux Version Bumped** - Works with CentOS 8, Ubuntu 18.04+, Debian 10+. Currently only for x86_64. Depends on glibc version 2.25+. Haven't tested it on other architectures. Issuing other architectures will be possible later on
+* **Completely revamped binary packaging** - Using [pyoxidizer](https://github.com/indygreg/PyOxidizer)
-The following sections provide the details of each of the new functionality in this major version.
+The following sections provide the details of each of the new functionalities in this major version.
## Automatic caching of data files
Speeding up subsequent reads from the same file by several orders of magnitude by automatically creating an immutable cache file for each tabular text file.
-For example, reading a 0.9GB file with 1M rows and 100 columns without caching takes ~50 seconds. When the cache exists, querying the same file will take less than 1 second. Obviously, the cache can be used in order to perform any query and not just the original query that was used for creating the cache.
+For example, reading a 0.9GB file with 1M rows and 100 columns without caching takes ~50 seconds. When the cache exists, querying the same file will take around ~1-2 seconds. Obviously, the cache can be used in order to perform any query and not just the original query that was used for creating the cache.
When caching is enabled, the cache is created on the first read of a file, and used automatically when reading it in other queries. A separate cache is being created for each file that is being used, allowing reuse in multiple use-cases. For example, if two csv files each have their own cache file from previous queries, then running a query that JOINs these two files would use the caches as well (without loading the data into memory), speeding it up considerably.
@@ -139,15 +104,3 @@ Removed the dual py2/py3 support. Since q is packaged as a self-contained execut
Users which for some reason still use q's main source code file directly and use python 2 would need to stay with the latest 2.0.19 release. In some next version, q's code structure is going to change significantly anyway in order to become a standard python module, so using the main source code file directly would not be possible.
If you are such a user, and this decision hurts you considerably, please ping me.
-
-
-# Installation of the new beta release
-For now, only Linux RPM, DEB, Mac OSX and Windows are supported. Packages for additional Linux Distros will be added later (it should be rather easy now, due to the use of `fpm`).
-
-The beta OSX version is not in `brew` yet, you'll need to take the `macos-q` executable, put it in your filesystem and `chmod +x` it.
-
-Note: For some reason showing the q manual (`man q`) does not work for Debian, even though it's packaged in the DEB file. I'll get around to fixing it later. If you have any thoughts about this, please drop me a line.
-
-Download the relevant files directly from [The Beta Release Assets](https://github.com/harelba/q/releases/tag/v3.1.1-beta).
-
-
diff --git a/README.markdown b/README.markdown
index b0f20d2..aacdfb6 100644
--- a/README.markdown
+++ b/README.markdown
@@ -1,66 +1,50 @@
-[![Build Status](https://travis-ci.org/harelba/q.svg?branch=master)](https://travis-ci.org/harelba/q)
+[![Build and Package](https://github.com/harelba/q/workflows/BuildAndPackage/badge.svg?branch=master)](https://github.com/harelba/q/actions?query=branch%3Amaster)
-# q - Text as Data
-q is a command line tool that allows direct execution of SQL-like queries on CSVs/TSVs (and any other tabular text files).
-
-q treats ordinary files as database tables, and supports all SQL constructs, such as `WHERE`, `GROUP BY`, `JOIN`s, etc. It supports automatic column name and type detection, and q provides full support for multiple character encodings.
-
-q's web site is [http://harelba.github.io/q/](http://harelba.github.io/q/) or [https://q.textasdata.wiki](https://q.textasdata.wiki) It contains everything you need to download and use q immediately.
-
-## New beta version `3.1.1-beta` is available
-Full Details [here](QSQL-NOTES.md), and an example of the caching is in [here](QSQL-NOTES.md#basic-example-of-using-the-caching).
-
-This is the list of new/changed functionality in this version, large changes, please make sure to read it and the details link as well.
-
-* **Automatic Immutable Caching** - Automatic caching of data files (into `<my-csv-filename>.qsql` files), with huge speedups for medium/large files. Enabled through `-C readwrite` or `-C read`
-* **Direct querying of standard sqlite databases** - Just use it as a table name in the query. Format is `select ... from <sqlitedb_filename>:::<table_name>`, or just `<sqlitedb_filename>` if the database contains only one table. Multiple separate sqlite databases are fully supported in the same query.
-* **Direct querying of the `qsql` cache files** - The user can query directly from the `qsql` files, removing the need for the original files. Just use `select ... from <my-csv-filename>.qsql`. Please wait until the non-beta version is out before thinking about deleting any of your original files...
-* **Revamped `.qrc` mechanism** - allows opting-in to caching without specifying it in every query. By default, caching is **disabled**, for backward compatibility and for finding usability issues.
-* **Save-to-db is now reusable for queries** - `--save-db-to-disk` option (`-S`) has been enhanced to match the new capabilities. You can query the resulting file directly through q, using the method mentioned above (it's just a standard sqlite database).
-* **Only python3 is supported from now on** - Shouldn't be an issue, since q is a self-contained binary executable which has its own python embedded in it. Internally, q is now packaged with Python 3.8. After everything cools down, I'll probably bump this to 3.9/3.10.
-* **Minimal Linux Version Bumped** - Works with CentOS 8, Ubuntu 18.04+, Debian 10+. Currently only for x86_64. Depends on glibc version 2.25+. Haven't tested it on other architectures. Issuing other architectures will be possible later on
-Full details on the changes and the new usage is in [here](QSQL-NOTES.md)
-
-The version is still in early testing, for two reasons:
-
-* Completely new build and packaging flow - Using [pyoxidizer](https://github.com/indygreg/PyOxidizer)
-* It's a very large change in functionality, which might surface issues, new and backward compatibility ones
+# q - Text as Data
+q's purpose is to bring SQL expressive power to the Linux command line and to provide easy access to text as actual data.
-**Please don't use it for production, until the final non-beta version is out**
+q allows the following:
-If you're testing it out, I'd be more than happy to get any feedback. Please write all your feedback in [this issue](https://github.com/harelba/q/issues/281), instead of opening separate issues. That would really help me with managing this.
+* Performing SQL-like statements directly on tabular text data, auto-caching the data in order to accelerate additional querying on the same file.
+* Performing SQL statements directly on multi-file sqlite3 databases, without having to merge them or load them into memory
-## Installation.
-**This will currently install the latest standard version `2.0.19`. See below if you want to download the `3.1.1-beta` version**
+The following table shows the impact of using caching:
-The current production version `2.0.19` installation is extremely simple.
+| Rows | Columns | File Size | Query time without caching | Query time with caching | Speed Improvement |
+|:---------:|:-------:|:---------:|:--------------------------:|:-----------------------:|:-----------------:|
+| 5,000,000 | 100 | 4.8GB | 4 minutes, 47 seconds | 1.92 seconds | x149 |
+| 1,000,000 | 100 | 983MB | 50.9 seconds | 0.461 seconds | x110 |
+| 1,000,000 | 50 | 477MB | 27.1 seconds | 0.272 seconds | x99 |
+| 100,000 | 100 | 99MB | 5.2 seconds | 0.141 seconds | x36 |
+| 100,000 | 50 | 48MB | 2.7 seconds | 0.105 seconds | x25 |
-Instructions for all OSs are [here](http://harelba.github.io/q/#installation).
+Notice that for the current version, caching is **not enabled** by default, since the caches take disk space. Use `-C readwrite` or `-C read` to enable it for a query, or add `caching_mode` to `.qrc` to set a new default.
+
+q's web site is [http://harelba.github.io/q/](http://harelba.github.io/q/) or [https://q.textasdata.wiki](https://q.textasdata.wiki) It contains everything you need to download and use q immediately.
-### Installation of the new beta release
-For now, only Linux RPM, DEB, Mac OSX and Windows are supported. Packages for additional Linux Distros will be added later (it should be rather easy now, due to the use of `fpm`).
-The beta OSX version is not in `brew` yet, you'll need to take the `macos-q` executable, put it in your filesystem and `chmod +x` it.
+## Usage Examples
+q treats ordinary files as database tables, and supports all SQL constructs, such as `WHERE`, `GROUP BY`, `JOIN`s, etc. It supports automatic column name and type detection, and provides full support for multiple character encodings.
-Note: For some reason showing the q manual (`man q`) does not work for Debian, even though it's packaged in the DEB file. I'll get around to fixing it later. If you have any thoughts about this, please drop me a line.
+Here are some example commands to get the idea:
-Download the relevant files directly from [The Beta Release Assets](https://github.com/harelba/q/releases/tag/v3.1.1-beta).
+```bash
+$ q "SELECT COUNT(*) FROM ./clicks_file.csv WHERE c3 > 32.3"
-## Examples
+$ ps -ef | q -H "SELECT UID, COUNT(*) cnt FROM - GROUP BY UID ORDER BY cnt DESC LIMIT 3"
+$ q "select count(*) from some_db.sqlite3:::albums a left join another_db.sqlite3:::tracks t on (a.album_id = t.album_id)"
```
-q "SELECT COUNT(*) FROM ./clicks_file.csv WHERE c3 > 32.3"
-ps -ef | q -H "SELECT UID, COUNT(*) cnt FROM - GROUP BY UID ORDER BY cnt DESC LIMIT 3"
-```
+Detailed examples are in [here](http://harelba.github.io/q/#examples)
-Go [here](http://harelba.github.io/q/#examples) for more examples.
+## Installation.
+**New Major Version `3.1.3` is out with a lot of significant additions.**
-## Benchmark
-I have created a preliminary benchmark comparing q's speed between python2, python3, and comparing both to textql and octosql.
+Instructions for all OSs are [here](http://harelba.github.io/q/#installation).
-Your input about the validity of the benchmark and about the results would be greatly appreciated. More details are [here](test/BENCHMARK.md).
+The previous version `2.0.19` Can still be downloaded from [here](https://github.com/harelba/q/releases/tag/2.0.19)
## Contact
Any feedback/suggestions/complaints regarding this tool would be much appreciated. Contributions are most welcome as well, of course.
@@ -71,5 +55,5 @@ Twitter [@harelba](https://twitter.com/harelba)
Email [harelba@gmail.com](mailto:harelba@gmail.com)
-q on twitter: #qtextasdata
+q on twitter: [#qtextasdata](https://twitter.com/hashtag/qtextasdata?src=hashtag_click)
diff --git a/benchmark-config.sh b/benchmark-config.sh
index 8606b92..b293d9a 100644
--- a/benchmark-config.sh
+++ b/benchmark-config.sh
@@ -1,3 +1,3 @@
#!/bin/bash
-BENCHMARK_PYTHON_VERSIONS=(3.6.4 3.8.5)
+BENCHMARK_PYTHON_VERSIONS=(3.8.5)
diff --git a/bin/q.py b/bin/q.py
index 716e783..f36cb8f 100755
--- a/bin/q.py
+++ b/bin/q.py
@@ -35,7 +35,7 @@ from collections import OrderedDict
from sqlite3.dbapi2 import OperationalError
from uuid import uuid4
-q_version = '3.1.1-beta'
+q_version = '3.1.3'
#__all__ = [ 'QTextAsData' ]
@@ -337,7 +337,7 @@ class Sqlite3DB(object):
QCATALOG_TABLE_NAME = '_qcatalog'
NUMERIC_COLUMN_TYPES = {int, long, float}
- PYTHON_TO_SQLITE_TYPE_NAMES = { str: 'TEXT', int: 'INT', long : 'INT' , float: 'FLOAT', None: 'TEXT' }
+ PYTHON_TO_SQLITE_TYPE_NAMES = { str: 'TEXT', int: 'INT', long : 'INT' , float: 'REAL', None: 'TEXT' }
def __str__(self):
@@ -1025,6 +1025,9 @@ class TableColumnInferer(object):
# return it
return type_list_without_nulls[0]
else:
+ # If there are only two types, one float an one int, then choose a float type
+ if len(set(type_list_without_nulls)) == 2 and float in type_list_without_nulls and int in type_list_without_nulls:
+ return float
return str
def do_analysis(self):
@@ -1035,8 +1038,8 @@ class TableColumnInferer(object):
else:
raise Exception('Unknown parsing mode %s' % self.mode)
- if self.column_count == 1 and self.expected_column_count != 1:
- print("Warning: column count is one - did you provide the correct delimiter?", file=sys.stderr)
+ if self.column_count == 1 and self.expected_column_count != 1 and self.expected_column_count is not None:
+ print(f"Warning: column count is one (expected column count is {self.expected_column_count} - did you provide the correct delimiter?", file=sys.stderr)
self.infer_column_types()
self.infer_column_names()
diff --git a/mkdocs/docs/about.md b/mkdocs/docs/about.md
index 49ec15b..9da1294 100644
--- a/mkdocs/docs/about.md
+++ b/mkdocs/docs/about.md
@@ -6,5 +6,5 @@
### Email [harelba@gmail.com](mailto:harelba@gmail.com)
-### Chinese translation [jinzhencheng@outlook.com](jinzhencheng@outlook.com)
+### Chinese translation [jinzhencheng@outlook.com](mailto:jinzhencheng@outlook.com)
diff --git a/mkdocs/docs/index.md b/mkdocs/docs/index.md
index 98e31c6..29ab2e7 100644
--- a/mkdocs/docs/index.md
+++ b/mkdocs/docs/index.md
@@ -5,19 +5,56 @@
## Overview
-q is a command line tool that allows direct execution of SQL-like queries on CSVs/TSVs (and any other tabular text files).
+q's purpose is to bring SQL expressive power to the Linux command line by providing easy access to text as actual data.
-q treats ordinary files as database tables, and supports all SQL constructs, such as WHERE, GROUP BY, JOINs etc. It supports automatic column name and column type detection, and provides full support for multiple encodings.
+q allows the following:
-``` bash
-q "SELECT COUNT(*) FROM ./clicks_file.csv WHERE c3 > 32.3"
-```
+* Performing SQL-like statements directly on tabular text data, auto-caching the data in order to accelerate additional querying on the same file
+* Performing SQL statements directly on multi-file sqlite3 databases, without having to merge them or load them into memory
-``` bash
-ps -ef | q -H "SELECT UID,COUNT(*) cnt FROM - GROUP BY UID ORDER BY cnt DESC LIMIT 3"
+The following table shows the impact of using caching:
+
+| Rows | Columns | File Size | Query time without caching | Query time with caching | Speed Improvement |
+|:---------:|:-------:|:---------:|:--------------------------:|:-----------------------:|:-----------------:|
+| 5,000,000 | 100 | 4.8GB | 4 minutes, 47 seconds | 1.92 seconds | x149 |
+| 1,000,000 | 100 | 983MB | 50.9 seconds | 0.461 seconds | x110 |
+| 1,000,000 | 50 | 477MB | 27.1 seconds | 0.272 seconds | x99 |
+| 100,000 | 100 | 99MB | 5.2 seconds | 0.141 seconds | x36 |
+| 100,000 | 50 | 48MB | 2.7 seconds | 0.105 seconds | x25 |
+
+Notice that for the current version, caching is **not enabled** by default, since the caches take disk space. Use `-C readwrite` or `-C read` to enable it for a query, or add `caching_mode` to `.qrc` to set a new default.
+
+q treats ordinary files as database tables, and supports all SQL constructs, such as `WHERE`, `GROUP BY`, `JOIN`s, etc. It supports automatic column name and type detection, and provides full support for multiple character encodings.
+
+The new features - autocaching, direct querying of sqlite database and the use of `~/.qrc` file are described in detail in [here](https://github.com/harelba/q/blob/master/QSQL-NOTES.md).
+
+## Basic Usage
+
+```bash
+ q <flags> <query>
+
+Example Execution for a delimited file:
+
+ q "select * from myfile.csv"
+
+Example Execution for an sqlite3 database:
+
+ q "select * from mydatabase.sqlite:::my_table_name"
+
+ or
+
+ q "select * from mydatabase.sqlite"
+
+ if the database file contains only one table
+
+Auto-caching of delimited files can be activated through `-C readwrite`
+(writes new caches if needed) or `-C read` (only reads existing cache files)
+
+Setting the default caching mode (`-C`) can be done by
+writing a `~/.qrc` file. See docs for more info.
```
-Look at some examples [here](#examples), or just download the tool using the links in the [installation](#installation) below and play with it.
+Download the tool using the links in the [installation](#installation) below and play with it.
| | |
|:--------------------------------------:|:-----------------------------------------------:|
@@ -32,99 +69,262 @@ Look at some examples [here](#examples), or just download the tool using the lin
| Format | Instructions | Comments |
:---|:---|:---|
-|[OSX](https://github.com/harelba/q/releases/download/2.0.19/q-x86_64-Darwin)|run `brew install q`|man page is not available for this release yet. Use `q --help` for now||
-|[RPM Package](https://github.com/harelba/q/releases/download/2.0.19/q-text-as-data-2.0.19-1.x86_64.rpm)| run `rpm -ivh <package-filename>` or `rpm -U <package-filename>` if you already have an older version of q.| A man page is available for this release. Just enter `man q`.|
-|[DEB Package](https://github.com/harelba/q/releases/download/2.0.19/q-text-as-data_2.0.19-2_amd64.deb)| Run `sudo dpkg -i <package-filename>`|A man page is available for this release. Just enter `man q`.|
-|[Windows Installer](https://github.com/harelba/q/releases/download/2.0.19/q-AMD64-Windows-installer.exe)|Run the installer executable and hit next next next... q.exe will be added to the PATH so you can access it everywhere.|Windows doesn't update the PATH retroactively for open windows, so you'll need to open a new cmd window after the installation is done.|
-|[tar.gz](https://github.com/harelba/q/archive/2.0.19.tar.gz)|Full source file tree for latest stable version. Note that q.py cannot be used directly anymore, as it requires python dependencies||
-|[zip](https://github.com/harelba/q/archive/2.0.19.zip)|Full source file tree for the latest stable version. Note that q.py cannot be used directly anymore, as it requires python dependencies||
+|[OSX](https://github.com/harelba/q/releases/download/v3.1.3/macos-q)|`brew install` will install the previous `2.0.19` for now, until homebrew approves the new version. In the mean time, you can download the new version executable from the link, `chmod +x` it and then run. You might need to run it the first time from Finder using Right-Click -> Open, and then click the Open button. After the first time, it will run from the command line without any issues. |A man page is available, just run `man q`||
+|[RPM Package](https://github.com/harelba/q/releases/download/v3.1.3/q-text-as-data-3.1.3.x86_64.rpm)| run `rpm -ivh <package-filename>` or `rpm -U <package-filename>` if you already have an older version of q.| A man page is available for this release. Just enter `man q`.|
+|[DEB Package](https://github.com/harelba/q/releases/download/v3.1.3/q-text-as-data-3.1.3-1.x86_64.deb)| Run `sudo dpkg -i <package-filename>`|A man page is available for this release. Just enter `man q`. Some installations don't install the man page properly for some reason. I'll fix this soon|
+|[Windows Installer](https://github.com/harelba/q/releases/download/v3.1.3/q-text-as-data-3.1.3.msi)|Run the installer executable and hit next next next... q.exe will be added to the PATH so you can access it everywhere.|Windows doesn't update the PATH retroactively for open windows, so you'll need to open a new `cmd`/`bash` window after the installation is done.|
+|[Source tar.gz](https://github.com/harelba/q/archive/refs/tags/v3.1.3.tar.gz)|Full source file tree for latest stable version. Note that q.py cannot be used directly anymore, as it requires python dependencies||
+|[Source zip](https://github.com/harelba/q/archive/refs/tags/v3.1.3.zip)|Full source file tree for the latest stable version. Note that q.py cannot be used directly anymore, as it requires python dependencies||
-**Older versions can be downloaded [here](https://github.com/harelba/packages-for-q). Please let me know if you plan on using an older version, and why - I know of no reason to use any of them.**
+I will add packages for additional Linux Distributions if there's demand for it. If you're interested in another Linux distribution, please ping me. It's relatively easy to add new ones with the new packaging flow.
+
+The previous version `2.0.19` can be downloaded directly from [here](https://github.com/harelba/q/releases/tag/2.0.19). Please let me know if for some reason the new version is not suitable for your needs, and you're planning on using the previous one.
## Requirements
-As of version `2.0.9`, there's no need for any external dependency. Python itself (3.7), and any needed libraries are self-contained inside the installation, isolated from the rest of your system.
+q is packaged as a compiled standalone-executable that has no dependencies, not even python itself. This was done by using the awesome [pyoxidizer](https://github.com/indygreg/PyOxidizer) project.
-## Usage
-``` bash
-q <flags> "<query>"
+## Examples
- Simplest execution is `q "SELECT * FROM myfile"` which prints the entire file.
+This section shows example flows that highlight the main features. For more basic examples, see [here](#getting-started-examples).
+
+### Basic Examples:
+
+```bash
+# Prepare some data
+$ seq 1 1000000 > myfile.csv
+
+# Query it
+$ q "select sum(c1),count(*) from myfile.csv where c1 % 3 = 0"
+166666833333 333333
+
+# Use q to query from stdin
+$ ps -ef | q -b -H "SELECT UID, COUNT(*) cnt FROM - GROUP BY UID ORDER BY cnt DESC LIMIT 3"
+501 288
+0 115
+270 17
```
-q allows performing SQL-like statements on tabular text data. Its purpose is to bring SQL expressive power to the Linux command line and to provide easy access to text as actual data.
+### Auto-caching Examples
-Query should be an SQL-like query which contains *filenames instead of table names* (or - for stdin). The query itself should be provided as one parameter to the tool (i.e. enclosed in quotes). Multiple files can be used as one table by either writing them as `filename1+filename2+...` or by using shell wildcards (e.g. `my_files*.csv`).
+```bash
+# (time command output has been shortened for berevity)
-Use `-H` to signify that the input contains a header line. Column names will be detected automatically in that case, and can be used in the query. If this option is not provided, columns will be named cX, starting with 1 (e.g. `q "SELECT c3,c8 from ..."`).
+# Prepare some data
+$ seq 1 1000000 > myfile.csv
-Use `-d` to specify the input delimiter.
+# Read from the resulting file
+$ time q "select sum(c1),count(*) from myfile.csv"
+500000500000 1000000
+total_time=4.108 seconds
-Column types are auto detected by the tool, no casting is needed. Note that there's a flag `--as-text` which forces all columns to be treated as text columns.
+# Running with `-C readwrite` auto-creates a cache file if there is none. The cache filename would be myfile.csv.qsql. The query runs as usual
+$ time q "select sum(c1),count(*) from myfile.csv" -C readwrite
+500000500000 1000000
+total_time=4.057 seconds
-Please note that column names that include spaces need to be used in the query with back-ticks, as per the sqlite standard.
+# Now run with `-C read`. The query will run from the cache file and not the original. As the file gets bigger, the difference will be much more noticable
+$ time q "select sum(c1),count(*) from myfile.csv" -C read
+500000500000 1000000
+total_time=0.229 seconds
-Query/Input/Output encodings are fully supported (and q tries to provide out-of-the-box usability in that area). Please use `-e`,`-E` and `-Q` to control encoding if needed.
+# Now let's try another query on that file. Notice the short query duration. The cache is being used for any query that uses this file, and queries on multiple files that contain caches will reuse the cache as well.
+$ time q "select avg(c1) from myfile.csv" -C read
+500000.5
+total_time=0.217 seconds
-All sqlite3 SQL constructs are supported, including joins across files (use an alias for each table). Take a look at the [limitations](#limitations) section below for some rarely-used use cases which are not fully supported.
+# You can also query the qsql file directly, as it's just a standard sqlite3 DB file (see next section for q's support of reading directly from sqlite DBs)
+$ time q "select sum(c1),count(*) from myfile.csv.qsql"
+500000500000 1000000
+total_time=0.226 seconds
-### Query
-Each parameter that q gets is a full SQL query. All queries are executed one after another, outputing the results to standard output. Note that data loading is done only once, so when passing multiple queries on the same command-line, only the first one will take a long time. The rest will starting running almost instantanously, since all the data will already have been loaded. Remeber to double-quote each of the queries - Each parameter is a full SQL query.
+# Now let's delete the original csv file (be careful when deleting original data)
+$ rm -vf myfile.csv
-Any standard SQL expression, condition (both WHERE and HAVING), GROUP BY, ORDER BY etc. are allowed.
+# Running another query directly on the qsql file just works
+$ time q "select sum(c1),count(*) from myfile.csv.qsql"
+500000500000 1000000
+total_time=0.226 seconds
-JOINs are supported and Subqueries are supported in the WHERE clause, but unfortunately not in the FROM clause for now. Use table aliases when performing JOINs.
+# See the `.qrc` section below if you want to set the default `-C` (`--caching-mode`) to something other than `none` (the