summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndrew Gallant <jamslam@gmail.com>2020-02-20 18:16:27 -0500
committerAndrew Gallant <jamslam@gmail.com>2022-08-22 14:49:35 -0400
commit60a1db34a69b0d57adb9c2725366e9d8adb5efdc (patch)
treea2a39770b2ad0e86e2e3a8e6b59b3f26857f7302
parent87b33c96c02b5d728324632956d301ef3d234f80 (diff)
index: embarking on a most ambitious project...ag/index
-rw-r--r--Cargo.lock8
-rw-r--r--Cargo.toml1
-rw-r--r--crates/index/Cargo.toml19
-rw-r--r--crates/index/LICENSE-MIT21
-rw-r--r--crates/index/README.md8
-rw-r--r--crates/index/UNLICENSE24
-rw-r--r--crates/index/src/lib.rs7
-rw-r--r--crates/index/src/literal.rs1040
8 files changed, 1128 insertions, 0 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 4fd41885..f1f7535f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -180,6 +180,14 @@ dependencies = [
]
[[package]]
+name = "grep-index"
+version = "0.0.1"
+dependencies = [
+ "bstr 0.2.10 (registry+https://github.com/rust-lang/crates.io-index)",
+ "regex-syntax 0.6.14 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
+[[package]]
name = "grep-matcher"
version = "0.1.5"
dependencies = [
diff --git a/Cargo.toml b/Cargo.toml
index fb78fcb4..90fe862c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -32,6 +32,7 @@ members = [
"crates/globset",
"crates/grep",
"crates/cli",
+ "crates/index",
"crates/matcher",
"crates/pcre2",
"crates/printer",
diff --git a/crates/index/Cargo.toml b/crates/index/Cargo.toml
new file mode 100644
index 00000000..125e8629
--- /dev/null
+++ b/crates/index/Cargo.toml
@@ -0,0 +1,19 @@
+[package]
+publish = false
+name = "grep-index"
+version = "0.0.1" #:version
+authors = ["Andrew Gallant <jamslam@gmail.com>"]
+description = """
+Grep, but with an index.
+"""
+documentation = "https://docs.rs/grep-index"
+homepage = "https://github.com/BurntSushi/ripgrep"
+repository = "https://github.com/BurntSushi/ripgrep"
+readme = "README.md"
+keywords = ["regex", "grep", "search", "index", "ngram"]
+license = "Unlicense/MIT"
+edition = "2018"
+
+[dependencies]
+bstr = "0.2"
+regex-syntax = "0.6.14"
diff --git a/crates/index/LICENSE-MIT b/crates/index/LICENSE-MIT
new file mode 100644
index 00000000..3b0a5dc0
--- /dev/null
+++ b/crates/index/LICENSE-MIT
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2015 Andrew Gallant
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/crates/index/README.md b/crates/index/README.md
new file mode 100644
index 00000000..ab396753
--- /dev/null
+++ b/crates/index/README.md
@@ -0,0 +1,8 @@
+grep-index
+----------
+WIP.
+
+[![Build status](https://github.com/BurntSushi/ripgrep/workflows/ci/badge.svg)](https://github.com/BurntSushi/ripgrep/actions)
+[![](https://img.shields.io/crates/v/grep-regex.svg)](https://crates.io/crates/grep-index)
+
+Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org).
diff --git a/crates/index/UNLICENSE b/crates/index/UNLICENSE
new file mode 100644
index 00000000..68a49daa
--- /dev/null
+++ b/crates/index/UNLICENSE
@@ -0,0 +1,24 @@
+This is free and unencumbered software released into the public domain.
+
+Anyone is free to copy, modify, publish, use, compile, sell, or
+distribute this software, either in source code form or as a compiled
+binary, for any purpose, commercial or non-commercial, and by any
+means.
+
+In jurisdictions that recognize copyright laws, the author or authors
+of this software dedicate any and all copyright interest in the
+software to the public domain. We make this dedication for the benefit
+of the public at large and to the detriment of our heirs and
+successors. We intend this dedication to be an overt act of
+relinquishment in perpetuity of all present and future rights to this
+software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+For more information, please refer to <http://unlicense.org/>
diff --git a/crates/index/src/lib.rs b/crates/index/src/lib.rs
new file mode 100644
index 00000000..eb9bf8d1
--- /dev/null
+++ b/crates/index/src/lib.rs
@@ -0,0 +1,7 @@
+/*!
+TODO
+*/
+
+#![allow(warnings)]
+
+mod literal;
diff --git a/crates/index/src/literal.rs b/crates/index/src/literal.rs
new file mode 100644
index 00000000..75f2a586
--- /dev/null
+++ b/crates/index/src/literal.rs
@@ -0,0 +1,1040 @@
+// This module is currently a prototype. It was inspired a bit by a similar
+// implementation in codesearch[1]. The main difference is that it isn't fixed
+// to a particular ngram size and isn't quite as smart about reducing memory
+// usage. This was hastily written in order to bootstrap ngram extraction so
+// that other things in this crate could be worked on.
+//
+// Moving forward, we should polish this module up and probably generate
+// ngrams via graphemes or at least codepoints. The main issue with doing that
+// is that it will likely severely hinder our indexing speed. But alas, not
+// enough is built yet to let us quantify the performance difference.
+//
+// [1] - https://github.com/google/codesearch
+
+use std::cmp;
+use std::mem;
+
+use bstr::{BString, ByteSlice, ByteVec};
+use regex_syntax::hir::{self, Hir, HirKind};
+
+#[derive(Clone)]
+pub enum GramQuery {
+ Literal(BString),
+ And(Vec<GramQuery>),
+ Or(Vec<GramQuery>),
+}
+
+impl GramQuery {
+ fn nothing() -> GramQuery {
+ GramQuery::Or(vec![])
+ }
+
+ fn anything() -> GramQuery {
+ GramQuery::And(vec![])
+ }
+
+ fn is_nothing(&self) -> bool {
+ match *self {
+ GramQuery::Or(ref qs) => qs.is_empty(),
+ _ => false,
+ }
+ }
+
+ fn is_anything(&self) -> bool {
+ match *self {
+ GramQuery::And(ref qs) => qs.is_empty(),
+ _ => false,
+ }
+ }
+
+ fn is_literal(&self) -> bool {
+ match *self {
+ GramQuery::Literal(..) => true,
+ _ => false,
+ }
+ }
+
+ fn is_and(&self) -> bool {
+ match *self {
+ GramQuery::And(..) => true,
+ _ => false,
+ }
+ }
+
+ fn is_or(&self) -> bool {
+ match *self {
+ GramQuery::Or(..) => true,
+ _ => false,
+ }
+ }
+
+ fn unwrap(self) -> GramQuery {
+ match self {
+ q @ GramQuery::Literal(_) => q,
+ GramQuery::And(qs) => GramQuery::and(qs),
+ GramQuery::Or(qs) => GramQuery::or(qs),
+ }
+ }
+
+ fn unwrap_literal(self) -> BString {
+ match self {
+ GramQuery::Literal(lit) => lit,
+ GramQuery::And(_) => panic!("expected literal, but got And"),
+ GramQuery::Or(_) => panic!("expected literal, but got Or"),
+ }
+ }
+
+ fn or(mut queries: Vec<GramQuery>) -> GramQuery {
+ queries.retain(|q| !q.is_nothing());
+
+ if queries.iter().any(GramQuery::is_anything) {
+ GramQuery::anything()
+ } else if queries.len() == 1 {
+ queries.pop().unwrap()
+ } else if queries.iter().all(GramQuery::is_literal) {
+ let set = LiteralSet::from_vec(
+ queries.into_iter().map(GramQuery::unwrap_literal).collect(),
+ );
+ GramQuery::from_set_or(set)
+ } else if queries.iter().any(GramQuery::is_or) {
+ let mut flat = vec![];
+ for q in queries {
+ match q {
+ GramQuery::Or(qs) => {
+ flat.extend(qs);
+ }
+ q => flat.push(q),
+ }
+ }
+ GramQuery::or(flat)
+ } else {
+ GramQuery::Or(queries)
+ }
+ }
+
+ fn and(mut queries: Vec<GramQuery>) -> GramQuery {
+ queries.retain(|q| !q.is_anything());
+
+ if queries.iter().any(GramQuery::is_nothing) {
+ GramQuery::nothing()
+ } else if queries.len() == 1 {
+ queries.pop().unwrap()
+ } else if queries.iter().all(GramQuery::is_literal) {
+ let set = LiteralSet::from_vec(
+ queries.into_iter().map(GramQuery::unwrap_literal).collect(),
+ );
+ GramQuery::from_set_and(set)
+ } else if queries.iter().any(GramQuery::is_and) {
+ let mut flat = vec![];
+ for q in queries {
+ match q {
+ GramQuery::And(qs) => {
+ flat.extend(qs);
+ }
+ q => flat.push(q),
+ }
+ }
+ GramQuery::and(flat)
+ } else {
+ GramQuery::And(queries)
+ }
+ }
+
+ fn from_set_or(mut set: LiteralSet) -> GramQuery {
+ if set.is_empty() {
+ GramQuery::anything()
+ } else if set.len() == 1 {
+ GramQuery::Literal(set.lits.pop().unwrap())
+ } else {
+ let lits: Vec<_> =
+ set.lits.into_iter().map(GramQuery::Literal).collect();
+ GramQuery::Or(lits)
+ }
+ }
+
+ fn from_set_and(mut set: LiteralSet) -> GramQuery {
+ if set.is_empty() {
+ GramQuery::anything()
+ } else if set.len() == 1 {
+ GramQuery::Literal(set.lits.pop().unwrap())
+ } else {
+ let lits: Vec<_> =
+ set.lits.into_iter().map(GramQuery::Literal).collect();
+ GramQuery::And(lits)
+ }
+ }
+
+ fn implies(&self, o: &GramQuery) -> bool {
+ true
+ }
+
+ fn union(&mut self, q2: GramQuery) {
+ use self::GramQuery::*;
+
+ let (or, and) = (GramQuery::or, GramQuery::and);
+ let mut q1 = mem::replace(self, GramQuery::nothing());
+ match (q1, q2) {
+ (Literal(lit1), Literal(lit2)) => {
+ // let set = LiteralSet::from_vec(vec![lit1, lit2]);
+ // *self = GramQuery::from_set_or(set);
+ *self = or(vec![Literal(lit1), Literal(lit2)]);
+ }
+ (Literal(lit1), And(qs2)) => {
+ *self = or(vec![Literal(lit1), and(qs2)]);
+ }
+ (Literal(lit1), Or(mut qs2)) => {
+ qs2.push(Literal(lit1));
+ *self = or(qs2);
+ }
+ (And(qs1), And(qs2)) => {
+ if qs1.iter().all(GramQuery::is_literal)
+ && qs2.iter().all(GramQuery::is_literal)
+ {
+ let (common, not1, not2) = factor(qs1, qs2);
+ let mut disjuncts = vec![];
+ if !not1.is_empty() {
+ disjuncts.push(GramQuery::from_set_and(not1));
+ }
+ if !not2.is_empty() {
+ disjuncts.push(GramQuery::from_set_and(not2));
+ }
+ let mut conjuncts = vec![];
+ if !common.is_empty() {
+ conjuncts.push(GramQuery::from_set_and(common));
+ }
+ if !disjuncts.is_empty() {
+ conjuncts.push(or(disjuncts));
+ }
+ *self = and(conjuncts);
+ } else {
+ *self = or(vec![and(qs1), and(qs2)]);
+ }
+ }
+ (And(qs1), q2) => {
+ *self = or(vec![and(qs1), q2]);
+ }
+ (Or(mut qs1), q2) => {
+ qs1.push(q2);
+ *self = or(qs1);
+ }
+ }
+ }
+
+ fn intersect(&mut self, q2: GramQuery) {
+ use self::GramQuery::*;
+
+ let (or, and) = (GramQuery::or, GramQuery::and);
+ let mut q1 = mem::replace(self, GramQuery::nothing()).unwrap();
+ let q2 = q2.unwrap();
+ match (q1, q2) {
+ (Literal(lit1), Literal(lit2)) => {
+ *self = and(vec![Literal(lit1), Literal(lit2)]);
+ }
+ (Literal(lit1), And(mut qs2)) => {
+ qs2.push(Literal(lit1));
+ *self = and(qs2);
+ }
+ (Literal(lit1), Or(qs2)) => {
+ *self = and(vec![Literal(lit1), or(qs2)]);
+ }
+ (And(mut qs1), q2) => {
+ qs1.push(q2);
+ *self = and(qs1);
+ }
+ (Or(qs1), Or(qs2)) => {
+ if qs1.iter().all(GramQuery::is_literal)
+ && qs2.iter().all(GramQuery::is_literal)
+ {
+ let (common, not1, not2) = factor(qs1, qs2);
+ let mut conjuncts = vec![];
+ if !not1.is_empty() {
+ conjuncts.push(GramQuery::from_set_or(not1));
+ }
+ if !not2.is_empty() {
+ conjuncts.push(GramQuery::from_set_or(not2));
+ }
+ let mut disjuncts = vec![];
+ if !common.is_empty() {
+ disjuncts.push(GramQuery::from_set_or(common));
+ }
+ if !conjuncts.is_empty() {
+ disjuncts.push(and(conjuncts));
+ }
+ *self = or(disjuncts);
+ } else {
+ *self = and(vec![or(qs1), or(qs2)]);
+ }
+ }
+ (Or(qs1), q2) => {
+ *self = and(vec![or(qs1), q2]);
+ }
+ }
+ }
+
+ fn and_ngrams(&mut self, size: usize, set: &LiteralSet) {
+ if set.min_len() < size {
+ return;
+ }
+ let mut qor = GramQuery::nothing();
+ for lit in &set.lits {
+ if lit.len() < size {
+ continue;
+ }
+
+ let mut set = LiteralSet::new();
+ set.extend(ngrams(size, lit).map(BString::from));
+ qor.union(GramQuery::from_set_and(set));
+ }
+ self.intersect(qor);
+ }
+}
+
+impl std::fmt::Debug for GramQuery {
+ fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+ if f.alternate() {
+ return match *self {
+ GramQuery::Literal(ref lit) => {
+ f.debug_tuple("Literal").field(lit).finish()
+ }
+ GramQuery::And(ref qs) => {
+ f.debug_tuple("And").field(qs).finish()
+ }
+ GramQuery::Or(ref qs) => {
+ f.debug_tuple("Or").field(qs).finish()
+ }
+ };
+ }
+
+ match *self {
+ GramQuery::Literal(ref lit) => {
+ let x = format!("{:?}", lit);
+ write!(f, "'{}'", &x[1..x.len() - 1])
+ }
+ GramQuery::And(ref qs) => {
+ let it = qs.iter().map(|q| {
+ if q.is_literal() {
+ format!("{:?}", q)
+ } else {
+ format!("({:?})", q)
+ }
+ });
+ write!(f, "{}", it.collect::<Vec<String>>().join(" & "))
+ }
+ GramQuery::Or(ref qs) => {
+ let it = qs.iter().map(|q| {
+ if q.is_literal() {
+ format!("{:?}", q)
+ } else {
+ format!("({:?})", q)
+ }
+ });
+ write!(f, "{}", it.collect::<Vec<String>>().join(" | "))
+ }
+ }
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct Analysis {
+ size: usize,
+ query: GramQuery,
+ exact: LiteralSet,
+ prefix: LiteralSet,
+ suffix: LiteralSet,
+}
+
+impl Analysis {
+ fn exact(size: usize, set: LiteralSet) -> Analysis {
+ Analysis {
+ size,
+ query: GramQuery::anything(),
+ exact: set,
+ prefix: LiteralSet::new(),
+ suffix: LiteralSet::new(),
+ }
+ }
+
+ fn anything(size: usize) -> Analysis {
+ Analysis {
+ size,
+ query: GramQuery::anything(),
+ exact: LiteralSet::new(),
+ prefix: LiteralSet::new(),
+ suffix: LiteralSet::new(),
+ }
+ }
+
+ fn exact_one(size: usize, string: BString) -> Analysis {
+ Analysis {
+ size,
+ query: GramQuery::anything(),
+ exact: LiteralSet::single(string),
+ prefix: LiteralSet::new(),
+ suffix: LiteralSet::new(),
+ }
+ }
+
+ fn empty_string(size: usize) -> Analysis {
+ Analysis::exact(size, LiteralSet::single(BString::from("")))
+ }
+
+ fn max_len(&self) -> usize {
+ cmp::max(
+ self.exact.len(),
+ cmp::max(self.prefix.len(), self.suffix.len()),
+ )
+ }
+
+ fn is_exact(&self) -> bool {
+ !self.exact.is_empty()
+ }
+
+ fn make_inexact(&mut self) {
+ if !self.is_exact() {
+ return;
+ }
+ self.prefix = mem::replace(&mut self.exact, LiteralSet::new());
+ self.suffix = self.prefix.clone();
+ }
+
+ fn save_exact(&mut self) {
+ self.query.and_ngrams(3, &self.exact);
+ }
+
+ fn union(&mut self, mut o: Analysis) {
+ if self.is_exact() && o.is_exact() {
+ self.exact.union(o.exact);
+ } else if self.is_exact() {
+ self.save_exact();
+ self.make_inexact();
+ self.prefix.union(o.prefix);
+ self.suffix.union(o.suffix);
+ } else if o.is_exact() {
+ o.save_exact();
+ self.prefix.union(o.exact.clone());
+ self.suffix.union(o.exact);
+ } else {
+ self.prefix.union(o.prefix);
+ self.suffix.union(o.suffix);
+ }
+ self.query.union(o.query);
+ self.simplify();
+ }
+
+ fn concat(&mut self, mut o: Analysis) {
+ let inex_cross = if self.is_exact() && o.is_exact() {
+ self.exact.cross(o.exact);
+ None
+ } else {
+ let self_exact = self.is_exact();
+ let o_exact = o.is_exact();
+ self.make_inexact();
+ o.make_inexact();
+
+ let inex_cross = if !self_exact && !o_exact {
+ let mut cross = self.suffix.clone();
+ cross.cross(o.prefix.clone());
+ Some(cross)
+ } else {
+ None
+ };
+
+ if self_exact {
+ self.prefix.cross(o.prefix);
+ } else if self.prefix.has_empty() {
+ self.prefix.union(o.prefix);
+ }
+ if o_exact {
+ self.suffix.cross(o.suffix);
+ } else if self.suffix.has_empty() {
+ self.suffix.union(o.suffix);
+ } else {
+ self.suffix = o.suffix;
+ }
+ inex_cross
+ };
+ self.query.intersect(o.query);
+ if let Some(cross) = inex_cross {
+ self.query.and_ngrams(self.size, &cross);
+ }
+ self.simplify();
+ }
+
+ fn simplify(&mut self) {
+ if self.is_exact() && self.exact.min_len() >= (self.size + 1) {
+ self.save_exact();
+ for lit in &self.exact.lits {
+ if lit.len() < self.size {
+ self.prefix.lits.push(lit.clone());
+ self.suffix.lits.push(lit.clone());
+ } else {
+ self.prefix.lits.push(lit[..self.size - 1].into());
+ self.suffix
+ .lits
+ .push(lit[lit.len() - (self.size - 1)..].into());
+ }
+ }
+ self.exact.clear();
+ self.prefix.canonicalize();
+ self.suffix.canonicalize();
+ }
+ if !self.is_exact() {
+ self.simplify_prefix();
+ self.simplify_suffix();
+ }
+ }
+
+ fn finalize(&mut self) {
+ if self.is_exact() && self.exact.min_len() >= self.size {
+ self.save_exact();
+ for lit in &self.exact.lits {
+ if lit.len() < self.size {
+ self.prefix.lits.push(lit.clone());
+ self.suffix.lits.push(lit.clone());
+ } else {
+ self.prefix.lits.push(lit[..self.size - 1].into());
+ self.suffix
+ .lits
+ .push(lit[lit.len() - (self.size - 1)..].into());
+ }
+ }
+ self.exact.clear();
+ self.prefix.canonicalize();
+ self.suffix.canonicalize();
+ }
+ if !self.is_exact() {
+ self.simplify_prefix();
+ self.simplify_suffix();
+ }
+ }
+
+ fn simplify_prefix(&mut self) {
+ self.query.and_ngrams(self.size, &self.prefix);
+ self.prefix.retain_prefix(self.size - 1);
+ }
+
+ fn simplify_suffix(&mut self) {
+ self.query.and_ngrams(self.size, &self.suffix);
+ self.suffix.retain_suffix(self.size - 1);
+ }
+}
+
+#[derive(Clone, Debug)]
+struct LiteralSet {
+ lits: Vec<BString>,
+}
+
+impl LiteralSet {
+ fn new() -> LiteralSet {
+ LiteralSet { lits: vec![] }
+ }
+
+ fn single(lit: BString) -> LiteralSet {
+ LiteralSet { lits: vec![lit] }
+ }
+
+ fn from_vec(lits: Vec<BString>) -> LiteralSet {
+ let mut set = LiteralSet { lits };
+ set.canonicalize();
+ set
+ }
+
+ fn clear(&mut self) {
+ self.lits.clear();
+ }
+
+ fn retain_prefix(&mut self, max: usize) {
+ for lit in &mut self.lits {
+ lit.truncate(max);
+ }
+ self.canonicalize();
+ }
+
+ fn retain_suffix(&mut self, max: usize) {
+ for lit in &mut self.lits {
+ if lit.len() <= max {
+ continue;
+ }
+ let start = lit.len() - max;
+ lit.drain(..start);
+ }
+ self.canonicalize();
+ }
+
+ fn extend<I: IntoIterator<Item = BString>>(&mut self, it: I) {
+ self.lits.extend(it);
+ self.canonicalize();
+ }
+
+ fn canonicalize(&mut self) {
+ self.lits.sort();
+ self.lits.dedup();
+ }
+
+ fn factor(self, o: LiteralSet) -> (LiteralSet, LiteralSet, LiteralSet) {
+ // TODO: Do this without cloning every literal.
+
+ let (set1, set2) = (self.lits, o.lits);
+ let (mut common, mut not1, mut not2) = (vec![], vec![], vec![]);
+
+ let (mut i1, mut i2) = (0, 0);
+ while i1 < set1.len() && i2 < set2.len() {
+ if set1[i1] < set2[i2] {
+ not1.push(set1[i1].clone());
+ i1 += 1;
+ } else if set2[i2] < set1[i1] {
+ not2.push(set2[i2].clone());
+ i2 += 1;
+ } else {
+ common.push(set1[i1].clone());
+ i1 += 1;
+ i2 += 1;
+ }
+ }
+ while i1 < set1.len() {
+ not1.push(set1[i1].clone());
+ i1 += 1;
+ }
+ while i2 < set2.len() {
+ not2.push(set2[i2].clone());
+ i2 += 1;
+ }
+ (
+ LiteralSet::from_vec(common),
+ LiteralSet::from_vec(not1),
+ LiteralSet::from_vec(not2),
+ )
+ }
+
+ fn union(&mut self, o: LiteralSet) {
+ self.lits.extend(o.lits);
+ self.canonicalize();
+ }
+
+ fn cross(&mut self, o: LiteralSet) {
+ if o.is_empty() || o.has_only_empty() {
+ return;
+ }
+ if self.is_empty() || self.has_only_empty() {
+ *self = o;
+ return;
+ }
+
+ let orig = mem::replace(&mut self.lits, vec![]);
+ for selflit in &orig {
+ for olit in &o.lits {
+ let mut newlit = selflit.clone();
+ newlit.push_str(olit);
+ self.lits.push(newlit);
+ }
+ }
+ }
+
+ fn is_empty(&self) -> bool {
+ self.lits.is_empty()
+ }
+
+ fn len(&self) -> usize {
+ self.lits.len()
+ }
+
+ fn min_len(&self) -> usize {
+ self.lits.iter().map(|x| x.len()).min().unwrap_or(0)
+ }
+
+ fn has_empty(&self) -> bool {
+ self.lits.get(0).map_or(false, |x| x.is_empty())
+ }
+
+ fn has_only_empty(&self) -> bool {
+ self.len() == 1 && self.has_empty()
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct GramQueryBuilder {
+ ngram_size: usize,
+ limit_len: usize,
+ limit_class: usize,
+}
+
+impl GramQueryBuilder {
+ pub fn new() -> GramQueryBuilder {
+ GramQueryBuilder { ngram_size: 3, limit_len: 250, limit_class: 10 }
+ }
+
+ pub fn ngram_size(&mut self, size: usize) -> &mut GramQueryBuilder {
+ // A size smaller than this doesn't make a ton of sense, particularly
+ // given that it is currently measured in bytes, not codepoints (or
+ // graphemes).
+ assert!(size >= 2);
+ self.ngram_size = size;
+ self
+ }
+
+ pub fn limit_len(&mut self, len: usize) -> &mut GramQueryBuilder {
+ self.limit_len = len;
+ self
+ }
+
+ pub fn limit_class(&mut self, len: usize) -> &mut GramQueryBuilder {
+ self.limit_class = len;
+ self
+ }
+
+ pub fn build(&self, exp: &Hir) -> GramQuery {
+ self.build_analysis(exp).query
+ }
+
+ fn build_analysis(&self, exp: &Hir) -> Analysis {
+ let mut ana = self.b(exp);
+ ana.finalize();
+ ana
+ }
+
+ fn b(&self, exp: &Hir) -> Analysis {
+ match *exp.kind() {
+ HirKind::Empty | HirKind::Anchor(_) | HirKind::WordBoundary(_) => {
+ Analysis::empty_string(self.ngram_size)
+ }
+ HirKind::Literal(hir::Literal::Unicode(ch)) => {
+ let mut lit = BString::from(vec![]);
+ lit.push_char(ch);
+ Analysis::exact_one(self.ngram_size, lit)
+ }
+ HirKind::Literal(hir::Literal::Byte(b)) => {
+ let mut lit = BString::from(vec![]);
+ lit.push_byte(b);
+ Analysis::exact_one(self.ngram_size, lit)
+ }
+ HirKind::Class(hir::Class::Unicode(ref cls)) => {
+ if class_over_limit_unicode(cls, self.limit_class) {
+ return Analysis::anything(self.ngram_size);
+ }
+
+ let mut set = LiteralSet::new();
+ for r in cls.iter() {
+ for cp in (r.start() as u32)..=(r.end() as u32) {
+ let ch = match std::char::from_u32(cp) {
+ None => continue,
+ Some(ch) => ch,
+ };
+ set.lits.push(BString::from(ch.to_string()));
+ }
+ }
+ set.canonicalize();
+ Analysis::exact(self.ngram_size, set)
+ }
+ HirKind::Class(hir::Class::Bytes(ref cls)) => {
+ if class_over_limit_bytes(cls, self.limit_class) {
+ return Analysis::anything(self.ngram_size);
+ }
+
+ let mut set = LiteralSet::new();
+ for r in cls.iter() {
+ for b in r.start()..=r.end() {
+ set.lits.push(BString::from(vec![b]));
+ }
+ }
+ set.canonicalize();
+ Analysis::exact(self.ngram_size, set)
+ }
+ HirKind::Group(ref group) => self.b(&group.hir),
+ HirKind::Repetition(ref rep) => {
+ if rep.is_match_empty() {
+ Analysis::anything(self.ngram_size)
+ } else {
+ let mut ana = self.b(&rep.hir);
+ ana.make_inexact();
+ ana
+ }
+ }
+ HirKind::Alternation(ref exps) => {
+ let mut ana = self.b(&exps[0]);
+ for e in exps.iter().skip(1) {
+ ana.union(self.b(e));
+ }
+ ana
+ }
+ HirKind::Concat(ref exps) => {
+ let mut exps = combine_literals(exps);
+ let mut ana = Analysis::empty_string(self.ngram_size);
+ for e in exps {
+ let next = self.build_literal_or_hir(e);
+ if ana.max_len() + ana.max_len() > self.limit_len {
+ ana.concat(Analysis::anything(self.ngram_size));
+ } else {
+ ana.concat(next);
+ }
+ }
+ ana
+ }
+ }
+ }
+
+ fn build_literal_or_hir(&self, or: LiteralOrHir) -> Analysis {
+ match or {
+ LiteralOrHir::Literal(string) => {
+ Analysis::exact_one(self.ngram_size, string)
+ }
+ LiteralOrHir::Other(exp) => self.b(exp),
+ }
+ }
+}
+
+impl Default for GramQueryBuilder {
+ fn default() -> GramQueryBuilder {
+ GramQueryBuilder::new()
+ }
+}
+
+fn class_over_limit_unicode(cls: &hir::ClassUnicode, limit: usize) -> bool {
+ let mut count = 0;
+ for r in cls.iter() {
+ if count > limit {
+ return true;
+ }
+ count += (r.end() as u32 - r.start() as u32) as usize;
+ }
+ count > limit
+}
+
+fn class_over_limit_bytes(cls: &hir::ClassBytes, limit: usize) -> bool {
+ let mut count = 0;
+ for r in cls.iter() {
+ if count > limit {
+ return true;
+ }
+ count += (r.end() - r.start()) as usize;
+ }
+ count > limit
+}
+
+#[derive(Debug)]
+enum LiteralOrHir<'a> {
+ Literal(BString),
+ // Guaranteed to never contain a HirKind::Literal.
+ Other(&'a Hir),
+}
+
+fn combine_literals(concat: &[Hir]) -> Vec<LiteralOrHir> {
+ let mut combined = vec![];
+ let mut lit = BString::from(vec![]);
+ for exp in concat {
+ match *exp.kind() {
+ HirKind::Literal(hir::Literal::Unicode(ch)) => {
+ lit.push_char(ch);
+ }
+ HirKind::Literal(hir::Literal::Byte(b)) => {
+ lit.push_byte(b);
+ }
+ _ => {
+ if !lit.is_empty() {
+ combined.push(LiteralOrHir::Literal(lit));
+ lit = BString::from(vec![]);
+ }
+ combined.push(LiteralOrHir::Other(exp));
+ }
+ }
+ }
+ if !lit.is_empty() {
+ combined.push(LiteralOrHir::Literal(lit));
+ }
+ combined
+}
+
+/// Returns all ngrams of the given size in a sliding window fashion over the
+/// given literal. If the literal is smaller than the given size, then the
+/// entire literal is returned as an ngram. (An empty literal always results in
+/// a single empty string returned.)
+fn ngrams<'b, B: 'b + AsRef<[u8]> + ?Sized>(
+ size: usize,
+ lit: &'b B,
+) -> impl Iterator<Item = &'b [u8]> {
+ let lit = lit.as_ref();
+ let size = cmp::min(size, lit.len());
+ let end = lit.len() - size;
+ (0..=end).map(move |i| &lit[i..i + size])
+}
+
+fn factor(
+ qs1: Vec<GramQuery>,
+ qs2: Vec<GramQuery>,
+) -> (LiteralSet, LiteralSet, LiteralSet) {
+ let (mut lits1, mut lits2) = (vec![], vec![]);
+ for q in qs1 {
+ lits1.push(q.unwrap_literal());
+ }
+ for q in qs2 {
+ lits2.push(q.unwrap_literal());
+ }
+
+ let set1 = LiteralSet::from_vec(lits1);
+ let set2 = LiteralSet::from_vec(lits2);
+ set1.factor(set2)
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use regex_syntax::ParserBuilder;