summaryrefslogtreecommitdiffstats
path: root/src/fuzzy_patterns.rs
blob: 08a03fea082fcfed2374a58a71c8570a398fd29c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
//! a simple fuzzy pattern matcher for filename filtering / sorting.
//! It's not meant for file contents but for small strings (less than 1000 chars)
//!  such as file names.

use std::fmt::{self, Write};
use crate::patterns::{Match};

// weights used in match score computing
const BONUS_MATCH: i32 = 10_000;
const BONUS_EXACT: i32 = 1_000;
const BONUS_START: i32 = 0; // disabled
const BONUX_CANDIDATE_LENGTH: i32 = -1; // per char
const BONUS_LENGTH: i32 = -10; // per char of length of the match
const MAX_LENGTH_BASE: usize = 2;
const MAX_LENGTH_PER_CHAR: usize = 2;

#[derive(Debug, Clone)]
pub struct FuzzyPattern {
    lc_chars: Box<[char]>, // lowercase characters
}

impl fmt::Display for FuzzyPattern {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        for &c in self.lc_chars.iter() {
            f.write_char(c)?
        }
        Ok(())
    }
}

impl FuzzyPattern {
    pub fn from(pat: &str) -> FuzzyPattern {
        let lc_chars: Vec<char> = pat.chars().map(|c| c.to_ascii_lowercase()).collect();
        let lc_chars = lc_chars.into_boxed_slice();
        FuzzyPattern { lc_chars }
    }
    fn match_starting_at_index(
        &self,
        cand_chars: &[char],
        start_idx: usize, // start index in candidate
        max_match_len: usize,
    ) -> Option<Match> {
        if cand_chars[start_idx] != self.lc_chars[0] {
            return None;
        }
        let mut pos: Vec<usize> = vec![]; // positions of matching chars in candidate
        pos.push(start_idx);
        let mut d = 1;
        for pat_idx in 1..self.lc_chars.len() {
            loop {
                let cand_idx = start_idx + d;
                if cand_idx == cand_chars.len() || d > max_match_len {
                    return None;
                }
                d += 1;
                if cand_chars[cand_idx] == self.lc_chars[pat_idx] {
                    pos.push(cand_idx);
                    break;
                }
            }
        }
        Some(Match { score: 0, pos })
    }
    // return a match if the pattern can be found in the candidate string.
    // The algorithm tries to return the best one. For example if you search
    // "abc" in "ababaca-abc", the returned match would be at the end.
    pub fn find(&self, candidate: &str) -> Option<Match> {
        let cand_chars: Vec<char> = candidate.chars().map(|c| c.to_ascii_lowercase()).collect();
        if cand_chars.len() < self.lc_chars.len() {
            return None;
        }
        let mut best_score = 0;
        let max_match_len = MAX_LENGTH_BASE + MAX_LENGTH_PER_CHAR * self.lc_chars.len();
        let mut best_match: Option<Match> = None;
        for start_idx in 0..=cand_chars.len() - self.lc_chars.len() {
            let sm = self.match_starting_at_index(&cand_chars, start_idx, max_match_len);
            if let Some(mut m) = sm {
                let match_len = m.pos[m.pos.len() - 1] - m.pos[0];
                let mut score = BONUS_MATCH;
                score += BONUX_CANDIDATE_LENGTH * (cand_chars.len() as i32);
                if m.pos[0] == 0 {
                    score += BONUS_START;
                    if cand_chars.len() == self.lc_chars.len() {
                        score += BONUS_EXACT;
                    }
                }
                score += (match_len as i32) * BONUS_LENGTH;
                if score > best_score {
                    best_score = score;
                    m.score = score;
                    best_match = Some(m);
                }
            }
        }
        best_match
    }
}