path: root/src/tokenizer.go
author    Junegunn Choi <junegunn.c@gmail.com>  2015-04-17 22:23:52 +0900
committer Junegunn Choi <junegunn.c@gmail.com>  2015-04-17 22:23:52 +0900
commit    2fe1e28220c543ddbf4e12ee7396e44ee85ad8e0 (patch)
tree      82c77b5e639cd66b941356788bcb8e0e053949be /src/tokenizer.go
parent    288131ac5a895ba335681339d85ee039557490da (diff)
Improvements in performance and memory usage
I profiled fzf and it turned out that it was spending a significant amount of time repeatedly converting character arrays into Unicode codepoints. This commit greatly improves search performance after the initial scan by memoizing the converted results.

This commit also addresses the problem of unbounded memory usage. fzf is a short-lived process that usually handles small input, so it was implemented to cache intermediate results very aggressively, with no notion of cache expiration or eviction. A full-blown caching scheme with eviction would still be overkill; instead, this commit introduces limits on the maximum size (or minimum selectivity) of the intermediate results that can be cached.
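The two ideas in the message above, memoizing the []rune conversion and capping what may be cached, can be sketched roughly as follows. This is an illustrative sketch only, not code from this commit; item, Runes, maxCacheable, and resultCache are made-up names.

// Sketch: not the actual fzf code. The names item, Runes, maxCacheable,
// and resultCache are hypothetical; they only illustrate the two ideas
// described in the commit message.
package main

type item struct {
	text  string
	runes []rune // memoized []rune conversion, filled on first use
}

// Runes converts the item's text to Unicode codepoints once and reuses
// the memoized slice on every later call.
func (i *item) Runes() []rune {
	if i.runes == nil {
		i.runes = []rune(i.text)
	}
	return i.runes
}

// maxCacheable caps how large (i.e. how unselective) an intermediate
// result set may be before caching it is skipped, bounding memory use.
const maxCacheable = 1000

type resultCache map[string][]*item

func (c resultCache) add(query string, result []*item) {
	if len(result) > maxCacheable {
		return // too large to be worth keeping around
	}
	c[query] = result
}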
Diffstat (limited to 'src/tokenizer.go')
-rw-r--r--  src/tokenizer.go  44
1 file changed, 21 insertions(+), 23 deletions(-)
diff --git a/src/tokenizer.go b/src/tokenizer.go
index d38f46f5..c61b2383 100644
--- a/src/tokenizer.go
+++ b/src/tokenizer.go
@@ -16,15 +16,9 @@ type Range struct {
end int
}
-// Transformed holds the result of tokenization and transformation
-type Transformed struct {
- whole *string
- parts []Token
-}
-
// Token contains the tokenized part of the strings and its prefix length
type Token struct {
- text *string
+ text *[]rune
prefixLength int
}
@@ -81,8 +75,8 @@ func withPrefixLengths(tokens []string, begin int) []Token {
for idx, token := range tokens {
// Need to define a new local variable instead of the reused token to take
// the pointer to it
- str := token
- ret[idx] = Token{text: &str, prefixLength: prefixLength}
+ runes := []rune(token)
+ ret[idx] = Token{text: &runes, prefixLength: prefixLength}
prefixLength += len([]rune(token))
}
return ret
@@ -142,33 +136,40 @@ func Tokenize(str *string, delimiter *regexp.Regexp) []Token {
return withPrefixLengths(tokens, 0)
}
-func joinTokens(tokens []Token) string {
+func joinTokens(tokens *[]Token) *string {
ret := ""
- for _, token := range tokens {
- ret += *token.text
+ for _, token := range *tokens {
+ ret += string(*token.text)
}
- return ret
+ return &ret
+}
+
+func joinTokensAsRunes(tokens *[]Token) *[]rune {
+ ret := []rune{}
+ for _, token := range *tokens {
+ ret = append(ret, *token.text...)
+ }
+ return &ret
}
// Transform is used to transform the input when --with-nth option is given
-func Transform(tokens []Token, withNth []Range) *Transformed {
+func Transform(tokens []Token, withNth []Range) *[]Token {
transTokens := make([]Token, len(withNth))
numTokens := len(tokens)
- whole := ""
for idx, r := range withNth {
- part := ""
+ part := []rune{}
minIdx := 0
if r.begin == r.end {
idx := r.begin
if idx == rangeEllipsis {
- part += joinTokens(tokens)
+ part = append(part, *joinTokensAsRunes(&tokens)...)
} else {
if idx < 0 {
idx += numTokens + 1
}
if idx >= 1 && idx <= numTokens {
minIdx = idx - 1
- part += *tokens[idx-1].text
+ part = append(part, *tokens[idx-1].text...)
}
}
} else {
@@ -195,11 +196,10 @@ func Transform(tokens []Token, withNth []Range) *Transformed {
minIdx = util.Max(0, begin-1)
for idx := begin; idx <= end; idx++ {
if idx >= 1 && idx <= numTokens {
- part += *tokens[idx-1].text
+ part = append(part, *tokens[idx-1].text...)
}
}
}
- whole += part
var prefixLength int
if minIdx < numTokens {
prefixLength = tokens[minIdx].prefixLength
@@ -208,7 +208,5 @@ func Transform(tokens []Token, withNth []Range) *Transformed {
}
transTokens[idx] = Token{&part, prefixLength}
}
- return &Transformed{
- whole: &whole,
- parts: transTokens}
+ return &transTokens
}
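For orientation, a hedged usage sketch of the reshaped API follows: Token.text is now *[]rune, Transform returns *[]Token instead of *Transformed, and the joined string formerly kept in Transformed.whole can be rebuilt with joinTokens when it is actually needed. The helper below is hypothetical and assumes it lives in the same package as tokenizer.go (which already imports regexp); the Range{2, 3} literal merely stands in for a parsed --with-nth spec.

// Sketch only: a hypothetical helper showing the reshaped call sites.
func exampleTransform(line string) *string {
	delim := regexp.MustCompile(":")
	tokens := Tokenize(&line, delim)            // []Token, each text is now *[]rune
	trans := Transform(tokens, []Range{{2, 3}}) // *[]Token replaces *Transformed
	return joinTokens(trans)                    // rebuild the joined string on demand
}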