author     Junegunn Choi <junegunn.c@gmail.com>  2016-08-14 00:39:44 +0900
committer  Junegunn Choi <junegunn.c@gmail.com>  2016-08-14 00:41:30 +0900
commit     1d4057c20907b7d263d6f2b8cb4350a024859dfe (patch)
tree       adb1edd9c4f1806cd65f8c5117645c22618c7301 /src/tokenizer.go
parent     822b86942c4ffb0dbf7fd096584d2970675f3ebc (diff)
[perf] Avoid allocating rune array for ascii string
In the best case (all ASCII), this reduces the memory footprint by 60% and the response time by 15% to 20%. In the worst case (every line contains non-ASCII characters), a 3% to 4% overhead is observed.
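
The change replaces []rune with a small wrapper type. Below is a minimal sketch of the idea, assuming a two-representation struct; the names mirror the util.Chars API the diff calls (Get, Length, RunesToChars), but the internals are an illustration, not fzf's actual implementation:

package util

import "unicode/utf8"

// Chars stores a string either as raw ASCII bytes or as decoded runes.
// Pure-ASCII input keeps the original byte slice and never allocates a
// rune array; only input containing multi-byte characters pays for the
// decode.
type Chars struct {
	bytes []byte // set when the input is all ASCII
	runes []rune // set otherwise
}

// ToChars wraps a byte slice, decoding to runes only when necessary.
func ToChars(bytes []byte) Chars {
	for _, b := range bytes {
		if b >= utf8.RuneSelf { // first non-ASCII byte: fall back to runes
			return RunesToChars([]rune(string(bytes)))
		}
	}
	return Chars{bytes: bytes}
}

// RunesToChars wraps an already-decoded rune slice.
func RunesToChars(runes []rune) Chars {
	return Chars{runes: runes}
}

// Get returns the rune at index i in either representation.
func (chars *Chars) Get(i int) rune {
	if chars.runes != nil {
		return chars.runes[i]
	}
	return rune(chars.bytes[i])
}

// Length returns the number of characters without any conversion.
func (chars *Chars) Length() int {
	if chars.runes != nil {
		return len(chars.runes)
	}
	return len(chars.bytes)
}

The other accessors the diff relies on (TrimLength, ToString, ToRunes) would follow the same two-branch pattern: answer from the byte slice when present, otherwise from the rune slice.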
Diffstat (limited to 'src/tokenizer.go')
-rw-r--r--  src/tokenizer.go  |  42
1 file changed, 22 insertions(+), 20 deletions(-)
diff --git a/src/tokenizer.go b/src/tokenizer.go
index 4b89b38e..05b890a2 100644
--- a/src/tokenizer.go
+++ b/src/tokenizer.go
@@ -18,7 +18,7 @@ type Range struct {
// Token contains the tokenized part of the strings and its prefix length
type Token struct {
- text []rune
+ text util.Chars
prefixLength int
trimLength int
}
@@ -75,15 +75,15 @@ func ParseRange(str *string) (Range, bool) {
return newRange(n, n), true
}
-func withPrefixLengths(tokens [][]rune, begin int) []Token {
+func withPrefixLengths(tokens []util.Chars, begin int) []Token {
ret := make([]Token, len(tokens))
prefixLength := begin
for idx, token := range tokens {
// Need to define a new local variable instead of the reused token to take
// the pointer to it
- ret[idx] = Token{token, prefixLength, util.TrimLen(token)}
- prefixLength += len(token)
+ ret[idx] = Token{token, prefixLength, token.TrimLength()}
+ prefixLength += token.Length()
}
return ret
}
@@ -94,13 +94,15 @@ const (
awkWhite
)
-func awkTokenizer(input []rune) ([][]rune, int) {
+func awkTokenizer(input util.Chars) ([]util.Chars, int) {
// 9, 32
- ret := [][]rune{}
+ ret := []util.Chars{}
str := []rune{}
prefixLength := 0
state := awkNil
- for _, r := range input {
+ numChars := input.Length()
+ for idx := 0; idx < numChars; idx++ {
+ r := input.Get(idx)
white := r == 9 || r == 32
switch state {
case awkNil:
@@ -119,34 +121,34 @@ func awkTokenizer(input []rune) ([][]rune, int) {
if white {
str = append(str, r)
} else {
- ret = append(ret, str)
+ ret = append(ret, util.RunesToChars(str))
state = awkBlack
str = []rune{r}
}
}
}
if len(str) > 0 {
- ret = append(ret, str)
+ ret = append(ret, util.RunesToChars(str))
}
return ret, prefixLength
}
// Tokenize tokenizes the given string with the delimiter
-func Tokenize(runes []rune, delimiter Delimiter) []Token {
+func Tokenize(text util.Chars, delimiter Delimiter) []Token {
if delimiter.str == nil && delimiter.regex == nil {
// AWK-style (\S+\s*)
- tokens, prefixLength := awkTokenizer(runes)
+ tokens, prefixLength := awkTokenizer(text)
return withPrefixLengths(tokens, prefixLength)
}
var tokens []string
if delimiter.str != nil {
- tokens = strings.Split(string(runes), *delimiter.str)
+ tokens = strings.Split(text.ToString(), *delimiter.str)
for i := 0; i < len(tokens)-1; i++ {
tokens[i] = tokens[i] + *delimiter.str
}
} else if delimiter.regex != nil {
- str := string(runes)
+ str := text.ToString()
for len(str) > 0 {
loc := delimiter.regex.FindStringIndex(str)
if loc == nil {
@@ -157,9 +159,9 @@ func Tokenize(runes []rune, delimiter Delimiter) []Token {
str = str[last:]
}
}
- asRunes := make([][]rune, len(tokens))
+ asRunes := make([]util.Chars, len(tokens))
for i, token := range tokens {
- asRunes[i] = []rune(token)
+ asRunes[i] = util.RunesToChars([]rune(token))
}
return withPrefixLengths(asRunes, 0)
}
@@ -167,7 +169,7 @@ func Tokenize(runes []rune, delimiter Delimiter) []Token {
func joinTokens(tokens []Token) []rune {
ret := []rune{}
for _, token := range tokens {
- ret = append(ret, token.text...)
+ ret = append(ret, token.text.ToRunes()...)
}
return ret
}
@@ -175,7 +177,7 @@ func joinTokens(tokens []Token) []rune {
func joinTokensAsRunes(tokens []Token) []rune {
ret := []rune{}
for _, token := range tokens {
- ret = append(ret, token.text...)
+ ret = append(ret, token.text.ToRunes()...)
}
return ret
}
@@ -197,7 +199,7 @@ func Transform(tokens []Token, withNth []Range) []Token {
}
if idx >= 1 && idx <= numTokens {
minIdx = idx - 1
- part = append(part, tokens[idx-1].text...)
+ part = append(part, tokens[idx-1].text.ToRunes()...)
}
}
} else {
@@ -224,7 +226,7 @@ func Transform(tokens []Token, withNth []Range) []Token {
minIdx = util.Max(0, begin-1)
for idx := begin; idx <= end; idx++ {
if idx >= 1 && idx <= numTokens {
- part = append(part, tokens[idx-1].text...)
+ part = append(part, tokens[idx-1].text.ToRunes()...)
}
}
}
@@ -234,7 +236,7 @@ func Transform(tokens []Token, withNth []Range) []Token {
} else {
prefixLength = 0
}
- transTokens[idx] = Token{part, prefixLength, util.TrimLen(part)}
+ transTokens[idx] = Token{util.RunesToChars(part), prefixLength, util.TrimLen(part)}
}
return transTokens
}
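
For reference, a hypothetical call site under the new signature (ToChars is assumed from the sketch above; the actual construction elsewhere in fzf may differ). A pure-ASCII line passes through Tokenize without ever allocating a []rune:

line := []byte("src/tokenizer.go")              // all-ASCII input: no rune array
tokens := Tokenize(util.ToChars(line), Delimiter{}) // nil str/regex selects the AWK-style path
for _, token := range tokens {
	_ = token.text.Length() // O(1) for both representations
}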