diff options
author | Junegunn Choi <junegunn.c@gmail.com> | 2016-08-14 00:39:44 +0900 |
---|---|---|
committer | Junegunn Choi <junegunn.c@gmail.com> | 2016-08-14 00:41:30 +0900 |
commit | 1d4057c20907b7d263d6f2b8cb4350a024859dfe (patch) | |
tree | adb1edd9c4f1806cd65f8c5117645c22618c7301 /src/tokenizer.go | |
parent | 822b86942c4ffb0dbf7fd096584d2970675f3ebc (diff) |
[perf] Avoid allocating rune array for ascii string
In the best case (all ascii), this reduces the memory footprint by 60%
and the response time by 15% to 20%. In the worst case (every line has
non-ascii characters), 3 to 4% overhead is observed.
Diffstat (limited to 'src/tokenizer.go')
-rw-r--r-- | src/tokenizer.go | 42 |
1 files changed, 22 insertions, 20 deletions
diff --git a/src/tokenizer.go b/src/tokenizer.go index 4b89b38e..05b890a2 100644 --- a/src/tokenizer.go +++ b/src/tokenizer.go @@ -18,7 +18,7 @@ type Range struct { // Token contains the tokenized part of the strings and its prefix length type Token struct { - text []rune + text util.Chars prefixLength int trimLength int } @@ -75,15 +75,15 @@ func ParseRange(str *string) (Range, bool) { return newRange(n, n), true } -func withPrefixLengths(tokens [][]rune, begin int) []Token { +func withPrefixLengths(tokens []util.Chars, begin int) []Token { ret := make([]Token, len(tokens)) prefixLength := begin for idx, token := range tokens { // Need to define a new local variable instead of the reused token to take // the pointer to it - ret[idx] = Token{token, prefixLength, util.TrimLen(token)} - prefixLength += len(token) + ret[idx] = Token{token, prefixLength, token.TrimLength()} + prefixLength += token.Length() } return ret } @@ -94,13 +94,15 @@ const ( awkWhite ) -func awkTokenizer(input []rune) ([][]rune, int) { +func awkTokenizer(input util.Chars) ([]util.Chars, int) { // 9, 32 - ret := [][]rune{} + ret := []util.Chars{} str := []rune{} prefixLength := 0 state := awkNil - for _, r := range input { + numChars := input.Length() + for idx := 0; idx < numChars; idx++ { + r := input.Get(idx) white := r == 9 || r == 32 switch state { case awkNil: @@ -119,34 +121,34 @@ func awkTokenizer(input []rune) ([][]rune, int) { if white { str = append(str, r) } else { - ret = append(ret, str) + ret = append(ret, util.RunesToChars(str)) state = awkBlack str = []rune{r} } } } if len(str) > 0 { - ret = append(ret, str) + ret = append(ret, util.RunesToChars(str)) } return ret, prefixLength } // Tokenize tokenizes the given string with the delimiter -func Tokenize(runes []rune, delimiter Delimiter) []Token { +func Tokenize(text util.Chars, delimiter Delimiter) []Token { if delimiter.str == nil && delimiter.regex == nil { // AWK-style (\S+\s*) - tokens, prefixLength := awkTokenizer(runes) + tokens, prefixLength := awkTokenizer(text) return withPrefixLengths(tokens, prefixLength) } var tokens []string if delimiter.str != nil { - tokens = strings.Split(string(runes), *delimiter.str) + tokens = strings.Split(text.ToString(), *delimiter.str) for i := 0; i < len(tokens)-1; i++ { tokens[i] = tokens[i] + *delimiter.str } } else if delimiter.regex != nil { - str := string(runes) + str := text.ToString() for len(str) > 0 { loc := delimiter.regex.FindStringIndex(str) if loc == nil { @@ -157,9 +159,9 @@ func Tokenize(runes []rune, delimiter Delimiter) []Token { str = str[last:] } } - asRunes := make([][]rune, len(tokens)) + asRunes := make([]util.Chars, len(tokens)) for i, token := range tokens { - asRunes[i] = []rune(token) + asRunes[i] = util.RunesToChars([]rune(token)) } return withPrefixLengths(asRunes, 0) } @@ -167,7 +169,7 @@ func Tokenize(runes []rune, delimiter Delimiter) []Token { func joinTokens(tokens []Token) []rune { ret := []rune{} for _, token := range tokens { - ret = append(ret, token.text...) + ret = append(ret, token.text.ToRunes()...) } return ret } @@ -175,7 +177,7 @@ func joinTokens(tokens []Token) []rune { func joinTokensAsRunes(tokens []Token) []rune { ret := []rune{} for _, token := range tokens { - ret = append(ret, token.text...) + ret = append(ret, token.text.ToRunes()...) } return ret } @@ -197,7 +199,7 @@ func Transform(tokens []Token, withNth []Range) []Token { } if idx >= 1 && idx <= numTokens { minIdx = idx - 1 - part = append(part, tokens[idx-1].text...) + part = append(part, tokens[idx-1].text.ToRunes()...) } } } else { @@ -224,7 +226,7 @@ func Transform(tokens []Token, withNth []Range) []Token { minIdx = util.Max(0, begin-1) for idx := begin; idx <= end; idx++ { if idx >= 1 && idx <= numTokens { - part = append(part, tokens[idx-1].text...) + part = append(part, tokens[idx-1].text.ToRunes()...) } } } @@ -234,7 +236,7 @@ func Transform(tokens []Token, withNth []Range) []Token { } else { prefixLength = 0 } - transTokens[idx] = Token{part, prefixLength, util.TrimLen(part)} + transTokens[idx] = Token{util.RunesToChars(part), prefixLength, util.TrimLen(part)} } return transTokens } |