summaryrefslogtreecommitdiffstats
path: root/vendor/github.com/gdamore/encoding/charmap.go
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/gdamore/encoding/charmap.go')
-rw-r--r--vendor/github.com/gdamore/encoding/charmap.go192
1 files changed, 192 insertions, 0 deletions
diff --git a/vendor/github.com/gdamore/encoding/charmap.go b/vendor/github.com/gdamore/encoding/charmap.go
new file mode 100644
index 0000000..e64eaed
--- /dev/null
+++ b/vendor/github.com/gdamore/encoding/charmap.go
@@ -0,0 +1,192 @@
+// Copyright 2015 Garrett D'Amore
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use file except in compliance with the License.
+// You may obtain a copy of the license at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package encoding
+
+import (
+ "sync"
+ "unicode/utf8"
+
+ "golang.org/x/text/transform"
+ "golang.org/x/text/encoding"
+)
+
+const (
+ // RuneError is an alias for the UTF-8 replacement rune, '\uFFFD'.
+ RuneError = '\uFFFD'
+
+ // RuneSelf is the rune below which UTF-8 and the Unicode values are
+ // identical. Its also the limit for ASCII.
+ RuneSelf = 0x80
+
+ // ASCIISub is the ASCII substitution character.
+ ASCIISub = '\x1a'
+)
+
+// Charmap is a structure for setting up encodings for 8-bit character sets,
+// for transforming between UTF8 and that other character set. It has some
+// ideas borrowed from golang.org/x/text/encoding/charmap, but it uses a
+// different implementation. This implementation uses maps, and supports
+// user-defined maps.
+//
+// We do assume that a character map has a reasonable substitution character,
+// and that valid encodings are stable (exactly a 1:1 map) and stateless
+// (that is there is no shift character or anything like that.) Hence this
+// approach will not work for many East Asian character sets.
+//
+// Measurement shows little or no measurable difference in the performance of
+// the two approaches. The difference was down to a couple of nsec/op, and
+// no consistent pattern as to which ran faster. With the conversion to
+// UTF-8 the code takes about 25 nsec/op. The conversion in the reverse
+// direction takes about 100 nsec/op. (The larger cost for conversion
+// from UTF-8 is most likely due to the need to convert the UTF-8 byte stream
+// to a rune before conversion.
+//
+type Charmap struct {
+ transform.NopResetter
+ bytes map[rune]byte
+ runes [256][]byte
+ once sync.Once
+
+ // The map between bytes and runes. To indicate that a specific
+ // byte value is invalid for a charcter set, use the rune
+ // utf8.RuneError. Values that are absent from this map will
+ // be assumed to have the identity mapping -- that is the default
+ // is to assume ISO8859-1, where all 8-bit characters have the same
+ // numeric value as their Unicode runes. (Not to be confused with
+ // the UTF-8 values, which *will* be different for non-ASCII runes.)
+ //
+ // If no values less than RuneSelf are changed (or have non-identity
+ // mappings), then the character set is assumed to be an ASCII
+ // superset, and certain assumptions and optimizations become
+ // available for ASCII bytes.
+ Map map[byte]rune
+
+ // The ReplacementChar is the byte value to use for substitution.
+ // It should normally be ASCIISub for ASCII encodings. This may be
+ // unset (left to zero) for mappings that are strictly ASCII supersets.
+ // In that case ASCIISub will be assumed instead.
+ ReplacementChar byte
+}
+
+type cmapDecoder struct {
+ transform.NopResetter
+ runes [256][]byte
+}
+
+type cmapEncoder struct {
+ transform.NopResetter
+ bytes map[rune]byte
+ replace byte
+}
+
+// Init initializes internal values of a character map. This should
+// be done early, to minimize the cost of allocation of transforms
+// later. It is not strictly necessary however, as the allocation
+// functions will arrange to call it if it has not already been done.
+func (c *Charmap) Init() {
+ c.once.Do(c.initialize)
+}
+
+func (c *Charmap) initialize() {
+ c.bytes = make(map[rune]byte)
+ ascii := true
+
+ for i := 0; i < 256; i++ {
+ r, ok := c.Map[byte(i)]
+ if !ok {
+ r = rune(i)
+ }
+ if r < 128 && r != rune(i) {
+ ascii = false
+ }
+ if r != RuneError {
+ c.bytes[r] = byte(i)
+ }
+ utf := make([]byte, utf8.RuneLen(r))
+ utf8.EncodeRune(utf, r)
+ c.runes[i] = utf
+ }
+ if ascii && c.ReplacementChar == '\x00' {
+ c.ReplacementChar = ASCIISub
+ }
+}
+
+// NewDecoder returns a Decoder the converts from the 8-bit
+// character set to UTF-8. Unknown mappings, if any, are mapped
+// to '\uFFFD'.
+func (c *Charmap) NewDecoder() *encoding.Decoder {
+ c.Init()
+ return &encoding.Decoder{Transformer: &cmapDecoder{runes: c.runes}}
+}
+
+// NewEncoder returns a Transformer that converts from UTF8 to the
+// 8-bit character set. Unknown mappings are mapped to 0x1A.
+func (c *Charmap) NewEncoder() *encoding.Encoder {
+ c.Init()
+ return &encoding.Encoder{Transformer:
+ &cmapEncoder{bytes: c.bytes, replace: c.ReplacementChar}}
+}
+
+func (d *cmapDecoder) Transform(dst, src []byte, atEOF bool) (int, int, error) {
+ var e error
+ var ndst, nsrc int
+
+ for _, c := range src {
+ b := d.runes[c]
+ l := len(b)
+
+ if ndst+l > len(dst) {
+ e = transform.ErrShortDst
+ break
+ }
+ for i := 0; i < l; i++ {
+ dst[ndst] = b[i]
+ ndst++
+ }
+ nsrc++
+ }
+ return ndst, nsrc, e
+}
+
+func (d *cmapEncoder) Transform(dst, src []byte, atEOF bool) (int, int, error) {
+ var e error
+ var ndst, nsrc int
+ for nsrc < len(src) {
+ if ndst >= len(dst) {
+ e = transform.ErrShortDst
+ break
+ }
+
+ r, sz := utf8.DecodeRune(src[nsrc:])
+ if r == utf8.RuneError && sz == 1 {
+ // If its inconclusive due to insufficient data in
+ // in the source, report it
+ if !atEOF && !utf8.FullRune(src[nsrc:]) {
+ e = transform.ErrShortSrc
+ break
+ }
+ }
+
+ if c, ok := d.bytes[r]; ok {
+ dst[ndst] = c
+ } else {
+ dst[ndst] = d.replace
+ }
+ nsrc += sz
+ ndst++
+ }
+
+ return ndst, nsrc, e
+}