summaryrefslogtreecommitdiffstats
path: root/vendor/github.com/rivo/uniseg/gen_breaktest.go
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/rivo/uniseg/gen_breaktest.go')
-rw-r--r--vendor/github.com/rivo/uniseg/gen_breaktest.go213
1 files changed, 213 insertions, 0 deletions
diff --git a/vendor/github.com/rivo/uniseg/gen_breaktest.go b/vendor/github.com/rivo/uniseg/gen_breaktest.go
new file mode 100644
index 000000000..e613c4cd0
--- /dev/null
+++ b/vendor/github.com/rivo/uniseg/gen_breaktest.go
@@ -0,0 +1,213 @@
+//go:build generate
+
+// This program generates a Go containing a slice of test cases based on the
+// Unicode Character Database auxiliary data files. The command line arguments
+// are as follows:
+//
+// 1. The name of the Unicode data file (just the filename, without extension).
+// 2. The name of the locally generated Go file.
+// 3. The name of the slice containing the test cases.
+// 4. The name of the generator, for logging purposes.
+//
+//go:generate go run gen_breaktest.go GraphemeBreakTest graphemebreak_test.go graphemeBreakTestCases graphemes
+//go:generate go run gen_breaktest.go WordBreakTest wordbreak_test.go wordBreakTestCases words
+//go:generate go run gen_breaktest.go SentenceBreakTest sentencebreak_test.go sentenceBreakTestCases sentences
+//go:generate go run gen_breaktest.go LineBreakTest linebreak_test.go lineBreakTestCases lines
+
+package main
+
+import (
+ "bufio"
+ "bytes"
+ "errors"
+ "fmt"
+ "go/format"
+ "io/ioutil"
+ "log"
+ "net/http"
+ "os"
+ "time"
+)
+
+// We want to test against a specific version rather than the latest. When the
+// package is upgraded to a new version, change these to generate new tests.
+const (
+ testCaseURL = `https://www.unicode.org/Public/14.0.0/ucd/auxiliary/%s.txt`
+)
+
+func main() {
+ if len(os.Args) < 5 {
+ fmt.Println("Not enough arguments, see code for details")
+ os.Exit(1)
+ }
+
+ log.SetPrefix("gen_breaktest (" + os.Args[4] + "): ")
+ log.SetFlags(0)
+
+ // Read text of testcases and parse into Go source code.
+ src, err := parse(fmt.Sprintf(testCaseURL, os.Args[1]))
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ // Format the Go code.
+ formatted, err := format.Source(src)
+ if err != nil {
+ log.Fatalln("gofmt:", err)
+ }
+
+ // Write it out.
+ log.Print("Writing to ", os.Args[2])
+ if err := ioutil.WriteFile(os.Args[2], formatted, 0644); err != nil {
+ log.Fatal(err)
+ }
+}
+
+// parse reads a break text file, either from a local file or from a URL. It
+// parses the file data into Go source code representing the test cases.
+func parse(url string) ([]byte, error) {
+ log.Printf("Parsing %s", url)
+ res, err := http.Get(url)
+ if err != nil {
+ return nil, err
+ }
+ body := res.Body
+ defer body.Close()
+
+ buf := new(bytes.Buffer)
+ buf.Grow(120 << 10)
+ buf.WriteString(`package uniseg
+
+// Code generated via go generate from gen_breaktest.go. DO NOT EDIT.
+
+// ` + os.Args[3] + ` are Grapheme testcases taken from
+// ` + url + `
+// on ` + time.Now().Format("January 2, 2006") + `. See
+// https://www.unicode.org/license.html for the Unicode license agreement.
+var ` + os.Args[3] + ` = []testCase {
+`)
+
+ sc := bufio.NewScanner(body)
+ num := 1
+ var line []byte
+ original := make([]byte, 0, 64)
+ expected := make([]byte, 0, 64)
+ for sc.Scan() {
+ num++
+ line = sc.Bytes()
+ if len(line) == 0 || line[0] == '#' {
+ continue
+ }
+ var comment []byte
+ if i := bytes.IndexByte(line, '#'); i >= 0 {
+ comment = bytes.TrimSpace(line[i+1:])
+ line = bytes.TrimSpace(line[:i])
+ }
+ original, expected, err := parseRuneSequence(line, original[:0], expected[:0])
+ if err != nil {
+ return nil, fmt.Errorf(`line %d: %v: %q`, num, err, line)
+ }
+ fmt.Fprintf(buf, "\t{original: \"%s\", expected: %s}, // %s\n", original, expected, comment)
+ }
+ if err := sc.Err(); err != nil {
+ return nil, err
+ }
+
+ // Check for final "# EOF", useful check if we're streaming via HTTP
+ if !bytes.Equal(line, []byte("# EOF")) {
+ return nil, fmt.Errorf(`line %d: exected "# EOF" as final line, got %q`, num, line)
+ }
+ buf.WriteString("}\n")
+ return buf.Bytes(), nil
+}
+
+// Used by parseRuneSequence to match input via bytes.HasPrefix.
+var (
+ prefixBreak = []byte("÷ ")
+ prefixDontBreak = []byte("× ")
+ breakOk = []byte("÷")
+ breakNo = []byte("×")
+)
+
+// parseRuneSequence parses a rune + breaking opportunity sequence from b
+// and appends the Go code for testcase.original to orig
+// and appends the Go code for testcase.expected to exp.
+// It retuns the new orig and exp slices.
+//
+// E.g. for the input b="÷ 0020 × 0308 ÷ 1F1E6 ÷"
+// it will append
+// "\u0020\u0308\U0001F1E6"
+// and "[][]rune{{0x0020,0x0308},{0x1F1E6},}"
+// to orig and exp respectively.
+//
+// The formatting of exp is expected to be cleaned up by gofmt or format.Source.
+// Note we explicitly require the sequence to start with ÷ and we implicitly
+// require it to end with ÷.
+func parseRuneSequence(b, orig, exp []byte) ([]byte, []byte, error) {
+ // Check for and remove first ÷ or ×.
+ if !bytes.HasPrefix(b, prefixBreak) && !bytes.HasPrefix(b, prefixDontBreak) {
+ return nil, nil, errors.New("expected ÷ or × as first character")
+ }
+ if bytes.HasPrefix(b, prefixBreak) {
+ b = b[len(prefixBreak):]
+ } else {
+ b = b[len(prefixDontBreak):]
+ }
+
+ boundary := true
+ exp = append(exp, "[][]rune{"...)
+ for len(b) > 0 {
+ if boundary {
+ exp = append(exp, '{')
+ }
+ exp = append(exp, "0x"...)
+ // Find end of hex digits.
+ var i int
+ for i = 0; i < len(b) && b[i] != ' '; i++ {
+ if d := b[i]; ('0' <= d || d <= '9') ||
+ ('A' <= d || d <= 'F') ||
+ ('a' <= d || d <= 'f') {
+ continue
+ }
+ return nil, nil, errors.New("bad hex digit")
+ }
+ switch i {
+ case 4:
+ orig = append(orig, "\\u"...)
+ case 5:
+ orig = append(orig, "\\U000"...)
+ default:
+ return nil, nil, errors.New("unsupport code point hex length")
+ }
+ orig = append(orig, b[:i]...)
+ exp = append(exp, b[:i]...)
+ b = b[i:]
+
+ // Check for space between hex and ÷ or ×.
+ if len(b) < 1 || b[0] != ' ' {
+ return nil, nil, errors.New("bad input")
+ }
+ b = b[1:]
+
+ // Check for next boundary.
+ switch {
+ case bytes.HasPrefix(b, breakOk):
+ boundary = true
+ b = b[len(breakOk):]
+ case bytes.HasPrefix(b, breakNo):
+ boundary = false
+ b = b[len(breakNo):]
+ default:
+ return nil, nil, errors.New("missing ÷ or ×")
+ }
+ if boundary {
+ exp = append(exp, '}')
+ }
+ exp = append(exp, ',')
+ if len(b) > 0 && b[0] == ' ' {
+ b = b[1:]
+ }
+ }
+ exp = append(exp, '}')
+ return orig, exp, nil
+}