diff options
Diffstat (limited to 'vendor/github.com/rivo/uniseg/gen_breaktest.go')
-rw-r--r-- | vendor/github.com/rivo/uniseg/gen_breaktest.go | 213 |
1 files changed, 213 insertions, 0 deletions
diff --git a/vendor/github.com/rivo/uniseg/gen_breaktest.go b/vendor/github.com/rivo/uniseg/gen_breaktest.go new file mode 100644 index 000000000..e613c4cd0 --- /dev/null +++ b/vendor/github.com/rivo/uniseg/gen_breaktest.go @@ -0,0 +1,213 @@ +//go:build generate + +// This program generates a Go containing a slice of test cases based on the +// Unicode Character Database auxiliary data files. The command line arguments +// are as follows: +// +// 1. The name of the Unicode data file (just the filename, without extension). +// 2. The name of the locally generated Go file. +// 3. The name of the slice containing the test cases. +// 4. The name of the generator, for logging purposes. +// +//go:generate go run gen_breaktest.go GraphemeBreakTest graphemebreak_test.go graphemeBreakTestCases graphemes +//go:generate go run gen_breaktest.go WordBreakTest wordbreak_test.go wordBreakTestCases words +//go:generate go run gen_breaktest.go SentenceBreakTest sentencebreak_test.go sentenceBreakTestCases sentences +//go:generate go run gen_breaktest.go LineBreakTest linebreak_test.go lineBreakTestCases lines + +package main + +import ( + "bufio" + "bytes" + "errors" + "fmt" + "go/format" + "io/ioutil" + "log" + "net/http" + "os" + "time" +) + +// We want to test against a specific version rather than the latest. When the +// package is upgraded to a new version, change these to generate new tests. +const ( + testCaseURL = `https://www.unicode.org/Public/14.0.0/ucd/auxiliary/%s.txt` +) + +func main() { + if len(os.Args) < 5 { + fmt.Println("Not enough arguments, see code for details") + os.Exit(1) + } + + log.SetPrefix("gen_breaktest (" + os.Args[4] + "): ") + log.SetFlags(0) + + // Read text of testcases and parse into Go source code. + src, err := parse(fmt.Sprintf(testCaseURL, os.Args[1])) + if err != nil { + log.Fatal(err) + } + + // Format the Go code. + formatted, err := format.Source(src) + if err != nil { + log.Fatalln("gofmt:", err) + } + + // Write it out. + log.Print("Writing to ", os.Args[2]) + if err := ioutil.WriteFile(os.Args[2], formatted, 0644); err != nil { + log.Fatal(err) + } +} + +// parse reads a break text file, either from a local file or from a URL. It +// parses the file data into Go source code representing the test cases. +func parse(url string) ([]byte, error) { + log.Printf("Parsing %s", url) + res, err := http.Get(url) + if err != nil { + return nil, err + } + body := res.Body + defer body.Close() + + buf := new(bytes.Buffer) + buf.Grow(120 << 10) + buf.WriteString(`package uniseg + +// Code generated via go generate from gen_breaktest.go. DO NOT EDIT. + +// ` + os.Args[3] + ` are Grapheme testcases taken from +// ` + url + ` +// on ` + time.Now().Format("January 2, 2006") + `. See +// https://www.unicode.org/license.html for the Unicode license agreement. +var ` + os.Args[3] + ` = []testCase { +`) + + sc := bufio.NewScanner(body) + num := 1 + var line []byte + original := make([]byte, 0, 64) + expected := make([]byte, 0, 64) + for sc.Scan() { + num++ + line = sc.Bytes() + if len(line) == 0 || line[0] == '#' { + continue + } + var comment []byte + if i := bytes.IndexByte(line, '#'); i >= 0 { + comment = bytes.TrimSpace(line[i+1:]) + line = bytes.TrimSpace(line[:i]) + } + original, expected, err := parseRuneSequence(line, original[:0], expected[:0]) + if err != nil { + return nil, fmt.Errorf(`line %d: %v: %q`, num, err, line) + } + fmt.Fprintf(buf, "\t{original: \"%s\", expected: %s}, // %s\n", original, expected, comment) + } + if err := sc.Err(); err != nil { + return nil, err + } + + // Check for final "# EOF", useful check if we're streaming via HTTP + if !bytes.Equal(line, []byte("# EOF")) { + return nil, fmt.Errorf(`line %d: exected "# EOF" as final line, got %q`, num, line) + } + buf.WriteString("}\n") + return buf.Bytes(), nil +} + +// Used by parseRuneSequence to match input via bytes.HasPrefix. +var ( + prefixBreak = []byte("÷ ") + prefixDontBreak = []byte("× ") + breakOk = []byte("÷") + breakNo = []byte("×") +) + +// parseRuneSequence parses a rune + breaking opportunity sequence from b +// and appends the Go code for testcase.original to orig +// and appends the Go code for testcase.expected to exp. +// It retuns the new orig and exp slices. +// +// E.g. for the input b="÷ 0020 × 0308 ÷ 1F1E6 ÷" +// it will append +// "\u0020\u0308\U0001F1E6" +// and "[][]rune{{0x0020,0x0308},{0x1F1E6},}" +// to orig and exp respectively. +// +// The formatting of exp is expected to be cleaned up by gofmt or format.Source. +// Note we explicitly require the sequence to start with ÷ and we implicitly +// require it to end with ÷. +func parseRuneSequence(b, orig, exp []byte) ([]byte, []byte, error) { + // Check for and remove first ÷ or ×. + if !bytes.HasPrefix(b, prefixBreak) && !bytes.HasPrefix(b, prefixDontBreak) { + return nil, nil, errors.New("expected ÷ or × as first character") + } + if bytes.HasPrefix(b, prefixBreak) { + b = b[len(prefixBreak):] + } else { + b = b[len(prefixDontBreak):] + } + + boundary := true + exp = append(exp, "[][]rune{"...) + for len(b) > 0 { + if boundary { + exp = append(exp, '{') + } + exp = append(exp, "0x"...) + // Find end of hex digits. + var i int + for i = 0; i < len(b) && b[i] != ' '; i++ { + if d := b[i]; ('0' <= d || d <= '9') || + ('A' <= d || d <= 'F') || + ('a' <= d || d <= 'f') { + continue + } + return nil, nil, errors.New("bad hex digit") + } + switch i { + case 4: + orig = append(orig, "\\u"...) + case 5: + orig = append(orig, "\\U000"...) + default: + return nil, nil, errors.New("unsupport code point hex length") + } + orig = append(orig, b[:i]...) + exp = append(exp, b[:i]...) + b = b[i:] + + // Check for space between hex and ÷ or ×. + if len(b) < 1 || b[0] != ' ' { + return nil, nil, errors.New("bad input") + } + b = b[1:] + + // Check for next boundary. + switch { + case bytes.HasPrefix(b, breakOk): + boundary = true + b = b[len(breakOk):] + case bytes.HasPrefix(b, breakNo): + boundary = false + b = b[len(breakNo):] + default: + return nil, nil, errors.New("missing ÷ or ×") + } + if boundary { + exp = append(exp, '}') + } + exp = append(exp, ',') + if len(b) > 0 && b[0] == ' ' { + b = b[1:] + } + } + exp = append(exp, '}') + return orig, exp, nil +} |