VictoriaMetrics/vendor/github.com/rivo/uniseg/gen_breaktest.go
2024-02-01 17:10:39 +02:00

215 lines
5.8 KiB
Go
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//go:build generate
// This program generates a Go containing a slice of test cases based on the
// Unicode Character Database auxiliary data files. The command line arguments
// are as follows:
//
// 1. The name of the Unicode data file (just the filename, without extension).
// 2. The name of the locally generated Go file.
// 3. The name of the slice containing the test cases.
// 4. The name of the generator, for logging purposes.
//
//go:generate go run gen_breaktest.go GraphemeBreakTest graphemebreak_test.go graphemeBreakTestCases graphemes
//go:generate go run gen_breaktest.go WordBreakTest wordbreak_test.go wordBreakTestCases words
//go:generate go run gen_breaktest.go SentenceBreakTest sentencebreak_test.go sentenceBreakTestCases sentences
//go:generate go run gen_breaktest.go LineBreakTest linebreak_test.go lineBreakTestCases lines
package main
import (
"bufio"
"bytes"
"errors"
"fmt"
"go/format"
"io/ioutil"
"log"
"net/http"
"os"
"time"
)
// We want to test against a specific version rather than the latest. When the
// package is upgraded to a new version, change these to generate new tests.
const (
testCaseURL = `https://www.unicode.org/Public/15.0.0/ucd/auxiliary/%s.txt`
)
func main() {
if len(os.Args) < 5 {
fmt.Println("Not enough arguments, see code for details")
os.Exit(1)
}
log.SetPrefix("gen_breaktest (" + os.Args[4] + "): ")
log.SetFlags(0)
// Read text of testcases and parse into Go source code.
src, err := parse(fmt.Sprintf(testCaseURL, os.Args[1]))
if err != nil {
log.Fatal(err)
}
// Format the Go code.
formatted, err := format.Source(src)
if err != nil {
log.Fatalln("gofmt:", err)
}
// Write it out.
log.Print("Writing to ", os.Args[2])
if err := ioutil.WriteFile(os.Args[2], formatted, 0644); err != nil {
log.Fatal(err)
}
}
// parse reads a break text file, either from a local file or from a URL. It
// parses the file data into Go source code representing the test cases.
func parse(url string) ([]byte, error) {
log.Printf("Parsing %s", url)
res, err := http.Get(url)
if err != nil {
return nil, err
}
body := res.Body
defer body.Close()
buf := new(bytes.Buffer)
buf.Grow(120 << 10)
buf.WriteString(`// Code generated via go generate from gen_breaktest.go. DO NOT EDIT.
package uniseg
// ` + os.Args[3] + ` are Grapheme testcases taken from
// ` + url + `
// on ` + time.Now().Format("January 2, 2006") + `. See
// https://www.unicode.org/license.html for the Unicode license agreement.
var ` + os.Args[3] + ` = []testCase {
`)
sc := bufio.NewScanner(body)
num := 1
var line []byte
original := make([]byte, 0, 64)
expected := make([]byte, 0, 64)
for sc.Scan() {
num++
line = sc.Bytes()
if len(line) == 0 || line[0] == '#' {
continue
}
var comment []byte
if i := bytes.IndexByte(line, '#'); i >= 0 {
comment = bytes.TrimSpace(line[i+1:])
line = bytes.TrimSpace(line[:i])
}
original, expected, err := parseRuneSequence(line, original[:0], expected[:0])
if err != nil {
return nil, fmt.Errorf(`line %d: %v: %q`, num, err, line)
}
fmt.Fprintf(buf, "\t{original: \"%s\", expected: %s}, // %s\n", original, expected, comment)
}
if err := sc.Err(); err != nil {
return nil, err
}
// Check for final "# EOF", useful check if we're streaming via HTTP
if !bytes.Equal(line, []byte("# EOF")) {
return nil, fmt.Errorf(`line %d: exected "# EOF" as final line, got %q`, num, line)
}
buf.WriteString("}\n")
return buf.Bytes(), nil
}
// Used by parseRuneSequence to match input via bytes.HasPrefix.
var (
prefixBreak = []byte("÷ ")
prefixDontBreak = []byte("× ")
breakOk = []byte("÷")
breakNo = []byte("×")
)
// parseRuneSequence parses a rune + breaking opportunity sequence from b
// and appends the Go code for testcase.original to orig
// and appends the Go code for testcase.expected to exp.
// It retuns the new orig and exp slices.
//
// E.g. for the input b="÷ 0020 × 0308 ÷ 1F1E6 ÷"
// it will append
//
// "\u0020\u0308\U0001F1E6"
//
// and "[][]rune{{0x0020,0x0308},{0x1F1E6},}"
// to orig and exp respectively.
//
// The formatting of exp is expected to be cleaned up by gofmt or format.Source.
// Note we explicitly require the sequence to start with ÷ and we implicitly
// require it to end with ÷.
func parseRuneSequence(b, orig, exp []byte) ([]byte, []byte, error) {
// Check for and remove first ÷ or ×.
if !bytes.HasPrefix(b, prefixBreak) && !bytes.HasPrefix(b, prefixDontBreak) {
return nil, nil, errors.New("expected ÷ or × as first character")
}
if bytes.HasPrefix(b, prefixBreak) {
b = b[len(prefixBreak):]
} else {
b = b[len(prefixDontBreak):]
}
boundary := true
exp = append(exp, "[][]rune{"...)
for len(b) > 0 {
if boundary {
exp = append(exp, '{')
}
exp = append(exp, "0x"...)
// Find end of hex digits.
var i int
for i = 0; i < len(b) && b[i] != ' '; i++ {
if d := b[i]; ('0' <= d || d <= '9') ||
('A' <= d || d <= 'F') ||
('a' <= d || d <= 'f') {
continue
}
return nil, nil, errors.New("bad hex digit")
}
switch i {
case 4:
orig = append(orig, "\\u"...)
case 5:
orig = append(orig, "\\U000"...)
default:
return nil, nil, errors.New("unsupport code point hex length")
}
orig = append(orig, b[:i]...)
exp = append(exp, b[:i]...)
b = b[i:]
// Check for space between hex and ÷ or ×.
if len(b) < 1 || b[0] != ' ' {
return nil, nil, errors.New("bad input")
}
b = b[1:]
// Check for next boundary.
switch {
case bytes.HasPrefix(b, breakOk):
boundary = true
b = b[len(breakOk):]
case bytes.HasPrefix(b, breakNo):
boundary = false
b = b[len(breakNo):]
default:
return nil, nil, errors.New("missing ÷ or ×")
}
if boundary {
exp = append(exp, '}')
}
exp = append(exp, ',')
if len(b) > 0 && b[0] == ' ' {
b = b[1:]
}
}
exp = append(exp, '}')
return orig, exp, nil
}