mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-11-21 14:44:00 +00:00
241 lines
6.9 KiB
Go
241 lines
6.9 KiB
Go
|
//go:build generate
|
||
|
|
||
|
// This program generates a Go source file of code point properties from Unicode Character
|
||
|
// Database auxiliary data files. The command line arguments are as follows:
|
||
|
//
|
||
|
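// 1. The name of the Unicode data file, relative to the "ucd" directory of the
//    Unicode release and without the ".txt" extension (for example
//    "auxiliary/WordBreakProperty" or "LineBreak").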
|
||
|
// 2. The name of the locally generated Go file.
|
||
|
// 3. The name of the slice mapping code points to properties.
|
||
|
// 4. The name of the generator, for logging purposes.
|
||
|
// 5. (Optional) Flags, comma-separated. The following flags are available:
|
||
|
// - "emojis": include emoji properties (Extended Pictographic only).
|
||
|
// - "gencat": include general category properties.
|
||
|
//
|
||
|
//go:generate go run gen_properties.go auxiliary/GraphemeBreakProperty graphemeproperties.go graphemeCodePoints graphemes emojis
|
||
|
//go:generate go run gen_properties.go auxiliary/WordBreakProperty wordproperties.go workBreakCodePoints words emojis
|
||
|
//go:generate go run gen_properties.go auxiliary/SentenceBreakProperty sentenceproperties.go sentenceBreakCodePoints sentences
|
||
|
//go:generate go run gen_properties.go LineBreak lineproperties.go lineBreakCodePoints lines gencat
|
||
|
//go:generate go run gen_properties.go EastAsianWidth eastasianwidth.go eastAsianWidth eastasianwidth
|
||
|
package main
|
||
|
|
||
|
import (
|
||
|
"bufio"
|
||
|
"bytes"
|
||
|
"errors"
|
||
|
"fmt"
|
||
|
"go/format"
|
||
|
"io/ioutil"
|
||
|
"log"
|
||
|
"net/http"
|
||
|
"os"
|
||
|
"regexp"
|
||
|
"sort"
|
||
|
"strconv"
|
||
|
"strings"
|
||
|
"time"
|
||
|
)
|
||
|
|
||
|
// The data files are fetched for one specific Unicode version rather than the
// latest one. When the package is upgraded to a new Unicode version, change
// unicodeVersion and re-run go generate.
const (
	// unicodeVersion is the Unicode release whose data files are downloaded.
	unicodeVersion = "14.0.0"

	// gbpURL is the URL template of a property data file. Its %s verb is
	// replaced with the data file name given on the command line.
	gbpURL = "https://www.unicode.org/Public/" + unicodeVersion + "/ucd/%s.txt"

	// emojiURL is the URL of the emoji data file.
	emojiURL = "https://unicode.org/Public/" + unicodeVersion + "/ucd/emoji/emoji-data.txt"
)
|
||
|
|
||
|
// The regular expression for a line containing a code point range property.
|
||
|
// Submatches: 1 = first code point, 3 = last code point (empty if the line
// names a single code point instead of a range), 4 = property name, 5 = the
// comment text following "#" and one whitespace character.
var propertyPattern = regexp.MustCompile(`^([0-9A-F]{4,6})(\.\.([0-9A-F]{4,6}))?\s*;\s*([A-Za-z0-9_]+)\s*#\s(.+)$`)
|
||
|
|
||
|
func main() {
|
||
|
if len(os.Args) < 5 {
|
||
|
fmt.Println("Not enough arguments, see code for details")
|
||
|
os.Exit(1)
|
||
|
}
|
||
|
|
||
|
log.SetPrefix("gen_properties (" + os.Args[4] + "): ")
|
||
|
log.SetFlags(0)
|
||
|
|
||
|
// Parse flags.
|
||
|
flags := make(map[string]struct{})
|
||
|
if len(os.Args) >= 6 {
|
||
|
for _, flag := range strings.Split(os.Args[5], ",") {
|
||
|
flags[flag] = struct{}{}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Parse the text file and generate Go source code from it.
|
||
|
var emojis string
|
||
|
if _, ok := flags["emojis"]; ok {
|
||
|
emojis = emojiURL
|
||
|
}
|
||
|
_, includeGeneralCategory := flags["gencat"]
|
||
|
src, err := parse(fmt.Sprintf(gbpURL, os.Args[1]), emojis, includeGeneralCategory)
|
||
|
if err != nil {
|
||
|
log.Fatal(err)
|
||
|
}
|
||
|
|
||
|
// Format the Go code.
|
||
|
formatted, err := format.Source([]byte(src))
|
||
|
if err != nil {
|
||
|
log.Fatal("gofmt:", err)
|
||
|
}
|
||
|
|
||
|
// Save it to the (local) target file.
|
||
|
log.Print("Writing to ", os.Args[2])
|
||
|
if err := ioutil.WriteFile(os.Args[2], formatted, 0644); err != nil {
|
||
|
log.Fatal(err)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// parse parses the Unicode Properties text files located at the given URLs and
|
||
|
// returns their equivalent Go source code to be used in the uniseg package. If
|
||
|
// "emojiURL" is an empty string, no emoji code points will be included. If
|
||
|
// "includeGeneralCategory" is true, the Unicode General Category property will
|
||
|
// be extracted from the comments and included in the output.
|
||
|
func parse(gbpURL, emojiURL string, includeGeneralCategory bool) (string, error) {
|
||
|
// Temporary buffer to hold properties.
|
||
|
var properties [][4]string
|
||
|
|
||
|
// Open the first URL.
|
||
|
log.Printf("Parsing %s", gbpURL)
|
||
|
res, err := http.Get(gbpURL)
|
||
|
if err != nil {
|
||
|
return "", err
|
||
|
}
|
||
|
in1 := res.Body
|
||
|
defer in1.Close()
|
||
|
|
||
|
// Parse it.
|
||
|
scanner := bufio.NewScanner(in1)
|
||
|
num := 0
|
||
|
for scanner.Scan() {
|
||
|
num++
|
||
|
line := strings.TrimSpace(scanner.Text())
|
||
|
|
||
|
// Skip comments and empty lines.
|
||
|
if strings.HasPrefix(line, "#") || line == "" {
|
||
|
continue
|
||
|
}
|
||
|
|
||
|
// Everything else must be a code point range, a property and a comment.
|
||
|
from, to, property, comment, err := parseProperty(line)
|
||
|
if err != nil {
|
||
|
return "", fmt.Errorf("%s line %d: %v", os.Args[4], num, err)
|
||
|
}
|
||
|
properties = append(properties, [4]string{from, to, property, comment})
|
||
|
}
|
||
|
if err := scanner.Err(); err != nil {
|
||
|
return "", err
|
||
|
}
|
||
|
|
||
|
// Open the second URL.
|
||
|
if emojiURL != "" {
|
||
|
log.Printf("Parsing %s", emojiURL)
|
||
|
res, err = http.Get(emojiURL)
|
||
|
if err != nil {
|
||
|
return "", err
|
||
|
}
|
||
|
in2 := res.Body
|
||
|
defer in2.Close()
|
||
|
|
||
|
// Parse it.
|
||
|
scanner = bufio.NewScanner(in2)
|
||
|
num = 0
|
||
|
for scanner.Scan() {
|
||
|
num++
|
||
|
line := scanner.Text()
|
||
|
|
||
|
// Skip comments, empty lines, and everything not containing
|
||
|
// "Extended_Pictographic".
|
||
|
if strings.HasPrefix(line, "#") || line == "" || !strings.Contains(line, "Extended_Pictographic") {
|
||
|
continue
|
||
|
}
|
||
|
|
||
|
// Everything else must be a code point range, a property and a comment.
|
||
|
from, to, property, comment, err := parseProperty(line)
|
||
|
if err != nil {
|
||
|
return "", fmt.Errorf("emojis line %d: %v", num, err)
|
||
|
}
|
||
|
properties = append(properties, [4]string{from, to, property, comment})
|
||
|
}
|
||
|
if err := scanner.Err(); err != nil {
|
||
|
return "", err
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Sort properties.
|
||
|
sort.Slice(properties, func(i, j int) bool {
|
||
|
left, _ := strconv.ParseUint(properties[i][0], 16, 64)
|
||
|
right, _ := strconv.ParseUint(properties[j][0], 16, 64)
|
||
|
return left < right
|
||
|
})
|
||
|
|
||
|
// Header.
|
||
|
var (
|
||
|
buf bytes.Buffer
|
||
|
emojiComment string
|
||
|
)
|
||
|
columns := 3
|
||
|
if includeGeneralCategory {
|
||
|
columns = 4
|
||
|
}
|
||
|
if emojiURL != "" {
|
||
|
emojiComment = `
|
||
|
// and
|
||
|
// ` + emojiURL + `
|
||
|
// ("Extended_Pictographic" only)`
|
||
|
}
|
||
|
buf.WriteString(`package uniseg
|
||
|
|
||
|
// Code generated via go generate from gen_properties.go. DO NOT EDIT.
|
||
|
|
||
|
// ` + os.Args[3] + ` are taken from
|
||
|
// ` + gbpURL + emojiComment + `
|
||
|
// on ` + time.Now().Format("January 2, 2006") + `. See https://www.unicode.org/license.html for the Unicode
|
||
|
// license agreement.
|
||
|
var ` + os.Args[3] + ` = [][` + strconv.Itoa(columns) + `]int{
|
||
|
`)
|
||
|
|
||
|
// Properties.
|
||
|
for _, prop := range properties {
|
||
|
if includeGeneralCategory {
|
||
|
generalCategory := "gc" + prop[3][:2]
|
||
|
if generalCategory == "gcL&" {
|
||
|
generalCategory = "gcLC"
|
||
|
}
|
||
|
prop[3] = prop[3][3:]
|
||
|
fmt.Fprintf(&buf, "{0x%s,0x%s,%s,%s}, // %s\n", prop[0], prop[1], translateProperty("pr", prop[2]), generalCategory, prop[3])
|
||
|
} else {
|
||
|
fmt.Fprintf(&buf, "{0x%s,0x%s,%s}, // %s\n", prop[0], prop[1], translateProperty("pr", prop[2]), prop[3])
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Tail.
|
||
|
buf.WriteString("}")
|
||
|
|
||
|
return buf.String(), nil
|
||
|
}
|
||
|
|
||
|
// parseProperty parses a line of the Unicode properties text file containing a
|
||
|
// property for a code point range and returns it along with its comment.
|
||
|
func parseProperty(line string) (from, to, property, comment string, err error) {
|
||
|
fields := propertyPattern.FindStringSubmatch(line)
|
||
|
if fields == nil {
|
||
|
err = errors.New("no property found")
|
||
|
return
|
||
|
}
|
||
|
from = fields[1]
|
||
|
to = fields[3]
|
||
|
if to == "" {
|
||
|
to = from
|
||
|
}
|
||
|
property = fields[4]
|
||
|
comment = fields[5]
|
||
|
return
|
||
|
}
|
||
|
|
||
|
// translateProperty converts a property name from the Unicode data file into
// the name of the corresponding Go identifier: all underscores are removed and
// "prefix" is put in front of the result.
func translateProperty(prefix, property string) string {
	parts := strings.Split(property, "_")
	return prefix + strings.Join(parts, "")
}
|