2022-08-02 06:19:38 +00:00
|
|
|
//go:build generate
|
|
|
|
|
|
|
|
// This program generates a Go property file from Unicode Character
|
|
|
|
// Database auxiliary data files. The command line arguments are as follows:
|
|
|
|
//
|
2022-09-13 13:44:44 +00:00
|
|
|
// 1. The name of the Unicode data file (just the filename, without extension).
|
|
|
|
// Can be "-" (to skip) if the emoji flag is included.
|
|
|
|
// 2. The name of the locally generated Go file.
|
|
|
|
// 3. The name of the slice mapping code points to properties.
|
|
|
|
// 4. The name of the generator, for logging purposes.
|
|
|
|
// 5. (Optional) Flags, comma-separated. The following flags are available:
|
|
|
|
// - "emojis=<property>": include the specified emoji properties (e.g.
|
|
|
|
// "Extended_Pictographic").
|
|
|
|
// - "gencat": include general category properties.
|
2022-08-02 06:19:38 +00:00
|
|
|
//
|
2022-09-13 13:44:44 +00:00
|
|
|
//go:generate go run gen_properties.go auxiliary/GraphemeBreakProperty graphemeproperties.go graphemeCodePoints graphemes emojis=Extended_Pictographic
|
|
|
|
//go:generate go run gen_properties.go auxiliary/WordBreakProperty wordproperties.go workBreakCodePoints words emojis=Extended_Pictographic
|
2022-08-02 06:19:38 +00:00
|
|
|
//go:generate go run gen_properties.go auxiliary/SentenceBreakProperty sentenceproperties.go sentenceBreakCodePoints sentences
|
|
|
|
//go:generate go run gen_properties.go LineBreak lineproperties.go lineBreakCodePoints lines gencat
|
|
|
|
//go:generate go run gen_properties.go EastAsianWidth eastasianwidth.go eastAsianWidth eastasianwidth
|
2022-09-13 13:44:44 +00:00
|
|
|
//go:generate go run gen_properties.go - emojipresentation.go emojiPresentation emojipresentation emojis=Emoji_Presentation
|
2022-08-02 06:19:38 +00:00
|
|
|
package main
|
|
|
|
|
|
|
|
import (
|
|
|
|
"bufio"
|
|
|
|
"bytes"
|
|
|
|
"errors"
|
|
|
|
"fmt"
|
|
|
|
"go/format"
|
|
|
|
"io/ioutil"
|
|
|
|
"log"
|
|
|
|
"net/http"
|
|
|
|
"os"
|
|
|
|
"regexp"
|
|
|
|
"sort"
|
|
|
|
"strconv"
|
|
|
|
"strings"
|
|
|
|
"time"
|
|
|
|
)
|
|
|
|
|
|
|
|
// We want to generate from a specific version rather than the latest. When the
|
|
|
|
// package is upgraded to a new version, change these to generate new properties.
|
|
|
|
const (
|
2024-01-30 16:47:30 +00:00
|
|
|
// propertyURL is a format string for the URL of a Unicode Character Database
// data file. main fills in "%s" with the first command line argument (the file
// name without its ".txt" extension, optionally with a directory prefix).
propertyURL = `https://www.unicode.org/Public/15.0.0/ucd/%s.txt`
|
|
|
|
// emojiURL is the URL of the emoji data file which parse downloads when an
// emoji property is requested via the "emojis" flag.
// NOTE(review): the host lacks the "www." prefix used by propertyURL;
// presumably both resolve to the same server — confirm.
emojiURL = `https://unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt`
|
2022-08-02 06:19:38 +00:00
|
|
|
)
|
|
|
|
|
|
|
|
// The regular expression for a line containing a code point range property.
|
|
|
|
// Capture groups: 1 is the first code point (4 to 6 upper-case hex digits), 2
// is the optional "..XXXX" range suffix, 3 is the last code point inside that
// suffix (empty for single code points), 4 is the property name, and 5 is the
// comment text following "#" and one whitespace character.
var propertyPattern = regexp.MustCompile(`^([0-9A-F]{4,6})(\.\.([0-9A-F]{4,6}))?\s*;\s*([A-Za-z0-9_]+)\s*#\s(.+)$`)
|
|
|
|
|
|
|
|
func main() {
|
|
|
|
if len(os.Args) < 5 {
|
|
|
|
fmt.Println("Not enough arguments, see code for details")
|
|
|
|
os.Exit(1)
|
|
|
|
}
|
|
|
|
|
|
|
|
log.SetPrefix("gen_properties (" + os.Args[4] + "): ")
|
|
|
|
log.SetFlags(0)
|
|
|
|
|
|
|
|
// Parse flags.
|
2022-09-13 13:44:44 +00:00
|
|
|
flags := make(map[string]string)
|
2022-08-02 06:19:38 +00:00
|
|
|
if len(os.Args) >= 6 {
|
|
|
|
for _, flag := range strings.Split(os.Args[5], ",") {
|
2022-09-13 13:44:44 +00:00
|
|
|
flagFields := strings.Split(flag, "=")
|
|
|
|
if len(flagFields) == 1 {
|
|
|
|
flags[flagFields[0]] = "yes"
|
|
|
|
} else {
|
|
|
|
flags[flagFields[0]] = flagFields[1]
|
|
|
|
}
|
2022-08-02 06:19:38 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Parse the text file and generate Go source code from it.
|
|
|
|
_, includeGeneralCategory := flags["gencat"]
|
2022-09-13 13:44:44 +00:00
|
|
|
var mainURL string
|
|
|
|
if os.Args[1] != "-" {
|
|
|
|
mainURL = fmt.Sprintf(propertyURL, os.Args[1])
|
|
|
|
}
|
|
|
|
src, err := parse(mainURL, flags["emojis"], includeGeneralCategory)
|
2022-08-02 06:19:38 +00:00
|
|
|
if err != nil {
|
|
|
|
log.Fatal(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Format the Go code.
|
|
|
|
formatted, err := format.Source([]byte(src))
|
|
|
|
if err != nil {
|
|
|
|
log.Fatal("gofmt:", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Save it to the (local) target file.
|
|
|
|
log.Print("Writing to ", os.Args[2])
|
|
|
|
if err := ioutil.WriteFile(os.Args[2], formatted, 0644); err != nil {
|
|
|
|
log.Fatal(err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// parse downloads and parses the Unicode property text files and returns their
// equivalent Go source code to be used in the uniseg package.
//
// "propertyURL" is the URL of the main properties file. It may be empty to skip
// that file, provided that "emojiProperty" is set. If "emojiProperty" is not an
// empty string, the code points carrying exactly that emoji property (e.g.
// "Extended_Pictographic") are read from the emoji data file and included. If
// "includeGeneralCategory" is true, the Unicode General Category property will
// be extracted from the comments and included in the output as an additional
// column.
//
// An error is returned if there is nothing to parse, if a download fails or
// does not answer with HTTP status 200, or if a line cannot be parsed.
func parse(propertyURL, emojiProperty string, includeGeneralCategory bool) (string, error) {
	if propertyURL == "" && emojiProperty == "" {
		return "", errors.New("no properties to parse")
	}

	// Collect the properties and remember which sources were actually used so
	// the generated header comment only names those.
	var (
		properties [][4]string
		sources    []string
	)
	if propertyURL != "" {
		props, err := fetchProperties(propertyURL, os.Args[4], "")
		if err != nil {
			return "", err
		}
		properties = append(properties, props...)
		sources = append(sources, "// "+propertyURL)
	}
	if emojiProperty != "" {
		props, err := fetchProperties(emojiURL, "emojis", emojiProperty)
		if err != nil {
			return "", err
		}
		properties = append(properties, props...)
		sources = append(sources, "// "+emojiURL+"\n// (\""+emojiProperty+"\" only)")
	}

	// Avoid overflow during binary search. The conversion to int64 keeps this
	// check compilable on platforms where int is 32 bits wide.
	if int64(len(properties)) >= 1<<31 {
		return "", errors.New("too many properties")
	}

	// Sort properties by the first code point of their range. The parse errors
	// can be ignored because propertyPattern only lets hex digits through.
	sort.Slice(properties, func(i, j int) bool {
		left, _ := strconv.ParseUint(properties[i][0], 16, 64)
		right, _ := strconv.ParseUint(properties[j][0], 16, 64)
		return left < right
	})

	// Header.
	columns := 3
	if includeGeneralCategory {
		columns = 4
	}
	var buf bytes.Buffer
	buf.WriteString(`// Code generated via go generate from gen_properties.go. DO NOT EDIT.

package uniseg

// ` + os.Args[3] + ` are taken from
` + strings.Join(sources, "\n// and\n") + `
// on ` + time.Now().Format("January 2, 2006") + `. See https://www.unicode.org/license.html for the Unicode
// license agreement.
var ` + os.Args[3] + ` = [][` + strconv.Itoa(columns) + `]int{
`)

	// Properties.
	for _, prop := range properties {
		from, to, comment := prop[0], prop[1], prop[3]
		property := translateProperty("pr", prop[2])
		if !includeGeneralCategory {
			fmt.Fprintf(&buf, "{0x%s,0x%s,%s}, // %s\n", from, to, property, comment)
			continue
		}

		// The comment is expected to start with the two-character general
		// category followed by a separator character.
		if len(comment) < 3 {
			return "", fmt.Errorf("code point %s: comment %q is too short to contain a general category", from, comment)
		}
		generalCategory := "gc" + comment[:2]
		if generalCategory == "gcL&" {
			// "&" cannot be part of a Go identifier.
			generalCategory = "gcLC"
		}
		fmt.Fprintf(&buf, "{0x%s,0x%s,%s,%s}, // %s\n", from, to, property, generalCategory, comment[3:])
	}

	// Tail.
	buf.WriteString("}")

	return buf.String(), nil
}

// fetchProperties downloads the Unicode data file at "url" and returns one
// {from, to, property, comment} entry per property line. Comments and empty
// lines are skipped. If "only" is not empty, only entries whose property is
// exactly "only" are returned. "label" prefixes the message of line parse
// errors.
func fetchProperties(url, label, only string) ([][4]string, error) {
	log.Printf("Parsing %s", url)
	res, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	defer res.Body.Close()

	// http.Get does not report non-2xx responses as errors. Without this check,
	// an error page would be parsed as if it were property data.
	if res.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("%s: unexpected HTTP status %s", url, res.Status)
	}

	var properties [][4]string
	scanner := bufio.NewScanner(res.Body)
	for num := 1; scanner.Scan(); num++ {
		line := strings.TrimSpace(scanner.Text())

		// Skip comments and empty lines.
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}

		// Cheap pre-filter: lines not mentioning the wanted property at all
		// need not be parsed.
		if only != "" && !strings.Contains(line, only) {
			continue
		}

		// Everything else must be a code point range, a property and a comment.
		from, to, property, comment, err := parseProperty(line)
		if err != nil {
			return nil, fmt.Errorf("%s line %d: %w", label, num, err)
		}

		// The substring match above is not sufficient on its own: "Emoji" is
		// also contained in "Emoji_Presentation", for example.
		if only != "" && property != only {
			continue
		}
		properties = append(properties, [4]string{from, to, property, comment})
	}
	if err := scanner.Err(); err != nil {
		return nil, err
	}
	return properties, nil
}
|
|
|
|
|
|
|
|
// parseProperty parses a line of the Unicode properties text file containing a
|
|
|
|
// property for a code point range and returns it along with its comment.
|
|
|
|
func parseProperty(line string) (from, to, property, comment string, err error) {
|
|
|
|
fields := propertyPattern.FindStringSubmatch(line)
|
|
|
|
if fields == nil {
|
|
|
|
err = errors.New("no property found")
|
|
|
|
return
|
|
|
|
}
|
|
|
|
from = fields[1]
|
|
|
|
to = fields[3]
|
|
|
|
if to == "" {
|
|
|
|
to = from
|
|
|
|
}
|
|
|
|
property = fields[4]
|
|
|
|
comment = fields[5]
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
// translateProperty turns a property name as spelled in the Unicode data file
// into the name of the corresponding Go identifier: all underscores are removed
// and the given prefix is prepended.
func translateProperty(prefix, property string) string {
	name := strings.Replace(property, "_", "", -1)
	return prefix + name
}
|