vendor: run make vendor-update

This commit is contained in:
Aliaksandr Valialkin 2024-01-30 18:47:01 +02:00
parent adf585f7ed
commit 32e60fe09d
No known key found for this signature in database
GPG key ID: 52C003EE2BCDB9EB
24 changed files with 1121 additions and 571 deletions

6
go.mod
View file

@ -33,7 +33,7 @@ require (
golang.org/x/net v0.20.0 // indirect
golang.org/x/oauth2 v0.16.0
golang.org/x/sys v0.16.0
google.golang.org/api v0.159.0
google.golang.org/api v0.160.0
gopkg.in/yaml.v2 v2.4.0
)
@ -47,7 +47,7 @@ require (
github.com/AzureAD/microsoft-authentication-library-for-go v1.2.1 // indirect
github.com/VividCortex/ewma v1.2.0 // indirect
github.com/alecthomas/units v0.0.0-20231202071711-9a357b53e9c9 // indirect
github.com/aws/aws-sdk-go v1.50.5 // indirect
github.com/aws/aws-sdk-go v1.50.6 // indirect
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.5.4 // indirect
github.com/aws/aws-sdk-go-v2/credentials v1.16.16 // indirect
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.14.11 // indirect
@ -100,7 +100,7 @@ require (
github.com/prometheus/common v0.46.0 // indirect
github.com/prometheus/common/sigv4 v0.1.0 // indirect
github.com/prometheus/procfs v0.12.0 // indirect
github.com/rivo/uniseg v0.4.4 // indirect
github.com/rivo/uniseg v0.4.6 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/stretchr/testify v1.8.4 // indirect
github.com/valyala/bytebufferpool v1.0.0 // indirect

12
go.sum
View file

@ -83,8 +83,8 @@ github.com/andybalholm/brotli v1.0.2/go.mod h1:loMXtMfwqflxFJPmdbJO0a3KNoPuLBgiu
github.com/andybalholm/brotli v1.0.3/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig=
github.com/armon/go-metrics v0.4.1 h1:hR91U9KYmb6bLBYLQjyM+3j+rcd/UhE+G78SFnF8gJA=
github.com/aws/aws-sdk-go v1.38.35/go.mod h1:hcU610XS61/+aQV88ixoOzUoG7v3b31pl2zKMmprdro=
github.com/aws/aws-sdk-go v1.50.5 h1:H2Aadcgwr7a2aqS6ZwcE+l1mA6ZrTseYCvjw2QLmxIA=
github.com/aws/aws-sdk-go v1.50.5/go.mod h1:LF8svs817+Nz+DmiMQKTO3ubZ/6IaTpq3TjupRn3Eqk=
github.com/aws/aws-sdk-go v1.50.6 h1:FaXvNwHG3Ri1paUEW16Ahk9zLVqSAdqa1M3phjZR35Q=
github.com/aws/aws-sdk-go v1.50.6/go.mod h1:LF8svs817+Nz+DmiMQKTO3ubZ/6IaTpq3TjupRn3Eqk=
github.com/aws/aws-sdk-go-v2 v1.24.1 h1:xAojnj+ktS95YZlDf0zxWBkbFtymPeDP+rvUQIH3uAU=
github.com/aws/aws-sdk-go-v2 v1.24.1/go.mod h1:LNh45Br1YAkEKaAqvmE1m8FUx6a5b/V0oAKV7of29b4=
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.5.4 h1:OCs21ST2LrepDfD3lwlQiOqIGp6JiEUqG84GzTDoyJs=
@ -397,8 +397,8 @@ github.com/prometheus/procfs v0.12.0/go.mod h1:pcuDEFsWDnvcgNzo4EEweacyhjeA9Zk3c
github.com/prometheus/prometheus v0.49.1 h1:90mDvjrFnca2m+0qPSIDr3y7iHPTAagOAElz7j+HtGk=
github.com/prometheus/prometheus v0.49.1/go.mod h1:aDogiyqmv3aBIWDb5z5Sdcxuuf2BOfiJwOIm9JGpMnI=
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/rivo/uniseg v0.4.4 h1:8TfxU8dW6PdqD27gjM8MVNuicgxIjxpm4K7x4jp8sis=
github.com/rivo/uniseg v0.4.4/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
github.com/rivo/uniseg v0.4.6 h1:Sovz9sDSwbOz9tgUy8JpT+KgCkPYJEN/oYzlJiYTNLg=
github.com/rivo/uniseg v0.4.6/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4=
github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk=
@ -710,8 +710,8 @@ google.golang.org/api v0.24.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0M
google.golang.org/api v0.28.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0MncE=
google.golang.org/api v0.29.0/go.mod h1:Lcubydp8VUV7KeIHD9z2Bys/sm/vGKnG1UHuDBSrHWM=
google.golang.org/api v0.30.0/go.mod h1:QGmEvQ87FHZNiUVJkT14jQNYJ4ZJjdRF23ZXz5138Fc=
google.golang.org/api v0.159.0 h1:fVTj+7HHiUYz4JEZCHHoRIeQX7h5FMzrA2RF/DzDdbs=
google.golang.org/api v0.159.0/go.mod h1:0mu0TpK33qnydLvWqbImq2b1eQ5FHRSDCBzAxX9ZHyw=
google.golang.org/api v0.160.0 h1:SEspjXHVqE1m5a1fRy8JFB+5jSu+V0GEDKDghF3ttO4=
google.golang.org/api v0.160.0/go.mod h1:0mu0TpK33qnydLvWqbImq2b1eQ5FHRSDCBzAxX9ZHyw=
google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=

View file

@ -34007,9 +34007,18 @@ var awsPartition = partition{
endpointKey{
Region: "ap-northeast-1",
}: endpoint{},
endpointKey{
Region: "ap-northeast-2",
}: endpoint{},
endpointKey{
Region: "ap-southeast-1",
}: endpoint{},
endpointKey{
Region: "ap-southeast-2",
}: endpoint{},
endpointKey{
Region: "ca-central-1",
}: endpoint{},
endpointKey{
Region: "eu-central-1",
}: endpoint{},
@ -34031,9 +34040,18 @@ var awsPartition = partition{
endpointKey{
Region: "ui-ap-northeast-1",
}: endpoint{},
endpointKey{
Region: "ui-ap-northeast-2",
}: endpoint{},
endpointKey{
Region: "ui-ap-southeast-1",
}: endpoint{},
endpointKey{
Region: "ui-ap-southeast-2",
}: endpoint{},
endpointKey{
Region: "ui-ca-central-1",
}: endpoint{},
endpointKey{
Region: "ui-eu-central-1",
}: endpoint{},
@ -35867,6 +35885,13 @@ var awscnPartition = partition{
},
},
},
"quicksight": service{
Endpoints: serviceEndpoints{
endpointKey{
Region: "cn-north-1",
}: endpoint{},
},
},
"ram": service{
Endpoints: serviceEndpoints{
endpointKey{
@ -43410,6 +43435,15 @@ var awsisoPartition = partition{
},
"datasync": service{
Endpoints: serviceEndpoints{
endpointKey{
Region: "fips-us-iso-east-1",
}: endpoint{
Hostname: "datasync-fips.us-iso-east-1.c2s.ic.gov",
CredentialScope: credentialScope{
Region: "us-iso-east-1",
},
Deprecated: boxedTrue,
},
endpointKey{
Region: "fips-us-iso-west-1",
}: endpoint{
@ -43419,6 +43453,15 @@ var awsisoPartition = partition{
},
Deprecated: boxedTrue,
},
endpointKey{
Region: "us-iso-east-1",
}: endpoint{},
endpointKey{
Region: "us-iso-east-1",
Variant: fipsVariant,
}: endpoint{
Hostname: "datasync-fips.us-iso-east-1.c2s.ic.gov",
},
endpointKey{
Region: "us-iso-west-1",
}: endpoint{},

View file

@ -5,4 +5,4 @@ package aws
const SDKName = "aws-sdk-go"
// SDKVersion is the version of this SDK
const SDKVersion = "1.50.5"
const SDKVersion = "1.50.6"

View file

@ -3,7 +3,7 @@
[![Go Reference](https://pkg.go.dev/badge/github.com/rivo/uniseg.svg)](https://pkg.go.dev/github.com/rivo/uniseg)
[![Go Report](https://img.shields.io/badge/go%20report-A%2B-brightgreen.svg)](https://goreportcard.com/report/github.com/rivo/uniseg)
This Go package implements Unicode Text Segmentation according to [Unicode Standard Annex #29](https://unicode.org/reports/tr29/), Unicode Line Breaking according to [Unicode Standard Annex #14](https://unicode.org/reports/tr14/) (Unicode version 14.0.0), and monospace font string width calculation similar to [wcwidth](https://man7.org/linux/man-pages/man3/wcwidth.3.html).
This Go package implements Unicode Text Segmentation according to [Unicode Standard Annex #29](https://unicode.org/reports/tr29/), Unicode Line Breaking according to [Unicode Standard Annex #14](https://unicode.org/reports/tr14/) (Unicode version 15.0.0), and monospace font string width calculation similar to [wcwidth](https://man7.org/linux/man-pages/man3/wcwidth.3.html).
## Background
@ -73,7 +73,7 @@ for gr.Next() {
### Using the [`Step`](https://pkg.go.dev/github.com/rivo/uniseg#Step) or [`StepString`](https://pkg.go.dev/github.com/rivo/uniseg#StepString) Function
This is orders of magnitude faster than the `Graphemes` class, but it requires the handling of states and boundaries:
This avoids allocating a new `Graphemes` object, but it requires handling states and boundaries:
```go
str := "🇩🇪🏳️‍🌈"
@ -88,29 +88,7 @@ for len(str) > 0 {
### Advanced Examples
Breaking into grapheme clusters and evaluating line breaks:
```go
str := "First line.\nSecond line."
state := -1
var (
c string
boundaries int
)
for len(str) > 0 {
c, str, boundaries, state = uniseg.StepString(str, state)
fmt.Print(c)
if boundaries&uniseg.MaskLine == uniseg.LineCanBreak {
fmt.Print("|")
} else if boundaries&uniseg.MaskLine == uniseg.LineMustBreak {
fmt.Print("‖")
}
}
// First |line.
// ‖Second |line.‖
```
If you're only interested in word segmentation, use [`FirstWord`](https://pkg.go.dev/github.com/rivo/uniseg#FirstWord) or [`FirstWordInString`](https://pkg.go.dev/github.com/rivo/uniseg#FirstWordInString):
The [`Graphemes`](https://pkg.go.dev/github.com/rivo/uniseg#Graphemes) class offers the most convenient way to access all functionality of this package. But in some cases, it may be better to use the specialized functions directly. For example, if you're only interested in word segmentation, use [`FirstWord`](https://pkg.go.dev/github.com/rivo/uniseg#FirstWord) or [`FirstWordInString`](https://pkg.go.dev/github.com/rivo/uniseg#FirstWordInString):
```go
str := "Hello, world!"
@ -133,6 +111,8 @@ Similarly, use
- [`FirstSentence`](https://pkg.go.dev/github.com/rivo/uniseg#FirstSentence) or [`FirstSentenceInString`](https://pkg.go.dev/github.com/rivo/uniseg#FirstSentenceInString) for sentence segmentation only, and
- [`FirstLineSegment`](https://pkg.go.dev/github.com/rivo/uniseg#FirstLineSegment) or [`FirstLineSegmentInString`](https://pkg.go.dev/github.com/rivo/uniseg#FirstLineSegmentInString) for line breaking / word wrapping (although using [`Step`](https://pkg.go.dev/github.com/rivo/uniseg#Step) or [`StepString`](https://pkg.go.dev/github.com/rivo/uniseg#StepString) is preferred as it will observe grapheme cluster boundaries).
If you're only interested in the width of characters, use [`FirstGraphemeCluster`](https://pkg.go.dev/github.com/rivo/uniseg#FirstGraphemeCluster) or [`FirstGraphemeClusterInString`](https://pkg.go.dev/github.com/rivo/uniseg#FirstGraphemeClusterInString). These are much faster than using [`Step`](https://pkg.go.dev/github.com/rivo/uniseg#Step), [`StepString`](https://pkg.go.dev/github.com/rivo/uniseg#StepString), or the [`Graphemes`](https://pkg.go.dev/github.com/rivo/uniseg#Graphemes) class because they do not include the logic for word / sentence / line boundaries.
Finally, if you need to reverse a string while preserving grapheme clusters, use [`ReverseString`](https://pkg.go.dev/github.com/rivo/uniseg#ReverseString):
```go

View file

@ -1,13 +1,13 @@
package uniseg
// Code generated via go generate from gen_properties.go. DO NOT EDIT.
package uniseg
// eastAsianWidth are taken from
// https://www.unicode.org/Public/14.0.0/ucd/EastAsianWidth.txt
// https://www.unicode.org/Public/15.0.0/ucd/EastAsianWidth.txt
// and
// https://unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt
// https://unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt
// ("Extended_Pictographic" only)
// on September 10, 2022. See https://www.unicode.org/license.html for the Unicode
// on September 5, 2023. See https://www.unicode.org/license.html for the Unicode
// license agreement.
var eastAsianWidth = [][3]int{
{0x0000, 0x001F, prN}, // Cc [32] <control-0000>..<control-001F>
@ -504,6 +504,7 @@ var eastAsianWidth = [][3]int{
{0x0CE2, 0x0CE3, prN}, // Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL
{0x0CE6, 0x0CEF, prN}, // Nd [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE
{0x0CF1, 0x0CF2, prN}, // Lo [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA
{0x0CF3, 0x0CF3, prN}, // Mc KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT
{0x0D00, 0x0D01, prN}, // Mn [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU
{0x0D02, 0x0D03, prN}, // Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
{0x0D04, 0x0D0C, prN}, // Lo [9] MALAYALAM LETTER VEDIC ANUSVARA..MALAYALAM LETTER VOCALIC L
@ -565,7 +566,7 @@ var eastAsianWidth = [][3]int{
{0x0EBD, 0x0EBD, prN}, // Lo LAO SEMIVOWEL SIGN NYO
{0x0EC0, 0x0EC4, prN}, // Lo [5] LAO VOWEL SIGN E..LAO VOWEL SIGN AI
{0x0EC6, 0x0EC6, prN}, // Lm LAO KO LA
{0x0EC8, 0x0ECD, prN}, // Mn [6] LAO TONE MAI EK..LAO NIGGAHITA
{0x0EC8, 0x0ECE, prN}, // Mn [7] LAO TONE MAI EK..LAO YAMAKKAN
{0x0ED0, 0x0ED9, prN}, // Nd [10] LAO DIGIT ZERO..LAO DIGIT NINE
{0x0EDC, 0x0EDF, prN}, // Lo [4] LAO HO NO..LAO LETTER KHMU NYO
{0x0F00, 0x0F00, prN}, // Lo TIBETAN SYLLABLE OM
@ -1916,6 +1917,7 @@ var eastAsianWidth = [][3]int{
{0x10EAB, 0x10EAC, prN}, // Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
{0x10EAD, 0x10EAD, prN}, // Pd YEZIDI HYPHENATION MARK
{0x10EB0, 0x10EB1, prN}, // Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE
{0x10EFD, 0x10EFF, prN}, // Mn [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA
{0x10F00, 0x10F1C, prN}, // Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL
{0x10F1D, 0x10F26, prN}, // No [10] OLD SOGDIAN NUMBER ONE..OLD SOGDIAN FRACTION ONE HALF
{0x10F27, 0x10F27, prN}, // Lo OLD SOGDIAN LIGATURE AYIN-DALETH
@ -1998,6 +2000,8 @@ var eastAsianWidth = [][3]int{
{0x11236, 0x11237, prN}, // Mn [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA
{0x11238, 0x1123D, prN}, // Po [6] KHOJKI DANDA..KHOJKI ABBREVIATION SIGN
{0x1123E, 0x1123E, prN}, // Mn KHOJKI SIGN SUKUN
{0x1123F, 0x11240, prN}, // Lo [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I
{0x11241, 0x11241, prN}, // Mn KHOJKI VOWEL SIGN VOCALIC R
{0x11280, 0x11286, prN}, // Lo [7] MULTANI LETTER A..MULTANI LETTER GA
{0x11288, 0x11288, prN}, // Lo MULTANI LETTER GHA
{0x1128A, 0x1128D, prN}, // Lo [4] MULTANI LETTER CA..MULTANI LETTER JJA
@ -2160,6 +2164,7 @@ var eastAsianWidth = [][3]int{
{0x11A9E, 0x11AA2, prN}, // Po [5] SOYOMBO HEAD MARK WITH MOON AND SUN AND TRIPLE FLAME..SOYOMBO TERMINAL MARK-2
{0x11AB0, 0x11ABF, prN}, // Lo [16] CANADIAN SYLLABICS NATTILIK HI..CANADIAN SYLLABICS SPA
{0x11AC0, 0x11AF8, prN}, // Lo [57] PAU CIN HAU LETTER PA..PAU CIN HAU GLOTTAL STOP FINAL
{0x11B00, 0x11B09, prN}, // Po [10] DEVANAGARI HEAD MARK..DEVANAGARI SIGN MINDU
{0x11C00, 0x11C08, prN}, // Lo [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L
{0x11C0A, 0x11C2E, prN}, // Lo [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA
{0x11C2F, 0x11C2F, prN}, // Mc BHAIKSUKI VOWEL SIGN AA
@ -2205,6 +2210,19 @@ var eastAsianWidth = [][3]int{
{0x11EF3, 0x11EF4, prN}, // Mn [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U
{0x11EF5, 0x11EF6, prN}, // Mc [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O
{0x11EF7, 0x11EF8, prN}, // Po [2] MAKASAR PASSIMBANG..MAKASAR END OF SECTION
{0x11F00, 0x11F01, prN}, // Mn [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA
{0x11F02, 0x11F02, prN}, // Lo KAWI SIGN REPHA
{0x11F03, 0x11F03, prN}, // Mc KAWI SIGN VISARGA
{0x11F04, 0x11F10, prN}, // Lo [13] KAWI LETTER A..KAWI LETTER O
{0x11F12, 0x11F33, prN}, // Lo [34] KAWI LETTER KA..KAWI LETTER JNYA
{0x11F34, 0x11F35, prN}, // Mc [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA
{0x11F36, 0x11F3A, prN}, // Mn [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R
{0x11F3E, 0x11F3F, prN}, // Mc [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI
{0x11F40, 0x11F40, prN}, // Mn KAWI VOWEL SIGN EU
{0x11F41, 0x11F41, prN}, // Mc KAWI SIGN KILLER
{0x11F42, 0x11F42, prN}, // Mn KAWI CONJOINER
{0x11F43, 0x11F4F, prN}, // Po [13] KAWI DANDA..KAWI PUNCTUATION CLOSING SPIRAL
{0x11F50, 0x11F59, prN}, // Nd [10] KAWI DIGIT ZERO..KAWI DIGIT NINE
{0x11FB0, 0x11FB0, prN}, // Lo LISU LETTER YHA
{0x11FC0, 0x11FD4, prN}, // No [21] TAMIL FRACTION ONE THREE-HUNDRED-AND-TWENTIETH..TAMIL FRACTION DOWNSCALING FACTOR KIIZH
{0x11FD5, 0x11FDC, prN}, // So [8] TAMIL SIGN NEL..TAMIL SIGN MUKKURUNI
@ -2217,8 +2235,11 @@ var eastAsianWidth = [][3]int{
{0x12480, 0x12543, prN}, // Lo [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU
{0x12F90, 0x12FF0, prN}, // Lo [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114
{0x12FF1, 0x12FF2, prN}, // Po [2] CYPRO-MINOAN SIGN CM301..CYPRO-MINOAN SIGN CM302
{0x13000, 0x1342E, prN}, // Lo [1071] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH AA032
{0x13430, 0x13438, prN}, // Cf [9] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END SEGMENT
{0x13000, 0x1342F, prN}, // Lo [1072] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D
{0x13430, 0x1343F, prN}, // Cf [16] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE
{0x13440, 0x13440, prN}, // Mn EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY
{0x13441, 0x13446, prN}, // Lo [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN
{0x13447, 0x13455, prN}, // Mn [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED
{0x14400, 0x14646, prN}, // Lo [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530
{0x16800, 0x16A38, prN}, // Lo [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ
{0x16A40, 0x16A5E, prN}, // Lo [31] MRO LETTER TA..MRO LETTER TEK
@ -2263,7 +2284,9 @@ var eastAsianWidth = [][3]int{
{0x1AFFD, 0x1AFFE, prW}, // Lm [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8
{0x1B000, 0x1B0FF, prW}, // Lo [256] KATAKANA LETTER ARCHAIC E..HENTAIGANA LETTER RE-2
{0x1B100, 0x1B122, prW}, // Lo [35] HENTAIGANA LETTER RE-3..KATAKANA LETTER ARCHAIC WU
{0x1B132, 0x1B132, prW}, // Lo HIRAGANA LETTER SMALL KO
{0x1B150, 0x1B152, prW}, // Lo [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO
{0x1B155, 0x1B155, prW}, // Lo KATAKANA LETTER SMALL KO
{0x1B164, 0x1B167, prW}, // Lo [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N
{0x1B170, 0x1B2FB, prW}, // Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB
{0x1BC00, 0x1BC6A, prN}, // Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M
@ -2294,6 +2317,7 @@ var eastAsianWidth = [][3]int{
{0x1D200, 0x1D241, prN}, // So [66] GREEK VOCAL NOTATION SYMBOL-1..GREEK INSTRUMENTAL NOTATION SYMBOL-54
{0x1D242, 0x1D244, prN}, // Mn [3] COMBINING GREEK MUSICAL TRISEME..COMBINING GREEK MUSICAL PENTASEME
{0x1D245, 0x1D245, prN}, // So GREEK MUSICAL LEIMMA
{0x1D2C0, 0x1D2D3, prN}, // No [20] KAKTOVIK NUMERAL ZERO..KAKTOVIK NUMERAL NINETEEN
{0x1D2E0, 0x1D2F3, prN}, // No [20] MAYAN NUMERAL ZERO..MAYAN NUMERAL NINETEEN
{0x1D300, 0x1D356, prN}, // So [87] MONOGRAM FOR EARTH..TETRAGRAM FOR FOSTERING
{0x1D360, 0x1D378, prN}, // No [25] COUNTING ROD UNIT DIGIT ONE..TALLY MARK FIVE
@ -2353,11 +2377,14 @@ var eastAsianWidth = [][3]int{
{0x1DF00, 0x1DF09, prN}, // Ll [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK
{0x1DF0A, 0x1DF0A, prN}, // Lo LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK
{0x1DF0B, 0x1DF1E, prN}, // Ll [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL
{0x1DF25, 0x1DF2A, prN}, // Ll [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK
{0x1E000, 0x1E006, prN}, // Mn [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE
{0x1E008, 0x1E018, prN}, // Mn [17] COMBINING GLAGOLITIC LETTER ZEMLJA..COMBINING GLAGOLITIC LETTER HERU
{0x1E01B, 0x1E021, prN}, // Mn [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI
{0x1E023, 0x1E024, prN}, // Mn [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
{0x1E026, 0x1E02A, prN}, // Mn [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
{0x1E030, 0x1E06D, prN}, // Lm [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE
{0x1E08F, 0x1E08F, prN}, // Mn COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
{0x1E100, 0x1E12C, prN}, // Lo [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W
{0x1E130, 0x1E136, prN}, // Mn [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D
{0x1E137, 0x1E13D, prN}, // Lm [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER
@ -2370,6 +2397,10 @@ var eastAsianWidth = [][3]int{
{0x1E2EC, 0x1E2EF, prN}, // Mn [4] WANCHO TONE TUP..WANCHO TONE KOINI
{0x1E2F0, 0x1E2F9, prN}, // Nd [10] WANCHO DIGIT ZERO..WANCHO DIGIT NINE
{0x1E2FF, 0x1E2FF, prN}, // Sc WANCHO NGUN SIGN
{0x1E4D0, 0x1E4EA, prN}, // Lo [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL
{0x1E4EB, 0x1E4EB, prN}, // Lm NAG MUNDARI SIGN OJOD
{0x1E4EC, 0x1E4EF, prN}, // Mn [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH
{0x1E4F0, 0x1E4F9, prN}, // Nd [10] NAG MUNDARI DIGIT ZERO..NAG MUNDARI DIGIT NINE
{0x1E7E0, 0x1E7E6, prN}, // Lo [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO
{0x1E7E8, 0x1E7EB, prN}, // Lo [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE
{0x1E7ED, 0x1E7EE, prN}, // Lo [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE
@ -2498,13 +2529,14 @@ var eastAsianWidth = [][3]int{
{0x1F6D0, 0x1F6D2, prW}, // So [3] PLACE OF WORSHIP..SHOPPING TROLLEY
{0x1F6D3, 0x1F6D4, prN}, // So [2] STUPA..PAGODA
{0x1F6D5, 0x1F6D7, prW}, // So [3] HINDU TEMPLE..ELEVATOR
{0x1F6DD, 0x1F6DF, prW}, // So [3] PLAYGROUND SLIDE..RING BUOY
{0x1F6DC, 0x1F6DF, prW}, // So [4] WIRELESS..RING BUOY
{0x1F6E0, 0x1F6EA, prN}, // So [11] HAMMER AND WRENCH..NORTHEAST-POINTING AIRPLANE
{0x1F6EB, 0x1F6EC, prW}, // So [2] AIRPLANE DEPARTURE..AIRPLANE ARRIVING
{0x1F6F0, 0x1F6F3, prN}, // So [4] SATELLITE..PASSENGER SHIP
{0x1F6F4, 0x1F6FC, prW}, // So [9] SCOOTER..ROLLER SKATE
{0x1F700, 0x1F773, prN}, // So [116] ALCHEMICAL SYMBOL FOR QUINTESSENCE..ALCHEMICAL SYMBOL FOR HALF OUNCE
{0x1F780, 0x1F7D8, prN}, // So [89] BLACK LEFT-POINTING ISOSCELES RIGHT TRIANGLE..NEGATIVE CIRCLED SQUARE
{0x1F700, 0x1F776, prN}, // So [119] ALCHEMICAL SYMBOL FOR QUINTESSENCE..LUNAR ECLIPSE
{0x1F77B, 0x1F77F, prN}, // So [5] HAUMEA..ORCUS
{0x1F780, 0x1F7D9, prN}, // So [90] BLACK LEFT-POINTING ISOSCELES RIGHT TRIANGLE..NINE POINTED WHITE STAR
{0x1F7E0, 0x1F7EB, prW}, // So [12] LARGE ORANGE CIRCLE..LARGE BROWN SQUARE
{0x1F7F0, 0x1F7F0, prW}, // So HEAVY EQUALS SIGN
{0x1F800, 0x1F80B, prN}, // So [12] LEFTWARDS ARROW WITH SMALL TRIANGLE ARROWHEAD..DOWNWARDS ARROW WITH LARGE TRIANGLE ARROWHEAD
@ -2521,22 +2553,20 @@ var eastAsianWidth = [][3]int{
{0x1F947, 0x1F9FF, prW}, // So [185] FIRST PLACE MEDAL..NAZAR AMULET
{0x1FA00, 0x1FA53, prN}, // So [84] NEUTRAL CHESS KING..BLACK CHESS KNIGHT-BISHOP
{0x1FA60, 0x1FA6D, prN}, // So [14] XIANGQI RED GENERAL..XIANGQI BLACK SOLDIER
{0x1FA70, 0x1FA74, prW}, // So [5] BALLET SHOES..THONG SANDAL
{0x1FA78, 0x1FA7C, prW}, // So [5] DROP OF BLOOD..CRUTCH
{0x1FA80, 0x1FA86, prW}, // So [7] YO-YO..NESTING DOLLS
{0x1FA90, 0x1FAAC, prW}, // So [29] RINGED PLANET..HAMSA
{0x1FAB0, 0x1FABA, prW}, // So [11] FLY..NEST WITH EGGS
{0x1FAC0, 0x1FAC5, prW}, // So [6] ANATOMICAL HEART..PERSON WITH CROWN
{0x1FAD0, 0x1FAD9, prW}, // So [10] BLUEBERRIES..JAR
{0x1FAE0, 0x1FAE7, prW}, // So [8] MELTING FACE..BUBBLES
{0x1FAF0, 0x1FAF6, prW}, // So [7] HAND WITH INDEX FINGER AND THUMB CROSSED..HEART HANDS
{0x1FA70, 0x1FA7C, prW}, // So [13] BALLET SHOES..CRUTCH
{0x1FA80, 0x1FA88, prW}, // So [9] YO-YO..FLUTE
{0x1FA90, 0x1FABD, prW}, // So [46] RINGED PLANET..WING
{0x1FABF, 0x1FAC5, prW}, // So [7] GOOSE..PERSON WITH CROWN
{0x1FACE, 0x1FADB, prW}, // So [14] MOOSE..PEA POD
{0x1FAE0, 0x1FAE8, prW}, // So [9] MELTING FACE..SHAKING FACE
{0x1FAF0, 0x1FAF8, prW}, // So [9] HAND WITH INDEX FINGER AND THUMB CROSSED..RIGHTWARDS PUSHING HAND
{0x1FB00, 0x1FB92, prN}, // So [147] BLOCK SEXTANT-1..UPPER HALF INVERSE MEDIUM SHADE AND LOWER HALF BLOCK
{0x1FB94, 0x1FBCA, prN}, // So [55] LEFT HALF INVERSE MEDIUM SHADE AND RIGHT HALF BLOCK..WHITE UP-POINTING CHEVRON
{0x1FBF0, 0x1FBF9, prN}, // Nd [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE
{0x20000, 0x2A6DF, prW}, // Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF
{0x2A6E0, 0x2A6FF, prW}, // Cn [32] <reserved-2A6E0>..<reserved-2A6FF>
{0x2A700, 0x2B738, prW}, // Lo [4153] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B738
{0x2B739, 0x2B73F, prW}, // Cn [7] <reserved-2B739>..<reserved-2B73F>
{0x2A700, 0x2B739, prW}, // Lo [4154] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B739
{0x2B73A, 0x2B73F, prW}, // Cn [6] <reserved-2B73A>..<reserved-2B73F>
{0x2B740, 0x2B81D, prW}, // Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
{0x2B81E, 0x2B81F, prW}, // Cn [2] <reserved-2B81E>..<reserved-2B81F>
{0x2B820, 0x2CEA1, prW}, // Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
@ -2547,7 +2577,9 @@ var eastAsianWidth = [][3]int{
{0x2FA1E, 0x2FA1F, prW}, // Cn [2] <reserved-2FA1E>..<reserved-2FA1F>
{0x2FA20, 0x2FFFD, prW}, // Cn [1502] <reserved-2FA20>..<reserved-2FFFD>
{0x30000, 0x3134A, prW}, // Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A
{0x3134B, 0x3FFFD, prW}, // Cn [60595] <reserved-3134B>..<reserved-3FFFD>
{0x3134B, 0x3134F, prW}, // Cn [5] <reserved-3134B>..<reserved-3134F>
{0x31350, 0x323AF, prW}, // Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
{0x323B0, 0x3FFFD, prW}, // Cn [56398] <reserved-323B0>..<reserved-3FFFD>
{0xE0001, 0xE0001, prN}, // Cf LANGUAGE TAG
{0xE0020, 0xE007F, prN}, // Cf [96] TAG SPACE..CANCEL TAG
{0xE0100, 0xE01EF, prA}, // Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256

View file

@ -1,13 +1,13 @@
package uniseg
// Code generated via go generate from gen_properties.go. DO NOT EDIT.
package uniseg
// emojiPresentation are taken from
//
// and
// https://unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt
// https://unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt
// ("Extended_Pictographic" only)
// on September 10, 2022. See https://www.unicode.org/license.html for the Unicode
// on September 5, 2023. See https://www.unicode.org/license.html for the Unicode
// license agreement.
var emojiPresentation = [][3]int{
{0x231A, 0x231B, prEmojiPresentation}, // E0.6 [2] (⌚..⌛) watch..hourglass done
@ -211,6 +211,7 @@ var emojiPresentation = [][3]int{
{0x1F6D1, 0x1F6D2, prEmojiPresentation}, // E3.0 [2] (🛑..🛒) stop sign..shopping cart
{0x1F6D5, 0x1F6D5, prEmojiPresentation}, // E12.0 [1] (🛕) hindu temple
{0x1F6D6, 0x1F6D7, prEmojiPresentation}, // E13.0 [2] (🛖..🛗) hut..elevator
{0x1F6DC, 0x1F6DC, prEmojiPresentation}, // E15.0 [1] (🛜) wireless
{0x1F6DD, 0x1F6DF, prEmojiPresentation}, // E14.0 [3] (🛝..🛟) playground slide..ring buoy
{0x1F6EB, 0x1F6EC, prEmojiPresentation}, // E1.0 [2] (🛫..🛬) airplane departure..airplane arrival
{0x1F6F4, 0x1F6F6, prEmojiPresentation}, // E3.0 [3] (🛴..🛶) kick scooter..canoe
@ -267,19 +268,28 @@ var emojiPresentation = [][3]int{
{0x1F9E7, 0x1F9FF, prEmojiPresentation}, // E11.0 [25] (🧧..🧿) red envelope..nazar amulet
{0x1FA70, 0x1FA73, prEmojiPresentation}, // E12.0 [4] (🩰..🩳) ballet shoes..shorts
{0x1FA74, 0x1FA74, prEmojiPresentation}, // E13.0 [1] (🩴) thong sandal
{0x1FA75, 0x1FA77, prEmojiPresentation}, // E15.0 [3] (🩵..🩷) light blue heart..pink heart
{0x1FA78, 0x1FA7A, prEmojiPresentation}, // E12.0 [3] (🩸..🩺) drop of blood..stethoscope
{0x1FA7B, 0x1FA7C, prEmojiPresentation}, // E14.0 [2] (🩻..🩼) x-ray..crutch
{0x1FA80, 0x1FA82, prEmojiPresentation}, // E12.0 [3] (🪀..🪂) yo-yo..parachute
{0x1FA83, 0x1FA86, prEmojiPresentation}, // E13.0 [4] (🪃..🪆) boomerang..nesting dolls
{0x1FA87, 0x1FA88, prEmojiPresentation}, // E15.0 [2] (🪇..🪈) maracas..flute
{0x1FA90, 0x1FA95, prEmojiPresentation}, // E12.0 [6] (🪐..🪕) ringed planet..banjo
{0x1FA96, 0x1FAA8, prEmojiPresentation}, // E13.0 [19] (🪖..🪨) military helmet..rock
{0x1FAA9, 0x1FAAC, prEmojiPresentation}, // E14.0 [4] (🪩..🪬) mirror ball..hamsa
{0x1FAAD, 0x1FAAF, prEmojiPresentation}, // E15.0 [3] (🪭..🪯) folding hand fan..khanda
{0x1FAB0, 0x1FAB6, prEmojiPresentation}, // E13.0 [7] (🪰..🪶) fly..feather
{0x1FAB7, 0x1FABA, prEmojiPresentation}, // E14.0 [4] (🪷..🪺) lotus..nest with eggs
{0x1FABB, 0x1FABD, prEmojiPresentation}, // E15.0 [3] (🪻..🪽) hyacinth..wing
{0x1FABF, 0x1FABF, prEmojiPresentation}, // E15.0 [1] (🪿) goose
{0x1FAC0, 0x1FAC2, prEmojiPresentation}, // E13.0 [3] (🫀..🫂) anatomical heart..people hugging
{0x1FAC3, 0x1FAC5, prEmojiPresentation}, // E14.0 [3] (🫃..🫅) pregnant man..person with crown
{0x1FACE, 0x1FACF, prEmojiPresentation}, // E15.0 [2] (🫎..🫏) moose..donkey
{0x1FAD0, 0x1FAD6, prEmojiPresentation}, // E13.0 [7] (🫐..🫖) blueberries..teapot
{0x1FAD7, 0x1FAD9, prEmojiPresentation}, // E14.0 [3] (🫗..🫙) pouring liquid..jar
{0x1FADA, 0x1FADB, prEmojiPresentation}, // E15.0 [2] (🫚..🫛) ginger root..pea pod
{0x1FAE0, 0x1FAE7, prEmojiPresentation}, // E14.0 [8] (🫠..🫧) melting face..bubbles
{0x1FAE8, 0x1FAE8, prEmojiPresentation}, // E15.0 [1] (🫨) shaking face
{0x1FAF0, 0x1FAF6, prEmojiPresentation}, // E14.0 [7] (🫰..🫶) hand with index finger and thumb crossed..heart hands
{0x1FAF7, 0x1FAF8, prEmojiPresentation}, // E15.0 [2] (🫷..🫸) leftwards pushing hand..rightwards pushing hand
}

View file

@ -32,7 +32,7 @@ import (
// We want to test against a specific version rather than the latest. When the
// package is upgraded to a new version, change these to generate new tests.
const (
testCaseURL = `https://www.unicode.org/Public/14.0.0/ucd/auxiliary/%s.txt`
testCaseURL = `https://www.unicode.org/Public/15.0.0/ucd/auxiliary/%s.txt`
)
func main() {
@ -76,9 +76,9 @@ func parse(url string) ([]byte, error) {
buf := new(bytes.Buffer)
buf.Grow(120 << 10)
buf.WriteString(`package uniseg
buf.WriteString(`// Code generated via go generate from gen_breaktest.go. DO NOT EDIT.
// Code generated via go generate from gen_breaktest.go. DO NOT EDIT.
package uniseg
// ` + os.Args[3] + ` are Grapheme testcases taken from
// ` + url + `
@ -136,7 +136,9 @@ var (
//
// E.g. for the input b="÷ 0020 × 0308 ÷ 1F1E6 ÷"
// it will append
// "\u0020\u0308\U0001F1E6"
//
// "\u0020\u0308\U0001F1E6"
//
// and "[][]rune{{0x0020,0x0308},{0x1F1E6},}"
// to orig and exp respectively.
//

View file

@ -41,8 +41,8 @@ import (
// We want to test against a specific version rather than the latest. When the
// package is upgraded to a new version, change these to generate new tests.
const (
propertyURL = `https://www.unicode.org/Public/14.0.0/ucd/%s.txt`
emojiURL = `https://unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt`
propertyURL = `https://www.unicode.org/Public/15.0.0/ucd/%s.txt`
emojiURL = `https://unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt`
)
// The regular expression for a line containing a code point range property.
@ -178,6 +178,11 @@ func parse(propertyURL, emojiProperty string, includeGeneralCategory bool) (stri
}
}
// Avoid overflow during binary search.
if len(properties) >= 1<<31 {
return "", errors.New("too many properties")
}
// Sort properties.
sort.Slice(properties, func(i, j int) bool {
left, _ := strconv.ParseUint(properties[i][0], 16, 64)
@ -200,9 +205,9 @@ func parse(propertyURL, emojiProperty string, includeGeneralCategory bool) (stri
// ` + emojiURL + `
// ("Extended_Pictographic" only)`
}
buf.WriteString(`package uniseg
buf.WriteString(`// Code generated via go generate from gen_properties.go. DO NOT EDIT.
// Code generated via go generate from gen_properties.go. DO NOT EDIT.
package uniseg
// ` + os.Args[3] + ` are taken from
// ` + propertyURL + emojiComment + `

View file

@ -222,7 +222,7 @@ func FirstGraphemeCluster(b []byte, state int) (cluster, rest []byte, width, new
if len(b) <= length { // If we're already past the end, there is nothing else to parse.
var prop int
if state < 0 {
prop = property(graphemeCodePoints, r)
prop = propertyGraphemes(r)
} else {
prop = state >> shiftGraphemePropState
}
@ -284,7 +284,7 @@ func FirstGraphemeClusterInString(str string, state int) (cluster, rest string,
if len(str) <= length { // If we're already past the end, there is nothing else to parse.
var prop int
if state < 0 {
prop = property(graphemeCodePoints, r)
prop = propertyGraphemes(r)
} else {
prop = state >> shiftGraphemePropState
}

View file

@ -1,13 +1,13 @@
package uniseg
// Code generated via go generate from gen_properties.go. DO NOT EDIT.
package uniseg
// graphemeCodePoints are taken from
// https://www.unicode.org/Public/14.0.0/ucd/auxiliary/GraphemeBreakProperty.txt
// https://www.unicode.org/Public/15.0.0/ucd/auxiliary/GraphemeBreakProperty.txt
// and
// https://unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt
// https://unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt
// ("Extended_Pictographic" only)
// on September 10, 2022. See https://www.unicode.org/license.html for the Unicode
// on September 5, 2023. See https://www.unicode.org/license.html for the Unicode
// license agreement.
var graphemeCodePoints = [][3]int{
{0x0000, 0x0009, prControl}, // Cc [10] <control-0000>..<control-0009>
@ -143,6 +143,7 @@ var graphemeCodePoints = [][3]int{
{0x0CCC, 0x0CCD, prExtend}, // Mn [2] KANNADA VOWEL SIGN AU..KANNADA SIGN VIRAMA
{0x0CD5, 0x0CD6, prExtend}, // Mc [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK
{0x0CE2, 0x0CE3, prExtend}, // Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL
{0x0CF3, 0x0CF3, prSpacingMark}, // Mc KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT
{0x0D00, 0x0D01, prExtend}, // Mn [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU
{0x0D02, 0x0D03, prSpacingMark}, // Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
{0x0D3B, 0x0D3C, prExtend}, // Mn [2] MALAYALAM SIGN VERTICAL BAR VIRAMA..MALAYALAM SIGN CIRCULAR VIRAMA
@ -172,7 +173,7 @@ var graphemeCodePoints = [][3]int{
{0x0EB1, 0x0EB1, prExtend}, // Mn LAO VOWEL SIGN MAI KAN
{0x0EB3, 0x0EB3, prSpacingMark}, // Lo LAO VOWEL SIGN AM
{0x0EB4, 0x0EBC, prExtend}, // Mn [9] LAO VOWEL SIGN I..LAO SEMIVOWEL SIGN LO
{0x0EC8, 0x0ECD, prExtend}, // Mn [6] LAO TONE MAI EK..LAO NIGGAHITA
{0x0EC8, 0x0ECE, prExtend}, // Mn [7] LAO TONE MAI EK..LAO YAMAKKAN
{0x0F18, 0x0F19, prExtend}, // Mn [2] TIBETAN ASTROLOGICAL SIGN -KHYUD PA..TIBETAN ASTROLOGICAL SIGN SDONG TSHUGS
{0x0F35, 0x0F35, prExtend}, // Mn TIBETAN MARK NGAS BZUNG NYI ZLA
{0x0F37, 0x0F37, prExtend}, // Mn TIBETAN MARK NGAS BZUNG SGOR RTAGS
@ -1336,6 +1337,7 @@ var graphemeCodePoints = [][3]int{
{0x10AE5, 0x10AE6, prExtend}, // Mn [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW
{0x10D24, 0x10D27, prExtend}, // Mn [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI
{0x10EAB, 0x10EAC, prExtend}, // Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
{0x10EFD, 0x10EFF, prExtend}, // Mn [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA
{0x10F46, 0x10F50, prExtend}, // Mn [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW
{0x10F82, 0x10F85, prExtend}, // Mn [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW
{0x11000, 0x11000, prSpacingMark}, // Mc BRAHMI SIGN CANDRABINDU
@ -1375,6 +1377,7 @@ var graphemeCodePoints = [][3]int{
{0x11235, 0x11235, prSpacingMark}, // Mc KHOJKI SIGN VIRAMA
{0x11236, 0x11237, prExtend}, // Mn [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA
{0x1123E, 0x1123E, prExtend}, // Mn KHOJKI SIGN SUKUN
{0x11241, 0x11241, prExtend}, // Mn KHOJKI VOWEL SIGN VOCALIC R
{0x112DF, 0x112DF, prExtend}, // Mn KHUDAWADI SIGN ANUSVARA
{0x112E0, 0x112E2, prSpacingMark}, // Mc [3] KHUDAWADI VOWEL SIGN AA..KHUDAWADI VOWEL SIGN II
{0x112E3, 0x112EA, prExtend}, // Mn [8] KHUDAWADI VOWEL SIGN U..KHUDAWADI SIGN VIRAMA
@ -1494,7 +1497,18 @@ var graphemeCodePoints = [][3]int{
{0x11D97, 0x11D97, prExtend}, // Mn GUNJALA GONDI VIRAMA
{0x11EF3, 0x11EF4, prExtend}, // Mn [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U
{0x11EF5, 0x11EF6, prSpacingMark}, // Mc [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O
{0x13430, 0x13438, prControl}, // Cf [9] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END SEGMENT
{0x11F00, 0x11F01, prExtend}, // Mn [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA
{0x11F02, 0x11F02, prPrepend}, // Lo KAWI SIGN REPHA
{0x11F03, 0x11F03, prSpacingMark}, // Mc KAWI SIGN VISARGA
{0x11F34, 0x11F35, prSpacingMark}, // Mc [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA
{0x11F36, 0x11F3A, prExtend}, // Mn [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R
{0x11F3E, 0x11F3F, prSpacingMark}, // Mc [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI
{0x11F40, 0x11F40, prExtend}, // Mn KAWI VOWEL SIGN EU
{0x11F41, 0x11F41, prSpacingMark}, // Mc KAWI SIGN KILLER
{0x11F42, 0x11F42, prExtend}, // Mn KAWI CONJOINER
{0x13430, 0x1343F, prControl}, // Cf [16] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE
{0x13440, 0x13440, prExtend}, // Mn EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY
{0x13447, 0x13455, prExtend}, // Mn [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED
{0x16AF0, 0x16AF4, prExtend}, // Mn [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE
{0x16B30, 0x16B36, prExtend}, // Mn [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM
{0x16F4F, 0x16F4F, prExtend}, // Mn MIAO SIGN CONSONANT MODIFIER BAR
@ -1527,9 +1541,11 @@ var graphemeCodePoints = [][3]int{
{0x1E01B, 0x1E021, prExtend}, // Mn [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI
{0x1E023, 0x1E024, prExtend}, // Mn [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
{0x1E026, 0x1E02A, prExtend}, // Mn [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
{0x1E08F, 0x1E08F, prExtend}, // Mn COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
{0x1E130, 0x1E136, prExtend}, // Mn [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D
{0x1E2AE, 0x1E2AE, prExtend}, // Mn TOTO SIGN RISING TONE
{0x1E2EC, 0x1E2EF, prExtend}, // Mn [4] WANCHO TONE TUP..WANCHO TONE KOINI
{0x1E4EC, 0x1E4EF, prExtend}, // Mn [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH
{0x1E8D0, 0x1E8D6, prExtend}, // Mn [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS
{0x1E944, 0x1E94A, prExtend}, // Mn [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA
{0x1F000, 0x1F003, prExtendedPictographic}, // E0.0 [4] (🀀..🀃) MAHJONG TILE EAST WIND..MAHJONG TILE NORTH WIND
@ -1780,7 +1796,8 @@ var graphemeCodePoints = [][3]int{
{0x1F6D3, 0x1F6D4, prExtendedPictographic}, // E0.0 [2] (🛓..🛔) STUPA..PAGODA
{0x1F6D5, 0x1F6D5, prExtendedPictographic}, // E12.0 [1] (🛕) hindu temple
{0x1F6D6, 0x1F6D7, prExtendedPictographic}, // E13.0 [2] (🛖..🛗) hut..elevator
{0x1F6D8, 0x1F6DC, prExtendedPictographic}, // E0.0 [5] (🛘..🛜) <reserved-1F6D8>..<reserved-1F6DC>
{0x1F6D8, 0x1F6DB, prExtendedPictographic}, // E0.0 [4] (🛘..🛛) <reserved-1F6D8>..<reserved-1F6DB>
{0x1F6DC, 0x1F6DC, prExtendedPictographic}, // E15.0 [1] (🛜) wireless
{0x1F6DD, 0x1F6DF, prExtendedPictographic}, // E14.0 [3] (🛝..🛟) playground slide..ring buoy
{0x1F6E0, 0x1F6E5, prExtendedPictographic}, // E0.7 [6] (🛠️..🛥️) hammer and wrench..motor boat
{0x1F6E6, 0x1F6E8, prExtendedPictographic}, // E0.0 [3] (🛦..🛨) UP-POINTING MILITARY AIRPLANE..UP-POINTING SMALL AIRPLANE
@ -1797,7 +1814,7 @@ var graphemeCodePoints = [][3]int{
{0x1F6FA, 0x1F6FA, prExtendedPictographic}, // E12.0 [1] (🛺) auto rickshaw
{0x1F6FB, 0x1F6FC, prExtendedPictographic}, // E13.0 [2] (🛻..🛼) pickup truck..roller skate
{0x1F6FD, 0x1F6FF, prExtendedPictographic}, // E0.0 [3] (🛽..🛿) <reserved-1F6FD>..<reserved-1F6FF>
{0x1F774, 0x1F77F, prExtendedPictographic}, // E0.0 [12] (🝴..🝿) <reserved-1F774>..<reserved-1F77F>
{0x1F774, 0x1F77F, prExtendedPictographic}, // E0.0 [12] (🝴..🝿) LOT OF FORTUNE..ORCUS
{0x1F7D5, 0x1F7DF, prExtendedPictographic}, // E0.0 [11] (🟕..🟟) CIRCLED TRIANGLE..<reserved-1F7DF>
{0x1F7E0, 0x1F7EB, prExtendedPictographic}, // E12.0 [12] (🟠..🟫) orange circle..brown square
{0x1F7EC, 0x1F7EF, prExtendedPictographic}, // E0.0 [4] (🟬..🟯) <reserved-1F7EC>..<reserved-1F7EF>
@ -1856,30 +1873,37 @@ var graphemeCodePoints = [][3]int{
{0x1FA00, 0x1FA6F, prExtendedPictographic}, // E0.0 [112] (🨀..🩯) NEUTRAL CHESS KING..<reserved-1FA6F>
{0x1FA70, 0x1FA73, prExtendedPictographic}, // E12.0 [4] (🩰..🩳) ballet shoes..shorts
{0x1FA74, 0x1FA74, prExtendedPictographic}, // E13.0 [1] (🩴) thong sandal
{0x1FA75, 0x1FA77, prExtendedPictographic}, // E0.0 [3] (🩵..🩷) <reserved-1FA75>..<reserved-1FA77>
{0x1FA75, 0x1FA77, prExtendedPictographic}, // E15.0 [3] (🩵..🩷) light blue heart..pink heart
{0x1FA78, 0x1FA7A, prExtendedPictographic}, // E12.0 [3] (🩸..🩺) drop of blood..stethoscope
{0x1FA7B, 0x1FA7C, prExtendedPictographic}, // E14.0 [2] (🩻..🩼) x-ray..crutch
{0x1FA7D, 0x1FA7F, prExtendedPictographic}, // E0.0 [3] (🩽..🩿) <reserved-1FA7D>..<reserved-1FA7F>
{0x1FA80, 0x1FA82, prExtendedPictographic}, // E12.0 [3] (🪀..🪂) yo-yo..parachute
{0x1FA83, 0x1FA86, prExtendedPictographic}, // E13.0 [4] (🪃..🪆) boomerang..nesting dolls
{0x1FA87, 0x1FA8F, prExtendedPictographic}, // E0.0 [9] (🪇..🪏) <reserved-1FA87>..<reserved-1FA8F>
{0x1FA87, 0x1FA88, prExtendedPictographic}, // E15.0 [2] (🪇..🪈) maracas..flute
{0x1FA89, 0x1FA8F, prExtendedPictographic}, // E0.0 [7] (🪉..🪏) <reserved-1FA89>..<reserved-1FA8F>
{0x1FA90, 0x1FA95, prExtendedPictographic}, // E12.0 [6] (🪐..🪕) ringed planet..banjo
{0x1FA96, 0x1FAA8, prExtendedPictographic}, // E13.0 [19] (🪖..🪨) military helmet..rock
{0x1FAA9, 0x1FAAC, prExtendedPictographic}, // E14.0 [4] (🪩..🪬) mirror ball..hamsa
{0x1FAAD, 0x1FAAF, prExtendedPictographic}, // E0.0 [3] (🪭..🪯) <reserved-1FAAD>..<reserved-1FAAF>
{0x1FAAD, 0x1FAAF, prExtendedPictographic}, // E15.0 [3] (🪭..🪯) folding hand fan..khanda
{0x1FAB0, 0x1FAB6, prExtendedPictographic}, // E13.0 [7] (🪰..🪶) fly..feather
{0x1FAB7, 0x1FABA, prExtendedPictographic}, // E14.0 [4] (🪷..🪺) lotus..nest with eggs
{0x1FABB, 0x1FABF, prExtendedPictographic}, // E0.0 [5] (🪻..🪿) <reserved-1FABB>..<reserved-1FABF>
{0x1FABB, 0x1FABD, prExtendedPictographic}, // E15.0 [3] (🪻..🪽) hyacinth..wing
{0x1FABE, 0x1FABE, prExtendedPictographic}, // E0.0 [1] (🪾) <reserved-1FABE>
{0x1FABF, 0x1FABF, prExtendedPictographic}, // E15.0 [1] (🪿) goose
{0x1FAC0, 0x1FAC2, prExtendedPictographic}, // E13.0 [3] (🫀..🫂) anatomical heart..people hugging
{0x1FAC3, 0x1FAC5, prExtendedPictographic}, // E14.0 [3] (🫃..🫅) pregnant man..person with crown
{0x1FAC6, 0x1FACF, prExtendedPictographic}, // E0.0 [10] (🫆..🫏) <reserved-1FAC6>..<reserved-1FACF>
{0x1FAC6, 0x1FACD, prExtendedPictographic}, // E0.0 [8] (🫆..🫍) <reserved-1FAC6>..<reserved-1FACD>
{0x1FACE, 0x1FACF, prExtendedPictographic}, // E15.0 [2] (🫎..🫏) moose..donkey
{0x1FAD0, 0x1FAD6, prExtendedPictographic}, // E13.0 [7] (🫐..🫖) blueberries..teapot
{0x1FAD7, 0x1FAD9, prExtendedPictographic}, // E14.0 [3] (🫗..🫙) pouring liquid..jar
{0x1FADA, 0x1FADF, prExtendedPictographic}, // E0.0 [6] (🫚..🫟) <reserved-1FADA>..<reserved-1FADF>
{0x1FADA, 0x1FADB, prExtendedPictographic}, // E15.0 [2] (🫚..🫛) ginger root..pea pod
{0x1FADC, 0x1FADF, prExtendedPictographic}, // E0.0 [4] (🫜..🫟) <reserved-1FADC>..<reserved-1FADF>
{0x1FAE0, 0x1FAE7, prExtendedPictographic}, // E14.0 [8] (🫠..🫧) melting face..bubbles
{0x1FAE8, 0x1FAEF, prExtendedPictographic}, // E0.0 [8] (🫨..🫯) <reserved-1FAE8>..<reserved-1FAEF>
{0x1FAE8, 0x1FAE8, prExtendedPictographic}, // E15.0 [1] (🫨) shaking face
{0x1FAE9, 0x1FAEF, prExtendedPictographic}, // E0.0 [7] (🫩..🫯) <reserved-1FAE9>..<reserved-1FAEF>
{0x1FAF0, 0x1FAF6, prExtendedPictographic}, // E14.0 [7] (🫰..🫶) hand with index finger and thumb crossed..heart hands
{0x1FAF7, 0x1FAFF, prExtendedPictographic}, // E0.0 [9] (🫷..🫿) <reserved-1FAF7>..<reserved-1FAFF>
{0x1FAF7, 0x1FAF8, prExtendedPictographic}, // E15.0 [2] (🫷..🫸) leftwards pushing hand..rightwards pushing hand
{0x1FAF9, 0x1FAFF, prExtendedPictographic}, // E0.0 [7] (🫹..🫿) <reserved-1FAF9>..<reserved-1FAFF>
{0x1FC00, 0x1FFFD, prExtendedPictographic}, // E0.0[1022] (🰀..🿽) <reserved-1FC00>..<reserved-1FFFD>
{0xE0000, 0xE0000, prControl}, // Cn <reserved-E0000>
{0xE0001, 0xE0001, prControl}, // Cf LANGUAGE TAG

View file

@ -21,11 +21,12 @@ const (
grBoundary
)
// The grapheme cluster parser's state transitions. Maps (state, property) to
// (new state, breaking instruction, rule number). The breaking instruction
// always refers to the boundary between the last and next code point.
// grTransitions implements the grapheme cluster parser's state transitions.
// Maps state and property to a new state, a breaking instruction, and rule
// number. The breaking instruction always refers to the boundary between the
// last and next code point. Returns negative values if no transition is found.
//
// This map is queried as follows:
// This function is used as follows:
//
// 1. Find specific state + specific property. Stop if found.
// 2. Find specific state + any property.
@ -36,59 +37,96 @@ const (
// are equal. Stop.
// 6. Assume grAny and grBoundary.
//
// Unicode version 14.0.0.
var grTransitions = map[[2]int][3]int{
// Unicode version 15.0.0.
func grTransitions(state, prop int) (newState int, newProp int, boundary int) {
// It turns out that using a big switch statement is much faster than using
// a map.
switch uint64(state) | uint64(prop)<<32 {
// GB5
{grAny, prCR}: {grCR, grBoundary, 50},
{grAny, prLF}: {grControlLF, grBoundary, 50},
{grAny, prControl}: {grControlLF, grBoundary, 50},
case grAny | prCR<<32:
return grCR, grBoundary, 50
case grAny | prLF<<32:
return grControlLF, grBoundary, 50
case grAny | prControl<<32:
return grControlLF, grBoundary, 50
// GB4
{grCR, prAny}: {grAny, grBoundary, 40},
{grControlLF, prAny}: {grAny, grBoundary, 40},
case grCR | prAny<<32:
return grAny, grBoundary, 40
case grControlLF | prAny<<32:
return grAny, grBoundary, 40
// GB3.
{grCR, prLF}: {grControlLF, grNoBoundary, 30},
// GB3
case grCR | prLF<<32:
return grControlLF, grNoBoundary, 30
// GB6.
{grAny, prL}: {grL, grBoundary, 9990},
{grL, prL}: {grL, grNoBoundary, 60},
{grL, prV}: {grLVV, grNoBoundary, 60},
{grL, prLV}: {grLVV, grNoBoundary, 60},
{grL, prLVT}: {grLVTT, grNoBoundary, 60},
// GB6
case grAny | prL<<32:
return grL, grBoundary, 9990
case grL | prL<<32:
return grL, grNoBoundary, 60
case grL | prV<<32:
return grLVV, grNoBoundary, 60
case grL | prLV<<32:
return grLVV, grNoBoundary, 60
case grL | prLVT<<32:
return grLVTT, grNoBoundary, 60
// GB7.
{grAny, prLV}: {grLVV, grBoundary, 9990},
{grAny, prV}: {grLVV, grBoundary, 9990},
{grLVV, prV}: {grLVV, grNoBoundary, 70},
{grLVV, prT}: {grLVTT, grNoBoundary, 70},
// GB7
case grAny | prLV<<32:
return grLVV, grBoundary, 9990
case grAny | prV<<32:
return grLVV, grBoundary, 9990
case grLVV | prV<<32:
return grLVV, grNoBoundary, 70
case grLVV | prT<<32:
return grLVTT, grNoBoundary, 70
// GB8.
{grAny, prLVT}: {grLVTT, grBoundary, 9990},
{grAny, prT}: {grLVTT, grBoundary, 9990},
{grLVTT, prT}: {grLVTT, grNoBoundary, 80},
// GB8
case grAny | prLVT<<32:
return grLVTT, grBoundary, 9990
case grAny | prT<<32:
return grLVTT, grBoundary, 9990
case grLVTT | prT<<32:
return grLVTT, grNoBoundary, 80
// GB9.
{grAny, prExtend}: {grAny, grNoBoundary, 90},
{grAny, prZWJ}: {grAny, grNoBoundary, 90},
// GB9
case grAny | prExtend<<32:
return grAny, grNoBoundary, 90
case grAny | prZWJ<<32:
return grAny, grNoBoundary, 90
// GB9a.
{grAny, prSpacingMark}: {grAny, grNoBoundary, 91},
// GB9a
case grAny | prSpacingMark<<32:
return grAny, grNoBoundary, 91
// GB9b.
{grAny, prPrepend}: {grPrepend, grBoundary, 9990},
{grPrepend, prAny}: {grAny, grNoBoundary, 92},
// GB9b
case grAny | prPrepend<<32:
return grPrepend, grBoundary, 9990
case grPrepend | prAny<<32:
return grAny, grNoBoundary, 92
// GB11.
{grAny, prExtendedPictographic}: {grExtendedPictographic, grBoundary, 9990},
{grExtendedPictographic, prExtend}: {grExtendedPictographic, grNoBoundary, 110},
{grExtendedPictographic, prZWJ}: {grExtendedPictographicZWJ, grNoBoundary, 110},
{grExtendedPictographicZWJ, prExtendedPictographic}: {grExtendedPictographic, grNoBoundary, 110},
// GB11
case grAny | prExtendedPictographic<<32:
return grExtendedPictographic, grBoundary, 9990
case grExtendedPictographic | prExtend<<32:
return grExtendedPictographic, grNoBoundary, 110
case grExtendedPictographic | prZWJ<<32:
return grExtendedPictographicZWJ, grNoBoundary, 110
case grExtendedPictographicZWJ | prExtendedPictographic<<32:
return grExtendedPictographic, grNoBoundary, 110
// GB12 / GB13.
{grAny, prRegionalIndicator}: {grRIOdd, grBoundary, 9990},
{grRIOdd, prRegionalIndicator}: {grRIEven, grNoBoundary, 120},
{grRIEven, prRegionalIndicator}: {grRIOdd, grBoundary, 120},
// GB12 / GB13
case grAny | prRegionalIndicator<<32:
return grRIOdd, grBoundary, 9990
case grRIOdd | prRegionalIndicator<<32:
return grRIEven, grNoBoundary, 120
case grRIEven | prRegionalIndicator<<32:
return grRIOdd, grBoundary, 120
default:
return -1, -1, -1
}
}
// transitionGraphemeState determines the new state of the grapheme cluster
@ -97,40 +135,40 @@ var grTransitions = map[[2]int][3]int{
// table) and whether a cluster boundary was detected.
func transitionGraphemeState(state int, r rune) (newState, prop int, boundary bool) {
// Determine the property of the next character.
prop = property(graphemeCodePoints, r)
prop = propertyGraphemes(r)
// Find the applicable transition.
transition, ok := grTransitions[[2]int{state, prop}]
if ok {
nextState, nextProp, _ := grTransitions(state, prop)
if nextState >= 0 {
// We have a specific transition. We'll use it.
return transition[0], prop, transition[1] == grBoundary
return nextState, prop, nextProp == grBoundary
}
// No specific transition found. Try the less specific ones.
transAnyProp, okAnyProp := grTransitions[[2]int{state, prAny}]
transAnyState, okAnyState := grTransitions[[2]int{grAny, prop}]
if okAnyProp && okAnyState {
anyPropState, anyPropProp, anyPropRule := grTransitions(state, prAny)
anyStateState, anyStateProp, anyStateRule := grTransitions(grAny, prop)
if anyPropState >= 0 && anyStateState >= 0 {
// Both apply. We'll use a mix (see comments for grTransitions).
newState = transAnyState[0]
boundary = transAnyState[1] == grBoundary
if transAnyProp[2] < transAnyState[2] {
boundary = transAnyProp[1] == grBoundary
newState = anyStateState
boundary = anyStateProp == grBoundary
if anyPropRule < anyStateRule {
boundary = anyPropProp == grBoundary
}
return
}
if okAnyProp {
if anyPropState >= 0 {
// We only have a specific state.
return transAnyProp[0], prop, transAnyProp[1] == grBoundary
return anyPropState, prop, anyPropProp == grBoundary
// This branch will probably never be reached because anyStateState will
// always be non-negative given the current transitions. But we keep it here
// for future modifications to grTransitions where this may not be true
// anymore.
}
if okAnyState {
if anyStateState >= 0 {
// We only have a specific property.
return transAnyState[0], prop, transAnyState[1] == grBoundary
return anyStateState, prop, anyStateProp == grBoundary
}
// No known transition. GB999: Any ÷ Any.

View file

@ -80,7 +80,7 @@ func FirstLineSegment(b []byte, state int) (segment, rest []byte, mustBreak bool
}
}
// FirstLineSegmentInString is like FirstLineSegment() but its input and outputs
// FirstLineSegmentInString is like [FirstLineSegment] but its input and outputs
// are strings.
func FirstLineSegmentInString(str string, state int) (segment, rest string, mustBreak bool, newState int) {
// An empty string returns nothing.
@ -122,13 +122,13 @@ func FirstLineSegmentInString(str string, state int) (segment, rest string, must
// [UAX #14]: https://www.unicode.org/reports/tr14/#Algorithm
func HasTrailingLineBreak(b []byte) bool {
r, _ := utf8.DecodeLastRune(b)
property, _ := propertyWithGenCat(lineBreakCodePoints, r)
return property == lbBK || property == lbCR || property == lbLF || property == lbNL
property, _ := propertyLineBreak(r)
return property == prBK || property == prCR || property == prLF || property == prNL
}
// HasTrailingLineBreakInString is like [HasTrailingLineBreak] but for a string.
func HasTrailingLineBreakInString(str string) bool {
r, _ := utf8.DecodeLastRuneInString(str)
property, _ := propertyWithGenCat(lineBreakCodePoints, r)
return property == lbBK || property == lbCR || property == lbLF || property == lbNL
property, _ := propertyLineBreak(r)
return property == prBK || property == prCR || property == prLF || property == prNL
}

View file

@ -1,13 +1,13 @@
package uniseg
// Code generated via go generate from gen_properties.go. DO NOT EDIT.
package uniseg
// lineBreakCodePoints are taken from
// https://www.unicode.org/Public/14.0.0/ucd/LineBreak.txt
// https://www.unicode.org/Public/15.0.0/ucd/LineBreak.txt
// and
// https://unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt
// https://unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt
// ("Extended_Pictographic" only)
// on September 10, 2022. See https://www.unicode.org/license.html for the Unicode
// on September 5, 2023. See https://www.unicode.org/license.html for the Unicode
// license agreement.
var lineBreakCodePoints = [][4]int{
{0x0000, 0x0008, prCM, gcCc}, // [9] <control-0000>..<control-0008>
@ -439,6 +439,7 @@ var lineBreakCodePoints = [][4]int{
{0x0CE2, 0x0CE3, prCM, gcMn}, // [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL
{0x0CE6, 0x0CEF, prNU, gcNd}, // [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE
{0x0CF1, 0x0CF2, prAL, gcLo}, // [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA
{0x0CF3, 0x0CF3, prCM, gcMc}, // KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT
{0x0D00, 0x0D01, prCM, gcMn}, // [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU
{0x0D02, 0x0D03, prCM, gcMc}, // [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
{0x0D04, 0x0D0C, prAL, gcLo}, // [9] MALAYALAM LETTER VEDIC ANUSVARA..MALAYALAM LETTER VOCALIC L
@ -500,7 +501,7 @@ var lineBreakCodePoints = [][4]int{
{0x0EBD, 0x0EBD, prSA, gcLo}, // LAO SEMIVOWEL SIGN NYO
{0x0EC0, 0x0EC4, prSA, gcLo}, // [5] LAO VOWEL SIGN E..LAO VOWEL SIGN AI
{0x0EC6, 0x0EC6, prSA, gcLm}, // LAO KO LA
{0x0EC8, 0x0ECD, prSA, gcMn}, // [6] LAO TONE MAI EK..LAO NIGGAHITA
{0x0EC8, 0x0ECE, prSA, gcMn}, // [7] LAO TONE MAI EK..LAO YAMAKKAN
{0x0ED0, 0x0ED9, prNU, gcNd}, // [10] LAO DIGIT ZERO..LAO DIGIT NINE
{0x0EDC, 0x0EDF, prSA, gcLo}, // [4] LAO HO NO..LAO LETTER KHMU NYO
{0x0F00, 0x0F00, prAL, gcLo}, // TIBETAN SYLLABLE OM
@ -813,7 +814,11 @@ var lineBreakCodePoints = [][4]int{
{0x1D79, 0x1D7F, prAL, gcLl}, // [7] LATIN SMALL LETTER INSULAR G..LATIN SMALL LETTER UPSILON WITH STROKE
{0x1D80, 0x1D9A, prAL, gcLl}, // [27] LATIN SMALL LETTER B WITH PALATAL HOOK..LATIN SMALL LETTER EZH WITH RETROFLEX HOOK
{0x1D9B, 0x1DBF, prAL, gcLm}, // [37] MODIFIER LETTER SMALL TURNED ALPHA..MODIFIER LETTER SMALL THETA
{0x1DC0, 0x1DFF, prCM, gcMn}, // [64] COMBINING DOTTED GRAVE ACCENT..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
{0x1DC0, 0x1DCC, prCM, gcMn}, // [13] COMBINING DOTTED GRAVE ACCENT..COMBINING MACRON-BREVE
{0x1DCD, 0x1DCD, prGL, gcMn}, // COMBINING DOUBLE CIRCUMFLEX ABOVE
{0x1DCE, 0x1DFB, prCM, gcMn}, // [46] COMBINING OGONEK ABOVE..COMBINING DELETION MARK
{0x1DFC, 0x1DFC, prGL, gcMn}, // COMBINING DOUBLE INVERTED BREVE BELOW
{0x1DFD, 0x1DFF, prCM, gcMn}, // [3] COMBINING ALMOST EQUAL TO BELOW..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
{0x1E00, 0x1EFF, prAL, gcLC}, // [256] LATIN CAPITAL LETTER A WITH RING BELOW..LATIN SMALL LETTER Y WITH LOOP
{0x1F00, 0x1F15, prAL, gcLC}, // [22] GREEK SMALL LETTER ALPHA WITH PSILI..GREEK SMALL LETTER EPSILON WITH DASIA AND OXIA
{0x1F18, 0x1F1D, prAL, gcLu}, // [6] GREEK CAPITAL LETTER EPSILON WITH PSILI..GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA
@ -889,7 +894,7 @@ var lineBreakCodePoints = [][4]int{
{0x2054, 0x2054, prAL, gcPc}, // INVERTED UNDERTIE
{0x2055, 0x2055, prAL, gcPo}, // FLOWER PUNCTUATION MARK
{0x2056, 0x2056, prBA, gcPo}, // THREE DOT PUNCTUATION
{0x2057, 0x2057, prAL, gcPo}, // QUADRUPLE PRIME
{0x2057, 0x2057, prPO, gcPo}, // QUADRUPLE PRIME
{0x2058, 0x205B, prBA, gcPo}, // [4] FOUR DOT PUNCTUATION..FOUR DOT MARK
{0x205C, 0x205C, prAL, gcPo}, // DOTTED CROSS
{0x205D, 0x205E, prBA, gcPo}, // [2] TRICOLON..VERTICAL FOUR DOTS
@ -2751,6 +2756,7 @@ var lineBreakCodePoints = [][4]int{
{0x10EAB, 0x10EAC, prCM, gcMn}, // [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
{0x10EAD, 0x10EAD, prBA, gcPd}, // YEZIDI HYPHENATION MARK
{0x10EB0, 0x10EB1, prAL, gcLo}, // [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE
{0x10EFD, 0x10EFF, prCM, gcMn}, // [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA
{0x10F00, 0x10F1C, prAL, gcLo}, // [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL
{0x10F1D, 0x10F26, prAL, gcNo}, // [10] OLD SOGDIAN NUMBER ONE..OLD SOGDIAN FRACTION ONE HALF
{0x10F27, 0x10F27, prAL, gcLo}, // OLD SOGDIAN LIGATURE AYIN-DALETH
@ -2840,6 +2846,8 @@ var lineBreakCodePoints = [][4]int{
{0x1123B, 0x1123C, prBA, gcPo}, // [2] KHOJKI SECTION MARK..KHOJKI DOUBLE SECTION MARK
{0x1123D, 0x1123D, prAL, gcPo}, // KHOJKI ABBREVIATION SIGN
{0x1123E, 0x1123E, prCM, gcMn}, // KHOJKI SIGN SUKUN
{0x1123F, 0x11240, prAL, gcLo}, // [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I
{0x11241, 0x11241, prCM, gcMn}, // KHOJKI VOWEL SIGN VOCALIC R
{0x11280, 0x11286, prAL, gcLo}, // [7] MULTANI LETTER A..MULTANI LETTER GA
{0x11288, 0x11288, prAL, gcLo}, // MULTANI LETTER GHA
{0x1128A, 0x1128D, prAL, gcLo}, // [4] MULTANI LETTER CA..MULTANI LETTER JJA
@ -3013,6 +3021,7 @@ var lineBreakCodePoints = [][4]int{
{0x11AA1, 0x11AA2, prBA, gcPo}, // [2] SOYOMBO TERMINAL MARK-1..SOYOMBO TERMINAL MARK-2
{0x11AB0, 0x11ABF, prAL, gcLo}, // [16] CANADIAN SYLLABICS NATTILIK HI..CANADIAN SYLLABICS SPA
{0x11AC0, 0x11AF8, prAL, gcLo}, // [57] PAU CIN HAU LETTER PA..PAU CIN HAU GLOTTAL STOP FINAL
{0x11B00, 0x11B09, prBB, gcPo}, // [10] DEVANAGARI HEAD MARK..DEVANAGARI SIGN MINDU
{0x11C00, 0x11C08, prAL, gcLo}, // [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L
{0x11C0A, 0x11C2E, prAL, gcLo}, // [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA
{0x11C2F, 0x11C2F, prCM, gcMc}, // BHAIKSUKI VOWEL SIGN AA
@ -3059,6 +3068,20 @@ var lineBreakCodePoints = [][4]int{
{0x11EF3, 0x11EF4, prCM, gcMn}, // [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U
{0x11EF5, 0x11EF6, prCM, gcMc}, // [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O
{0x11EF7, 0x11EF8, prAL, gcPo}, // [2] MAKASAR PASSIMBANG..MAKASAR END OF SECTION
{0x11F00, 0x11F01, prCM, gcMn}, // [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA
{0x11F02, 0x11F02, prAL, gcLo}, // KAWI SIGN REPHA
{0x11F03, 0x11F03, prCM, gcMc}, // KAWI SIGN VISARGA
{0x11F04, 0x11F10, prAL, gcLo}, // [13] KAWI LETTER A..KAWI LETTER O
{0x11F12, 0x11F33, prAL, gcLo}, // [34] KAWI LETTER KA..KAWI LETTER JNYA
{0x11F34, 0x11F35, prCM, gcMc}, // [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA
{0x11F36, 0x11F3A, prCM, gcMn}, // [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R
{0x11F3E, 0x11F3F, prCM, gcMc}, // [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI
{0x11F40, 0x11F40, prCM, gcMn}, // KAWI VOWEL SIGN EU
{0x11F41, 0x11F41, prCM, gcMc}, // KAWI SIGN KILLER
{0x11F42, 0x11F42, prCM, gcMn}, // KAWI CONJOINER
{0x11F43, 0x11F44, prBA, gcPo}, // [2] KAWI DANDA..KAWI DOUBLE DANDA
{0x11F45, 0x11F4F, prID, gcPo}, // [11] KAWI PUNCTUATION SECTION MARKER..KAWI PUNCTUATION CLOSING SPIRAL
{0x11F50, 0x11F59, prNU, gcNd}, // [10] KAWI DIGIT ZERO..KAWI DIGIT NINE
{0x11FB0, 0x11FB0, prAL, gcLo}, // LISU LETTER YHA
{0x11FC0, 0x11FD4, prAL, gcNo}, // [21] TAMIL FRACTION ONE THREE-HUNDRED-AND-TWENTIETH..TAMIL FRACTION DOWNSCALING FACTOR KIIZH
{0x11FD5, 0x11FDC, prAL, gcSo}, // [8] TAMIL SIGN NEL..TAMIL SIGN MUKKURUNI
@ -3084,10 +3107,18 @@ var lineBreakCodePoints = [][4]int{
{0x1328A, 0x13378, prAL, gcLo}, // [239] EGYPTIAN HIEROGLYPH O037..EGYPTIAN HIEROGLYPH V011
{0x13379, 0x13379, prOP, gcLo}, // EGYPTIAN HIEROGLYPH V011A
{0x1337A, 0x1337B, prCL, gcLo}, // [2] EGYPTIAN HIEROGLYPH V011B..EGYPTIAN HIEROGLYPH V011C
{0x1337C, 0x1342E, prAL, gcLo}, // [179] EGYPTIAN HIEROGLYPH V012..EGYPTIAN HIEROGLYPH AA032
{0x1337C, 0x1342F, prAL, gcLo}, // [180] EGYPTIAN HIEROGLYPH V012..EGYPTIAN HIEROGLYPH V011D
{0x13430, 0x13436, prGL, gcCf}, // [7] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH OVERLAY MIDDLE
{0x13437, 0x13437, prOP, gcCf}, // EGYPTIAN HIEROGLYPH BEGIN SEGMENT
{0x13438, 0x13438, prCL, gcCf}, // EGYPTIAN HIEROGLYPH END SEGMENT
{0x13439, 0x1343B, prGL, gcCf}, // [3] EGYPTIAN HIEROGLYPH INSERT AT MIDDLE..EGYPTIAN HIEROGLYPH INSERT AT BOTTOM
{0x1343C, 0x1343C, prOP, gcCf}, // EGYPTIAN HIEROGLYPH BEGIN ENCLOSURE
{0x1343D, 0x1343D, prCL, gcCf}, // EGYPTIAN HIEROGLYPH END ENCLOSURE
{0x1343E, 0x1343E, prOP, gcCf}, // EGYPTIAN HIEROGLYPH BEGIN WALLED ENCLOSURE
{0x1343F, 0x1343F, prCL, gcCf}, // EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE
{0x13440, 0x13440, prCM, gcMn}, // EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY
{0x13441, 0x13446, prAL, gcLo}, // [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN
{0x13447, 0x13455, prCM, gcMn}, // [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED
{0x14400, 0x145CD, prAL, gcLo}, // [462] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A409
{0x145CE, 0x145CE, prOP, gcLo}, // ANATOLIAN HIEROGLYPH A410 BEGIN LOGOGRAM MARK
{0x145CF, 0x145CF, prCL, gcLo}, // ANATOLIAN HIEROGLYPH A410A END LOGOGRAM MARK
@ -3137,7 +3168,9 @@ var lineBreakCodePoints = [][4]int{
{0x1AFFD, 0x1AFFE, prAL, gcLm}, // [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8
{0x1B000, 0x1B0FF, prID, gcLo}, // [256] KATAKANA LETTER ARCHAIC E..HENTAIGANA LETTER RE-2
{0x1B100, 0x1B122, prID, gcLo}, // [35] HENTAIGANA LETTER RE-3..KATAKANA LETTER ARCHAIC WU
{0x1B132, 0x1B132, prCJ, gcLo}, // HIRAGANA LETTER SMALL KO
{0x1B150, 0x1B152, prCJ, gcLo}, // [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO
{0x1B155, 0x1B155, prCJ, gcLo}, // KATAKANA LETTER SMALL KO
{0x1B164, 0x1B167, prCJ, gcLo}, // [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N
{0x1B170, 0x1B2FB, prID, gcLo}, // [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB
{0x1BC00, 0x1BC6A, prAL, gcLo}, // [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M
@ -3168,6 +3201,7 @@ var lineBreakCodePoints = [][4]int{
{0x1D200, 0x1D241, prAL, gcSo}, // [66] GREEK VOCAL NOTATION SYMBOL-1..GREEK INSTRUMENTAL NOTATION SYMBOL-54
{0x1D242, 0x1D244, prCM, gcMn}, // [3] COMBINING GREEK MUSICAL TRISEME..COMBINING GREEK MUSICAL PENTASEME
{0x1D245, 0x1D245, prAL, gcSo}, // GREEK MUSICAL LEIMMA
{0x1D2C0, 0x1D2D3, prAL, gcNo}, // [20] KAKTOVIK NUMERAL ZERO..KAKTOVIK NUMERAL NINETEEN
{0x1D2E0, 0x1D2F3, prAL, gcNo}, // [20] MAYAN NUMERAL ZERO..MAYAN NUMERAL NINETEEN
{0x1D300, 0x1D356, prAL, gcSo}, // [87] MONOGRAM FOR EARTH..TETRAGRAM FOR FOSTERING
{0x1D360, 0x1D378, prAL, gcNo}, // [25] COUNTING ROD UNIT DIGIT ONE..TALLY MARK FIVE
@ -3228,11 +3262,14 @@ var lineBreakCodePoints = [][4]int{
{0x1DF00, 0x1DF09, prAL, gcLl}, // [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK
{0x1DF0A, 0x1DF0A, prAL, gcLo}, // LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK
{0x1DF0B, 0x1DF1E, prAL, gcLl}, // [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL
{0x1DF25, 0x1DF2A, prAL, gcLl}, // [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK
{0x1E000, 0x1E006, prCM, gcMn}, // [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE
{0x1E008, 0x1E018, prCM, gcMn}, // [17] COMBINING GLAGOLITIC LETTER ZEMLJA..COMBINING GLAGOLITIC LETTER HERU
{0x1E01B, 0x1E021, prCM, gcMn}, // [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI
{0x1E023, 0x1E024, prCM, gcMn}, // [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
{0x1E026, 0x1E02A, prCM, gcMn}, // [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
{0x1E030, 0x1E06D, prAL, gcLm}, // [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE
{0x1E08F, 0x1E08F, prCM, gcMn}, // COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
{0x1E100, 0x1E12C, prAL, gcLo}, // [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W
{0x1E130, 0x1E136, prCM, gcMn}, // [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D
{0x1E137, 0x1E13D, prAL, gcLm}, // [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER
@ -3245,6 +3282,10 @@ var lineBreakCodePoints = [][4]int{
{0x1E2EC, 0x1E2EF, prCM, gcMn}, // [4] WANCHO TONE TUP..WANCHO TONE KOINI
{0x1E2F0, 0x1E2F9, prNU, gcNd}, // [10] WANCHO DIGIT ZERO..WANCHO DIGIT NINE
{0x1E2FF, 0x1E2FF, prPR, gcSc}, // WANCHO NGUN SIGN
{0x1E4D0, 0x1E4EA, prAL, gcLo}, // [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL
{0x1E4EB, 0x1E4EB, prAL, gcLm}, // NAG MUNDARI SIGN OJOD
{0x1E4EC, 0x1E4EF, prCM, gcMn}, // [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH
{0x1E4F0, 0x1E4F9, prNU, gcNd}, // [10] NAG MUNDARI DIGIT ZERO..NAG MUNDARI DIGIT NINE
{0x1E7E0, 0x1E7E6, prAL, gcLo}, // [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO
{0x1E7E8, 0x1E7EB, prAL, gcLo}, // [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE
{0x1E7ED, 0x1E7EE, prAL, gcLo}, // [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE
@ -3412,16 +3453,18 @@ var lineBreakCodePoints = [][4]int{
{0x1F6C1, 0x1F6CB, prID, gcSo}, // [11] BATHTUB..COUCH AND LAMP
{0x1F6CC, 0x1F6CC, prEB, gcSo}, // SLEEPING ACCOMMODATION
{0x1F6CD, 0x1F6D7, prID, gcSo}, // [11] SHOPPING BAGS..ELEVATOR
{0x1F6D8, 0x1F6DC, prID, gcCn}, // [5] <reserved-1F6D8>..<reserved-1F6DC>
{0x1F6DD, 0x1F6EC, prID, gcSo}, // [16] PLAYGROUND SLIDE..AIRPLANE ARRIVING
{0x1F6D8, 0x1F6DB, prID, gcCn}, // [4] <reserved-1F6D8>..<reserved-1F6DB>
{0x1F6DC, 0x1F6EC, prID, gcSo}, // [17] WIRELESS..AIRPLANE ARRIVING
{0x1F6ED, 0x1F6EF, prID, gcCn}, // [3] <reserved-1F6ED>..<reserved-1F6EF>
{0x1F6F0, 0x1F6FC, prID, gcSo}, // [13] SATELLITE..ROLLER SKATE
{0x1F6FD, 0x1F6FF, prID, gcCn}, // [3] <reserved-1F6FD>..<reserved-1F6FF>
{0x1F700, 0x1F773, prAL, gcSo}, // [116] ALCHEMICAL SYMBOL FOR QUINTESSENCE..ALCHEMICAL SYMBOL FOR HALF OUNCE
{0x1F774, 0x1F77F, prID, gcCn}, // [12] <reserved-1F774>..<reserved-1F77F>
{0x1F774, 0x1F776, prID, gcSo}, // [3] LOT OF FORTUNE..LUNAR ECLIPSE
{0x1F777, 0x1F77A, prID, gcCn}, // [4] <reserved-1F777>..<reserved-1F77A>
{0x1F77B, 0x1F77F, prID, gcSo}, // [5] HAUMEA..ORCUS
{0x1F780, 0x1F7D4, prAL, gcSo}, // [85] BLACK LEFT-POINTING ISOSCELES RIGHT TRIANGLE..HEAVY TWELVE POINTED PINWHEEL STAR
{0x1F7D5, 0x1F7D8, prID, gcSo}, // [4] CIRCLED TRIANGLE..NEGATIVE CIRCLED SQUARE
{0x1F7D9, 0x1F7DF, prID, gcCn}, // [7] <reserved-1F7D9>..<reserved-1F7DF>
{0x1F7D5, 0x1F7D9, prID, gcSo}, // [5] CIRCLED TRIANGLE..NINE POINTED WHITE STAR
{0x1F7DA, 0x1F7DF, prID, gcCn}, // [6] <reserved-1F7DA>..<reserved-1F7DF>
{0x1F7E0, 0x1F7EB, prID, gcSo}, // [12] LARGE ORANGE CIRCLE..LARGE BROWN SQUARE
{0x1F7EC, 0x1F7EF, prID, gcCn}, // [4] <reserved-1F7EC>..<reserved-1F7EF>
{0x1F7F0, 0x1F7F0, prID, gcSo}, // HEAVY EQUALS SIGN
@ -3467,33 +3510,29 @@ var lineBreakCodePoints = [][4]int{
{0x1FA54, 0x1FA5F, prID, gcCn}, // [12] <reserved-1FA54>..<reserved-1FA5F>
{0x1FA60, 0x1FA6D, prID, gcSo}, // [14] XIANGQI RED GENERAL..XIANGQI BLACK SOLDIER
{0x1FA6E, 0x1FA6F, prID, gcCn}, // [2] <reserved-1FA6E>..<reserved-1FA6F>
{0x1FA70, 0x1FA74, prID, gcSo}, // [5] BALLET SHOES..THONG SANDAL
{0x1FA75, 0x1FA77, prID, gcCn}, // [3] <reserved-1FA75>..<reserved-1FA77>
{0x1FA78, 0x1FA7C, prID, gcSo}, // [5] DROP OF BLOOD..CRUTCH
{0x1FA70, 0x1FA7C, prID, gcSo}, // [13] BALLET SHOES..CRUTCH
{0x1FA7D, 0x1FA7F, prID, gcCn}, // [3] <reserved-1FA7D>..<reserved-1FA7F>
{0x1FA80, 0x1FA86, prID, gcSo}, // [7] YO-YO..NESTING DOLLS
{0x1FA87, 0x1FA8F, prID, gcCn}, // [9] <reserved-1FA87>..<reserved-1FA8F>
{0x1FA90, 0x1FAAC, prID, gcSo}, // [29] RINGED PLANET..HAMSA
{0x1FAAD, 0x1FAAF, prID, gcCn}, // [3] <reserved-1FAAD>..<reserved-1FAAF>
{0x1FAB0, 0x1FABA, prID, gcSo}, // [11] FLY..NEST WITH EGGS
{0x1FABB, 0x1FABF, prID, gcCn}, // [5] <reserved-1FABB>..<reserved-1FABF>
{0x1FAC0, 0x1FAC2, prID, gcSo}, // [3] ANATOMICAL HEART..PEOPLE HUGGING
{0x1FA80, 0x1FA88, prID, gcSo}, // [9] YO-YO..FLUTE
{0x1FA89, 0x1FA8F, prID, gcCn}, // [7] <reserved-1FA89>..<reserved-1FA8F>
{0x1FA90, 0x1FABD, prID, gcSo}, // [46] RINGED PLANET..WING
{0x1FABE, 0x1FABE, prID, gcCn}, // <reserved-1FABE>
{0x1FABF, 0x1FAC2, prID, gcSo}, // [4] GOOSE..PEOPLE HUGGING
{0x1FAC3, 0x1FAC5, prEB, gcSo}, // [3] PREGNANT MAN..PERSON WITH CROWN
{0x1FAC6, 0x1FACF, prID, gcCn}, // [10] <reserved-1FAC6>..<reserved-1FACF>
{0x1FAD0, 0x1FAD9, prID, gcSo}, // [10] BLUEBERRIES..JAR
{0x1FADA, 0x1FADF, prID, gcCn}, // [6] <reserved-1FADA>..<reserved-1FADF>
{0x1FAE0, 0x1FAE7, prID, gcSo}, // [8] MELTING FACE..BUBBLES
{0x1FAE8, 0x1FAEF, prID, gcCn}, // [8] <reserved-1FAE8>..<reserved-1FAEF>
{0x1FAF0, 0x1FAF6, prEB, gcSo}, // [7] HAND WITH INDEX FINGER AND THUMB CROSSED..HEART HANDS
{0x1FAF7, 0x1FAFF, prID, gcCn}, // [9] <reserved-1FAF7>..<reserved-1FAFF>
{0x1FAC6, 0x1FACD, prID, gcCn}, // [8] <reserved-1FAC6>..<reserved-1FACD>
{0x1FACE, 0x1FADB, prID, gcSo}, // [14] MOOSE..PEA POD
{0x1FADC, 0x1FADF, prID, gcCn}, // [4] <reserved-1FADC>..<reserved-1FADF>
{0x1FAE0, 0x1FAE8, prID, gcSo}, // [9] MELTING FACE..SHAKING FACE
{0x1FAE9, 0x1FAEF, prID, gcCn}, // [7] <reserved-1FAE9>..<reserved-1FAEF>
{0x1FAF0, 0x1FAF8, prEB, gcSo}, // [9] HAND WITH INDEX FINGER AND THUMB CROSSED..RIGHTWARDS PUSHING HAND
{0x1FAF9, 0x1FAFF, prID, gcCn}, // [7] <reserved-1FAF9>..<reserved-1FAFF>
{0x1FB00, 0x1FB92, prAL, gcSo}, // [147] BLOCK SEXTANT-1..UPPER HALF INVERSE MEDIUM SHADE AND LOWER HALF BLOCK
{0x1FB94, 0x1FBCA, prAL, gcSo}, // [55] LEFT HALF INVERSE MEDIUM SHADE AND RIGHT HALF BLOCK..WHITE UP-POINTING CHEVRON
{0x1FBF0, 0x1FBF9, prNU, gcNd}, // [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE
{0x1FC00, 0x1FFFD, prID, gcCn}, // [1022] <reserved-1FC00>..<reserved-1FFFD>
{0x20000, 0x2A6DF, prID, gcLo}, // [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF
{0x2A6E0, 0x2A6FF, prID, gcCn}, // [32] <reserved-2A6E0>..<reserved-2A6FF>
{0x2A700, 0x2B738, prID, gcLo}, // [4153] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B738
{0x2B739, 0x2B73F, prID, gcCn}, // [7] <reserved-2B739>..<reserved-2B73F>
{0x2A700, 0x2B739, prID, gcLo}, // [4154] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B739
{0x2B73A, 0x2B73F, prID, gcCn}, // [6] <reserved-2B73A>..<reserved-2B73F>
{0x2B740, 0x2B81D, prID, gcLo}, // [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
{0x2B81E, 0x2B81F, prID, gcCn}, // [2] <reserved-2B81E>..<reserved-2B81F>
{0x2B820, 0x2CEA1, prID, gcLo}, // [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
@ -3504,7 +3543,9 @@ var lineBreakCodePoints = [][4]int{
{0x2FA1E, 0x2FA1F, prID, gcCn}, // [2] <reserved-2FA1E>..<reserved-2FA1F>
{0x2FA20, 0x2FFFD, prID, gcCn}, // [1502] <reserved-2FA20>..<reserved-2FFFD>
{0x30000, 0x3134A, prID, gcLo}, // [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A
{0x3134B, 0x3FFFD, prID, gcCn}, // [60595] <reserved-3134B>..<reserved-3FFFD>
{0x3134B, 0x3134F, prID, gcCn}, // [5] <reserved-3134B>..<reserved-3134F>
{0x31350, 0x323AF, prID, gcLo}, // [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
{0x323B0, 0x3FFFD, prID, gcCn}, // [56398] <reserved-323B0>..<reserved-3FFFD>
{0xE0001, 0xE0001, prCM, gcCf}, // LANGUAGE TAG
{0xE0020, 0xE007F, prCM, gcCf}, // [96] TAG SPACE..CANCEL TAG
{0xE0100, 0xE01EF, prCM, gcMn}, // [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256

View file

@ -64,222 +64,381 @@ const (
LineMustBreak // You must break the line here.
)
// The line break parser's state transitions. It's analogous to grTransitions,
// see comments there for details. Unicode version 14.0.0.
var lbTransitions = map[[2]int][3]int{
// lbTransitions implements the line break parser's state transitions. It's
// analogous to [grTransitions], see comments there for details.
//
// Unicode version 15.0.0.
func lbTransitions(state, prop int) (newState, lineBreak, rule int) {
switch uint64(state) | uint64(prop)<<32 {
// LB4.
{lbAny, prBK}: {lbBK, LineCanBreak, 310},
{lbBK, prAny}: {lbAny, LineMustBreak, 40},
case lbBK | prAny<<32:
return lbAny, LineMustBreak, 40
// LB5.
{lbAny, prCR}: {lbCR, LineCanBreak, 310},
{lbAny, prLF}: {lbLF, LineCanBreak, 310},
{lbAny, prNL}: {lbNL, LineCanBreak, 310},
{lbCR, prLF}: {lbLF, LineDontBreak, 50},
{lbCR, prAny}: {lbAny, LineMustBreak, 50},
{lbLF, prAny}: {lbAny, LineMustBreak, 50},
{lbNL, prAny}: {lbAny, LineMustBreak, 50},
case lbCR | prLF<<32:
return lbLF, LineDontBreak, 50
case lbCR | prAny<<32:
return lbAny, LineMustBreak, 50
case lbLF | prAny<<32:
return lbAny, LineMustBreak, 50
case lbNL | prAny<<32:
return lbAny, LineMustBreak, 50
// LB6.
{lbAny, prBK}: {lbBK, LineDontBreak, 60},
{lbAny, prCR}: {lbCR, LineDontBreak, 60},
{lbAny, prLF}: {lbLF, LineDontBreak, 60},
{lbAny, prNL}: {lbNL, LineDontBreak, 60},
case lbAny | prBK<<32:
return lbBK, LineDontBreak, 60
case lbAny | prCR<<32:
return lbCR, LineDontBreak, 60
case lbAny | prLF<<32:
return lbLF, LineDontBreak, 60
case lbAny | prNL<<32:
return lbNL, LineDontBreak, 60
// LB7.
{lbAny, prSP}: {lbSP, LineDontBreak, 70},
{lbAny, prZW}: {lbZW, LineDontBreak, 70},
case lbAny | prSP<<32:
return lbSP, LineDontBreak, 70
case lbAny | prZW<<32:
return lbZW, LineDontBreak, 70
// LB8.
{lbZW, prSP}: {lbZW, LineDontBreak, 70},
{lbZW, prAny}: {lbAny, LineCanBreak, 80},
case lbZW | prSP<<32:
return lbZW, LineDontBreak, 70
case lbZW | prAny<<32:
return lbAny, LineCanBreak, 80
// LB11.
{lbAny, prWJ}: {lbWJ, LineDontBreak, 110},
{lbWJ, prAny}: {lbAny, LineDontBreak, 110},
case lbAny | prWJ<<32:
return lbWJ, LineDontBreak, 110
case lbWJ | prAny<<32:
return lbAny, LineDontBreak, 110
// LB12.
{lbAny, prGL}: {lbGL, LineCanBreak, 310},
{lbGL, prAny}: {lbAny, LineDontBreak, 120},
case lbAny | prGL<<32:
return lbGL, LineCanBreak, 310
case lbGL | prAny<<32:
return lbAny, LineDontBreak, 120
// LB13 (simple transitions).
{lbAny, prCL}: {lbCL, LineCanBreak, 310},
{lbAny, prCP}: {lbCP, LineCanBreak, 310},
{lbAny, prEX}: {lbEX, LineDontBreak, 130},
{lbAny, prIS}: {lbIS, LineCanBreak, 310},
{lbAny, prSY}: {lbSY, LineCanBreak, 310},
case lbAny | prCL<<32:
return lbCL, LineCanBreak, 310
case lbAny | prCP<<32:
return lbCP, LineCanBreak, 310
case lbAny | prEX<<32:
return lbEX, LineDontBreak, 130
case lbAny | prIS<<32:
return lbIS, LineCanBreak, 310
case lbAny | prSY<<32:
return lbSY, LineCanBreak, 310
// LB14.
{lbAny, prOP}: {lbOP, LineCanBreak, 310},
{lbOP, prSP}: {lbOP, LineDontBreak, 70},
{lbOP, prAny}: {lbAny, LineDontBreak, 140},
case lbAny | prOP<<32:
return lbOP, LineCanBreak, 310
case lbOP | prSP<<32:
return lbOP, LineDontBreak, 70
case lbOP | prAny<<32:
return lbAny, LineDontBreak, 140
// LB15.
{lbQU, prSP}: {lbQUSP, LineDontBreak, 70},
{lbQU, prOP}: {lbOP, LineDontBreak, 150},
{lbQUSP, prOP}: {lbOP, LineDontBreak, 150},
case lbQU | prSP<<32:
return lbQUSP, LineDontBreak, 70
case lbQU | prOP<<32:
return lbOP, LineDontBreak, 150
case lbQUSP | prOP<<32:
return lbOP, LineDontBreak, 150
// LB16.
{lbCL, prSP}: {lbCLCPSP, LineDontBreak, 70},
{lbNUCL, prSP}: {lbCLCPSP, LineDontBreak, 70},
{lbCP, prSP}: {lbCLCPSP, LineDontBreak, 70},
{lbNUCP, prSP}: {lbCLCPSP, LineDontBreak, 70},
{lbCL, prNS}: {lbNS, LineDontBreak, 160},
{lbNUCL, prNS}: {lbNS, LineDontBreak, 160},
{lbCP, prNS}: {lbNS, LineDontBreak, 160},
{lbNUCP, prNS}: {lbNS, LineDontBreak, 160},
{lbCLCPSP, prNS}: {lbNS, LineDontBreak, 160},
case lbCL | prSP<<32:
return lbCLCPSP, LineDontBreak, 70
case lbNUCL | prSP<<32:
return lbCLCPSP, LineDontBreak, 70
case lbCP | prSP<<32:
return lbCLCPSP, LineDontBreak, 70
case lbNUCP | prSP<<32:
return lbCLCPSP, LineDontBreak, 70
case lbCL | prNS<<32:
return lbNS, LineDontBreak, 160
case lbNUCL | prNS<<32:
return lbNS, LineDontBreak, 160
case lbCP | prNS<<32:
return lbNS, LineDontBreak, 160
case lbNUCP | prNS<<32:
return lbNS, LineDontBreak, 160
case lbCLCPSP | prNS<<32:
return lbNS, LineDontBreak, 160
// LB17.
{lbAny, prB2}: {lbB2, LineCanBreak, 310},
{lbB2, prSP}: {lbB2SP, LineDontBreak, 70},
{lbB2, prB2}: {lbB2, LineDontBreak, 170},
{lbB2SP, prB2}: {lbB2, LineDontBreak, 170},
case lbAny | prB2<<32:
return lbB2, LineCanBreak, 310
case lbB2 | prSP<<32:
return lbB2SP, LineDontBreak, 70
case lbB2 | prB2<<32:
return lbB2, LineDontBreak, 170
case lbB2SP | prB2<<32:
return lbB2, LineDontBreak, 170
// LB18.
{lbSP, prAny}: {lbAny, LineCanBreak, 180},
{lbQUSP, prAny}: {lbAny, LineCanBreak, 180},
{lbCLCPSP, prAny}: {lbAny, LineCanBreak, 180},
{lbB2SP, prAny}: {lbAny, LineCanBreak, 180},
case lbSP | prAny<<32:
return lbAny, LineCanBreak, 180
case lbQUSP | prAny<<32:
return lbAny, LineCanBreak, 180
case lbCLCPSP | prAny<<32:
return lbAny, LineCanBreak, 180
case lbB2SP | prAny<<32:
return lbAny, LineCanBreak, 180
// LB19.
{lbAny, prQU}: {lbQU, LineDontBreak, 190},
{lbQU, prAny}: {lbAny, LineDontBreak, 190},
case lbAny | prQU<<32:
return lbQU, LineDontBreak, 190
case lbQU | prAny<<32:
return lbAny, LineDontBreak, 190
// LB20.
{lbAny, prCB}: {lbCB, LineCanBreak, 200},
{lbCB, prAny}: {lbAny, LineCanBreak, 200},
case lbAny | prCB<<32:
return lbCB, LineCanBreak, 200
case lbCB | prAny<<32:
return lbAny, LineCanBreak, 200
// LB21.
{lbAny, prBA}: {lbBA, LineDontBreak, 210},
{lbAny, prHY}: {lbHY, LineDontBreak, 210},
{lbAny, prNS}: {lbNS, LineDontBreak, 210},
{lbAny, prBB}: {lbBB, LineCanBreak, 310},
{lbBB, prAny}: {lbAny, LineDontBreak, 210},
case lbAny | prBA<<32:
return lbBA, LineDontBreak, 210
case lbAny | prHY<<32:
return lbHY, LineDontBreak, 210
case lbAny | prNS<<32:
return lbNS, LineDontBreak, 210
case lbAny | prBB<<32:
return lbBB, LineCanBreak, 310
case lbBB | prAny<<32:
return lbAny, LineDontBreak, 210
// LB21a.
{lbAny, prHL}: {lbHL, LineCanBreak, 310},
{lbHL, prHY}: {lbLB21a, LineDontBreak, 210},
{lbHL, prBA}: {lbLB21a, LineDontBreak, 210},
{lbLB21a, prAny}: {lbAny, LineDontBreak, 211},
case lbAny | prHL<<32:
return lbHL, LineCanBreak, 310
case lbHL | prHY<<32:
return lbLB21a, LineDontBreak, 210
case lbHL | prBA<<32:
return lbLB21a, LineDontBreak, 210
case lbLB21a | prAny<<32:
return lbAny, LineDontBreak, 211
// LB21b.
{lbSY, prHL}: {lbHL, LineDontBreak, 212},
{lbNUSY, prHL}: {lbHL, LineDontBreak, 212},
case lbSY | prHL<<32:
return lbHL, LineDontBreak, 212
case lbNUSY | prHL<<32:
return lbHL, LineDontBreak, 212
// LB22.
{lbAny, prIN}: {lbAny, LineDontBreak, 220},
case lbAny | prIN<<32:
return lbAny, LineDontBreak, 220
// LB23.
{lbAny, prAL}: {lbAL, LineCanBreak, 310},
{lbAny, prNU}: {lbNU, LineCanBreak, 310},
{lbAL, prNU}: {lbNU, LineDontBreak, 230},
{lbHL, prNU}: {lbNU, LineDontBreak, 230},
{lbNU, prAL}: {lbAL, LineDontBreak, 230},
{lbNU, prHL}: {lbHL, LineDontBreak, 230},
{lbNUNU, prAL}: {lbAL, LineDontBreak, 230},
{lbNUNU, prHL}: {lbHL, LineDontBreak, 230},
case lbAny | prAL<<32:
return lbAL, LineCanBreak, 310
case lbAny | prNU<<32:
return lbNU, LineCanBreak, 310
case lbAL | prNU<<32:
return lbNU, LineDontBreak, 230
case lbHL | prNU<<32:
return lbNU, LineDontBreak, 230
case lbNU | prAL<<32:
return lbAL, LineDontBreak, 230
case lbNU | prHL<<32:
return lbHL, LineDontBreak, 230
case lbNUNU | prAL<<32:
return lbAL, LineDontBreak, 230
case lbNUNU | prHL<<32:
return lbHL, LineDontBreak, 230
// LB23a.
{lbAny, prPR}: {lbPR, LineCanBreak, 310},
{lbAny, prID}: {lbIDEM, LineCanBreak, 310},
{lbAny, prEB}: {lbEB, LineCanBreak, 310},
{lbAny, prEM}: {lbIDEM, LineCanBreak, 310},
{lbPR, prID}: {lbIDEM, LineDontBreak, 231},
{lbPR, prEB}: {lbEB, LineDontBreak, 231},
{lbPR, prEM}: {lbIDEM, LineDontBreak, 231},
{lbIDEM, prPO}: {lbPO, LineDontBreak, 231},
{lbEB, prPO}: {lbPO, LineDontBreak, 231},
case lbAny | prPR<<32:
return lbPR, LineCanBreak, 310
case lbAny | prID<<32:
return lbIDEM, LineCanBreak, 310
case lbAny | prEB<<32:
return lbEB, LineCanBreak, 310
case lbAny | prEM<<32:
return lbIDEM, LineCanBreak, 310
case lbPR | prID<<32:
return lbIDEM, LineDontBreak, 231
case lbPR | prEB<<32:
return lbEB, LineDontBreak, 231
case lbPR | prEM<<32:
return lbIDEM, LineDontBreak, 231
case lbIDEM | prPO<<32:
return lbPO, LineDontBreak, 231
case lbEB | prPO<<32:
return lbPO, LineDontBreak, 231
// LB24.
{lbAny, prPO}: {lbPO, LineCanBreak, 310},
{lbPR, prAL}: {lbAL, LineDontBreak, 240},
{lbPR, prHL}: {lbHL, LineDontBreak, 240},
{lbPO, prAL}: {lbAL, LineDontBreak, 240},
{lbPO, prHL}: {lbHL, LineDontBreak, 240},
{lbAL, prPR}: {lbPR, LineDontBreak, 240},
{lbAL, prPO}: {lbPO, LineDontBreak, 240},
{lbHL, prPR}: {lbPR, LineDontBreak, 240},
{lbHL, prPO}: {lbPO, LineDontBreak, 240},
case lbAny | prPO<<32:
return lbPO, LineCanBreak, 310
case lbPR | prAL<<32:
return lbAL, LineDontBreak, 240
case lbPR | prHL<<32:
return lbHL, LineDontBreak, 240
case lbPO | prAL<<32:
return lbAL, LineDontBreak, 240
case lbPO | prHL<<32:
return lbHL, LineDontBreak, 240
case lbAL | prPR<<32:
return lbPR, LineDontBreak, 240
case lbAL | prPO<<32:
return lbPO, LineDontBreak, 240
case lbHL | prPR<<32:
return lbPR, LineDontBreak, 240
case lbHL | prPO<<32:
return lbPO, LineDontBreak, 240
// LB25 (simple transitions).
{lbPR, prNU}: {lbNU, LineDontBreak, 250},
{lbPO, prNU}: {lbNU, LineDontBreak, 250},
{lbOP, prNU}: {lbNU, LineDontBreak, 250},
{lbHY, prNU}: {lbNU, LineDontBreak, 250},
{lbNU, prNU}: {lbNUNU, LineDontBreak, 250},
{lbNU, prSY}: {lbNUSY, LineDontBreak, 250},
{lbNU, prIS}: {lbNUIS, LineDontBreak, 250},
{lbNUNU, prNU}: {lbNUNU, LineDontBreak, 250},
{lbNUNU, prSY}: {lbNUSY, LineDontBreak, 250},
{lbNUNU, prIS}: {lbNUIS, LineDontBreak, 250},
{lbNUSY, prNU}: {lbNUNU, LineDontBreak, 250},
{lbNUSY, prSY}: {lbNUSY, LineDontBreak, 250},
{lbNUSY, prIS}: {lbNUIS, LineDontBreak, 250},
{lbNUIS, prNU}: {lbNUNU, LineDontBreak, 250},
{lbNUIS, prSY}: {lbNUSY, LineDontBreak, 250},
{lbNUIS, prIS}: {lbNUIS, LineDontBreak, 250},
{lbNU, prCL}: {lbNUCL, LineDontBreak, 250},
{lbNU, prCP}: {lbNUCP, LineDontBreak, 250},
{lbNUNU, prCL}: {lbNUCL, LineDontBreak, 250},
{lbNUNU, prCP}: {lbNUCP, LineDontBreak, 250},
{lbNUSY, prCL}: {lbNUCL, LineDontBreak, 250},
{lbNUSY, prCP}: {lbNUCP, LineDontBreak, 250},
{lbNUIS, prCL}: {lbNUCL, LineDontBreak, 250},
{lbNUIS, prCP}: {lbNUCP, LineDontBreak, 250},
{lbNU, prPO}: {lbPO, LineDontBreak, 250},
{lbNUNU, prPO}: {lbPO, LineDontBreak, 250},
{lbNUSY, prPO}: {lbPO, LineDontBreak, 250},
{lbNUIS, prPO}: {lbPO, LineDontBreak, 250},
{lbNUCL, prPO}: {lbPO, LineDontBreak, 250},
{lbNUCP, prPO}: {lbPO, LineDontBreak, 250},
{lbNU, prPR}: {lbPR, LineDontBreak, 250},
{lbNUNU, prPR}: {lbPR, LineDontBreak, 250},
{lbNUSY, prPR}: {lbPR, LineDontBreak, 250},
{lbNUIS, prPR}: {lbPR, LineDontBreak, 250},
{lbNUCL, prPR}: {lbPR, LineDontBreak, 250},
{lbNUCP, prPR}: {lbPR, LineDontBreak, 250},
case lbPR | prNU<<32:
return lbNU, LineDontBreak, 250
case lbPO | prNU<<32:
return lbNU, LineDontBreak, 250
case lbOP | prNU<<32:
return lbNU, LineDontBreak, 250
case lbHY | prNU<<32:
return lbNU, LineDontBreak, 250
case lbNU | prNU<<32:
return lbNUNU, LineDontBreak, 250
case lbNU | prSY<<32:
return lbNUSY, LineDontBreak, 250
case lbNU | prIS<<32:
return lbNUIS, LineDontBreak, 250
case lbNUNU | prNU<<32:
return lbNUNU, LineDontBreak, 250
case lbNUNU | prSY<<32:
return lbNUSY, LineDontBreak, 250
case lbNUNU | prIS<<32:
return lbNUIS, LineDontBreak, 250
case lbNUSY | prNU<<32:
return lbNUNU, LineDontBreak, 250
case lbNUSY | prSY<<32:
return lbNUSY, LineDontBreak, 250
case lbNUSY | prIS<<32:
return lbNUIS, LineDontBreak, 250
case lbNUIS | prNU<<32:
return lbNUNU, LineDontBreak, 250
case lbNUIS | prSY<<32:
return lbNUSY, LineDontBreak, 250
case lbNUIS | prIS<<32:
return lbNUIS, LineDontBreak, 250
case lbNU | prCL<<32:
return lbNUCL, LineDontBreak, 250
case lbNU | prCP<<32:
return lbNUCP, LineDontBreak, 250
case lbNUNU | prCL<<32:
return lbNUCL, LineDontBreak, 250
case lbNUNU | prCP<<32:
return lbNUCP, LineDontBreak, 250
case lbNUSY | prCL<<32:
return lbNUCL, LineDontBreak, 250
case lbNUSY | prCP<<32:
return lbNUCP, LineDontBreak, 250
case lbNUIS | prCL<<32:
return lbNUCL, LineDontBreak, 250
case lbNUIS | prCP<<32:
return lbNUCP, LineDontBreak, 250
case lbNU | prPO<<32:
return lbPO, LineDontBreak, 250
case lbNUNU | prPO<<32:
return lbPO, LineDontBreak, 250
case lbNUSY | prPO<<32:
return lbPO, LineDontBreak, 250
case lbNUIS | prPO<<32:
return lbPO, LineDontBreak, 250
case lbNUCL | prPO<<32:
return lbPO, LineDontBreak, 250
case lbNUCP | prPO<<32:
return lbPO, LineDontBreak, 250
case lbNU | prPR<<32:
return lbPR, LineDontBreak, 250
case lbNUNU | prPR<<32:
return lbPR, LineDontBreak, 250
case lbNUSY | prPR<<32:
return lbPR, LineDontBreak, 250
case lbNUIS | prPR<<32:
return lbPR, LineDontBreak, 250
case lbNUCL | prPR<<32:
return lbPR, LineDontBreak, 250
case lbNUCP | prPR<<32:
return lbPR, LineDontBreak, 250
// LB26.
{lbAny, prJL}: {lbJL, LineCanBreak, 310},
{lbAny, prJV}: {lbJV, LineCanBreak, 310},
{lbAny, prJT}: {lbJT, LineCanBreak, 310},
{lbAny, prH2}: {lbH2, LineCanBreak, 310},
{lbAny, prH3}: {lbH3, LineCanBreak, 310},
{lbJL, prJL}: {lbJL, LineDontBreak, 260},
{lbJL, prJV}: {lbJV, LineDontBreak, 260},
{lbJL, prH2}: {lbH2, LineDontBreak, 260},
{lbJL, prH3}: {lbH3, LineDontBreak, 260},
{lbJV, prJV}: {lbJV, LineDontBreak, 260},
{lbJV, prJT}: {lbJT, LineDontBreak, 260},
{lbH2, prJV}: {lbJV, LineDontBreak, 260},
{lbH2, prJT}: {lbJT, LineDontBreak, 260},
{lbJT, prJT}: {lbJT, LineDontBreak, 260},
{lbH3, prJT}: {lbJT, LineDontBreak, 260},
case lbAny | prJL<<32:
return lbJL, LineCanBreak, 310
case lbAny | prJV<<32:
return lbJV, LineCanBreak, 310
case lbAny | prJT<<32:
return lbJT, LineCanBreak, 310
case lbAny | prH2<<32:
return lbH2, LineCanBreak, 310
case lbAny | prH3<<32:
return lbH3, LineCanBreak, 310
case lbJL | prJL<<32:
return lbJL, LineDontBreak, 260
case lbJL | prJV<<32:
return lbJV, LineDontBreak, 260
case lbJL | prH2<<32:
return lbH2, LineDontBreak, 260
case lbJL | prH3<<32:
return lbH3, LineDontBreak, 260
case lbJV | prJV<<32:
return lbJV, LineDontBreak, 260
case lbJV | prJT<<32:
return lbJT, LineDontBreak, 260
case lbH2 | prJV<<32:
return lbJV, LineDontBreak, 260
case lbH2 | prJT<<32:
return lbJT, LineDontBreak, 260
case lbJT | prJT<<32:
return lbJT, LineDontBreak, 260
case lbH3 | prJT<<32:
return lbJT, LineDontBreak, 260
// LB27.
{lbJL, prPO}: {lbPO, LineDontBreak, 270},
{lbJV, prPO}: {lbPO, LineDontBreak, 270},
{lbJT, prPO}: {lbPO, LineDontBreak, 270},
{lbH2, prPO}: {lbPO, LineDontBreak, 270},
{lbH3, prPO}: {lbPO, LineDontBreak, 270},
{lbPR, prJL}: {lbJL, LineDontBreak, 270},
{lbPR, prJV}: {lbJV, LineDontBreak, 270},
{lbPR, prJT}: {lbJT, LineDontBreak, 270},
{lbPR, prH2}: {lbH2, LineDontBreak, 270},
{lbPR, prH3}: {lbH3, LineDontBreak, 270},
case lbJL | prPO<<32:
return lbPO, LineDontBreak, 270
case lbJV | prPO<<32:
return lbPO, LineDontBreak, 270
case lbJT | prPO<<32:
return lbPO, LineDontBreak, 270
case lbH2 | prPO<<32:
return lbPO, LineDontBreak, 270
case lbH3 | prPO<<32:
return lbPO, LineDontBreak, 270
case lbPR | prJL<<32:
return lbJL, LineDontBreak, 270
case lbPR | prJV<<32:
return lbJV, LineDontBreak, 270
case lbPR | prJT<<32:
return lbJT, LineDontBreak, 270
case lbPR | prH2<<32:
return lbH2, LineDontBreak, 270
case lbPR | prH3<<32:
return lbH3, LineDontBreak, 270
// LB28.
{lbAL, prAL}: {lbAL, LineDontBreak, 280},
{lbAL, prHL}: {lbHL, LineDontBreak, 280},
{lbHL, prAL}: {lbAL, LineDontBreak, 280},
{lbHL, prHL}: {lbHL, LineDontBreak, 280},
case lbAL | prAL<<32:
return lbAL, LineDontBreak, 280
case lbAL | prHL<<32:
return lbHL, LineDontBreak, 280
case lbHL | prAL<<32:
return lbAL, LineDontBreak, 280
case lbHL | prHL<<32:
return lbHL, LineDontBreak, 280
// LB29.
{lbIS, prAL}: {lbAL, LineDontBreak, 290},
{lbIS, prHL}: {lbHL, LineDontBreak, 290},
{lbNUIS, prAL}: {lbAL, LineDontBreak, 290},
{lbNUIS, prHL}: {lbHL, LineDontBreak, 290},
case lbIS | prAL<<32:
return lbAL, LineDontBreak, 290
case lbIS | prHL<<32:
return lbHL, LineDontBreak, 290
case lbNUIS | prAL<<32:
return lbAL, LineDontBreak, 290
case lbNUIS | prHL<<32:
return lbHL, LineDontBreak, 290
default:
return -1, -1, -1
}
}
// transitionLineBreakState determines the new state of the line break parser
@ -290,7 +449,7 @@ var lbTransitions = map[[2]int][3]int{
// further lookups.
func transitionLineBreakState(state int, r rune, b []byte, str string) (newState int, lineBreak int) {
// Determine the property of the next character.
nextProperty, generalCategory := propertyWithGenCat(lineBreakCodePoints, r)
nextProperty, generalCategory := propertyLineBreak(r)
// Prepare.
var forceNoBreak, isCPeaFWH bool
@ -306,7 +465,7 @@ func transitionLineBreakState(state int, r rune, b []byte, str string) (newState
defer func() {
// Transition into LB30.
if newState == lbCP || newState == lbNUCP {
ea := property(eastAsianWidth, r)
ea := propertyEastAsianWidth(r)
if ea != prF && ea != prW && ea != prH {
newState |= lbCPeaFWHBit
}
@ -352,30 +511,27 @@ func transitionLineBreakState(state int, r rune, b []byte, str string) (newState
// Find the applicable transition in the table.
var rule int
transition, ok := lbTransitions[[2]int{state, nextProperty}]
if ok {
// We have a specific transition. We'll use it.
newState, lineBreak, rule = transition[0], transition[1], transition[2]
} else {
newState, lineBreak, rule = lbTransitions(state, nextProperty)
if newState < 0 {
// No specific transition found. Try the less specific ones.
transAnyProp, okAnyProp := lbTransitions[[2]int{state, prAny}]
transAnyState, okAnyState := lbTransitions[[2]int{lbAny, nextProperty}]
if okAnyProp && okAnyState {
anyPropProp, anyPropLineBreak, anyPropRule := lbTransitions(state, prAny)
anyStateProp, anyStateLineBreak, anyStateRule := lbTransitions(lbAny, nextProperty)
if anyPropProp >= 0 && anyStateProp >= 0 {
// Both apply. We'll use a mix (see comments for grTransitions).
newState, lineBreak, rule = transAnyState[0], transAnyState[1], transAnyState[2]
if transAnyProp[2] < transAnyState[2] {
lineBreak, rule = transAnyProp[1], transAnyProp[2]
newState, lineBreak, rule = anyStateProp, anyStateLineBreak, anyStateRule
if anyPropRule < anyStateRule {
lineBreak, rule = anyPropLineBreak, anyPropRule
}
} else if okAnyProp {
} else if anyPropProp >= 0 {
// We only have a specific state.
newState, lineBreak, rule = transAnyProp[0], transAnyProp[1], transAnyProp[2]
newState, lineBreak, rule = anyPropProp, anyPropLineBreak, anyPropRule
// This branch will probably never be reached because anyStateProp will
// always be non-negative given the current transition table. But we keep
// it here for future modifications to the transition table where this may
// not be true anymore.
} else if okAnyState {
} else if anyStateProp >= 0 {
// We only have a specific property.
newState, lineBreak, rule = transAnyState[0], transAnyState[1], transAnyState[2]
newState, lineBreak, rule = anyStateProp, anyStateLineBreak, anyStateRule
} else {
// No known transition. LB31: ALL ÷ ALL.
newState, lineBreak, rule = lbAny, LineCanBreak, 310
@ -414,7 +570,7 @@ func transitionLineBreakState(state int, r rune, b []byte, str string) (newState
r, _ = utf8.DecodeRuneInString(str)
}
if r != utf8.RuneError {
pr, _ := propertyWithGenCat(lineBreakCodePoints, r)
pr, _ := propertyLineBreak(r)
if pr == prNU {
return lbNU, LineDontBreak
}
@ -424,7 +580,7 @@ func transitionLineBreakState(state int, r rune, b []byte, str string) (newState
// LB30 (part one).
if rule > 300 {
if (state == lbAL || state == lbHL || state == lbNU || state == lbNUNU) && nextProperty == prOP {
ea := property(eastAsianWidth, r)
ea := propertyEastAsianWidth(r)
if ea != prF && ea != prW && ea != prH {
return lbOP, LineDontBreak
}
@ -460,7 +616,7 @@ func transitionLineBreakState(state int, r rune, b []byte, str string) (newState
return prAny, LineDontBreak
}
}
graphemeProperty := property(graphemeCodePoints, r)
graphemeProperty := propertyGraphemes(r)
if graphemeProperty == prExtendedPictographic && generalCategory == gcCn {
return lbExtPicCn, LineCanBreak
}

View file

@ -160,9 +160,49 @@ func property(dictionary [][3]int, r rune) int {
return propertySearch(dictionary, r)[2]
}
// propertyWithGenCat returns the Unicode property value and General Category
// (see constants above) of the given code point.
func propertyWithGenCat(dictionary [][4]int, r rune) (property, generalCategory int) {
entry := propertySearch(dictionary, r)
// propertyLineBreak looks up the line break property value and the General
// Category (see constants above) of the given code point in the line break
// code points table. ASCII letters and digits are answered directly, without
// searching the table.
func propertyLineBreak(r rune) (property, generalCategory int) {
	// Fast track for the ASCII ranges a-z, A-Z, and 0-9.
	switch {
	case 'a' <= r && r <= 'z':
		return prAL, gcLl
	case 'A' <= r && r <= 'Z':
		return prAL, gcLu
	case '0' <= r && r <= '9':
		return prNU, gcNd
	}

	// Everything else goes through the table; columns 2 and 3 of the matching
	// entry hold the property and the General Category, respectively.
	match := propertySearch(lineBreakCodePoints, r)
	return match[2], match[3]
}
// propertyGraphemes looks up the Unicode grapheme cluster property value of
// the given code point. Code points in the ASCII range (0x00 to 0x7f) are
// answered directly; all others are looked up in graphemeCodePoints.
func propertyGraphemes(r rune) int {
	switch {
	case r >= 0x20 && r <= 0x7e:
		// Printable ASCII.
		return prAny
	case r == 0x0a:
		return prLF
	case r == 0x0d:
		return prCR
	case r >= 0 && r <= 0x1f, r == 0x7f:
		// Remaining code points up to 0x1f (LF and CR were handled above)
		// and 0x7f.
		return prControl
	default:
		return property(graphemeCodePoints, r)
	}
}
// propertyEastAsianWidth looks up the Unicode East Asian Width property value
// of the given code point. Code points in the ASCII range (0x00 to 0x7f) are
// answered directly; all others are looked up in eastAsianWidth.
func propertyEastAsianWidth(r rune) int {
	switch {
	case r >= 0x20 && r <= 0x7e:
		// Printable ASCII.
		return prNa
	case r >= 0 && r <= 0x1f, r == 0x7f:
		// Code points up to 0x1f, and 0x7f.
		return prN
	default:
		return property(eastAsianWidth, r)
	}
}

View file

@ -1,13 +1,13 @@
package uniseg
// Code generated via go generate from gen_properties.go. DO NOT EDIT.
package uniseg
// sentenceBreakCodePoints are taken from
// https://www.unicode.org/Public/14.0.0/ucd/auxiliary/SentenceBreakProperty.txt
// https://www.unicode.org/Public/15.0.0/ucd/auxiliary/SentenceBreakProperty.txt
// and
// https://unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt
// https://unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt
// ("Extended_Pictographic" only)
// on September 10, 2022. See https://www.unicode.org/license.html for the Unicode
// on September 5, 2023. See https://www.unicode.org/license.html for the Unicode
// license agreement.
var sentenceBreakCodePoints = [][3]int{
{0x0009, 0x0009, prSp}, // Cc <control-0009>
@ -843,6 +843,7 @@ var sentenceBreakCodePoints = [][3]int{
{0x0CE2, 0x0CE3, prExtend}, // Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL
{0x0CE6, 0x0CEF, prNumeric}, // Nd [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE
{0x0CF1, 0x0CF2, prOLetter}, // Lo [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA
{0x0CF3, 0x0CF3, prExtend}, // Mc KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT
{0x0D00, 0x0D01, prExtend}, // Mn [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU
{0x0D02, 0x0D03, prExtend}, // Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
{0x0D04, 0x0D0C, prOLetter}, // Lo [9] MALAYALAM LETTER VEDIC ANUSVARA..MALAYALAM LETTER VOCALIC L
@ -896,7 +897,7 @@ var sentenceBreakCodePoints = [][3]int{
{0x0EBD, 0x0EBD, prOLetter}, // Lo LAO SEMIVOWEL SIGN NYO
{0x0EC0, 0x0EC4, prOLetter}, // Lo [5] LAO VOWEL SIGN E..LAO VOWEL SIGN AI
{0x0EC6, 0x0EC6, prOLetter}, // Lm LAO KO LA
{0x0EC8, 0x0ECD, prExtend}, // Mn [6] LAO TONE MAI EK..LAO NIGGAHITA
{0x0EC8, 0x0ECE, prExtend}, // Mn [7] LAO TONE MAI EK..LAO YAMAKKAN
{0x0ED0, 0x0ED9, prNumeric}, // Nd [10] LAO DIGIT ZERO..LAO DIGIT NINE
{0x0EDC, 0x0EDF, prOLetter}, // Lo [4] LAO HO NO..LAO LETTER KHMU NYO
{0x0F00, 0x0F00, prOLetter}, // Lo TIBETAN SYLLABLE OM
@ -958,7 +959,7 @@ var sentenceBreakCodePoints = [][3]int{
{0x10C7, 0x10C7, prUpper}, // L& GEORGIAN CAPITAL LETTER YN
{0x10CD, 0x10CD, prUpper}, // L& GEORGIAN CAPITAL LETTER AEN
{0x10D0, 0x10FA, prOLetter}, // L& [43] GEORGIAN LETTER AN..GEORGIAN LETTER AIN
{0x10FC, 0x10FC, prOLetter}, // Lm MODIFIER LETTER GEORGIAN NAR
{0x10FC, 0x10FC, prLower}, // Lm MODIFIER LETTER GEORGIAN NAR
{0x10FD, 0x10FF, prOLetter}, // L& [3] GEORGIAN LETTER AEN..GEORGIAN LETTER LABIAL SIGN
{0x1100, 0x1248, prOLetter}, // Lo [329] HANGUL CHOSEONG KIYEOK..ETHIOPIC SYLLABLE QWA
{0x124A, 0x124D, prOLetter}, // Lo [4] ETHIOPIC SYLLABLE QWI..ETHIOPIC SYLLABLE QWE
@ -2034,7 +2035,7 @@ var sentenceBreakCodePoints = [][3]int{
{0xA7D7, 0xA7D7, prLower}, // L& LATIN SMALL LETTER MIDDLE SCOTS S
{0xA7D8, 0xA7D8, prUpper}, // L& LATIN CAPITAL LETTER SIGMOID S
{0xA7D9, 0xA7D9, prLower}, // L& LATIN SMALL LETTER SIGMOID S
{0xA7F2, 0xA7F4, prOLetter}, // Lm [3] MODIFIER LETTER CAPITAL C..MODIFIER LETTER CAPITAL Q
{0xA7F2, 0xA7F4, prLower}, // Lm [3] MODIFIER LETTER CAPITAL C..MODIFIER LETTER CAPITAL Q
{0xA7F5, 0xA7F5, prUpper}, // L& LATIN CAPITAL LETTER REVERSED HALF H
{0xA7F6, 0xA7F6, prLower}, // L& LATIN SMALL LETTER REVERSED HALF H
{0xA7F7, 0xA7F7, prOLetter}, // Lo LATIN EPIGRAPHIC LETTER SIDEWAYS I
@ -2140,7 +2141,7 @@ var sentenceBreakCodePoints = [][3]int{
{0xAB30, 0xAB5A, prLower}, // L& [43] LATIN SMALL LETTER BARRED ALPHA..LATIN SMALL LETTER Y WITH SHORT RIGHT LEG
{0xAB5C, 0xAB5F, prLower}, // Lm [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK
{0xAB60, 0xAB68, prLower}, // L& [9] LATIN SMALL LETTER SAKHA YAT..LATIN SMALL LETTER TURNED R WITH MIDDLE TILDE
{0xAB69, 0xAB69, prOLetter}, // Lm MODIFIER LETTER SMALL TURNED W
{0xAB69, 0xAB69, prLower}, // Lm MODIFIER LETTER SMALL TURNED W
{0xAB70, 0xABBF, prLower}, // L& [80] CHEROKEE SMALL LETTER A..CHEROKEE SMALL LETTER YA
{0xABC0, 0xABE2, prOLetter}, // Lo [35] MEETEI MAYEK LETTER KOK..MEETEI MAYEK LETTER I LONSUM
{0xABE3, 0xABE4, prExtend}, // Mc [2] MEETEI MAYEK VOWEL SIGN ONAP..MEETEI MAYEK VOWEL SIGN INAP
@ -2334,6 +2335,7 @@ var sentenceBreakCodePoints = [][3]int{
{0x10E80, 0x10EA9, prOLetter}, // Lo [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET
{0x10EAB, 0x10EAC, prExtend}, // Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
{0x10EB0, 0x10EB1, prOLetter}, // Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE
{0x10EFD, 0x10EFF, prExtend}, // Mn [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA
{0x10F00, 0x10F1C, prOLetter}, // Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL
{0x10F27, 0x10F27, prOLetter}, // Lo OLD SOGDIAN LIGATURE AYIN-DALETH
{0x10F30, 0x10F45, prOLetter}, // Lo [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN
@ -2408,6 +2410,8 @@ var sentenceBreakCodePoints = [][3]int{
{0x11238, 0x11239, prSTerm}, // Po [2] KHOJKI DANDA..KHOJKI DOUBLE DANDA
{0x1123B, 0x1123C, prSTerm}, // Po [2] KHOJKI SECTION MARK..KHOJKI DOUBLE SECTION MARK
{0x1123E, 0x1123E, prExtend}, // Mn KHOJKI SIGN SUKUN
{0x1123F, 0x11240, prOLetter}, // Lo [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I
{0x11241, 0x11241, prExtend}, // Mn KHOJKI VOWEL SIGN VOCALIC R
{0x11280, 0x11286, prOLetter}, // Lo [7] MULTANI LETTER A..MULTANI LETTER GA
{0x11288, 0x11288, prOLetter}, // Lo MULTANI LETTER GHA
{0x1128A, 0x1128D, prOLetter}, // Lo [4] MULTANI LETTER CA..MULTANI LETTER JJA
@ -2603,13 +2607,29 @@ var sentenceBreakCodePoints = [][3]int{
{0x11EF3, 0x11EF4, prExtend}, // Mn [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U
{0x11EF5, 0x11EF6, prExtend}, // Mc [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O
{0x11EF7, 0x11EF8, prSTerm}, // Po [2] MAKASAR PASSIMBANG..MAKASAR END OF SECTION
{0x11F00, 0x11F01, prExtend}, // Mn [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA
{0x11F02, 0x11F02, prOLetter}, // Lo KAWI SIGN REPHA
{0x11F03, 0x11F03, prExtend}, // Mc KAWI SIGN VISARGA
{0x11F04, 0x11F10, prOLetter}, // Lo [13] KAWI LETTER A..KAWI LETTER O
{0x11F12, 0x11F33, prOLetter}, // Lo [34] KAWI LETTER KA..KAWI LETTER JNYA
{0x11F34, 0x11F35, prExtend}, // Mc [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA
{0x11F36, 0x11F3A, prExtend}, // Mn [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R
{0x11F3E, 0x11F3F, prExtend}, // Mc [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI
{0x11F40, 0x11F40, prExtend}, // Mn KAWI VOWEL SIGN EU
{0x11F41, 0x11F41, prExtend}, // Mc KAWI SIGN KILLER
{0x11F42, 0x11F42, prExtend}, // Mn KAWI CONJOINER
{0x11F43, 0x11F44, prSTerm}, // Po [2] KAWI DANDA..KAWI DOUBLE DANDA
{0x11F50, 0x11F59, prNumeric}, // Nd [10] KAWI DIGIT ZERO..KAWI DIGIT NINE
{0x11FB0, 0x11FB0, prOLetter}, // Lo LISU LETTER YHA
{0x12000, 0x12399, prOLetter}, // Lo [922] CUNEIFORM SIGN A..CUNEIFORM SIGN U U
{0x12400, 0x1246E, prOLetter}, // Nl [111] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM
{0x12480, 0x12543, prOLetter}, // Lo [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU
{0x12F90, 0x12FF0, prOLetter}, // Lo [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114
{0x13000, 0x1342E, prOLetter}, // Lo [1071] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH AA032
{0x13430, 0x13438, prFormat}, // Cf [9] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END SEGMENT
{0x13000, 0x1342F, prOLetter}, // Lo [1072] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D
{0x13430, 0x1343F, prFormat}, // Cf [16] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE
{0x13440, 0x13440, prExtend}, // Mn EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY
{0x13441, 0x13446, prOLetter}, // Lo [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN
{0x13447, 0x13455, prExtend}, // Mn [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED
{0x14400, 0x14646, prOLetter}, // Lo [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530
{0x16800, 0x16A38, prOLetter}, // Lo [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ
{0x16A40, 0x16A5E, prOLetter}, // Lo [31] MRO LETTER TA..MRO LETTER TEK
@ -2648,7 +2668,9 @@ var sentenceBreakCodePoints = [][3]int{
{0x1AFF5, 0x1AFFB, prOLetter}, // Lm [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5
{0x1AFFD, 0x1AFFE, prOLetter}, // Lm [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8
{0x1B000, 0x1B122, prOLetter}, // Lo [291] KATAKANA LETTER ARCHAIC E..KATAKANA LETTER ARCHAIC WU
{0x1B132, 0x1B132, prOLetter}, // Lo HIRAGANA LETTER SMALL KO
{0x1B150, 0x1B152, prOLetter}, // Lo [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO
{0x1B155, 0x1B155, prOLetter}, // Lo KATAKANA LETTER SMALL KO
{0x1B164, 0x1B167, prOLetter}, // Lo [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N
{0x1B170, 0x1B2FB, prOLetter}, // Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB
{0x1BC00, 0x1BC6A, prOLetter}, // Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M
@ -2738,11 +2760,14 @@ var sentenceBreakCodePoints = [][3]int{
{0x1DF00, 0x1DF09, prLower}, // L& [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK
{0x1DF0A, 0x1DF0A, prOLetter}, // Lo LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK
{0x1DF0B, 0x1DF1E, prLower}, // L& [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL
{0x1DF25, 0x1DF2A, prLower}, // L& [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK
{0x1E000, 0x1E006, prExtend}, // Mn [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE
{0x1E008, 0x1E018, prExtend}, // Mn [17] COMBINING GLAGOLITIC LETTER ZEMLJA..COMBINING GLAGOLITIC LETTER HERU
{0x1E01B, 0x1E021, prExtend}, // Mn [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI
{0x1E023, 0x1E024, prExtend}, // Mn [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
{0x1E026, 0x1E02A, prExtend}, // Mn [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
{0x1E030, 0x1E06D, prLower}, // Lm [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE
{0x1E08F, 0x1E08F, prExtend}, // Mn COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
{0x1E100, 0x1E12C, prOLetter}, // Lo [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W
{0x1E130, 0x1E136, prExtend}, // Mn [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D
{0x1E137, 0x1E13D, prOLetter}, // Lm [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER
@ -2753,6 +2778,10 @@ var sentenceBreakCodePoints = [][3]int{
{0x1E2C0, 0x1E2EB, prOLetter}, // Lo [44] WANCHO LETTER AA..WANCHO LETTER YIH
{0x1E2EC, 0x1E2EF, prExtend}, // Mn [4] WANCHO TONE TUP..WANCHO TONE KOINI
{0x1E2F0, 0x1E2F9, prNumeric}, // Nd [10] WANCHO DIGIT ZERO..WANCHO DIGIT NINE
{0x1E4D0, 0x1E4EA, prOLetter}, // Lo [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL
{0x1E4EB, 0x1E4EB, prOLetter}, // Lm NAG MUNDARI SIGN OJOD
{0x1E4EC, 0x1E4EF, prExtend}, // Mn [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH
{0x1E4F0, 0x1E4F9, prNumeric}, // Nd [10] NAG MUNDARI DIGIT ZERO..NAG MUNDARI DIGIT NINE
{0x1E7E0, 0x1E7E6, prOLetter}, // Lo [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO
{0x1E7E8, 0x1E7EB, prOLetter}, // Lo [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE
{0x1E7ED, 0x1E7EE, prOLetter}, // Lo [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE
@ -2803,12 +2832,13 @@ var sentenceBreakCodePoints = [][3]int{
{0x1F676, 0x1F678, prClose}, // So [3] SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT..SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT
{0x1FBF0, 0x1FBF9, prNumeric}, // Nd [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE
{0x20000, 0x2A6DF, prOLetter}, // Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF
{0x2A700, 0x2B738, prOLetter}, // Lo [4153] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B738
{0x2A700, 0x2B739, prOLetter}, // Lo [4154] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B739
{0x2B740, 0x2B81D, prOLetter}, // Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
{0x2B820, 0x2CEA1, prOLetter}, // Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
{0x2CEB0, 0x2EBE0, prOLetter}, // Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0
{0x2F800, 0x2FA1D, prOLetter}, // Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
{0x30000, 0x3134A, prOLetter}, // Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A
{0x31350, 0x323AF, prOLetter}, // Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
{0xE0001, 0xE0001, prFormat}, // Cf LANGUAGE TAG
{0xE0020, 0xE007F, prExtend}, // Cf [96] TAG SPACE..CANCEL TAG
{0xE0100, 0xE01EF, prExtend}, // Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256

View file

@ -18,104 +18,178 @@ const (
sbSB8aSp
)
// The sentence break parser's breaking instructions.
const (
sbDontBreak = iota
sbBreak
)
// The sentence break parser's state transitions. It's analogous to
// grTransitions, see comments there for details. Unicode version 14.0.0.
var sbTransitions = map[[2]int][3]int{
// sbTransitions implements the sentence break parser's state transitions. It's
// analogous to [grTransitions], see comments there for details.
//
// Unicode version 15.0.0.
func sbTransitions(state, prop int) (newState int, sentenceBreak bool, rule int) {
switch uint64(state) | uint64(prop)<<32 {
// SB3.
{sbAny, prCR}: {sbCR, sbDontBreak, 9990},
{sbCR, prLF}: {sbParaSep, sbDontBreak, 30},
case sbAny | prCR<<32:
return sbCR, false, 9990
case sbCR | prLF<<32:
return sbParaSep, false, 30
// SB4.
{sbAny, prSep}: {sbParaSep, sbDontBreak, 9990},
{sbAny, prLF}: {sbParaSep, sbDontBreak, 9990},
{sbParaSep, prAny}: {sbAny, sbBreak, 40},
{sbCR, prAny}: {sbAny, sbBreak, 40},
case sbAny | prSep<<32:
return sbParaSep, false, 9990
case sbAny | prLF<<32:
return sbParaSep, false, 9990
case sbParaSep | prAny<<32:
return sbAny, true, 40
case sbCR | prAny<<32:
return sbAny, true, 40
// SB6.
{sbAny, prATerm}: {sbATerm, sbDontBreak, 9990},
{sbATerm, prNumeric}: {sbAny, sbDontBreak, 60},
{sbSB7, prNumeric}: {sbAny, sbDontBreak, 60}, // Because ATerm also appears in SB7.
case sbAny | prATerm<<32:
return sbATerm, false, 9990
case sbATerm | prNumeric<<32:
return sbAny, false, 60
case sbSB7 | prNumeric<<32:
return sbAny, false, 60 // Because ATerm also appears in SB7.
// SB7.
{sbAny, prUpper}: {sbUpper, sbDontBreak, 9990},
{sbAny, prLower}: {sbLower, sbDontBreak, 9990},
{sbUpper, prATerm}: {sbSB7, sbDontBreak, 70},
{sbLower, prATerm}: {sbSB7, sbDontBreak, 70},
{sbSB7, prUpper}: {sbUpper, sbDontBreak, 70},
case sbAny | prUpper<<32:
return sbUpper, false, 9990
case sbAny | prLower<<32:
return sbLower, false, 9990
case sbUpper | prATerm<<32:
return sbSB7, false, 70
case sbLower | prATerm<<32:
return sbSB7, false, 70
case sbSB7 | prUpper<<32:
return sbUpper, false, 70
// SB8a.
{sbAny, prSTerm}: {sbSTerm, sbDontBreak, 9990},
{sbATerm, prSContinue}: {sbAny, sbDontBreak, 81},
{sbATerm, prATerm}: {sbATerm, sbDontBreak, 81},
{sbATerm, prSTerm}: {sbSTerm, sbDontBreak, 81},
{sbSB7, prSContinue}: {sbAny, sbDontBreak, 81},
{sbSB7, prATerm}: {sbATerm, sbDontBreak, 81},
{sbSB7, prSTerm}: {sbSTerm, sbDontBreak, 81},
{sbSB8Close, prSContinue}: {sbAny, sbDontBreak, 81},
{sbSB8Close, prATerm}: {sbATerm, sbDontBreak, 81},
{sbSB8Close, prSTerm}: {sbSTerm, sbDontBreak, 81},
{sbSB8Sp, prSContinue}: {sbAny, sbDontBreak, 81},
{sbSB8Sp, prATerm}: {sbATerm, sbDontBreak, 81},
{sbSB8Sp, prSTerm}: {sbSTerm, sbDontBreak, 81},
{sbSTerm, prSContinue}: {sbAny, sbDontBreak, 81},
{sbSTerm, prATerm}: {sbATerm, sbDontBreak, 81},
{sbSTerm, prSTerm}: {sbSTerm, sbDontBreak, 81},
{sbSB8aClose, prSContinue}: {sbAny, sbDontBreak, 81},
{sbSB8aClose, prATerm}: {sbATerm, sbDontBreak, 81},
{sbSB8aClose, prSTerm}: {sbSTerm, sbDontBreak, 81},
{sbSB8aSp, prSContinue}: {sbAny, sbDontBreak, 81},
{sbSB8aSp, prATerm}: {sbATerm, sbDontBreak, 81},
{sbSB8aSp, prSTerm}: {sbSTerm, sbDontBreak, 81},
case sbAny | prSTerm<<32:
return sbSTerm, false, 9990
case sbATerm | prSContinue<<32:
return sbAny, false, 81
case sbATerm | prATerm<<32:
return sbATerm, false, 81
case sbATerm | prSTerm<<32:
return sbSTerm, false, 81
case sbSB7 | prSContinue<<32:
return sbAny, false, 81
case sbSB7 | prATerm<<32:
return sbATerm, false, 81
case sbSB7 | prSTerm<<32:
return sbSTerm, false, 81
case sbSB8Close | prSContinue<<32:
return sbAny, false, 81
case sbSB8Close | prATerm<<32:
return sbATerm, false, 81
case sbSB8Close | prSTerm<<32:
return sbSTerm, false, 81
case sbSB8Sp | prSContinue<<32:
return sbAny, false, 81
case sbSB8Sp | prATerm<<32:
return sbATerm, false, 81
case sbSB8Sp | prSTerm<<32:
return sbSTerm, false, 81
case sbSTerm | prSContinue<<32:
return sbAny, false, 81
case sbSTerm | prATerm<<32:
return sbATerm, false, 81
case sbSTerm | prSTerm<<32:
return sbSTerm, false, 81
case sbSB8aClose | prSContinue<<32:
return sbAny, false, 81
case sbSB8aClose | prATerm<<32:
return sbATerm, false, 81
case sbSB8aClose | prSTerm<<32:
return sbSTerm, false, 81
case sbSB8aSp | prSContinue<<32:
return sbAny, false, 81
case sbSB8aSp | prATerm<<32:
return sbATerm, false, 81
case sbSB8aSp | prSTerm<<32:
return sbSTerm, false, 81
// SB9.
{sbATerm, prClose}: {sbSB8Close, sbDontBreak, 90},
{sbSB7, prClose}: {sbSB8Close, sbDontBreak, 90},
{sbSB8Close, prClose}: {sbSB8Close, sbDontBreak, 90},
{sbATerm, prSp}: {sbSB8Sp, sbDontBreak, 90},
{sbSB7, prSp}: {sbSB8Sp, sbDontBreak, 90},
{sbSB8Close, prSp}: {sbSB8Sp, sbDontBreak, 90},
{sbSTerm, prClose}: {sbSB8aClose, sbDontBreak, 90},
{sbSB8aClose, prClose}: {sbSB8aClose, sbDontBreak, 90},
{sbSTerm, prSp}: {sbSB8aSp, sbDontBreak, 90},
{sbSB8aClose, prSp}: {sbSB8aSp, sbDontBreak, 90},
{sbATerm, prSep}: {sbParaSep, sbDontBreak, 90},
{sbATerm, prCR}: {sbParaSep, sbDontBreak, 90},
{sbATerm, prLF}: {sbParaSep, sbDontBreak, 90},
{sbSB7, prSep}: {sbParaSep, sbDontBreak, 90},
{sbSB7, prCR}: {sbParaSep, sbDontBreak, 90},
{sbSB7, prLF}: {sbParaSep, sbDontBreak, 90},
{sbSB8Close, prSep}: {sbParaSep, sbDontBreak, 90},
{sbSB8Close, prCR}: {sbParaSep, sbDontBreak, 90},
{sbSB8Close, prLF}: {sbParaSep, sbDontBreak, 90},
{sbSTerm, prSep}: {sbParaSep, sbDontBreak, 90},
{sbSTerm, prCR}: {sbParaSep, sbDontBreak, 90},
{sbSTerm, prLF}: {sbParaSep, sbDontBreak, 90},
{sbSB8aClose, prSep}: {sbParaSep, sbDontBreak, 90},
{sbSB8aClose, prCR}: {sbParaSep, sbDontBreak, 90},
{sbSB8aClose, prLF}: {sbParaSep, sbDontBreak, 90},
case sbATerm | prClose<<32:
return sbSB8Close, false, 90
case sbSB7 | prClose<<32:
return sbSB8Close, false, 90
case sbSB8Close | prClose<<32:
return sbSB8Close, false, 90
case sbATerm | prSp<<32:
return sbSB8Sp, false, 90
case sbSB7 | prSp<<32:
return sbSB8Sp, false, 90
case sbSB8Close | prSp<<32:
return sbSB8Sp, false, 90
case sbSTerm | prClose<<32:
return sbSB8aClose, false, 90
case sbSB8aClose | prClose<<32:
return sbSB8aClose, false, 90
case sbSTerm | prSp<<32:
return sbSB8aSp, false, 90
case sbSB8aClose | prSp<<32:
return sbSB8aSp, false, 90
case sbATerm | prSep<<32:
return sbParaSep, false, 90
case sbATerm | prCR<<32:
return sbParaSep, false, 90
case sbATerm | prLF<<32:
return sbParaSep, false, 90
case sbSB7 | prSep<<32:
return sbParaSep, false, 90
case sbSB7 | prCR<<32:
return sbParaSep, false, 90
case sbSB7 | prLF<<32:
return sbParaSep, false, 90
case sbSB8Close | prSep<<32:
return sbParaSep, false, 90
case sbSB8Close | prCR<<32:
return sbParaSep, false, 90
case sbSB8Close | prLF<<32:
return sbParaSep, false, 90
case sbSTerm | prSep<<32:
return sbParaSep, false, 90
case sbSTerm | prCR<<32:
return sbParaSep, false, 90
case sbSTerm | prLF<<32:
return sbParaSep, false, 90
case sbSB8aClose | prSep<<32:
return sbParaSep, false, 90
case sbSB8aClose | prCR<<32:
return sbParaSep, false, 90
case sbSB8aClose | prLF<<32:
return sbParaSep, false, 90
// SB10.
{sbSB8Sp, prSp}: {sbSB8Sp, sbDontBreak, 100},
{sbSB8aSp, prSp}: {sbSB8aSp, sbDontBreak, 100},
{sbSB8Sp, prSep}: {sbParaSep, sbDontBreak, 100},
{sbSB8Sp, prCR}: {sbParaSep, sbDontBreak, 100},
{sbSB8Sp, prLF}: {sbParaSep, sbDontBreak, 100},
case sbSB8Sp | prSp<<32:
return sbSB8Sp, false, 100
case sbSB8aSp | prSp<<32:
return sbSB8aSp, false, 100
case sbSB8Sp | prSep<<32:
return sbParaSep, false, 100
case sbSB8Sp | prCR<<32:
return sbParaSep, false, 100
case sbSB8Sp | prLF<<32:
return sbParaSep, false, 100
// SB11.
{sbATerm, prAny}: {sbAny, sbBreak, 110},
{sbSB7, prAny}: {sbAny, sbBreak, 110},
{sbSB8Close, prAny}: {sbAny, sbBreak, 110},
{sbSB8Sp, prAny}: {sbAny, sbBreak, 110},
{sbSTerm, prAny}: {sbAny, sbBreak, 110},
{sbSB8aClose, prAny}: {sbAny, sbBreak, 110},
{sbSB8aSp, prAny}: {sbAny, sbBreak, 110},
case sbATerm | prAny<<32:
return sbAny, true, 110
case sbSB7 | prAny<<32:
return sbAny, true, 110
case sbSB8Close | prAny<<32:
return sbAny, true, 110
case sbSB8Sp | prAny<<32:
return sbAny, true, 110
case sbSTerm | prAny<<32:
return sbAny, true, 110
case sbSB8aClose | prAny<<32:
return sbAny, true, 110
case sbSB8aSp | prAny<<32:
return sbAny, true, 110
// We'll always break after ParaSep due to SB4.
default:
return -1, false, -1
}
}
// transitionSentenceBreakState determines the new state of the sentence break
@ -141,30 +215,27 @@ func transitionSentenceBreakState(state int, r rune, b []byte, str string) (newS
// Find the applicable transition in the table.
var rule int
transition, ok := sbTransitions[[2]int{state, nextProperty}]
if ok {
// We have a specific transition. We'll use it.
newState, sentenceBreak, rule = transition[0], transition[1] == sbBreak, transition[2]
} else {
newState, sentenceBreak, rule = sbTransitions(state, nextProperty)
if newState < 0 {
// No specific transition found. Try the less specific ones.
transAnyProp, okAnyProp := sbTransitions[[2]int{state, prAny}]
transAnyState, okAnyState := sbTransitions[[2]int{sbAny, nextProperty}]
if okAnyProp && okAnyState {
anyPropState, anyPropProp, anyPropRule := sbTransitions(state, prAny)
anyStateState, anyStateProp, anyStateRule := sbTransitions(sbAny, nextProperty)
if anyPropState >= 0 && anyStateState >= 0 {
// Both apply. We'll use a mix (see comments for grTransitions).
newState, sentenceBreak, rule = transAnyState[0], transAnyState[1] == sbBreak, transAnyState[2]
if transAnyProp[2] < transAnyState[2] {
sentenceBreak, rule = transAnyProp[1] == sbBreak, transAnyProp[2]
newState, sentenceBreak, rule = anyStateState, anyStateProp, anyStateRule
if anyPropRule < anyStateRule {
sentenceBreak, rule = anyPropProp, anyPropRule
}
} else if okAnyProp {
} else if anyPropState >= 0 {
// We only have a specific state.
newState, sentenceBreak, rule = transAnyProp[0], transAnyProp[1] == sbBreak, transAnyProp[2]
newState, sentenceBreak, rule = anyPropState, anyPropProp, anyPropRule
// This branch will probably never be reached because okAnyState will
// always be true given the current transition map. But we keep it here
// for future modifications to the transition map where this may not be
// true anymore.
} else if okAnyState {
} else if anyStateState >= 0 {
// We only have a specific property.
newState, sentenceBreak, rule = transAnyState[0], transAnyState[1] == sbBreak, transAnyState[2]
newState, sentenceBreak, rule = anyStateState, anyStateProp, anyStateRule
} else {
// No known transition. SB999: Any × Any.
newState, sentenceBreak, rule = sbAny, false, 9990

View file

@ -100,7 +100,7 @@ func Step(b []byte, state int) (cluster, rest []byte, boundaries int, newState i
if len(b) <= length { // If we're already past the end, there is nothing else to parse.
var prop int
if state < 0 {
prop = property(graphemeCodePoints, r)
prop = propertyGraphemes(r)
} else {
prop = state >> shiftPropState
}
@ -179,7 +179,7 @@ func StepString(str string, state int) (cluster, rest string, boundaries int, ne
// Extract the first rune.
r, length := utf8.DecodeRuneInString(str)
if len(str) <= length { // If we're already past the end, there is nothing else to parse.
prop := property(graphemeCodePoints, r)
prop := propertyGraphemes(r)
return str, "", LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (runeWidth(r, prop) << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState)
}

View file

@ -1,5 +1,10 @@
package uniseg
// EastAsianAmbiguousWidth specifies the monospace width for East Asian
// characters classified as Ambiguous. The default is 1 but some rare fonts
// render them with a width of 2.
var EastAsianAmbiguousWidth = 1
// runeWidth returns the monospace width for the given rune. The provided
// grapheme property is a value mapped by the [graphemeCodePoints] table.
//
@ -33,9 +38,11 @@ func runeWidth(r rune, graphemeProperty int) int {
return 4
}
switch property(eastAsianWidth, r) {
switch propertyEastAsianWidth(r) {
case prW, prF:
return 2
case prA:
return EastAsianAmbiguousWidth
}
return 1

View file

@ -1,13 +1,13 @@
package uniseg
// Code generated via go generate from gen_properties.go. DO NOT EDIT.
package uniseg
// workBreakCodePoints are taken from
// https://www.unicode.org/Public/14.0.0/ucd/auxiliary/WordBreakProperty.txt
// https://www.unicode.org/Public/15.0.0/ucd/auxiliary/WordBreakProperty.txt
// and
// https://unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt
// https://unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt
// ("Extended_Pictographic" only)
// on September 10, 2022. See https://www.unicode.org/license.html for the Unicode
// on September 5, 2023. See https://www.unicode.org/license.html for the Unicode
// license agreement.
var workBreakCodePoints = [][3]int{
{0x000A, 0x000A, prLF}, // Cc <control-000A>
@ -318,6 +318,7 @@ var workBreakCodePoints = [][3]int{
{0x0CE2, 0x0CE3, prExtend}, // Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL
{0x0CE6, 0x0CEF, prNumeric}, // Nd [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE
{0x0CF1, 0x0CF2, prALetter}, // Lo [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA
{0x0CF3, 0x0CF3, prExtend}, // Mc KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT
{0x0D00, 0x0D01, prExtend}, // Mn [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU
{0x0D02, 0x0D03, prExtend}, // Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
{0x0D04, 0x0D0C, prALetter}, // Lo [9] MALAYALAM LETTER VEDIC ANUSVARA..MALAYALAM LETTER VOCALIC L
@ -357,7 +358,7 @@ var workBreakCodePoints = [][3]int{
{0x0E50, 0x0E59, prNumeric}, // Nd [10] THAI DIGIT ZERO..THAI DIGIT NINE
{0x0EB1, 0x0EB1, prExtend}, // Mn LAO VOWEL SIGN MAI KAN
{0x0EB4, 0x0EBC, prExtend}, // Mn [9] LAO VOWEL SIGN I..LAO SEMIVOWEL SIGN LO
{0x0EC8, 0x0ECD, prExtend}, // Mn [6] LAO TONE MAI EK..LAO NIGGAHITA
{0x0EC8, 0x0ECE, prExtend}, // Mn [7] LAO TONE MAI EK..LAO YAMAKKAN
{0x0ED0, 0x0ED9, prNumeric}, // Nd [10] LAO DIGIT ZERO..LAO DIGIT NINE
{0x0F00, 0x0F00, prALetter}, // Lo TIBETAN SYLLABLE OM
{0x0F18, 0x0F19, prExtend}, // Mn [2] TIBETAN ASTROLOGICAL SIGN -KHYUD PA..TIBETAN ASTROLOGICAL SIGN SDONG TSHUGS
@ -1093,6 +1094,7 @@ var workBreakCodePoints = [][3]int{
{0x10E80, 0x10EA9, prALetter}, // Lo [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET
{0x10EAB, 0x10EAC, prExtend}, // Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
{0x10EB0, 0x10EB1, prALetter}, // Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE
{0x10EFD, 0x10EFF, prExtend}, // Mn [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA
{0x10F00, 0x10F1C, prALetter}, // Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL
{0x10F27, 0x10F27, prALetter}, // Lo OLD SOGDIAN LIGATURE AYIN-DALETH
{0x10F30, 0x10F45, prALetter}, // Lo [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN
@ -1157,6 +1159,8 @@ var workBreakCodePoints = [][3]int{
{0x11235, 0x11235, prExtend}, // Mc KHOJKI SIGN VIRAMA
{0x11236, 0x11237, prExtend}, // Mn [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA
{0x1123E, 0x1123E, prExtend}, // Mn KHOJKI SIGN SUKUN
{0x1123F, 0x11240, prALetter}, // Lo [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I
{0x11241, 0x11241, prExtend}, // Mn KHOJKI VOWEL SIGN VOCALIC R
{0x11280, 0x11286, prALetter}, // Lo [7] MULTANI LETTER A..MULTANI LETTER GA
{0x11288, 0x11288, prALetter}, // Lo MULTANI LETTER GHA
{0x1128A, 0x1128D, prALetter}, // Lo [4] MULTANI LETTER CA..MULTANI LETTER JJA
@ -1337,13 +1341,28 @@ var workBreakCodePoints = [][3]int{
{0x11EE0, 0x11EF2, prALetter}, // Lo [19] MAKASAR LETTER KA..MAKASAR ANGKA
{0x11EF3, 0x11EF4, prExtend}, // Mn [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U
{0x11EF5, 0x11EF6, prExtend}, // Mc [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O
{0x11F00, 0x11F01, prExtend}, // Mn [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA
{0x11F02, 0x11F02, prALetter}, // Lo KAWI SIGN REPHA
{0x11F03, 0x11F03, prExtend}, // Mc KAWI SIGN VISARGA
{0x11F04, 0x11F10, prALetter}, // Lo [13] KAWI LETTER A..KAWI LETTER O
{0x11F12, 0x11F33, prALetter}, // Lo [34] KAWI LETTER KA..KAWI LETTER JNYA
{0x11F34, 0x11F35, prExtend}, // Mc [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA
{0x11F36, 0x11F3A, prExtend}, // Mn [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R
{0x11F3E, 0x11F3F, prExtend}, // Mc [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI
{0x11F40, 0x11F40, prExtend}, // Mn KAWI VOWEL SIGN EU
{0x11F41, 0x11F41, prExtend}, // Mc KAWI SIGN KILLER
{0x11F42, 0x11F42, prExtend}, // Mn KAWI CONJOINER
{0x11F50, 0x11F59, prNumeric}, // Nd [10] KAWI DIGIT ZERO..KAWI DIGIT NINE
{0x11FB0, 0x11FB0, prALetter}, // Lo LISU LETTER YHA
{0x12000, 0x12399, prALetter}, // Lo [922] CUNEIFORM SIGN A..CUNEIFORM SIGN U U
{0x12400, 0x1246E, prALetter}, // Nl [111] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM
{0x12480, 0x12543, prALetter}, // Lo [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU
{0x12F90, 0x12FF0, prALetter}, // Lo [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114
{0x13000, 0x1342E, prALetter}, // Lo [1071] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH AA032
{0x13430, 0x13438, prFormat}, // Cf [9] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END SEGMENT
{0x13000, 0x1342F, prALetter}, // Lo [1072] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D
{0x13430, 0x1343F, prFormat}, // Cf [16] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE
{0x13440, 0x13440, prExtend}, // Mn EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY
{0x13441, 0x13446, prALetter}, // Lo [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN
{0x13447, 0x13455, prExtend}, // Mn [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED
{0x14400, 0x14646, prALetter}, // Lo [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530
{0x16800, 0x16A38, prALetter}, // Lo [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ
{0x16A40, 0x16A5E, prALetter}, // Lo [31] MRO LETTER TA..MRO LETTER TEK
@ -1374,6 +1393,7 @@ var workBreakCodePoints = [][3]int{
{0x1AFFD, 0x1AFFE, prKatakana}, // Lm [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8
{0x1B000, 0x1B000, prKatakana}, // Lo KATAKANA LETTER ARCHAIC E
{0x1B120, 0x1B122, prKatakana}, // Lo [3] KATAKANA LETTER ARCHAIC YI..KATAKANA LETTER ARCHAIC WU
{0x1B155, 0x1B155, prKatakana}, // Lo KATAKANA LETTER SMALL KO
{0x1B164, 0x1B167, prKatakana}, // Lo [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N
{0x1BC00, 0x1BC6A, prALetter}, // Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M
{0x1BC70, 0x1BC7C, prALetter}, // Lo [13] DUPLOYAN AFFIX LEFT HORIZONTAL SECANT..DUPLOYAN AFFIX ATTACHED TANGENT HOOK
@ -1431,11 +1451,14 @@ var workBreakCodePoints = [][3]int{
{0x1DF00, 0x1DF09, prALetter}, // L& [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK
{0x1DF0A, 0x1DF0A, prALetter}, // Lo LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK
{0x1DF0B, 0x1DF1E, prALetter}, // L& [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL
{0x1DF25, 0x1DF2A, prALetter}, // L& [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK
{0x1E000, 0x1E006, prExtend}, // Mn [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE
{0x1E008, 0x1E018, prExtend}, // Mn [17] COMBINING GLAGOLITIC LETTER ZEMLJA..COMBINING GLAGOLITIC LETTER HERU
{0x1E01B, 0x1E021, prExtend}, // Mn [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI
{0x1E023, 0x1E024, prExtend}, // Mn [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
{0x1E026, 0x1E02A, prExtend}, // Mn [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
{0x1E030, 0x1E06D, prALetter}, // Lm [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE
{0x1E08F, 0x1E08F, prExtend}, // Mn COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
{0x1E100, 0x1E12C, prALetter}, // Lo [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W
{0x1E130, 0x1E136, prExtend}, // Mn [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D
{0x1E137, 0x1E13D, prALetter}, // Lm [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER
@ -1446,6 +1469,10 @@ var workBreakCodePoints = [][3]int{
{0x1E2C0, 0x1E2EB, prALetter}, // Lo [44] WANCHO LETTER AA..WANCHO LETTER YIH
{0x1E2EC, 0x1E2EF, prExtend}, // Mn [4] WANCHO TONE TUP..WANCHO TONE KOINI
{0x1E2F0, 0x1E2F9, prNumeric}, // Nd [10] WANCHO DIGIT ZERO..WANCHO DIGIT NINE
{0x1E4D0, 0x1E4EA, prALetter}, // Lo [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL
{0x1E4EB, 0x1E4EB, prALetter}, // Lm NAG MUNDARI SIGN OJOD
{0x1E4EC, 0x1E4EF, prExtend}, // Mn [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH
{0x1E4F0, 0x1E4F9, prNumeric}, // Nd [10] NAG MUNDARI DIGIT ZERO..NAG MUNDARI DIGIT NINE
{0x1E7E0, 0x1E7E6, prALetter}, // Lo [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO
{0x1E7E8, 0x1E7EB, prALetter}, // Lo [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE
{0x1E7ED, 0x1E7EE, prALetter}, // Lo [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE
@ -1740,7 +1767,8 @@ var workBreakCodePoints = [][3]int{
{0x1F6D3, 0x1F6D4, prExtendedPictographic}, // E0.0 [2] (🛓..🛔) STUPA..PAGODA
{0x1F6D5, 0x1F6D5, prExtendedPictographic}, // E12.0 [1] (🛕) hindu temple
{0x1F6D6, 0x1F6D7, prExtendedPictographic}, // E13.0 [2] (🛖..🛗) hut..elevator
{0x1F6D8, 0x1F6DC, prExtendedPictographic}, // E0.0 [5] (🛘..🛜) <reserved-1F6D8>..<reserved-1F6DC>
{0x1F6D8, 0x1F6DB, prExtendedPictographic}, // E0.0 [4] (🛘..🛛) <reserved-1F6D8>..<reserved-1F6DB>
{0x1F6DC, 0x1F6DC, prExtendedPictographic}, // E15.0 [1] (🛜) wireless
{0x1F6DD, 0x1F6DF, prExtendedPictographic}, // E14.0 [3] (🛝..🛟) playground slide..ring buoy
{0x1F6E0, 0x1F6E5, prExtendedPictographic}, // E0.7 [6] (🛠️..🛥️) hammer and wrench..motor boat
{0x1F6E6, 0x1F6E8, prExtendedPictographic}, // E0.0 [3] (🛦..🛨) UP-POINTING MILITARY AIRPLANE..UP-POINTING SMALL AIRPLANE
@ -1757,7 +1785,7 @@ var workBreakCodePoints = [][3]int{
{0x1F6FA, 0x1F6FA, prExtendedPictographic}, // E12.0 [1] (🛺) auto rickshaw
{0x1F6FB, 0x1F6FC, prExtendedPictographic}, // E13.0 [2] (🛻..🛼) pickup truck..roller skate
{0x1F6FD, 0x1F6FF, prExtendedPictographic}, // E0.0 [3] (🛽..🛿) <reserved-1F6FD>..<reserved-1F6FF>
{0x1F774, 0x1F77F, prExtendedPictographic}, // E0.0 [12] (🝴..🝿) <reserved-1F774>..<reserved-1F77F>
{0x1F774, 0x1F77F, prExtendedPictographic}, // E0.0 [12] (🝴..🝿) LOT OF FORTUNE..ORCUS
{0x1F7D5, 0x1F7DF, prExtendedPictographic}, // E0.0 [11] (🟕..🟟) CIRCLED TRIANGLE..<reserved-1F7DF>
{0x1F7E0, 0x1F7EB, prExtendedPictographic}, // E12.0 [12] (🟠..🟫) orange circle..brown square
{0x1F7EC, 0x1F7EF, prExtendedPictographic}, // E0.0 [4] (🟬..🟯) <reserved-1F7EC>..<reserved-1F7EF>
@ -1816,30 +1844,37 @@ var workBreakCodePoints = [][3]int{
{0x1FA00, 0x1FA6F, prExtendedPictographic}, // E0.0 [112] (🨀..🩯) NEUTRAL CHESS KING..<reserved-1FA6F>
{0x1FA70, 0x1FA73, prExtendedPictographic}, // E12.0 [4] (🩰..🩳) ballet shoes..shorts
{0x1FA74, 0x1FA74, prExtendedPictographic}, // E13.0 [1] (🩴) thong sandal
{0x1FA75, 0x1FA77, prExtendedPictographic}, // E0.0 [3] (🩵..🩷) <reserved-1FA75>..<reserved-1FA77>
{0x1FA75, 0x1FA77, prExtendedPictographic}, // E15.0 [3] (🩵..🩷) light blue heart..pink heart
{0x1FA78, 0x1FA7A, prExtendedPictographic}, // E12.0 [3] (🩸..🩺) drop of blood..stethoscope
{0x1FA7B, 0x1FA7C, prExtendedPictographic}, // E14.0 [2] (🩻..🩼) x-ray..crutch
{0x1FA7D, 0x1FA7F, prExtendedPictographic}, // E0.0 [3] (🩽..🩿) <reserved-1FA7D>..<reserved-1FA7F>
{0x1FA80, 0x1FA82, prExtendedPictographic}, // E12.0 [3] (🪀..🪂) yo-yo..parachute
{0x1FA83, 0x1FA86, prExtendedPictographic}, // E13.0 [4] (🪃..🪆) boomerang..nesting dolls
{0x1FA87, 0x1FA8F, prExtendedPictographic}, // E0.0 [9] (🪇..🪏) <reserved-1FA87>..<reserved-1FA8F>
{0x1FA87, 0x1FA88, prExtendedPictographic}, // E15.0 [2] (🪇..🪈) maracas..flute
{0x1FA89, 0x1FA8F, prExtendedPictographic}, // E0.0 [7] (🪉..🪏) <reserved-1FA89>..<reserved-1FA8F>
{0x1FA90, 0x1FA95, prExtendedPictographic}, // E12.0 [6] (🪐..🪕) ringed planet..banjo
{0x1FA96, 0x1FAA8, prExtendedPictographic}, // E13.0 [19] (🪖..🪨) military helmet..rock
{0x1FAA9, 0x1FAAC, prExtendedPictographic}, // E14.0 [4] (🪩..🪬) mirror ball..hamsa
{0x1FAAD, 0x1FAAF, prExtendedPictographic}, // E0.0 [3] (🪭..🪯) <reserved-1FAAD>..<reserved-1FAAF>
{0x1FAAD, 0x1FAAF, prExtendedPictographic}, // E15.0 [3] (🪭..🪯) folding hand fan..khanda
{0x1FAB0, 0x1FAB6, prExtendedPictographic}, // E13.0 [7] (🪰..🪶) fly..feather
{0x1FAB7, 0x1FABA, prExtendedPictographic}, // E14.0 [4] (🪷..🪺) lotus..nest with eggs
{0x1FABB, 0x1FABF, prExtendedPictographic}, // E0.0 [5] (🪻..🪿) <reserved-1FABB>..<reserved-1FABF>
{0x1FABB, 0x1FABD, prExtendedPictographic}, // E15.0 [3] (🪻..🪽) hyacinth..wing
{0x1FABE, 0x1FABE, prExtendedPictographic}, // E0.0 [1] (🪾) <reserved-1FABE>
{0x1FABF, 0x1FABF, prExtendedPictographic}, // E15.0 [1] (🪿) goose
{0x1FAC0, 0x1FAC2, prExtendedPictographic}, // E13.0 [3] (🫀..🫂) anatomical heart..people hugging
{0x1FAC3, 0x1FAC5, prExtendedPictographic}, // E14.0 [3] (🫃..🫅) pregnant man..person with crown
{0x1FAC6, 0x1FACF, prExtendedPictographic}, // E0.0 [10] (🫆..🫏) <reserved-1FAC6>..<reserved-1FACF>
{0x1FAC6, 0x1FACD, prExtendedPictographic}, // E0.0 [8] (🫆..🫍) <reserved-1FAC6>..<reserved-1FACD>
{0x1FACE, 0x1FACF, prExtendedPictographic}, // E15.0 [2] (🫎..🫏) moose..donkey
{0x1FAD0, 0x1FAD6, prExtendedPictographic}, // E13.0 [7] (🫐..🫖) blueberries..teapot
{0x1FAD7, 0x1FAD9, prExtendedPictographic}, // E14.0 [3] (🫗..🫙) pouring liquid..jar
{0x1FADA, 0x1FADF, prExtendedPictographic}, // E0.0 [6] (🫚..🫟) <reserved-1FADA>..<reserved-1FADF>
{0x1FADA, 0x1FADB, prExtendedPictographic}, // E15.0 [2] (🫚..🫛) ginger root..pea pod
{0x1FADC, 0x1FADF, prExtendedPictographic}, // E0.0 [4] (🫜..🫟) <reserved-1FADC>..<reserved-1FADF>
{0x1FAE0, 0x1FAE7, prExtendedPictographic}, // E14.0 [8] (🫠..🫧) melting face..bubbles
{0x1FAE8, 0x1FAEF, prExtendedPictographic}, // E0.0 [8] (🫨..🫯) <reserved-1FAE8>..<reserved-1FAEF>
{0x1FAE8, 0x1FAE8, prExtendedPictographic}, // E15.0 [1] (🫨) shaking face
{0x1FAE9, 0x1FAEF, prExtendedPictographic}, // E0.0 [7] (🫩..🫯) <reserved-1FAE9>..<reserved-1FAEF>
{0x1FAF0, 0x1FAF6, prExtendedPictographic}, // E14.0 [7] (🫰..🫶) hand with index finger and thumb crossed..heart hands
{0x1FAF7, 0x1FAFF, prExtendedPictographic}, // E0.0 [9] (🫷..🫿) <reserved-1FAF7>..<reserved-1FAFF>
{0x1FAF7, 0x1FAF8, prExtendedPictographic}, // E15.0 [2] (🫷..🫸) leftwards pushing hand..rightwards pushing hand
{0x1FAF9, 0x1FAFF, prExtendedPictographic}, // E0.0 [7] (🫹..🫿) <reserved-1FAF9>..<reserved-1FAFF>
{0x1FBF0, 0x1FBF9, prNumeric}, // Nd [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE
{0x1FC00, 0x1FFFD, prExtendedPictographic}, // E0.0[1022] (🰀..🿽) <reserved-1FC00>..<reserved-1FFFD>
{0xE0001, 0xE0001, prFormat}, // Cf LANGUAGE TAG

View file

@ -22,82 +22,121 @@ const (
wbZWJBit = 16 // This bit is set for any states followed by at least one zero-width joiner (see WB4 and WB3c).
)
// The word break parser's breaking instructions.
const (
wbDontBreak = iota
wbBreak
)
// The word break parser's state transitions. It's anologous to grTransitions,
// see comments there for details. Unicode version 14.0.0.
var wbTransitions = map[[2]int][3]int{
// wbTransitions implements the word break parser's state transitions. It's
// analogous to [grTransitions], see comments there for details.
//
// Unicode version 15.0.0.
func wbTransitions(state, prop int) (newState int, wordBreak bool, rule int) {
switch uint64(state) | uint64(prop)<<32 {
// WB3b.
{wbAny, prNewline}: {wbNewline, wbBreak, 32},
{wbAny, prCR}: {wbCR, wbBreak, 32},
{wbAny, prLF}: {wbLF, wbBreak, 32},
case wbAny | prNewline<<32:
return wbNewline, true, 32
case wbAny | prCR<<32:
return wbCR, true, 32
case wbAny | prLF<<32:
return wbLF, true, 32
// WB3a.
{wbNewline, prAny}: {wbAny, wbBreak, 31},
{wbCR, prAny}: {wbAny, wbBreak, 31},
{wbLF, prAny}: {wbAny, wbBreak, 31},
case wbNewline | prAny<<32:
return wbAny, true, 31
case wbCR | prAny<<32:
return wbAny, true, 31
case wbLF | prAny<<32:
return wbAny, true, 31
// WB3.
{wbCR, prLF}: {wbLF, wbDontBreak, 30},
case wbCR | prLF<<32:
return wbLF, false, 30
// WB3d.
{wbAny, prWSegSpace}: {wbWSegSpace, wbBreak, 9990},
{wbWSegSpace, prWSegSpace}: {wbWSegSpace, wbDontBreak, 34},
case wbAny | prWSegSpace<<32:
return wbWSegSpace, true, 9990
case wbWSegSpace | prWSegSpace<<32:
return wbWSegSpace, false, 34
// WB5.
{wbAny, prALetter}: {wbALetter, wbBreak, 9990},
{wbAny, prHebrewLetter}: {wbHebrewLetter, wbBreak, 9990},
{wbALetter, prALetter}: {wbALetter, wbDontBreak, 50},
{wbALetter, prHebrewLetter}: {wbHebrewLetter, wbDontBreak, 50},
{wbHebrewLetter, prALetter}: {wbALetter, wbDontBreak, 50},
{wbHebrewLetter, prHebrewLetter}: {wbHebrewLetter, wbDontBreak, 50},
case wbAny | prALetter<<32:
return wbALetter, true, 9990
case wbAny | prHebrewLetter<<32:
return wbHebrewLetter, true, 9990
case wbALetter | prALetter<<32:
return wbALetter, false, 50
case wbALetter | prHebrewLetter<<32:
return wbHebrewLetter, false, 50
case wbHebrewLetter | prALetter<<32:
return wbALetter, false, 50
case wbHebrewLetter | prHebrewLetter<<32:
return wbHebrewLetter, false, 50
// WB7. Transitions to wbWB7 handled by transitionWordBreakState().
{wbWB7, prALetter}: {wbALetter, wbDontBreak, 70},
{wbWB7, prHebrewLetter}: {wbHebrewLetter, wbDontBreak, 70},
case wbWB7 | prALetter<<32:
return wbALetter, false, 70
case wbWB7 | prHebrewLetter<<32:
return wbHebrewLetter, false, 70
// WB7a.
{wbHebrewLetter, prSingleQuote}: {wbAny, wbDontBreak, 71},
case wbHebrewLetter | prSingleQuote<<32:
return wbAny, false, 71
// WB7c. Transitions to wbWB7c handled by transitionWordBreakState().
{wbWB7c, prHebrewLetter}: {wbHebrewLetter, wbDontBreak, 73},
case wbWB7c | prHebrewLetter<<32:
return wbHebrewLetter, false, 73
// WB8.
{wbAny, prNumeric}: {wbNumeric, wbBreak, 9990},
{wbNumeric, prNumeric}: {wbNumeric, wbDontBreak, 80},
case wbAny | prNumeric<<32:
return wbNumeric, true, 9990
case wbNumeric | prNumeric<<32:
return wbNumeric, false, 80
// WB9.
{wbALetter, prNumeric}: {wbNumeric, wbDontBreak, 90},
{wbHebrewLetter, prNumeric}: {wbNumeric, wbDontBreak, 90},
case wbALetter | prNumeric<<32:
return wbNumeric, false, 90
case wbHebrewLetter | prNumeric<<32:
return wbNumeric, false, 90
// WB10.
{wbNumeric, prALetter}: {wbALetter, wbDontBreak, 100},
{wbNumeric, prHebrewLetter}: {wbHebrewLetter, wbDontBreak, 100},
case wbNumeric | prALetter<<32:
return wbALetter, false, 100
case wbNumeric | prHebrewLetter<<32:
return wbHebrewLetter, false, 100
// WB11. Transitions to wbWB11 handled by transitionWordBreakState().
{wbWB11, prNumeric}: {wbNumeric, wbDontBreak, 110},
case wbWB11 | prNumeric<<32:
return wbNumeric, false, 110
// WB13.
{wbAny, prKatakana}: {wbKatakana, wbBreak, 9990},
{wbKatakana, prKatakana}: {wbKatakana, wbDontBreak, 130},
case wbAny | prKatakana<<32:
return wbKatakana, true, 9990
case wbKatakana | prKatakana<<32:
return wbKatakana, false, 130
// WB13a.
{wbAny, prExtendNumLet}: {wbExtendNumLet, wbBreak, 9990},
{wbALetter, prExtendNumLet}: {wbExtendNumLet, wbDontBreak, 131},
{wbHebrewLetter, prExtendNumLet}: {wbExtendNumLet, wbDontBreak, 131},
{wbNumeric, prExtendNumLet}: {wbExtendNumLet, wbDontBreak, 131},
{wbKatakana, prExtendNumLet}: {wbExtendNumLet, wbDontBreak, 131},
{wbExtendNumLet, prExtendNumLet}: {wbExtendNumLet, wbDontBreak, 131},
case wbAny | prExtendNumLet<<32:
return wbExtendNumLet, true, 9990
case wbALetter | prExtendNumLet<<32:
return wbExtendNumLet, false, 131
case wbHebrewLetter | prExtendNumLet<<32:
return wbExtendNumLet, false, 131
case wbNumeric | prExtendNumLet<<32:
return wbExtendNumLet, false, 131
case wbKatakana | prExtendNumLet<<32:
return wbExtendNumLet, false, 131
case wbExtendNumLet | prExtendNumLet<<32:
return wbExtendNumLet, false, 131
// WB13b.
{wbExtendNumLet, prALetter}: {wbALetter, wbDontBreak, 132},
{wbExtendNumLet, prHebrewLetter}: {wbHebrewLetter, wbDontBreak, 132},
{wbExtendNumLet, prNumeric}: {wbNumeric, wbDontBreak, 132},
{wbExtendNumLet, prKatakana}: {prKatakana, wbDontBreak, 132},
case wbExtendNumLet | prALetter<<32:
return wbALetter, false, 132
case wbExtendNumLet | prHebrewLetter<<32:
return wbHebrewLetter, false, 132
case wbExtendNumLet | prNumeric<<32:
return wbNumeric, false, 132
case wbExtendNumLet | prKatakana<<32:
return wbKatakana, false, 132
default:
return -1, false, -1
}
}
// transitionWordBreakState determines the new state of the word break parser
@ -141,30 +180,27 @@ func transitionWordBreakState(state int, r rune, b []byte, str string) (newState
// Find the applicable transition in the table.
var rule int
transition, ok := wbTransitions[[2]int{state, nextProperty}]
if ok {
// We have a specific transition. We'll use it.
newState, wordBreak, rule = transition[0], transition[1] == wbBreak, transition[2]
} else {
newState, wordBreak, rule = wbTransitions(state, nextProperty)
if newState < 0 {
// No specific transition found. Try the less specific ones.
transAnyProp, okAnyProp := wbTransitions[[2]int{state, prAny}]
transAnyState, okAnyState := wbTransitions[[2]int{wbAny, nextProperty}]
if okAnyProp && okAnyState {
anyPropState, anyPropWordBreak, anyPropRule := wbTransitions(state, prAny)
anyStateState, anyStateWordBreak, anyStateRule := wbTransitions(wbAny, nextProperty)
if anyPropState >= 0 && anyStateState >= 0 {
// Both apply. We'll use a mix (see comments for grTransitions).
newState, wordBreak, rule = transAnyState[0], transAnyState[1] == wbBreak, transAnyState[2]
if transAnyProp[2] < transAnyState[2] {
wordBreak, rule = transAnyProp[1] == wbBreak, transAnyProp[2]
newState, wordBreak, rule = anyStateState, anyStateWordBreak, anyStateRule
if anyPropRule < anyStateRule {
wordBreak, rule = anyPropWordBreak, anyPropRule
}
} else if okAnyProp {
} else if anyPropState >= 0 {
// We only have a specific state.
newState, wordBreak, rule = transAnyProp[0], transAnyProp[1] == wbBreak, transAnyProp[2]
newState, wordBreak, rule = anyPropState, anyPropWordBreak, anyPropRule
// This branch will probably never be reached because anyStateState will
// always be >= 0 given the current transitions. But we keep it here
// for future modifications to the transitions where this may not be
// true anymore.
} else if okAnyState {
} else if anyStateState >= 0 {
// We only have a specific property.
newState, wordBreak, rule = transAnyState[0], transAnyState[1] == wbBreak, transAnyState[2]
newState, wordBreak, rule = anyStateState, anyStateWordBreak, anyStateRule
} else {
// No known transition. WB999: Any ÷ Any.
newState, wordBreak, rule = wbAny, true, 9990

View file

@ -5,4 +5,4 @@
package internal
// Version is the current tagged release of the library.
const Version = "0.159.0"
const Version = "0.160.0"

6
vendor/modules.txt vendored
View file

@ -112,7 +112,7 @@ github.com/VividCortex/ewma
# github.com/alecthomas/units v0.0.0-20231202071711-9a357b53e9c9
## explicit; go 1.15
github.com/alecthomas/units
# github.com/aws/aws-sdk-go v1.50.5
# github.com/aws/aws-sdk-go v1.50.6
## explicit; go 1.19
github.com/aws/aws-sdk-go/aws
github.com/aws/aws-sdk-go/aws/auth/bearer
@ -527,7 +527,7 @@ github.com/prometheus/prometheus/util/osutil
github.com/prometheus/prometheus/util/pool
github.com/prometheus/prometheus/util/testutil
github.com/prometheus/prometheus/util/zeropool
# github.com/rivo/uniseg v0.4.4
# github.com/rivo/uniseg v0.4.6
## explicit; go 1.18
github.com/rivo/uniseg
# github.com/russross/blackfriday/v2 v2.1.0
@ -701,7 +701,7 @@ golang.org/x/text/unicode/norm
# golang.org/x/time v0.5.0
## explicit; go 1.18
golang.org/x/time/rate
# google.golang.org/api v0.159.0
# google.golang.org/api v0.160.0
## explicit; go 1.19
google.golang.org/api/googleapi
google.golang.org/api/googleapi/transport