vendor syntax standard lib

This commit is contained in:
func25 2024-09-04 19:19:40 +07:00
parent 258ccfb953
commit 5da6a23cbd
No known key found for this signature in database
GPG key ID: 746D8D0E266CD0E5
15 changed files with 4837 additions and 3 deletions

View file

@ -2,10 +2,10 @@ package regexutil
import (
"regexp"
"regexp/syntax"
"strings"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/regexutil/syntax"
)
// PromRegex implements an optimized string matching for Prometheus-like regex.

View file

@ -2,8 +2,9 @@ package regexutil
import (
"regexp"
"regexp/syntax"
"strings"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/regexutil/syntax"
)
// Regex implements an optimized string matching for Go regex.

View file

@ -2,9 +2,10 @@ package regexutil
import (
"fmt"
"regexp/syntax"
"sort"
"strings"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/regexutil/syntax"
)
// RemoveStartEndAnchors removes '^' at the start of expr and '$' at the end of the expr.

View file

@ -0,0 +1,296 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package syntax
import "unicode"
// patchList is a linked list of instruction fields that still need a target
// (that need to be "patched"). The list is threaded through the unfilled
// fields themselves, so it requires no storage of its own.
// See https://swtch.com/~rsc/regexp/regexp1.html for inspiration.
//
// Entries are encoded integers rather than real pointers, which avoids
// package unsafe: an entry v refers to p.Inst[v>>1].Out when v&1 == 0 and to
// p.Inst[v>>1].Arg when v&1 == 1. The value 0 stands for the empty list; that
// is safe because every program starts with a fail instruction, whose output
// link never needs to be pointed at.
type patchList struct {
	head uint32
	tail uint32
}

// makePatchList returns the single-entry list whose head and tail are both n.
func makePatchList(n uint32) patchList {
	return patchList{head: n, tail: n}
}
// patch walks the list starting at l.head and stores val into every field on
// it. Each pending field holds the encoding of the next entry, so the link is
// read out before the field is overwritten. An empty list (head == 0) is a
// no-op.
func (l patchList) patch(p *Prog, val uint32) {
	for cur := l.head; cur != 0; {
		inst := &p.Inst[cur>>1]
		// Low bit selects which field of the instruction is pending.
		slot := &inst.Out
		if cur&1 == 1 {
			slot = &inst.Arg
		}
		cur, *slot = *slot, val
	}
}
// append returns the concatenation of l1 and l2. When either list is empty
// the other one is returned unchanged; otherwise the field designated by
// l1.tail is made to hold l2.head, linking the two chains.
func (l1 patchList) append(p *Prog, l2 patchList) patchList {
	switch {
	case l1.head == 0:
		return l2
	case l2.head == 0:
		return l1
	}

	last := &p.Inst[l1.tail>>1]
	if l1.tail&1 != 0 {
		last.Arg = l2.head
	} else {
		last.Out = l2.head
	}
	return patchList{head: l1.head, tail: l2.tail}
}
// A frag represents a compiled program fragment.
//
// The field order matters: the compiler builds frags with positional
// composite literals (see cat and plus).
type frag struct {
	i uint32 // index of first instruction; 0 marks a fragment that can never match (see fail, cat, alt)
	out patchList // where to record end instruction
	nullable bool // whether fragment can match empty string
}
// compiler holds the state of one Compile call: the program being built.
type compiler struct {
	p *Prog // program under construction; instructions are appended to p.Inst
}
// Compile turns the parsed regexp re into an executable program.
// re is expected to have been simplified already (the result of re.Simplify).
//
// The returned error is always nil in this implementation; unsupported
// operators make the internal compile step panic instead.
func Compile(re *Regexp) (*Prog, error) {
	c := compiler{}
	c.init()

	body := c.compile(re)
	// Terminate the program: every dangling exit of the body leads to a
	// freshly appended match instruction.
	match := c.inst(InstMatch)
	body.out.patch(c.p, match.i)

	c.p.Start = int(body.i)
	return c.p, nil
}
// init prepares c for a new compilation: it allocates an empty program,
// reserves the two capture slots of the implicit whole-match group ($0) and
// emits the leading fail instruction at index 0.
func (c *compiler) init() {
	c.p = &Prog{NumCap: 2}
	c.inst(InstFail)
}
// Rune range tables (lo, hi pairs) used for the "any character" operators.
var (
	// anyRuneNotNL covers every rune except '\n'.
	anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
	// anyRune covers every rune.
	anyRune = []rune{0, unicode.MaxRune}
)
// compile appends the instructions for re to c.p and returns the fragment
// describing them. Sub-expressions are compiled in the order in which they
// appear in re.Sub.
//
// It panics for any Op that has no case below; note that OpRepeat has none.
func (c *compiler) compile(re *Regexp) frag {
	nongreedy := re.Flags&NonGreedy != 0

	switch re.Op {
	case OpNoMatch:
		return c.fail()
	case OpEmptyMatch:
		return c.nop()

	case OpLiteral:
		if len(re.Rune) == 0 {
			return c.nop()
		}
		// One rune instruction per literal rune, chained left to right.
		f := c.rune(re.Rune[:1], re.Flags)
		for j := 1; j < len(re.Rune); j++ {
			f = c.cat(f, c.rune(re.Rune[j:j+1], re.Flags))
		}
		return f

	case OpCharClass:
		return c.rune(re.Rune, re.Flags)
	case OpAnyCharNotNL:
		return c.rune(anyRuneNotNL, 0)
	case OpAnyChar:
		return c.rune(anyRune, 0)

	case OpBeginLine:
		return c.empty(EmptyBeginLine)
	case OpEndLine:
		return c.empty(EmptyEndLine)
	case OpBeginText:
		return c.empty(EmptyBeginText)
	case OpEndText:
		return c.empty(EmptyEndText)
	case OpWordBoundary:
		return c.empty(EmptyWordBoundary)
	case OpNoWordBoundary:
		return c.empty(EmptyNoWordBoundary)

	case OpCapture:
		// Slot 2*Cap records the start of the group, slot 2*Cap+1 its end.
		open := c.cap(uint32(re.Cap << 1))
		body := c.compile(re.Sub[0])
		closing := c.cap(uint32(re.Cap<<1 | 1))
		return c.cat(c.cat(open, body), closing)

	case OpStar:
		return c.star(c.compile(re.Sub[0]), nongreedy)
	case OpPlus:
		return c.plus(c.compile(re.Sub[0]), nongreedy)
	case OpQuest:
		return c.quest(c.compile(re.Sub[0]), nongreedy)

	case OpConcat:
		if len(re.Sub) == 0 {
			return c.nop()
		}
		f := c.compile(re.Sub[0])
		for _, sub := range re.Sub[1:] {
			f = c.cat(f, c.compile(sub))
		}
		return f

	case OpAlternate:
		// Starting from the zero frag works because alt treats a frag with
		// i == 0 as "no alternative yet" and returns the other operand.
		var f frag
		for k := range re.Sub {
			f = c.alt(f, c.compile(re.Sub[k]))
		}
		return f
	}
	panic("regexp: unhandled case in compile")
}
// inst appends a new instruction with the given opcode to the program and
// returns a fragment that starts at it. The fragment is marked nullable and
// has an empty out list; callers adjust both as needed.
func (c *compiler) inst(op InstOp) frag {
	// TODO: impose length limit
	idx := uint32(len(c.p.Inst))
	c.p.Inst = append(c.p.Inst, Inst{Op: op})
	return frag{i: idx, nullable: true}
}
// nop emits a no-op instruction whose Out field is left to be patched.
func (c *compiler) nop() frag {
	nop := c.inst(InstNop)
	nop.out = makePatchList(nop.i << 1)
	return nop
}
// fail returns the zero fragment. Its start index 0 is what cat and alt test
// for to recognise a failing operand; no instruction is emitted.
func (c *compiler) fail() frag {
	var never frag
	return never
}
// cap emits a capture instruction that records into slot arg, and grows the
// program's NumCap so that it covers that slot.
func (c *compiler) cap(arg uint32) frag {
	f := c.inst(InstCapture)
	c.p.Inst[f.i].Arg = arg
	f.out = makePatchList(f.i << 1)

	if need := int(arg) + 1; need > c.p.NumCap {
		c.p.NumCap = need
	}
	return f
}
// cat returns the concatenation f1 f2: every exit of f1 is patched to enter
// f2. If either operand is a failure (start index 0) the result is a failure
// too and nothing is patched.
func (c *compiler) cat(f1, f2 frag) frag {
	if f1.i != 0 && f2.i != 0 {
		// TODO: elide nop
		f1.out.patch(c.p, f2.i)
		return frag{i: f1.i, out: f2.out, nullable: f1.nullable && f2.nullable}
	}
	// concat of failure is failure
	return frag{}
}
// alt returns the alternation f1|f2, trying f1 first. When one operand is a
// failure (start index 0) the other is returned as is and no instruction is
// emitted.
func (c *compiler) alt(f1, f2 frag) frag {
	// alt of failure is other
	switch {
	case f1.i == 0:
		return f2
	case f2.i == 0:
		return f1
	}

	f := c.inst(InstAlt)
	branch := &c.p.Inst[f.i]
	branch.Out, branch.Arg = f1.i, f2.i
	f.out = f1.out.append(c.p, f2.out)
	f.nullable = f1.nullable || f2.nullable
	return f
}
// quest returns the fragment for f1? (or f1?? when nongreedy). An Alt
// instruction chooses between entering f1 and skipping it: the greedy form
// enters f1 through Out, the non-greedy form through Arg. The other field is
// left pending and joins f1's exits in the result's out list.
func (c *compiler) quest(f1 frag, nongreedy bool) frag {
	f := c.inst(InstAlt)
	branch := &c.p.Inst[f.i]

	skip := f.i << 1 // pending Out field (non-greedy)
	if nongreedy {
		branch.Arg = f1.i
	} else {
		branch.Out = f1.i
		skip |= 1 // pending Arg field (greedy)
	}
	f.out = makePatchList(skip).append(c.p, f1.out)
	return f
}
// loop builds the main loop shared by plus and star: an Alt instruction that
// either re-enters f1 or leaves, with every exit of f1 patched back to it.
//
//   - For plus, the result is usable once the entry point is changed to f1.i.
//   - For star, the result is usable directly, but only when f1 cannot match
//     the empty string. (When f1 can match an empty string, f1* must be
//     implemented as (f1+)? to get the priority match order correct.)
func (c *compiler) loop(f1 frag, nongreedy bool) frag {
	f := c.inst(InstAlt)
	branch := &c.p.Inst[f.i]

	exit := f.i << 1 // pending Out field (non-greedy)
	if nongreedy {
		branch.Arg = f1.i
	} else {
		branch.Out = f1.i
		exit |= 1 // pending Arg field (greedy)
	}
	f.out = makePatchList(exit)

	// Close the loop: finishing f1 returns to the Alt.
	f1.out.patch(c.p, f.i)
	return f
}
// star returns the fragment for f1* (or f1*? when nongreedy).
func (c *compiler) star(f1 frag, nongreedy bool) frag {
	if !f1.nullable {
		return c.loop(f1, nongreedy)
	}
	// Use (f1+)? to get priority match order correct.
	// See golang.org/issue/46123.
	return c.quest(c.plus(f1, nongreedy), nongreedy)
}
// plus returns the fragment for f1+ (or f1+? when nongreedy): the loop built
// around f1, entered at f1 itself so that f1 runs at least once.
func (c *compiler) plus(f1 frag, nongreedy bool) frag {
	body := c.loop(f1, nongreedy)
	return frag{i: f1.i, out: body.out, nullable: f1.nullable}
}
// empty emits an empty-width assertion instruction for op; the assertion
// kind is stored in the instruction's Arg and its Out field is left pending.
func (c *compiler) empty(op EmptyOp) frag {
	f := c.inst(InstEmptyWidth)
	f.out = makePatchList(f.i << 1)
	c.p.Inst[f.i].Arg = uint32(op)
	return f
}
// rune emits an instruction that matches a single rune described by r and
// returns a non-nullable fragment for it. r is stored in the instruction as
// given, without copying.
//
// Of flags only FoldCase is kept, and only when r is a single rune that
// actually has a case-folded counterpart. The generic InstRune opcode is then
// narrowed to a specialised one where the exec machine has a fast path.
func (c *compiler) rune(r []rune, flags Flags) frag {
	f := c.inst(InstRune)
	f.nullable = false
	f.out = makePatchList(f.i << 1)

	// only relevant flag is FoldCase
	fold := flags & FoldCase
	if len(r) != 1 || unicode.SimpleFold(r[0]) == r[0] {
		// and sometimes not even that
		fold &^= FoldCase
	}

	in := &c.p.Inst[f.i]
	in.Rune = r
	in.Arg = uint32(fold)

	// Special cases for exec machine.
	if fold&FoldCase == 0 && (len(r) == 1 || len(r) == 2 && r[0] == r[1]) {
		in.Op = InstRune1
	} else if len(r) == 2 && r[0] == 0 && r[1] == unicode.MaxRune {
		in.Op = InstRuneAny
	} else if len(r) == 4 && r[0] == 0 && r[1] == '\n'-1 && r[2] == '\n'+1 && r[3] == unicode.MaxRune {
		in.Op = InstRuneAnyNotNL
	}
	return f
}

142
lib/regexutil/syntax/doc.go Normal file
View file

@ -0,0 +1,142 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Code generated by mksyntaxgo from the RE2 distribution. DO NOT EDIT.
/*
Package syntax parses regular expressions into parse trees and compiles
parse trees into programs. Most clients of regular expressions will use the
facilities of package [regexp] (such as [regexp.Compile] and [regexp.Match]) instead of this package.
# Syntax
The regular expression syntax understood by this package when parsing with the [Perl] flag is as follows.
Parts of the syntax can be disabled by passing alternate flags to [Parse].
Single characters:
. any character, possibly including newline (flag s=true)
[xyz] character class
[^xyz] negated character class
\d Perl character class
\D negated Perl character class
[[:alpha:]] ASCII character class
[[:^alpha:]] negated ASCII character class
\pN Unicode character class (one-letter name)
\p{Greek} Unicode character class
\PN negated Unicode character class (one-letter name)
\P{Greek} negated Unicode character class
Composites:
xy x followed by y
x|y x or y (prefer x)
Repetitions:
x* zero or more x, prefer more
x+ one or more x, prefer more
x? zero or one x, prefer one
x{n,m} n or n+1 or ... or m x, prefer more
x{n,} n or more x, prefer more
x{n} exactly n x
x*? zero or more x, prefer fewer
x+? one or more x, prefer fewer
x?? zero or one x, prefer zero
x{n,m}? n or n+1 or ... or m x, prefer fewer
x{n,}? n or more x, prefer fewer
x{n}? exactly n x
Implementation restriction: The counting forms x{n,m}, x{n,}, and x{n}
reject forms that create a minimum or maximum repetition count above 1000.
Unlimited repetitions are not subject to this restriction.
Grouping:
(re) numbered capturing group (submatch)
(?P<name>re) named & numbered capturing group (submatch)
(?<name>re) named & numbered capturing group (submatch)
(?:re) non-capturing group
(?flags) set flags within current group; non-capturing
(?flags:re) set flags during re; non-capturing
Flag syntax is xyz (set) or -xyz (clear) or xy-z (set xy, clear z). The flags are:
i case-insensitive (default false)
m multi-line mode: ^ and $ match begin/end line in addition to begin/end text (default false)
s let . match \n (default false)
U ungreedy: swap meaning of x* and x*?, x+ and x+?, etc (default false)
Empty strings:
^ at beginning of text or line (flag m=true)
$ at end of text (like \z not \Z) or line (flag m=true)
\A at beginning of text
\b at ASCII word boundary (\w on one side and \W, \A, or \z on the other)
\B not at ASCII word boundary
\z at end of text
Escape sequences:
\a bell (== \007)
\f form feed (== \014)
\t horizontal tab (== \011)
\n newline (== \012)
\r carriage return (== \015)
\v vertical tab character (== \013)
\* literal *, for any punctuation character *
\123 octal character code (up to three digits)
\x7F hex character code (exactly two digits)
\x{10FFFF} hex character code
\Q...\E literal text ... even if ... has punctuation
Character class elements:
x single character
A-Z character range (inclusive)
\d Perl character class
[:foo:] ASCII character class foo
\p{Foo} Unicode character class Foo
\pF Unicode character class F (one-letter name)
Named character classes as character class elements:
[\d] digits (== \d)
[^\d] not digits (== \D)
[\D] not digits (== \D)
[^\D] not not digits (== \d)
[[:name:]] named ASCII class inside character class (== [:name:])
[^[:name:]] named ASCII class inside negated character class (== [:^name:])
[\p{Name}] named Unicode property inside character class (== \p{Name})
[^\p{Name}] named Unicode property inside negated character class (== \P{Name})
Perl character classes (all ASCII-only):
\d digits (== [0-9])
\D not digits (== [^0-9])
\s whitespace (== [\t\n\f\r ])
\S not whitespace (== [^\t\n\f\r ])
\w word characters (== [0-9A-Za-z_])
\W not word characters (== [^0-9A-Za-z_])
ASCII character classes:
[[:alnum:]] alphanumeric (== [0-9A-Za-z])
[[:alpha:]] alphabetic (== [A-Za-z])
[[:ascii:]] ASCII (== [\x00-\x7F])
[[:blank:]] blank (== [\t ])
[[:cntrl:]] control (== [\x00-\x1F\x7F])
[[:digit:]] digits (== [0-9])
[[:graph:]] graphical (== [!-~] == [A-Za-z0-9!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~])
[[:lower:]] lower case (== [a-z])
[[:print:]] printable (== [ -~] == [ [:graph:]])
[[:punct:]] punctuation (== [!-/:-@[-`{-~])
[[:space:]] whitespace (== [\t\n\v\f\r ])
[[:upper:]] upper case (== [A-Z])
[[:word:]] word characters (== [0-9A-Za-z_])
[[:xdigit:]] hex digit (== [0-9A-Fa-f])
Unicode character classes are those in [unicode.Categories] and [unicode.Scripts].
*/
package syntax

View file

@ -0,0 +1,128 @@
#!/usr/bin/perl
# Copyright 2008 The Go Authors. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
# Modified version of RE2's make_perl_groups.pl.
# Generate table entries giving character ranges
# for POSIX/Perl character classes. Rather than
# figure out what the definition is, it is easier to ask
# Perl about each letter from 0-128 and write down
# its answer.
use strict;
use warnings;
# POSIX bracket classes to generate tables for, in output order.
my @posixclasses = qw(
	[:alnum:]
	[:alpha:]
	[:ascii:]
	[:blank:]
	[:cntrl:]
	[:digit:]
	[:graph:]
	[:lower:]
	[:print:]
	[:punct:]
	[:space:]
	[:upper:]
	[:word:]
	[:xdigit:]
);

# Perl escape classes to generate tables for, in output order.
my @perlclasses = ('\d', '\s', '\w');

# Answers that take precedence over what the running Perl reports,
# keyed by "<class>:<code point>".
my %overrides = (
	# Prior to Perl 5.18, \s did not match vertical tab.
	# RE2 preserves that original behaviour.
	'\s:11' => 0,
);
# ComputeClass returns the list of [lo, hi] code point ranges (inclusive)
# matched by the given class. Code points 0..128 are probed one by one: an
# entry in %overrides wins, otherwise the running Perl is asked by matching
# the character against "[<class>]".
sub ComputeClass($) {
	my ($class) = @_;
	my $regexp = "[$class]";
	my @ranges;
	my $start = -1;
	# 256 is never a member; visiting it last closes a range that is still
	# open after code point 128 (such a range ends at 255).
	foreach my $i (0 .. 128, 256) {
		my $member = $i <= 128 && ($overrides{"$class:$i"} // (chr($i) =~ $regexp));
		if ($member) {
			$start = $i if $start < 0;
			next;
		}
		push @ranges, [$start, $i - 1] if $start >= 0;
		$start = -1;
	}
	return @ranges;
}
# PrintClass prints the Go rune table "code<cname>" holding @ranges to the
# currently selected output handle and returns the two Go map entries (class
# and negated class) that refer to it. The negated name is formed by
# inserting "^" after the first ":" of a POSIX class name, or by upper-casing
# a Perl class name.
sub PrintClass($$@) {
	my ($cname, $name, @ranges) = @_;

	print "var code$cname = []rune{ /* $name */\n";
	foreach my $range (@ranges) {
		my ($lo, $hi) = @$range;
		printf "\t0x%x, 0x%x,\n", $lo, $hi;
	}
	print "}\n\n";

	my $negname = $name;
	# The substitution succeeds exactly when the name contains a ":".
	unless ($negname =~ s/:/:^/) {
		$negname =~ tr/a-z/A-Z/;
	}

	return "\t`$name`: {+1, code$cname},\n" .
		"\t`$negname`: {-1, code$cname},\n";
}
# Running number used to name the generated codeN tables; shared by all
# PrintClasses calls so that the names stay unique.
my $gen = 0;

# PrintClasses prints one rune table per class, followed by the Go map
# "<cname>Group" that maps every class name (and its negation) to its table.
# It returns the number of map entry pairs printed.
sub PrintClasses($@) {
	my ($cname, @classes) = @_;

	my @entries;
	foreach my $class (@classes) {
		push @entries, PrintClass(++$gen, $class, ComputeClass($class));
	}

	print "var ${cname}Group = map[string]charGroup{\n";
	print @entries;
	print "}\n";

	return scalar @entries;
}
# Prepare the gofmt command. When the first argument names a .go file the
# formatted output is written to that file; otherwise it goes to standard
# output.
my $gofmt;
if (@ARGV > 0 && $ARGV[0] =~ /\.go$/) {
	# Reopen STDOUT on the target file and let gofmt inherit it, rather
	# than splicing the file name unquoted into a shell command line.
	open(STDOUT, '>', $ARGV[0]) or die "cannot write $ARGV[0]: $!\n";
}
open($gofmt, '|-', 'gofmt') or die "cannot start gofmt: $!\n";

# Make the gofmt pipe the default output handle, so that every plain
# print below feeds gofmt.
select $gofmt;
print <<EOF;
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Code generated by make_perl_groups.pl; DO NOT EDIT.
package syntax
EOF
PrintClasses("perl", @perlclasses);
PrintClasses("posix", @posixclasses);

# Closing the pipe waits for gofmt and reports its exit status; without this
# a gofmt failure would go unnoticed and the script would still exit 0.
close($gofmt)
	or die($! ? "closing gofmt pipe: $!\n" : "gofmt exited with status " . ($? >> 8) . "\n");

View file

@ -0,0 +1,52 @@
// Code generated by "stringer -type Op -trimprefix Op"; DO NOT EDIT.
package syntax
import "strconv"
// _ is a compile-time guard only. Each index expression below is in range
// for the one-element array x only if the named Op constant still has the
// value this file was generated for.
func _() {
	// An "invalid array index" compiler error signifies that the constant values have changed.
	// Re-run the stringer command to generate them again.
	var x [1]struct{}
	_ = x[OpNoMatch-1]
	_ = x[OpEmptyMatch-2]
	_ = x[OpLiteral-3]
	_ = x[OpCharClass-4]
	_ = x[OpAnyCharNotNL-5]
	_ = x[OpAnyChar-6]
	_ = x[OpBeginLine-7]
	_ = x[OpEndLine-8]
	_ = x[OpBeginText-9]
	_ = x[OpEndText-10]
	_ = x[OpWordBoundary-11]
	_ = x[OpNoWordBoundary-12]
	_ = x[OpCapture-13]
	_ = x[OpStar-14]
	_ = x[OpPlus-15]
	_ = x[OpQuest-16]
	_ = x[OpRepeat-17]
	_ = x[OpConcat-18]
	_ = x[OpAlternate-19]
	_ = x[opPseudo-128]
}
// _Op_name_0 is the concatenation of the names of the Op values 1 through 19
// (prefix "Op" trimmed); _Op_name_1 is the name of the value 128.
const (
	_Op_name_0 = "NoMatchEmptyMatchLiteralCharClassAnyCharNotNLAnyCharBeginLineEndLineBeginTextEndTextWordBoundaryNoWordBoundaryCaptureStarPlusQuestRepeatConcatAlternate"
	_Op_name_1 = "opPseudo"
)

// _Op_index_0 holds the byte offsets into _Op_name_0 at which each name
// starts, plus the total length as a final entry: the name of Op value k is
// _Op_name_0[_Op_index_0[k-1]:_Op_index_0[k]].
var (
	_Op_index_0 = [...]uint8{0, 7, 17, 24, 33, 45, 52, 61, 68, 77, 84, 96, 110, 117, 121, 125, 130, 136, 142, 151}
)
// String returns the name of the operator without its "Op" prefix for the
// values 1 through 19, "opPseudo" for 128, and "Op(N)" with the decimal
// value N for anything else.
func (i Op) String() string {
	if 1 <= i && i <= 19 {
		lo, hi := _Op_index_0[i-1], _Op_index_0[i]
		return _Op_name_0[lo:hi]
	}
	if i == 128 {
		return _Op_name_1
	}
	return "Op(" + strconv.FormatInt(int64(i), 10) + ")"
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,628 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package syntax
import (
"fmt"
"strings"
"testing"
"unicode"
)
// parseTest pairs a regular expression with the dump expected for its parse
// tree: Regexp is the pattern handed to Parse, Dump the expected output of
// dump. An empty Dump means the pattern only has to parse.
type parseTest struct {
	Regexp, Dump string
}
// parseTests is the main parser table. TestParseSimple parses every entry
// with testFlags and compares the dump of the result with Dump;
// TestToStringEquivalentParse additionally checks that printing the parse
// tree and parsing it again gives the same dump. Entries with an empty Dump
// only have to parse without error.
var parseTests = []parseTest{
	// Base cases
	{`a`, `lit{a}`},
	{`a.`, `cat{lit{a}dot{}}`},
	{`a.b`, `cat{lit{a}dot{}lit{b}}`},
	{`ab`, `str{ab}`},
	{`a.b.c`, `cat{lit{a}dot{}lit{b}dot{}lit{c}}`},
	{`abc`, `str{abc}`},
	{`a|^`, `alt{lit{a}bol{}}`},
	{`a|b`, `cc{0x61-0x62}`},
	{`(a)`, `cap{lit{a}}`},
	{`(a)|b`, `alt{cap{lit{a}}lit{b}}`},
	{`a*`, `star{lit{a}}`},
	{`a+`, `plus{lit{a}}`},
	{`a?`, `que{lit{a}}`},
	{`a{2}`, `rep{2,2 lit{a}}`},
	{`a{2,3}`, `rep{2,3 lit{a}}`},
	{`a{2,}`, `rep{2,-1 lit{a}}`},
	{`a*?`, `nstar{lit{a}}`},
	{`a+?`, `nplus{lit{a}}`},
	{`a??`, `nque{lit{a}}`},
	{`a{2}?`, `nrep{2,2 lit{a}}`},
	{`a{2,3}?`, `nrep{2,3 lit{a}}`},
	{`a{2,}?`, `nrep{2,-1 lit{a}}`},
	// Malformed { } are treated as literals.
	{`x{1001`, `str{x{1001}`},
	{`x{9876543210`, `str{x{9876543210}`},
	{`x{9876543210,`, `str{x{9876543210,}`},
	{`x{2,1`, `str{x{2,1}`},
	{`x{1,9876543210`, `str{x{1,9876543210}`},
	{``, `emp{}`},
	{`|`, `emp{}`}, // alt{emp{}emp{}} but got factored
	{`|x|`, `alt{emp{}lit{x}emp{}}`},
	{`.`, `dot{}`},
	{`^`, `bol{}`},
	{`$`, `eol{}`},
	{`\|`, `lit{|}`},
	{`\(`, `lit{(}`},
	{`\)`, `lit{)}`},
	{`\*`, `lit{*}`},
	{`\+`, `lit{+}`},
	{`\?`, `lit{?}`},
	{`{`, `lit{{}`},
	{`}`, `lit{}}`},
	{`\.`, `lit{.}`},
	{`\^`, `lit{^}`},
	{`\$`, `lit{$}`},
	{`\\`, `lit{\}`},
	{`[ace]`, `cc{0x61 0x63 0x65}`},
	{`[abc]`, `cc{0x61-0x63}`},
	{`[a-z]`, `cc{0x61-0x7a}`},
	{`[a]`, `lit{a}`},
	{`\-`, `lit{-}`},
	{`-`, `lit{-}`},
	{`\_`, `lit{_}`},
	{`abc`, `str{abc}`},
	{`abc|def`, `alt{str{abc}str{def}}`},
	{`abc|def|ghi`, `alt{str{abc}str{def}str{ghi}}`},
	// Posix and Perl extensions
	{`[[:lower:]]`, `cc{0x61-0x7a}`},
	{`[a-z]`, `cc{0x61-0x7a}`},
	{`[^[:lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`},
	{`[[:^lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`},
	{`(?i)[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
	{`(?i)[a-z]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
	{`(?i)[^[:lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`},
	{`(?i)[[:^lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`},
	{`\d`, `cc{0x30-0x39}`},
	{`\D`, `cc{0x0-0x2f 0x3a-0x10ffff}`},
	{`\s`, `cc{0x9-0xa 0xc-0xd 0x20}`},
	{`\S`, `cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}`},
	{`\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}`},
	{`\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}`},
	{`(?i)\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}`},
	{`(?i)\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`},
	{`[^\\]`, `cc{0x0-0x5b 0x5d-0x10ffff}`},
	// { `\C`, `byte{}` }, // probably never
	// Unicode, negatives, and a double negative.
	{`\p{Braille}`, `cc{0x2800-0x28ff}`},
	{`\P{Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
	{`\p{^Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
	{`\P{^Braille}`, `cc{0x2800-0x28ff}`},
	{`\pZ`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`},
	{`[\p{Braille}]`, `cc{0x2800-0x28ff}`},
	{`[\P{Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
	{`[\p{^Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
	{`[\P{^Braille}]`, `cc{0x2800-0x28ff}`},
	{`[\pZ]`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`},
	{`\p{Lu}`, mkCharClass(unicode.IsUpper)},
	{`[\p{Lu}]`, mkCharClass(unicode.IsUpper)},
	{`(?i)[\p{Lu}]`, mkCharClass(isUpperFold)},
	{`\p{Any}`, `dot{}`},
	{`\p{^Any}`, `cc{}`},
	// Hex, octal.
	{`[\012-\234]\141`, `cat{cc{0xa-0x9c}lit{a}}`},
	{`[\x{41}-\x7a]\x61`, `cat{cc{0x41-0x7a}lit{a}}`},
	// More interesting regular expressions.
	{`a{,2}`, `str{a{,2}}`},
	{`\.\^\$\\`, `str{.^$\}`},
	{`[a-zABC]`, `cc{0x41-0x43 0x61-0x7a}`},
	{`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`},
	{`[α-ε☺]`, `cc{0x3b1-0x3b5 0x263a}`}, // utf-8
	{`a*{`, `cat{star{lit{a}}lit{{}}`},
	// Test precedences
	{`(?:ab)*`, `star{str{ab}}`},
	{`(ab)*`, `star{cap{str{ab}}}`},
	{`ab|cd`, `alt{str{ab}str{cd}}`},
	{`a(b|c)d`, `cat{lit{a}cap{cc{0x62-0x63}}lit{d}}`},
	// Test flattening.
	{`(?:a)`, `lit{a}`},
	{`(?:ab)(?:cd)`, `str{abcd}`},
	{`(?:a+b+)(?:c+d+)`, `cat{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`},
	{`(?:a+|b+)|(?:c+|d+)`, `alt{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`},
	{`(?:a|b)|(?:c|d)`, `cc{0x61-0x64}`},
	{`a|.`, `dot{}`},
	{`.|a`, `dot{}`},
	{`(?:[abc]|A|Z|hello|world)`, `alt{cc{0x41 0x5a 0x61-0x63}str{hello}str{world}}`},
	{`(?:[abc]|A|Z)`, `cc{0x41 0x5a 0x61-0x63}`},
	// Test Perl quoted literals
	{`\Q+|*?{[\E`, `str{+|*?{[}`},
	{`\Q+\E+`, `plus{lit{+}}`},
	{`\Qab\E+`, `cat{lit{a}plus{lit{b}}}`},
	{`\Q\\E`, `lit{\}`},
	{`\Q\\\E`, `str{\\}`},
	// Test Perl \A and \z
	{`(?m)^`, `bol{}`},
	{`(?m)$`, `eol{}`},
	{`(?-m)^`, `bot{}`},
	{`(?-m)$`, `eot{}`},
	{`(?m)\A`, `bot{}`},
	{`(?m)\z`, `eot{\z}`},
	{`(?-m)\A`, `bot{}`},
	{`(?-m)\z`, `eot{\z}`},
	// Test named captures
	{`(?P<name>a)`, `cap{name:lit{a}}`},
	{`(?<name>a)`, `cap{name:lit{a}}`},
	// Case-folded literals
	{`[Aa]`, `litfold{A}`},
	{`[\x{100}\x{101}]`, `litfold{Ā}`},
	{`[Δδ]`, `litfold{Δ}`},
	// Strings
	{`abcde`, `str{abcde}`},
	{`[Aa][Bb]cd`, `cat{strfold{AB}str{cd}}`},
	// Factoring.
	{`abc|abd|aef|bcx|bcy`, `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}cat{str{bc}cc{0x78-0x79}}}`},
	{`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}lit{y}}cat{plus{lit{x}}lit{z}}cat{plus{lit{y}}lit{w}}}}`},
	// Bug fixes.
	{`(?:.)`, `dot{}`},
	{`(?:x|(?:xa))`, `cat{lit{x}alt{emp{}lit{a}}}`},
	{`(?:.|(?:.a))`, `cat{dot{}alt{emp{}lit{a}}}`},
	{`(?:A(?:A|a))`, `cat{lit{A}litfold{A}}`},
	{`(?:A|a)`, `litfold{A}`},
	{`A|(?:A|a)`, `litfold{A}`},
	{`(?s).`, `dot{}`},
	{`(?-s).`, `dnl{}`},
	{`(?:(?:^).)`, `cat{bol{}dot{}}`},
	{`(?-s)(?:(?:^).)`, `cat{bol{}dnl{}}`},
	{`[\s\S]a`, `cat{cc{0x0-0x10ffff}lit{a}}`},
	// RE2 prefix_tests
	{`abc|abd`, `cat{str{ab}cc{0x63-0x64}}`},
	{`a(?:b)c|abd`, `cat{str{ab}cc{0x63-0x64}}`},
	{`abc|abd|aef|bcx|bcy`,
		`alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}` +
		`cat{str{bc}cc{0x78-0x79}}}`},
	{`abc|x|abd`, `alt{str{abc}lit{x}str{abd}}`},
	{`(?i)abc|ABD`, `cat{strfold{AB}cc{0x43-0x44 0x63-0x64}}`},
	{`[ab]c|[ab]d`, `cat{cc{0x61-0x62}cc{0x63-0x64}}`},
	{`.c|.d`, `cat{dot{}cc{0x63-0x64}}`},
	{`x{2}|x{2}[0-9]`,
		`cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}`},
	{`x{2}y|x{2}[0-9]y`,
		`cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}`},
	{`a.*?c|a.*?b`,
		`cat{lit{a}alt{cat{nstar{dot{}}lit{c}}cat{nstar{dot{}}lit{b}}}}`},
	// Valid repetitions.
	{`((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}))`, ``},
	{`((((((((((x{1}){2}){2}){2}){2}){2}){2}){2}){2}){2})`, ``},
	// Valid nesting.
	{strings.Repeat("(", 999) + strings.Repeat(")", 999), ``},
	{strings.Repeat("(?:", 999) + strings.Repeat(")*", 999), ``},
	{"(" + strings.Repeat("|", 12345) + ")", ``}, // not nested at all
}
// testFlags are the parser flags used for the parseTests table.
const testFlags = MatchNL | PerlX | UnicodeGroups

// TestParseSimple checks every parseTests entry under testFlags.
func TestParseSimple(t *testing.T) {
	testParseDump(t, parseTests, testFlags)
}
// foldcaseTests lists patterns and their expected dumps when parsed with
// only the FoldCase flag set.
var foldcaseTests = []parseTest{
	{`AbCdE`, `strfold{ABCDE}`},
	{`[Aa]`, `litfold{A}`},
	{`a`, `litfold{A}`},
	// 0x17F is an old English long s (looks like an f) and folds to s.
	// 0x212A is the Kelvin symbol and folds to k.
	{`A[F-g]`, `cat{litfold{A}cc{0x41-0x7a 0x17f 0x212a}}`}, // [Aa][A-z...]
	{`[[:upper:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
	{`[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
}

// TestParseFoldCase checks every foldcaseTests entry under the FoldCase flag.
func TestParseFoldCase(t *testing.T) {
	testParseDump(t, foldcaseTests, FoldCase)
}
// literalTests holds a pattern full of metacharacters that, parsed with the
// Literal flag, is expected to come out as one literal string.
var literalTests = []parseTest{
	{"(|)^$.[*+?]{5,10},\\", "str{(|)^$.[*+?]{5,10},\\}"},
}

// TestParseLiteral checks every literalTests entry under the Literal flag.
func TestParseLiteral(t *testing.T) {
	testParseDump(t, literalTests, Literal)
}
// matchnlTests lists patterns and their expected dumps when parsed with the
// MatchNL flag: `.` and the negated class include newline.
var matchnlTests = []parseTest{
	{`.`, `dot{}`},
	{"\n", "lit{\n}"},
	{`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`},
	{`[a\n]`, `cc{0xa 0x61}`},
}

// TestParseMatchNL checks every matchnlTests entry under the MatchNL flag.
func TestParseMatchNL(t *testing.T) {
	testParseDump(t, matchnlTests, MatchNL)
}
// nomatchnlTests lists patterns and their expected dumps when parsed with no
// flags: `.` and the negated class exclude newline (0xa), while an explicit
// newline still matches.
var nomatchnlTests = []parseTest{
	{`.`, `dnl{}`},
	{"\n", "lit{\n}"},
	{`[^a]`, `cc{0x0-0x9 0xb-0x60 0x62-0x10ffff}`},
	{`[a\n]`, `cc{0xa 0x61}`},
}

// TestParseNoMatchNL checks every nomatchnlTests entry with no flags set.
func TestParseNoMatchNL(t *testing.T) {
	testParseDump(t, nomatchnlTests, 0)
}
// testParseDump parses every entry of tests with the given flags and reports
// an error when parsing fails or when the dump of the parse tree differs
// from the entry's Dump. Entries with an empty Dump only have to parse.
func testParseDump(t *testing.T, tests []parseTest, flags Flags) {
	for i := range tests {
		tc := &tests[i]
		re, err := Parse(tc.Regexp, flags)
		switch {
		case err != nil:
			t.Errorf("Parse(%#q): %v", tc.Regexp, err)
		case tc.Dump == "":
			// It parsed. That's all we care about.
		default:
			if got := dump(re); got != tc.Dump {
				t.Errorf("Parse(%#q).Dump() = %#q want %#q", tc.Regexp, got, tc.Dump)
			}
		}
	}
}
// dump returns a textual rendering of re that makes the structure of the
// parse tree explicit (see dumpRegexp for the encoding).
func dump(re *Regexp) string {
	b := new(strings.Builder)
	dumpRegexp(b, re)
	return b.String()
}
// opNames maps each Op to the short name dumpRegexp prints for it. An Op
// outside the slice, or one with an empty name, is printed as "op<number>".
var opNames = []string{
	OpNoMatch: "no",
	OpEmptyMatch: "emp",
	OpLiteral: "lit",
	OpCharClass: "cc",
	OpAnyCharNotNL: "dnl",
	OpAnyChar: "dot",
	OpBeginLine: "bol",
	OpEndLine: "eol",
	OpBeginText: "bot",
	OpEndText: "eot",
	OpWordBoundary: "wb",
	OpNoWordBoundary: "nwb",
	OpCapture: "cap",
	OpStar: "star",
	OpPlus: "plus",
	OpQuest: "que",
	OpRepeat: "rep",
	OpConcat: "cat",
	OpAlternate: "alt",
}
// dumpRegexp appends to b an encoding of the syntax tree rooted at re, in the
// form name{contents}. Tests use it to tell apart parses that would print the
// same through re's String method.
//
// The name comes from opNames, with these refinements: non-greedy
// repetitions get an "n" prefix, a literal is "str" when it has more than one
// rune and "lit" otherwise, and a literal gets a "fold" suffix when FoldCase
// is set and at least one of its runes has a case-folded counterpart.
func dumpRegexp(b *strings.Builder, re *Regexp) {
	named := int(re.Op) < len(opNames) && opNames[re.Op] != ""
	switch {
	case !named:
		fmt.Fprintf(b, "op%d", re.Op)
	case re.Op == OpLiteral:
		kind := "lit"
		if len(re.Rune) > 1 {
			kind = "str"
		}
		b.WriteString(kind)
		if re.Flags&FoldCase != 0 {
			for _, r := range re.Rune {
				if unicode.SimpleFold(r) != r {
					b.WriteString("fold")
					break
				}
			}
		}
	case re.Op == OpStar || re.Op == OpPlus || re.Op == OpQuest || re.Op == OpRepeat:
		if re.Flags&NonGreedy != 0 {
			b.WriteByte('n')
		}
		b.WriteString(opNames[re.Op])
	default:
		b.WriteString(opNames[re.Op])
	}

	b.WriteByte('{')
	switch re.Op {
	case OpLiteral:
		b.WriteString(string(re.Rune))
	case OpCharClass:
		// Rune holds lo/hi pairs; a single-rune range prints as one number.
		for i := 0; i < len(re.Rune); i += 2 {
			if i > 0 {
				b.WriteByte(' ')
			}
			lo, hi := re.Rune[i], re.Rune[i+1]
			if lo == hi {
				fmt.Fprintf(b, "%#x", lo)
			} else {
				fmt.Fprintf(b, "%#x-%#x", lo, hi)
			}
		}
	case OpEndText:
		// Distinguish an explicit \z from a $ that was parsed as end of text.
		if re.Flags&WasDollar == 0 {
			b.WriteString(`\z`)
		}
	case OpCapture:
		if re.Name != "" {
			b.WriteString(re.Name)
			b.WriteByte(':')
		}
		dumpRegexp(b, re.Sub[0])
	case OpRepeat:
		fmt.Fprintf(b, "%d,%d ", re.Min, re.Max)
		dumpRegexp(b, re.Sub[0])
	case OpStar, OpPlus, OpQuest:
		dumpRegexp(b, re.Sub[0])
	case OpConcat, OpAlternate:
		for k := range re.Sub {
			dumpRegexp(b, re.Sub[k])
		}
	}
	b.WriteByte('}')
}
func mkCharClass(f func(rune) bool) string {
re := &Regexp{Op: OpCharClass}
lo := rune(-1)
for i := rune(0); i <= unicode.MaxRune; i++ {
if f(i) {
if lo < 0 {
lo = i
}
} else {
if lo >= 0 {
re.Rune = append(re.Rune, lo, i-1)
lo = -1
}
}
}
if lo >= 0 {
re.Rune = append(re.Rune, lo, unicode.MaxRune)
}
return dump(re)
}
// isUpperFold reports whether r, or any rune that r is equivalent to under
// simple case folding, is an upper case letter.
func isUpperFold(r rune) bool {
	if unicode.IsUpper(r) {
		return true
	}
	// Walk the rest of r's folding orbit until it wraps around to r.
	for c := unicode.SimpleFold(r); c != r; c = unicode.SimpleFold(c) {
		if unicode.IsUpper(c) {
			return true
		}
	}
	return false
}
// TestFoldConstants checks that minFold and maxFold are the smallest and the
// largest rune, respectively, that unicode.SimpleFold maps to a different
// rune.
func TestFoldConstants(t *testing.T) {
	last := rune(-1)
	for r := rune(0); r <= unicode.MaxRune; r++ {
		if unicode.SimpleFold(r) != r {
			// last is still -1 only for the first folding rune found.
			if last == -1 && minFold != r {
				t.Errorf("minFold=%#U should be %#U", minFold, r)
			}
			last = r
		}
	}
	if maxFold != last {
		t.Errorf("maxFold=%#U should be %#U", maxFold, last)
	}
}
// TestAppendRangeCollapse feeds appendRange the single-rune ranges A, a, B,
// b, ..., Z, z and expects the result to be just the two ranges A-Z and a-z.
func TestAppendRangeCollapse(t *testing.T) {
	// AppendRange should collapse each of the new ranges
	// into the earlier ones (it looks back two ranges), so that
	// the slice never grows very large.
	// Note that we are not calling cleanClass.
	const toLower = 'a' - 'A'
	var r []rune
	for upper := rune('A'); upper <= 'Z'; upper++ {
		r = appendRange(r, upper, upper)
		lower := upper + toLower
		r = appendRange(r, lower, lower)
	}
	if got := string(r); got != "AZaz" {
		t.Errorf("appendRange interlaced A-Z a-z = %s, want AZaz", got)
	}
}
// invalidRegexps lists patterns that TestParseInvalidRegexps expects Parse
// to reject under both the Perl and the POSIX flags.
var invalidRegexps = []string{
	`(`,
	`)`,
	`(a`,
	`a)`,
	`(a))`,
	`(a|b|`,
	`a|b|)`,
	`(a|b|))`,
	`(a|b`,
	`a|b)`,
	`(a|b))`,
	`[a-z`,
	`([a-z)`,
	`[a-z)`,
	`([a-z]))`,
	`x{1001}`,
	`x{9876543210}`,
	`x{2,1}`,
	`x{1,9876543210}`,
	"\xff", // Invalid UTF-8
	"[\xff]",
	"[\\\xff]",
	"\\\xff",
	`(?P<name>a`,
	`(?P<name>`,
	`(?P<name`,
	`(?P<x y>a)`,
	`(?P<>a)`,
	`(?<name>a`,
	`(?<name>`,
	`(?<name`,
	`(?<x y>a)`,
	`(?<>a)`,
	`[a-Z]`,
	`(?i)[a-Z]`,
	`\Q\E*`,
	`a{100000}`, // too much repetition
	`a{100000,}`, // too much repetition
	"((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}){2})", // too much repetition
	strings.Repeat("(", 1000) + strings.Repeat(")", 1000), // too deep
	strings.Repeat("(?:", 1000) + strings.Repeat(")*", 1000), // too deep
	"(" + strings.Repeat("(xx?)", 1000) + "){1000}", // too long
	strings.Repeat("(xx?){1000}", 1000), // too long
	strings.Repeat(`\pL`, 27000), // too many runes
}
// onlyPerl lists patterns that TestParseInvalidRegexps expects Parse to
// accept under the Perl flags and to reject under the POSIX flags.
var onlyPerl = []string{
	`[a-b-c]`,
	`\Qabc\E`,
	`\Q*+?{[\E`,
	`\Q\\E`,
	`\Q\\\E`,
	`\Q\\\\E`,
	`\Q\\\\\E`,
	`(?:a)`,
	`(?P<name>a)`,
}
// onlyPOSIX lists patterns that TestParseInvalidRegexps expects Parse to
// accept under the POSIX flags and to reject under the Perl flags.
var onlyPOSIX = []string{
	"a++",
	"a**",
	"a?*",
	"a+*",
	"a{1}*",
	".{1}{2}.{3}",
}
// TestParseInvalidRegexps checks which syntaxes accept which patterns:
// invalidRegexps must fail under both Perl and POSIX flags, onlyPerl must
// parse under Perl only, and onlyPOSIX must parse under POSIX only.
func TestParseInvalidRegexps(t *testing.T) {
	for _, expr := range invalidRegexps {
		re, err := Parse(expr, Perl)
		if err == nil {
			t.Errorf("Parse(%#q, Perl) = %s, should have failed", expr, dump(re))
		}
		re, err = Parse(expr, POSIX)
		if err == nil {
			t.Errorf("Parse(%#q, POSIX) = %s, should have failed", expr, dump(re))
		}
	}

	for _, expr := range onlyPerl {
		_, err := Parse(expr, Perl)
		if err != nil {
			t.Errorf("Parse(%#q, Perl): %v", expr, err)
		}
		re, err := Parse(expr, POSIX)
		if err == nil {
			t.Errorf("Parse(%#q, POSIX) = %s, should have failed", expr, dump(re))
		}
	}

	for _, expr := range onlyPOSIX {
		re, err := Parse(expr, Perl)
		if err == nil {
			t.Errorf("Parse(%#q, Perl) = %s, should have failed", expr, dump(re))
		}
		_, err = Parse(expr, POSIX)
		if err != nil {
			t.Errorf("Parse(%#q, POSIX): %v", expr, err)
		}
	}
}
// TestToStringEquivalentParse checks, for every parseTests entry with a
// non-empty Dump, that the string form of the parse tree is equivalent to the
// original pattern: when String does not give the pattern back verbatim, the
// printed form must parse to a tree with the same dump and must print to
// itself again.
func TestToStringEquivalentParse(t *testing.T) {
	for _, tc := range parseTests {
		re, err := Parse(tc.Regexp, testFlags)
		if err != nil {
			t.Errorf("Parse(%#q): %v", tc.Regexp, err)
			continue
		}
		if tc.Dump == "" {
			// It parsed. That's all we care about.
			continue
		}
		d := dump(re)
		if d != tc.Dump {
			t.Errorf("Parse(%#q).Dump() = %#q want %#q", tc.Regexp, d, tc.Dump)
			continue
		}

		printed := re.String()
		if printed == tc.Regexp {
			continue
		}
		// If ToString didn't return the original regexp,
		// it must have found one with fewer parens.
		// Unfortunately we can't check the length here, because
		// ToString produces "\\{" for a literal brace,
		// but "{" is a shorter equivalent in some contexts.
		reparsed, err := Parse(printed, testFlags)
		if err != nil {
			t.Errorf("Parse(%#q.String() = %#q): %v", tc.Regexp, printed, err)
			continue
		}
		if nd := dump(reparsed); d != nd {
			t.Errorf("Parse(%#q) -> %#q; %#q vs %#q", tc.Regexp, printed, d, nd)
		}
		if again := reparsed.String(); printed != again {
			t.Errorf("Parse(%#q) -> %#q -> %#q", tc.Regexp, printed, again)
		}
	}
}
// stringTests pairs a pattern (re) with the text its parse tree is expected
// to print as (out) when parsed under the Perl flags; see TestString.
var stringTests = []struct {
	re string
	out string
}{
	{`x(?i:ab*c|d?e)1`, `x(?i:AB*C|D?E)1`},
	{`x(?i:ab*cd?e)1`, `x(?i:AB*CD?E)1`},
	{`0(?i:ab*c|d?e)1`, `(?i:0(?:AB*C|D?E)1)`},
	{`0(?i:ab*cd?e)1`, `(?i:0AB*CD?E1)`},
	{`x(?i:ab*c|d?e)`, `x(?i:AB*C|D?E)`},
	{`x(?i:ab*cd?e)`, `x(?i:AB*CD?E)`},
	{`0(?i:ab*c|d?e)`, `(?i:0(?:AB*C|D?E))`},
	{`0(?i:ab*cd?e)`, `(?i:0AB*CD?E)`},
	{`(?i:ab*c|d?e)1`, `(?i:(?:AB*C|D?E)1)`},
	{`(?i:ab*cd?e)1`, `(?i:AB*CD?E1)`},
	{`(?i:ab)[123](?i:cd)`, `(?i:AB[1-3]CD)`},
	{`(?i:ab*c|d?e)`, `(?i:AB*C|D?E)`},
	{`[Aa][Bb]`, `(?i:AB)`},
	{`[Aa][Bb]*[Cc]`, `(?i:AB*C)`},
	{`A(?:[Bb][Cc]|[Dd])[Zz]`, `A(?i:(?:BC|D)Z)`},
	{`[Aa](?:[Bb][Cc]|[Dd])Z`, `(?i:A(?:BC|D))Z`},
}
// TestString parses every stringTests pattern under the Perl flags and
// checks that the parse tree prints as the expected text.
func TestString(t *testing.T) {
	for i := range stringTests {
		tc := &stringTests[i]
		re, err := Parse(tc.re, Perl)
		if err != nil {
			t.Errorf("Parse(%#q): %v", tc.re, err)
			continue
		}
		if got := re.String(); got != tc.out {
			t.Errorf("Parse(%#q).String() = %#q, want %#q", tc.re, got, tc.out)
		}
	}
}

View file

@ -0,0 +1,133 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Code generated by make_perl_groups.pl; DO NOT EDIT.
package syntax
// code1..code3 are the rune tables for the Perl escapes named in the trailing
// comments. Each table is a flat list of pairs; the values read as inclusive
// [lo, hi] ranges (for example 0x30, 0x39 covers the ASCII digits).
var code1 = []rune{ /* \d */
0x30, 0x39,
}
var code2 = []rune{ /* \s */
0x9, 0xa,
0xc, 0xd,
0x20, 0x20,
}
var code3 = []rune{ /* \w */
0x30, 0x39,
0x41, 0x5a,
0x5f, 0x5f,
0x61, 0x7a,
}
// perlGroup maps a Perl escape to its table. The lower-case escape carries +1
// and the upper-case (negated) escape carries -1 with the same table.
// NOTE(review): charGroup is declared elsewhere; the first field is presumably
// the sign used to select the class or its complement — confirm there.
var perlGroup = map[string]charGroup{
`\d`: {+1, code1},
`\D`: {-1, code1},
`\s`: {+1, code2},
`\S`: {-1, code2},
`\w`: {+1, code3},
`\W`: {-1, code3},
}
// code4..code17 are the rune tables for the POSIX bracket classes named in
// the trailing comments. Each table is a flat list of pairs; the values read
// as inclusive [lo, hi] ranges and every table here is ASCII-only.
var code4 = []rune{ /* [:alnum:] */
0x30, 0x39,
0x41, 0x5a,
0x61, 0x7a,
}
var code5 = []rune{ /* [:alpha:] */
0x41, 0x5a,
0x61, 0x7a,
}
var code6 = []rune{ /* [:ascii:] */
0x0, 0x7f,
}
var code7 = []rune{ /* [:blank:] */
0x9, 0x9,
0x20, 0x20,
}
var code8 = []rune{ /* [:cntrl:] */
0x0, 0x1f,
0x7f, 0x7f,
}
var code9 = []rune{ /* [:digit:] */
0x30, 0x39,
}
var code10 = []rune{ /* [:graph:] */
0x21, 0x7e,
}
var code11 = []rune{ /* [:lower:] */
0x61, 0x7a,
}
var code12 = []rune{ /* [:print:] */
0x20, 0x7e,
}
var code13 = []rune{ /* [:punct:] */
0x21, 0x2f,
0x3a, 0x40,
0x5b, 0x60,
0x7b, 0x7e,
}
var code14 = []rune{ /* [:space:] */
0x9, 0xd,
0x20, 0x20,
}
var code15 = []rune{ /* [:upper:] */
0x41, 0x5a,
}
var code16 = []rune{ /* [:word:] */
0x30, 0x39,
0x41, 0x5a,
0x5f, 0x5f,
0x61, 0x7a,
}
var code17 = []rune{ /* [:xdigit:] */
0x30, 0x39,
0x41, 0x46,
0x61, 0x66,
}
// posixGroup maps a POSIX bracket class name to its table. Every class is
// listed twice: the plain name carries +1 and the `^` (negated) name carries
// -1 with the same table.
// NOTE(review): charGroup is declared elsewhere; the first field is presumably
// the sign used to select the class or its complement — confirm there.
var posixGroup = map[string]charGroup{
`[:alnum:]`: {+1, code4},
`[:^alnum:]`: {-1, code4},
`[:alpha:]`: {+1, code5},
`[:^alpha:]`: {-1, code5},
`[:ascii:]`: {+1, code6},
`[:^ascii:]`: {-1, code6},
`[:blank:]`: {+1, code7},
`[:^blank:]`: {-1, code7},
`[:cntrl:]`: {+1, code8},
`[:^cntrl:]`: {-1, code8},
`[:digit:]`: {+1, code9},
`[:^digit:]`: {-1, code9},
`[:graph:]`: {+1, code10},
`[:^graph:]`: {-1, code10},
`[:lower:]`: {+1, code11},
`[:^lower:]`: {-1, code11},
`[:print:]`: {+1, code12},
`[:^print:]`: {-1, code12},
`[:punct:]`: {+1, code13},
`[:^punct:]`: {-1, code13},
`[:space:]`: {+1, code14},
`[:^space:]`: {-1, code14},
`[:upper:]`: {+1, code15},
`[:^upper:]`: {-1, code15},
`[:word:]`: {+1, code16},
`[:^word:]`: {-1, code16},
`[:xdigit:]`: {+1, code17},
`[:^xdigit:]`: {-1, code17},
}

View file

@ -0,0 +1,349 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package syntax
import (
"strconv"
"strings"
"unicode"
"unicode/utf8"
)
// Compiled program.
// May not belong in this package, but convenient for now.
// A Prog is a compiled regular expression program.
// Instructions refer to one another by index into Inst: following an
// instruction's Out link means reading Inst[Out].
type Prog struct {
Inst []Inst // the instructions of the program
Start int // index of start instruction
NumCap int // number of InstCapture insts in re
}
// InstOp identifies the operation performed by a single instruction.
type InstOp uint8

// The opcodes are numbered consecutively from zero; instOpNames is indexed
// by these values.
const (
	InstAlt InstOp = iota
	InstAltMatch
	InstCapture
	InstEmptyWidth
	InstMatch
	InstFail
	InstNop
	InstRune
	InstRune1
	InstRuneAny
	InstRuneAnyNotNL
)

// instOpNames holds the printable name of every opcode, keyed by opcode.
var instOpNames = []string{
	InstAlt:          "InstAlt",
	InstAltMatch:     "InstAltMatch",
	InstCapture:      "InstCapture",
	InstEmptyWidth:   "InstEmptyWidth",
	InstMatch:        "InstMatch",
	InstFail:         "InstFail",
	InstNop:          "InstNop",
	InstRune:         "InstRune",
	InstRune1:        "InstRune1",
	InstRuneAny:      "InstRuneAny",
	InstRuneAnyNotNL: "InstRuneAnyNotNL",
}

// String returns the name of the opcode, or the empty string when i is not
// one of the declared opcodes.
func (i InstOp) String() string {
	if int(i) < len(instOpNames) {
		return instOpNames[i]
	}
	return ""
}
// An EmptyOp specifies a kind or mixture of zero-width assertions.
// Each constant below is a distinct single bit (1 << iota), so several
// assertions can be combined in one value with bitwise OR.
type EmptyOp uint8
const (
EmptyBeginLine EmptyOp = 1 << iota // previous rune is '\n' or the position is the start of the text
EmptyEndLine // next rune is '\n' or the position is the end of the text
EmptyBeginText // position is the start of the text
EmptyEndText // position is the end of the text
EmptyWordBoundary // exactly one of the neighbouring runes is a word character
EmptyNoWordBoundary // both or neither of the neighbouring runes are word characters
)
// EmptyOpContext computes the set of zero-width assertions that hold at the
// position between r1 (the rune before) and r2 (the rune after).
// A value of -1 for r1 means the position is the beginning of the text;
// a value of -1 for r2 means the position is the end of the text.
func EmptyOpContext(r1, r2 rune) EmptyOp {
	// Assume "not a word boundary" and correct it at the end.
	result := EmptyNoWordBoundary

	wordBefore := IsWordChar(r1)
	switch {
	case wordBefore:
		// A word character is neither a newline nor the text edge.
	case r1 == '\n':
		result |= EmptyBeginLine
	case r1 < 0:
		result |= EmptyBeginText | EmptyBeginLine
	}

	wordAfter := IsWordChar(r2)
	switch {
	case wordAfter:
		// A word character is neither a newline nor the text edge.
	case r2 == '\n':
		result |= EmptyEndLine
	case r2 < 0:
		result |= EmptyEndText | EmptyEndLine
	}

	if wordBefore != wordAfter {
		// Exactly one side is a word character: swap the boundary bits.
		result = result&^EmptyNoWordBoundary | EmptyWordBoundary
	}
	return result
}
// IsWordChar reports whether r counts as a “word character” when the
// \b and \B zero-width assertions are evaluated.
// Only ASCII is considered: the word characters are [A-Za-z0-9_].
func IsWordChar(r rune) bool {
	switch {
	case 'a' <= r && r <= 'z':
		// Lowercase first: it is the most common case in typical input.
		return true
	case 'A' <= r && r <= 'Z':
		return true
	case '0' <= r && r <= '9':
		return true
	}
	return r == '_'
}
// An Inst is a single instruction in a regular expression program.
// How Out, Arg and Rune are interpreted depends on Op.
type Inst struct {
Op InstOp
// Out is the index (into Prog.Inst) of the next instruction.
Out uint32 // all but InstMatch, InstFail
// Arg is opcode-specific. For the opcodes listed it is an operand of the
// instruction; for InstRune it is additionally read as Flags to test FoldCase.
Arg uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth
// Rune holds what a rune instruction matches: a single rune for a literal,
// otherwise consecutive [lo, hi] pairs in ascending order (see MatchRunePos).
Rune []rune
}
// String returns the multi-line textual dump of the program produced by
// dumpProg.
func (p *Prog) String() string {
	out := new(strings.Builder)
	dumpProg(out, p)
	return out.String()
}
// skipNop starts at pc and follows Out links past every no-op and capture
// instruction, returning the first instruction of any other kind.
func (p *Prog) skipNop(pc uint32) *Inst {
	for {
		inst := &p.Inst[pc]
		if inst.Op != InstNop && inst.Op != InstCapture {
			return inst
		}
		pc = inst.Out
	}
}
// op returns i.Op, except that the specialised rune opcodes (InstRune1,
// InstRuneAny, InstRuneAnyNotNL) are all reported as InstRune.
func (i *Inst) op() InstOp {
	if i.Op == InstRune1 || i.Op == InstRuneAny || i.Op == InstRuneAnyNotNL {
		return InstRune
	}
	return i.Op
}
// Prefix returns the literal string every match of the program must begin
// with. complete reports whether that prefix is the whole match, i.e. the
// instruction reached after the prefix is InstMatch.
func (p *Prog) Prefix() (prefix string, complete bool) {
	inst := p.skipNop(uint32(p.Start))

	// No literal first rune: return early without allocating a buffer.
	if inst.op() != InstRune || len(inst.Rune) != 1 {
		return "", inst.Op == InstMatch
	}

	// Collect consecutive case-sensitive single-rune literals.
	var lit strings.Builder
	for {
		if inst.op() != InstRune || len(inst.Rune) != 1 {
			break
		}
		if Flags(inst.Arg)&FoldCase != 0 {
			break
		}
		r := inst.Rune[0]
		if r == utf8.RuneError {
			break
		}
		lit.WriteRune(r)
		inst = p.skipNop(inst.Out)
	}
	return lit.String(), inst.Op == InstMatch
}
// StartCond returns the union of the empty-width conditions found at the
// very start of the program, which every match must therefore satisfy.
// It returns ^EmptyOp(0) when the leading instructions reach InstFail,
// meaning no match is possible.
func (p *Prog) StartCond() EmptyOp {
	var cond EmptyOp
	for inst := &p.Inst[uint32(p.Start)]; ; inst = &p.Inst[inst.Out] {
		switch inst.Op {
		case InstEmptyWidth:
			cond |= EmptyOp(inst.Arg)
		case InstFail:
			return ^EmptyOp(0)
		case InstCapture, InstNop:
			// Transparent instructions: keep walking.
		default:
			// First instruction that consumes input or branches.
			return cond
		}
	}
}
// noMatch is the position reported by MatchRunePos when nothing matches.
const noMatch = -1

// MatchRune reports whether the instruction matches (and consumes) r.
// It should only be called when i.Op == [InstRune].
func (i *Inst) MatchRune(r rune) bool {
	pos := i.MatchRunePos(r)
	return pos != noMatch
}
// MatchRunePos checks whether the instruction matches (and consumes) r.
// On a match it returns the index of the [lo, hi] pair that contains r
// (or 0 when the instruction holds a single literal rune).
// When there is no match it returns -1.
// MatchRunePos should only be called when i.Op == [InstRune].
func (i *Inst) MatchRunePos(r rune) int {
	ranges := i.Rune
	n := len(ranges)

	switch n {
	case 0:
		return noMatch

	case 1:
		// A one-element slice is a literal rune, not a character class.
		lit := ranges[0]
		if r == lit {
			return 0
		}
		if Flags(i.Arg)&FoldCase != 0 {
			// Walk the case-folding orbit of the literal.
			for alt := unicode.SimpleFold(lit); alt != lit; alt = unicode.SimpleFold(alt) {
				if r == alt {
					return 0
				}
			}
		}
		return noMatch

	case 2:
		if ranges[0] <= r && r <= ranges[1] {
			return 0
		}
		return noMatch

	case 4, 6, 8:
		// A handful of pairs: a linear scan is enough and suits ASCII classes.
		for pair := 0; 2*pair < n; pair++ {
			if r < ranges[2*pair] {
				return noMatch
			}
			if r <= ranges[2*pair+1] {
				return pair
			}
		}
		return noMatch
	}

	// Anything else: binary search over the pairs.
	low, high := 0, n/2
	for low < high {
		mid := int(uint(low+high) >> 1)
		switch {
		case r < ranges[2*mid]:
			high = mid
		case r <= ranges[2*mid+1]:
			return mid
		default:
			low = mid + 1
		}
	}
	return noMatch
}
// MatchEmptyWidth reports whether the instruction's single zero-width
// assertion holds between the rune before and the rune after; -1 stands for
// the edge of the text on either side.
// It should only be called when i.Op == [InstEmptyWidth].
// It panics when i.Arg is not exactly one of the EmptyOp constants.
func (i *Inst) MatchEmptyWidth(before rune, after rune) bool {
	var holds bool
	switch EmptyOp(i.Arg) {
	case EmptyBeginText:
		holds = before == -1
	case EmptyEndText:
		holds = after == -1
	case EmptyBeginLine:
		holds = before == '\n' || before == -1
	case EmptyEndLine:
		holds = after == '\n' || after == -1
	case EmptyWordBoundary:
		holds = IsWordChar(before) != IsWordChar(after)
	case EmptyNoWordBoundary:
		holds = IsWordChar(before) == IsWordChar(after)
	default:
		panic("unknown empty width arg")
	}
	return holds
}
// String returns the one-line textual form of the instruction produced by
// dumpInst.
func (i *Inst) String() string {
	out := new(strings.Builder)
	dumpInst(out, i)
	return out.String()
}
// bw appends every string in args to b, in order.
func bw(b *strings.Builder, args ...string) {
	for idx := 0; idx < len(args); idx++ {
		b.WriteString(args[idx])
	}
}
// dumpProg writes a listing of p to b, one instruction per line.
// Each line holds the instruction index right-aligned in a three-column
// field, a "*" after the index of the start instruction, a tab, and the
// instruction text produced by dumpInst.
func dumpProg(b *strings.Builder, p *Prog) {
	for j := range p.Inst {
		pc := strconv.Itoa(j)
		// Pad explicitly up to width 3. Slicing a fixed padding string by
		// len(pc) panics as soon as the index is wider than the string.
		for width := len(pc); width < 3; width++ {
			b.WriteByte(' ')
		}
		if j == p.Start {
			pc += "*"
		}
		bw(b, pc, "\t")
		dumpInst(b, &p.Inst[j])
		bw(b, "\n")
	}
}
// u32 formats i as a base-10 string.
func u32(i uint32) string {
	const decimal = 10
	wide := uint64(i)
	return strconv.FormatUint(wide, decimal)
}
// dumpInst writes a one-line, human-readable rendering of i to b.
// The text starts with a short mnemonic for the opcode and, where the opcode
// has them, shows the operands and the index of the following instruction.
// Nothing is written for an opcode outside the declared set.
func dumpInst(b *strings.Builder, i *Inst) {
	switch i.Op {
	case InstAlt:
		bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg))
	case InstAltMatch:
		bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg))
	case InstCapture:
		bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out))
	case InstEmptyWidth:
		bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out))
	case InstMatch:
		bw(b, "match")
	case InstFail:
		bw(b, "fail")
	case InstNop:
		bw(b, "nop -> ", u32(i.Out))
	case InstRune:
		if i.Rune == nil {
			// shouldn't happen; print the marker instead of (not in
			// addition to) the quoted rune list.
			bw(b, "rune <nil>")
		} else {
			bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune)))
		}
		if Flags(i.Arg)&FoldCase != 0 {
			bw(b, "/i")
		}
		bw(b, " -> ", u32(i.Out))
	case InstRune1:
		bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out))
	case InstRuneAny:
		bw(b, "any -> ", u32(i.Out))
	case InstRuneAnyNotNL:
		bw(b, "anynotnl -> ", u32(i.Out))
	}
}

View file

@ -0,0 +1,144 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package syntax
import "testing"
// compileTests pairs a regexp source (Regexp) with the exact program listing
// (Prog) expected from compiling it. TestCompile parses each Regexp with the
// Perl flags, compiles it and compares Prog.String() with the listing.
// In each listing the line marked with "*" is the start instruction.
var compileTests = []struct {
Regexp string
Prog string
}{
{"a", ` 0 fail
1* rune1 "a" -> 2
2 match
`},
{"[A-M][n-z]", ` 0 fail
1* rune "AM" -> 2
2 rune "nz" -> 3
3 match
`},
{"", ` 0 fail
1* nop -> 2
2 match
`},
{"a?", ` 0 fail
1 rune1 "a" -> 3
2* alt -> 1, 3
3 match
`},
{"a??", ` 0 fail
1 rune1 "a" -> 3
2* alt -> 3, 1
3 match
`},
{"a+", ` 0 fail
1* rune1 "a" -> 2
2 alt -> 1, 3
3 match
`},
{"a+?", ` 0 fail
1* rune1 "a" -> 2
2 alt -> 3, 1
3 match
`},
{"a*", ` 0 fail
1 rune1 "a" -> 2
2* alt -> 1, 3
3 match
`},
{"a*?", ` 0 fail
1 rune1 "a" -> 2
2* alt -> 3, 1
3 match
`},
{"a+b+", ` 0 fail
1* rune1 "a" -> 2
2 alt -> 1, 3
3 rune1 "b" -> 4
4 alt -> 3, 5
5 match
`},
{"(a+)(b+)", ` 0 fail
1* cap 2 -> 2
2 rune1 "a" -> 3
3 alt -> 2, 4
4 cap 3 -> 5
5 cap 4 -> 6
6 rune1 "b" -> 7
7 alt -> 6, 8
8 cap 5 -> 9
9 match
`},
{"a+|b+", ` 0 fail
1 rune1 "a" -> 2
2 alt -> 1, 6
3 rune1 "b" -> 4
4 alt -> 3, 6
5* alt -> 1, 3
6 match
`},
{"A[Aa]", ` 0 fail
1* rune1 "A" -> 2
2 rune "A"/i -> 3
3 match
`},
{"(?:(?:^).)", ` 0 fail
1* empty 4 -> 2
2 anynotnl -> 3
3 match
`},
{"(?:|a)+", ` 0 fail
1 nop -> 4
2 rune1 "a" -> 4
3* alt -> 1, 2
4 alt -> 3, 5
5 match
`},
{"(?:|a)*", ` 0 fail
1 nop -> 4
2 rune1 "a" -> 4
3 alt -> 1, 2
4 alt -> 3, 6
5* alt -> 3, 6
6 match
`},
}
// TestCompile parses and compiles every compileTests pattern and compares
// the textual dump of the resulting program with the expected listing.
// Parse and Compile failures are reported as test errors for the pattern
// instead of being discarded, so a failing step cannot surface later as a
// nil dereference.
func TestCompile(t *testing.T) {
	for _, tt := range compileTests {
		re, perr := Parse(tt.Regexp, Perl)
		if perr != nil {
			t.Errorf("Parse(%#q): %v", tt.Regexp, perr)
			continue
		}
		p, cerr := Compile(re)
		if cerr != nil {
			t.Errorf("Compile(%#q): %v", tt.Regexp, cerr)
			continue
		}
		s := p.String()
		if s != tt.Prog {
			t.Errorf("compiled %#q:\n--- have\n%s---\n--- want\n%s---", tt.Regexp, s, tt.Prog)
		}
	}
}
// BenchmarkEmptyOpContext calls EmptyOpContext for every position of a short
// two-line text, including the start-of-text and end-of-text positions.
func BenchmarkEmptyOpContext(b *testing.B) {
	const input = "foo, bar, baz\nsome input text.\n"
	for n := b.N; n > 0; n-- {
		prev := rune(-1)
		for _, cur := range input {
			EmptyOpContext(prev, cur)
			prev = cur
		}
		EmptyOpContext(prev, -1)
	}
}
// sink receives benchmark results so that the benchmarked calls have an
// observable effect.
var sink any

// BenchmarkIsWordChar calls IsWordChar on every rune of a fixed sentence.
func BenchmarkIsWordChar(b *testing.B) {
	const chars = "Don't communicate by sharing memory, share memory by communicating."
	for n := b.N; n > 0; n-- {
		for _, r := range chars {
			sink = IsWordChar(r)
		}
	}
	if sink == nil {
		b.Fatal("Benchmark did not run")
	}
	sink = nil
}

View file

@ -0,0 +1,511 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package syntax
// Note to implementers:
// In this package, re is always a *Regexp and r is always a rune.
import (
"slices"
"strconv"
"strings"
"unicode"
)
// A Regexp is a node in a regular expression syntax tree.
// Which of the fields below are meaningful depends on Op, as noted per field.
type Regexp struct {
Op Op // operator
Flags Flags // flags recorded on this node (the code here tests FoldCase, NonGreedy and WasDollar)
Sub []*Regexp // subexpressions, if any
Sub0 [1]*Regexp // storage for short Sub
Rune []rune // matched runes, for OpLiteral, OpCharClass
Rune0 [2]rune // storage for short Rune
Min, Max int // min, max for OpRepeat
Cap int // capturing index, for OpCapture
Name string // capturing name, for OpCapture
}
//go:generate stringer -type Op -trimprefix Op
// An Op is a single regular expression operator.
type Op uint8
// Operators are listed in precedence order, tightest binding to weakest.
// Character class operators are listed simplest to most complex
// (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar).
// The values start at 1 (1 + iota), so the zero Op is not one of the
// operators below. The printer relies on this ordering: it compares
// operators with ">" to decide where grouping parentheses are needed.
const (
OpNoMatch Op = 1 + iota // matches no strings
OpEmptyMatch // matches empty string
OpLiteral // matches Runes sequence
OpCharClass // matches Runes interpreted as range pair list
OpAnyCharNotNL // matches any character except newline
OpAnyChar // matches any character
OpBeginLine // matches empty string at beginning of line
OpEndLine // matches empty string at end of line
OpBeginText // matches empty string at beginning of text
OpEndText // matches empty string at end of text
OpWordBoundary // matches word boundary `\b`
OpNoWordBoundary // matches word non-boundary `\B`
OpCapture // capturing subexpression with index Cap, optional name Name
OpStar // matches Sub[0] zero or more times
OpPlus // matches Sub[0] one or more times
OpQuest // matches Sub[0] zero or one times
OpRepeat // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit)
OpConcat // matches concatenation of Subs
OpAlternate // matches alternation of Subs
)
const opPseudo Op = 128 // where pseudo-ops start
// Equal reports whether x and y have identical structure.
// Two nil nodes are equal; a nil node never equals a non-nil one.
func (x *Regexp) Equal(y *Regexp) bool {
	switch {
	case x == nil || y == nil:
		return x == y
	case x.Op != y.Op:
		return false
	}

	sameGreed := x.Flags&NonGreedy == y.Flags&NonGreedy

	switch x.Op {
	case OpEndText:
		// The parse flags remember whether this is \z or \Z.
		return x.Flags&WasDollar == y.Flags&WasDollar

	case OpLiteral, OpCharClass:
		return slices.Equal(x.Rune, y.Rune)

	case OpAlternate, OpConcat:
		return slices.EqualFunc(x.Sub, y.Sub, (*Regexp).Equal)

	case OpStar, OpPlus, OpQuest:
		return sameGreed && x.Sub[0].Equal(y.Sub[0])

	case OpRepeat:
		return sameGreed && x.Min == y.Min && x.Max == y.Max && x.Sub[0].Equal(y.Sub[0])

	case OpCapture:
		return x.Cap == y.Cap && x.Name == y.Name && x.Sub[0].Equal(y.Sub[0])
	}

	// Every other operator has no operands to compare.
	return true
}
// printFlags is a bit set indicating which flags (including non-capturing parens) to print around a regexp.
// The low bits request an opening "(?i", "(?m" or "(?s" group, flagOff
// requests the closing ")" of such a group, and flagPrec requests a plain
// "(?: )" wrapper for precedence. Shifting flagM or flagS left by negShift
// gives the bit that prints the flag in negated form after a "-".
// The printer only emits negated m and s; String masks flagI out before
// shifting, so "(?-i:" is never printed by this code.
type printFlags uint8
const (
flagI printFlags = 1 << iota // (?i:
flagM // (?m:
flagS // (?s:
flagOff // )
flagPrec // (?: )
negShift = 5 // flagI<<negShift is (?-i:
)
// addSpan records that the flags f must be printed around the run of
// subexpressions from start through last: (*flags)[start] is set to f and
// flagOff is OR-ed into (*flags)[last]. The map is created on first use.
func addSpan(start, last *Regexp, f printFlags, flags *map[*Regexp]printFlags) {
	m := *flags
	if m == nil {
		m = make(map[*Regexp]printFlags)
		*flags = m
	}
	m[start] = f
	// OR rather than assign: start and last may be the same node.
	m[last] |= flagOff
}
// calcFlags calculates the flags to print around each subexpression in re,
// storing that information in (*flags)[sub] for each affected subexpression.
// The first time an entry needs to be written to *flags, calcFlags allocates the map.
// calcFlags also calculates the flags that must be active or can't be active
// around re and returns those flags.
//
// must is the set of flags that has to be switched on for re to print
// correctly; cant is the set that has to be off. Leaf operators report their
// own requirement, unary operators forward the requirement of Sub[0], and
// OpConcat/OpAlternate merge their children, opening a flagged span (via
// addSpan) whenever two children disagree.
func calcFlags(re *Regexp, flags *map[*Regexp]printFlags) (must, cant printFlags) {
switch re.Op {
default:
return 0, 0
case OpLiteral:
// If literal is fold-sensitive, return (flagI, 0) or (0, flagI)
// according to whether (?i) is active.
// If literal is not fold-sensitive, return 0, 0.
for _, r := range re.Rune {
if minFold <= r && r <= maxFold && unicode.SimpleFold(r) != r {
if re.Flags&FoldCase != 0 {
return flagI, 0
} else {
return 0, flagI
}
}
}
return 0, 0
case OpCharClass:
// If literal is fold-sensitive, return 0, flagI - (?i) has been compiled out.
// If literal is not fold-sensitive, return 0, 0.
return calcFlagsI(re)
case OpAnyCharNotNL: // (?-s).
return 0, flagS
case OpAnyChar: // (?s).
return flagS, 0
case OpBeginLine, OpEndLine: // (?m)^ (?m)$
return flagM, 0
case OpEndText:
if re.Flags&WasDollar != 0 { // (?-m)$
return 0, flagM
}
return 0, 0
case OpCapture, OpStar, OpPlus, OpQuest, OpRepeat:
return calcFlags(re.Sub[0], flags)
case OpConcat, OpAlternate:
// Gather the must and cant for each subexpression.
// When we find a conflicting subexpression, insert the necessary
// flags around the previously identified span and start over.
// start and last are the indexes of the first and last child of the span
// being accumulated; did records whether any span had to be closed early.
var must, cant, allCant printFlags
start := 0
last := 0
did := false
for i, sub := range re.Sub {
subMust, subCant := calcFlags(sub, flags)
if must&subCant != 0 || subMust&cant != 0 {
// This child conflicts with the span so far: close that span and
// begin a new one at this child.
if must != 0 {
addSpan(re.Sub[start], re.Sub[last], must, flags)
}
must = 0
cant = 0
start = i
did = true
}
must |= subMust
cant |= subCant
allCant |= subCant
if subMust != 0 {
// Only children that need a flag extend the end of the span.
last = i
}
if must == 0 && start == i {
// Nothing in the span needs a flag yet, so it can begin at the next child.
start++
}
}
if !did {
// No conflicts: pass the accumulated must and cant upward.
return must, cant
}
if must != 0 {
// Conflicts found; need to finish final span.
addSpan(re.Sub[start], re.Sub[last], must, flags)
}
return 0, allCant
}
}
// calcFlagsI computes the printing requirement for the character class re
// with respect to the (?i) flag.
//
// It returns (0, flagI) when the class is fold-sensitive, meaning some rune in
// the class has a case-folding variant that is not in the class, so the class
// must not be printed while (?i) is active. Otherwise it returns (0, 0).
//
// Only runes in [minFold, maxFold] can have folding variants. The property
// "some folding orbit has a member inside the class and a member outside it"
// can be detected either by walking the runes inside the class and looking
// for a variant outside, or by walking the runes outside the class and
// looking for a variant inside. Both scans give the same answer, so the
// smaller of the two sets is walked.
//
// re.Rune is expected to hold [lo, hi] pairs in ascending order, which
// inCharClass requires as well.
func calcFlagsI(re *Regexp) (must, cant printFlags) {
	// Count the runes of the fold window that belong to the class.
	// Ranges that lie wholly outside the window clamp to lo > hi and
	// contribute nothing.
	inside := 0
	for i := 0; i+1 < len(re.Rune); i += 2 {
		lo := max(minFold, re.Rune[i])
		hi := min(maxFold, re.Rune[i+1])
		if lo <= hi {
			inside += int(hi-lo) + 1
		}
	}
	outside := int(maxFold-minFold) + 1 - inside

	if inside <= outside {
		// Walk the class: fold-sensitive if a member folds to a non-member.
		for i := 0; i+1 < len(re.Rune); i += 2 {
			lo := max(minFold, re.Rune[i])
			hi := min(maxFold, re.Rune[i+1])
			for r := lo; r <= hi; r++ {
				for f := unicode.SimpleFold(r); f != r; f = unicode.SimpleFold(f) {
					if !(lo <= f && f <= hi) && !inCharClass(f, re.Rune) {
						return 0, flagI
					}
				}
			}
		}
		return 0, 0
	}

	// Walk the complement: fold-sensitive if a non-member folds to a member.
	// next is the first rune of the fold window not yet covered by a range.
	next := rune(minFold)
	for i := 0; i+1 < len(re.Rune); i += 2 {
		lo := max(minFold, re.Rune[i])
		hi := min(maxFold, re.Rune[i+1])
		if lo > hi {
			continue
		}
		for r := next; r < lo; r++ {
			for f := unicode.SimpleFold(r); f != r; f = unicode.SimpleFold(f) {
				if inCharClass(f, re.Rune) {
					return 0, flagI
				}
			}
		}
		// Resume just past this range; hi itself is a member of the class.
		next = max(next, hi+1)
	}
	for r := next; r <= maxFold; r++ {
		for f := unicode.SimpleFold(r); f != r; f = unicode.SimpleFold(f) {
			if inCharClass(f, re.Rune) {
				return 0, flagI
			}
		}
	}
	return 0, 0
}
// writeRegexp writes the Perl syntax for the regular expression re to b.
//
// f holds the print flags requested by the caller for this node; the flags
// recorded for re in the flags map are OR-ed into it. Depending on the
// resulting bits, the node is preceded by a "(?flags:" opener, followed by a
// closing ")", and/or wrapped in "(?:" ... ")". Children are written by
// recursive calls.
func writeRegexp(b *strings.Builder, re *Regexp, f printFlags, flags map[*Regexp]printFlags) {
f |= flags[re]
if f&flagPrec != 0 && f&^(flagOff|flagPrec) != 0 && f&flagOff != 0 {
// flagPrec is redundant with other flags being added and terminated
f &^= flagPrec
}
if f&^(flagOff|flagPrec) != 0 {
// At least one flag bit is set: open a flag group.
b.WriteString(`(?`)
if f&flagI != 0 {
b.WriteString(`i`)
}
if f&flagM != 0 {
b.WriteString(`m`)
}
if f&flagS != 0 {
b.WriteString(`s`)
}
if f&((flagM|flagS)<<negShift) != 0 {
// Negated flags follow a single "-".
b.WriteString(`-`)
if f&(flagM<<negShift) != 0 {
b.WriteString(`m`)
}
if f&(flagS<<negShift) != 0 {
b.WriteString(`s`)
}
}
b.WriteString(`:`)
}
// The closing parentheses are deferred so they are written after the node
// itself. Deferred calls run last-in first-out, so the ")" of the "(?:"
// wrapper below is written before the ")" that ends the flag group.
if f&flagOff != 0 {
defer b.WriteString(`)`)
}
if f&flagPrec != 0 {
b.WriteString(`(?:`)
defer b.WriteString(`)`)
}
switch re.Op {
default:
b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
case OpNoMatch:
b.WriteString(`[^\x00-\x{10FFFF}]`)
case OpEmptyMatch:
b.WriteString(`(?:)`)
case OpLiteral:
for _, r := range re.Rune {
escape(b, r, false)
}
case OpCharClass:
if len(re.Rune)%2 != 0 {
b.WriteString(`[invalid char class]`)
break
}
b.WriteRune('[')
if len(re.Rune) == 0 {
b.WriteString(`^\x00-\x{10FFFF}`)
} else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune && len(re.Rune) > 2 {
// Contains 0 and MaxRune. Probably a negated class.
// Print the gaps.
b.WriteRune('^')
for i := 1; i < len(re.Rune)-1; i += 2 {
// The gap runs from just after one range's end to just before the next range's start.
lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
escape(b, lo, lo == '-')
if lo != hi {
// Adjacent runes are written side by side; "-" is used only for longer ranges.
if hi != lo+1 {
b.WriteRune('-')
}
escape(b, hi, hi == '-')
}
}
} else {
for i := 0; i < len(re.Rune); i += 2 {
lo, hi := re.Rune[i], re.Rune[i+1]
escape(b, lo, lo == '-')
if lo != hi {
// Adjacent runes are written side by side; "-" is used only for longer ranges.
if hi != lo+1 {
b.WriteRune('-')
}
escape(b, hi, hi == '-')
}
}
}
b.WriteRune(']')
case OpAnyCharNotNL, OpAnyChar:
b.WriteString(`.`)
case OpBeginLine:
b.WriteString(`^`)
case OpEndLine:
b.WriteString(`$`)
case OpBeginText:
b.WriteString(`\A`)
case OpEndText:
if re.Flags&WasDollar != 0 {
b.WriteString(`$`)
} else {
b.WriteString(`\z`)
}
case OpWordBoundary:
b.WriteString(`\b`)
case OpNoWordBoundary:
b.WriteString(`\B`)
case OpCapture:
if re.Name != "" {
b.WriteString(`(?P<`)
b.WriteString(re.Name)
b.WriteRune('>')
} else {
b.WriteRune('(')
}
// An empty capture prints as "()" rather than "((?:))".
if re.Sub[0].Op != OpEmptyMatch {
writeRegexp(b, re.Sub[0], flags[re.Sub[0]], flags)
}
b.WriteRune(')')
case OpStar, OpPlus, OpQuest, OpRepeat:
p := printFlags(0)
sub := re.Sub[0]
// Operands that bind more loosely than the repetition, and literals of
// more than one rune, need a "(?: )" wrapper.
if sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
p = flagPrec
}
writeRegexp(b, sub, p, flags)
switch re.Op {
case OpStar:
b.WriteRune('*')
case OpPlus:
b.WriteRune('+')
case OpQuest:
b.WriteRune('?')
case OpRepeat:
b.WriteRune('{')
b.WriteString(strconv.Itoa(re.Min))
if re.Max != re.Min {
b.WriteRune(',')
// A negative Max prints as an open-ended "{min,}".
if re.Max >= 0 {
b.WriteString(strconv.Itoa(re.Max))
}
}
b.WriteRune('}')
}
if re.Flags&NonGreedy != 0 {
b.WriteRune('?')
}
case OpConcat:
for _, sub := range re.Sub {
p := printFlags(0)
// An alternation inside a concatenation must be grouped.
if sub.Op == OpAlternate {
p = flagPrec
}
writeRegexp(b, sub, p, flags)
}
case OpAlternate:
for i, sub := range re.Sub {
if i > 0 {
b.WriteRune('|')
}
writeRegexp(b, sub, 0, flags)
}
}
}
// String returns the regular expression re in Perl syntax.
// Flags that must be active for the whole expression are printed as an
// enclosing "(?flags:...)" group; flags that must be inactive (other than i)
// are printed in negated form in the same group.
func (re *Regexp) String() string {
	var perNode map[*Regexp]printFlags
	must, cant := calcFlags(re, &perNode)

	top := must | ((cant &^ flagI) << negShift)
	if top != 0 {
		// An opened flag group has to be closed again.
		top |= flagOff
	}

	var out strings.Builder
	writeRegexp(&out, re, top, perNode)
	return out.String()
}
// meta lists the characters that are always backslash-escaped when printed.
const meta = `\.+*?()|[]{}^$`

// escape writes r to b in a form that reads back as the literal rune r.
// Printable runes are written as themselves, preceded by a backslash when
// they appear in meta or when force is set. The control characters \a, \f,
// \n, \r, \t and \v use their usual escapes. Any other rune is written in
// hexadecimal: \xNN (two digits) below 0x100, \x{N...} otherwise.
func escape(b *strings.Builder, r rune, force bool) {
	if unicode.IsPrint(r) {
		if force || strings.ContainsRune(meta, r) {
			b.WriteByte('\\')
		}
		b.WriteRune(r)
		return
	}

	var named string
	switch r {
	case '\a':
		named = `\a`
	case '\f':
		named = `\f`
	case '\n':
		named = `\n`
	case '\r':
		named = `\r`
	case '\t':
		named = `\t`
	case '\v':
		named = `\v`
	}
	if named != "" {
		b.WriteString(named)
		return
	}

	hex := strconv.FormatInt(int64(r), 16)
	switch {
	case r >= 0x100:
		b.WriteString(`\x{` + hex + `}`)
	case len(hex) == 1:
		b.WriteString(`\x0` + hex)
	default:
		b.WriteString(`\x` + hex)
	}
}
// MaxCap walks the regexp and returns the largest capture index found in it,
// or 0 when the tree contains no capture with a positive index.
func (re *Regexp) MaxCap() int {
	highest := 0
	if re.Op == OpCapture {
		highest = re.Cap
	}
	for idx := range re.Sub {
		highest = max(highest, re.Sub[idx].MaxCap())
	}
	return highest
}
// CapNames walks the regexp and returns the names of its capturing groups,
// indexed by capture index. The slice has MaxCap()+1 elements; entries for
// indexes without a capture, or whose capture has no name, are empty.
func (re *Regexp) CapNames() []string {
	count := re.MaxCap() + 1
	names := make([]string, count)
	re.capNames(names)
	return names
}
// capNames stores the name of every capture in the tree rooted at re into
// names, at the position given by the capture's index. names must be long
// enough to hold the largest capture index in the tree.
func (re *Regexp) capNames(names []string) {
	if re.Op == OpCapture {
		names[re.Cap] = re.Name
	}
	for idx := 0; idx < len(re.Sub); idx++ {
		re.Sub[idx].capNames(names)
	}
}

View file

@ -0,0 +1,151 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package syntax
// Simplify returns a regexp equivalent to re but without counted repetitions
// and with various other simplifications, such as rewriting /(?:a+)+/ to /a+/.
// The resulting regexp will execute correctly but its string representation
// will not produce the same parse tree, because capturing parentheses
// may have been duplicated or removed. For example, the simplified form
// for /(x){1,2}/ is /(x)(x)?/ but both parentheses capture as $1.
// The returned regexp may share structure with or be the original.
// A nil receiver yields nil. Operators not handled below are returned unchanged.
func (re *Regexp) Simplify() *Regexp {
if re == nil {
return nil
}
switch re.Op {
case OpCapture, OpConcat, OpAlternate:
// Simplify children, building new Regexp if children change.
// nre stays equal to re until the first child that simplifies to a
// different node; only then is a copy made.
nre := re
for i, sub := range re.Sub {
nsub := sub.Simplify()
if nre == re && nsub != sub {
// Start a copy.
nre = new(Regexp)
*nre = *re
nre.Rune = nil
// Carry over the children seen so far, which were all unchanged.
nre.Sub = append(nre.Sub0[:0], re.Sub[:i]...)
}
if nre != re {
nre.Sub = append(nre.Sub, nsub)
}
}
return nre
case OpStar, OpPlus, OpQuest:
sub := re.Sub[0].Simplify()
return simplify1(re.Op, re.Flags, sub, re)
case OpRepeat:
// Special special case: x{0} matches the empty string
// and doesn't even need to consider x.
if re.Min == 0 && re.Max == 0 {
return &Regexp{Op: OpEmptyMatch}
}
// The fun begins.
sub := re.Sub[0].Simplify()
// x{n,} means at least n matches of x.
if re.Max == -1 {
// Special case: x{0,} is x*.
if re.Min == 0 {
return simplify1(OpStar, re.Flags, sub, nil)
}
// Special case: x{1,} is x+.
if re.Min == 1 {
return simplify1(OpPlus, re.Flags, sub, nil)
}
// General case: x{4,} is xxxx+.
nre := &Regexp{Op: OpConcat}
nre.Sub = nre.Sub0[:0]
// Min-1 plain copies, then one x+ supplies the final required match.
for i := 0; i < re.Min-1; i++ {
nre.Sub = append(nre.Sub, sub)
}
nre.Sub = append(nre.Sub, simplify1(OpPlus, re.Flags, sub, nil))
return nre
}
// Special case x{0} handled above.
// Special case: x{1} is just x.
if re.Min == 1 && re.Max == 1 {
return sub
}
// General case: x{n,m} means n copies of x and m copies of x?
// The machine will do less work if we nest the final m copies,
// so that x{2,5} = xx(x(x(x)?)?)?
// Build leading prefix: xx.
var prefix *Regexp
if re.Min > 0 {
prefix = &Regexp{Op: OpConcat}
prefix.Sub = prefix.Sub0[:0]
for i := 0; i < re.Min; i++ {
prefix.Sub = append(prefix.Sub, sub)
}
}
// Build and attach suffix: (x(x(x)?)?)?
if re.Max > re.Min {
// Start with the innermost x? and wrap it Max-Min-1 more times.
suffix := simplify1(OpQuest, re.Flags, sub, nil)
for i := re.Min + 1; i < re.Max; i++ {
nre2 := &Regexp{Op: OpConcat}
nre2.Sub = append(nre2.Sub0[:0], sub, suffix)
suffix = simplify1(OpQuest, re.Flags, nre2, nil)
}
if prefix == nil {
return suffix
}
prefix.Sub = append(prefix.Sub, suffix)
}
if prefix != nil {
return prefix
}
// Some degenerate case like min > max or min < max < 0.
// Handle as impossible match.
return &Regexp{Op: OpNoMatch}
}
return re
}
// simplify1 implements Simplify for the unary operators OpStar, OpPlus and
// OpQuest. It returns the simple regexp equivalent to
//
//	Regexp{Op: op, Flags: flags, Sub: {sub}}
//
// assuming sub is already simple, and avoids allocating that structure when
// an existing node can be returned instead. In particular, when the result
// would be equivalent to re, re itself is returned; re may be nil.
//
// simplify1 is separate from Simplify because the expansion of other
// operators produces these unary expressions too; routing them through
// simplify1 keeps everything they generate simple.
func simplify1(op Op, flags Flags, sub, re *Regexp) *Regexp {
	switch {
	case sub.Op == OpEmptyMatch:
		// Repeating the empty string any number of times is still the
		// empty string.
		return sub

	case op == sub.Op && flags&NonGreedy == sub.Flags&NonGreedy:
		// Applying the same operator twice with matching greediness
		// changes nothing.
		return sub

	case re != nil && re.Op == op && re.Flags&NonGreedy == flags&NonGreedy && sub == re.Sub[0]:
		// The caller's node already is the requested expression.
		return re
	}

	out := &Regexp{Op: op, Flags: flags}
	out.Sub = append(out.Sub0[:0], sub)
	return out
}

View file

@ -0,0 +1,164 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package syntax
import "testing"
// simplifyTests pairs a regexp source (Regexp) with the text expected from
// printing its simplified form (Simple). TestSimplify parses each Regexp,
// calls Simplify on the result and compares String() with Simple.
var simplifyTests = []struct {
Regexp string
Simple string
}{
// Already-simple constructs
{`a`, `a`},
{`ab`, `ab`},
{`a|b`, `[ab]`},
{`ab|cd`, `ab|cd`},
{`(ab)*`, `(ab)*`},
{`(ab)+`, `(ab)+`},
{`(ab)?`, `(ab)?`},
{`.`, `(?s:.)`},
{`^`, `(?m:^)`},
{`$`, `(?m:$)`},
{`[ac]`, `[ac]`},
{`[^ac]`, `[^ac]`},
// Posix character classes
{`[[:alnum:]]`, `[0-9A-Za-z]`},
{`[[:alpha:]]`, `[A-Za-z]`},
{`[[:blank:]]`, `[\t ]`},
{`[[:cntrl:]]`, `[\x00-\x1f\x7f]`},
{`[[:digit:]]`, `[0-9]`},
{`[[:graph:]]`, `[!-~]`},
{`[[:lower:]]`, `[a-z]`},
{`[[:print:]]`, `[ -~]`},
{`[[:punct:]]`, "[!-/:-@\\[-`\\{-~]"},
{`[[:space:]]`, `[\t-\r ]`},
{`[[:upper:]]`, `[A-Z]`},
{`[[:xdigit:]]`, `[0-9A-Fa-f]`},
// Perl character classes
{`\d`, `[0-9]`},
{`\s`, `[\t\n\f\r ]`},
{`\w`, `[0-9A-Z_a-z]`},
{`\D`, `[^0-9]`},
{`\S`, `[^\t\n\f\r ]`},
{`\W`, `[^0-9A-Z_a-z]`},
{`[\d]`, `[0-9]`},
{`[\s]`, `[\t\n\f\r ]`},
{`[\w]`, `[0-9A-Z_a-z]`},
{`[\D]`, `[^0-9]`},
{`[\S]`, `[^\t\n\f\r ]`},
{`[\W]`, `[^0-9A-Z_a-z]`},
// Posix repetitions
{`a{1}`, `a`},
{`a{2}`, `aa`},
{`a{5}`, `aaaaa`},
{`a{0,1}`, `a?`},
// The next three are illegible because Simplify inserts (?:)
// parens instead of () parens to avoid creating extra
// captured subexpressions. The comments show a version with fewer parens.
{`(a){0,2}`, `(?:(a)(a)?)?`}, // (aa?)?
{`(a){0,4}`, `(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // (a(a(aa?)?)?)?
{`(a){2,6}`, `(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // aa(a(a(aa?)?)?)?
{`a{0,2}`, `(?:aa?)?`}, // (aa?)?
{`a{0,4}`, `(?:a(?:a(?:aa?)?)?)?`}, // (a(a(aa?)?)?)?
{`a{2,6}`, `aa(?:a(?:a(?:aa?)?)?)?`}, // aa(a(a(aa?)?)?)?
{`a{0,}`, `a*`},
{`a{1,}`, `a+`},
{`a{2,}`, `aa+`},
{`a{5,}`, `aaaaa+`},
// Test that operators simplify their arguments.
{`(?:a{1,}){1,}`, `a+`},
{`(a{1,}b{1,})`, `(a+b+)`},
{`a{1,}|b{1,}`, `a+|b+`},
{`(?:a{1,})*`, `(?:a+)*`},
{`(?:a{1,})+`, `a+`},
{`(?:a{1,})?`, `(?:a+)?`},
{``, `(?:)`},
{`a{0}`, `(?:)`},
// Character class simplification
{`[ab]`, `[ab]`},
{`[abc]`, `[a-c]`},
{`[a-za-za-z]`, `[a-z]`},
{`[A-Za-zA-Za-z]`, `[A-Za-z]`},
{`[ABCDEFGH]`, `[A-H]`},
{`[AB-CD-EF-GH]`, `[A-H]`},
{`[W-ZP-XE-R]`, `[E-Z]`},
{`[a-ee-gg-m]`, `[a-m]`},
{`[a-ea-ha-m]`, `[a-m]`},
{`[a-ma-ha-e]`, `[a-m]`},
{`[a-zA-Z0-9 -~]`, `[ -~]`},
// Empty character classes
{`[^[:cntrl:][:^cntrl:]]`, `[^\x00-\x{10FFFF}]`},
// Full character classes
{`[[:cntrl:][:^cntrl:]]`, `(?s:.)`},
// Unicode case folding.
{`(?i)A`, `(?i:A)`},
{`(?i)a`, `(?i:A)`},
{`(?i)[A]`, `(?i:A)`},
{`(?i)[a]`, `(?i:A)`},
{`(?i)K`, `(?i:K)`},
{`(?i)k`, `(?i:K)`},
{`(?i)\x{212a}`, "(?i:K)"},
{`(?i)[K]`, "[Kk\u212A]"},
{`(?i)[k]`, "[Kk\u212A]"},
{`(?i)[\x{212a}]`, "[Kk\u212A]"},
{`(?i)[a-z]`, "[A-Za-z\u017F\u212A]"},
{`(?i)[\x00-\x{FFFD}]`, "[\\x00-\uFFFD]"},
{`(?i)[\x00-\x{10FFFF}]`, `(?s:.)`},
// Empty string as a regular expression.
// The empty string must be preserved inside parens in order
// to make submatches work right, so these tests are less
// interesting than they might otherwise be. String inserts
// explicit (?:) in place of non-parenthesized empty strings,
// to make them easier to spot for other parsers.
{`(a|b|c|)`, `([a-c]|(?:))`},
{`(a|b|)`, `([ab]|(?:))`},
{`(|)`, `()`},
{`a()`, `a()`},
{`(()|())`, `(()|())`},
{`(a|)`, `(a|(?:))`},
{`ab()cd()`, `ab()cd()`},
{`()`, `()`},
{`()*`, `()*`},
{`()+`, `()+`},
{`()?`, `()?`},
{`(){0}`, `(?:)`},
{`(){1}`, `()`},
{`(){1,}`, `()+`},
{`(){0,2}`, `(?:()()?)?`},
}
// TestSimplify2 checks a single overlapping-range character class, parsed
// with Perl|DotNL, after simplification.
func TestSimplify2(t *testing.T) {
	const want = `[a-m]`

	re, err := Parse(`[a-ee-gg-m]`, Perl|DotNL)
	if err != nil {
		t.Fatal(err)
	}
	if got := re.Simplify().String(); got != want {
		t.Errorf("Simplify(%#q) = %#q, want %#q", re.String(), got, want)
	}
}
// TestSimplify parses every simplifyTests pattern, simplifies it and
// compares the printed result with the expected text.
func TestSimplify(t *testing.T) {
	for idx := range simplifyTests {
		tc := &simplifyTests[idx]
		re, err := Parse(tc.Regexp, MatchNL|Perl&^OneLine)
		if err != nil {
			t.Errorf("Parse(%#q) = error %v", tc.Regexp, err)
			continue
		}
		if got := re.Simplify().String(); got != tc.Simple {
			t.Errorf("Simplify(%#q) = %#q, want %#q", tc.Regexp, got, tc.Simple)
		}
	}
}