mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-11-21 14:44:00 +00:00
vendor syntax standard lib
This commit is contained in:
parent
258ccfb953
commit
5da6a23cbd
15 changed files with 4837 additions and 3 deletions
|
@ -2,10 +2,10 @@ package regexutil
|
|||
|
||||
import (
|
||||
"regexp"
|
||||
"regexp/syntax"
|
||||
"strings"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/regexutil/syntax"
|
||||
)
|
||||
|
||||
// PromRegex implements an optimized string matching for Prometheus-like regex.
|
||||
|
|
|
@ -2,8 +2,9 @@ package regexutil
|
|||
|
||||
import (
|
||||
"regexp"
|
||||
"regexp/syntax"
|
||||
"strings"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/regexutil/syntax"
|
||||
)
|
||||
|
||||
// Regex implements an optimized string matching for Go regex.
|
||||
|
|
|
@ -2,9 +2,10 @@ package regexutil
|
|||
|
||||
import (
|
||||
"fmt"
|
||||
"regexp/syntax"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/regexutil/syntax"
|
||||
)
|
||||
|
||||
// RemoveStartEndAnchors removes '^' at the start of expr and '$' at the end of the expr.
|
||||
|
|
296
lib/regexutil/syntax/compile.go
Normal file
296
lib/regexutil/syntax/compile.go
Normal file
|
@ -0,0 +1,296 @@
|
|||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package syntax
|
||||
|
||||
import "unicode"
|
||||
|
||||
// A patchList is a list of instruction pointers that need to be filled in (patched).
|
||||
// Because the pointers haven't been filled in yet, we can reuse their storage
|
||||
// to hold the list. It's kind of sleazy, but works well in practice.
|
||||
// See https://swtch.com/~rsc/regexp/regexp1.html for inspiration.
|
||||
//
|
||||
// These aren't really pointers: they're integers, so we can reinterpret them
|
||||
// this way without using package unsafe. A value l.head denotes
|
||||
// p.inst[l.head>>1].Out (l.head&1==0) or .Arg (l.head&1==1).
|
||||
// head == 0 denotes the empty list, okay because we start every program
|
||||
// with a fail instruction, so we'll never want to point at its output link.
|
||||
type patchList struct {
	head uint32 // encoded link at the front of the chain; 0 means the list is empty
	tail uint32 // encoded link at the back of the chain, where another list gets spliced on
}
|
||||
|
||||
func makePatchList(n uint32) patchList {
|
||||
return patchList{n, n}
|
||||
}
|
||||
|
||||
func (l patchList) patch(p *Prog, val uint32) {
|
||||
head := l.head
|
||||
for head != 0 {
|
||||
i := &p.Inst[head>>1]
|
||||
if head&1 == 0 {
|
||||
head = i.Out
|
||||
i.Out = val
|
||||
} else {
|
||||
head = i.Arg
|
||||
i.Arg = val
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (l1 patchList) append(p *Prog, l2 patchList) patchList {
|
||||
if l1.head == 0 {
|
||||
return l2
|
||||
}
|
||||
if l2.head == 0 {
|
||||
return l1
|
||||
}
|
||||
|
||||
i := &p.Inst[l1.tail>>1]
|
||||
if l1.tail&1 == 0 {
|
||||
i.Out = l2.head
|
||||
} else {
|
||||
i.Arg = l2.head
|
||||
}
|
||||
return patchList{l1.head, l2.tail}
|
||||
}
|
||||
|
||||
// A frag represents a compiled program fragment.
|
||||
type frag struct {
|
||||
i uint32 // index of first instruction
|
||||
out patchList // where to record end instruction
|
||||
nullable bool // whether fragment can match empty string
|
||||
}
|
||||
|
||||
type compiler struct {
|
||||
p *Prog
|
||||
}
|
||||
|
||||
// Compile compiles the regexp into a program to be executed.
|
||||
// The regexp should have been simplified already (returned from re.Simplify).
|
||||
func Compile(re *Regexp) (*Prog, error) {
|
||||
var c compiler
|
||||
c.init()
|
||||
f := c.compile(re)
|
||||
f.out.patch(c.p, c.inst(InstMatch).i)
|
||||
c.p.Start = int(f.i)
|
||||
return c.p, nil
|
||||
}
|
||||
|
||||
func (c *compiler) init() {
|
||||
c.p = new(Prog)
|
||||
c.p.NumCap = 2 // implicit ( and ) for whole match $0
|
||||
c.inst(InstFail)
|
||||
}
|
||||
|
||||
// Rune-range tables (lo, hi pairs) used for the "any character" operators.
var (
	// anyRuneNotNL covers every rune except '\n'.
	anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
	// anyRune covers every rune.
	anyRune = []rune{0, unicode.MaxRune}
)
|
||||
|
||||
func (c *compiler) compile(re *Regexp) frag {
|
||||
switch re.Op {
|
||||
case OpNoMatch:
|
||||
return c.fail()
|
||||
case OpEmptyMatch:
|
||||
return c.nop()
|
||||
case OpLiteral:
|
||||
if len(re.Rune) == 0 {
|
||||
return c.nop()
|
||||
}
|
||||
var f frag
|
||||
for j := range re.Rune {
|
||||
f1 := c.rune(re.Rune[j:j+1], re.Flags)
|
||||
if j == 0 {
|
||||
f = f1
|
||||
} else {
|
||||
f = c.cat(f, f1)
|
||||
}
|
||||
}
|
||||
return f
|
||||
case OpCharClass:
|
||||
return c.rune(re.Rune, re.Flags)
|
||||
case OpAnyCharNotNL:
|
||||
return c.rune(anyRuneNotNL, 0)
|
||||
case OpAnyChar:
|
||||
return c.rune(anyRune, 0)
|
||||
case OpBeginLine:
|
||||
return c.empty(EmptyBeginLine)
|
||||
case OpEndLine:
|
||||
return c.empty(EmptyEndLine)
|
||||
case OpBeginText:
|
||||
return c.empty(EmptyBeginText)
|
||||
case OpEndText:
|
||||
return c.empty(EmptyEndText)
|
||||
case OpWordBoundary:
|
||||
return c.empty(EmptyWordBoundary)
|
||||
case OpNoWordBoundary:
|
||||
return c.empty(EmptyNoWordBoundary)
|
||||
case OpCapture:
|
||||
bra := c.cap(uint32(re.Cap << 1))
|
||||
sub := c.compile(re.Sub[0])
|
||||
ket := c.cap(uint32(re.Cap<<1 | 1))
|
||||
return c.cat(c.cat(bra, sub), ket)
|
||||
case OpStar:
|
||||
return c.star(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
|
||||
case OpPlus:
|
||||
return c.plus(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
|
||||
case OpQuest:
|
||||
return c.quest(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
|
||||
case OpConcat:
|
||||
if len(re.Sub) == 0 {
|
||||
return c.nop()
|
||||
}
|
||||
var f frag
|
||||
for i, sub := range re.Sub {
|
||||
if i == 0 {
|
||||
f = c.compile(sub)
|
||||
} else {
|
||||
f = c.cat(f, c.compile(sub))
|
||||
}
|
||||
}
|
||||
return f
|
||||
case OpAlternate:
|
||||
var f frag
|
||||
for _, sub := range re.Sub {
|
||||
f = c.alt(f, c.compile(sub))
|
||||
}
|
||||
return f
|
||||
}
|
||||
panic("regexp: unhandled case in compile")
|
||||
}
|
||||
|
||||
func (c *compiler) inst(op InstOp) frag {
|
||||
// TODO: impose length limit
|
||||
f := frag{i: uint32(len(c.p.Inst)), nullable: true}
|
||||
c.p.Inst = append(c.p.Inst, Inst{Op: op})
|
||||
return f
|
||||
}
|
||||
|
||||
func (c *compiler) nop() frag {
|
||||
f := c.inst(InstNop)
|
||||
f.out = makePatchList(f.i << 1)
|
||||
return f
|
||||
}
|
||||
|
||||
func (c *compiler) fail() frag {
|
||||
return frag{}
|
||||
}
|
||||
|
||||
func (c *compiler) cap(arg uint32) frag {
|
||||
f := c.inst(InstCapture)
|
||||
f.out = makePatchList(f.i << 1)
|
||||
c.p.Inst[f.i].Arg = arg
|
||||
|
||||
if c.p.NumCap < int(arg)+1 {
|
||||
c.p.NumCap = int(arg) + 1
|
||||
}
|
||||
return f
|
||||
}
|
||||
|
||||
func (c *compiler) cat(f1, f2 frag) frag {
|
||||
// concat of failure is failure
|
||||
if f1.i == 0 || f2.i == 0 {
|
||||
return frag{}
|
||||
}
|
||||
|
||||
// TODO: elide nop
|
||||
|
||||
f1.out.patch(c.p, f2.i)
|
||||
return frag{f1.i, f2.out, f1.nullable && f2.nullable}
|
||||
}
|
||||
|
||||
func (c *compiler) alt(f1, f2 frag) frag {
|
||||
// alt of failure is other
|
||||
if f1.i == 0 {
|
||||
return f2
|
||||
}
|
||||
if f2.i == 0 {
|
||||
return f1
|
||||
}
|
||||
|
||||
f := c.inst(InstAlt)
|
||||
i := &c.p.Inst[f.i]
|
||||
i.Out = f1.i
|
||||
i.Arg = f2.i
|
||||
f.out = f1.out.append(c.p, f2.out)
|
||||
f.nullable = f1.nullable || f2.nullable
|
||||
return f
|
||||
}
|
||||
|
||||
func (c *compiler) quest(f1 frag, nongreedy bool) frag {
|
||||
f := c.inst(InstAlt)
|
||||
i := &c.p.Inst[f.i]
|
||||
if nongreedy {
|
||||
i.Arg = f1.i
|
||||
f.out = makePatchList(f.i << 1)
|
||||
} else {
|
||||
i.Out = f1.i
|
||||
f.out = makePatchList(f.i<<1 | 1)
|
||||
}
|
||||
f.out = f.out.append(c.p, f1.out)
|
||||
return f
|
||||
}
|
||||
|
||||
// loop returns the fragment for the main loop of a plus or star.
|
||||
// For plus, it can be used after changing the entry to f1.i.
|
||||
// For star, it can be used directly when f1 can't match an empty string.
|
||||
// (When f1 can match an empty string, f1* must be implemented as (f1+)?
|
||||
// to get the priority match order correct.)
|
||||
func (c *compiler) loop(f1 frag, nongreedy bool) frag {
|
||||
f := c.inst(InstAlt)
|
||||
i := &c.p.Inst[f.i]
|
||||
if nongreedy {
|
||||
i.Arg = f1.i
|
||||
f.out = makePatchList(f.i << 1)
|
||||
} else {
|
||||
i.Out = f1.i
|
||||
f.out = makePatchList(f.i<<1 | 1)
|
||||
}
|
||||
f1.out.patch(c.p, f.i)
|
||||
return f
|
||||
}
|
||||
|
||||
func (c *compiler) star(f1 frag, nongreedy bool) frag {
|
||||
if f1.nullable {
|
||||
// Use (f1+)? to get priority match order correct.
|
||||
// See golang.org/issue/46123.
|
||||
return c.quest(c.plus(f1, nongreedy), nongreedy)
|
||||
}
|
||||
return c.loop(f1, nongreedy)
|
||||
}
|
||||
|
||||
func (c *compiler) plus(f1 frag, nongreedy bool) frag {
|
||||
return frag{f1.i, c.loop(f1, nongreedy).out, f1.nullable}
|
||||
}
|
||||
|
||||
func (c *compiler) empty(op EmptyOp) frag {
|
||||
f := c.inst(InstEmptyWidth)
|
||||
c.p.Inst[f.i].Arg = uint32(op)
|
||||
f.out = makePatchList(f.i << 1)
|
||||
return f
|
||||
}
|
||||
|
||||
func (c *compiler) rune(r []rune, flags Flags) frag {
|
||||
f := c.inst(InstRune)
|
||||
f.nullable = false
|
||||
i := &c.p.Inst[f.i]
|
||||
i.Rune = r
|
||||
flags &= FoldCase // only relevant flag is FoldCase
|
||||
if len(r) != 1 || unicode.SimpleFold(r[0]) == r[0] {
|
||||
// and sometimes not even that
|
||||
flags &^= FoldCase
|
||||
}
|
||||
i.Arg = uint32(flags)
|
||||
f.out = makePatchList(f.i << 1)
|
||||
|
||||
// Special cases for exec machine.
|
||||
switch {
|
||||
case flags&FoldCase == 0 && (len(r) == 1 || len(r) == 2 && r[0] == r[1]):
|
||||
i.Op = InstRune1
|
||||
case len(r) == 2 && r[0] == 0 && r[1] == unicode.MaxRune:
|
||||
i.Op = InstRuneAny
|
||||
case len(r) == 4 && r[0] == 0 && r[1] == '\n'-1 && r[2] == '\n'+1 && r[3] == unicode.MaxRune:
|
||||
i.Op = InstRuneAnyNotNL
|
||||
}
|
||||
|
||||
return f
|
||||
}
|
142
lib/regexutil/syntax/doc.go
Normal file
142
lib/regexutil/syntax/doc.go
Normal file
|
@ -0,0 +1,142 @@
|
|||
// Copyright 2012 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Code generated by mksyntaxgo from the RE2 distribution. DO NOT EDIT.
|
||||
|
||||
/*
|
||||
Package syntax parses regular expressions into parse trees and compiles
|
||||
parse trees into programs. Most clients of regular expressions will use the
|
||||
facilities of package [regexp] (such as [regexp.Compile] and [regexp.Match]) instead of this package.
|
||||
|
||||
# Syntax
|
||||
|
||||
The regular expression syntax understood by this package when parsing with the [Perl] flag is as follows.
|
||||
Parts of the syntax can be disabled by passing alternate flags to [Parse].
|
||||
|
||||
Single characters:
|
||||
|
||||
. any character, possibly including newline (flag s=true)
|
||||
[xyz] character class
|
||||
[^xyz] negated character class
|
||||
\d Perl character class
|
||||
\D negated Perl character class
|
||||
[[:alpha:]] ASCII character class
|
||||
[[:^alpha:]] negated ASCII character class
|
||||
\pN Unicode character class (one-letter name)
|
||||
\p{Greek} Unicode character class
|
||||
\PN negated Unicode character class (one-letter name)
|
||||
\P{Greek} negated Unicode character class
|
||||
|
||||
Composites:
|
||||
|
||||
xy x followed by y
|
||||
x|y x or y (prefer x)
|
||||
|
||||
Repetitions:
|
||||
|
||||
x* zero or more x, prefer more
|
||||
x+ one or more x, prefer more
|
||||
x? zero or one x, prefer one
|
||||
x{n,m} n or n+1 or ... or m x, prefer more
|
||||
x{n,} n or more x, prefer more
|
||||
x{n} exactly n x
|
||||
x*? zero or more x, prefer fewer
|
||||
x+? one or more x, prefer fewer
|
||||
x?? zero or one x, prefer zero
|
||||
x{n,m}? n or n+1 or ... or m x, prefer fewer
|
||||
x{n,}? n or more x, prefer fewer
|
||||
x{n}? exactly n x
|
||||
|
||||
Implementation restriction: The counting forms x{n,m}, x{n,}, and x{n}
|
||||
reject forms that create a minimum or maximum repetition count above 1000.
|
||||
Unlimited repetitions are not subject to this restriction.
|
||||
|
||||
Grouping:
|
||||
|
||||
(re) numbered capturing group (submatch)
|
||||
(?P<name>re) named & numbered capturing group (submatch)
|
||||
(?<name>re) named & numbered capturing group (submatch)
|
||||
(?:re) non-capturing group
|
||||
(?flags) set flags within current group; non-capturing
|
||||
(?flags:re) set flags during re; non-capturing
|
||||
|
||||
Flag syntax is xyz (set) or -xyz (clear) or xy-z (set xy, clear z). The flags are:
|
||||
|
||||
i case-insensitive (default false)
|
||||
m multi-line mode: ^ and $ match begin/end line in addition to begin/end text (default false)
|
||||
s let . match \n (default false)
|
||||
U ungreedy: swap meaning of x* and x*?, x+ and x+?, etc (default false)
|
||||
|
||||
Empty strings:
|
||||
|
||||
^ at beginning of text or line (flag m=true)
|
||||
$ at end of text (like \z not \Z) or line (flag m=true)
|
||||
\A at beginning of text
|
||||
\b at ASCII word boundary (\w on one side and \W, \A, or \z on the other)
|
||||
\B not at ASCII word boundary
|
||||
\z at end of text
|
||||
|
||||
Escape sequences:
|
||||
|
||||
\a bell (== \007)
|
||||
\f form feed (== \014)
|
||||
\t horizontal tab (== \011)
|
||||
\n newline (== \012)
|
||||
\r carriage return (== \015)
|
||||
\v vertical tab character (== \013)
|
||||
\* literal *, for any punctuation character *
|
||||
\123 octal character code (up to three digits)
|
||||
\x7F hex character code (exactly two digits)
|
||||
\x{10FFFF} hex character code
|
||||
\Q...\E literal text ... even if ... has punctuation
|
||||
|
||||
Character class elements:
|
||||
|
||||
x single character
|
||||
A-Z character range (inclusive)
|
||||
\d Perl character class
|
||||
[:foo:] ASCII character class foo
|
||||
\p{Foo} Unicode character class Foo
|
||||
\pF Unicode character class F (one-letter name)
|
||||
|
||||
Named character classes as character class elements:
|
||||
|
||||
[\d] digits (== \d)
|
||||
[^\d] not digits (== \D)
|
||||
[\D] not digits (== \D)
|
||||
[^\D] not not digits (== \d)
|
||||
[[:name:]] named ASCII class inside character class (== [:name:])
|
||||
[^[:name:]] named ASCII class inside negated character class (== [:^name:])
|
||||
[\p{Name}] named Unicode property inside character class (== \p{Name})
|
||||
[^\p{Name}] named Unicode property inside negated character class (== \P{Name})
|
||||
|
||||
Perl character classes (all ASCII-only):
|
||||
|
||||
\d digits (== [0-9])
|
||||
\D not digits (== [^0-9])
|
||||
\s whitespace (== [\t\n\f\r ])
|
||||
\S not whitespace (== [^\t\n\f\r ])
|
||||
\w word characters (== [0-9A-Za-z_])
|
||||
\W not word characters (== [^0-9A-Za-z_])
|
||||
|
||||
ASCII character classes:
|
||||
|
||||
[[:alnum:]] alphanumeric (== [0-9A-Za-z])
|
||||
[[:alpha:]] alphabetic (== [A-Za-z])
|
||||
[[:ascii:]] ASCII (== [\x00-\x7F])
|
||||
[[:blank:]] blank (== [\t ])
|
||||
[[:cntrl:]] control (== [\x00-\x1F\x7F])
|
||||
[[:digit:]] digits (== [0-9])
|
||||
[[:graph:]] graphical (== [!-~] == [A-Za-z0-9!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~])
|
||||
[[:lower:]] lower case (== [a-z])
|
||||
[[:print:]] printable (== [ -~] == [ [:graph:]])
|
||||
[[:punct:]] punctuation (== [!-/:-@[-`{-~])
|
||||
[[:space:]] whitespace (== [\t\n\v\f\r ])
|
||||
[[:upper:]] upper case (== [A-Z])
|
||||
[[:word:]] word characters (== [0-9A-Za-z_])
|
||||
[[:xdigit:]] hex digit (== [0-9A-Fa-f])
|
||||
|
||||
Unicode character classes are those in [unicode.Categories] and [unicode.Scripts].
|
||||
*/
|
||||
package syntax
|
128
lib/regexutil/syntax/make_perl_groups.pl
Executable file
128
lib/regexutil/syntax/make_perl_groups.pl
Executable file
|
@ -0,0 +1,128 @@
|
|||
#!/usr/bin/perl
|
||||
# Copyright 2008 The Go Authors. All rights reserved.
|
||||
# Use of this source code is governed by a BSD-style
|
||||
# license that can be found in the LICENSE file.
|
||||
|
||||
# Modified version of RE2's make_perl_groups.pl.
|
||||
|
||||
# Generate table entries giving character ranges
|
||||
# for POSIX/Perl character classes. Rather than
|
||||
# figure out what the definition is, it is easier to ask
|
||||
# Perl about each letter from 0-128 and write down
|
||||
# its answer.
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
|
||||
# POSIX bracket class names to generate tables for.
my @posixclasses = qw(
  [:alnum:]
  [:alpha:]
  [:ascii:]
  [:blank:]
  [:cntrl:]
  [:digit:]
  [:graph:]
  [:lower:]
  [:print:]
  [:punct:]
  [:space:]
  [:upper:]
  [:word:]
  [:xdigit:]
);

# Perl escape classes to generate tables for.
my @perlclasses = ('\d', '\s', '\w');

# Forced membership answers, keyed by "class:codepoint".
my %overrides = (
  # Prior to Perl 5.18, \s did not match vertical tab.
  # RE2 preserves that original behaviour.
  '\s:11' => 0,
);
|
||||
|
||||
# ComputeClass returns the list of [lo, hi] code point ranges that the given
# class matches, determined by testing each code point against "[$class]".
# An entry in %overrides, when present, replaces the regexp's answer.
sub ComputeClass($) {
  my ($class) = @_;
  my $regexp = "[$class]";
  my @ranges;
  my $start = -1;

  # Probe 0..128, then 256 as a sentinel that is never a member and so
  # flushes a range that is still open.
  foreach my $i (0 .. 128, 256) {
    my $member = 0;
    if ($i <= 128) {
      my $forced = $overrides{"$class:$i"};
      $member = defined($forced) ? $forced : (chr($i) =~ $regexp);
    }

    if ($member) {
      $start = $i if $start < 0;
      next;
    }

    push @ranges, [$start, $i - 1] if $start >= 0;
    $start = -1;
  }
  return @ranges;
}
|
||||
|
||||
# PrintClass prints the Go rune table "code$cname" for the given ranges to
# the currently selected filehandle and returns the two Go map entries (the
# class and its negation) that refer to that table.
#
# The negated name is formed by inserting '^' after the first ':' for POSIX
# names, or by upper-casing the name otherwise (e.g. \d becomes \D).
sub PrintClass($$@) {
  my ($cname, $name, @ranges) = @_;

  print "var code$cname = []rune{ /* $name */\n";
  foreach my $range (@ranges) {
    printf "\t0x%x, 0x%x,\n", $range->[0], $range->[1];
  }
  print "}\n\n";

  my $negname = $name;
  if ($negname =~ /:/) {
    $negname =~ s/:/:^/;
  } else {
    $negname =~ y/a-z/A-Z/;
  }

  return "\t`$name`: {+1, code$cname},\n" .
         "\t`$negname`: {-1, code$cname},\n";
}
|
||||
|
||||
# Running counter used to give every generated table a unique name.
my $gen = 0;

# PrintClasses prints one rune table per class, followed by the Go map
# "${cname}Group" that ties each class name (and its negation) to its table.
# Output goes to the currently selected filehandle.
# Returns the number of classes processed.
sub PrintClasses($@) {
  my ($cname, @classes) = @_;

  my @entries;
  foreach my $cl (@classes) {
    push @entries, PrintClass(++$gen, $cl, ComputeClass($cl));
  }

  print "var ${cname}Group = map[string]charGroup{\n";
  print $_ foreach @entries;
  print "}\n";

  return scalar(@entries);
}
|
||||
|
||||
# Prepare gofmt command
my $gofmt;
my $gofmt_cmd = 'gofmt';

if (@ARGV > 0 && $ARGV[0] =~ /\.go$/) {
  # Send the output of gofmt to the given file
  $gofmt_cmd = 'gofmt >'.$ARGV[0];
}
open($gofmt, '|-', $gofmt_cmd) or die "cannot start '$gofmt_cmd': $!\n";

# Redirect STDOUT to gofmt input
select $gofmt;

print <<EOF;
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Code generated by make_perl_groups.pl; DO NOT EDIT.

package syntax

EOF

PrintClasses("perl", @perlclasses);
PrintClasses("posix", @posixclasses);

# Restore STDOUT and close the pipe explicitly: close on a piped handle
# waits for gofmt and reports a non-zero exit, which would otherwise go
# unnoticed and could leave a truncated or empty output file behind.
select STDOUT;
close($gofmt)
  or die($! ? "error closing pipe to '$gofmt_cmd': $!\n"
            : "'$gofmt_cmd' exited with status $?\n");
|
52
lib/regexutil/syntax/op_string.go
Normal file
52
lib/regexutil/syntax/op_string.go
Normal file
|
@ -0,0 +1,52 @@
|
|||
// Code generated by "stringer -type Op -trimprefix Op"; DO NOT EDIT.
|
||||
|
||||
package syntax
|
||||
|
||||
import "strconv"
|
||||
|
||||
func _() {
|
||||
// An "invalid array index" compiler error signifies that the constant values have changed.
|
||||
// Re-run the stringer command to generate them again.
|
||||
var x [1]struct{}
|
||||
_ = x[OpNoMatch-1]
|
||||
_ = x[OpEmptyMatch-2]
|
||||
_ = x[OpLiteral-3]
|
||||
_ = x[OpCharClass-4]
|
||||
_ = x[OpAnyCharNotNL-5]
|
||||
_ = x[OpAnyChar-6]
|
||||
_ = x[OpBeginLine-7]
|
||||
_ = x[OpEndLine-8]
|
||||
_ = x[OpBeginText-9]
|
||||
_ = x[OpEndText-10]
|
||||
_ = x[OpWordBoundary-11]
|
||||
_ = x[OpNoWordBoundary-12]
|
||||
_ = x[OpCapture-13]
|
||||
_ = x[OpStar-14]
|
||||
_ = x[OpPlus-15]
|
||||
_ = x[OpQuest-16]
|
||||
_ = x[OpRepeat-17]
|
||||
_ = x[OpConcat-18]
|
||||
_ = x[OpAlternate-19]
|
||||
_ = x[opPseudo-128]
|
||||
}
|
||||
|
||||
// _Op_name_0 is the concatenation of the names of Op values 1 through 19;
// _Op_name_1 is the name of Op value 128.
const (
	_Op_name_0 = "NoMatchEmptyMatchLiteralCharClassAnyCharNotNLAnyCharBeginLineEndLineBeginTextEndTextWordBoundaryNoWordBoundaryCaptureStarPlusQuestRepeatConcatAlternate"
	_Op_name_1 = "opPseudo"
)

// _Op_index_0 holds the byte offsets into _Op_name_0 at which each name
// starts; the final element is the total length.
var _Op_index_0 = [...]uint8{0, 7, 17, 24, 33, 45, 52, 61, 68, 77, 84, 96, 110, 117, 121, 125, 130, 136, 142, 151}
|
||||
|
||||
func (i Op) String() string {
|
||||
switch {
|
||||
case 1 <= i && i <= 19:
|
||||
i -= 1
|
||||
return _Op_name_0[_Op_index_0[i]:_Op_index_0[i+1]]
|
||||
case i == 128:
|
||||
return _Op_name_1
|
||||
default:
|
||||
return "Op(" + strconv.FormatInt(int64(i), 10) + ")"
|
||||
}
|
||||
}
|
2134
lib/regexutil/syntax/parse.go
Normal file
2134
lib/regexutil/syntax/parse.go
Normal file
File diff suppressed because it is too large
Load diff
628
lib/regexutil/syntax/parse_test.go
Normal file
628
lib/regexutil/syntax/parse_test.go
Normal file
|
@ -0,0 +1,628 @@
|
|||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package syntax
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
"testing"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
// parseTest pairs a regular expression with the structural dump its parse
// tree is expected to produce. An empty Dump means the test only checks
// that the expression parses.
type parseTest struct {
	Regexp string // expression handed to Parse
	Dump   string // expected output of dump, or "" to skip the comparison
}
|
||||
|
||||
var parseTests = []parseTest{
|
||||
// Base cases
|
||||
{`a`, `lit{a}`},
|
||||
{`a.`, `cat{lit{a}dot{}}`},
|
||||
{`a.b`, `cat{lit{a}dot{}lit{b}}`},
|
||||
{`ab`, `str{ab}`},
|
||||
{`a.b.c`, `cat{lit{a}dot{}lit{b}dot{}lit{c}}`},
|
||||
{`abc`, `str{abc}`},
|
||||
{`a|^`, `alt{lit{a}bol{}}`},
|
||||
{`a|b`, `cc{0x61-0x62}`},
|
||||
{`(a)`, `cap{lit{a}}`},
|
||||
{`(a)|b`, `alt{cap{lit{a}}lit{b}}`},
|
||||
{`a*`, `star{lit{a}}`},
|
||||
{`a+`, `plus{lit{a}}`},
|
||||
{`a?`, `que{lit{a}}`},
|
||||
{`a{2}`, `rep{2,2 lit{a}}`},
|
||||
{`a{2,3}`, `rep{2,3 lit{a}}`},
|
||||
{`a{2,}`, `rep{2,-1 lit{a}}`},
|
||||
{`a*?`, `nstar{lit{a}}`},
|
||||
{`a+?`, `nplus{lit{a}}`},
|
||||
{`a??`, `nque{lit{a}}`},
|
||||
{`a{2}?`, `nrep{2,2 lit{a}}`},
|
||||
{`a{2,3}?`, `nrep{2,3 lit{a}}`},
|
||||
{`a{2,}?`, `nrep{2,-1 lit{a}}`},
|
||||
// Malformed { } are treated as literals.
|
||||
{`x{1001`, `str{x{1001}`},
|
||||
{`x{9876543210`, `str{x{9876543210}`},
|
||||
{`x{9876543210,`, `str{x{9876543210,}`},
|
||||
{`x{2,1`, `str{x{2,1}`},
|
||||
{`x{1,9876543210`, `str{x{1,9876543210}`},
|
||||
{``, `emp{}`},
|
||||
{`|`, `emp{}`}, // alt{emp{}emp{}} but got factored
|
||||
{`|x|`, `alt{emp{}lit{x}emp{}}`},
|
||||
{`.`, `dot{}`},
|
||||
{`^`, `bol{}`},
|
||||
{`$`, `eol{}`},
|
||||
{`\|`, `lit{|}`},
|
||||
{`\(`, `lit{(}`},
|
||||
{`\)`, `lit{)}`},
|
||||
{`\*`, `lit{*}`},
|
||||
{`\+`, `lit{+}`},
|
||||
{`\?`, `lit{?}`},
|
||||
{`{`, `lit{{}`},
|
||||
{`}`, `lit{}}`},
|
||||
{`\.`, `lit{.}`},
|
||||
{`\^`, `lit{^}`},
|
||||
{`\$`, `lit{$}`},
|
||||
{`\\`, `lit{\}`},
|
||||
{`[ace]`, `cc{0x61 0x63 0x65}`},
|
||||
{`[abc]`, `cc{0x61-0x63}`},
|
||||
{`[a-z]`, `cc{0x61-0x7a}`},
|
||||
{`[a]`, `lit{a}`},
|
||||
{`\-`, `lit{-}`},
|
||||
{`-`, `lit{-}`},
|
||||
{`\_`, `lit{_}`},
|
||||
{`abc`, `str{abc}`},
|
||||
{`abc|def`, `alt{str{abc}str{def}}`},
|
||||
{`abc|def|ghi`, `alt{str{abc}str{def}str{ghi}}`},
|
||||
|
||||
// Posix and Perl extensions
|
||||
{`[[:lower:]]`, `cc{0x61-0x7a}`},
|
||||
{`[a-z]`, `cc{0x61-0x7a}`},
|
||||
{`[^[:lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`},
|
||||
{`[[:^lower:]]`, `cc{0x0-0x60 0x7b-0x10ffff}`},
|
||||
{`(?i)[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
|
||||
{`(?i)[a-z]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
|
||||
{`(?i)[^[:lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`},
|
||||
{`(?i)[[:^lower:]]`, `cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`},
|
||||
{`\d`, `cc{0x30-0x39}`},
|
||||
{`\D`, `cc{0x0-0x2f 0x3a-0x10ffff}`},
|
||||
{`\s`, `cc{0x9-0xa 0xc-0xd 0x20}`},
|
||||
{`\S`, `cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}`},
|
||||
{`\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}`},
|
||||
{`\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}`},
|
||||
{`(?i)\w`, `cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}`},
|
||||
{`(?i)\W`, `cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}`},
|
||||
{`[^\\]`, `cc{0x0-0x5b 0x5d-0x10ffff}`},
|
||||
// { `\C`, `byte{}` }, // probably never
|
||||
|
||||
// Unicode, negatives, and a double negative.
|
||||
{`\p{Braille}`, `cc{0x2800-0x28ff}`},
|
||||
{`\P{Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
|
||||
{`\p{^Braille}`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
|
||||
{`\P{^Braille}`, `cc{0x2800-0x28ff}`},
|
||||
{`\pZ`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`},
|
||||
{`[\p{Braille}]`, `cc{0x2800-0x28ff}`},
|
||||
{`[\P{Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
|
||||
{`[\p{^Braille}]`, `cc{0x0-0x27ff 0x2900-0x10ffff}`},
|
||||
{`[\P{^Braille}]`, `cc{0x2800-0x28ff}`},
|
||||
{`[\pZ]`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`},
|
||||
{`\p{Lu}`, mkCharClass(unicode.IsUpper)},
|
||||
{`[\p{Lu}]`, mkCharClass(unicode.IsUpper)},
|
||||
{`(?i)[\p{Lu}]`, mkCharClass(isUpperFold)},
|
||||
{`\p{Any}`, `dot{}`},
|
||||
{`\p{^Any}`, `cc{}`},
|
||||
|
||||
// Hex, octal.
|
||||
{`[\012-\234]\141`, `cat{cc{0xa-0x9c}lit{a}}`},
|
||||
{`[\x{41}-\x7a]\x61`, `cat{cc{0x41-0x7a}lit{a}}`},
|
||||
|
||||
// More interesting regular expressions.
|
||||
{`a{,2}`, `str{a{,2}}`},
|
||||
{`\.\^\$\\`, `str{.^$\}`},
|
||||
{`[a-zABC]`, `cc{0x41-0x43 0x61-0x7a}`},
|
||||
{`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`},
|
||||
{`[α-ε☺]`, `cc{0x3b1-0x3b5 0x263a}`}, // utf-8
|
||||
{`a*{`, `cat{star{lit{a}}lit{{}}`},
|
||||
|
||||
// Test precedences
|
||||
{`(?:ab)*`, `star{str{ab}}`},
|
||||
{`(ab)*`, `star{cap{str{ab}}}`},
|
||||
{`ab|cd`, `alt{str{ab}str{cd}}`},
|
||||
{`a(b|c)d`, `cat{lit{a}cap{cc{0x62-0x63}}lit{d}}`},
|
||||
|
||||
// Test flattening.
|
||||
{`(?:a)`, `lit{a}`},
|
||||
{`(?:ab)(?:cd)`, `str{abcd}`},
|
||||
{`(?:a+b+)(?:c+d+)`, `cat{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`},
|
||||
{`(?:a+|b+)|(?:c+|d+)`, `alt{plus{lit{a}}plus{lit{b}}plus{lit{c}}plus{lit{d}}}`},
|
||||
{`(?:a|b)|(?:c|d)`, `cc{0x61-0x64}`},
|
||||
{`a|.`, `dot{}`},
|
||||
{`.|a`, `dot{}`},
|
||||
{`(?:[abc]|A|Z|hello|world)`, `alt{cc{0x41 0x5a 0x61-0x63}str{hello}str{world}}`},
|
||||
{`(?:[abc]|A|Z)`, `cc{0x41 0x5a 0x61-0x63}`},
|
||||
|
||||
// Test Perl quoted literals
|
||||
{`\Q+|*?{[\E`, `str{+|*?{[}`},
|
||||
{`\Q+\E+`, `plus{lit{+}}`},
|
||||
{`\Qab\E+`, `cat{lit{a}plus{lit{b}}}`},
|
||||
{`\Q\\E`, `lit{\}`},
|
||||
{`\Q\\\E`, `str{\\}`},
|
||||
|
||||
// Test Perl \A and \z
|
||||
{`(?m)^`, `bol{}`},
|
||||
{`(?m)$`, `eol{}`},
|
||||
{`(?-m)^`, `bot{}`},
|
||||
{`(?-m)$`, `eot{}`},
|
||||
{`(?m)\A`, `bot{}`},
|
||||
{`(?m)\z`, `eot{\z}`},
|
||||
{`(?-m)\A`, `bot{}`},
|
||||
{`(?-m)\z`, `eot{\z}`},
|
||||
|
||||
// Test named captures
|
||||
{`(?P<name>a)`, `cap{name:lit{a}}`},
|
||||
{`(?<name>a)`, `cap{name:lit{a}}`},
|
||||
|
||||
// Case-folded literals
|
||||
{`[Aa]`, `litfold{A}`},
|
||||
{`[\x{100}\x{101}]`, `litfold{Ā}`},
|
||||
{`[Δδ]`, `litfold{Δ}`},
|
||||
|
||||
// Strings
|
||||
{`abcde`, `str{abcde}`},
|
||||
{`[Aa][Bb]cd`, `cat{strfold{AB}str{cd}}`},
|
||||
|
||||
// Factoring.
|
||||
{`abc|abd|aef|bcx|bcy`, `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}cat{str{bc}cc{0x78-0x79}}}`},
|
||||
{`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}lit{y}}cat{plus{lit{x}}lit{z}}cat{plus{lit{y}}lit{w}}}}`},
|
||||
|
||||
// Bug fixes.
|
||||
{`(?:.)`, `dot{}`},
|
||||
{`(?:x|(?:xa))`, `cat{lit{x}alt{emp{}lit{a}}}`},
|
||||
{`(?:.|(?:.a))`, `cat{dot{}alt{emp{}lit{a}}}`},
|
||||
{`(?:A(?:A|a))`, `cat{lit{A}litfold{A}}`},
|
||||
{`(?:A|a)`, `litfold{A}`},
|
||||
{`A|(?:A|a)`, `litfold{A}`},
|
||||
{`(?s).`, `dot{}`},
|
||||
{`(?-s).`, `dnl{}`},
|
||||
{`(?:(?:^).)`, `cat{bol{}dot{}}`},
|
||||
{`(?-s)(?:(?:^).)`, `cat{bol{}dnl{}}`},
|
||||
{`[\s\S]a`, `cat{cc{0x0-0x10ffff}lit{a}}`},
|
||||
|
||||
// RE2 prefix_tests
|
||||
{`abc|abd`, `cat{str{ab}cc{0x63-0x64}}`},
|
||||
{`a(?:b)c|abd`, `cat{str{ab}cc{0x63-0x64}}`},
|
||||
{`abc|abd|aef|bcx|bcy`,
|
||||
`alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}` +
|
||||
`cat{str{bc}cc{0x78-0x79}}}`},
|
||||
{`abc|x|abd`, `alt{str{abc}lit{x}str{abd}}`},
|
||||
{`(?i)abc|ABD`, `cat{strfold{AB}cc{0x43-0x44 0x63-0x64}}`},
|
||||
{`[ab]c|[ab]d`, `cat{cc{0x61-0x62}cc{0x63-0x64}}`},
|
||||
{`.c|.d`, `cat{dot{}cc{0x63-0x64}}`},
|
||||
{`x{2}|x{2}[0-9]`,
|
||||
`cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}`},
|
||||
{`x{2}y|x{2}[0-9]y`,
|
||||
`cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}`},
|
||||
{`a.*?c|a.*?b`,
|
||||
`cat{lit{a}alt{cat{nstar{dot{}}lit{c}}cat{nstar{dot{}}lit{b}}}}`},
|
||||
|
||||
// Valid repetitions.
|
||||
{`((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}))`, ``},
|
||||
{`((((((((((x{1}){2}){2}){2}){2}){2}){2}){2}){2}){2})`, ``},
|
||||
|
||||
// Valid nesting.
|
||||
{strings.Repeat("(", 999) + strings.Repeat(")", 999), ``},
|
||||
{strings.Repeat("(?:", 999) + strings.Repeat(")*", 999), ``},
|
||||
{"(" + strings.Repeat("|", 12345) + ")", ``}, // not nested at all
|
||||
}
|
||||
|
||||
const testFlags = MatchNL | PerlX | UnicodeGroups
|
||||
|
||||
func TestParseSimple(t *testing.T) {
|
||||
testParseDump(t, parseTests, testFlags)
|
||||
}
|
||||
|
||||
var foldcaseTests = []parseTest{
|
||||
{`AbCdE`, `strfold{ABCDE}`},
|
||||
{`[Aa]`, `litfold{A}`},
|
||||
{`a`, `litfold{A}`},
|
||||
|
||||
// 0x17F is an old English long s (looks like an f) and folds to s.
|
||||
// 0x212A is the Kelvin symbol and folds to k.
|
||||
{`A[F-g]`, `cat{litfold{A}cc{0x41-0x7a 0x17f 0x212a}}`}, // [Aa][A-z...]
|
||||
{`[[:upper:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
|
||||
{`[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
|
||||
}
|
||||
|
||||
func TestParseFoldCase(t *testing.T) {
|
||||
testParseDump(t, foldcaseTests, FoldCase)
|
||||
}
|
||||
|
||||
var literalTests = []parseTest{
|
||||
{"(|)^$.[*+?]{5,10},\\", "str{(|)^$.[*+?]{5,10},\\}"},
|
||||
}
|
||||
|
||||
func TestParseLiteral(t *testing.T) {
|
||||
testParseDump(t, literalTests, Literal)
|
||||
}
|
||||
|
||||
var matchnlTests = []parseTest{
|
||||
{`.`, `dot{}`},
|
||||
{"\n", "lit{\n}"},
|
||||
{`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`},
|
||||
{`[a\n]`, `cc{0xa 0x61}`},
|
||||
}
|
||||
|
||||
func TestParseMatchNL(t *testing.T) {
|
||||
testParseDump(t, matchnlTests, MatchNL)
|
||||
}
|
||||
|
||||
var nomatchnlTests = []parseTest{
|
||||
{`.`, `dnl{}`},
|
||||
{"\n", "lit{\n}"},
|
||||
{`[^a]`, `cc{0x0-0x9 0xb-0x60 0x62-0x10ffff}`},
|
||||
{`[a\n]`, `cc{0xa 0x61}`},
|
||||
}
|
||||
|
||||
func TestParseNoMatchNL(t *testing.T) {
|
||||
testParseDump(t, nomatchnlTests, 0)
|
||||
}
|
||||
|
||||
// Test Parse -> Dump.
|
||||
func testParseDump(t *testing.T, tests []parseTest, flags Flags) {
|
||||
for _, tt := range tests {
|
||||
re, err := Parse(tt.Regexp, flags)
|
||||
if err != nil {
|
||||
t.Errorf("Parse(%#q): %v", tt.Regexp, err)
|
||||
continue
|
||||
}
|
||||
if tt.Dump == "" {
|
||||
// It parsed. That's all we care about.
|
||||
continue
|
||||
}
|
||||
d := dump(re)
|
||||
if d != tt.Dump {
|
||||
t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// dump prints a string representation of the regexp showing
|
||||
// the structure explicitly.
|
||||
func dump(re *Regexp) string {
|
||||
var b strings.Builder
|
||||
dumpRegexp(&b, re)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
var opNames = []string{
|
||||
OpNoMatch: "no",
|
||||
OpEmptyMatch: "emp",
|
||||
OpLiteral: "lit",
|
||||
OpCharClass: "cc",
|
||||
OpAnyCharNotNL: "dnl",
|
||||
OpAnyChar: "dot",
|
||||
OpBeginLine: "bol",
|
||||
OpEndLine: "eol",
|
||||
OpBeginText: "bot",
|
||||
OpEndText: "eot",
|
||||
OpWordBoundary: "wb",
|
||||
OpNoWordBoundary: "nwb",
|
||||
OpCapture: "cap",
|
||||
OpStar: "star",
|
||||
OpPlus: "plus",
|
||||
OpQuest: "que",
|
||||
OpRepeat: "rep",
|
||||
OpConcat: "cat",
|
||||
OpAlternate: "alt",
|
||||
}
|
||||
|
||||
// dumpRegexp writes an encoding of the syntax tree for the regexp re to b.
|
||||
// It is used during testing to distinguish between parses that might print
|
||||
// the same using re's String method.
|
||||
func dumpRegexp(b *strings.Builder, re *Regexp) {
|
||||
if int(re.Op) >= len(opNames) || opNames[re.Op] == "" {
|
||||
fmt.Fprintf(b, "op%d", re.Op)
|
||||
} else {
|
||||
switch re.Op {
|
||||
default:
|
||||
b.WriteString(opNames[re.Op])
|
||||
case OpStar, OpPlus, OpQuest, OpRepeat:
|
||||
if re.Flags&NonGreedy != 0 {
|
||||
b.WriteByte('n')
|
||||
}
|
||||
b.WriteString(opNames[re.Op])
|
||||
case OpLiteral:
|
||||
if len(re.Rune) > 1 {
|
||||
b.WriteString("str")
|
||||
} else {
|
||||
b.WriteString("lit")
|
||||
}
|
||||
if re.Flags&FoldCase != 0 {
|
||||
for _, r := range re.Rune {
|
||||
if unicode.SimpleFold(r) != r {
|
||||
b.WriteString("fold")
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
b.WriteByte('{')
|
||||
switch re.Op {
|
||||
case OpEndText:
|
||||
if re.Flags&WasDollar == 0 {
|
||||
b.WriteString(`\z`)
|
||||
}
|
||||
case OpLiteral:
|
||||
for _, r := range re.Rune {
|
||||
b.WriteRune(r)
|
||||
}
|
||||
case OpConcat, OpAlternate:
|
||||
for _, sub := range re.Sub {
|
||||
dumpRegexp(b, sub)
|
||||
}
|
||||
case OpStar, OpPlus, OpQuest:
|
||||
dumpRegexp(b, re.Sub[0])
|
||||
case OpRepeat:
|
||||
fmt.Fprintf(b, "%d,%d ", re.Min, re.Max)
|
||||
dumpRegexp(b, re.Sub[0])
|
||||
case OpCapture:
|
||||
if re.Name != "" {
|
||||
b.WriteString(re.Name)
|
||||
b.WriteByte(':')
|
||||
}
|
||||
dumpRegexp(b, re.Sub[0])
|
||||
case OpCharClass:
|
||||
sep := ""
|
||||
for i := 0; i < len(re.Rune); i += 2 {
|
||||
b.WriteString(sep)
|
||||
sep = " "
|
||||
lo, hi := re.Rune[i], re.Rune[i+1]
|
||||
if lo == hi {
|
||||
fmt.Fprintf(b, "%#x", lo)
|
||||
} else {
|
||||
fmt.Fprintf(b, "%#x-%#x", lo, hi)
|
||||
}
|
||||
}
|
||||
}
|
||||
b.WriteByte('}')
|
||||
}
|
||||
|
||||
func mkCharClass(f func(rune) bool) string {
|
||||
re := &Regexp{Op: OpCharClass}
|
||||
lo := rune(-1)
|
||||
for i := rune(0); i <= unicode.MaxRune; i++ {
|
||||
if f(i) {
|
||||
if lo < 0 {
|
||||
lo = i
|
||||
}
|
||||
} else {
|
||||
if lo >= 0 {
|
||||
re.Rune = append(re.Rune, lo, i-1)
|
||||
lo = -1
|
||||
}
|
||||
}
|
||||
}
|
||||
if lo >= 0 {
|
||||
re.Rune = append(re.Rune, lo, unicode.MaxRune)
|
||||
}
|
||||
return dump(re)
|
||||
}
|
||||
|
||||
// isUpperFold reports whether r, or any rune reachable from r by repeated
// unicode.SimpleFold, is an upper-case letter.
func isUpperFold(r rune) bool {
	if unicode.IsUpper(r) {
		return true
	}
	// Walk the rest of r's fold orbit; SimpleFold eventually cycles back to r.
	for f := unicode.SimpleFold(r); f != r; f = unicode.SimpleFold(f) {
		if unicode.IsUpper(f) {
			return true
		}
	}
	return false
}
|
||||
|
||||
func TestFoldConstants(t *testing.T) {
|
||||
last := rune(-1)
|
||||
for i := rune(0); i <= unicode.MaxRune; i++ {
|
||||
if unicode.SimpleFold(i) == i {
|
||||
continue
|
||||
}
|
||||
if last == -1 && minFold != i {
|
||||
t.Errorf("minFold=%#U should be %#U", minFold, i)
|
||||
}
|
||||
last = i
|
||||
}
|
||||
if maxFold != last {
|
||||
t.Errorf("maxFold=%#U should be %#U", maxFold, last)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAppendRangeCollapse(t *testing.T) {
|
||||
// AppendRange should collapse each of the new ranges
|
||||
// into the earlier ones (it looks back two ranges), so that
|
||||
// the slice never grows very large.
|
||||
// Note that we are not calling cleanClass.
|
||||
var r []rune
|
||||
for i := rune('A'); i <= 'Z'; i++ {
|
||||
r = appendRange(r, i, i)
|
||||
r = appendRange(r, i+'a'-'A', i+'a'-'A')
|
||||
}
|
||||
if string(r) != "AZaz" {
|
||||
t.Errorf("appendRange interlaced A-Z a-z = %s, want AZaz", string(r))
|
||||
}
|
||||
}
|
||||
|
||||
// invalidRegexps lists patterns that Parse must reject under both the Perl
// and the POSIX flag sets.
var invalidRegexps = []string{
	// Unbalanced parentheses and brackets.
	`(`,
	`)`,
	`(a`,
	`a)`,
	`(a))`,
	`(a|b|`,
	`a|b|)`,
	`(a|b|))`,
	`(a|b`,
	`a|b)`,
	`(a|b))`,
	`[a-z`,
	`([a-z)`,
	`[a-z)`,
	`([a-z]))`,

	// Bad repetition counts.
	`x{1001}`,
	`x{9876543210}`,
	`x{2,1}`,
	`x{1,9876543210}`,

	"\xff", // Invalid UTF-8
	"[\xff]",
	"[\\\xff]",
	"\\\xff",

	// Malformed named groups.
	`(?P<name>a`,
	`(?P<name>`,
	`(?P<name`,
	`(?P<x y>a)`,
	`(?P<>a)`,
	`(?<name>a`,
	`(?<name>`,
	`(?<name`,
	`(?<x y>a)`,
	`(?<>a)`,

	`[a-Z]`,
	`(?i)[a-Z]`,
	`\Q\E*`,

	`a{100000}`,  // too much repetition
	`a{100000,}`, // too much repetition
	"((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}){2})",    // too much repetition
	strings.Repeat("(", 1000) + strings.Repeat(")", 1000),    // too deep
	strings.Repeat("(?:", 1000) + strings.Repeat(")*", 1000), // too deep
	"(" + strings.Repeat("(xx?)", 1000) + "){1000}",          // too long
	strings.Repeat("(xx?){1000}", 1000),                      // too long
	strings.Repeat(`\pL`, 27000),                             // too many runes
}
|
||||
|
||||
var (
	// onlyPerl lists patterns that must parse with the Perl flags and be
	// rejected with the POSIX flags.
	onlyPerl = []string{
		`[a-b-c]`,
		`\Qabc\E`,
		`\Q*+?{[\E`,
		`\Q\\E`,
		`\Q\\\E`,
		`\Q\\\\E`,
		`\Q\\\\\E`,
		`(?:a)`,
		`(?P<name>a)`,
	}

	// onlyPOSIX lists patterns that must parse with the POSIX flags and be
	// rejected with the Perl flags.
	onlyPOSIX = []string{
		"a++",
		"a**",
		"a?*",
		"a+*",
		"a{1}*",
		".{1}{2}.{3}",
	}
)
|
||||
|
||||
func TestParseInvalidRegexps(t *testing.T) {
|
||||
for _, regexp := range invalidRegexps {
|
||||
if re, err := Parse(regexp, Perl); err == nil {
|
||||
t.Errorf("Parse(%#q, Perl) = %s, should have failed", regexp, dump(re))
|
||||
}
|
||||
if re, err := Parse(regexp, POSIX); err == nil {
|
||||
t.Errorf("Parse(%#q, POSIX) = %s, should have failed", regexp, dump(re))
|
||||
}
|
||||
}
|
||||
for _, regexp := range onlyPerl {
|
||||
if _, err := Parse(regexp, Perl); err != nil {
|
||||
t.Errorf("Parse(%#q, Perl): %v", regexp, err)
|
||||
}
|
||||
if re, err := Parse(regexp, POSIX); err == nil {
|
||||
t.Errorf("Parse(%#q, POSIX) = %s, should have failed", regexp, dump(re))
|
||||
}
|
||||
}
|
||||
for _, regexp := range onlyPOSIX {
|
||||
if re, err := Parse(regexp, Perl); err == nil {
|
||||
t.Errorf("Parse(%#q, Perl) = %s, should have failed", regexp, dump(re))
|
||||
}
|
||||
if _, err := Parse(regexp, POSIX); err != nil {
|
||||
t.Errorf("Parse(%#q, POSIX): %v", regexp, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestToStringEquivalentParse(t *testing.T) {
|
||||
for _, tt := range parseTests {
|
||||
re, err := Parse(tt.Regexp, testFlags)
|
||||
if err != nil {
|
||||
t.Errorf("Parse(%#q): %v", tt.Regexp, err)
|
||||
continue
|
||||
}
|
||||
if tt.Dump == "" {
|
||||
// It parsed. That's all we care about.
|
||||
continue
|
||||
}
|
||||
d := dump(re)
|
||||
if d != tt.Dump {
|
||||
t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump)
|
||||
continue
|
||||
}
|
||||
|
||||
s := re.String()
|
||||
if s != tt.Regexp {
|
||||
// If ToString didn't return the original regexp,
|
||||
// it must have found one with fewer parens.
|
||||
// Unfortunately we can't check the length here, because
|
||||
// ToString produces "\\{" for a literal brace,
|
||||
// but "{" is a shorter equivalent in some contexts.
|
||||
nre, err := Parse(s, testFlags)
|
||||
if err != nil {
|
||||
t.Errorf("Parse(%#q.String() = %#q): %v", tt.Regexp, s, err)
|
||||
continue
|
||||
}
|
||||
nd := dump(nre)
|
||||
if d != nd {
|
||||
t.Errorf("Parse(%#q) -> %#q; %#q vs %#q", tt.Regexp, s, d, nd)
|
||||
}
|
||||
|
||||
ns := nre.String()
|
||||
if s != ns {
|
||||
t.Errorf("Parse(%#q) -> %#q -> %#q", tt.Regexp, s, ns)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// stringTests pairs a pattern (re) with the text its parsed form is expected
// to print as (out) when parsed with the Perl flags.
var stringTests = []struct {
	re, out string
}{
	// Case-insensitive groups next to fold-sensitive (x) or
	// fold-insensitive (0, 1) literals.
	{re: `x(?i:ab*c|d?e)1`, out: `x(?i:AB*C|D?E)1`},
	{re: `x(?i:ab*cd?e)1`, out: `x(?i:AB*CD?E)1`},
	{re: `0(?i:ab*c|d?e)1`, out: `(?i:0(?:AB*C|D?E)1)`},
	{re: `0(?i:ab*cd?e)1`, out: `(?i:0AB*CD?E1)`},
	{re: `x(?i:ab*c|d?e)`, out: `x(?i:AB*C|D?E)`},
	{re: `x(?i:ab*cd?e)`, out: `x(?i:AB*CD?E)`},
	{re: `0(?i:ab*c|d?e)`, out: `(?i:0(?:AB*C|D?E))`},
	{re: `0(?i:ab*cd?e)`, out: `(?i:0AB*CD?E)`},
	{re: `(?i:ab*c|d?e)1`, out: `(?i:(?:AB*C|D?E)1)`},
	{re: `(?i:ab*cd?e)1`, out: `(?i:AB*CD?E1)`},
	{re: `(?i:ab)[123](?i:cd)`, out: `(?i:AB[1-3]CD)`},
	{re: `(?i:ab*c|d?e)`, out: `(?i:AB*C|D?E)`},

	// Two-case character classes.
	{re: `[Aa][Bb]`, out: `(?i:AB)`},
	{re: `[Aa][Bb]*[Cc]`, out: `(?i:AB*C)`},
	{re: `A(?:[Bb][Cc]|[Dd])[Zz]`, out: `A(?i:(?:BC|D)Z)`},
	{re: `[Aa](?:[Bb][Cc]|[Dd])Z`, out: `(?i:A(?:BC|D))Z`},
}
|
||||
|
||||
func TestString(t *testing.T) {
|
||||
for _, tt := range stringTests {
|
||||
re, err := Parse(tt.re, Perl)
|
||||
if err != nil {
|
||||
t.Errorf("Parse(%#q): %v", tt.re, err)
|
||||
continue
|
||||
}
|
||||
out := re.String()
|
||||
if out != tt.out {
|
||||
t.Errorf("Parse(%#q).String() = %#q, want %#q", tt.re, out, tt.out)
|
||||
}
|
||||
}
|
||||
}
|
133
lib/regexutil/syntax/perl_groups.go
Normal file
133
lib/regexutil/syntax/perl_groups.go
Normal file
|
@ -0,0 +1,133 @@
|
|||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Code generated by make_perl_groups.pl; DO NOT EDIT.
|
||||
|
||||
package syntax
|
||||
|
||||
// Rune range tables for the Perl character classes, stored as lo, hi pairs.
var (
	code1 = []rune{ /* \d */
		0x30, 0x39,
	}

	code2 = []rune{ /* \s */
		0x9, 0xa,
		0xc, 0xd,
		0x20, 0x20,
	}

	code3 = []rune{ /* \w */
		0x30, 0x39,
		0x41, 0x5a,
		0x5f, 0x5f,
		0x61, 0x7a,
	}
)
|
||||
|
||||
var perlGroup = map[string]charGroup{
|
||||
`\d`: {+1, code1},
|
||||
`\D`: {-1, code1},
|
||||
`\s`: {+1, code2},
|
||||
`\S`: {-1, code2},
|
||||
`\w`: {+1, code3},
|
||||
`\W`: {-1, code3},
|
||||
}
|
||||
// Rune range tables for the POSIX bracket classes, stored as lo, hi pairs.
var (
	code4 = []rune{ /* [:alnum:] */
		0x30, 0x39,
		0x41, 0x5a,
		0x61, 0x7a,
	}

	code5 = []rune{ /* [:alpha:] */
		0x41, 0x5a,
		0x61, 0x7a,
	}

	code6 = []rune{ /* [:ascii:] */
		0x0, 0x7f,
	}

	code7 = []rune{ /* [:blank:] */
		0x9, 0x9,
		0x20, 0x20,
	}

	code8 = []rune{ /* [:cntrl:] */
		0x0, 0x1f,
		0x7f, 0x7f,
	}

	code9 = []rune{ /* [:digit:] */
		0x30, 0x39,
	}

	code10 = []rune{ /* [:graph:] */
		0x21, 0x7e,
	}

	code11 = []rune{ /* [:lower:] */
		0x61, 0x7a,
	}

	code12 = []rune{ /* [:print:] */
		0x20, 0x7e,
	}

	code13 = []rune{ /* [:punct:] */
		0x21, 0x2f,
		0x3a, 0x40,
		0x5b, 0x60,
		0x7b, 0x7e,
	}

	code14 = []rune{ /* [:space:] */
		0x9, 0xd,
		0x20, 0x20,
	}

	code15 = []rune{ /* [:upper:] */
		0x41, 0x5a,
	}

	code16 = []rune{ /* [:word:] */
		0x30, 0x39,
		0x41, 0x5a,
		0x5f, 0x5f,
		0x61, 0x7a,
	}

	code17 = []rune{ /* [:xdigit:] */
		0x30, 0x39,
		0x41, 0x46,
		0x61, 0x66,
	}
)
|
||||
|
||||
var posixGroup = map[string]charGroup{
|
||||
`[:alnum:]`: {+1, code4},
|
||||
`[:^alnum:]`: {-1, code4},
|
||||
`[:alpha:]`: {+1, code5},
|
||||
`[:^alpha:]`: {-1, code5},
|
||||
`[:ascii:]`: {+1, code6},
|
||||
`[:^ascii:]`: {-1, code6},
|
||||
`[:blank:]`: {+1, code7},
|
||||
`[:^blank:]`: {-1, code7},
|
||||
`[:cntrl:]`: {+1, code8},
|
||||
`[:^cntrl:]`: {-1, code8},
|
||||
`[:digit:]`: {+1, code9},
|
||||
`[:^digit:]`: {-1, code9},
|
||||
`[:graph:]`: {+1, code10},
|
||||
`[:^graph:]`: {-1, code10},
|
||||
`[:lower:]`: {+1, code11},
|
||||
`[:^lower:]`: {-1, code11},
|
||||
`[:print:]`: {+1, code12},
|
||||
`[:^print:]`: {-1, code12},
|
||||
`[:punct:]`: {+1, code13},
|
||||
`[:^punct:]`: {-1, code13},
|
||||
`[:space:]`: {+1, code14},
|
||||
`[:^space:]`: {-1, code14},
|
||||
`[:upper:]`: {+1, code15},
|
||||
`[:^upper:]`: {-1, code15},
|
||||
`[:word:]`: {+1, code16},
|
||||
`[:^word:]`: {-1, code16},
|
||||
`[:xdigit:]`: {+1, code17},
|
||||
`[:^xdigit:]`: {-1, code17},
|
||||
}
|
349
lib/regexutil/syntax/prog.go
Normal file
349
lib/regexutil/syntax/prog.go
Normal file
|
@ -0,0 +1,349 @@
|
|||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package syntax
|
||||
|
||||
import (
|
||||
"strconv"
|
||||
"strings"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// Compiled program.
|
||||
// May not belong in this package, but convenient for now.
|
||||
|
||||
// A Prog is a compiled regular expression program.
|
||||
type Prog struct {
|
||||
Inst []Inst
|
||||
Start int // index of start instruction
|
||||
NumCap int // number of InstCapture insts in re
|
||||
}
|
||||
|
||||
// An InstOp is an instruction opcode.
type InstOp uint8

// Instruction opcodes, numbered consecutively from zero.
const (
	InstAlt InstOp = iota
	InstAltMatch
	InstCapture
	InstEmptyWidth
	InstMatch
	InstFail
	InstNop
	InstRune
	InstRune1
	InstRuneAny
	InstRuneAnyNotNL
)

// instOpNames holds the printable name of each opcode, indexed by opcode.
var instOpNames = []string{
	InstAlt:          "InstAlt",
	InstAltMatch:     "InstAltMatch",
	InstCapture:      "InstCapture",
	InstEmptyWidth:   "InstEmptyWidth",
	InstMatch:        "InstMatch",
	InstFail:         "InstFail",
	InstNop:          "InstNop",
	InstRune:         "InstRune",
	InstRune1:        "InstRune1",
	InstRuneAny:      "InstRuneAny",
	InstRuneAnyNotNL: "InstRuneAnyNotNL",
}

// String returns the name of the opcode, or "" for a value outside the
// range of defined opcodes.
func (i InstOp) String() string {
	if int(i) < len(instOpNames) {
		return instOpNames[i]
	}
	return ""
}
|
||||
|
||||
// An EmptyOp specifies a kind or mixture of zero-width assertions.
type EmptyOp uint8

// Zero-width assertion bits. Each constant is a distinct bit so that several
// assertions can be combined in one EmptyOp value.
const (
	EmptyBeginLine EmptyOp = 1 << iota
	EmptyEndLine
	EmptyBeginText
	EmptyEndText
	EmptyWordBoundary
	EmptyNoWordBoundary
)
|
||||
|
||||
// EmptyOpContext returns the zero-width assertions
|
||||
// satisfied at the position between the runes r1 and r2.
|
||||
// Passing r1 == -1 indicates that the position is
|
||||
// at the beginning of the text.
|
||||
// Passing r2 == -1 indicates that the position is
|
||||
// at the end of the text.
|
||||
func EmptyOpContext(r1, r2 rune) EmptyOp {
|
||||
var op EmptyOp = EmptyNoWordBoundary
|
||||
var boundary byte
|
||||
switch {
|
||||
case IsWordChar(r1):
|
||||
boundary = 1
|
||||
case r1 == '\n':
|
||||
op |= EmptyBeginLine
|
||||
case r1 < 0:
|
||||
op |= EmptyBeginText | EmptyBeginLine
|
||||
}
|
||||
switch {
|
||||
case IsWordChar(r2):
|
||||
boundary ^= 1
|
||||
case r2 == '\n':
|
||||
op |= EmptyEndLine
|
||||
case r2 < 0:
|
||||
op |= EmptyEndText | EmptyEndLine
|
||||
}
|
||||
if boundary != 0 { // IsWordChar(r1) != IsWordChar(r2)
|
||||
op ^= (EmptyWordBoundary | EmptyNoWordBoundary)
|
||||
}
|
||||
return op
|
||||
}
|
||||
|
||||
// IsWordChar reports whether r is considered a “word character”
// during the evaluation of the \b and \B zero-width assertions.
// These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
func IsWordChar(r rune) bool {
	// Lowercase letters are tested first, as these occur more
	// frequently than uppercase letters in common cases.
	switch {
	case 'a' <= r && r <= 'z',
		'A' <= r && r <= 'Z',
		'0' <= r && r <= '9',
		r == '_':
		return true
	}
	return false
}
|
||||
|
||||
// An Inst is a single instruction in a regular expression program.
|
||||
type Inst struct {
|
||||
Op InstOp
|
||||
Out uint32 // all but InstMatch, InstFail
|
||||
Arg uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth
|
||||
Rune []rune
|
||||
}
|
||||
|
||||
func (p *Prog) String() string {
|
||||
var b strings.Builder
|
||||
dumpProg(&b, p)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// skipNop follows any no-op or capturing instructions.
|
||||
func (p *Prog) skipNop(pc uint32) *Inst {
|
||||
i := &p.Inst[pc]
|
||||
for i.Op == InstNop || i.Op == InstCapture {
|
||||
i = &p.Inst[i.Out]
|
||||
}
|
||||
return i
|
||||
}
|
||||
|
||||
// op returns i.Op but merges all the Rune special cases into InstRune
|
||||
func (i *Inst) op() InstOp {
|
||||
op := i.Op
|
||||
switch op {
|
||||
case InstRune1, InstRuneAny, InstRuneAnyNotNL:
|
||||
op = InstRune
|
||||
}
|
||||
return op
|
||||
}
|
||||
|
||||
// Prefix returns a literal string that all matches for the
|
||||
// regexp must start with. Complete is true if the prefix
|
||||
// is the entire match.
|
||||
func (p *Prog) Prefix() (prefix string, complete bool) {
|
||||
i := p.skipNop(uint32(p.Start))
|
||||
|
||||
// Avoid allocation of buffer if prefix is empty.
|
||||
if i.op() != InstRune || len(i.Rune) != 1 {
|
||||
return "", i.Op == InstMatch
|
||||
}
|
||||
|
||||
// Have prefix; gather characters.
|
||||
var buf strings.Builder
|
||||
for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 && i.Rune[0] != utf8.RuneError {
|
||||
buf.WriteRune(i.Rune[0])
|
||||
i = p.skipNop(i.Out)
|
||||
}
|
||||
return buf.String(), i.Op == InstMatch
|
||||
}
|
||||
|
||||
// StartCond returns the leading empty-width conditions that must
|
||||
// be true in any match. It returns ^EmptyOp(0) if no matches are possible.
|
||||
func (p *Prog) StartCond() EmptyOp {
|
||||
var flag EmptyOp
|
||||
pc := uint32(p.Start)
|
||||
i := &p.Inst[pc]
|
||||
Loop:
|
||||
for {
|
||||
switch i.Op {
|
||||
case InstEmptyWidth:
|
||||
flag |= EmptyOp(i.Arg)
|
||||
case InstFail:
|
||||
return ^EmptyOp(0)
|
||||
case InstCapture, InstNop:
|
||||
// skip
|
||||
default:
|
||||
break Loop
|
||||
}
|
||||
pc = i.Out
|
||||
i = &p.Inst[pc]
|
||||
}
|
||||
return flag
|
||||
}
|
||||
|
||||
const noMatch = -1
|
||||
|
||||
// MatchRune reports whether the instruction matches (and consumes) r.
|
||||
// It should only be called when i.Op == [InstRune].
|
||||
func (i *Inst) MatchRune(r rune) bool {
|
||||
return i.MatchRunePos(r) != noMatch
|
||||
}
|
||||
|
||||
// MatchRunePos checks whether the instruction matches (and consumes) r.
|
||||
// If so, MatchRunePos returns the index of the matching rune pair
|
||||
// (or, when len(i.Rune) == 1, rune singleton).
|
||||
// If not, MatchRunePos returns -1.
|
||||
// MatchRunePos should only be called when i.Op == [InstRune].
|
||||
func (i *Inst) MatchRunePos(r rune) int {
|
||||
rune := i.Rune
|
||||
|
||||
switch len(rune) {
|
||||
case 0:
|
||||
return noMatch
|
||||
|
||||
case 1:
|
||||
// Special case: single-rune slice is from literal string, not char class.
|
||||
r0 := rune[0]
|
||||
if r == r0 {
|
||||
return 0
|
||||
}
|
||||
if Flags(i.Arg)&FoldCase != 0 {
|
||||
for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
|
||||
if r == r1 {
|
||||
return 0
|
||||
}
|
||||
}
|
||||
}
|
||||
return noMatch
|
||||
|
||||
case 2:
|
||||
if r >= rune[0] && r <= rune[1] {
|
||||
return 0
|
||||
}
|
||||
return noMatch
|
||||
|
||||
case 4, 6, 8:
|
||||
// Linear search for a few pairs.
|
||||
// Should handle ASCII well.
|
||||
for j := 0; j < len(rune); j += 2 {
|
||||
if r < rune[j] {
|
||||
return noMatch
|
||||
}
|
||||
if r <= rune[j+1] {
|
||||
return j / 2
|
||||
}
|
||||
}
|
||||
return noMatch
|
||||
}
|
||||
|
||||
// Otherwise binary search.
|
||||
lo := 0
|
||||
hi := len(rune) / 2
|
||||
for lo < hi {
|
||||
m := int(uint(lo+hi) >> 1)
|
||||
if c := rune[2*m]; c <= r {
|
||||
if r <= rune[2*m+1] {
|
||||
return m
|
||||
}
|
||||
lo = m + 1
|
||||
} else {
|
||||
hi = m
|
||||
}
|
||||
}
|
||||
return noMatch
|
||||
}
|
||||
|
||||
// MatchEmptyWidth reports whether the instruction matches
|
||||
// an empty string between the runes before and after.
|
||||
// It should only be called when i.Op == [InstEmptyWidth].
|
||||
func (i *Inst) MatchEmptyWidth(before rune, after rune) bool {
|
||||
switch EmptyOp(i.Arg) {
|
||||
case EmptyBeginLine:
|
||||
return before == '\n' || before == -1
|
||||
case EmptyEndLine:
|
||||
return after == '\n' || after == -1
|
||||
case EmptyBeginText:
|
||||
return before == -1
|
||||
case EmptyEndText:
|
||||
return after == -1
|
||||
case EmptyWordBoundary:
|
||||
return IsWordChar(before) != IsWordChar(after)
|
||||
case EmptyNoWordBoundary:
|
||||
return IsWordChar(before) == IsWordChar(after)
|
||||
}
|
||||
panic("unknown empty width arg")
|
||||
}
|
||||
|
||||
func (i *Inst) String() string {
|
||||
var b strings.Builder
|
||||
dumpInst(&b, i)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// bw appends each of args to b, in order.
func bw(b *strings.Builder, args ...string) {
	for idx := range args {
		b.WriteString(args[idx])
	}
}
|
||||
|
||||
func dumpProg(b *strings.Builder, p *Prog) {
|
||||
for j := range p.Inst {
|
||||
i := &p.Inst[j]
|
||||
pc := strconv.Itoa(j)
|
||||
if len(pc) < 3 {
|
||||
b.WriteString(" "[len(pc):])
|
||||
}
|
||||
if j == p.Start {
|
||||
pc += "*"
|
||||
}
|
||||
bw(b, pc, "\t")
|
||||
dumpInst(b, i)
|
||||
bw(b, "\n")
|
||||
}
|
||||
}
|
||||
|
||||
// u32 formats i as a base-10 string.
func u32(i uint32) string {
	const base = 10
	wide := uint64(i)
	return strconv.FormatUint(wide, base)
}
|
||||
|
||||
func dumpInst(b *strings.Builder, i *Inst) {
|
||||
switch i.Op {
|
||||
case InstAlt:
|
||||
bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg))
|
||||
case InstAltMatch:
|
||||
bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg))
|
||||
case InstCapture:
|
||||
bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out))
|
||||
case InstEmptyWidth:
|
||||
bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out))
|
||||
case InstMatch:
|
||||
bw(b, "match")
|
||||
case InstFail:
|
||||
bw(b, "fail")
|
||||
case InstNop:
|
||||
bw(b, "nop -> ", u32(i.Out))
|
||||
case InstRune:
|
||||
if i.Rune == nil {
|
||||
// shouldn't happen
|
||||
bw(b, "rune <nil>")
|
||||
}
|
||||
bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune)))
|
||||
if Flags(i.Arg)&FoldCase != 0 {
|
||||
bw(b, "/i")
|
||||
}
|
||||
bw(b, " -> ", u32(i.Out))
|
||||
case InstRune1:
|
||||
bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out))
|
||||
case InstRuneAny:
|
||||
bw(b, "any -> ", u32(i.Out))
|
||||
case InstRuneAnyNotNL:
|
||||
bw(b, "anynotnl -> ", u32(i.Out))
|
||||
}
|
||||
}
|
144
lib/regexutil/syntax/prog_test.go
Normal file
144
lib/regexutil/syntax/prog_test.go
Normal file
|
@ -0,0 +1,144 @@
|
|||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package syntax
|
||||
|
||||
import "testing"
|
||||
|
||||
// compileTests pairs a pattern with the listing its compiled program is
// expected to print as.
//
// The listings are spelled with explicit escapes so that their whitespace is
// unambiguous: each line is the instruction index right-aligned in three
// columns, "*" on the start instruction, a tab, the instruction text, and a
// newline.
var compileTests = []struct {
	Regexp string
	Prog   string
}{
	{"a", "" +
		"  0\tfail\n" +
		"  1*\trune1 \"a\" -> 2\n" +
		"  2\tmatch\n"},
	{"[A-M][n-z]", "" +
		"  0\tfail\n" +
		"  1*\trune \"AM\" -> 2\n" +
		"  2\trune \"nz\" -> 3\n" +
		"  3\tmatch\n"},
	{"", "" +
		"  0\tfail\n" +
		"  1*\tnop -> 2\n" +
		"  2\tmatch\n"},
	{"a?", "" +
		"  0\tfail\n" +
		"  1\trune1 \"a\" -> 3\n" +
		"  2*\talt -> 1, 3\n" +
		"  3\tmatch\n"},
	{"a??", "" +
		"  0\tfail\n" +
		"  1\trune1 \"a\" -> 3\n" +
		"  2*\talt -> 3, 1\n" +
		"  3\tmatch\n"},
	{"a+", "" +
		"  0\tfail\n" +
		"  1*\trune1 \"a\" -> 2\n" +
		"  2\talt -> 1, 3\n" +
		"  3\tmatch\n"},
	{"a+?", "" +
		"  0\tfail\n" +
		"  1*\trune1 \"a\" -> 2\n" +
		"  2\talt -> 3, 1\n" +
		"  3\tmatch\n"},
	{"a*", "" +
		"  0\tfail\n" +
		"  1\trune1 \"a\" -> 2\n" +
		"  2*\talt -> 1, 3\n" +
		"  3\tmatch\n"},
	{"a*?", "" +
		"  0\tfail\n" +
		"  1\trune1 \"a\" -> 2\n" +
		"  2*\talt -> 3, 1\n" +
		"  3\tmatch\n"},
	{"a+b+", "" +
		"  0\tfail\n" +
		"  1*\trune1 \"a\" -> 2\n" +
		"  2\talt -> 1, 3\n" +
		"  3\trune1 \"b\" -> 4\n" +
		"  4\talt -> 3, 5\n" +
		"  5\tmatch\n"},
	{"(a+)(b+)", "" +
		"  0\tfail\n" +
		"  1*\tcap 2 -> 2\n" +
		"  2\trune1 \"a\" -> 3\n" +
		"  3\talt -> 2, 4\n" +
		"  4\tcap 3 -> 5\n" +
		"  5\tcap 4 -> 6\n" +
		"  6\trune1 \"b\" -> 7\n" +
		"  7\talt -> 6, 8\n" +
		"  8\tcap 5 -> 9\n" +
		"  9\tmatch\n"},
	{"a+|b+", "" +
		"  0\tfail\n" +
		"  1\trune1 \"a\" -> 2\n" +
		"  2\talt -> 1, 6\n" +
		"  3\trune1 \"b\" -> 4\n" +
		"  4\talt -> 3, 6\n" +
		"  5*\talt -> 1, 3\n" +
		"  6\tmatch\n"},
	{"A[Aa]", "" +
		"  0\tfail\n" +
		"  1*\trune1 \"A\" -> 2\n" +
		"  2\trune \"A\"/i -> 3\n" +
		"  3\tmatch\n"},
	{"(?:(?:^).)", "" +
		"  0\tfail\n" +
		"  1*\tempty 4 -> 2\n" +
		"  2\tanynotnl -> 3\n" +
		"  3\tmatch\n"},
	{"(?:|a)+", "" +
		"  0\tfail\n" +
		"  1\tnop -> 4\n" +
		"  2\trune1 \"a\" -> 4\n" +
		"  3*\talt -> 1, 2\n" +
		"  4\talt -> 3, 5\n" +
		"  5\tmatch\n"},
	{"(?:|a)*", "" +
		"  0\tfail\n" +
		"  1\tnop -> 4\n" +
		"  2\trune1 \"a\" -> 4\n" +
		"  3\talt -> 1, 2\n" +
		"  4\talt -> 3, 6\n" +
		"  5*\talt -> 3, 6\n" +
		"  6\tmatch\n"},
}
|
||||
|
||||
func TestCompile(t *testing.T) {
|
||||
for _, tt := range compileTests {
|
||||
re, _ := Parse(tt.Regexp, Perl)
|
||||
p, _ := Compile(re)
|
||||
s := p.String()
|
||||
if s != tt.Prog {
|
||||
t.Errorf("compiled %#q:\n--- have\n%s---\n--- want\n%s---", tt.Regexp, s, tt.Prog)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkEmptyOpContext(b *testing.B) {
|
||||
for i := 0; i < b.N; i++ {
|
||||
var r1 rune = -1
|
||||
for _, r2 := range "foo, bar, baz\nsome input text.\n" {
|
||||
EmptyOpContext(r1, r2)
|
||||
r1 = r2
|
||||
}
|
||||
EmptyOpContext(r1, -1)
|
||||
}
|
||||
}
|
||||
|
||||
var sink any
|
||||
|
||||
func BenchmarkIsWordChar(b *testing.B) {
|
||||
const chars = "Don't communicate by sharing memory, share memory by communicating."
|
||||
for i := 0; i < b.N; i++ {
|
||||
for _, r := range chars {
|
||||
sink = IsWordChar(r)
|
||||
}
|
||||
}
|
||||
if sink == nil {
|
||||
b.Fatal("Benchmark did not run")
|
||||
}
|
||||
sink = nil
|
||||
}
|
511
lib/regexutil/syntax/regexp.go
Normal file
511
lib/regexutil/syntax/regexp.go
Normal file
|
@ -0,0 +1,511 @@
|
|||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package syntax
|
||||
|
||||
// Note to implementers:
|
||||
// In this package, re is always a *Regexp and r is always a rune.
|
||||
|
||||
import (
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
// A Regexp is a node in a regular expression syntax tree.
|
||||
type Regexp struct {
|
||||
Op Op // operator
|
||||
Flags Flags
|
||||
Sub []*Regexp // subexpressions, if any
|
||||
Sub0 [1]*Regexp // storage for short Sub
|
||||
Rune []rune // matched runes, for OpLiteral, OpCharClass
|
||||
Rune0 [2]rune // storage for short Rune
|
||||
Min, Max int // min, max for OpRepeat
|
||||
Cap int // capturing index, for OpCapture
|
||||
Name string // capturing name, for OpCapture
|
||||
}
|
||||
|
||||
//go:generate stringer -type Op -trimprefix Op

// An Op is a single regular expression operator.
type Op uint8

// Operators are listed in precedence order, tightest binding to weakest.
// Character class operators are listed simplest to most complex
// (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar).

const (
	// OpNoMatch matches no strings.
	OpNoMatch Op = 1 + iota
	// OpEmptyMatch matches empty string.
	OpEmptyMatch
	// OpLiteral matches Runes sequence.
	OpLiteral
	// OpCharClass matches Runes interpreted as range pair list.
	OpCharClass
	// OpAnyCharNotNL matches any character except newline.
	OpAnyCharNotNL
	// OpAnyChar matches any character.
	OpAnyChar
	// OpBeginLine matches empty string at beginning of line.
	OpBeginLine
	// OpEndLine matches empty string at end of line.
	OpEndLine
	// OpBeginText matches empty string at beginning of text.
	OpBeginText
	// OpEndText matches empty string at end of text.
	OpEndText
	// OpWordBoundary matches word boundary `\b`.
	OpWordBoundary
	// OpNoWordBoundary matches word non-boundary `\B`.
	OpNoWordBoundary
	// OpCapture is a capturing subexpression with index Cap, optional name Name.
	OpCapture
	// OpStar matches Sub[0] zero or more times.
	OpStar
	// OpPlus matches Sub[0] one or more times.
	OpPlus
	// OpQuest matches Sub[0] zero or one times.
	OpQuest
	// OpRepeat matches Sub[0] at least Min times, at most Max (Max == -1 is no limit).
	OpRepeat
	// OpConcat matches concatenation of Subs.
	OpConcat
	// OpAlternate matches alternation of Subs.
	OpAlternate
)

const opPseudo Op = 128 // where pseudo-ops start
|
||||
|
||||
// Equal reports whether x and y have identical structure.
|
||||
func (x *Regexp) Equal(y *Regexp) bool {
|
||||
if x == nil || y == nil {
|
||||
return x == y
|
||||
}
|
||||
if x.Op != y.Op {
|
||||
return false
|
||||
}
|
||||
switch x.Op {
|
||||
case OpEndText:
|
||||
// The parse flags remember whether this is \z or \Z.
|
||||
if x.Flags&WasDollar != y.Flags&WasDollar {
|
||||
return false
|
||||
}
|
||||
|
||||
case OpLiteral, OpCharClass:
|
||||
return slices.Equal(x.Rune, y.Rune)
|
||||
|
||||
case OpAlternate, OpConcat:
|
||||
return slices.EqualFunc(x.Sub, y.Sub, (*Regexp).Equal)
|
||||
|
||||
case OpStar, OpPlus, OpQuest:
|
||||
if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) {
|
||||
return false
|
||||
}
|
||||
|
||||
case OpRepeat:
|
||||
if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) {
|
||||
return false
|
||||
}
|
||||
|
||||
case OpCapture:
|
||||
if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// printFlags is a bit set indicating which flags (including non-capturing parens) to print around a regexp.
type printFlags uint8

const (
	// flagI stands for the "(?i:" prefix.
	flagI printFlags = 1 << iota
	// flagM stands for the "(?m:" prefix.
	flagM
	// flagS stands for the "(?s:" prefix.
	flagS
	// flagOff stands for the closing ")".
	flagOff
	// flagPrec stands for plain grouping, "(?:" ... ")".
	flagPrec

	// negShift is the shift to the negated form: flagI<<negShift is "(?-i:".
	negShift = 5
)
|
||||
|
||||
// addSpan enables the flags f around start..last,
|
||||
// by setting flags[start] = f and flags[last] = flagOff.
|
||||
func addSpan(start, last *Regexp, f printFlags, flags *map[*Regexp]printFlags) {
|
||||
if *flags == nil {
|
||||
*flags = make(map[*Regexp]printFlags)
|
||||
}
|
||||
(*flags)[start] = f
|
||||
(*flags)[last] |= flagOff // maybe start==last
|
||||
}
|
||||
|
||||
// calcFlags calculates the flags to print around each subexpression in re,
|
||||
// storing that information in (*flags)[sub] for each affected subexpression.
|
||||
// The first time an entry needs to be written to *flags, calcFlags allocates the map.
|
||||
// calcFlags also calculates the flags that must be active or can't be active
|
||||
// around re and returns those flags.
|
||||
func calcFlags(re *Regexp, flags *map[*Regexp]printFlags) (must, cant printFlags) {
|
||||
switch re.Op {
|
||||
default:
|
||||
return 0, 0
|
||||
|
||||
case OpLiteral:
|
||||
// If literal is fold-sensitive, return (flagI, 0) or (0, flagI)
|
||||
// according to whether (?i) is active.
|
||||
// If literal is not fold-sensitive, return 0, 0.
|
||||
for _, r := range re.Rune {
|
||||
if minFold <= r && r <= maxFold && unicode.SimpleFold(r) != r {
|
||||
if re.Flags&FoldCase != 0 {
|
||||
return flagI, 0
|
||||
} else {
|
||||
return 0, flagI
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0, 0
|
||||
|
||||
case OpCharClass:
|
||||
// If literal is fold-sensitive, return 0, flagI - (?i) has been compiled out.
|
||||
// If literal is not fold-sensitive, return 0, 0.
|
||||
return calcFlagsI(re)
|
||||
|
||||
case OpAnyCharNotNL: // (?-s).
|
||||
return 0, flagS
|
||||
|
||||
case OpAnyChar: // (?s).
|
||||
return flagS, 0
|
||||
|
||||
case OpBeginLine, OpEndLine: // (?m)^ (?m)$
|
||||
return flagM, 0
|
||||
|
||||
case OpEndText:
|
||||
if re.Flags&WasDollar != 0 { // (?-m)$
|
||||
return 0, flagM
|
||||
}
|
||||
return 0, 0
|
||||
|
||||
case OpCapture, OpStar, OpPlus, OpQuest, OpRepeat:
|
||||
return calcFlags(re.Sub[0], flags)
|
||||
|
||||
case OpConcat, OpAlternate:
|
||||
// Gather the must and cant for each subexpression.
|
||||
// When we find a conflicting subexpression, insert the necessary
|
||||
// flags around the previously identified span and start over.
|
||||
var must, cant, allCant printFlags
|
||||
start := 0
|
||||
last := 0
|
||||
did := false
|
||||
for i, sub := range re.Sub {
|
||||
subMust, subCant := calcFlags(sub, flags)
|
||||
if must&subCant != 0 || subMust&cant != 0 {
|
||||
if must != 0 {
|
||||
addSpan(re.Sub[start], re.Sub[last], must, flags)
|
||||
}
|
||||
must = 0
|
||||
cant = 0
|
||||
start = i
|
||||
did = true
|
||||
}
|
||||
must |= subMust
|
||||
cant |= subCant
|
||||
allCant |= subCant
|
||||
if subMust != 0 {
|
||||
last = i
|
||||
}
|
||||
if must == 0 && start == i {
|
||||
start++
|
||||
}
|
||||
}
|
||||
if !did {
|
||||
// No conflicts: pass the accumulated must and cant upward.
|
||||
return must, cant
|
||||
}
|
||||
if must != 0 {
|
||||
// Conflicts found; need to finish final span.
|
||||
addSpan(re.Sub[start], re.Sub[last], must, flags)
|
||||
}
|
||||
return 0, allCant
|
||||
}
|
||||
}
|
||||
|
||||
func calcFlagsI(re *Regexp) (must, cant printFlags) {
|
||||
inside := 0
|
||||
outside := 0
|
||||
pre := rune(minFold)
|
||||
for i := 0; i < len(re.Rune); i += 2 {
|
||||
lo := max(minFold, re.Rune[i])
|
||||
hi := min(maxFold, re.Rune[i+1])
|
||||
|
||||
inside += int(hi - lo)
|
||||
outside += int(hi - pre)
|
||||
pre = max(minFold, hi)
|
||||
}
|
||||
|
||||
outside += int(maxFold - pre)
|
||||
|
||||
if inside > outside {
|
||||
for i := 0; i < len(re.Rune); i += 2 {
|
||||
lo := max(minFold, re.Rune[i])
|
||||
hi := min(maxFold, re.Rune[i+1])
|
||||
for r := lo; r <= hi; r++ {
|
||||
for f := unicode.SimpleFold(r); f != r; f = unicode.SimpleFold(f) {
|
||||
if !(lo <= f && f <= hi) && !inCharClass(f, re.Rune) {
|
||||
return 0, flagI
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0, 0
|
||||
}
|
||||
|
||||
// Check characters outside the defined range
|
||||
for i := 0; i < len(re.Rune); i += 2 {
|
||||
lo := max(minFold, re.Rune[i])
|
||||
hi := min(maxFold, re.Rune[i+1])
|
||||
// Check characters between `pre` and `lo` (outside the defined range)
|
||||
for r := pre; r < lo; r++ {
|
||||
for f := unicode.SimpleFold(r); f != r; f = unicode.SimpleFold(f) {
|
||||
if inCharClass(f, re.Rune) {
|
||||
return 0, flagI
|
||||
}
|
||||
}
|
||||
}
|
||||
pre = max(minFold, hi)
|
||||
}
|
||||
|
||||
// Check characters between `pre` and `maxFold`
|
||||
for r := pre; r <= maxFold; r++ {
|
||||
for f := unicode.SimpleFold(r); f != r; f = unicode.SimpleFold(f) {
|
||||
if inCharClass(f, re.Rune) {
|
||||
return 0, flagI
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0, 0
|
||||
}
|
||||
|
||||
// writeRegexp writes the Perl syntax for the regular expression re to b.
|
||||
func writeRegexp(b *strings.Builder, re *Regexp, f printFlags, flags map[*Regexp]printFlags) {
|
||||
f |= flags[re]
|
||||
if f&flagPrec != 0 && f&^(flagOff|flagPrec) != 0 && f&flagOff != 0 {
|
||||
// flagPrec is redundant with other flags being added and terminated
|
||||
f &^= flagPrec
|
||||
}
|
||||
if f&^(flagOff|flagPrec) != 0 {
|
||||
b.WriteString(`(?`)
|
||||
if f&flagI != 0 {
|
||||
b.WriteString(`i`)
|
||||
}
|
||||
if f&flagM != 0 {
|
||||
b.WriteString(`m`)
|
||||
}
|
||||
if f&flagS != 0 {
|
||||
b.WriteString(`s`)
|
||||
}
|
||||
if f&((flagM|flagS)<<negShift) != 0 {
|
||||
b.WriteString(`-`)
|
||||
if f&(flagM<<negShift) != 0 {
|
||||
b.WriteString(`m`)
|
||||
}
|
||||
if f&(flagS<<negShift) != 0 {
|
||||
b.WriteString(`s`)
|
||||
}
|
||||
}
|
||||
b.WriteString(`:`)
|
||||
}
|
||||
if f&flagOff != 0 {
|
||||
defer b.WriteString(`)`)
|
||||
}
|
||||
if f&flagPrec != 0 {
|
||||
b.WriteString(`(?:`)
|
||||
defer b.WriteString(`)`)
|
||||
}
|
||||
|
||||
switch re.Op {
|
||||
default:
|
||||
b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
|
||||
case OpNoMatch:
|
||||
b.WriteString(`[^\x00-\x{10FFFF}]`)
|
||||
case OpEmptyMatch:
|
||||
b.WriteString(`(?:)`)
|
||||
case OpLiteral:
|
||||
for _, r := range re.Rune {
|
||||
escape(b, r, false)
|
||||
}
|
||||
case OpCharClass:
|
||||
if len(re.Rune)%2 != 0 {
|
||||
b.WriteString(`[invalid char class]`)
|
||||
break
|
||||
}
|
||||
b.WriteRune('[')
|
||||
if len(re.Rune) == 0 {
|
||||
b.WriteString(`^\x00-\x{10FFFF}`)
|
||||
} else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune && len(re.Rune) > 2 {
|
||||
// Contains 0 and MaxRune. Probably a negated class.
|
||||
// Print the gaps.
|
||||
b.WriteRune('^')
|
||||
for i := 1; i < len(re.Rune)-1; i += 2 {
|
||||
lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
|
||||
escape(b, lo, lo == '-')
|
||||
if lo != hi {
|
||||
if hi != lo+1 {
|
||||
b.WriteRune('-')
|
||||
}
|
||||
escape(b, hi, hi == '-')
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for i := 0; i < len(re.Rune); i += 2 {
|
||||
lo, hi := re.Rune[i], re.Rune[i+1]
|
||||
escape(b, lo, lo == '-')
|
||||
if lo != hi {
|
||||
if hi != lo+1 {
|
||||
b.WriteRune('-')
|
||||
}
|
||||
escape(b, hi, hi == '-')
|
||||
}
|
||||
}
|
||||
}
|
||||
b.WriteRune(']')
|
||||
case OpAnyCharNotNL, OpAnyChar:
|
||||
b.WriteString(`.`)
|
||||
case OpBeginLine:
|
||||
b.WriteString(`^`)
|
||||
case OpEndLine:
|
||||
b.WriteString(`$`)
|
||||
case OpBeginText:
|
||||
b.WriteString(`\A`)
|
||||
case OpEndText:
|
||||
if re.Flags&WasDollar != 0 {
|
||||
b.WriteString(`$`)
|
||||
} else {
|
||||
b.WriteString(`\z`)
|
||||
}
|
||||
case OpWordBoundary:
|
||||
b.WriteString(`\b`)
|
||||
case OpNoWordBoundary:
|
||||
b.WriteString(`\B`)
|
||||
case OpCapture:
|
||||
if re.Name != "" {
|
||||
b.WriteString(`(?P<`)
|
||||
b.WriteString(re.Name)
|
||||
b.WriteRune('>')
|
||||
} else {
|
||||
b.WriteRune('(')
|
||||
}
|
||||
if re.Sub[0].Op != OpEmptyMatch {
|
||||
writeRegexp(b, re.Sub[0], flags[re.Sub[0]], flags)
|
||||
}
|
||||
b.WriteRune(')')
|
||||
case OpStar, OpPlus, OpQuest, OpRepeat:
|
||||
p := printFlags(0)
|
||||
sub := re.Sub[0]
|
||||
if sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
|
||||
p = flagPrec
|
||||
}
|
||||
writeRegexp(b, sub, p, flags)
|
||||
|
||||
switch re.Op {
|
||||
case OpStar:
|
||||
b.WriteRune('*')
|
||||
case OpPlus:
|
||||
b.WriteRune('+')
|
||||
case OpQuest:
|
||||
b.WriteRune('?')
|
||||
case OpRepeat:
|
||||
b.WriteRune('{')
|
||||
b.WriteString(strconv.Itoa(re.Min))
|
||||
if re.Max != re.Min {
|
||||
b.WriteRune(',')
|
||||
if re.Max >= 0 {
|
||||
b.WriteString(strconv.Itoa(re.Max))
|
||||
}
|
||||
}
|
||||
b.WriteRune('}')
|
||||
}
|
||||
if re.Flags&NonGreedy != 0 {
|
||||
b.WriteRune('?')
|
||||
}
|
||||
case OpConcat:
|
||||
for _, sub := range re.Sub {
|
||||
p := printFlags(0)
|
||||
if sub.Op == OpAlternate {
|
||||
p = flagPrec
|
||||
}
|
||||
writeRegexp(b, sub, p, flags)
|
||||
}
|
||||
case OpAlternate:
|
||||
for i, sub := range re.Sub {
|
||||
if i > 0 {
|
||||
b.WriteRune('|')
|
||||
}
|
||||
writeRegexp(b, sub, 0, flags)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (re *Regexp) String() string {
|
||||
var b strings.Builder
|
||||
var flags map[*Regexp]printFlags
|
||||
must, cant := calcFlags(re, &flags)
|
||||
must |= (cant &^ flagI) << negShift
|
||||
if must != 0 {
|
||||
must |= flagOff
|
||||
}
|
||||
writeRegexp(&b, re, must, flags)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// meta lists the characters that carry special meaning in a regular
// expression and therefore need a backslash when meant literally.
const meta = `\.+*?()|[]{}^$`

// escape appends r to b in a form that is safe inside a regular expression.
//
// Printable runes are written as-is, preceded by a backslash when they are
// listed in meta or when force is set. Non-printable runes are written as
// one of the named escapes \a \f \n \r \t \v, as \xNN for values below 0x100,
// or as \x{N...} otherwise.
func escape(b *strings.Builder, r rune, force bool) {
	switch {
	case unicode.IsPrint(r):
		if force || strings.ContainsRune(meta, r) {
			b.WriteByte('\\')
		}
		b.WriteRune(r)
	case r == '\a':
		b.WriteString(`\a`)
	case r == '\f':
		b.WriteString(`\f`)
	case r == '\n':
		b.WriteString(`\n`)
	case r == '\r':
		b.WriteString(`\r`)
	case r == '\t':
		b.WriteString(`\t`)
	case r == '\v':
		b.WriteString(`\v`)
	case r < 0x100:
		// Two hex digits, zero-padded.
		hex := strconv.FormatInt(int64(r), 16)
		b.WriteString(`\x`)
		if len(hex) == 1 {
			b.WriteByte('0')
		}
		b.WriteString(hex)
	default:
		b.WriteString(`\x{`)
		b.WriteString(strconv.FormatInt(int64(r), 16))
		b.WriteByte('}')
	}
}
|
||||
|
||||
// MaxCap walks the regexp to find the maximum capture index.
|
||||
func (re *Regexp) MaxCap() int {
|
||||
m := 0
|
||||
if re.Op == OpCapture {
|
||||
m = re.Cap
|
||||
}
|
||||
for _, sub := range re.Sub {
|
||||
if n := sub.MaxCap(); m < n {
|
||||
m = n
|
||||
}
|
||||
}
|
||||
return m
|
||||
}
|
||||
|
||||
// CapNames walks the regexp to find the names of capturing groups.
|
||||
func (re *Regexp) CapNames() []string {
|
||||
names := make([]string, re.MaxCap()+1)
|
||||
re.capNames(names)
|
||||
return names
|
||||
}
|
||||
|
||||
func (re *Regexp) capNames(names []string) {
|
||||
if re.Op == OpCapture {
|
||||
names[re.Cap] = re.Name
|
||||
}
|
||||
for _, sub := range re.Sub {
|
||||
sub.capNames(names)
|
||||
}
|
||||
}
|
151
lib/regexutil/syntax/simplify.go
Normal file
151
lib/regexutil/syntax/simplify.go
Normal file
|
@ -0,0 +1,151 @@
|
|||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package syntax
|
||||
|
||||
// Simplify returns a regexp equivalent to re but without counted repetitions
|
||||
// and with various other simplifications, such as rewriting /(?:a+)+/ to /a+/.
|
||||
// The resulting regexp will execute correctly but its string representation
|
||||
// will not produce the same parse tree, because capturing parentheses
|
||||
// may have been duplicated or removed. For example, the simplified form
|
||||
// for /(x){1,2}/ is /(x)(x)?/ but both parentheses capture as $1.
|
||||
// The returned regexp may share structure with or be the original.
|
||||
func (re *Regexp) Simplify() *Regexp {
|
||||
if re == nil {
|
||||
return nil
|
||||
}
|
||||
switch re.Op {
|
||||
case OpCapture, OpConcat, OpAlternate:
|
||||
// Simplify children, building new Regexp if children change.
|
||||
nre := re
|
||||
for i, sub := range re.Sub {
|
||||
nsub := sub.Simplify()
|
||||
if nre == re && nsub != sub {
|
||||
// Start a copy.
|
||||
nre = new(Regexp)
|
||||
*nre = *re
|
||||
nre.Rune = nil
|
||||
nre.Sub = append(nre.Sub0[:0], re.Sub[:i]...)
|
||||
}
|
||||
if nre != re {
|
||||
nre.Sub = append(nre.Sub, nsub)
|
||||
}
|
||||
}
|
||||
return nre
|
||||
|
||||
case OpStar, OpPlus, OpQuest:
|
||||
sub := re.Sub[0].Simplify()
|
||||
return simplify1(re.Op, re.Flags, sub, re)
|
||||
|
||||
case OpRepeat:
|
||||
// Special special case: x{0} matches the empty string
|
||||
// and doesn't even need to consider x.
|
||||
if re.Min == 0 && re.Max == 0 {
|
||||
return &Regexp{Op: OpEmptyMatch}
|
||||
}
|
||||
|
||||
// The fun begins.
|
||||
sub := re.Sub[0].Simplify()
|
||||
|
||||
// x{n,} means at least n matches of x.
|
||||
if re.Max == -1 {
|
||||
// Special case: x{0,} is x*.
|
||||
if re.Min == 0 {
|
||||
return simplify1(OpStar, re.Flags, sub, nil)
|
||||
}
|
||||
|
||||
// Special case: x{1,} is x+.
|
||||
if re.Min == 1 {
|
||||
return simplify1(OpPlus, re.Flags, sub, nil)
|
||||
}
|
||||
|
||||
// General case: x{4,} is xxxx+.
|
||||
nre := &Regexp{Op: OpConcat}
|
||||
nre.Sub = nre.Sub0[:0]
|
||||
for i := 0; i < re.Min-1; i++ {
|
||||
nre.Sub = append(nre.Sub, sub)
|
||||
}
|
||||
nre.Sub = append(nre.Sub, simplify1(OpPlus, re.Flags, sub, nil))
|
||||
return nre
|
||||
}
|
||||
|
||||
// Special case x{0} handled above.
|
||||
|
||||
// Special case: x{1} is just x.
|
||||
if re.Min == 1 && re.Max == 1 {
|
||||
return sub
|
||||
}
|
||||
|
||||
// General case: x{n,m} means n copies of x and m copies of x?
|
||||
// The machine will do less work if we nest the final m copies,
|
||||
// so that x{2,5} = xx(x(x(x)?)?)?
|
||||
|
||||
// Build leading prefix: xx.
|
||||
var prefix *Regexp
|
||||
if re.Min > 0 {
|
||||
prefix = &Regexp{Op: OpConcat}
|
||||
prefix.Sub = prefix.Sub0[:0]
|
||||
for i := 0; i < re.Min; i++ {
|
||||
prefix.Sub = append(prefix.Sub, sub)
|
||||
}
|
||||
}
|
||||
|
||||
// Build and attach suffix: (x(x(x)?)?)?
|
||||
if re.Max > re.Min {
|
||||
suffix := simplify1(OpQuest, re.Flags, sub, nil)
|
||||
for i := re.Min + 1; i < re.Max; i++ {
|
||||
nre2 := &Regexp{Op: OpConcat}
|
||||
nre2.Sub = append(nre2.Sub0[:0], sub, suffix)
|
||||
suffix = simplify1(OpQuest, re.Flags, nre2, nil)
|
||||
}
|
||||
if prefix == nil {
|
||||
return suffix
|
||||
}
|
||||
prefix.Sub = append(prefix.Sub, suffix)
|
||||
}
|
||||
if prefix != nil {
|
||||
return prefix
|
||||
}
|
||||
|
||||
// Some degenerate case like min > max or min < max < 0.
|
||||
// Handle as impossible match.
|
||||
return &Regexp{Op: OpNoMatch}
|
||||
}
|
||||
|
||||
return re
|
||||
}
|
||||
|
||||
// simplify1 implements Simplify for the unary OpStar,
|
||||
// OpPlus, and OpQuest operators. It returns the simple regexp
|
||||
// equivalent to
|
||||
//
|
||||
// Regexp{Op: op, Flags: flags, Sub: {sub}}
|
||||
//
|
||||
// under the assumption that sub is already simple, and
|
||||
// without first allocating that structure. If the regexp
|
||||
// to be returned turns out to be equivalent to re, simplify1
|
||||
// returns re instead.
|
||||
//
|
||||
// simplify1 is factored out of Simplify because the implementation
|
||||
// for other operators generates these unary expressions.
|
||||
// Letting them call simplify1 makes sure the expressions they
|
||||
// generate are simple.
|
||||
func simplify1(op Op, flags Flags, sub, re *Regexp) *Regexp {
|
||||
// Special case: repeat the empty string as much as
|
||||
// you want, but it's still the empty string.
|
||||
if sub.Op == OpEmptyMatch {
|
||||
return sub
|
||||
}
|
||||
// The operators are idempotent if the flags match.
|
||||
if op == sub.Op && flags&NonGreedy == sub.Flags&NonGreedy {
|
||||
return sub
|
||||
}
|
||||
if re != nil && re.Op == op && re.Flags&NonGreedy == flags&NonGreedy && sub == re.Sub[0] {
|
||||
return re
|
||||
}
|
||||
|
||||
re = &Regexp{Op: op, Flags: flags}
|
||||
re.Sub = append(re.Sub0[:0], sub)
|
||||
return re
|
||||
}
|
164
lib/regexutil/syntax/simplify_test.go
Normal file
164
lib/regexutil/syntax/simplify_test.go
Normal file
|
@ -0,0 +1,164 @@
|
|||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package syntax
|
||||
|
||||
import "testing"
|
||||
|
||||
// simplifyTests is the table driving TestSimplify. Each entry pairs an input
// pattern (Regexp) with the text expected from parsing it, simplifying the
// result and printing it again (Simple).
var simplifyTests = []struct {
	Regexp string
	Simple string
}{
	// Already-simple constructs
	{`a`, `a`},
	{`ab`, `ab`},
	{`a|b`, `[ab]`},
	{`ab|cd`, `ab|cd`},
	{`(ab)*`, `(ab)*`},
	{`(ab)+`, `(ab)+`},
	{`(ab)?`, `(ab)?`},
	{`.`, `(?s:.)`},
	{`^`, `(?m:^)`},
	{`$`, `(?m:$)`},
	{`[ac]`, `[ac]`},
	{`[^ac]`, `[^ac]`},

	// Posix character classes
	{`[[:alnum:]]`, `[0-9A-Za-z]`},
	{`[[:alpha:]]`, `[A-Za-z]`},
	{`[[:blank:]]`, `[\t ]`},
	{`[[:cntrl:]]`, `[\x00-\x1f\x7f]`},
	{`[[:digit:]]`, `[0-9]`},
	{`[[:graph:]]`, `[!-~]`},
	{`[[:lower:]]`, `[a-z]`},
	{`[[:print:]]`, `[ -~]`},
	{`[[:punct:]]`, "[!-/:-@\\[-`\\{-~]"},
	{`[[:space:]]`, `[\t-\r ]`},
	{`[[:upper:]]`, `[A-Z]`},
	{`[[:xdigit:]]`, `[0-9A-Fa-f]`},

	// Perl character classes
	{`\d`, `[0-9]`},
	{`\s`, `[\t\n\f\r ]`},
	{`\w`, `[0-9A-Z_a-z]`},
	{`\D`, `[^0-9]`},
	{`\S`, `[^\t\n\f\r ]`},
	{`\W`, `[^0-9A-Z_a-z]`},
	{`[\d]`, `[0-9]`},
	{`[\s]`, `[\t\n\f\r ]`},
	{`[\w]`, `[0-9A-Z_a-z]`},
	{`[\D]`, `[^0-9]`},
	{`[\S]`, `[^\t\n\f\r ]`},
	{`[\W]`, `[^0-9A-Z_a-z]`},

	// Posix repetitions
	{`a{1}`, `a`},
	{`a{2}`, `aa`},
	{`a{5}`, `aaaaa`},
	{`a{0,1}`, `a?`},
	// The bounded repetitions below are hard to read because Simplify
	// inserts (?:) parens instead of () parens to avoid creating extra
	// captured subexpressions. The trailing comments show a version with
	// fewer parens.
	{`(a){0,2}`, `(?:(a)(a)?)?`},                       // (aa?)?
	{`(a){0,4}`, `(?:(a)(?:(a)(?:(a)(a)?)?)?)?`},       // (a(a(aa?)?)?)?
	{`(a){2,6}`, `(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // aa(a(a(aa?)?)?)?
	{`a{0,2}`, `(?:aa?)?`},                             // (aa?)?
	{`a{0,4}`, `(?:a(?:a(?:aa?)?)?)?`},                 // (a(a(aa?)?)?)?
	{`a{2,6}`, `aa(?:a(?:a(?:aa?)?)?)?`},               // aa(a(a(aa?)?)?)?
	{`a{0,}`, `a*`},
	{`a{1,}`, `a+`},
	{`a{2,}`, `aa+`},
	{`a{5,}`, `aaaaa+`},

	// Test that operators simplify their arguments.
	{`(?:a{1,}){1,}`, `a+`},
	{`(a{1,}b{1,})`, `(a+b+)`},
	{`a{1,}|b{1,}`, `a+|b+`},
	{`(?:a{1,})*`, `(?:a+)*`},
	{`(?:a{1,})+`, `a+`},
	{`(?:a{1,})?`, `(?:a+)?`},
	{``, `(?:)`},
	{`a{0}`, `(?:)`},

	// Character class simplification
	{`[ab]`, `[ab]`},
	{`[abc]`, `[a-c]`},
	{`[a-za-za-z]`, `[a-z]`},
	{`[A-Za-zA-Za-z]`, `[A-Za-z]`},
	{`[ABCDEFGH]`, `[A-H]`},
	{`[AB-CD-EF-GH]`, `[A-H]`},
	{`[W-ZP-XE-R]`, `[E-Z]`},
	{`[a-ee-gg-m]`, `[a-m]`},
	{`[a-ea-ha-m]`, `[a-m]`},
	{`[a-ma-ha-e]`, `[a-m]`},
	{`[a-zA-Z0-9 -~]`, `[ -~]`},

	// Empty character classes
	{`[^[:cntrl:][:^cntrl:]]`, `[^\x00-\x{10FFFF}]`},

	// Full character classes
	{`[[:cntrl:][:^cntrl:]]`, `(?s:.)`},

	// Unicode case folding.
	{`(?i)A`, `(?i:A)`},
	{`(?i)a`, `(?i:A)`},
	{`(?i)[A]`, `(?i:A)`},
	{`(?i)[a]`, `(?i:A)`},
	{`(?i)K`, `(?i:K)`},
	{`(?i)k`, `(?i:K)`},
	{`(?i)\x{212a}`, "(?i:K)"},
	{`(?i)[K]`, "[Kk\u212A]"},
	{`(?i)[k]`, "[Kk\u212A]"},
	{`(?i)[\x{212a}]`, "[Kk\u212A]"},
	{`(?i)[a-z]`, "[A-Za-z\u017F\u212A]"},
	{`(?i)[\x00-\x{FFFD}]`, "[\\x00-\uFFFD]"},
	{`(?i)[\x00-\x{10FFFF}]`, `(?s:.)`},

	// Empty string as a regular expression.
	// The empty string must be preserved inside parens in order
	// to make submatches work right, so these tests are less
	// interesting than they might otherwise be. String inserts
	// explicit (?:) in place of non-parenthesized empty strings,
	// to make them easier to spot for other parsers.
	{`(a|b|c|)`, `([a-c]|(?:))`},
	{`(a|b|)`, `([ab]|(?:))`},
	{`(|)`, `()`},
	{`a()`, `a()`},
	{`(()|())`, `(()|())`},
	{`(a|)`, `(a|(?:))`},
	{`ab()cd()`, `ab()cd()`},
	{`()`, `()`},
	{`()*`, `()*`},
	{`()+`, `()+`},
	{`()?`, `()?`},
	{`(){0}`, `(?:)`},
	{`(){1}`, `()`},
	{`(){1,}`, `()+`},
	{`(){0,2}`, `(?:()()?)?`},
}
|
||||
|
||||
func TestSimplify2(t *testing.T) {
|
||||
re, err := Parse(`[a-ee-gg-m]`, Perl|DotNL)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
s := re.Simplify().String()
|
||||
if s != `[a-m]` {
|
||||
t.Errorf("Simplify(%#q) = %#q, want %#q", re.String(), s, `[a-m]`)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSimplify(t *testing.T) {
|
||||
for _, tt := range simplifyTests {
|
||||
re, err := Parse(tt.Regexp, MatchNL|Perl&^OneLine)
|
||||
if err != nil {
|
||||
t.Errorf("Parse(%#q) = error %v", tt.Regexp, err)
|
||||
continue
|
||||
}
|
||||
s := re.Simplify().String()
|
||||
if s != tt.Simple {
|
||||
t.Errorf("Simplify(%#q) = %#q, want %#q", tt.Regexp, s, tt.Simple)
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in a new issue