// Copyright 2011 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package syntax // Note to implementers: // In this package, re is always a *Regexp and r is always a rune. import ( "slices" "strconv" "strings" "unicode" ) // A Regexp is a node in a regular expression syntax tree. type Regexp struct { Op Op // operator Flags Flags Sub []*Regexp // subexpressions, if any Sub0 [1]*Regexp // storage for short Sub Rune []rune // matched runes, for OpLiteral, OpCharClass Rune0 [2]rune // storage for short Rune Min, Max int // min, max for OpRepeat Cap int // capturing index, for OpCapture Name string // capturing name, for OpCapture } //go:generate stringer -type Op -trimprefix Op // An Op is a single regular expression operator. type Op uint8 // Operators are listed in precedence order, tightest binding to weakest. // Character class operators are listed simplest to most complex // (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar). const ( OpNoMatch Op = 1 + iota // matches no strings OpEmptyMatch // matches empty string OpLiteral // matches Runes sequence OpCharClass // matches Runes interpreted as range pair list OpAnyCharNotNL // matches any character except newline OpAnyChar // matches any character OpBeginLine // matches empty string at beginning of line OpEndLine // matches empty string at end of line OpBeginText // matches empty string at beginning of text OpEndText // matches empty string at end of text OpWordBoundary // matches word boundary `\b` OpNoWordBoundary // matches word non-boundary `\B` OpCapture // capturing subexpression with index Cap, optional name Name OpStar // matches Sub[0] zero or more times OpPlus // matches Sub[0] one or more times OpQuest // matches Sub[0] zero or one times OpRepeat // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit) OpConcat // matches concatenation of Subs OpAlternate // matches alternation of Subs ) const opPseudo Op = 128 // where pseudo-ops start // Equal reports whether x and y have identical structure. func (x *Regexp) Equal(y *Regexp) bool { if x == nil || y == nil { return x == y } if x.Op != y.Op { return false } switch x.Op { case OpEndText: // The parse flags remember whether this is \z or \Z. if x.Flags&WasDollar != y.Flags&WasDollar { return false } case OpLiteral, OpCharClass: return slices.Equal(x.Rune, y.Rune) case OpAlternate, OpConcat: return slices.EqualFunc(x.Sub, y.Sub, func(a, b *Regexp) bool { return a.Equal(b) }) case OpStar, OpPlus, OpQuest: if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) { return false } case OpRepeat: if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) { return false } case OpCapture: if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) { return false } } return true } // printFlags is a bit set indicating which flags (including non-capturing parens) to print around a regexp. type printFlags uint8 const ( flagI printFlags = 1 << iota // (?i: flagM // (?m: flagS // (?s: flagOff // ) flagPrec // (?: ) negShift = 5 // flagI<<negShift is (?-i: ) // addSpan enables the flags f around start..last, // by setting flags[start] = f and flags[last] = flagOff. func addSpan(start, last *Regexp, f printFlags, flags *map[*Regexp]printFlags) { if *flags == nil { *flags = make(map[*Regexp]printFlags) } (*flags)[start] = f (*flags)[last] |= flagOff // maybe start==last } // calcFlags calculates the flags to print around each subexpression in re, // storing that information in (*flags)[sub] for each affected subexpression. // The first time an entry needs to be written to *flags, calcFlags allocates the map. // calcFlags also calculates the flags that must be active or can't be active // around re and returns those flags. func calcFlags(re *Regexp, flags *map[*Regexp]printFlags) (must, cant printFlags) { switch re.Op { default: return 0, 0 case OpLiteral: // If literal is fold-sensitive, return (flagI, 0) or (0, flagI) // according to whether (?i) is active. // If literal is not fold-sensitive, return 0, 0. for _, r := range re.Rune { if minFold <= r && r <= maxFold && unicode.SimpleFold(r) != r { if re.Flags&FoldCase != 0 { return flagI, 0 } else { return 0, flagI } } } return 0, 0 case OpCharClass: // If literal is fold-sensitive, return 0, flagI - (?i) has been compiled out. // If literal is not fold-sensitive, return 0, 0. for i := 0; i < len(re.Rune); i += 2 { lo := max(minFold, re.Rune[i]) hi := min(maxFold, re.Rune[i+1]) for r := lo; r <= hi; r++ { for f := unicode.SimpleFold(r); f != r; f = unicode.SimpleFold(f) { if !(lo <= f && f <= hi) && !inCharClass(f, re.Rune) { return 0, flagI } } } } return 0, 0 case OpAnyCharNotNL: // (?-s). return 0, flagS case OpAnyChar: // (?s). return flagS, 0 case OpBeginLine, OpEndLine: // (?m)^ (?m)$ return flagM, 0 case OpEndText: if re.Flags&WasDollar != 0 { // (?-m)$ return 0, flagM } return 0, 0 case OpCapture, OpStar, OpPlus, OpQuest, OpRepeat: return calcFlags(re.Sub[0], flags) case OpConcat, OpAlternate: // Gather the must and cant for each subexpression. // When we find a conflicting subexpression, insert the necessary // flags around the previously identified span and start over. var must, cant, allCant printFlags start := 0 last := 0 did := false for i, sub := range re.Sub { subMust, subCant := calcFlags(sub, flags) if must&subCant != 0 || subMust&cant != 0 { if must != 0 { addSpan(re.Sub[start], re.Sub[last], must, flags) } must = 0 cant = 0 start = i did = true } must |= subMust cant |= subCant allCant |= subCant if subMust != 0 { last = i } if must == 0 && start == i { start++ } } if !did { // No conflicts: pass the accumulated must and cant upward. return must, cant } if must != 0 { // Conflicts found; need to finish final span. addSpan(re.Sub[start], re.Sub[last], must, flags) } return 0, allCant } } // writeRegexp writes the Perl syntax for the regular expression re to b. func writeRegexp(b *strings.Builder, re *Regexp, f printFlags, flags map[*Regexp]printFlags) { f |= flags[re] if f&flagPrec != 0 && f&^(flagOff|flagPrec) != 0 && f&flagOff != 0 { // flagPrec is redundant with other flags being added and terminated f &^= flagPrec } if f&^(flagOff|flagPrec) != 0 { b.WriteString(`(?`) if f&flagI != 0 { b.WriteString(`i`) } if f&flagM != 0 { b.WriteString(`m`) } if f&flagS != 0 { b.WriteString(`s`) } if f&((flagM|flagS)<<negShift) != 0 { b.WriteString(`-`) if f&(flagM<<negShift) != 0 { b.WriteString(`m`) } if f&(flagS<<negShift) != 0 { b.WriteString(`s`) } } b.WriteString(`:`) } if f&flagOff != 0 { defer b.WriteString(`)`) } if f&flagPrec != 0 { b.WriteString(`(?:`) defer b.WriteString(`)`) } switch re.Op { default: b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">") case OpNoMatch: b.WriteString(`[^\x00-\x{10FFFF}]`) case OpEmptyMatch: b.WriteString(`(?:)`) case OpLiteral: for _, r := range re.Rune { escape(b, r, false) } case OpCharClass: if len(re.Rune)%2 != 0 { b.WriteString(`[invalid char class]`) break } b.WriteRune('[') if len(re.Rune) == 0 { b.WriteString(`^\x00-\x{10FFFF}`) } else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune && len(re.Rune) > 2 { // Contains 0 and MaxRune. Probably a negated class. // Print the gaps. b.WriteRune('^') for i := 1; i < len(re.Rune)-1; i += 2 { lo, hi := re.Rune[i]+1, re.Rune[i+1]-1 escape(b, lo, lo == '-') if lo != hi { if hi != lo+1 { b.WriteRune('-') } escape(b, hi, hi == '-') } } } else { for i := 0; i < len(re.Rune); i += 2 { lo, hi := re.Rune[i], re.Rune[i+1] escape(b, lo, lo == '-') if lo != hi { if hi != lo+1 { b.WriteRune('-') } escape(b, hi, hi == '-') } } } b.WriteRune(']') case OpAnyCharNotNL, OpAnyChar: b.WriteString(`.`) case OpBeginLine: b.WriteString(`^`) case OpEndLine: b.WriteString(`$`) case OpBeginText: b.WriteString(`\A`) case OpEndText: if re.Flags&WasDollar != 0 { b.WriteString(`$`) } else { b.WriteString(`\z`) } case OpWordBoundary: b.WriteString(`\b`) case OpNoWordBoundary: b.WriteString(`\B`) case OpCapture: if re.Name != "" { b.WriteString(`(?P<`) b.WriteString(re.Name) b.WriteRune('>') } else { b.WriteRune('(') } if re.Sub[0].Op != OpEmptyMatch { writeRegexp(b, re.Sub[0], flags[re.Sub[0]], flags) } b.WriteRune(')') case OpStar, OpPlus, OpQuest, OpRepeat: p := printFlags(0) sub := re.Sub[0] if sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 { p = flagPrec } writeRegexp(b, sub, p, flags) switch re.Op { case OpStar: b.WriteRune('*') case OpPlus: b.WriteRune('+') case OpQuest: b.WriteRune('?') case OpRepeat: b.WriteRune('{') b.WriteString(strconv.Itoa(re.Min)) if re.Max != re.Min { b.WriteRune(',') if re.Max >= 0 { b.WriteString(strconv.Itoa(re.Max)) } } b.WriteRune('}') } if re.Flags&NonGreedy != 0 { b.WriteRune('?') } case OpConcat: for _, sub := range re.Sub { p := printFlags(0) if sub.Op == OpAlternate { p = flagPrec } writeRegexp(b, sub, p, flags) } case OpAlternate: for i, sub := range re.Sub { if i > 0 { b.WriteRune('|') } writeRegexp(b, sub, 0, flags) } } } func (re *Regexp) String() string { var b strings.Builder var flags map[*Regexp]printFlags must, cant := calcFlags(re, &flags) must |= (cant &^ flagI) << negShift if must != 0 { must |= flagOff } writeRegexp(&b, re, must, flags) return b.String() } const meta = `\.+*?()|[]{}^$` func escape(b *strings.Builder, r rune, force bool) { if unicode.IsPrint(r) { if strings.ContainsRune(meta, r) || force { b.WriteRune('\\') } b.WriteRune(r) return } switch r { case '\a': b.WriteString(`\a`) case '\f': b.WriteString(`\f`) case '\n': b.WriteString(`\n`) case '\r': b.WriteString(`\r`) case '\t': b.WriteString(`\t`) case '\v': b.WriteString(`\v`) default: if r < 0x100 { b.WriteString(`\x`) s := strconv.FormatInt(int64(r), 16) if len(s) == 1 { b.WriteRune('0') } b.WriteString(s) break } b.WriteString(`\x{`) b.WriteString(strconv.FormatInt(int64(r), 16)) b.WriteString(`}`) } } // MaxCap walks the regexp to find the maximum capture index. func (re *Regexp) MaxCap() int { m := 0 if re.Op == OpCapture { m = re.Cap } for _, sub := range re.Sub { if n := sub.MaxCap(); m < n { m = n } } return m } // CapNames walks the regexp to find the names of capturing groups. func (re *Regexp) CapNames() []string { names := make([]string, re.MaxCap()+1) re.capNames(names) return names } func (re *Regexp) capNames(names []string) { if re.Op == OpCapture { names[re.Cap] = re.Name } for _, sub := range re.Sub { sub.capNames(names) } }