| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326 |
- // Copyright 2011 The Go Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the GO-LICENSE file.
- // Modifications of this file, if any, are
- //
- // Copyright 2023 The Regexp Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
- package syntax // modernc.org/regexp/syntax
- // Note to implementers:
- // In this package, re is always a *Regexp and r is always a rune.
- import (
- "strconv"
- "strings"
- "unicode"
- )
- // A Regexp is a node in a regular expression syntax tree.
- type Regexp struct {
- Op Op // operator
- Flags Flags
- Sub []*Regexp // subexpressions, if any
- Sub0 [1]*Regexp // storage for short Sub
- Rune []rune // matched runes, for OpLiteral, OpCharClass
- Rune0 [2]rune // storage for short Rune
- Min, Max int // min, max for OpRepeat
- Cap int // capturing index, for OpCapture
- Name string // capturing name, for OpCapture
- }
- //go:generate stringer -type Op -trimprefix Op
- // An Op is a single regular expression operator.
- type Op uint8
- // Operators are listed in precedence order, tightest binding to weakest.
- // Character class operators are listed simplest to most complex
- // (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar).
- const (
- OpNoMatch Op = 1 + iota // matches no strings
- OpEmptyMatch // matches empty string
- OpLiteral // matches Runes sequence
- OpCharClass // matches Runes interpreted as range pair list
- OpAnyCharNotNL // matches any character except newline
- OpAnyChar // matches any character
- OpBeginLine // matches empty string at beginning of line
- OpEndLine // matches empty string at end of line
- OpBeginText // matches empty string at beginning of text
- OpEndText // matches empty string at end of text
- OpWordBoundary // matches word boundary `\b`
- OpNoWordBoundary // matches word non-boundary `\B`
- OpCapture // capturing subexpression with index Cap, optional name Name
- OpStar // matches Sub[0] zero or more times
- OpPlus // matches Sub[0] one or more times
- OpQuest // matches Sub[0] zero or one times
- OpRepeat // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit)
- OpConcat // matches concatenation of Subs
- OpAlternate // matches alternation of Subs
- )
- const opPseudo Op = 128 // where pseudo-ops start
- // Equal reports whether x and y have identical structure.
- func (x *Regexp) Equal(y *Regexp) bool {
- if x == nil || y == nil {
- return x == y
- }
- if x.Op != y.Op {
- return false
- }
- switch x.Op {
- case OpEndText:
- // The parse flags remember whether this is \z or \Z.
- if x.Flags&WasDollar != y.Flags&WasDollar {
- return false
- }
- case OpLiteral, OpCharClass:
- if len(x.Rune) != len(y.Rune) {
- return false
- }
- for i, r := range x.Rune {
- if r != y.Rune[i] {
- return false
- }
- }
- case OpAlternate, OpConcat:
- if len(x.Sub) != len(y.Sub) {
- return false
- }
- for i, sub := range x.Sub {
- if !sub.Equal(y.Sub[i]) {
- return false
- }
- }
- case OpStar, OpPlus, OpQuest:
- if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) {
- return false
- }
- case OpRepeat:
- if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) {
- return false
- }
- case OpCapture:
- if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) {
- return false
- }
- }
- return true
- }
- // writeRegexp writes the Perl syntax for the regular expression re to b.
- func writeRegexp(b *strings.Builder, re *Regexp) {
- switch re.Op {
- default:
- b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
- case OpNoMatch:
- b.WriteString(`[^\x00-\x{10FFFF}]`)
- case OpEmptyMatch:
- b.WriteString(`(?:)`)
- case OpLiteral:
- if re.Flags&FoldCase != 0 {
- b.WriteString(`(?i:`)
- }
- for _, r := range re.Rune {
- escape(b, r, false)
- }
- if re.Flags&FoldCase != 0 {
- b.WriteString(`)`)
- }
- case OpCharClass:
- if len(re.Rune)%2 != 0 {
- b.WriteString(`[invalid char class]`)
- break
- }
- b.WriteRune('[')
- if len(re.Rune) == 0 {
- b.WriteString(`^\x00-\x{10FFFF}`)
- } else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune && len(re.Rune) > 2 {
- // Contains 0 and MaxRune. Probably a negated class.
- // Print the gaps.
- b.WriteRune('^')
- for i := 1; i < len(re.Rune)-1; i += 2 {
- lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
- escape(b, lo, lo == '-')
- if lo != hi {
- b.WriteRune('-')
- escape(b, hi, hi == '-')
- }
- }
- } else {
- for i := 0; i < len(re.Rune); i += 2 {
- lo, hi := re.Rune[i], re.Rune[i+1]
- escape(b, lo, lo == '-')
- if lo != hi {
- b.WriteRune('-')
- escape(b, hi, hi == '-')
- }
- }
- }
- b.WriteRune(']')
- case OpAnyCharNotNL:
- b.WriteString(`(?-s:.)`)
- case OpAnyChar:
- b.WriteString(`(?s:.)`)
- case OpBeginLine:
- b.WriteString(`(?m:^)`)
- case OpEndLine:
- b.WriteString(`(?m:$)`)
- case OpBeginText:
- b.WriteString(`\A`)
- case OpEndText:
- if re.Flags&WasDollar != 0 {
- b.WriteString(`(?-m:$)`)
- } else {
- b.WriteString(`\z`)
- }
- case OpWordBoundary:
- b.WriteString(`\b`)
- case OpNoWordBoundary:
- b.WriteString(`\B`)
- case OpCapture:
- if re.Name != "" {
- b.WriteString(`(?P<`)
- b.WriteString(re.Name)
- b.WriteRune('>')
- } else {
- b.WriteRune('(')
- }
- if re.Sub[0].Op != OpEmptyMatch {
- writeRegexp(b, re.Sub[0])
- }
- b.WriteRune(')')
- case OpStar, OpPlus, OpQuest, OpRepeat:
- if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
- b.WriteString(`(?:`)
- writeRegexp(b, sub)
- b.WriteString(`)`)
- } else {
- writeRegexp(b, sub)
- }
- switch re.Op {
- case OpStar:
- b.WriteRune('*')
- case OpPlus:
- b.WriteRune('+')
- case OpQuest:
- b.WriteRune('?')
- case OpRepeat:
- b.WriteRune('{')
- b.WriteString(strconv.Itoa(re.Min))
- if re.Max != re.Min {
- b.WriteRune(',')
- if re.Max >= 0 {
- b.WriteString(strconv.Itoa(re.Max))
- }
- }
- b.WriteRune('}')
- }
- if re.Flags&NonGreedy != 0 {
- b.WriteRune('?')
- }
- case OpConcat:
- for _, sub := range re.Sub {
- if sub.Op == OpAlternate {
- b.WriteString(`(?:`)
- writeRegexp(b, sub)
- b.WriteString(`)`)
- } else {
- writeRegexp(b, sub)
- }
- }
- case OpAlternate:
- for i, sub := range re.Sub {
- if i > 0 {
- b.WriteRune('|')
- }
- writeRegexp(b, sub)
- }
- }
- }
- func (re *Regexp) String() string {
- var b strings.Builder
- writeRegexp(&b, re)
- return b.String()
- }
- const meta = `\.+*?()|[]{}^$`
- func escape(b *strings.Builder, r rune, force bool) {
- if unicode.IsPrint(r) {
- if strings.ContainsRune(meta, r) || force {
- b.WriteRune('\\')
- }
- b.WriteRune(r)
- return
- }
- switch r {
- case '\a':
- b.WriteString(`\a`)
- case '\f':
- b.WriteString(`\f`)
- case '\n':
- b.WriteString(`\n`)
- case '\r':
- b.WriteString(`\r`)
- case '\t':
- b.WriteString(`\t`)
- case '\v':
- b.WriteString(`\v`)
- default:
- if r < 0x100 {
- b.WriteString(`\x`)
- s := strconv.FormatInt(int64(r), 16)
- if len(s) == 1 {
- b.WriteRune('0')
- }
- b.WriteString(s)
- break
- }
- b.WriteString(`\x{`)
- b.WriteString(strconv.FormatInt(int64(r), 16))
- b.WriteString(`}`)
- }
- }
- // MaxCap walks the regexp to find the maximum capture index.
- func (re *Regexp) MaxCap() int {
- m := 0
- if re.Op == OpCapture {
- m = re.Cap
- }
- for _, sub := range re.Sub {
- if n := sub.MaxCap(); m < n {
- m = n
- }
- }
- return m
- }
- // CapNames walks the regexp to find the names of capturing groups.
- func (re *Regexp) CapNames() []string {
- names := make([]string, re.MaxCap()+1)
- re.capNames(names)
- return names
- }
- func (re *Regexp) capNames(names []string) {
- if re.Op == OpCapture {
- names[re.Cap] = re.Name
- }
- for _, sub := range re.Sub {
- sub.capNames(names)
- }
- }
|