| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126 |
- // Copyright 2011 The Go Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the GO-LICENSE file.
- // Modifications of this file, if any, are
- //
- // Copyright 2023 The Regexp Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
- package syntax // modernc.org/regexp/syntax
- import (
- "sort"
- "strings"
- "unicode"
- "unicode/utf8"
- )
- // An Error describes a failure to parse a regular expression
- // and gives the offending expression.
- type Error struct {
- Code ErrorCode
- Expr string
- }
- func (e *Error) Error() string {
- return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`"
- }
- // An ErrorCode describes a failure to parse a regular expression.
- type ErrorCode string
- const (
- // Unexpected error
- ErrInternalError ErrorCode = "regexp/syntax: internal error"
- // Parse errors
- ErrInvalidCharClass ErrorCode = "invalid character class"
- ErrInvalidCharRange ErrorCode = "invalid character class range"
- ErrInvalidEscape ErrorCode = "invalid escape sequence"
- ErrInvalidNamedCapture ErrorCode = "invalid named capture"
- ErrInvalidPerlOp ErrorCode = "invalid or unsupported Perl syntax"
- ErrInvalidRepeatOp ErrorCode = "invalid nested repetition operator"
- ErrInvalidRepeatSize ErrorCode = "invalid repeat count"
- ErrInvalidUTF8 ErrorCode = "invalid UTF-8"
- ErrMissingBracket ErrorCode = "missing closing ]"
- ErrMissingParen ErrorCode = "missing closing )"
- ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator"
- ErrTrailingBackslash ErrorCode = "trailing backslash at end of expression"
- ErrUnexpectedParen ErrorCode = "unexpected )"
- ErrNestingDepth ErrorCode = "expression nests too deeply"
- ErrLarge ErrorCode = "expression too large"
- )
- func (e ErrorCode) String() string {
- return string(e)
- }
- // Flags control the behavior of the parser and record information about regexp context.
- type Flags uint16
- const (
- FoldCase Flags = 1 << iota // case-insensitive match
- Literal // treat pattern as literal string
- ClassNL // allow character classes like [^a-z] and [[:space:]] to match newline
- DotNL // allow . to match newline
- OneLine // treat ^ and $ as only matching at beginning and end of text
- NonGreedy // make repetition operators default to non-greedy
- PerlX // allow Perl extensions
- UnicodeGroups // allow \p{Han}, \P{Han} for Unicode group and negation
- WasDollar // regexp OpEndText was $, not \z
- Simple // regexp contains no counted repetition
- DisableOptimizations // can help constructing a DFA later
- MatchNL = ClassNL | DotNL
- Perl = ClassNL | OneLine | PerlX | UnicodeGroups // as close to Perl as possible
- POSIX Flags = 0 // POSIX syntax
- )
- // Pseudo-ops for parsing stack.
- const (
- opLeftParen = opPseudo + iota
- opVerticalBar
- )
- // maxHeight is the maximum height of a regexp parse tree.
- // It is somewhat arbitrarily chosen, but the idea is to be large enough
- // that no one will actually hit in real use but at the same time small enough
- // that recursion on the Regexp tree will not hit the 1GB Go stack limit.
- // The maximum amount of stack for a single recursive frame is probably
- // closer to 1kB, so this could potentially be raised, but it seems unlikely
- // that people have regexps nested even this deeply.
- // We ran a test on Google's C++ code base and turned up only
- // a single use case with depth > 100; it had depth 128.
- // Using depth 1000 should be plenty of margin.
- // As an optimization, we don't even bother calculating heights
- // until we've allocated at least maxHeight Regexp structures.
- const maxHeight = 1000
- // maxSize is the maximum size of a compiled regexp in Insts.
- // It too is somewhat arbitrarily chosen, but the idea is to be large enough
- // to allow significant regexps while at the same time small enough that
- // the compiled form will not take up too much memory.
- // 128 MB is enough for a 3.3 million Inst structures, which roughly
- // corresponds to a 3.3 MB regexp.
- const (
- maxSize = 128 << 20 / instSize
- instSize = 5 * 8 // byte, 2 uint32, slice is 5 64-bit words
- )
- // maxRunes is the maximum number of runes allowed in a regexp tree
- // counting the runes in all the nodes.
- // Ignoring character classes p.numRunes is always less than the length of the regexp.
- // Character classes can make it much larger: each \pL adds 1292 runes.
- // 128 MB is enough for 32M runes, which is over 26k \pL instances.
- // Note that repetitions do not make copies of the rune slices,
- // so \pL{1000} is only one rune slice, not 1000.
- // We could keep a cache of character classes we've seen,
- // so that all the \pL we see use the same rune list,
- // but that doesn't remove the problem entirely:
- // consider something like [\pL01234][\pL01235][\pL01236]...[\pL^&*()].
- // And because the Rune slice is exposed directly in the Regexp,
- // there is not an opportunity to change the representation to allow
- // partial sharing between different character classes.
- // So the limit is the best we can do.
- const (
- maxRunes = 128 << 20 / runeSize
- runeSize = 4 // rune is int32
- )
- type parser struct {
- flags Flags // parse mode flags
- stack []*Regexp // stack of parsed expressions
- free *Regexp
- numCap int // number of capturing groups seen
- wholeRegexp string
- tmpClass []rune // temporary char class work space
- numRegexp int // number of regexps allocated
- numRunes int // number of runes in char classes
- repeats int64 // product of all repetitions seen
- height map[*Regexp]int // regexp height, for height limit check
- size map[*Regexp]int64 // regexp compiled size, for size limit check
- }
- func (p *parser) optimizations() bool { return p.flags&DisableOptimizations == 0 }
- func (p *parser) newRegexp(op Op) *Regexp {
- re := p.free
- if re != nil {
- p.free = re.Sub0[0]
- *re = Regexp{}
- } else {
- re = new(Regexp)
- p.numRegexp++
- }
- re.Op = op
- return re
- }
- func (p *parser) reuse(re *Regexp) {
- if p.height != nil {
- delete(p.height, re)
- }
- re.Sub0[0] = p.free
- p.free = re
- }
- func (p *parser) checkLimits(re *Regexp) {
- if p.numRunes > maxRunes {
- panic(ErrLarge)
- }
- p.checkSize(re)
- p.checkHeight(re)
- }
- func (p *parser) checkSize(re *Regexp) {
- if p.size == nil {
- // We haven't started tracking size yet.
- // Do a relatively cheap check to see if we need to start.
- // Maintain the product of all the repeats we've seen
- // and don't track if the total number of regexp nodes
- // we've seen times the repeat product is in budget.
- if p.repeats == 0 {
- p.repeats = 1
- }
- if re.Op == OpRepeat {
- n := re.Max
- if n == -1 {
- n = re.Min
- }
- if n <= 0 {
- n = 1
- }
- if int64(n) > maxSize/p.repeats {
- p.repeats = maxSize
- } else {
- p.repeats *= int64(n)
- }
- }
- if int64(p.numRegexp) < maxSize/p.repeats {
- return
- }
- // We need to start tracking size.
- // Make the map and belatedly populate it
- // with info about everything we've constructed so far.
- p.size = make(map[*Regexp]int64)
- for _, re := range p.stack {
- p.checkSize(re)
- }
- }
- if p.calcSize(re, true) > maxSize {
- panic(ErrLarge)
- }
- }
- func (p *parser) calcSize(re *Regexp, force bool) int64 {
- if !force {
- if size, ok := p.size[re]; ok {
- return size
- }
- }
- var size int64
- switch re.Op {
- case OpLiteral:
- size = int64(len(re.Rune))
- case OpCapture, OpStar:
- // star can be 1+ or 2+; assume 2 pessimistically
- size = 2 + p.calcSize(re.Sub[0], false)
- case OpPlus, OpQuest:
- size = 1 + p.calcSize(re.Sub[0], false)
- case OpConcat:
- for _, sub := range re.Sub {
- size += p.calcSize(sub, false)
- }
- case OpAlternate:
- for _, sub := range re.Sub {
- size += p.calcSize(sub, false)
- }
- if len(re.Sub) > 1 {
- size += int64(len(re.Sub)) - 1
- }
- case OpRepeat:
- sub := p.calcSize(re.Sub[0], false)
- if re.Max == -1 {
- if re.Min == 0 {
- size = 2 + sub // x*
- } else {
- size = 1 + int64(re.Min)*sub // xxx+
- }
- break
- }
- // x{2,5} = xx(x(x(x)?)?)?
- size = int64(re.Max)*sub + int64(re.Max-re.Min)
- }
- if size < 1 {
- size = 1
- }
- p.size[re] = size
- return size
- }
- func (p *parser) checkHeight(re *Regexp) {
- if p.numRegexp < maxHeight {
- return
- }
- if p.height == nil {
- p.height = make(map[*Regexp]int)
- for _, re := range p.stack {
- p.checkHeight(re)
- }
- }
- if p.calcHeight(re, true) > maxHeight {
- panic(ErrNestingDepth)
- }
- }
- func (p *parser) calcHeight(re *Regexp, force bool) int {
- if !force {
- if h, ok := p.height[re]; ok {
- return h
- }
- }
- h := 1
- for _, sub := range re.Sub {
- hsub := p.calcHeight(sub, false)
- if h < 1+hsub {
- h = 1 + hsub
- }
- }
- p.height[re] = h
- return h
- }
- // Parse stack manipulation.
- // push pushes the regexp re onto the parse stack and returns the regexp.
- func (p *parser) push(re *Regexp) *Regexp {
- p.numRunes += len(re.Rune)
- if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] {
- // Single rune.
- if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) {
- return nil
- }
- re.Op = OpLiteral
- re.Rune = re.Rune[:1]
- re.Flags = p.flags &^ FoldCase
- } else if re.Op == OpCharClass && len(re.Rune) == 4 &&
- re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] &&
- unicode.SimpleFold(re.Rune[0]) == re.Rune[2] &&
- unicode.SimpleFold(re.Rune[2]) == re.Rune[0] ||
- re.Op == OpCharClass && len(re.Rune) == 2 &&
- re.Rune[0]+1 == re.Rune[1] &&
- unicode.SimpleFold(re.Rune[0]) == re.Rune[1] &&
- unicode.SimpleFold(re.Rune[1]) == re.Rune[0] {
- // Case-insensitive rune like [Aa] or [Δδ].
- if p.maybeConcat(re.Rune[0], p.flags|FoldCase) {
- return nil
- }
- // Rewrite as (case-insensitive) literal.
- re.Op = OpLiteral
- re.Rune = re.Rune[:1]
- re.Flags = p.flags | FoldCase
- } else {
- // Incremental concatenation.
- p.maybeConcat(-1, 0)
- }
- p.stack = append(p.stack, re)
- p.checkLimits(re)
- return re
- }
- // maybeConcat implements incremental concatenation
- // of literal runes into string nodes. The parser calls this
- // before each push, so only the top fragment of the stack
- // might need processing. Since this is called before a push,
- // the topmost literal is no longer subject to operators like *
- // (Otherwise ab* would turn into (ab)*.)
- // If r >= 0 and there's a node left over, maybeConcat uses it
- // to push r with the given flags.
- // maybeConcat reports whether r was pushed.
- func (p *parser) maybeConcat(r rune, flags Flags) bool {
- n := len(p.stack)
- if n < 2 {
- return false
- }
- re1 := p.stack[n-1]
- re2 := p.stack[n-2]
- if re1.Op != OpLiteral || re2.Op != OpLiteral || re1.Flags&FoldCase != re2.Flags&FoldCase {
- return false
- }
- // Push re1 into re2.
- re2.Rune = append(re2.Rune, re1.Rune...)
- // Reuse re1 if possible.
- if r >= 0 {
- re1.Rune = re1.Rune0[:1]
- re1.Rune[0] = r
- re1.Flags = flags
- return true
- }
- p.stack = p.stack[:n-1]
- p.reuse(re1)
- return false // did not push r
- }
- // literal pushes a literal regexp for the rune r on the stack.
- func (p *parser) literal(r rune) {
- re := p.newRegexp(OpLiteral)
- re.Flags = p.flags
- if p.flags&FoldCase != 0 {
- r = minFoldRune(r)
- }
- re.Rune0[0] = r
- re.Rune = re.Rune0[:1]
- p.push(re)
- }
- // minFoldRune returns the minimum rune fold-equivalent to r.
- func minFoldRune(r rune) rune {
- if r < minFold || r > maxFold {
- return r
- }
- min := r
- r0 := r
- for r = unicode.SimpleFold(r); r != r0; r = unicode.SimpleFold(r) {
- if min > r {
- min = r
- }
- }
- return min
- }
- // op pushes a regexp with the given op onto the stack
- // and returns that regexp.
- func (p *parser) op(op Op) *Regexp {
- re := p.newRegexp(op)
- re.Flags = p.flags
- return p.push(re)
- }
- // repeat replaces the top stack element with itself repeated according to op, min, max.
- // before is the regexp suffix starting at the repetition operator.
- // after is the regexp suffix following after the repetition operator.
- // repeat returns an updated 'after' and an error, if any.
- func (p *parser) repeat(op Op, min, max int, before, after, lastRepeat string) (string, error) {
- flags := p.flags
- if p.flags&PerlX != 0 {
- if len(after) > 0 && after[0] == '?' {
- after = after[1:]
- flags ^= NonGreedy
- }
- if lastRepeat != "" {
- // In Perl it is not allowed to stack repetition operators:
- // a** is a syntax error, not a doubled star, and a++ means
- // something else entirely, which we don't support!
- return "", &Error{ErrInvalidRepeatOp, lastRepeat[:len(lastRepeat)-len(after)]}
- }
- }
- n := len(p.stack)
- if n == 0 {
- return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}
- }
- sub := p.stack[n-1]
- if sub.Op >= opPseudo {
- return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}
- }
- re := p.newRegexp(op)
- re.Min = min
- re.Max = max
- re.Flags = flags
- re.Sub = re.Sub0[:1]
- re.Sub[0] = sub
- p.stack[n-1] = re
- p.checkLimits(re)
- if op == OpRepeat && (min >= 2 || max >= 2) && !repeatIsValid(re, 1000) {
- return "", &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]}
- }
- return after, nil
- }
- // repeatIsValid reports whether the repetition re is valid.
- // Valid means that the combination of the top-level repetition
- // and any inner repetitions does not exceed n copies of the
- // innermost thing.
- // This function rewalks the regexp tree and is called for every repetition,
- // so we have to worry about inducing quadratic behavior in the parser.
- // We avoid this by only calling repeatIsValid when min or max >= 2.
- // In that case the depth of any >= 2 nesting can only get to 9 without
- // triggering a parse error, so each subtree can only be rewalked 9 times.
- func repeatIsValid(re *Regexp, n int) bool {
- if re.Op == OpRepeat {
- m := re.Max
- if m == 0 {
- return true
- }
- if m < 0 {
- m = re.Min
- }
- if m > n {
- return false
- }
- if m > 0 {
- n /= m
- }
- }
- for _, sub := range re.Sub {
- if !repeatIsValid(sub, n) {
- return false
- }
- }
- return true
- }
- // concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation.
- func (p *parser) concat() *Regexp {
- p.maybeConcat(-1, 0)
- // Scan down to find pseudo-operator | or (.
- i := len(p.stack)
- for i > 0 && p.stack[i-1].Op < opPseudo {
- i--
- }
- subs := p.stack[i:]
- p.stack = p.stack[:i]
- // Empty concatenation is special case.
- if len(subs) == 0 {
- return p.push(p.newRegexp(OpEmptyMatch))
- }
- return p.push(p.collapse(subs, OpConcat))
- }
- // alternate replaces the top of the stack (above the topmost '(') with its alternation.
- func (p *parser) alternate() *Regexp {
- // Scan down to find pseudo-operator (.
- // There are no | above (.
- i := len(p.stack)
- for i > 0 && p.stack[i-1].Op < opPseudo {
- i--
- }
- subs := p.stack[i:]
- p.stack = p.stack[:i]
- // Make sure top class is clean.
- // All the others already are (see swapVerticalBar).
- if len(subs) > 0 {
- cleanAlt(subs[len(subs)-1])
- }
- // Empty alternate is special case
- // (shouldn't happen but easy to handle).
- if len(subs) == 0 {
- return p.push(p.newRegexp(OpNoMatch))
- }
- return p.push(p.collapse(subs, OpAlternate))
- }
- // cleanAlt cleans re for eventual inclusion in an alternation.
- func cleanAlt(re *Regexp) {
- switch re.Op {
- case OpCharClass:
- re.Rune = cleanClass(&re.Rune)
- if len(re.Rune) == 2 && re.Rune[0] == 0 && re.Rune[1] == unicode.MaxRune {
- re.Rune = nil
- re.Op = OpAnyChar
- return
- }
- if len(re.Rune) == 4 && re.Rune[0] == 0 && re.Rune[1] == '\n'-1 && re.Rune[2] == '\n'+1 && re.Rune[3] == unicode.MaxRune {
- re.Rune = nil
- re.Op = OpAnyCharNotNL
- return
- }
- if cap(re.Rune)-len(re.Rune) > 100 {
- // re.Rune will not grow any more.
- // Make a copy or inline to reclaim storage.
- re.Rune = append(re.Rune0[:0], re.Rune...)
- }
- }
- }
- // collapse returns the result of applying op to sub.
- // If sub contains op nodes, they all get hoisted up
- // so that there is never a concat of a concat or an
- // alternate of an alternate.
- func (p *parser) collapse(subs []*Regexp, op Op) *Regexp {
- if len(subs) == 1 {
- return subs[0]
- }
- re := p.newRegexp(op)
- re.Sub = re.Sub0[:0]
- for _, sub := range subs {
- if sub.Op == op {
- re.Sub = append(re.Sub, sub.Sub...)
- p.reuse(sub)
- } else {
- re.Sub = append(re.Sub, sub)
- }
- }
- if op == OpAlternate {
- re.Sub = p.factor(re.Sub)
- if len(re.Sub) == 1 {
- old := re
- re = re.Sub[0]
- p.reuse(old)
- }
- }
- return re
- }
- // factor factors common prefixes from the alternation list sub.
- // It returns a replacement list that reuses the same storage and
- // frees (passes to p.reuse) any removed *Regexps.
- //
- // For example,
- //
- // ABC|ABD|AEF|BCX|BCY
- //
- // simplifies by literal prefix extraction to
- //
- // A(B(C|D)|EF)|BC(X|Y)
- //
- // which simplifies by character class introduction to
- //
- // A(B[CD]|EF)|BC[XY]
- func (p *parser) factor(sub []*Regexp) []*Regexp {
- if len(sub) < 2 {
- return sub
- }
- // Round 1: Factor out common literal prefixes.
- var str []rune
- var strflags Flags
- start := 0
- out := sub[:0]
- if p.optimizations() {
- for i := 0; i <= len(sub); i++ {
- // Invariant: the Regexps that were in sub[0:start] have been
- // used or marked for reuse, and the slice space has been reused
- // for out (len(out) <= start).
- //
- // Invariant: sub[start:i] consists of regexps that all begin
- // with str as modified by strflags.
- var istr []rune
- var iflags Flags
- if i < len(sub) {
- istr, iflags = p.leadingString(sub[i])
- if iflags == strflags {
- same := 0
- for same < len(str) && same < len(istr) && str[same] == istr[same] {
- same++
- }
- if same > 0 {
- // Matches at least one rune in current range.
- // Keep going around.
- str = str[:same]
- continue
- }
- }
- }
- // Found end of a run with common leading literal string:
- // sub[start:i] all begin with str[0:len(str)], but sub[i]
- // does not even begin with str[0].
- //
- // Factor out common string and append factored expression to out.
- if i == start {
- // Nothing to do - run of length 0.
- } else if i == start+1 {
- // Just one: don't bother factoring.
- out = append(out, sub[start])
- } else {
- // Construct factored form: prefix(suffix1|suffix2|...)
- prefix := p.newRegexp(OpLiteral)
- prefix.Flags = strflags
- prefix.Rune = append(prefix.Rune[:0], str...)
- for j := start; j < i; j++ {
- sub[j] = p.removeLeadingString(sub[j], len(str))
- p.checkLimits(sub[j])
- }
- suffix := p.collapse(sub[start:i], OpAlternate) // recurse
- re := p.newRegexp(OpConcat)
- re.Sub = append(re.Sub[:0], prefix, suffix)
- out = append(out, re)
- }
- // Prepare for next iteration.
- start = i
- str = istr
- strflags = iflags
- }
- sub = out
- // Round 2: Factor out common simple prefixes,
- // just the first piece of each concatenation.
- // This will be good enough a lot of the time.
- //
- // Complex subexpressions (e.g. involving quantifiers)
- // are not safe to factor because that collapses their
- // distinct paths through the automaton, which affects
- // correctness in some cases.
- start = 0
- out = sub[:0]
- var first *Regexp
- for i := 0; i <= len(sub); i++ {
- // Invariant: the Regexps that were in sub[0:start] have been
- // used or marked for reuse, and the slice space has been reused
- // for out (len(out) <= start).
- //
- // Invariant: sub[start:i] consists of regexps that all begin with ifirst.
- var ifirst *Regexp
- if i < len(sub) {
- ifirst = p.leadingRegexp(sub[i])
- if first != nil && first.Equal(ifirst) &&
- // first must be a character class OR a fixed repeat of a character class.
- (isCharClass(first) || (first.Op == OpRepeat && first.Min == first.Max && isCharClass(first.Sub[0]))) {
- continue
- }
- }
- // Found end of a run with common leading regexp:
- // sub[start:i] all begin with first but sub[i] does not.
- //
- // Factor out common regexp and append factored expression to out.
- if i == start {
- // Nothing to do - run of length 0.
- } else if i == start+1 {
- // Just one: don't bother factoring.
- out = append(out, sub[start])
- } else {
- // Construct factored form: prefix(suffix1|suffix2|...)
- prefix := first
- for j := start; j < i; j++ {
- reuse := j != start // prefix came from sub[start]
- sub[j] = p.removeLeadingRegexp(sub[j], reuse)
- p.checkLimits(sub[j])
- }
- suffix := p.collapse(sub[start:i], OpAlternate) // recurse
- re := p.newRegexp(OpConcat)
- re.Sub = append(re.Sub[:0], prefix, suffix)
- out = append(out, re)
- }
- // Prepare for next iteration.
- start = i
- first = ifirst
- }
- sub = out
- // Round 3: Collapse runs of single literals into character classes.
- start = 0
- out = sub[:0]
- for i := 0; i <= len(sub); i++ {
- // Invariant: the Regexps that were in sub[0:start] have been
- // used or marked for reuse, and the slice space has been reused
- // for out (len(out) <= start).
- //
- // Invariant: sub[start:i] consists of regexps that are either
- // literal runes or character classes.
- if i < len(sub) && isCharClass(sub[i]) {
- continue
- }
- // sub[i] is not a char or char class;
- // emit char class for sub[start:i]...
- if i == start {
- // Nothing to do - run of length 0.
- } else if i == start+1 {
- out = append(out, sub[start])
- } else {
- // Make new char class.
- // Start with most complex regexp in sub[start].
- max := start
- for j := start + 1; j < i; j++ {
- if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) {
- max = j
- }
- }
- sub[start], sub[max] = sub[max], sub[start]
- for j := start + 1; j < i; j++ {
- mergeCharClass(sub[start], sub[j])
- p.reuse(sub[j])
- }
- cleanAlt(sub[start])
- out = append(out, sub[start])
- }
- // ... and then emit sub[i].
- if i < len(sub) {
- out = append(out, sub[i])
- }
- start = i + 1
- }
- sub = out
- }
- // Round 4: Collapse runs of empty matches into a single empty match.
- start = 0
- out = sub[:0]
- for i := range sub {
- if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch {
- continue
- }
- out = append(out, sub[i])
- }
- sub = out
- return sub
- }
- // leadingString returns the leading literal string that re begins with.
- // The string refers to storage in re or its children.
- func (p *parser) leadingString(re *Regexp) ([]rune, Flags) {
- if re.Op == OpConcat && len(re.Sub) > 0 {
- re = re.Sub[0]
- }
- if re.Op != OpLiteral {
- return nil, 0
- }
- return re.Rune, re.Flags & FoldCase
- }
- // removeLeadingString removes the first n leading runes
- // from the beginning of re. It returns the replacement for re.
- func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp {
- if re.Op == OpConcat && len(re.Sub) > 0 {
- // Removing a leading string in a concatenation
- // might simplify the concatenation.
- sub := re.Sub[0]
- sub = p.removeLeadingString(sub, n)
- re.Sub[0] = sub
- if sub.Op == OpEmptyMatch {
- p.reuse(sub)
- switch len(re.Sub) {
- case 0, 1:
- // Impossible but handle.
- re.Op = OpEmptyMatch
- re.Sub = nil
- case 2:
- old := re
- re = re.Sub[1]
- p.reuse(old)
- default:
- copy(re.Sub, re.Sub[1:])
- re.Sub = re.Sub[:len(re.Sub)-1]
- }
- }
- return re
- }
- if re.Op == OpLiteral {
- re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])]
- if len(re.Rune) == 0 {
- re.Op = OpEmptyMatch
- }
- }
- return re
- }
- // leadingRegexp returns the leading regexp that re begins with.
- // The regexp refers to storage in re or its children.
- func (p *parser) leadingRegexp(re *Regexp) *Regexp {
- if re.Op == OpEmptyMatch {
- return nil
- }
- if re.Op == OpConcat && len(re.Sub) > 0 {
- sub := re.Sub[0]
- if sub.Op == OpEmptyMatch {
- return nil
- }
- return sub
- }
- return re
- }
- // removeLeadingRegexp removes the leading regexp in re.
- // It returns the replacement for re.
- // If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse.
- func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp {
- if re.Op == OpConcat && len(re.Sub) > 0 {
- if reuse {
- p.reuse(re.Sub[0])
- }
- re.Sub = re.Sub[:copy(re.Sub, re.Sub[1:])]
- switch len(re.Sub) {
- case 0:
- re.Op = OpEmptyMatch
- re.Sub = nil
- case 1:
- old := re
- re = re.Sub[0]
- p.reuse(old)
- }
- return re
- }
- if reuse {
- p.reuse(re)
- }
- return p.newRegexp(OpEmptyMatch)
- }
- func literalRegexp(s string, flags Flags) *Regexp {
- re := &Regexp{Op: OpLiteral}
- re.Flags = flags
- re.Rune = re.Rune0[:0] // use local storage for small strings
- for _, c := range s {
- if len(re.Rune) >= cap(re.Rune) {
- // string is too long to fit in Rune0. let Go handle it
- re.Rune = []rune(s)
- break
- }
- re.Rune = append(re.Rune, c)
- }
- return re
- }
- // Parsing.
- // Parse parses a regular expression string s, controlled by the specified
- // Flags, and returns a regular expression parse tree. The syntax is
- // described in the top-level comment.
- func Parse(s string, flags Flags) (*Regexp, error) {
- return parse(s, flags)
- }
- func parse(s string, flags Flags) (_ *Regexp, err error) {
- defer func() {
- switch r := recover(); r {
- default:
- panic(r)
- case nil:
- // ok
- case ErrLarge: // too big
- err = &Error{Code: ErrLarge, Expr: s}
- case ErrNestingDepth:
- err = &Error{Code: ErrNestingDepth, Expr: s}
- }
- }()
- if flags&Literal != 0 {
- // Trivial parser for literal string.
- if err := checkUTF8(s); err != nil {
- return nil, err
- }
- return literalRegexp(s, flags), nil
- }
- // Otherwise, must do real work.
- var (
- p parser
- c rune
- op Op
- lastRepeat string
- )
- p.flags = flags
- p.wholeRegexp = s
- t := s
- for t != "" {
- repeat := ""
- BigSwitch:
- switch t[0] {
- default:
- if c, t, err = nextRune(t); err != nil {
- return nil, err
- }
- p.literal(c)
- case '(':
- if p.flags&PerlX != 0 && len(t) >= 2 && t[1] == '?' {
- // Flag changes and non-capturing groups.
- if t, err = p.parsePerlFlags(t); err != nil {
- return nil, err
- }
- break
- }
- p.numCap++
- p.op(opLeftParen).Cap = p.numCap
- t = t[1:]
- case '|':
- if err = p.parseVerticalBar(); err != nil {
- return nil, err
- }
- t = t[1:]
- case ')':
- if err = p.parseRightParen(); err != nil {
- return nil, err
- }
- t = t[1:]
- case '^':
- if p.flags&OneLine != 0 {
- p.op(OpBeginText)
- } else {
- p.op(OpBeginLine)
- }
- t = t[1:]
- case '$':
- if p.flags&OneLine != 0 {
- p.op(OpEndText).Flags |= WasDollar
- } else {
- p.op(OpEndLine)
- }
- t = t[1:]
- case '.':
- if p.flags&DotNL != 0 {
- p.op(OpAnyChar)
- } else {
- p.op(OpAnyCharNotNL)
- }
- t = t[1:]
- case '[':
- if t, err = p.parseClass(t); err != nil {
- return nil, err
- }
- case '*', '+', '?':
- before := t
- switch t[0] {
- case '*':
- op = OpStar
- case '+':
- op = OpPlus
- case '?':
- op = OpQuest
- }
- after := t[1:]
- if after, err = p.repeat(op, 0, 0, before, after, lastRepeat); err != nil {
- return nil, err
- }
- repeat = before
- t = after
- case '{':
- op = OpRepeat
- before := t
- min, max, after, ok := p.parseRepeat(t)
- if !ok {
- // If the repeat cannot be parsed, { is a literal.
- p.literal('{')
- t = t[1:]
- break
- }
- if min < 0 || min > 1000 || max > 1000 || max >= 0 && min > max {
- // Numbers were too big, or max is present and min > max.
- return nil, &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]}
- }
- if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil {
- return nil, err
- }
- repeat = before
- t = after
- case '\\':
- if p.flags&PerlX != 0 && len(t) >= 2 {
- switch t[1] {
- case 'A':
- p.op(OpBeginText)
- t = t[2:]
- break BigSwitch
- case 'b':
- p.op(OpWordBoundary)
- t = t[2:]
- break BigSwitch
- case 'B':
- p.op(OpNoWordBoundary)
- t = t[2:]
- break BigSwitch
- case 'C':
- // any byte; not supported
- return nil, &Error{ErrInvalidEscape, t[:2]}
- case 'Q':
- // \Q ... \E: the ... is always literals
- var lit string
- lit, t, _ = strings.Cut(t[2:], `\E`)
- for lit != "" {
- c, rest, err := nextRune(lit)
- if err != nil {
- return nil, err
- }
- p.literal(c)
- lit = rest
- }
- break BigSwitch
- case 'z':
- p.op(OpEndText)
- t = t[2:]
- break BigSwitch
- }
- }
- re := p.newRegexp(OpCharClass)
- re.Flags = p.flags
- // Look for Unicode character group like \p{Han}
- if len(t) >= 2 && (t[1] == 'p' || t[1] == 'P') {
- r, rest, err := p.parseUnicodeClass(t, re.Rune0[:0])
- if err != nil {
- return nil, err
- }
- if r != nil {
- re.Rune = r
- t = rest
- p.push(re)
- break BigSwitch
- }
- }
- // Perl character class escape.
- if r, rest := p.parsePerlClassEscape(t, re.Rune0[:0]); r != nil {
- re.Rune = r
- t = rest
- p.push(re)
- break BigSwitch
- }
- p.reuse(re)
- // Ordinary single-character escape.
- if c, t, err = p.parseEscape(t); err != nil {
- return nil, err
- }
- p.literal(c)
- }
- lastRepeat = repeat
- }
- p.concat()
- if p.swapVerticalBar() {
- // pop vertical bar
- p.stack = p.stack[:len(p.stack)-1]
- }
- p.alternate()
- n := len(p.stack)
- if n != 1 {
- return nil, &Error{ErrMissingParen, s}
- }
- return p.stack[0], nil
- }
- // parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}.
- // If s is not of that form, it returns ok == false.
- // If s has the right form but the values are too big, it returns min == -1, ok == true.
- func (p *parser) parseRepeat(s string) (min, max int, rest string, ok bool) {
- if s == "" || s[0] != '{' {
- return
- }
- s = s[1:]
- var ok1 bool
- if min, s, ok1 = p.parseInt(s); !ok1 {
- return
- }
- if s == "" {
- return
- }
- if s[0] != ',' {
- max = min
- } else {
- s = s[1:]
- if s == "" {
- return
- }
- if s[0] == '}' {
- max = -1
- } else if max, s, ok1 = p.parseInt(s); !ok1 {
- return
- } else if max < 0 {
- // parseInt found too big a number
- min = -1
- }
- }
- if s == "" || s[0] != '}' {
- return
- }
- rest = s[1:]
- ok = true
- return
- }
- // parsePerlFlags parses a Perl flag setting or non-capturing group or both,
- // like (?i) or (?: or (?i:. It removes the prefix from s and updates the parse state.
- // The caller must have ensured that s begins with "(?".
- func (p *parser) parsePerlFlags(s string) (rest string, err error) {
- t := s
- // Check for named captures, first introduced in Python's regexp library.
- // As usual, there are three slightly different syntaxes:
- //
- // (?P<name>expr) the original, introduced by Python
- // (?<name>expr) the .NET alteration, adopted by Perl 5.10
- // (?'name'expr) another .NET alteration, adopted by Perl 5.10
- //
- // Perl 5.10 gave in and implemented the Python version too,
- // but they claim that the last two are the preferred forms.
- // PCRE and languages based on it (specifically, PHP and Ruby)
- // support all three as well. EcmaScript 4 uses only the Python form.
- //
- // In both the open source world (via Code Search) and the
- // Google source tree, (?P<expr>name) is the dominant form,
- // so that's the one we implement. One is enough.
- if len(t) > 4 && t[2] == 'P' && t[3] == '<' {
- // Pull out name.
- end := strings.IndexRune(t, '>')
- if end < 0 {
- if err = checkUTF8(t); err != nil {
- return "", err
- }
- return "", &Error{ErrInvalidNamedCapture, s}
- }
- capture := t[:end+1] // "(?P<name>"
- name := t[4:end] // "name"
- if err = checkUTF8(name); err != nil {
- return "", err
- }
- if !isValidCaptureName(name) {
- return "", &Error{ErrInvalidNamedCapture, capture}
- }
- // Like ordinary capture, but named.
- p.numCap++
- re := p.op(opLeftParen)
- re.Cap = p.numCap
- re.Name = name
- return t[end+1:], nil
- }
- // Non-capturing group. Might also twiddle Perl flags.
- var c rune
- t = t[2:] // skip (?
- flags := p.flags
- sign := +1
- sawFlag := false
- Loop:
- for t != "" {
- if c, t, err = nextRune(t); err != nil {
- return "", err
- }
- switch c {
- default:
- break Loop
- // Flags.
- case 'i':
- flags |= FoldCase
- sawFlag = true
- case 'm':
- flags &^= OneLine
- sawFlag = true
- case 's':
- flags |= DotNL
- sawFlag = true
- case 'U':
- flags |= NonGreedy
- sawFlag = true
- // Switch to negation.
- case '-':
- if sign < 0 {
- break Loop
- }
- sign = -1
- // Invert flags so that | above turn into &^ and vice versa.
- // We'll invert flags again before using it below.
- flags = ^flags
- sawFlag = false
- // End of flags, starting group or not.
- case ':', ')':
- if sign < 0 {
- if !sawFlag {
- break Loop
- }
- flags = ^flags
- }
- if c == ':' {
- // Open new group
- p.op(opLeftParen)
- }
- p.flags = flags
- return t, nil
- }
- }
- return "", &Error{ErrInvalidPerlOp, s[:len(s)-len(t)]}
- }
- // isValidCaptureName reports whether name
- // is a valid capture name: [A-Za-z0-9_]+.
- // PCRE limits names to 32 bytes.
- // Python rejects names starting with digits.
- // We don't enforce either of those.
- func isValidCaptureName(name string) bool {
- if name == "" {
- return false
- }
- for _, c := range name {
- if c != '_' && !isalnum(c) {
- return false
- }
- }
- return true
- }
- // parseInt parses a decimal integer.
- func (p *parser) parseInt(s string) (n int, rest string, ok bool) {
- if s == "" || s[0] < '0' || '9' < s[0] {
- return
- }
- // Disallow leading zeros.
- if len(s) >= 2 && s[0] == '0' && '0' <= s[1] && s[1] <= '9' {
- return
- }
- t := s
- for s != "" && '0' <= s[0] && s[0] <= '9' {
- s = s[1:]
- }
- rest = s
- ok = true
- // Have digits, compute value.
- t = t[:len(t)-len(s)]
- for i := 0; i < len(t); i++ {
- // Avoid overflow.
- if n >= 1e8 {
- n = -1
- break
- }
- n = n*10 + int(t[i]) - '0'
- }
- return
- }
- // can this be represented as a character class?
- // single-rune literal string, char class, ., and .|\n.
- func isCharClass(re *Regexp) bool {
- return re.Op == OpLiteral && len(re.Rune) == 1 ||
- re.Op == OpCharClass ||
- re.Op == OpAnyCharNotNL ||
- re.Op == OpAnyChar
- }
- // does re match r?
- func matchRune(re *Regexp, r rune) bool {
- switch re.Op {
- case OpLiteral:
- return len(re.Rune) == 1 && re.Rune[0] == r
- case OpCharClass:
- for i := 0; i < len(re.Rune); i += 2 {
- if re.Rune[i] <= r && r <= re.Rune[i+1] {
- return true
- }
- }
- return false
- case OpAnyCharNotNL:
- return r != '\n'
- case OpAnyChar:
- return true
- }
- return false
- }
- // parseVerticalBar handles a | in the input.
- func (p *parser) parseVerticalBar() error {
- p.concat()
- // The concatenation we just parsed is on top of the stack.
- // If it sits above an opVerticalBar, swap it below
- // (things below an opVerticalBar become an alternation).
- // Otherwise, push a new vertical bar.
- if !p.swapVerticalBar() {
- p.op(opVerticalBar)
- }
- return nil
- }
- // mergeCharClass makes dst = dst|src.
- // The caller must ensure that dst.Op >= src.Op,
- // to reduce the amount of copying.
- func mergeCharClass(dst, src *Regexp) {
- switch dst.Op {
- case OpAnyChar:
- // src doesn't add anything.
- case OpAnyCharNotNL:
- // src might add \n
- if matchRune(src, '\n') {
- dst.Op = OpAnyChar
- }
- case OpCharClass:
- // src is simpler, so either literal or char class
- if src.Op == OpLiteral {
- dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
- } else {
- dst.Rune = appendClass(dst.Rune, src.Rune)
- }
- case OpLiteral:
- // both literal
- if src.Rune[0] == dst.Rune[0] && src.Flags == dst.Flags {
- break
- }
- dst.Op = OpCharClass
- dst.Rune = appendLiteral(dst.Rune[:0], dst.Rune[0], dst.Flags)
- dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
- }
- }
- // If the top of the stack is an element followed by an opVerticalBar
- // swapVerticalBar swaps the two and returns true.
- // Otherwise it returns false.
- func (p *parser) swapVerticalBar() bool {
- // If above and below vertical bar are literal or char class,
- // can merge into a single char class.
- n := len(p.stack)
- if n >= 3 && p.stack[n-2].Op == opVerticalBar && isCharClass(p.stack[n-1]) && isCharClass(p.stack[n-3]) && p.optimizations() {
- re1 := p.stack[n-1]
- re3 := p.stack[n-3]
- // Make re3 the more complex of the two.
- if re1.Op > re3.Op {
- re1, re3 = re3, re1
- p.stack[n-3] = re3
- }
- mergeCharClass(re3, re1)
- p.reuse(re1)
- p.stack = p.stack[:n-1]
- return true
- }
- if n >= 2 {
- re1 := p.stack[n-1]
- re2 := p.stack[n-2]
- if re2.Op == opVerticalBar {
- if n >= 3 {
- // Now out of reach.
- // Clean opportunistically.
- cleanAlt(p.stack[n-3])
- }
- p.stack[n-2] = re1
- p.stack[n-1] = re2
- return true
- }
- }
- return false
- }
- // parseRightParen handles a ) in the input.
- func (p *parser) parseRightParen() error {
- p.concat()
- if p.swapVerticalBar() {
- // pop vertical bar
- p.stack = p.stack[:len(p.stack)-1]
- }
- p.alternate()
- n := len(p.stack)
- if n < 2 {
- return &Error{ErrUnexpectedParen, p.wholeRegexp}
- }
- re1 := p.stack[n-1]
- re2 := p.stack[n-2]
- p.stack = p.stack[:n-2]
- if re2.Op != opLeftParen {
- return &Error{ErrUnexpectedParen, p.wholeRegexp}
- }
- // Restore flags at time of paren.
- p.flags = re2.Flags
- if re2.Cap == 0 {
- // Just for grouping.
- p.push(re1)
- } else {
- re2.Op = OpCapture
- re2.Sub = re2.Sub0[:1]
- re2.Sub[0] = re1
- p.push(re2)
- }
- return nil
- }
- // parseEscape parses an escape sequence at the beginning of s
- // and returns the rune.
- func (p *parser) parseEscape(s string) (r rune, rest string, err error) {
- t := s[1:]
- if t == "" {
- return 0, "", &Error{ErrTrailingBackslash, ""}
- }
- c, t, err := nextRune(t)
- if err != nil {
- return 0, "", err
- }
- Switch:
- switch c {
- default:
- if c < utf8.RuneSelf && !isalnum(c) {
- // Escaped non-word characters are always themselves.
- // PCRE is not quite so rigorous: it accepts things like
- // \q, but we don't. We once rejected \_, but too many
- // programs and people insist on using it, so allow \_.
- return c, t, nil
- }
- // Octal escapes.
- case '1', '2', '3', '4', '5', '6', '7':
- // Single non-zero digit is a backreference; not supported
- if t == "" || t[0] < '0' || t[0] > '7' {
- break
- }
- fallthrough
- case '0':
- // Consume up to three octal digits; already have one.
- r = c - '0'
- for i := 1; i < 3; i++ {
- if t == "" || t[0] < '0' || t[0] > '7' {
- break
- }
- r = r*8 + rune(t[0]) - '0'
- t = t[1:]
- }
- return r, t, nil
- // Hexadecimal escapes.
- case 'x':
- if t == "" {
- break
- }
- if c, t, err = nextRune(t); err != nil {
- return 0, "", err
- }
- if c == '{' {
- // Any number of digits in braces.
- // Perl accepts any text at all; it ignores all text
- // after the first non-hex digit. We require only hex digits,
- // and at least one.
- nhex := 0
- r = 0
- for {
- if t == "" {
- break Switch
- }
- if c, t, err = nextRune(t); err != nil {
- return 0, "", err
- }
- if c == '}' {
- break
- }
- v := unhex(c)
- if v < 0 {
- break Switch
- }
- r = r*16 + v
- if r > unicode.MaxRune {
- break Switch
- }
- nhex++
- }
- if nhex == 0 {
- break Switch
- }
- return r, t, nil
- }
- // Easy case: two hex digits.
- x := unhex(c)
- if c, t, err = nextRune(t); err != nil {
- return 0, "", err
- }
- y := unhex(c)
- if x < 0 || y < 0 {
- break
- }
- return x*16 + y, t, nil
- // C escapes. There is no case 'b', to avoid misparsing
- // the Perl word-boundary \b as the C backspace \b
- // when in POSIX mode. In Perl, /\b/ means word-boundary
- // but /[\b]/ means backspace. We don't support that.
- // If you want a backspace, embed a literal backspace
- // character or use \x08.
- case 'a':
- return '\a', t, err
- case 'f':
- return '\f', t, err
- case 'n':
- return '\n', t, err
- case 'r':
- return '\r', t, err
- case 't':
- return '\t', t, err
- case 'v':
- return '\v', t, err
- }
- return 0, "", &Error{ErrInvalidEscape, s[:len(s)-len(t)]}
- }
- // parseClassChar parses a character class character at the beginning of s
- // and returns it.
- func (p *parser) parseClassChar(s, wholeClass string) (r rune, rest string, err error) {
- if s == "" {
- return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass}
- }
- // Allow regular escape sequences even though
- // many need not be escaped in this context.
- if s[0] == '\\' {
- return p.parseEscape(s)
- }
- return nextRune(s)
- }
- type charGroup struct {
- sign int
- class []rune
- }
- // parsePerlClassEscape parses a leading Perl character class escape like \d
- // from the beginning of s. If one is present, it appends the characters to r
- // and returns the new slice r and the remainder of the string.
- func (p *parser) parsePerlClassEscape(s string, r []rune) (out []rune, rest string) {
- if p.flags&PerlX == 0 || len(s) < 2 || s[0] != '\\' {
- return
- }
- g := perlGroup[s[0:2]]
- if g.sign == 0 {
- return
- }
- return p.appendGroup(r, g), s[2:]
- }
- // parseNamedClass parses a leading POSIX named character class like [:alnum:]
- // from the beginning of s. If one is present, it appends the characters to r
- // and returns the new slice r and the remainder of the string.
- func (p *parser) parseNamedClass(s string, r []rune) (out []rune, rest string, err error) {
- if len(s) < 2 || s[0] != '[' || s[1] != ':' {
- return
- }
- i := strings.Index(s[2:], ":]")
- if i < 0 {
- return
- }
- i += 2
- name, s := s[0:i+2], s[i+2:]
- g := posixGroup[name]
- if g.sign == 0 {
- return nil, "", &Error{ErrInvalidCharRange, name}
- }
- return p.appendGroup(r, g), s, nil
- }
- func (p *parser) appendGroup(r []rune, g charGroup) []rune {
- if p.flags&FoldCase == 0 {
- if g.sign < 0 {
- r = appendNegatedClass(r, g.class)
- } else {
- r = appendClass(r, g.class)
- }
- } else {
- tmp := p.tmpClass[:0]
- tmp = appendFoldedClass(tmp, g.class)
- p.tmpClass = tmp
- tmp = cleanClass(&p.tmpClass)
- if g.sign < 0 {
- r = appendNegatedClass(r, tmp)
- } else {
- r = appendClass(r, tmp)
- }
- }
- return r
- }
- var anyTable = &unicode.RangeTable{
- R16: []unicode.Range16{{Lo: 0, Hi: 1<<16 - 1, Stride: 1}},
- R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}},
- }
- // unicodeTable returns the unicode.RangeTable identified by name
- // and the table of additional fold-equivalent code points.
- func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) {
- // Special case: "Any" means any.
- if name == "Any" {
- return anyTable, anyTable
- }
- if t := unicode.Categories[name]; t != nil {
- return t, unicode.FoldCategory[name]
- }
- if t := unicode.Scripts[name]; t != nil {
- return t, unicode.FoldScript[name]
- }
- return nil, nil
- }
- // parseUnicodeClass parses a leading Unicode character class like \p{Han}
- // from the beginning of s. If one is present, it appends the characters to r
- // and returns the new slice r and the remainder of the string.
- func (p *parser) parseUnicodeClass(s string, r []rune) (out []rune, rest string, err error) {
- if p.flags&UnicodeGroups == 0 || len(s) < 2 || s[0] != '\\' || s[1] != 'p' && s[1] != 'P' {
- return
- }
- // Committed to parse or return error.
- sign := +1
- if s[1] == 'P' {
- sign = -1
- }
- t := s[2:]
- c, t, err := nextRune(t)
- if err != nil {
- return
- }
- var seq, name string
- if c != '{' {
- // Single-letter name.
- seq = s[:len(s)-len(t)]
- name = seq[2:]
- } else {
- // Name is in braces.
- end := strings.IndexRune(s, '}')
- if end < 0 {
- if err = checkUTF8(s); err != nil {
- return
- }
- return nil, "", &Error{ErrInvalidCharRange, s}
- }
- seq, t = s[:end+1], s[end+1:]
- name = s[3:end]
- if err = checkUTF8(name); err != nil {
- return
- }
- }
- // Group can have leading negation too. \p{^Han} == \P{Han}, \P{^Han} == \p{Han}.
- if name != "" && name[0] == '^' {
- sign = -sign
- name = name[1:]
- }
- tab, fold := unicodeTable(name)
- if tab == nil {
- return nil, "", &Error{ErrInvalidCharRange, seq}
- }
- if p.flags&FoldCase == 0 || fold == nil {
- if sign > 0 {
- r = appendTable(r, tab)
- } else {
- r = appendNegatedTable(r, tab)
- }
- } else {
- // Merge and clean tab and fold in a temporary buffer.
- // This is necessary for the negative case and just tidy
- // for the positive case.
- tmp := p.tmpClass[:0]
- tmp = appendTable(tmp, tab)
- tmp = appendTable(tmp, fold)
- p.tmpClass = tmp
- tmp = cleanClass(&p.tmpClass)
- if sign > 0 {
- r = appendClass(r, tmp)
- } else {
- r = appendNegatedClass(r, tmp)
- }
- }
- return r, t, nil
- }
- // parseClass parses a character class at the beginning of s
- // and pushes it onto the parse stack.
- func (p *parser) parseClass(s string) (rest string, err error) {
- t := s[1:] // chop [
- re := p.newRegexp(OpCharClass)
- re.Flags = p.flags
- re.Rune = re.Rune0[:0]
- sign := +1
- if t != "" && t[0] == '^' {
- sign = -1
- t = t[1:]
- // If character class does not match \n, add it here,
- // so that negation later will do the right thing.
- if p.flags&ClassNL == 0 {
- re.Rune = append(re.Rune, '\n', '\n')
- }
- }
- class := re.Rune
- first := true // ] and - are okay as first char in class
- for t == "" || t[0] != ']' || first {
- // POSIX: - is only okay unescaped as first or last in class.
- // Perl: - is okay anywhere.
- if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') {
- _, size := utf8.DecodeRuneInString(t[1:])
- return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]}
- }
- first = false
- // Look for POSIX [:alnum:] etc.
- if len(t) > 2 && t[0] == '[' && t[1] == ':' {
- nclass, nt, err := p.parseNamedClass(t, class)
- if err != nil {
- return "", err
- }
- if nclass != nil {
- class, t = nclass, nt
- continue
- }
- }
- // Look for Unicode character group like \p{Han}.
- nclass, nt, err := p.parseUnicodeClass(t, class)
- if err != nil {
- return "", err
- }
- if nclass != nil {
- class, t = nclass, nt
- continue
- }
- // Look for Perl character class symbols (extension).
- if nclass, nt := p.parsePerlClassEscape(t, class); nclass != nil {
- class, t = nclass, nt
- continue
- }
- // Single character or simple range.
- rng := t
- var lo, hi rune
- if lo, t, err = p.parseClassChar(t, s); err != nil {
- return "", err
- }
- hi = lo
- // [a-] means (a|-) so check for final ].
- if len(t) >= 2 && t[0] == '-' && t[1] != ']' {
- t = t[1:]
- if hi, t, err = p.parseClassChar(t, s); err != nil {
- return "", err
- }
- if hi < lo {
- rng = rng[:len(rng)-len(t)]
- return "", &Error{Code: ErrInvalidCharRange, Expr: rng}
- }
- }
- if p.flags&FoldCase == 0 {
- class = appendRange(class, lo, hi)
- } else {
- class = appendFoldedRange(class, lo, hi)
- }
- }
- t = t[1:] // chop ]
- // Use &re.Rune instead of &class to avoid allocation.
- re.Rune = class
- class = cleanClass(&re.Rune)
- if sign < 0 {
- class = negateClass(class)
- }
- re.Rune = class
- p.push(re)
- return t, nil
- }
- // cleanClass sorts the ranges (pairs of elements of r),
- // merges them, and eliminates duplicates.
- func cleanClass(rp *[]rune) []rune {
- // Sort by lo increasing, hi decreasing to break ties.
- sort.Sort(ranges{rp})
- r := *rp
- if len(r) < 2 {
- return r
- }
- // Merge abutting, overlapping.
- w := 2 // write index
- for i := 2; i < len(r); i += 2 {
- lo, hi := r[i], r[i+1]
- if lo <= r[w-1]+1 {
- // merge with previous range
- if hi > r[w-1] {
- r[w-1] = hi
- }
- continue
- }
- // new disjoint range
- r[w] = lo
- r[w+1] = hi
- w += 2
- }
- return r[:w]
- }
- // appendLiteral returns the result of appending the literal x to the class r.
- func appendLiteral(r []rune, x rune, flags Flags) []rune {
- if flags&FoldCase != 0 {
- return appendFoldedRange(r, x, x)
- }
- return appendRange(r, x, x)
- }
- // appendRange returns the result of appending the range lo-hi to the class r.
- func appendRange(r []rune, lo, hi rune) []rune {
- // Expand last range or next to last range if it overlaps or abuts.
- // Checking two ranges helps when appending case-folded
- // alphabets, so that one range can be expanding A-Z and the
- // other expanding a-z.
- n := len(r)
- for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4
- if n >= i {
- rlo, rhi := r[n-i], r[n-i+1]
- if lo <= rhi+1 && rlo <= hi+1 {
- if lo < rlo {
- r[n-i] = lo
- }
- if hi > rhi {
- r[n-i+1] = hi
- }
- return r
- }
- }
- }
- return append(r, lo, hi)
- }
- const (
- // minimum and maximum runes involved in folding.
- // checked during test.
- minFold = 0x0041
- maxFold = 0x1e943
- )
- // appendFoldedRange returns the result of appending the range lo-hi
- // and its case folding-equivalent runes to the class r.
- func appendFoldedRange(r []rune, lo, hi rune) []rune {
- // Optimizations.
- if lo <= minFold && hi >= maxFold {
- // Range is full: folding can't add more.
- return appendRange(r, lo, hi)
- }
- if hi < minFold || lo > maxFold {
- // Range is outside folding possibilities.
- return appendRange(r, lo, hi)
- }
- if lo < minFold {
- // [lo, minFold-1] needs no folding.
- r = appendRange(r, lo, minFold-1)
- lo = minFold
- }
- if hi > maxFold {
- // [maxFold+1, hi] needs no folding.
- r = appendRange(r, maxFold+1, hi)
- hi = maxFold
- }
- // Brute force. Depend on appendRange to coalesce ranges on the fly.
- for c := lo; c <= hi; c++ {
- r = appendRange(r, c, c)
- f := unicode.SimpleFold(c)
- for f != c {
- r = appendRange(r, f, f)
- f = unicode.SimpleFold(f)
- }
- }
- return r
- }
- // appendClass returns the result of appending the class x to the class r.
- // It assume x is clean.
- func appendClass(r []rune, x []rune) []rune {
- for i := 0; i < len(x); i += 2 {
- r = appendRange(r, x[i], x[i+1])
- }
- return r
- }
- // appendFoldedClass returns the result of appending the case folding of the class x to the class r.
- func appendFoldedClass(r []rune, x []rune) []rune {
- for i := 0; i < len(x); i += 2 {
- r = appendFoldedRange(r, x[i], x[i+1])
- }
- return r
- }
- // appendNegatedClass returns the result of appending the negation of the class x to the class r.
- // It assumes x is clean.
- func appendNegatedClass(r []rune, x []rune) []rune {
- nextLo := '\u0000'
- for i := 0; i < len(x); i += 2 {
- lo, hi := x[i], x[i+1]
- if nextLo <= lo-1 {
- r = appendRange(r, nextLo, lo-1)
- }
- nextLo = hi + 1
- }
- if nextLo <= unicode.MaxRune {
- r = appendRange(r, nextLo, unicode.MaxRune)
- }
- return r
- }
- // appendTable returns the result of appending x to the class r.
- func appendTable(r []rune, x *unicode.RangeTable) []rune {
- for _, xr := range x.R16 {
- lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
- if stride == 1 {
- r = appendRange(r, lo, hi)
- continue
- }
- for c := lo; c <= hi; c += stride {
- r = appendRange(r, c, c)
- }
- }
- for _, xr := range x.R32 {
- lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
- if stride == 1 {
- r = appendRange(r, lo, hi)
- continue
- }
- for c := lo; c <= hi; c += stride {
- r = appendRange(r, c, c)
- }
- }
- return r
- }
- // appendNegatedTable returns the result of appending the negation of x to the class r.
- func appendNegatedTable(r []rune, x *unicode.RangeTable) []rune {
- nextLo := '\u0000' // lo end of next class to add
- for _, xr := range x.R16 {
- lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
- if stride == 1 {
- if nextLo <= lo-1 {
- r = appendRange(r, nextLo, lo-1)
- }
- nextLo = hi + 1
- continue
- }
- for c := lo; c <= hi; c += stride {
- if nextLo <= c-1 {
- r = appendRange(r, nextLo, c-1)
- }
- nextLo = c + 1
- }
- }
- for _, xr := range x.R32 {
- lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
- if stride == 1 {
- if nextLo <= lo-1 {
- r = appendRange(r, nextLo, lo-1)
- }
- nextLo = hi + 1
- continue
- }
- for c := lo; c <= hi; c += stride {
- if nextLo <= c-1 {
- r = appendRange(r, nextLo, c-1)
- }
- nextLo = c + 1
- }
- }
- if nextLo <= unicode.MaxRune {
- r = appendRange(r, nextLo, unicode.MaxRune)
- }
- return r
- }
- // negateClass overwrites r and returns r's negation.
- // It assumes the class r is already clean.
- func negateClass(r []rune) []rune {
- nextLo := '\u0000' // lo end of next class to add
- w := 0 // write index
- for i := 0; i < len(r); i += 2 {
- lo, hi := r[i], r[i+1]
- if nextLo <= lo-1 {
- r[w] = nextLo
- r[w+1] = lo - 1
- w += 2
- }
- nextLo = hi + 1
- }
- r = r[:w]
- if nextLo <= unicode.MaxRune {
- // It's possible for the negation to have one more
- // range - this one - than the original class, so use append.
- r = append(r, nextLo, unicode.MaxRune)
- }
- return r
- }
- // ranges implements sort.Interface on a []rune.
- // The choice of receiver type definition is strange
- // but avoids an allocation since we already have
- // a *[]rune.
- type ranges struct {
- p *[]rune
- }
- func (ra ranges) Less(i, j int) bool {
- p := *ra.p
- i *= 2
- j *= 2
- return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1]
- }
- func (ra ranges) Len() int {
- return len(*ra.p) / 2
- }
- func (ra ranges) Swap(i, j int) {
- p := *ra.p
- i *= 2
- j *= 2
- p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1]
- }
- func checkUTF8(s string) error {
- for s != "" {
- rune, size := utf8.DecodeRuneInString(s)
- if rune == utf8.RuneError && size == 1 {
- return &Error{Code: ErrInvalidUTF8, Expr: s}
- }
- s = s[size:]
- }
- return nil
- }
- func nextRune(s string) (c rune, t string, err error) {
- c, size := utf8.DecodeRuneInString(s)
- if c == utf8.RuneError && size == 1 {
- return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s}
- }
- return c, s[size:], nil
- }
- func isalnum(c rune) bool {
- return '0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
- }
- func unhex(c rune) rune {
- if '0' <= c && c <= '9' {
- return c - '0'
- }
- if 'a' <= c && c <= 'f' {
- return c - 'a' + 10
- }
- if 'A' <= c && c <= 'F' {
- return c - 'A' + 10
- }
- return -1
- }
|