parse.go 52 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126
  1. // Copyright 2011 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the GO-LICENSE file.
  4. // Modifications of this file, if any, are
  5. //
  6. // Copyright 2023 The Regexp Authors. All rights reserved.
  7. // Use of this source code is governed by a BSD-style
  8. // license that can be found in the LICENSE file.
  9. package syntax // modernc.org/regexp/syntax
  10. import (
  11. "sort"
  12. "strings"
  13. "unicode"
  14. "unicode/utf8"
  15. )
  16. // An Error describes a failure to parse a regular expression
  17. // and gives the offending expression.
  18. type Error struct {
  19. Code ErrorCode
  20. Expr string
  21. }
  22. func (e *Error) Error() string {
  23. return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`"
  24. }
  25. // An ErrorCode describes a failure to parse a regular expression.
  26. type ErrorCode string
  27. const (
  28. // Unexpected error
  29. ErrInternalError ErrorCode = "regexp/syntax: internal error"
  30. // Parse errors
  31. ErrInvalidCharClass ErrorCode = "invalid character class"
  32. ErrInvalidCharRange ErrorCode = "invalid character class range"
  33. ErrInvalidEscape ErrorCode = "invalid escape sequence"
  34. ErrInvalidNamedCapture ErrorCode = "invalid named capture"
  35. ErrInvalidPerlOp ErrorCode = "invalid or unsupported Perl syntax"
  36. ErrInvalidRepeatOp ErrorCode = "invalid nested repetition operator"
  37. ErrInvalidRepeatSize ErrorCode = "invalid repeat count"
  38. ErrInvalidUTF8 ErrorCode = "invalid UTF-8"
  39. ErrMissingBracket ErrorCode = "missing closing ]"
  40. ErrMissingParen ErrorCode = "missing closing )"
  41. ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator"
  42. ErrTrailingBackslash ErrorCode = "trailing backslash at end of expression"
  43. ErrUnexpectedParen ErrorCode = "unexpected )"
  44. ErrNestingDepth ErrorCode = "expression nests too deeply"
  45. ErrLarge ErrorCode = "expression too large"
  46. )
  47. func (e ErrorCode) String() string {
  48. return string(e)
  49. }
  50. // Flags control the behavior of the parser and record information about regexp context.
  51. type Flags uint16
  52. const (
  53. FoldCase Flags = 1 << iota // case-insensitive match
  54. Literal // treat pattern as literal string
  55. ClassNL // allow character classes like [^a-z] and [[:space:]] to match newline
  56. DotNL // allow . to match newline
  57. OneLine // treat ^ and $ as only matching at beginning and end of text
  58. NonGreedy // make repetition operators default to non-greedy
  59. PerlX // allow Perl extensions
  60. UnicodeGroups // allow \p{Han}, \P{Han} for Unicode group and negation
  61. WasDollar // regexp OpEndText was $, not \z
  62. Simple // regexp contains no counted repetition
  63. DisableOptimizations // can help constructing a DFA later
  64. MatchNL = ClassNL | DotNL
  65. Perl = ClassNL | OneLine | PerlX | UnicodeGroups // as close to Perl as possible
  66. POSIX Flags = 0 // POSIX syntax
  67. )
  68. // Pseudo-ops for parsing stack.
  69. const (
  70. opLeftParen = opPseudo + iota
  71. opVerticalBar
  72. )
  73. // maxHeight is the maximum height of a regexp parse tree.
  74. // It is somewhat arbitrarily chosen, but the idea is to be large enough
  75. // that no one will actually hit in real use but at the same time small enough
  76. // that recursion on the Regexp tree will not hit the 1GB Go stack limit.
  77. // The maximum amount of stack for a single recursive frame is probably
  78. // closer to 1kB, so this could potentially be raised, but it seems unlikely
  79. // that people have regexps nested even this deeply.
  80. // We ran a test on Google's C++ code base and turned up only
  81. // a single use case with depth > 100; it had depth 128.
  82. // Using depth 1000 should be plenty of margin.
  83. // As an optimization, we don't even bother calculating heights
  84. // until we've allocated at least maxHeight Regexp structures.
  85. const maxHeight = 1000
  86. // maxSize is the maximum size of a compiled regexp in Insts.
  87. // It too is somewhat arbitrarily chosen, but the idea is to be large enough
  88. // to allow significant regexps while at the same time small enough that
  89. // the compiled form will not take up too much memory.
  90. // 128 MB is enough for a 3.3 million Inst structures, which roughly
  91. // corresponds to a 3.3 MB regexp.
  92. const (
  93. maxSize = 128 << 20 / instSize
  94. instSize = 5 * 8 // byte, 2 uint32, slice is 5 64-bit words
  95. )
  96. // maxRunes is the maximum number of runes allowed in a regexp tree
  97. // counting the runes in all the nodes.
  98. // Ignoring character classes p.numRunes is always less than the length of the regexp.
  99. // Character classes can make it much larger: each \pL adds 1292 runes.
  100. // 128 MB is enough for 32M runes, which is over 26k \pL instances.
  101. // Note that repetitions do not make copies of the rune slices,
  102. // so \pL{1000} is only one rune slice, not 1000.
  103. // We could keep a cache of character classes we've seen,
  104. // so that all the \pL we see use the same rune list,
  105. // but that doesn't remove the problem entirely:
  106. // consider something like [\pL01234][\pL01235][\pL01236]...[\pL^&*()].
  107. // And because the Rune slice is exposed directly in the Regexp,
  108. // there is not an opportunity to change the representation to allow
  109. // partial sharing between different character classes.
  110. // So the limit is the best we can do.
  111. const (
  112. maxRunes = 128 << 20 / runeSize
  113. runeSize = 4 // rune is int32
  114. )
  115. type parser struct {
  116. flags Flags // parse mode flags
  117. stack []*Regexp // stack of parsed expressions
  118. free *Regexp
  119. numCap int // number of capturing groups seen
  120. wholeRegexp string
  121. tmpClass []rune // temporary char class work space
  122. numRegexp int // number of regexps allocated
  123. numRunes int // number of runes in char classes
  124. repeats int64 // product of all repetitions seen
  125. height map[*Regexp]int // regexp height, for height limit check
  126. size map[*Regexp]int64 // regexp compiled size, for size limit check
  127. }
  128. func (p *parser) optimizations() bool { return p.flags&DisableOptimizations == 0 }
  129. func (p *parser) newRegexp(op Op) *Regexp {
  130. re := p.free
  131. if re != nil {
  132. p.free = re.Sub0[0]
  133. *re = Regexp{}
  134. } else {
  135. re = new(Regexp)
  136. p.numRegexp++
  137. }
  138. re.Op = op
  139. return re
  140. }
  141. func (p *parser) reuse(re *Regexp) {
  142. if p.height != nil {
  143. delete(p.height, re)
  144. }
  145. re.Sub0[0] = p.free
  146. p.free = re
  147. }
  148. func (p *parser) checkLimits(re *Regexp) {
  149. if p.numRunes > maxRunes {
  150. panic(ErrLarge)
  151. }
  152. p.checkSize(re)
  153. p.checkHeight(re)
  154. }
  155. func (p *parser) checkSize(re *Regexp) {
  156. if p.size == nil {
  157. // We haven't started tracking size yet.
  158. // Do a relatively cheap check to see if we need to start.
  159. // Maintain the product of all the repeats we've seen
  160. // and don't track if the total number of regexp nodes
  161. // we've seen times the repeat product is in budget.
  162. if p.repeats == 0 {
  163. p.repeats = 1
  164. }
  165. if re.Op == OpRepeat {
  166. n := re.Max
  167. if n == -1 {
  168. n = re.Min
  169. }
  170. if n <= 0 {
  171. n = 1
  172. }
  173. if int64(n) > maxSize/p.repeats {
  174. p.repeats = maxSize
  175. } else {
  176. p.repeats *= int64(n)
  177. }
  178. }
  179. if int64(p.numRegexp) < maxSize/p.repeats {
  180. return
  181. }
  182. // We need to start tracking size.
  183. // Make the map and belatedly populate it
  184. // with info about everything we've constructed so far.
  185. p.size = make(map[*Regexp]int64)
  186. for _, re := range p.stack {
  187. p.checkSize(re)
  188. }
  189. }
  190. if p.calcSize(re, true) > maxSize {
  191. panic(ErrLarge)
  192. }
  193. }
  194. func (p *parser) calcSize(re *Regexp, force bool) int64 {
  195. if !force {
  196. if size, ok := p.size[re]; ok {
  197. return size
  198. }
  199. }
  200. var size int64
  201. switch re.Op {
  202. case OpLiteral:
  203. size = int64(len(re.Rune))
  204. case OpCapture, OpStar:
  205. // star can be 1+ or 2+; assume 2 pessimistically
  206. size = 2 + p.calcSize(re.Sub[0], false)
  207. case OpPlus, OpQuest:
  208. size = 1 + p.calcSize(re.Sub[0], false)
  209. case OpConcat:
  210. for _, sub := range re.Sub {
  211. size += p.calcSize(sub, false)
  212. }
  213. case OpAlternate:
  214. for _, sub := range re.Sub {
  215. size += p.calcSize(sub, false)
  216. }
  217. if len(re.Sub) > 1 {
  218. size += int64(len(re.Sub)) - 1
  219. }
  220. case OpRepeat:
  221. sub := p.calcSize(re.Sub[0], false)
  222. if re.Max == -1 {
  223. if re.Min == 0 {
  224. size = 2 + sub // x*
  225. } else {
  226. size = 1 + int64(re.Min)*sub // xxx+
  227. }
  228. break
  229. }
  230. // x{2,5} = xx(x(x(x)?)?)?
  231. size = int64(re.Max)*sub + int64(re.Max-re.Min)
  232. }
  233. if size < 1 {
  234. size = 1
  235. }
  236. p.size[re] = size
  237. return size
  238. }
  239. func (p *parser) checkHeight(re *Regexp) {
  240. if p.numRegexp < maxHeight {
  241. return
  242. }
  243. if p.height == nil {
  244. p.height = make(map[*Regexp]int)
  245. for _, re := range p.stack {
  246. p.checkHeight(re)
  247. }
  248. }
  249. if p.calcHeight(re, true) > maxHeight {
  250. panic(ErrNestingDepth)
  251. }
  252. }
  253. func (p *parser) calcHeight(re *Regexp, force bool) int {
  254. if !force {
  255. if h, ok := p.height[re]; ok {
  256. return h
  257. }
  258. }
  259. h := 1
  260. for _, sub := range re.Sub {
  261. hsub := p.calcHeight(sub, false)
  262. if h < 1+hsub {
  263. h = 1 + hsub
  264. }
  265. }
  266. p.height[re] = h
  267. return h
  268. }
  269. // Parse stack manipulation.
  270. // push pushes the regexp re onto the parse stack and returns the regexp.
  271. func (p *parser) push(re *Regexp) *Regexp {
  272. p.numRunes += len(re.Rune)
  273. if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] {
  274. // Single rune.
  275. if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) {
  276. return nil
  277. }
  278. re.Op = OpLiteral
  279. re.Rune = re.Rune[:1]
  280. re.Flags = p.flags &^ FoldCase
  281. } else if re.Op == OpCharClass && len(re.Rune) == 4 &&
  282. re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] &&
  283. unicode.SimpleFold(re.Rune[0]) == re.Rune[2] &&
  284. unicode.SimpleFold(re.Rune[2]) == re.Rune[0] ||
  285. re.Op == OpCharClass && len(re.Rune) == 2 &&
  286. re.Rune[0]+1 == re.Rune[1] &&
  287. unicode.SimpleFold(re.Rune[0]) == re.Rune[1] &&
  288. unicode.SimpleFold(re.Rune[1]) == re.Rune[0] {
  289. // Case-insensitive rune like [Aa] or [Δδ].
  290. if p.maybeConcat(re.Rune[0], p.flags|FoldCase) {
  291. return nil
  292. }
  293. // Rewrite as (case-insensitive) literal.
  294. re.Op = OpLiteral
  295. re.Rune = re.Rune[:1]
  296. re.Flags = p.flags | FoldCase
  297. } else {
  298. // Incremental concatenation.
  299. p.maybeConcat(-1, 0)
  300. }
  301. p.stack = append(p.stack, re)
  302. p.checkLimits(re)
  303. return re
  304. }
  305. // maybeConcat implements incremental concatenation
  306. // of literal runes into string nodes. The parser calls this
  307. // before each push, so only the top fragment of the stack
  308. // might need processing. Since this is called before a push,
  309. // the topmost literal is no longer subject to operators like *
  310. // (Otherwise ab* would turn into (ab)*.)
  311. // If r >= 0 and there's a node left over, maybeConcat uses it
  312. // to push r with the given flags.
  313. // maybeConcat reports whether r was pushed.
  314. func (p *parser) maybeConcat(r rune, flags Flags) bool {
  315. n := len(p.stack)
  316. if n < 2 {
  317. return false
  318. }
  319. re1 := p.stack[n-1]
  320. re2 := p.stack[n-2]
  321. if re1.Op != OpLiteral || re2.Op != OpLiteral || re1.Flags&FoldCase != re2.Flags&FoldCase {
  322. return false
  323. }
  324. // Push re1 into re2.
  325. re2.Rune = append(re2.Rune, re1.Rune...)
  326. // Reuse re1 if possible.
  327. if r >= 0 {
  328. re1.Rune = re1.Rune0[:1]
  329. re1.Rune[0] = r
  330. re1.Flags = flags
  331. return true
  332. }
  333. p.stack = p.stack[:n-1]
  334. p.reuse(re1)
  335. return false // did not push r
  336. }
  337. // literal pushes a literal regexp for the rune r on the stack.
  338. func (p *parser) literal(r rune) {
  339. re := p.newRegexp(OpLiteral)
  340. re.Flags = p.flags
  341. if p.flags&FoldCase != 0 {
  342. r = minFoldRune(r)
  343. }
  344. re.Rune0[0] = r
  345. re.Rune = re.Rune0[:1]
  346. p.push(re)
  347. }
  348. // minFoldRune returns the minimum rune fold-equivalent to r.
  349. func minFoldRune(r rune) rune {
  350. if r < minFold || r > maxFold {
  351. return r
  352. }
  353. min := r
  354. r0 := r
  355. for r = unicode.SimpleFold(r); r != r0; r = unicode.SimpleFold(r) {
  356. if min > r {
  357. min = r
  358. }
  359. }
  360. return min
  361. }
  362. // op pushes a regexp with the given op onto the stack
  363. // and returns that regexp.
  364. func (p *parser) op(op Op) *Regexp {
  365. re := p.newRegexp(op)
  366. re.Flags = p.flags
  367. return p.push(re)
  368. }
  369. // repeat replaces the top stack element with itself repeated according to op, min, max.
  370. // before is the regexp suffix starting at the repetition operator.
  371. // after is the regexp suffix following after the repetition operator.
  372. // repeat returns an updated 'after' and an error, if any.
  373. func (p *parser) repeat(op Op, min, max int, before, after, lastRepeat string) (string, error) {
  374. flags := p.flags
  375. if p.flags&PerlX != 0 {
  376. if len(after) > 0 && after[0] == '?' {
  377. after = after[1:]
  378. flags ^= NonGreedy
  379. }
  380. if lastRepeat != "" {
  381. // In Perl it is not allowed to stack repetition operators:
  382. // a** is a syntax error, not a doubled star, and a++ means
  383. // something else entirely, which we don't support!
  384. return "", &Error{ErrInvalidRepeatOp, lastRepeat[:len(lastRepeat)-len(after)]}
  385. }
  386. }
  387. n := len(p.stack)
  388. if n == 0 {
  389. return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}
  390. }
  391. sub := p.stack[n-1]
  392. if sub.Op >= opPseudo {
  393. return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}
  394. }
  395. re := p.newRegexp(op)
  396. re.Min = min
  397. re.Max = max
  398. re.Flags = flags
  399. re.Sub = re.Sub0[:1]
  400. re.Sub[0] = sub
  401. p.stack[n-1] = re
  402. p.checkLimits(re)
  403. if op == OpRepeat && (min >= 2 || max >= 2) && !repeatIsValid(re, 1000) {
  404. return "", &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]}
  405. }
  406. return after, nil
  407. }
  408. // repeatIsValid reports whether the repetition re is valid.
  409. // Valid means that the combination of the top-level repetition
  410. // and any inner repetitions does not exceed n copies of the
  411. // innermost thing.
  412. // This function rewalks the regexp tree and is called for every repetition,
  413. // so we have to worry about inducing quadratic behavior in the parser.
  414. // We avoid this by only calling repeatIsValid when min or max >= 2.
  415. // In that case the depth of any >= 2 nesting can only get to 9 without
  416. // triggering a parse error, so each subtree can only be rewalked 9 times.
  417. func repeatIsValid(re *Regexp, n int) bool {
  418. if re.Op == OpRepeat {
  419. m := re.Max
  420. if m == 0 {
  421. return true
  422. }
  423. if m < 0 {
  424. m = re.Min
  425. }
  426. if m > n {
  427. return false
  428. }
  429. if m > 0 {
  430. n /= m
  431. }
  432. }
  433. for _, sub := range re.Sub {
  434. if !repeatIsValid(sub, n) {
  435. return false
  436. }
  437. }
  438. return true
  439. }
  440. // concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation.
  441. func (p *parser) concat() *Regexp {
  442. p.maybeConcat(-1, 0)
  443. // Scan down to find pseudo-operator | or (.
  444. i := len(p.stack)
  445. for i > 0 && p.stack[i-1].Op < opPseudo {
  446. i--
  447. }
  448. subs := p.stack[i:]
  449. p.stack = p.stack[:i]
  450. // Empty concatenation is special case.
  451. if len(subs) == 0 {
  452. return p.push(p.newRegexp(OpEmptyMatch))
  453. }
  454. return p.push(p.collapse(subs, OpConcat))
  455. }
  456. // alternate replaces the top of the stack (above the topmost '(') with its alternation.
  457. func (p *parser) alternate() *Regexp {
  458. // Scan down to find pseudo-operator (.
  459. // There are no | above (.
  460. i := len(p.stack)
  461. for i > 0 && p.stack[i-1].Op < opPseudo {
  462. i--
  463. }
  464. subs := p.stack[i:]
  465. p.stack = p.stack[:i]
  466. // Make sure top class is clean.
  467. // All the others already are (see swapVerticalBar).
  468. if len(subs) > 0 {
  469. cleanAlt(subs[len(subs)-1])
  470. }
  471. // Empty alternate is special case
  472. // (shouldn't happen but easy to handle).
  473. if len(subs) == 0 {
  474. return p.push(p.newRegexp(OpNoMatch))
  475. }
  476. return p.push(p.collapse(subs, OpAlternate))
  477. }
  478. // cleanAlt cleans re for eventual inclusion in an alternation.
  479. func cleanAlt(re *Regexp) {
  480. switch re.Op {
  481. case OpCharClass:
  482. re.Rune = cleanClass(&re.Rune)
  483. if len(re.Rune) == 2 && re.Rune[0] == 0 && re.Rune[1] == unicode.MaxRune {
  484. re.Rune = nil
  485. re.Op = OpAnyChar
  486. return
  487. }
  488. if len(re.Rune) == 4 && re.Rune[0] == 0 && re.Rune[1] == '\n'-1 && re.Rune[2] == '\n'+1 && re.Rune[3] == unicode.MaxRune {
  489. re.Rune = nil
  490. re.Op = OpAnyCharNotNL
  491. return
  492. }
  493. if cap(re.Rune)-len(re.Rune) > 100 {
  494. // re.Rune will not grow any more.
  495. // Make a copy or inline to reclaim storage.
  496. re.Rune = append(re.Rune0[:0], re.Rune...)
  497. }
  498. }
  499. }
  500. // collapse returns the result of applying op to sub.
  501. // If sub contains op nodes, they all get hoisted up
  502. // so that there is never a concat of a concat or an
  503. // alternate of an alternate.
  504. func (p *parser) collapse(subs []*Regexp, op Op) *Regexp {
  505. if len(subs) == 1 {
  506. return subs[0]
  507. }
  508. re := p.newRegexp(op)
  509. re.Sub = re.Sub0[:0]
  510. for _, sub := range subs {
  511. if sub.Op == op {
  512. re.Sub = append(re.Sub, sub.Sub...)
  513. p.reuse(sub)
  514. } else {
  515. re.Sub = append(re.Sub, sub)
  516. }
  517. }
  518. if op == OpAlternate {
  519. re.Sub = p.factor(re.Sub)
  520. if len(re.Sub) == 1 {
  521. old := re
  522. re = re.Sub[0]
  523. p.reuse(old)
  524. }
  525. }
  526. return re
  527. }
  528. // factor factors common prefixes from the alternation list sub.
  529. // It returns a replacement list that reuses the same storage and
  530. // frees (passes to p.reuse) any removed *Regexps.
  531. //
  532. // For example,
  533. //
  534. // ABC|ABD|AEF|BCX|BCY
  535. //
  536. // simplifies by literal prefix extraction to
  537. //
  538. // A(B(C|D)|EF)|BC(X|Y)
  539. //
  540. // which simplifies by character class introduction to
  541. //
  542. // A(B[CD]|EF)|BC[XY]
  543. func (p *parser) factor(sub []*Regexp) []*Regexp {
  544. if len(sub) < 2 {
  545. return sub
  546. }
  547. // Round 1: Factor out common literal prefixes.
  548. var str []rune
  549. var strflags Flags
  550. start := 0
  551. out := sub[:0]
  552. if p.optimizations() {
  553. for i := 0; i <= len(sub); i++ {
  554. // Invariant: the Regexps that were in sub[0:start] have been
  555. // used or marked for reuse, and the slice space has been reused
  556. // for out (len(out) <= start).
  557. //
  558. // Invariant: sub[start:i] consists of regexps that all begin
  559. // with str as modified by strflags.
  560. var istr []rune
  561. var iflags Flags
  562. if i < len(sub) {
  563. istr, iflags = p.leadingString(sub[i])
  564. if iflags == strflags {
  565. same := 0
  566. for same < len(str) && same < len(istr) && str[same] == istr[same] {
  567. same++
  568. }
  569. if same > 0 {
  570. // Matches at least one rune in current range.
  571. // Keep going around.
  572. str = str[:same]
  573. continue
  574. }
  575. }
  576. }
  577. // Found end of a run with common leading literal string:
  578. // sub[start:i] all begin with str[0:len(str)], but sub[i]
  579. // does not even begin with str[0].
  580. //
  581. // Factor out common string and append factored expression to out.
  582. if i == start {
  583. // Nothing to do - run of length 0.
  584. } else if i == start+1 {
  585. // Just one: don't bother factoring.
  586. out = append(out, sub[start])
  587. } else {
  588. // Construct factored form: prefix(suffix1|suffix2|...)
  589. prefix := p.newRegexp(OpLiteral)
  590. prefix.Flags = strflags
  591. prefix.Rune = append(prefix.Rune[:0], str...)
  592. for j := start; j < i; j++ {
  593. sub[j] = p.removeLeadingString(sub[j], len(str))
  594. p.checkLimits(sub[j])
  595. }
  596. suffix := p.collapse(sub[start:i], OpAlternate) // recurse
  597. re := p.newRegexp(OpConcat)
  598. re.Sub = append(re.Sub[:0], prefix, suffix)
  599. out = append(out, re)
  600. }
  601. // Prepare for next iteration.
  602. start = i
  603. str = istr
  604. strflags = iflags
  605. }
  606. sub = out
  607. // Round 2: Factor out common simple prefixes,
  608. // just the first piece of each concatenation.
  609. // This will be good enough a lot of the time.
  610. //
  611. // Complex subexpressions (e.g. involving quantifiers)
  612. // are not safe to factor because that collapses their
  613. // distinct paths through the automaton, which affects
  614. // correctness in some cases.
  615. start = 0
  616. out = sub[:0]
  617. var first *Regexp
  618. for i := 0; i <= len(sub); i++ {
  619. // Invariant: the Regexps that were in sub[0:start] have been
  620. // used or marked for reuse, and the slice space has been reused
  621. // for out (len(out) <= start).
  622. //
  623. // Invariant: sub[start:i] consists of regexps that all begin with ifirst.
  624. var ifirst *Regexp
  625. if i < len(sub) {
  626. ifirst = p.leadingRegexp(sub[i])
  627. if first != nil && first.Equal(ifirst) &&
  628. // first must be a character class OR a fixed repeat of a character class.
  629. (isCharClass(first) || (first.Op == OpRepeat && first.Min == first.Max && isCharClass(first.Sub[0]))) {
  630. continue
  631. }
  632. }
  633. // Found end of a run with common leading regexp:
  634. // sub[start:i] all begin with first but sub[i] does not.
  635. //
  636. // Factor out common regexp and append factored expression to out.
  637. if i == start {
  638. // Nothing to do - run of length 0.
  639. } else if i == start+1 {
  640. // Just one: don't bother factoring.
  641. out = append(out, sub[start])
  642. } else {
  643. // Construct factored form: prefix(suffix1|suffix2|...)
  644. prefix := first
  645. for j := start; j < i; j++ {
  646. reuse := j != start // prefix came from sub[start]
  647. sub[j] = p.removeLeadingRegexp(sub[j], reuse)
  648. p.checkLimits(sub[j])
  649. }
  650. suffix := p.collapse(sub[start:i], OpAlternate) // recurse
  651. re := p.newRegexp(OpConcat)
  652. re.Sub = append(re.Sub[:0], prefix, suffix)
  653. out = append(out, re)
  654. }
  655. // Prepare for next iteration.
  656. start = i
  657. first = ifirst
  658. }
  659. sub = out
  660. // Round 3: Collapse runs of single literals into character classes.
  661. start = 0
  662. out = sub[:0]
  663. for i := 0; i <= len(sub); i++ {
  664. // Invariant: the Regexps that were in sub[0:start] have been
  665. // used or marked for reuse, and the slice space has been reused
  666. // for out (len(out) <= start).
  667. //
  668. // Invariant: sub[start:i] consists of regexps that are either
  669. // literal runes or character classes.
  670. if i < len(sub) && isCharClass(sub[i]) {
  671. continue
  672. }
  673. // sub[i] is not a char or char class;
  674. // emit char class for sub[start:i]...
  675. if i == start {
  676. // Nothing to do - run of length 0.
  677. } else if i == start+1 {
  678. out = append(out, sub[start])
  679. } else {
  680. // Make new char class.
  681. // Start with most complex regexp in sub[start].
  682. max := start
  683. for j := start + 1; j < i; j++ {
  684. if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) {
  685. max = j
  686. }
  687. }
  688. sub[start], sub[max] = sub[max], sub[start]
  689. for j := start + 1; j < i; j++ {
  690. mergeCharClass(sub[start], sub[j])
  691. p.reuse(sub[j])
  692. }
  693. cleanAlt(sub[start])
  694. out = append(out, sub[start])
  695. }
  696. // ... and then emit sub[i].
  697. if i < len(sub) {
  698. out = append(out, sub[i])
  699. }
  700. start = i + 1
  701. }
  702. sub = out
  703. }
  704. // Round 4: Collapse runs of empty matches into a single empty match.
  705. start = 0
  706. out = sub[:0]
  707. for i := range sub {
  708. if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch {
  709. continue
  710. }
  711. out = append(out, sub[i])
  712. }
  713. sub = out
  714. return sub
  715. }
  716. // leadingString returns the leading literal string that re begins with.
  717. // The string refers to storage in re or its children.
  718. func (p *parser) leadingString(re *Regexp) ([]rune, Flags) {
  719. if re.Op == OpConcat && len(re.Sub) > 0 {
  720. re = re.Sub[0]
  721. }
  722. if re.Op != OpLiteral {
  723. return nil, 0
  724. }
  725. return re.Rune, re.Flags & FoldCase
  726. }
  727. // removeLeadingString removes the first n leading runes
  728. // from the beginning of re. It returns the replacement for re.
  729. func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp {
  730. if re.Op == OpConcat && len(re.Sub) > 0 {
  731. // Removing a leading string in a concatenation
  732. // might simplify the concatenation.
  733. sub := re.Sub[0]
  734. sub = p.removeLeadingString(sub, n)
  735. re.Sub[0] = sub
  736. if sub.Op == OpEmptyMatch {
  737. p.reuse(sub)
  738. switch len(re.Sub) {
  739. case 0, 1:
  740. // Impossible but handle.
  741. re.Op = OpEmptyMatch
  742. re.Sub = nil
  743. case 2:
  744. old := re
  745. re = re.Sub[1]
  746. p.reuse(old)
  747. default:
  748. copy(re.Sub, re.Sub[1:])
  749. re.Sub = re.Sub[:len(re.Sub)-1]
  750. }
  751. }
  752. return re
  753. }
  754. if re.Op == OpLiteral {
  755. re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])]
  756. if len(re.Rune) == 0 {
  757. re.Op = OpEmptyMatch
  758. }
  759. }
  760. return re
  761. }
  762. // leadingRegexp returns the leading regexp that re begins with.
  763. // The regexp refers to storage in re or its children.
  764. func (p *parser) leadingRegexp(re *Regexp) *Regexp {
  765. if re.Op == OpEmptyMatch {
  766. return nil
  767. }
  768. if re.Op == OpConcat && len(re.Sub) > 0 {
  769. sub := re.Sub[0]
  770. if sub.Op == OpEmptyMatch {
  771. return nil
  772. }
  773. return sub
  774. }
  775. return re
  776. }
  777. // removeLeadingRegexp removes the leading regexp in re.
  778. // It returns the replacement for re.
  779. // If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse.
  780. func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp {
  781. if re.Op == OpConcat && len(re.Sub) > 0 {
  782. if reuse {
  783. p.reuse(re.Sub[0])
  784. }
  785. re.Sub = re.Sub[:copy(re.Sub, re.Sub[1:])]
  786. switch len(re.Sub) {
  787. case 0:
  788. re.Op = OpEmptyMatch
  789. re.Sub = nil
  790. case 1:
  791. old := re
  792. re = re.Sub[0]
  793. p.reuse(old)
  794. }
  795. return re
  796. }
  797. if reuse {
  798. p.reuse(re)
  799. }
  800. return p.newRegexp(OpEmptyMatch)
  801. }
  802. func literalRegexp(s string, flags Flags) *Regexp {
  803. re := &Regexp{Op: OpLiteral}
  804. re.Flags = flags
  805. re.Rune = re.Rune0[:0] // use local storage for small strings
  806. for _, c := range s {
  807. if len(re.Rune) >= cap(re.Rune) {
  808. // string is too long to fit in Rune0. let Go handle it
  809. re.Rune = []rune(s)
  810. break
  811. }
  812. re.Rune = append(re.Rune, c)
  813. }
  814. return re
  815. }
  816. // Parsing.
  817. // Parse parses a regular expression string s, controlled by the specified
  818. // Flags, and returns a regular expression parse tree. The syntax is
  819. // described in the top-level comment.
  820. func Parse(s string, flags Flags) (*Regexp, error) {
  821. return parse(s, flags)
  822. }
  823. func parse(s string, flags Flags) (_ *Regexp, err error) {
  824. defer func() {
  825. switch r := recover(); r {
  826. default:
  827. panic(r)
  828. case nil:
  829. // ok
  830. case ErrLarge: // too big
  831. err = &Error{Code: ErrLarge, Expr: s}
  832. case ErrNestingDepth:
  833. err = &Error{Code: ErrNestingDepth, Expr: s}
  834. }
  835. }()
  836. if flags&Literal != 0 {
  837. // Trivial parser for literal string.
  838. if err := checkUTF8(s); err != nil {
  839. return nil, err
  840. }
  841. return literalRegexp(s, flags), nil
  842. }
  843. // Otherwise, must do real work.
  844. var (
  845. p parser
  846. c rune
  847. op Op
  848. lastRepeat string
  849. )
  850. p.flags = flags
  851. p.wholeRegexp = s
  852. t := s
  853. for t != "" {
  854. repeat := ""
  855. BigSwitch:
  856. switch t[0] {
  857. default:
  858. if c, t, err = nextRune(t); err != nil {
  859. return nil, err
  860. }
  861. p.literal(c)
  862. case '(':
  863. if p.flags&PerlX != 0 && len(t) >= 2 && t[1] == '?' {
  864. // Flag changes and non-capturing groups.
  865. if t, err = p.parsePerlFlags(t); err != nil {
  866. return nil, err
  867. }
  868. break
  869. }
  870. p.numCap++
  871. p.op(opLeftParen).Cap = p.numCap
  872. t = t[1:]
  873. case '|':
  874. if err = p.parseVerticalBar(); err != nil {
  875. return nil, err
  876. }
  877. t = t[1:]
  878. case ')':
  879. if err = p.parseRightParen(); err != nil {
  880. return nil, err
  881. }
  882. t = t[1:]
  883. case '^':
  884. if p.flags&OneLine != 0 {
  885. p.op(OpBeginText)
  886. } else {
  887. p.op(OpBeginLine)
  888. }
  889. t = t[1:]
  890. case '$':
  891. if p.flags&OneLine != 0 {
  892. p.op(OpEndText).Flags |= WasDollar
  893. } else {
  894. p.op(OpEndLine)
  895. }
  896. t = t[1:]
  897. case '.':
  898. if p.flags&DotNL != 0 {
  899. p.op(OpAnyChar)
  900. } else {
  901. p.op(OpAnyCharNotNL)
  902. }
  903. t = t[1:]
  904. case '[':
  905. if t, err = p.parseClass(t); err != nil {
  906. return nil, err
  907. }
  908. case '*', '+', '?':
  909. before := t
  910. switch t[0] {
  911. case '*':
  912. op = OpStar
  913. case '+':
  914. op = OpPlus
  915. case '?':
  916. op = OpQuest
  917. }
  918. after := t[1:]
  919. if after, err = p.repeat(op, 0, 0, before, after, lastRepeat); err != nil {
  920. return nil, err
  921. }
  922. repeat = before
  923. t = after
  924. case '{':
  925. op = OpRepeat
  926. before := t
  927. min, max, after, ok := p.parseRepeat(t)
  928. if !ok {
  929. // If the repeat cannot be parsed, { is a literal.
  930. p.literal('{')
  931. t = t[1:]
  932. break
  933. }
  934. if min < 0 || min > 1000 || max > 1000 || max >= 0 && min > max {
  935. // Numbers were too big, or max is present and min > max.
  936. return nil, &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]}
  937. }
  938. if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil {
  939. return nil, err
  940. }
  941. repeat = before
  942. t = after
  943. case '\\':
  944. if p.flags&PerlX != 0 && len(t) >= 2 {
  945. switch t[1] {
  946. case 'A':
  947. p.op(OpBeginText)
  948. t = t[2:]
  949. break BigSwitch
  950. case 'b':
  951. p.op(OpWordBoundary)
  952. t = t[2:]
  953. break BigSwitch
  954. case 'B':
  955. p.op(OpNoWordBoundary)
  956. t = t[2:]
  957. break BigSwitch
  958. case 'C':
  959. // any byte; not supported
  960. return nil, &Error{ErrInvalidEscape, t[:2]}
  961. case 'Q':
  962. // \Q ... \E: the ... is always literals
  963. var lit string
  964. lit, t, _ = strings.Cut(t[2:], `\E`)
  965. for lit != "" {
  966. c, rest, err := nextRune(lit)
  967. if err != nil {
  968. return nil, err
  969. }
  970. p.literal(c)
  971. lit = rest
  972. }
  973. break BigSwitch
  974. case 'z':
  975. p.op(OpEndText)
  976. t = t[2:]
  977. break BigSwitch
  978. }
  979. }
  980. re := p.newRegexp(OpCharClass)
  981. re.Flags = p.flags
  982. // Look for Unicode character group like \p{Han}
  983. if len(t) >= 2 && (t[1] == 'p' || t[1] == 'P') {
  984. r, rest, err := p.parseUnicodeClass(t, re.Rune0[:0])
  985. if err != nil {
  986. return nil, err
  987. }
  988. if r != nil {
  989. re.Rune = r
  990. t = rest
  991. p.push(re)
  992. break BigSwitch
  993. }
  994. }
  995. // Perl character class escape.
  996. if r, rest := p.parsePerlClassEscape(t, re.Rune0[:0]); r != nil {
  997. re.Rune = r
  998. t = rest
  999. p.push(re)
  1000. break BigSwitch
  1001. }
  1002. p.reuse(re)
  1003. // Ordinary single-character escape.
  1004. if c, t, err = p.parseEscape(t); err != nil {
  1005. return nil, err
  1006. }
  1007. p.literal(c)
  1008. }
  1009. lastRepeat = repeat
  1010. }
  1011. p.concat()
  1012. if p.swapVerticalBar() {
  1013. // pop vertical bar
  1014. p.stack = p.stack[:len(p.stack)-1]
  1015. }
  1016. p.alternate()
  1017. n := len(p.stack)
  1018. if n != 1 {
  1019. return nil, &Error{ErrMissingParen, s}
  1020. }
  1021. return p.stack[0], nil
  1022. }
  1023. // parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}.
  1024. // If s is not of that form, it returns ok == false.
  1025. // If s has the right form but the values are too big, it returns min == -1, ok == true.
  1026. func (p *parser) parseRepeat(s string) (min, max int, rest string, ok bool) {
  1027. if s == "" || s[0] != '{' {
  1028. return
  1029. }
  1030. s = s[1:]
  1031. var ok1 bool
  1032. if min, s, ok1 = p.parseInt(s); !ok1 {
  1033. return
  1034. }
  1035. if s == "" {
  1036. return
  1037. }
  1038. if s[0] != ',' {
  1039. max = min
  1040. } else {
  1041. s = s[1:]
  1042. if s == "" {
  1043. return
  1044. }
  1045. if s[0] == '}' {
  1046. max = -1
  1047. } else if max, s, ok1 = p.parseInt(s); !ok1 {
  1048. return
  1049. } else if max < 0 {
  1050. // parseInt found too big a number
  1051. min = -1
  1052. }
  1053. }
  1054. if s == "" || s[0] != '}' {
  1055. return
  1056. }
  1057. rest = s[1:]
  1058. ok = true
  1059. return
  1060. }
  1061. // parsePerlFlags parses a Perl flag setting or non-capturing group or both,
  1062. // like (?i) or (?: or (?i:. It removes the prefix from s and updates the parse state.
  1063. // The caller must have ensured that s begins with "(?".
  1064. func (p *parser) parsePerlFlags(s string) (rest string, err error) {
  1065. t := s
  1066. // Check for named captures, first introduced in Python's regexp library.
  1067. // As usual, there are three slightly different syntaxes:
  1068. //
  1069. // (?P<name>expr) the original, introduced by Python
  1070. // (?<name>expr) the .NET alteration, adopted by Perl 5.10
  1071. // (?'name'expr) another .NET alteration, adopted by Perl 5.10
  1072. //
  1073. // Perl 5.10 gave in and implemented the Python version too,
  1074. // but they claim that the last two are the preferred forms.
  1075. // PCRE and languages based on it (specifically, PHP and Ruby)
  1076. // support all three as well. EcmaScript 4 uses only the Python form.
  1077. //
  1078. // In both the open source world (via Code Search) and the
  1079. // Google source tree, (?P<expr>name) is the dominant form,
  1080. // so that's the one we implement. One is enough.
  1081. if len(t) > 4 && t[2] == 'P' && t[3] == '<' {
  1082. // Pull out name.
  1083. end := strings.IndexRune(t, '>')
  1084. if end < 0 {
  1085. if err = checkUTF8(t); err != nil {
  1086. return "", err
  1087. }
  1088. return "", &Error{ErrInvalidNamedCapture, s}
  1089. }
  1090. capture := t[:end+1] // "(?P<name>"
  1091. name := t[4:end] // "name"
  1092. if err = checkUTF8(name); err != nil {
  1093. return "", err
  1094. }
  1095. if !isValidCaptureName(name) {
  1096. return "", &Error{ErrInvalidNamedCapture, capture}
  1097. }
  1098. // Like ordinary capture, but named.
  1099. p.numCap++
  1100. re := p.op(opLeftParen)
  1101. re.Cap = p.numCap
  1102. re.Name = name
  1103. return t[end+1:], nil
  1104. }
  1105. // Non-capturing group. Might also twiddle Perl flags.
  1106. var c rune
  1107. t = t[2:] // skip (?
  1108. flags := p.flags
  1109. sign := +1
  1110. sawFlag := false
  1111. Loop:
  1112. for t != "" {
  1113. if c, t, err = nextRune(t); err != nil {
  1114. return "", err
  1115. }
  1116. switch c {
  1117. default:
  1118. break Loop
  1119. // Flags.
  1120. case 'i':
  1121. flags |= FoldCase
  1122. sawFlag = true
  1123. case 'm':
  1124. flags &^= OneLine
  1125. sawFlag = true
  1126. case 's':
  1127. flags |= DotNL
  1128. sawFlag = true
  1129. case 'U':
  1130. flags |= NonGreedy
  1131. sawFlag = true
  1132. // Switch to negation.
  1133. case '-':
  1134. if sign < 0 {
  1135. break Loop
  1136. }
  1137. sign = -1
  1138. // Invert flags so that | above turn into &^ and vice versa.
  1139. // We'll invert flags again before using it below.
  1140. flags = ^flags
  1141. sawFlag = false
  1142. // End of flags, starting group or not.
  1143. case ':', ')':
  1144. if sign < 0 {
  1145. if !sawFlag {
  1146. break Loop
  1147. }
  1148. flags = ^flags
  1149. }
  1150. if c == ':' {
  1151. // Open new group
  1152. p.op(opLeftParen)
  1153. }
  1154. p.flags = flags
  1155. return t, nil
  1156. }
  1157. }
  1158. return "", &Error{ErrInvalidPerlOp, s[:len(s)-len(t)]}
  1159. }
  1160. // isValidCaptureName reports whether name
  1161. // is a valid capture name: [A-Za-z0-9_]+.
  1162. // PCRE limits names to 32 bytes.
  1163. // Python rejects names starting with digits.
  1164. // We don't enforce either of those.
  1165. func isValidCaptureName(name string) bool {
  1166. if name == "" {
  1167. return false
  1168. }
  1169. for _, c := range name {
  1170. if c != '_' && !isalnum(c) {
  1171. return false
  1172. }
  1173. }
  1174. return true
  1175. }
  1176. // parseInt parses a decimal integer.
  1177. func (p *parser) parseInt(s string) (n int, rest string, ok bool) {
  1178. if s == "" || s[0] < '0' || '9' < s[0] {
  1179. return
  1180. }
  1181. // Disallow leading zeros.
  1182. if len(s) >= 2 && s[0] == '0' && '0' <= s[1] && s[1] <= '9' {
  1183. return
  1184. }
  1185. t := s
  1186. for s != "" && '0' <= s[0] && s[0] <= '9' {
  1187. s = s[1:]
  1188. }
  1189. rest = s
  1190. ok = true
  1191. // Have digits, compute value.
  1192. t = t[:len(t)-len(s)]
  1193. for i := 0; i < len(t); i++ {
  1194. // Avoid overflow.
  1195. if n >= 1e8 {
  1196. n = -1
  1197. break
  1198. }
  1199. n = n*10 + int(t[i]) - '0'
  1200. }
  1201. return
  1202. }
  1203. // can this be represented as a character class?
  1204. // single-rune literal string, char class, ., and .|\n.
  1205. func isCharClass(re *Regexp) bool {
  1206. return re.Op == OpLiteral && len(re.Rune) == 1 ||
  1207. re.Op == OpCharClass ||
  1208. re.Op == OpAnyCharNotNL ||
  1209. re.Op == OpAnyChar
  1210. }
  1211. // does re match r?
  1212. func matchRune(re *Regexp, r rune) bool {
  1213. switch re.Op {
  1214. case OpLiteral:
  1215. return len(re.Rune) == 1 && re.Rune[0] == r
  1216. case OpCharClass:
  1217. for i := 0; i < len(re.Rune); i += 2 {
  1218. if re.Rune[i] <= r && r <= re.Rune[i+1] {
  1219. return true
  1220. }
  1221. }
  1222. return false
  1223. case OpAnyCharNotNL:
  1224. return r != '\n'
  1225. case OpAnyChar:
  1226. return true
  1227. }
  1228. return false
  1229. }
  1230. // parseVerticalBar handles a | in the input.
  1231. func (p *parser) parseVerticalBar() error {
  1232. p.concat()
  1233. // The concatenation we just parsed is on top of the stack.
  1234. // If it sits above an opVerticalBar, swap it below
  1235. // (things below an opVerticalBar become an alternation).
  1236. // Otherwise, push a new vertical bar.
  1237. if !p.swapVerticalBar() {
  1238. p.op(opVerticalBar)
  1239. }
  1240. return nil
  1241. }
  1242. // mergeCharClass makes dst = dst|src.
  1243. // The caller must ensure that dst.Op >= src.Op,
  1244. // to reduce the amount of copying.
  1245. func mergeCharClass(dst, src *Regexp) {
  1246. switch dst.Op {
  1247. case OpAnyChar:
  1248. // src doesn't add anything.
  1249. case OpAnyCharNotNL:
  1250. // src might add \n
  1251. if matchRune(src, '\n') {
  1252. dst.Op = OpAnyChar
  1253. }
  1254. case OpCharClass:
  1255. // src is simpler, so either literal or char class
  1256. if src.Op == OpLiteral {
  1257. dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
  1258. } else {
  1259. dst.Rune = appendClass(dst.Rune, src.Rune)
  1260. }
  1261. case OpLiteral:
  1262. // both literal
  1263. if src.Rune[0] == dst.Rune[0] && src.Flags == dst.Flags {
  1264. break
  1265. }
  1266. dst.Op = OpCharClass
  1267. dst.Rune = appendLiteral(dst.Rune[:0], dst.Rune[0], dst.Flags)
  1268. dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
  1269. }
  1270. }
  1271. // If the top of the stack is an element followed by an opVerticalBar
  1272. // swapVerticalBar swaps the two and returns true.
  1273. // Otherwise it returns false.
  1274. func (p *parser) swapVerticalBar() bool {
  1275. // If above and below vertical bar are literal or char class,
  1276. // can merge into a single char class.
  1277. n := len(p.stack)
  1278. if n >= 3 && p.stack[n-2].Op == opVerticalBar && isCharClass(p.stack[n-1]) && isCharClass(p.stack[n-3]) && p.optimizations() {
  1279. re1 := p.stack[n-1]
  1280. re3 := p.stack[n-3]
  1281. // Make re3 the more complex of the two.
  1282. if re1.Op > re3.Op {
  1283. re1, re3 = re3, re1
  1284. p.stack[n-3] = re3
  1285. }
  1286. mergeCharClass(re3, re1)
  1287. p.reuse(re1)
  1288. p.stack = p.stack[:n-1]
  1289. return true
  1290. }
  1291. if n >= 2 {
  1292. re1 := p.stack[n-1]
  1293. re2 := p.stack[n-2]
  1294. if re2.Op == opVerticalBar {
  1295. if n >= 3 {
  1296. // Now out of reach.
  1297. // Clean opportunistically.
  1298. cleanAlt(p.stack[n-3])
  1299. }
  1300. p.stack[n-2] = re1
  1301. p.stack[n-1] = re2
  1302. return true
  1303. }
  1304. }
  1305. return false
  1306. }
  1307. // parseRightParen handles a ) in the input.
  1308. func (p *parser) parseRightParen() error {
  1309. p.concat()
  1310. if p.swapVerticalBar() {
  1311. // pop vertical bar
  1312. p.stack = p.stack[:len(p.stack)-1]
  1313. }
  1314. p.alternate()
  1315. n := len(p.stack)
  1316. if n < 2 {
  1317. return &Error{ErrUnexpectedParen, p.wholeRegexp}
  1318. }
  1319. re1 := p.stack[n-1]
  1320. re2 := p.stack[n-2]
  1321. p.stack = p.stack[:n-2]
  1322. if re2.Op != opLeftParen {
  1323. return &Error{ErrUnexpectedParen, p.wholeRegexp}
  1324. }
  1325. // Restore flags at time of paren.
  1326. p.flags = re2.Flags
  1327. if re2.Cap == 0 {
  1328. // Just for grouping.
  1329. p.push(re1)
  1330. } else {
  1331. re2.Op = OpCapture
  1332. re2.Sub = re2.Sub0[:1]
  1333. re2.Sub[0] = re1
  1334. p.push(re2)
  1335. }
  1336. return nil
  1337. }
  1338. // parseEscape parses an escape sequence at the beginning of s
  1339. // and returns the rune.
  1340. func (p *parser) parseEscape(s string) (r rune, rest string, err error) {
  1341. t := s[1:]
  1342. if t == "" {
  1343. return 0, "", &Error{ErrTrailingBackslash, ""}
  1344. }
  1345. c, t, err := nextRune(t)
  1346. if err != nil {
  1347. return 0, "", err
  1348. }
  1349. Switch:
  1350. switch c {
  1351. default:
  1352. if c < utf8.RuneSelf && !isalnum(c) {
  1353. // Escaped non-word characters are always themselves.
  1354. // PCRE is not quite so rigorous: it accepts things like
  1355. // \q, but we don't. We once rejected \_, but too many
  1356. // programs and people insist on using it, so allow \_.
  1357. return c, t, nil
  1358. }
  1359. // Octal escapes.
  1360. case '1', '2', '3', '4', '5', '6', '7':
  1361. // Single non-zero digit is a backreference; not supported
  1362. if t == "" || t[0] < '0' || t[0] > '7' {
  1363. break
  1364. }
  1365. fallthrough
  1366. case '0':
  1367. // Consume up to three octal digits; already have one.
  1368. r = c - '0'
  1369. for i := 1; i < 3; i++ {
  1370. if t == "" || t[0] < '0' || t[0] > '7' {
  1371. break
  1372. }
  1373. r = r*8 + rune(t[0]) - '0'
  1374. t = t[1:]
  1375. }
  1376. return r, t, nil
  1377. // Hexadecimal escapes.
  1378. case 'x':
  1379. if t == "" {
  1380. break
  1381. }
  1382. if c, t, err = nextRune(t); err != nil {
  1383. return 0, "", err
  1384. }
  1385. if c == '{' {
  1386. // Any number of digits in braces.
  1387. // Perl accepts any text at all; it ignores all text
  1388. // after the first non-hex digit. We require only hex digits,
  1389. // and at least one.
  1390. nhex := 0
  1391. r = 0
  1392. for {
  1393. if t == "" {
  1394. break Switch
  1395. }
  1396. if c, t, err = nextRune(t); err != nil {
  1397. return 0, "", err
  1398. }
  1399. if c == '}' {
  1400. break
  1401. }
  1402. v := unhex(c)
  1403. if v < 0 {
  1404. break Switch
  1405. }
  1406. r = r*16 + v
  1407. if r > unicode.MaxRune {
  1408. break Switch
  1409. }
  1410. nhex++
  1411. }
  1412. if nhex == 0 {
  1413. break Switch
  1414. }
  1415. return r, t, nil
  1416. }
  1417. // Easy case: two hex digits.
  1418. x := unhex(c)
  1419. if c, t, err = nextRune(t); err != nil {
  1420. return 0, "", err
  1421. }
  1422. y := unhex(c)
  1423. if x < 0 || y < 0 {
  1424. break
  1425. }
  1426. return x*16 + y, t, nil
  1427. // C escapes. There is no case 'b', to avoid misparsing
  1428. // the Perl word-boundary \b as the C backspace \b
  1429. // when in POSIX mode. In Perl, /\b/ means word-boundary
  1430. // but /[\b]/ means backspace. We don't support that.
  1431. // If you want a backspace, embed a literal backspace
  1432. // character or use \x08.
  1433. case 'a':
  1434. return '\a', t, err
  1435. case 'f':
  1436. return '\f', t, err
  1437. case 'n':
  1438. return '\n', t, err
  1439. case 'r':
  1440. return '\r', t, err
  1441. case 't':
  1442. return '\t', t, err
  1443. case 'v':
  1444. return '\v', t, err
  1445. }
  1446. return 0, "", &Error{ErrInvalidEscape, s[:len(s)-len(t)]}
  1447. }
  1448. // parseClassChar parses a character class character at the beginning of s
  1449. // and returns it.
  1450. func (p *parser) parseClassChar(s, wholeClass string) (r rune, rest string, err error) {
  1451. if s == "" {
  1452. return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass}
  1453. }
  1454. // Allow regular escape sequences even though
  1455. // many need not be escaped in this context.
  1456. if s[0] == '\\' {
  1457. return p.parseEscape(s)
  1458. }
  1459. return nextRune(s)
  1460. }
  1461. type charGroup struct {
  1462. sign int
  1463. class []rune
  1464. }
  1465. // parsePerlClassEscape parses a leading Perl character class escape like \d
  1466. // from the beginning of s. If one is present, it appends the characters to r
  1467. // and returns the new slice r and the remainder of the string.
  1468. func (p *parser) parsePerlClassEscape(s string, r []rune) (out []rune, rest string) {
  1469. if p.flags&PerlX == 0 || len(s) < 2 || s[0] != '\\' {
  1470. return
  1471. }
  1472. g := perlGroup[s[0:2]]
  1473. if g.sign == 0 {
  1474. return
  1475. }
  1476. return p.appendGroup(r, g), s[2:]
  1477. }
  1478. // parseNamedClass parses a leading POSIX named character class like [:alnum:]
  1479. // from the beginning of s. If one is present, it appends the characters to r
  1480. // and returns the new slice r and the remainder of the string.
  1481. func (p *parser) parseNamedClass(s string, r []rune) (out []rune, rest string, err error) {
  1482. if len(s) < 2 || s[0] != '[' || s[1] != ':' {
  1483. return
  1484. }
  1485. i := strings.Index(s[2:], ":]")
  1486. if i < 0 {
  1487. return
  1488. }
  1489. i += 2
  1490. name, s := s[0:i+2], s[i+2:]
  1491. g := posixGroup[name]
  1492. if g.sign == 0 {
  1493. return nil, "", &Error{ErrInvalidCharRange, name}
  1494. }
  1495. return p.appendGroup(r, g), s, nil
  1496. }
  1497. func (p *parser) appendGroup(r []rune, g charGroup) []rune {
  1498. if p.flags&FoldCase == 0 {
  1499. if g.sign < 0 {
  1500. r = appendNegatedClass(r, g.class)
  1501. } else {
  1502. r = appendClass(r, g.class)
  1503. }
  1504. } else {
  1505. tmp := p.tmpClass[:0]
  1506. tmp = appendFoldedClass(tmp, g.class)
  1507. p.tmpClass = tmp
  1508. tmp = cleanClass(&p.tmpClass)
  1509. if g.sign < 0 {
  1510. r = appendNegatedClass(r, tmp)
  1511. } else {
  1512. r = appendClass(r, tmp)
  1513. }
  1514. }
  1515. return r
  1516. }
  1517. var anyTable = &unicode.RangeTable{
  1518. R16: []unicode.Range16{{Lo: 0, Hi: 1<<16 - 1, Stride: 1}},
  1519. R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}},
  1520. }
  1521. // unicodeTable returns the unicode.RangeTable identified by name
  1522. // and the table of additional fold-equivalent code points.
  1523. func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) {
  1524. // Special case: "Any" means any.
  1525. if name == "Any" {
  1526. return anyTable, anyTable
  1527. }
  1528. if t := unicode.Categories[name]; t != nil {
  1529. return t, unicode.FoldCategory[name]
  1530. }
  1531. if t := unicode.Scripts[name]; t != nil {
  1532. return t, unicode.FoldScript[name]
  1533. }
  1534. return nil, nil
  1535. }
  1536. // parseUnicodeClass parses a leading Unicode character class like \p{Han}
  1537. // from the beginning of s. If one is present, it appends the characters to r
  1538. // and returns the new slice r and the remainder of the string.
  1539. func (p *parser) parseUnicodeClass(s string, r []rune) (out []rune, rest string, err error) {
  1540. if p.flags&UnicodeGroups == 0 || len(s) < 2 || s[0] != '\\' || s[1] != 'p' && s[1] != 'P' {
  1541. return
  1542. }
  1543. // Committed to parse or return error.
  1544. sign := +1
  1545. if s[1] == 'P' {
  1546. sign = -1
  1547. }
  1548. t := s[2:]
  1549. c, t, err := nextRune(t)
  1550. if err != nil {
  1551. return
  1552. }
  1553. var seq, name string
  1554. if c != '{' {
  1555. // Single-letter name.
  1556. seq = s[:len(s)-len(t)]
  1557. name = seq[2:]
  1558. } else {
  1559. // Name is in braces.
  1560. end := strings.IndexRune(s, '}')
  1561. if end < 0 {
  1562. if err = checkUTF8(s); err != nil {
  1563. return
  1564. }
  1565. return nil, "", &Error{ErrInvalidCharRange, s}
  1566. }
  1567. seq, t = s[:end+1], s[end+1:]
  1568. name = s[3:end]
  1569. if err = checkUTF8(name); err != nil {
  1570. return
  1571. }
  1572. }
  1573. // Group can have leading negation too. \p{^Han} == \P{Han}, \P{^Han} == \p{Han}.
  1574. if name != "" && name[0] == '^' {
  1575. sign = -sign
  1576. name = name[1:]
  1577. }
  1578. tab, fold := unicodeTable(name)
  1579. if tab == nil {
  1580. return nil, "", &Error{ErrInvalidCharRange, seq}
  1581. }
  1582. if p.flags&FoldCase == 0 || fold == nil {
  1583. if sign > 0 {
  1584. r = appendTable(r, tab)
  1585. } else {
  1586. r = appendNegatedTable(r, tab)
  1587. }
  1588. } else {
  1589. // Merge and clean tab and fold in a temporary buffer.
  1590. // This is necessary for the negative case and just tidy
  1591. // for the positive case.
  1592. tmp := p.tmpClass[:0]
  1593. tmp = appendTable(tmp, tab)
  1594. tmp = appendTable(tmp, fold)
  1595. p.tmpClass = tmp
  1596. tmp = cleanClass(&p.tmpClass)
  1597. if sign > 0 {
  1598. r = appendClass(r, tmp)
  1599. } else {
  1600. r = appendNegatedClass(r, tmp)
  1601. }
  1602. }
  1603. return r, t, nil
  1604. }
  1605. // parseClass parses a character class at the beginning of s
  1606. // and pushes it onto the parse stack.
  1607. func (p *parser) parseClass(s string) (rest string, err error) {
  1608. t := s[1:] // chop [
  1609. re := p.newRegexp(OpCharClass)
  1610. re.Flags = p.flags
  1611. re.Rune = re.Rune0[:0]
  1612. sign := +1
  1613. if t != "" && t[0] == '^' {
  1614. sign = -1
  1615. t = t[1:]
  1616. // If character class does not match \n, add it here,
  1617. // so that negation later will do the right thing.
  1618. if p.flags&ClassNL == 0 {
  1619. re.Rune = append(re.Rune, '\n', '\n')
  1620. }
  1621. }
  1622. class := re.Rune
  1623. first := true // ] and - are okay as first char in class
  1624. for t == "" || t[0] != ']' || first {
  1625. // POSIX: - is only okay unescaped as first or last in class.
  1626. // Perl: - is okay anywhere.
  1627. if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') {
  1628. _, size := utf8.DecodeRuneInString(t[1:])
  1629. return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]}
  1630. }
  1631. first = false
  1632. // Look for POSIX [:alnum:] etc.
  1633. if len(t) > 2 && t[0] == '[' && t[1] == ':' {
  1634. nclass, nt, err := p.parseNamedClass(t, class)
  1635. if err != nil {
  1636. return "", err
  1637. }
  1638. if nclass != nil {
  1639. class, t = nclass, nt
  1640. continue
  1641. }
  1642. }
  1643. // Look for Unicode character group like \p{Han}.
  1644. nclass, nt, err := p.parseUnicodeClass(t, class)
  1645. if err != nil {
  1646. return "", err
  1647. }
  1648. if nclass != nil {
  1649. class, t = nclass, nt
  1650. continue
  1651. }
  1652. // Look for Perl character class symbols (extension).
  1653. if nclass, nt := p.parsePerlClassEscape(t, class); nclass != nil {
  1654. class, t = nclass, nt
  1655. continue
  1656. }
  1657. // Single character or simple range.
  1658. rng := t
  1659. var lo, hi rune
  1660. if lo, t, err = p.parseClassChar(t, s); err != nil {
  1661. return "", err
  1662. }
  1663. hi = lo
  1664. // [a-] means (a|-) so check for final ].
  1665. if len(t) >= 2 && t[0] == '-' && t[1] != ']' {
  1666. t = t[1:]
  1667. if hi, t, err = p.parseClassChar(t, s); err != nil {
  1668. return "", err
  1669. }
  1670. if hi < lo {
  1671. rng = rng[:len(rng)-len(t)]
  1672. return "", &Error{Code: ErrInvalidCharRange, Expr: rng}
  1673. }
  1674. }
  1675. if p.flags&FoldCase == 0 {
  1676. class = appendRange(class, lo, hi)
  1677. } else {
  1678. class = appendFoldedRange(class, lo, hi)
  1679. }
  1680. }
  1681. t = t[1:] // chop ]
  1682. // Use &re.Rune instead of &class to avoid allocation.
  1683. re.Rune = class
  1684. class = cleanClass(&re.Rune)
  1685. if sign < 0 {
  1686. class = negateClass(class)
  1687. }
  1688. re.Rune = class
  1689. p.push(re)
  1690. return t, nil
  1691. }
  1692. // cleanClass sorts the ranges (pairs of elements of r),
  1693. // merges them, and eliminates duplicates.
  1694. func cleanClass(rp *[]rune) []rune {
  1695. // Sort by lo increasing, hi decreasing to break ties.
  1696. sort.Sort(ranges{rp})
  1697. r := *rp
  1698. if len(r) < 2 {
  1699. return r
  1700. }
  1701. // Merge abutting, overlapping.
  1702. w := 2 // write index
  1703. for i := 2; i < len(r); i += 2 {
  1704. lo, hi := r[i], r[i+1]
  1705. if lo <= r[w-1]+1 {
  1706. // merge with previous range
  1707. if hi > r[w-1] {
  1708. r[w-1] = hi
  1709. }
  1710. continue
  1711. }
  1712. // new disjoint range
  1713. r[w] = lo
  1714. r[w+1] = hi
  1715. w += 2
  1716. }
  1717. return r[:w]
  1718. }
  1719. // appendLiteral returns the result of appending the literal x to the class r.
  1720. func appendLiteral(r []rune, x rune, flags Flags) []rune {
  1721. if flags&FoldCase != 0 {
  1722. return appendFoldedRange(r, x, x)
  1723. }
  1724. return appendRange(r, x, x)
  1725. }
  1726. // appendRange returns the result of appending the range lo-hi to the class r.
  1727. func appendRange(r []rune, lo, hi rune) []rune {
  1728. // Expand last range or next to last range if it overlaps or abuts.
  1729. // Checking two ranges helps when appending case-folded
  1730. // alphabets, so that one range can be expanding A-Z and the
  1731. // other expanding a-z.
  1732. n := len(r)
  1733. for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4
  1734. if n >= i {
  1735. rlo, rhi := r[n-i], r[n-i+1]
  1736. if lo <= rhi+1 && rlo <= hi+1 {
  1737. if lo < rlo {
  1738. r[n-i] = lo
  1739. }
  1740. if hi > rhi {
  1741. r[n-i+1] = hi
  1742. }
  1743. return r
  1744. }
  1745. }
  1746. }
  1747. return append(r, lo, hi)
  1748. }
  1749. const (
  1750. // minimum and maximum runes involved in folding.
  1751. // checked during test.
  1752. minFold = 0x0041
  1753. maxFold = 0x1e943
  1754. )
  1755. // appendFoldedRange returns the result of appending the range lo-hi
  1756. // and its case folding-equivalent runes to the class r.
  1757. func appendFoldedRange(r []rune, lo, hi rune) []rune {
  1758. // Optimizations.
  1759. if lo <= minFold && hi >= maxFold {
  1760. // Range is full: folding can't add more.
  1761. return appendRange(r, lo, hi)
  1762. }
  1763. if hi < minFold || lo > maxFold {
  1764. // Range is outside folding possibilities.
  1765. return appendRange(r, lo, hi)
  1766. }
  1767. if lo < minFold {
  1768. // [lo, minFold-1] needs no folding.
  1769. r = appendRange(r, lo, minFold-1)
  1770. lo = minFold
  1771. }
  1772. if hi > maxFold {
  1773. // [maxFold+1, hi] needs no folding.
  1774. r = appendRange(r, maxFold+1, hi)
  1775. hi = maxFold
  1776. }
  1777. // Brute force. Depend on appendRange to coalesce ranges on the fly.
  1778. for c := lo; c <= hi; c++ {
  1779. r = appendRange(r, c, c)
  1780. f := unicode.SimpleFold(c)
  1781. for f != c {
  1782. r = appendRange(r, f, f)
  1783. f = unicode.SimpleFold(f)
  1784. }
  1785. }
  1786. return r
  1787. }
  1788. // appendClass returns the result of appending the class x to the class r.
  1789. // It assume x is clean.
  1790. func appendClass(r []rune, x []rune) []rune {
  1791. for i := 0; i < len(x); i += 2 {
  1792. r = appendRange(r, x[i], x[i+1])
  1793. }
  1794. return r
  1795. }
  1796. // appendFoldedClass returns the result of appending the case folding of the class x to the class r.
  1797. func appendFoldedClass(r []rune, x []rune) []rune {
  1798. for i := 0; i < len(x); i += 2 {
  1799. r = appendFoldedRange(r, x[i], x[i+1])
  1800. }
  1801. return r
  1802. }
  1803. // appendNegatedClass returns the result of appending the negation of the class x to the class r.
  1804. // It assumes x is clean.
  1805. func appendNegatedClass(r []rune, x []rune) []rune {
  1806. nextLo := '\u0000'
  1807. for i := 0; i < len(x); i += 2 {
  1808. lo, hi := x[i], x[i+1]
  1809. if nextLo <= lo-1 {
  1810. r = appendRange(r, nextLo, lo-1)
  1811. }
  1812. nextLo = hi + 1
  1813. }
  1814. if nextLo <= unicode.MaxRune {
  1815. r = appendRange(r, nextLo, unicode.MaxRune)
  1816. }
  1817. return r
  1818. }
  1819. // appendTable returns the result of appending x to the class r.
  1820. func appendTable(r []rune, x *unicode.RangeTable) []rune {
  1821. for _, xr := range x.R16 {
  1822. lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
  1823. if stride == 1 {
  1824. r = appendRange(r, lo, hi)
  1825. continue
  1826. }
  1827. for c := lo; c <= hi; c += stride {
  1828. r = appendRange(r, c, c)
  1829. }
  1830. }
  1831. for _, xr := range x.R32 {
  1832. lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
  1833. if stride == 1 {
  1834. r = appendRange(r, lo, hi)
  1835. continue
  1836. }
  1837. for c := lo; c <= hi; c += stride {
  1838. r = appendRange(r, c, c)
  1839. }
  1840. }
  1841. return r
  1842. }
  1843. // appendNegatedTable returns the result of appending the negation of x to the class r.
  1844. func appendNegatedTable(r []rune, x *unicode.RangeTable) []rune {
  1845. nextLo := '\u0000' // lo end of next class to add
  1846. for _, xr := range x.R16 {
  1847. lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
  1848. if stride == 1 {
  1849. if nextLo <= lo-1 {
  1850. r = appendRange(r, nextLo, lo-1)
  1851. }
  1852. nextLo = hi + 1
  1853. continue
  1854. }
  1855. for c := lo; c <= hi; c += stride {
  1856. if nextLo <= c-1 {
  1857. r = appendRange(r, nextLo, c-1)
  1858. }
  1859. nextLo = c + 1
  1860. }
  1861. }
  1862. for _, xr := range x.R32 {
  1863. lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
  1864. if stride == 1 {
  1865. if nextLo <= lo-1 {
  1866. r = appendRange(r, nextLo, lo-1)
  1867. }
  1868. nextLo = hi + 1
  1869. continue
  1870. }
  1871. for c := lo; c <= hi; c += stride {
  1872. if nextLo <= c-1 {
  1873. r = appendRange(r, nextLo, c-1)
  1874. }
  1875. nextLo = c + 1
  1876. }
  1877. }
  1878. if nextLo <= unicode.MaxRune {
  1879. r = appendRange(r, nextLo, unicode.MaxRune)
  1880. }
  1881. return r
  1882. }
  1883. // negateClass overwrites r and returns r's negation.
  1884. // It assumes the class r is already clean.
  1885. func negateClass(r []rune) []rune {
  1886. nextLo := '\u0000' // lo end of next class to add
  1887. w := 0 // write index
  1888. for i := 0; i < len(r); i += 2 {
  1889. lo, hi := r[i], r[i+1]
  1890. if nextLo <= lo-1 {
  1891. r[w] = nextLo
  1892. r[w+1] = lo - 1
  1893. w += 2
  1894. }
  1895. nextLo = hi + 1
  1896. }
  1897. r = r[:w]
  1898. if nextLo <= unicode.MaxRune {
  1899. // It's possible for the negation to have one more
  1900. // range - this one - than the original class, so use append.
  1901. r = append(r, nextLo, unicode.MaxRune)
  1902. }
  1903. return r
  1904. }
  1905. // ranges implements sort.Interface on a []rune.
  1906. // The choice of receiver type definition is strange
  1907. // but avoids an allocation since we already have
  1908. // a *[]rune.
  1909. type ranges struct {
  1910. p *[]rune
  1911. }
  1912. func (ra ranges) Less(i, j int) bool {
  1913. p := *ra.p
  1914. i *= 2
  1915. j *= 2
  1916. return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1]
  1917. }
  1918. func (ra ranges) Len() int {
  1919. return len(*ra.p) / 2
  1920. }
  1921. func (ra ranges) Swap(i, j int) {
  1922. p := *ra.p
  1923. i *= 2
  1924. j *= 2
  1925. p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1]
  1926. }
  1927. func checkUTF8(s string) error {
  1928. for s != "" {
  1929. rune, size := utf8.DecodeRuneInString(s)
  1930. if rune == utf8.RuneError && size == 1 {
  1931. return &Error{Code: ErrInvalidUTF8, Expr: s}
  1932. }
  1933. s = s[size:]
  1934. }
  1935. return nil
  1936. }
  1937. func nextRune(s string) (c rune, t string, err error) {
  1938. c, size := utf8.DecodeRuneInString(s)
  1939. if c == utf8.RuneError && size == 1 {
  1940. return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s}
  1941. }
  1942. return c, s[size:], nil
  1943. }
  1944. func isalnum(c rune) bool {
  1945. return '0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
  1946. }
  1947. func unhex(c rune) rune {
  1948. if '0' <= c && c <= '9' {
  1949. return c - '0'
  1950. }
  1951. if 'a' <= c && c <= 'f' {
  1952. return c - 'a' + 10
  1953. }
  1954. if 'A' <= c && c <= 'F' {
  1955. return c - 'A' + 10
  1956. }
  1957. return -1
  1958. }