onepass.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515
  1. // Copyright 2014 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the GO-LICENSE file.
  4. // Modifications of this file, if any, are
  5. //
  6. // Copyright 2023 The Regexp Authors. All rights reserved.
  7. // Use of this source code is governed by a BSD-style
  8. // license that can be found in the LICENSE file.
  9. package regexp // modernc.org/regexp
  10. import (
  11. "sort"
  12. "strings"
  13. "unicode"
  14. "unicode/utf8"
  15. "modernc.org/regexp/syntax"
  16. )
  17. // "One-pass" regexp execution.
  18. // Some regexps can be analyzed to determine that they never need
  19. // backtracking: they are guaranteed to run in one pass over the string
  20. // without bothering to save all the usual NFA state.
  21. // Detect those and execute them more quickly.
  22. // A onePassProg is a compiled one-pass regular expression program.
  23. // It is the same as syntax.Prog except for the use of onePassInst.
  24. type onePassProg struct {
  25. Inst []onePassInst
  26. Start int // index of start instruction
  27. NumCap int // number of InstCapture insts in re
  28. }
  29. // A onePassInst is a single instruction in a one-pass regular expression program.
  30. // It is the same as syntax.Inst except for the new 'Next' field.
  31. type onePassInst struct {
  32. syntax.Inst
  33. Next []uint32
  34. }
  35. // onePassPrefix returns a literal string that all matches for the
  36. // regexp must start with. Complete is true if the prefix
  37. // is the entire match. Pc is the index of the last rune instruction
  38. // in the string. The onePassPrefix skips over the mandatory
  39. // EmptyBeginText.
  40. func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) {
  41. i := &p.Inst[p.Start]
  42. if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 {
  43. return "", i.Op == syntax.InstMatch, uint32(p.Start)
  44. }
  45. pc = i.Out
  46. i = &p.Inst[pc]
  47. for i.Op == syntax.InstNop {
  48. pc = i.Out
  49. i = &p.Inst[pc]
  50. }
  51. // Avoid allocation of buffer if prefix is empty.
  52. if iop(i) != syntax.InstRune || len(i.Rune) != 1 {
  53. return "", i.Op == syntax.InstMatch, uint32(p.Start)
  54. }
  55. // Have prefix; gather characters.
  56. var buf strings.Builder
  57. for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 && i.Rune[0] != utf8.RuneError {
  58. buf.WriteRune(i.Rune[0])
  59. pc, i = i.Out, &p.Inst[i.Out]
  60. }
  61. if i.Op == syntax.InstEmptyWidth &&
  62. syntax.EmptyOp(i.Arg)&syntax.EmptyEndText != 0 &&
  63. p.Inst[i.Out].Op == syntax.InstMatch {
  64. complete = true
  65. }
  66. return buf.String(), complete, pc
  67. }
  68. // onePassNext selects the next actionable state of the prog, based on the input character.
  69. // It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine.
  70. // One of the alternates may ultimately lead without input to end of line. If the instruction
  71. // is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next.
  72. func onePassNext(i *onePassInst, r rune) uint32 {
  73. next := i.MatchRunePos(r)
  74. if next >= 0 {
  75. return i.Next[next]
  76. }
  77. if i.Op == syntax.InstAltMatch {
  78. return i.Out
  79. }
  80. return 0
  81. }
  82. func iop(i *syntax.Inst) syntax.InstOp {
  83. op := i.Op
  84. switch op {
  85. case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
  86. op = syntax.InstRune
  87. }
  88. return op
  89. }
  90. // Sparse Array implementation is used as a queueOnePass.
  91. type queueOnePass struct {
  92. sparse []uint32
  93. dense []uint32
  94. size, nextIndex uint32
  95. }
  96. func (q *queueOnePass) empty() bool {
  97. return q.nextIndex >= q.size
  98. }
  99. func (q *queueOnePass) next() (n uint32) {
  100. n = q.dense[q.nextIndex]
  101. q.nextIndex++
  102. return
  103. }
  104. func (q *queueOnePass) clear() {
  105. q.size = 0
  106. q.nextIndex = 0
  107. }
  108. func (q *queueOnePass) contains(u uint32) bool {
  109. if u >= uint32(len(q.sparse)) {
  110. return false
  111. }
  112. return q.sparse[u] < q.size && q.dense[q.sparse[u]] == u
  113. }
  114. func (q *queueOnePass) insert(u uint32) {
  115. if !q.contains(u) {
  116. q.insertNew(u)
  117. }
  118. }
  119. func (q *queueOnePass) insertNew(u uint32) {
  120. if u >= uint32(len(q.sparse)) {
  121. return
  122. }
  123. q.sparse[u] = q.size
  124. q.dense[q.size] = u
  125. q.size++
  126. }
  127. func newQueue(size int) (q *queueOnePass) {
  128. return &queueOnePass{
  129. sparse: make([]uint32, size),
  130. dense: make([]uint32, size),
  131. }
  132. }
  133. // mergeRuneSets merges two non-intersecting runesets, and returns the merged result,
  134. // and a NextIp array. The idea is that if a rune matches the OnePassRunes at index
  135. // i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a
  136. // NextIp array with the single element mergeFailed is returned.
  137. // The code assumes that both inputs contain ordered and non-intersecting rune pairs.
  138. const mergeFailed = uint32(0xffffffff)
  139. var (
  140. noRune = []rune{}
  141. noNext = []uint32{mergeFailed}
  142. )
  143. func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) {
  144. leftLen := len(*leftRunes)
  145. rightLen := len(*rightRunes)
  146. if leftLen&0x1 != 0 || rightLen&0x1 != 0 {
  147. panic("mergeRuneSets odd length []rune")
  148. }
  149. var (
  150. lx, rx int
  151. )
  152. merged := make([]rune, 0)
  153. next := make([]uint32, 0)
  154. ok := true
  155. defer func() {
  156. if !ok {
  157. merged = nil
  158. next = nil
  159. }
  160. }()
  161. ix := -1
  162. extend := func(newLow *int, newArray *[]rune, pc uint32) bool {
  163. if ix > 0 && (*newArray)[*newLow] <= merged[ix] {
  164. return false
  165. }
  166. merged = append(merged, (*newArray)[*newLow], (*newArray)[*newLow+1])
  167. *newLow += 2
  168. ix += 2
  169. next = append(next, pc)
  170. return true
  171. }
  172. for lx < leftLen || rx < rightLen {
  173. switch {
  174. case rx >= rightLen:
  175. ok = extend(&lx, leftRunes, leftPC)
  176. case lx >= leftLen:
  177. ok = extend(&rx, rightRunes, rightPC)
  178. case (*rightRunes)[rx] < (*leftRunes)[lx]:
  179. ok = extend(&rx, rightRunes, rightPC)
  180. default:
  181. ok = extend(&lx, leftRunes, leftPC)
  182. }
  183. if !ok {
  184. return noRune, noNext
  185. }
  186. }
  187. return merged, next
  188. }
  189. // cleanupOnePass drops working memory, and restores certain shortcut instructions.
  190. func cleanupOnePass(prog *onePassProg, original *syntax.Prog) {
  191. for ix, instOriginal := range original.Inst {
  192. switch instOriginal.Op {
  193. case syntax.InstAlt, syntax.InstAltMatch, syntax.InstRune:
  194. case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail:
  195. prog.Inst[ix].Next = nil
  196. case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
  197. prog.Inst[ix].Next = nil
  198. prog.Inst[ix] = onePassInst{Inst: instOriginal}
  199. }
  200. }
  201. }
  202. // onePassCopy creates a copy of the original Prog, as we'll be modifying it.
  203. func onePassCopy(prog *syntax.Prog) *onePassProg {
  204. p := &onePassProg{
  205. Start: prog.Start,
  206. NumCap: prog.NumCap,
  207. Inst: make([]onePassInst, len(prog.Inst)),
  208. }
  209. for i, inst := range prog.Inst {
  210. p.Inst[i] = onePassInst{Inst: inst}
  211. }
  212. // rewrites one or more common Prog constructs that enable some otherwise
  213. // non-onepass Progs to be onepass. A:BD (for example) means an InstAlt at
  214. // ip A, that points to ips B & C.
  215. // A:BC + B:DA => A:BC + B:CD
  216. // A:BC + B:DC => A:DC + B:DC
  217. for pc := range p.Inst {
  218. switch p.Inst[pc].Op {
  219. default:
  220. continue
  221. case syntax.InstAlt, syntax.InstAltMatch:
  222. // A:Bx + B:Ay
  223. p_A_Other := &p.Inst[pc].Out
  224. p_A_Alt := &p.Inst[pc].Arg
  225. // make sure a target is another Alt
  226. instAlt := p.Inst[*p_A_Alt]
  227. if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
  228. p_A_Alt, p_A_Other = p_A_Other, p_A_Alt
  229. instAlt = p.Inst[*p_A_Alt]
  230. if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
  231. continue
  232. }
  233. }
  234. instOther := p.Inst[*p_A_Other]
  235. // Analyzing both legs pointing to Alts is for another day
  236. if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch {
  237. // too complicated
  238. continue
  239. }
  240. // simple empty transition loop
  241. // A:BC + B:DA => A:BC + B:DC
  242. p_B_Alt := &p.Inst[*p_A_Alt].Out
  243. p_B_Other := &p.Inst[*p_A_Alt].Arg
  244. patch := false
  245. if instAlt.Out == uint32(pc) {
  246. patch = true
  247. } else if instAlt.Arg == uint32(pc) {
  248. patch = true
  249. p_B_Alt, p_B_Other = p_B_Other, p_B_Alt
  250. }
  251. if patch {
  252. *p_B_Alt = *p_A_Other
  253. }
  254. // empty transition to common target
  255. // A:BC + B:DC => A:DC + B:DC
  256. if *p_A_Other == *p_B_Alt {
  257. *p_A_Alt = *p_B_Other
  258. }
  259. }
  260. }
  261. return p
  262. }
  263. // runeSlice exists to permit sorting the case-folded rune sets.
  264. type runeSlice []rune
  265. func (p runeSlice) Len() int { return len(p) }
  266. func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] }
  267. func (p runeSlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
  268. var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
  269. var anyRune = []rune{0, unicode.MaxRune}
  270. // makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
  271. // the match engine can always tell which branch to take. The routine may modify
  272. // p if it is turned into a onepass Prog. If it isn't possible for this to be a
  273. // onepass Prog, the Prog nil is returned. makeOnePass is recursive
  274. // to the size of the Prog.
  275. func makeOnePass(p *onePassProg) *onePassProg {
  276. // If the machine is very long, it's not worth the time to check if we can use one pass.
  277. if len(p.Inst) >= 1000 {
  278. return nil
  279. }
  280. var (
  281. instQueue = newQueue(len(p.Inst))
  282. visitQueue = newQueue(len(p.Inst))
  283. check func(uint32, []bool) bool
  284. onePassRunes = make([][]rune, len(p.Inst))
  285. )
  286. // check that paths from Alt instructions are unambiguous, and rebuild the new
  287. // program as a onepass program
  288. check = func(pc uint32, m []bool) (ok bool) {
  289. ok = true
  290. inst := &p.Inst[pc]
  291. if visitQueue.contains(pc) {
  292. return
  293. }
  294. visitQueue.insert(pc)
  295. switch inst.Op {
  296. case syntax.InstAlt, syntax.InstAltMatch:
  297. ok = check(inst.Out, m) && check(inst.Arg, m)
  298. // check no-input paths to InstMatch
  299. matchOut := m[inst.Out]
  300. matchArg := m[inst.Arg]
  301. if matchOut && matchArg {
  302. ok = false
  303. break
  304. }
  305. // Match on empty goes in inst.Out
  306. if matchArg {
  307. inst.Out, inst.Arg = inst.Arg, inst.Out
  308. //lint:ignore SA4006 original Go stdlib code
  309. matchOut, matchArg = matchArg, matchOut
  310. }
  311. if matchOut {
  312. m[pc] = true
  313. inst.Op = syntax.InstAltMatch
  314. }
  315. // build a dispatch operator from the two legs of the alt.
  316. onePassRunes[pc], inst.Next = mergeRuneSets(
  317. &onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg)
  318. if len(inst.Next) > 0 && inst.Next[0] == mergeFailed {
  319. ok = false
  320. break
  321. }
  322. case syntax.InstCapture, syntax.InstNop:
  323. ok = check(inst.Out, m)
  324. m[pc] = m[inst.Out]
  325. // pass matching runes back through these no-ops.
  326. onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
  327. inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
  328. for i := range inst.Next {
  329. inst.Next[i] = inst.Out
  330. }
  331. case syntax.InstEmptyWidth:
  332. ok = check(inst.Out, m)
  333. m[pc] = m[inst.Out]
  334. onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
  335. inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
  336. for i := range inst.Next {
  337. inst.Next[i] = inst.Out
  338. }
  339. case syntax.InstMatch, syntax.InstFail:
  340. m[pc] = inst.Op == syntax.InstMatch
  341. case syntax.InstRune:
  342. m[pc] = false
  343. if len(inst.Next) > 0 {
  344. break
  345. }
  346. instQueue.insert(inst.Out)
  347. if len(inst.Rune) == 0 {
  348. onePassRunes[pc] = []rune{}
  349. inst.Next = []uint32{inst.Out}
  350. break
  351. }
  352. runes := make([]rune, 0)
  353. if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
  354. r0 := inst.Rune[0]
  355. runes = append(runes, r0, r0)
  356. for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
  357. runes = append(runes, r1, r1)
  358. }
  359. sort.Sort(runeSlice(runes))
  360. } else {
  361. runes = append(runes, inst.Rune...)
  362. }
  363. onePassRunes[pc] = runes
  364. inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
  365. for i := range inst.Next {
  366. inst.Next[i] = inst.Out
  367. }
  368. inst.Op = syntax.InstRune
  369. case syntax.InstRune1:
  370. m[pc] = false
  371. if len(inst.Next) > 0 {
  372. break
  373. }
  374. instQueue.insert(inst.Out)
  375. runes := []rune{}
  376. // expand case-folded runes
  377. if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
  378. r0 := inst.Rune[0]
  379. runes = append(runes, r0, r0)
  380. for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
  381. runes = append(runes, r1, r1)
  382. }
  383. sort.Sort(runeSlice(runes))
  384. } else {
  385. runes = append(runes, inst.Rune[0], inst.Rune[0])
  386. }
  387. onePassRunes[pc] = runes
  388. inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
  389. for i := range inst.Next {
  390. inst.Next[i] = inst.Out
  391. }
  392. inst.Op = syntax.InstRune
  393. case syntax.InstRuneAny:
  394. m[pc] = false
  395. if len(inst.Next) > 0 {
  396. break
  397. }
  398. instQueue.insert(inst.Out)
  399. onePassRunes[pc] = append([]rune{}, anyRune...)
  400. inst.Next = []uint32{inst.Out}
  401. case syntax.InstRuneAnyNotNL:
  402. m[pc] = false
  403. if len(inst.Next) > 0 {
  404. break
  405. }
  406. instQueue.insert(inst.Out)
  407. onePassRunes[pc] = append([]rune{}, anyRuneNotNL...)
  408. inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
  409. for i := range inst.Next {
  410. inst.Next[i] = inst.Out
  411. }
  412. }
  413. return
  414. }
  415. instQueue.clear()
  416. instQueue.insert(uint32(p.Start))
  417. m := make([]bool, len(p.Inst))
  418. for !instQueue.empty() {
  419. visitQueue.clear()
  420. pc := instQueue.next()
  421. if !check(pc, m) {
  422. p = nil
  423. break
  424. }
  425. }
  426. if p != nil {
  427. for i := range p.Inst {
  428. p.Inst[i].Rune = onePassRunes[i]
  429. }
  430. }
  431. return p
  432. }
  433. // compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog
  434. // can be recharacterized as a one-pass regexp program, or syntax.nil if the
  435. // Prog cannot be converted. For a one pass prog, the fundamental condition that must
  436. // be true is: at any InstAlt, there must be no ambiguity about what branch to take.
  437. func compileOnePass(prog *syntax.Prog) (p *onePassProg) {
  438. if prog.Start == 0 {
  439. return nil
  440. }
  441. // onepass regexp is anchored
  442. if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth ||
  443. syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText {
  444. return nil
  445. }
  446. // every instruction leading to InstMatch must be EmptyEndText
  447. for _, inst := range prog.Inst {
  448. opOut := prog.Inst[inst.Out].Op
  449. switch inst.Op {
  450. default:
  451. if opOut == syntax.InstMatch {
  452. return nil
  453. }
  454. case syntax.InstAlt, syntax.InstAltMatch:
  455. if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch {
  456. return nil
  457. }
  458. case syntax.InstEmptyWidth:
  459. if opOut == syntax.InstMatch {
  460. if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText {
  461. continue
  462. }
  463. return nil
  464. }
  465. }
  466. }
  467. // Creates a slightly optimized copy of the original Prog
  468. // that cleans up some Prog idioms that block valid onepass programs
  469. p = onePassCopy(prog)
  470. // checkAmbiguity on InstAlts, build onepass Prog if possible
  471. p = makeOnePass(p)
  472. if p != nil {
  473. cleanupOnePass(p, prog)
  474. }
  475. return p
  476. }