exec.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565
  1. // Copyright 2011 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the GO-LICENSE file.
  4. // Modifications of this file, if any, are
  5. //
  6. // Copyright 2023 The Regexp Authors. All rights reserved.
  7. // Use of this source code is governed by a BSD-style
  8. // license that can be found in the LICENSE file.
  9. package regexp // modernc.org/regexp
  10. import (
  11. "io"
  12. "sync"
  13. "modernc.org/regexp/syntax"
  14. )
  15. // A queue is a 'sparse array' holding pending threads of execution.
  16. // See https://research.swtch.com/2008/03/using-uninitialized-memory-for-fun-and.html
  17. type queue struct {
  18. sparse []uint32
  19. dense []entry
  20. }
  21. // An entry is an entry on a queue.
  22. // It holds both the instruction pc and the actual thread.
  23. // Some queue entries are just place holders so that the machine
  24. // knows it has considered that pc. Such entries have t == nil.
  25. type entry struct {
  26. pc uint32
  27. t *thread
  28. }
  29. // A thread is the state of a single path through the machine:
  30. // an instruction and a corresponding capture array.
  31. // See https://swtch.com/~rsc/regexp/regexp2.html
  32. type thread struct {
  33. inst *syntax.Inst
  34. cap []int
  35. }
  36. // A machine holds all the state during an NFA simulation for p.
  37. type machine struct {
  38. re *Regexp // corresponding Regexp
  39. p *syntax.Prog // compiled program
  40. q0, q1 queue // two queues for runq, nextq
  41. pool []*thread // pool of available threads
  42. matched bool // whether a match was found
  43. matchcap []int // capture information for the match
  44. inputs inputs
  45. }
  46. type inputs struct {
  47. // cached inputs, to avoid allocation
  48. bytes inputBytes
  49. string inputString
  50. reader inputReader
  51. }
  52. func (i *inputs) newBytes(b []byte) input {
  53. i.bytes.str = b
  54. return &i.bytes
  55. }
  56. func (i *inputs) newString(s string) input {
  57. i.string.str = s
  58. return &i.string
  59. }
  60. func (i *inputs) newReader(r io.RuneReader) input {
  61. i.reader.r = r
  62. i.reader.atEOT = false
  63. i.reader.pos = 0
  64. return &i.reader
  65. }
  66. func (i *inputs) clear() {
  67. // We need to clear 1 of these.
  68. // Avoid the expense of clearing the others (pointer write barrier).
  69. if i.bytes.str != nil {
  70. i.bytes.str = nil
  71. } else if i.reader.r != nil {
  72. i.reader.r = nil
  73. } else {
  74. i.string.str = ""
  75. }
  76. }
  77. func (i *inputs) init(r io.RuneReader, b []byte, s string) (input, int) {
  78. if r != nil {
  79. return i.newReader(r), 0
  80. }
  81. if b != nil {
  82. return i.newBytes(b), len(b)
  83. }
  84. return i.newString(s), len(s)
  85. }
  86. func (m *machine) init(ncap int) {
  87. for _, t := range m.pool {
  88. t.cap = t.cap[:ncap]
  89. }
  90. m.matchcap = m.matchcap[:ncap]
  91. }
  92. // alloc allocates a new thread with the given instruction.
  93. // It uses the free pool if possible.
  94. func (m *machine) alloc(i *syntax.Inst) *thread {
  95. var t *thread
  96. if n := len(m.pool); n > 0 {
  97. t = m.pool[n-1]
  98. m.pool = m.pool[:n-1]
  99. } else {
  100. t = new(thread)
  101. t.cap = make([]int, len(m.matchcap), cap(m.matchcap))
  102. }
  103. t.inst = i
  104. return t
  105. }
  106. // A lazyFlag is a lazily-evaluated syntax.EmptyOp,
  107. // for checking zero-width flags like ^ $ \A \z \B \b.
  108. // It records the pair of relevant runes and does not
  109. // determine the implied flags until absolutely necessary
  110. // (most of the time, that means never).
  111. type lazyFlag uint64
  112. func newLazyFlag(r1, r2 rune) lazyFlag {
  113. return lazyFlag(uint64(r1)<<32 | uint64(uint32(r2)))
  114. }
  115. func (f lazyFlag) match(op syntax.EmptyOp) bool {
  116. if op == 0 {
  117. return true
  118. }
  119. r1 := rune(f >> 32)
  120. if op&syntax.EmptyBeginLine != 0 {
  121. if r1 != '\n' && r1 >= 0 {
  122. return false
  123. }
  124. op &^= syntax.EmptyBeginLine
  125. }
  126. if op&syntax.EmptyBeginText != 0 {
  127. if r1 >= 0 {
  128. return false
  129. }
  130. op &^= syntax.EmptyBeginText
  131. }
  132. if op == 0 {
  133. return true
  134. }
  135. r2 := rune(f)
  136. if op&syntax.EmptyEndLine != 0 {
  137. if r2 != '\n' && r2 >= 0 {
  138. return false
  139. }
  140. op &^= syntax.EmptyEndLine
  141. }
  142. if op&syntax.EmptyEndText != 0 {
  143. if r2 >= 0 {
  144. return false
  145. }
  146. op &^= syntax.EmptyEndText
  147. }
  148. if op == 0 {
  149. return true
  150. }
  151. if syntax.IsWordChar(r1) != syntax.IsWordChar(r2) {
  152. op &^= syntax.EmptyWordBoundary
  153. } else {
  154. op &^= syntax.EmptyNoWordBoundary
  155. }
  156. return op == 0
  157. }
  158. // match runs the machine over the input starting at pos.
  159. // It reports whether a match was found.
  160. // If so, m.matchcap holds the submatch information.
  161. func (m *machine) match(i input, pos int) bool {
  162. startCond := m.re.cond
  163. if startCond == ^syntax.EmptyOp(0) { // impossible
  164. return false
  165. }
  166. m.matched = false
  167. for i := range m.matchcap {
  168. m.matchcap[i] = -1
  169. }
  170. runq, nextq := &m.q0, &m.q1
  171. r, r1 := endOfText, endOfText
  172. width, width1 := 0, 0
  173. r, width = i.step(pos)
  174. if r != endOfText {
  175. r1, width1 = i.step(pos + width)
  176. }
  177. var flag lazyFlag
  178. if pos == 0 {
  179. flag = newLazyFlag(-1, r)
  180. } else {
  181. flag = i.context(pos)
  182. }
  183. for {
  184. if len(runq.dense) == 0 {
  185. if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
  186. // Anchored match, past beginning of text.
  187. break
  188. }
  189. if m.matched {
  190. // Have match; finished exploring alternatives.
  191. break
  192. }
  193. if len(m.re.prefix) > 0 && r1 != m.re.prefixRune && i.canCheckPrefix() {
  194. // Match requires literal prefix; fast search for it.
  195. advance := i.index(m.re, pos)
  196. if advance < 0 {
  197. break
  198. }
  199. pos += advance
  200. r, width = i.step(pos)
  201. r1, width1 = i.step(pos + width)
  202. }
  203. }
  204. if !m.matched {
  205. if len(m.matchcap) > 0 {
  206. m.matchcap[0] = pos
  207. }
  208. m.add(runq, uint32(m.p.Start), pos, m.matchcap, &flag, nil)
  209. }
  210. flag = newLazyFlag(r, r1)
  211. m.step(runq, nextq, pos, pos+width, r, &flag)
  212. if width == 0 {
  213. break
  214. }
  215. if len(m.matchcap) == 0 && m.matched {
  216. // Found a match and not paying attention
  217. // to where it is, so any match will do.
  218. break
  219. }
  220. pos += width
  221. r, width = r1, width1
  222. if r != endOfText {
  223. r1, width1 = i.step(pos + width)
  224. }
  225. runq, nextq = nextq, runq
  226. }
  227. m.clear(nextq)
  228. return m.matched
  229. }
  230. // clear frees all threads on the thread queue.
  231. func (m *machine) clear(q *queue) {
  232. for _, d := range q.dense {
  233. if d.t != nil {
  234. m.pool = append(m.pool, d.t)
  235. }
  236. }
  237. q.dense = q.dense[:0]
  238. }
  239. // step executes one step of the machine, running each of the threads
  240. // on runq and appending new threads to nextq.
  241. // The step processes the rune c (which may be endOfText),
  242. // which starts at position pos and ends at nextPos.
  243. // nextCond gives the setting for the empty-width flags after c.
  244. func (m *machine) step(runq, nextq *queue, pos, nextPos int, c rune, nextCond *lazyFlag) {
  245. longest := m.re.longest
  246. for j := 0; j < len(runq.dense); j++ {
  247. d := &runq.dense[j]
  248. t := d.t
  249. if t == nil {
  250. continue
  251. }
  252. if longest && m.matched && len(t.cap) > 0 && m.matchcap[0] < t.cap[0] {
  253. m.pool = append(m.pool, t)
  254. continue
  255. }
  256. i := t.inst
  257. add := false
  258. switch i.Op {
  259. default:
  260. panic("bad inst")
  261. case syntax.InstMatch:
  262. if len(t.cap) > 0 && (!longest || !m.matched || m.matchcap[1] < pos) {
  263. t.cap[1] = pos
  264. copy(m.matchcap, t.cap)
  265. }
  266. if !longest {
  267. // First-match mode: cut off all lower-priority threads.
  268. for _, d := range runq.dense[j+1:] {
  269. if d.t != nil {
  270. m.pool = append(m.pool, d.t)
  271. }
  272. }
  273. runq.dense = runq.dense[:0]
  274. }
  275. m.matched = true
  276. case syntax.InstRune:
  277. add = i.MatchRune(c)
  278. case syntax.InstRune1:
  279. add = c == i.Rune[0]
  280. case syntax.InstRuneAny:
  281. add = true
  282. case syntax.InstRuneAnyNotNL:
  283. add = c != '\n'
  284. }
  285. if add {
  286. t = m.add(nextq, i.Out, nextPos, t.cap, nextCond, t)
  287. }
  288. if t != nil {
  289. m.pool = append(m.pool, t)
  290. }
  291. }
  292. runq.dense = runq.dense[:0]
  293. }
  294. // add adds an entry to q for pc, unless the q already has such an entry.
  295. // It also recursively adds an entry for all instructions reachable from pc by following
  296. // empty-width conditions satisfied by cond. pos gives the current position
  297. // in the input.
  298. func (m *machine) add(q *queue, pc uint32, pos int, cap []int, cond *lazyFlag, t *thread) *thread {
  299. Again:
  300. if pc == 0 {
  301. return t
  302. }
  303. if j := q.sparse[pc]; j < uint32(len(q.dense)) && q.dense[j].pc == pc {
  304. return t
  305. }
  306. j := len(q.dense)
  307. q.dense = q.dense[:j+1]
  308. d := &q.dense[j]
  309. d.t = nil
  310. d.pc = pc
  311. q.sparse[pc] = uint32(j)
  312. i := &m.p.Inst[pc]
  313. switch i.Op {
  314. default:
  315. panic("unhandled")
  316. case syntax.InstFail:
  317. // nothing
  318. case syntax.InstAlt, syntax.InstAltMatch:
  319. t = m.add(q, i.Out, pos, cap, cond, t)
  320. pc = i.Arg
  321. goto Again
  322. case syntax.InstEmptyWidth:
  323. if cond.match(syntax.EmptyOp(i.Arg)) {
  324. pc = i.Out
  325. goto Again
  326. }
  327. case syntax.InstNop:
  328. pc = i.Out
  329. goto Again
  330. case syntax.InstCapture:
  331. if int(i.Arg) < len(cap) {
  332. opos := cap[i.Arg]
  333. cap[i.Arg] = pos
  334. m.add(q, i.Out, pos, cap, cond, nil)
  335. cap[i.Arg] = opos
  336. } else {
  337. pc = i.Out
  338. goto Again
  339. }
  340. case syntax.InstMatch, syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
  341. if t == nil {
  342. t = m.alloc(i)
  343. } else {
  344. t.inst = i
  345. }
  346. if len(cap) > 0 && &t.cap[0] != &cap[0] {
  347. copy(t.cap, cap)
  348. }
  349. d.t = t
  350. t = nil
  351. }
  352. return t
  353. }
  354. type onePassMachine struct {
  355. inputs inputs
  356. matchcap []int
  357. }
  358. var onePassPool sync.Pool
  359. func newOnePassMachine() *onePassMachine {
  360. m, ok := onePassPool.Get().(*onePassMachine)
  361. if !ok {
  362. m = new(onePassMachine)
  363. }
  364. return m
  365. }
  366. func freeOnePassMachine(m *onePassMachine) {
  367. m.inputs.clear()
  368. onePassPool.Put(m)
  369. }
  370. // doOnePass implements r.doExecute using the one-pass execution engine.
  371. func (re *Regexp) doOnePass(ir io.RuneReader, ib []byte, is string, pos, ncap int, dstCap []int) []int {
  372. startCond := re.cond
  373. if startCond == ^syntax.EmptyOp(0) { // impossible
  374. return nil
  375. }
  376. m := newOnePassMachine()
  377. if cap(m.matchcap) < ncap {
  378. m.matchcap = make([]int, ncap)
  379. } else {
  380. m.matchcap = m.matchcap[:ncap]
  381. }
  382. matched := false
  383. for i := range m.matchcap {
  384. m.matchcap[i] = -1
  385. }
  386. i, _ := m.inputs.init(ir, ib, is)
  387. r, r1 := endOfText, endOfText
  388. width, width1 := 0, 0
  389. r, width = i.step(pos)
  390. if r != endOfText {
  391. r1, width1 = i.step(pos + width)
  392. }
  393. var flag lazyFlag
  394. if pos == 0 {
  395. flag = newLazyFlag(-1, r)
  396. } else {
  397. flag = i.context(pos)
  398. }
  399. pc := re.onepass.Start
  400. inst := &re.onepass.Inst[pc]
  401. // If there is a simple literal prefix, skip over it.
  402. if pos == 0 && flag.match(syntax.EmptyOp(inst.Arg)) &&
  403. len(re.prefix) > 0 && i.canCheckPrefix() {
  404. // Match requires literal prefix; fast search for it.
  405. if !i.hasPrefix(re) {
  406. goto Return
  407. }
  408. pos += len(re.prefix)
  409. r, width = i.step(pos)
  410. r1, width1 = i.step(pos + width)
  411. flag = i.context(pos)
  412. pc = int(re.prefixEnd)
  413. }
  414. for {
  415. inst = &re.onepass.Inst[pc]
  416. pc = int(inst.Out)
  417. switch inst.Op {
  418. default:
  419. panic("bad inst")
  420. case syntax.InstMatch:
  421. matched = true
  422. if len(m.matchcap) > 0 {
  423. m.matchcap[0] = 0
  424. m.matchcap[1] = pos
  425. }
  426. goto Return
  427. case syntax.InstRune:
  428. if !inst.MatchRune(r) {
  429. goto Return
  430. }
  431. case syntax.InstRune1:
  432. if r != inst.Rune[0] {
  433. goto Return
  434. }
  435. case syntax.InstRuneAny:
  436. // Nothing
  437. case syntax.InstRuneAnyNotNL:
  438. if r == '\n' {
  439. goto Return
  440. }
  441. // peek at the input rune to see which branch of the Alt to take
  442. case syntax.InstAlt, syntax.InstAltMatch:
  443. pc = int(onePassNext(inst, r))
  444. continue
  445. case syntax.InstFail:
  446. goto Return
  447. case syntax.InstNop:
  448. continue
  449. case syntax.InstEmptyWidth:
  450. if !flag.match(syntax.EmptyOp(inst.Arg)) {
  451. goto Return
  452. }
  453. continue
  454. case syntax.InstCapture:
  455. if int(inst.Arg) < len(m.matchcap) {
  456. m.matchcap[inst.Arg] = pos
  457. }
  458. continue
  459. }
  460. if width == 0 {
  461. break
  462. }
  463. flag = newLazyFlag(r, r1)
  464. pos += width
  465. r, width = r1, width1
  466. if r != endOfText {
  467. r1, width1 = i.step(pos + width)
  468. }
  469. }
  470. Return:
  471. if !matched {
  472. freeOnePassMachine(m)
  473. return nil
  474. }
  475. dstCap = append(dstCap, m.matchcap...)
  476. freeOnePassMachine(m)
  477. return dstCap
  478. }
  479. // doMatch reports whether either r, b or s match the regexp.
  480. func (re *Regexp) doMatch(r io.RuneReader, b []byte, s string) bool {
  481. return re.doExecute(r, b, s, 0, 0, nil) != nil
  482. }
  483. // doExecute finds the leftmost match in the input, appends the position
  484. // of its subexpressions to dstCap and returns dstCap.
  485. //
  486. // nil is returned if no matches are found and non-nil if matches are found.
  487. func (re *Regexp) doExecute(r io.RuneReader, b []byte, s string, pos int, ncap int, dstCap []int) []int {
  488. if dstCap == nil {
  489. // Make sure 'return dstCap' is non-nil.
  490. dstCap = arrayNoInts[:0:0]
  491. }
  492. if r == nil && len(b)+len(s) < re.minInputLen {
  493. return nil
  494. }
  495. if r == nil && re.dfa != nil {
  496. return re.doDFA(b, s, pos, ncap, dstCap)
  497. }
  498. if re.onepass != nil {
  499. return re.doOnePass(r, b, s, pos, ncap, dstCap)
  500. }
  501. if r == nil && len(b)+len(s) < re.maxBitStateLen {
  502. return re.backtrack(b, s, pos, ncap, dstCap)
  503. }
  504. m := re.get()
  505. i, _ := m.inputs.init(r, b, s)
  506. m.init(ncap)
  507. if !m.match(i, pos) {
  508. re.put(m)
  509. return nil
  510. }
  511. dstCap = append(dstCap, m.matchcap...)
  512. re.put(m)
  513. return dstCap
  514. }
  515. // arrayNoInts is returned by doExecute match if nil dstCap is passed
  516. // to it with ncap=0.
  517. var arrayNoInts [0]int