compile.go 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302
  1. // Copyright 2011 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the GO-LICENSE file.
  4. // Modifications of this file, if any, are
  5. //
  6. // Copyright 2023 The Regexp Authors. All rights reserved.
  7. // Use of this source code is governed by a BSD-style
  8. // license that can be found in the LICENSE file.
  9. package syntax // modernc.org/regexp/syntax
  10. import "unicode"
  11. // A patchList is a list of instruction pointers that need to be filled in (patched).
  12. // Because the pointers haven't been filled in yet, we can reuse their storage
  13. // to hold the list. It's kind of sleazy, but works well in practice.
  14. // See https://swtch.com/~rsc/regexp/regexp1.html for inspiration.
  15. //
  16. // These aren't really pointers: they're integers, so we can reinterpret them
  17. // this way without using package unsafe. A value l.head denotes
  18. // p.inst[l.head>>1].Out (l.head&1==0) or .Arg (l.head&1==1).
  19. // head == 0 denotes the empty list, okay because we start every program
  20. // with a fail instruction, so we'll never want to point at its output link.
  21. type patchList struct {
  22. head, tail uint32
  23. }
  24. func makePatchList(n uint32) patchList {
  25. return patchList{n, n}
  26. }
  27. func (l patchList) patch(p *Prog, val uint32) {
  28. head := l.head
  29. for head != 0 {
  30. i := &p.Inst[head>>1]
  31. if head&1 == 0 {
  32. head = i.Out
  33. i.Out = val
  34. } else {
  35. head = i.Arg
  36. i.Arg = val
  37. }
  38. }
  39. }
  40. func (l1 patchList) append(p *Prog, l2 patchList) patchList {
  41. if l1.head == 0 {
  42. return l2
  43. }
  44. if l2.head == 0 {
  45. return l1
  46. }
  47. i := &p.Inst[l1.tail>>1]
  48. if l1.tail&1 == 0 {
  49. i.Out = l2.head
  50. } else {
  51. i.Arg = l2.head
  52. }
  53. return patchList{l1.head, l2.tail}
  54. }
  55. // A frag represents a compiled program fragment.
  56. type frag struct {
  57. i uint32 // index of first instruction
  58. out patchList // where to record end instruction
  59. nullable bool // whether fragment can match empty string
  60. }
  61. type compiler struct {
  62. p *Prog
  63. }
  64. // Compile compiles the regexp into a program to be executed.
  65. // The regexp should have been simplified already (returned from re.Simplify).
  66. func Compile(re *Regexp) (*Prog, error) {
  67. var c compiler
  68. c.init()
  69. f := c.compile(re)
  70. f.out.patch(c.p, c.inst(InstMatch).i)
  71. c.p.Start = int(f.i)
  72. return c.p, nil
  73. }
  74. func (c *compiler) init() {
  75. c.p = new(Prog)
  76. c.p.NumCap = 2 // implicit ( and ) for whole match $0
  77. c.inst(InstFail)
  78. }
  79. var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
  80. var anyRune = []rune{0, unicode.MaxRune}
  81. func (c *compiler) compile(re *Regexp) frag {
  82. switch re.Op {
  83. case OpNoMatch:
  84. return c.fail()
  85. case OpEmptyMatch:
  86. return c.nop()
  87. case OpLiteral:
  88. if len(re.Rune) == 0 {
  89. return c.nop()
  90. }
  91. var f frag
  92. for j := range re.Rune {
  93. f1 := c.rune(re.Rune[j:j+1], re.Flags)
  94. if j == 0 {
  95. f = f1
  96. } else {
  97. f = c.cat(f, f1)
  98. }
  99. }
  100. return f
  101. case OpCharClass:
  102. return c.rune(re.Rune, re.Flags)
  103. case OpAnyCharNotNL:
  104. return c.rune(anyRuneNotNL, 0)
  105. case OpAnyChar:
  106. return c.rune(anyRune, 0)
  107. case OpBeginLine:
  108. return c.empty(EmptyBeginLine)
  109. case OpEndLine:
  110. return c.empty(EmptyEndLine)
  111. case OpBeginText:
  112. return c.empty(EmptyBeginText)
  113. case OpEndText:
  114. return c.empty(EmptyEndText)
  115. case OpWordBoundary:
  116. return c.empty(EmptyWordBoundary)
  117. case OpNoWordBoundary:
  118. return c.empty(EmptyNoWordBoundary)
  119. case OpCapture:
  120. bra := c.cap(uint32(re.Cap << 1))
  121. sub := c.compile(re.Sub[0])
  122. ket := c.cap(uint32(re.Cap<<1 | 1))
  123. return c.cat(c.cat(bra, sub), ket)
  124. case OpStar:
  125. return c.star(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
  126. case OpPlus:
  127. return c.plus(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
  128. case OpQuest:
  129. return c.quest(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
  130. case OpConcat:
  131. if len(re.Sub) == 0 {
  132. return c.nop()
  133. }
  134. var f frag
  135. for i, sub := range re.Sub {
  136. if i == 0 {
  137. f = c.compile(sub)
  138. } else {
  139. f = c.cat(f, c.compile(sub))
  140. }
  141. }
  142. return f
  143. case OpAlternate:
  144. var f frag
  145. for _, sub := range re.Sub {
  146. f = c.alt(f, c.compile(sub))
  147. }
  148. return f
  149. }
  150. panic("regexp: unhandled case in compile")
  151. }
  152. func (c *compiler) inst(op InstOp) frag {
  153. // TODO: impose length limit
  154. f := frag{i: uint32(len(c.p.Inst)), nullable: true}
  155. c.p.Inst = append(c.p.Inst, Inst{Op: op})
  156. return f
  157. }
  158. func (c *compiler) nop() frag {
  159. f := c.inst(InstNop)
  160. f.out = makePatchList(f.i << 1)
  161. return f
  162. }
  163. func (c *compiler) fail() frag {
  164. return frag{}
  165. }
  166. func (c *compiler) cap(arg uint32) frag {
  167. f := c.inst(InstCapture)
  168. f.out = makePatchList(f.i << 1)
  169. c.p.Inst[f.i].Arg = arg
  170. if c.p.NumCap < int(arg)+1 {
  171. c.p.NumCap = int(arg) + 1
  172. }
  173. return f
  174. }
  175. func (c *compiler) cat(f1, f2 frag) frag {
  176. // concat of failure is failure
  177. if f1.i == 0 || f2.i == 0 {
  178. return frag{}
  179. }
  180. // TODO: elide nop
  181. f1.out.patch(c.p, f2.i)
  182. return frag{f1.i, f2.out, f1.nullable && f2.nullable}
  183. }
  184. func (c *compiler) alt(f1, f2 frag) frag {
  185. // alt of failure is other
  186. if f1.i == 0 {
  187. return f2
  188. }
  189. if f2.i == 0 {
  190. return f1
  191. }
  192. f := c.inst(InstAlt)
  193. i := &c.p.Inst[f.i]
  194. i.Out = f1.i
  195. i.Arg = f2.i
  196. f.out = f1.out.append(c.p, f2.out)
  197. f.nullable = f1.nullable || f2.nullable
  198. return f
  199. }
  200. func (c *compiler) quest(f1 frag, nongreedy bool) frag {
  201. f := c.inst(InstAlt)
  202. i := &c.p.Inst[f.i]
  203. if nongreedy {
  204. i.Arg = f1.i
  205. f.out = makePatchList(f.i << 1)
  206. } else {
  207. i.Out = f1.i
  208. f.out = makePatchList(f.i<<1 | 1)
  209. }
  210. f.out = f.out.append(c.p, f1.out)
  211. return f
  212. }
  213. // loop returns the fragment for the main loop of a plus or star.
  214. // For plus, it can be used after changing the entry to f1.i.
  215. // For star, it can be used directly when f1 can't match an empty string.
  216. // (When f1 can match an empty string, f1* must be implemented as (f1+)?
  217. // to get the priority match order correct.)
  218. func (c *compiler) loop(f1 frag, nongreedy bool) frag {
  219. f := c.inst(InstAlt)
  220. i := &c.p.Inst[f.i]
  221. if nongreedy {
  222. i.Arg = f1.i
  223. f.out = makePatchList(f.i << 1)
  224. } else {
  225. i.Out = f1.i
  226. f.out = makePatchList(f.i<<1 | 1)
  227. }
  228. f1.out.patch(c.p, f.i)
  229. return f
  230. }
  231. func (c *compiler) star(f1 frag, nongreedy bool) frag {
  232. if f1.nullable {
  233. // Use (f1+)? to get priority match order correct.
  234. // See golang.org/issue/46123.
  235. return c.quest(c.plus(f1, nongreedy), nongreedy)
  236. }
  237. return c.loop(f1, nongreedy)
  238. }
  239. func (c *compiler) plus(f1 frag, nongreedy bool) frag {
  240. return frag{f1.i, c.loop(f1, nongreedy).out, f1.nullable}
  241. }
  242. func (c *compiler) empty(op EmptyOp) frag {
  243. f := c.inst(InstEmptyWidth)
  244. c.p.Inst[f.i].Arg = uint32(op)
  245. f.out = makePatchList(f.i << 1)
  246. return f
  247. }
  248. func (c *compiler) rune(r []rune, flags Flags) frag {
  249. f := c.inst(InstRune)
  250. f.nullable = false
  251. i := &c.p.Inst[f.i]
  252. i.Rune = r
  253. flags &= FoldCase // only relevant flag is FoldCase
  254. if len(r) != 1 || unicode.SimpleFold(r[0]) == r[0] {
  255. // and sometimes not even that
  256. flags &^= FoldCase
  257. }
  258. i.Arg = uint32(flags)
  259. f.out = makePatchList(f.i << 1)
  260. // Special cases for exec machine.
  261. switch {
  262. case flags&FoldCase == 0 && (len(r) == 1 || len(r) == 2 && r[0] == r[1]):
  263. i.Op = InstRune1
  264. case len(r) == 2 && r[0] == 0 && r[1] == unicode.MaxRune:
  265. i.Op = InstRuneAny
  266. case len(r) == 4 && r[0] == 0 && r[1] == '\n'-1 && r[2] == '\n'+1 && r[3] == unicode.MaxRune:
  267. i.Op = InstRuneAnyNotNL
  268. }
  269. return f
  270. }