prog.go 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353
  1. // Copyright 2011 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the GO-LICENSE file.
  4. // Modifications of this file, if any, are
  5. //
  6. // Copyright 2023 The Regexp Authors. All rights reserved.
  7. // Use of this source code is governed by a BSD-style
  8. // license that can be found in the LICENSE file.
  9. package syntax // modernc.org/regexp/syntax
  10. import (
  11. "strconv"
  12. "strings"
  13. "unicode"
  14. "unicode/utf8"
  15. )
  16. // Compiled program.
  17. // May not belong in this package, but convenient for now.
  18. // A Prog is a compiled regular expression program.
  19. type Prog struct {
  20. Inst []Inst
  21. Start int // index of start instruction
  22. NumCap int // number of InstCapture insts in re
  23. }
  24. // An InstOp is an instruction opcode.
  25. type InstOp uint8
  26. const (
  27. InstAlt InstOp = iota
  28. InstAltMatch
  29. InstCapture
  30. InstEmptyWidth
  31. InstMatch
  32. InstFail
  33. InstNop
  34. InstRune
  35. InstRune1
  36. InstRuneAny
  37. InstRuneAnyNotNL
  38. )
  39. var instOpNames = []string{
  40. "InstAlt",
  41. "InstAltMatch",
  42. "InstCapture",
  43. "InstEmptyWidth",
  44. "InstMatch",
  45. "InstFail",
  46. "InstNop",
  47. "InstRune",
  48. "InstRune1",
  49. "InstRuneAny",
  50. "InstRuneAnyNotNL",
  51. }
  52. func (i InstOp) String() string {
  53. if uint(i) >= uint(len(instOpNames)) {
  54. return ""
  55. }
  56. return instOpNames[i]
  57. }
  58. // An EmptyOp specifies a kind or mixture of zero-width assertions.
  59. type EmptyOp uint8
  60. const (
  61. EmptyBeginLine EmptyOp = 1 << iota
  62. EmptyEndLine
  63. EmptyBeginText
  64. EmptyEndText
  65. EmptyWordBoundary
  66. EmptyNoWordBoundary
  67. )
  68. // EmptyOpContext returns the zero-width assertions
  69. // satisfied at the position between the runes r1 and r2.
  70. // Passing r1 == -1 indicates that the position is
  71. // at the beginning of the text.
  72. // Passing r2 == -1 indicates that the position is
  73. // at the end of the text.
  74. func EmptyOpContext(r1, r2 rune) EmptyOp {
  75. var op EmptyOp = EmptyNoWordBoundary
  76. var boundary byte
  77. switch {
  78. case IsWordChar(r1):
  79. boundary = 1
  80. case r1 == '\n':
  81. op |= EmptyBeginLine
  82. case r1 < 0:
  83. op |= EmptyBeginText | EmptyBeginLine
  84. }
  85. switch {
  86. case IsWordChar(r2):
  87. boundary ^= 1
  88. case r2 == '\n':
  89. op |= EmptyEndLine
  90. case r2 < 0:
  91. op |= EmptyEndText | EmptyEndLine
  92. }
  93. if boundary != 0 { // IsWordChar(r1) != IsWordChar(r2)
  94. op ^= (EmptyWordBoundary | EmptyNoWordBoundary)
  95. }
  96. return op
  97. }
  98. // IsWordChar reports whether r is considered a “word character”
  99. // during the evaluation of the \b and \B zero-width assertions.
  100. // These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
  101. func IsWordChar(r rune) bool {
  102. return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_'
  103. }
  104. // An Inst is a single instruction in a regular expression program.
  105. type Inst struct {
  106. Op InstOp
  107. Out uint32 // all but InstMatch, InstFail
  108. Arg uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth
  109. Rune []rune
  110. }
  111. func (p *Prog) String() string {
  112. var b strings.Builder
  113. dumpProg(&b, p)
  114. return b.String()
  115. }
  116. // skipNop follows any no-op or capturing instructions.
  117. func (p *Prog) skipNop(pc uint32) *Inst {
  118. i := &p.Inst[pc]
  119. for i.Op == InstNop || i.Op == InstCapture {
  120. i = &p.Inst[i.Out]
  121. }
  122. return i
  123. }
  124. // op returns i.Op but merges all the Rune special cases into InstRune
  125. func (i *Inst) op() InstOp {
  126. op := i.Op
  127. switch op {
  128. case InstRune1, InstRuneAny, InstRuneAnyNotNL:
  129. op = InstRune
  130. }
  131. return op
  132. }
  133. // Prefix returns a literal string that all matches for the
  134. // regexp must start with. Complete is true if the prefix
  135. // is the entire match.
  136. func (p *Prog) Prefix() (prefix string, complete bool) {
  137. i := p.skipNop(uint32(p.Start))
  138. // Avoid allocation of buffer if prefix is empty.
  139. if i.op() != InstRune || len(i.Rune) != 1 {
  140. return "", i.Op == InstMatch
  141. }
  142. // Have prefix; gather characters.
  143. var buf strings.Builder
  144. for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 && i.Rune[0] != utf8.RuneError {
  145. buf.WriteRune(i.Rune[0])
  146. i = p.skipNop(i.Out)
  147. }
  148. return buf.String(), i.Op == InstMatch
  149. }
  150. // StartCond returns the leading empty-width conditions that must
  151. // be true in any match. It returns ^EmptyOp(0) if no matches are possible.
  152. func (p *Prog) StartCond() EmptyOp {
  153. var flag EmptyOp
  154. pc := uint32(p.Start)
  155. i := &p.Inst[pc]
  156. Loop:
  157. for {
  158. switch i.Op {
  159. case InstEmptyWidth:
  160. flag |= EmptyOp(i.Arg)
  161. case InstFail:
  162. return ^EmptyOp(0)
  163. case InstCapture, InstNop:
  164. // skip
  165. default:
  166. break Loop
  167. }
  168. pc = i.Out
  169. i = &p.Inst[pc]
  170. }
  171. return flag
  172. }
  173. const noMatch = -1
  174. // MatchRune reports whether the instruction matches (and consumes) r.
  175. // It should only be called when i.Op == InstRune.
  176. func (i *Inst) MatchRune(r rune) bool {
  177. return i.MatchRunePos(r) != noMatch
  178. }
  179. // MatchRunePos checks whether the instruction matches (and consumes) r.
  180. // If so, MatchRunePos returns the index of the matching rune pair
  181. // (or, when len(i.Rune) == 1, rune singleton).
  182. // If not, MatchRunePos returns -1.
  183. // MatchRunePos should only be called when i.Op == InstRune.
  184. func (i *Inst) MatchRunePos(r rune) int {
  185. rune := i.Rune
  186. switch len(rune) {
  187. case 0:
  188. return noMatch
  189. case 1:
  190. // Special case: single-rune slice is from literal string, not char class.
  191. r0 := rune[0]
  192. if r == r0 {
  193. return 0
  194. }
  195. if Flags(i.Arg)&FoldCase != 0 {
  196. for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
  197. if r == r1 {
  198. return 0
  199. }
  200. }
  201. }
  202. return noMatch
  203. case 2:
  204. if r >= rune[0] && r <= rune[1] {
  205. return 0
  206. }
  207. return noMatch
  208. case 4, 6, 8:
  209. // Linear search for a few pairs.
  210. // Should handle ASCII well.
  211. for j := 0; j < len(rune); j += 2 {
  212. if r < rune[j] {
  213. return noMatch
  214. }
  215. if r <= rune[j+1] {
  216. return j / 2
  217. }
  218. }
  219. return noMatch
  220. }
  221. // Otherwise binary search.
  222. lo := 0
  223. hi := len(rune) / 2
  224. for lo < hi {
  225. m := lo + (hi-lo)/2
  226. if c := rune[2*m]; c <= r {
  227. if r <= rune[2*m+1] {
  228. return m
  229. }
  230. lo = m + 1
  231. } else {
  232. hi = m
  233. }
  234. }
  235. return noMatch
  236. }
  237. // MatchEmptyWidth reports whether the instruction matches
  238. // an empty string between the runes before and after.
  239. // It should only be called when i.Op == InstEmptyWidth.
  240. func (i *Inst) MatchEmptyWidth(before rune, after rune) bool {
  241. switch EmptyOp(i.Arg) {
  242. case EmptyBeginLine:
  243. return before == '\n' || before == -1
  244. case EmptyEndLine:
  245. return after == '\n' || after == -1
  246. case EmptyBeginText:
  247. return before == -1
  248. case EmptyEndText:
  249. return after == -1
  250. case EmptyWordBoundary:
  251. return IsWordChar(before) != IsWordChar(after)
  252. case EmptyNoWordBoundary:
  253. return IsWordChar(before) == IsWordChar(after)
  254. }
  255. panic("unknown empty width arg")
  256. }
  257. func (i *Inst) String() string {
  258. var b strings.Builder
  259. dumpInst(&b, i)
  260. return b.String()
  261. }
  262. func bw(b *strings.Builder, args ...string) {
  263. for _, s := range args {
  264. b.WriteString(s)
  265. }
  266. }
  267. func dumpProg(b *strings.Builder, p *Prog) {
  268. for j := range p.Inst {
  269. i := &p.Inst[j]
  270. pc := strconv.Itoa(j)
  271. if len(pc) < 3 {
  272. b.WriteString(" "[len(pc):])
  273. }
  274. if j == p.Start {
  275. pc += "*"
  276. }
  277. bw(b, pc, "\t")
  278. dumpInst(b, i)
  279. bw(b, "\n")
  280. }
  281. }
  282. func u32(i uint32) string {
  283. return strconv.FormatUint(uint64(i), 10)
  284. }
  285. func dumpInst(b *strings.Builder, i *Inst) {
  286. switch i.Op {
  287. case InstAlt:
  288. bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg))
  289. case InstAltMatch:
  290. bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg))
  291. case InstCapture:
  292. bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out))
  293. case InstEmptyWidth:
  294. bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out))
  295. case InstMatch:
  296. bw(b, "match")
  297. case InstFail:
  298. bw(b, "fail")
  299. case InstNop:
  300. bw(b, "nop -> ", u32(i.Out))
  301. case InstRune:
  302. if i.Rune == nil {
  303. // shouldn't happen
  304. bw(b, "rune <nil>")
  305. }
  306. bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune)))
  307. if Flags(i.Arg)&FoldCase != 0 {
  308. bw(b, "/i")
  309. }
  310. bw(b, " -> ", u32(i.Out))
  311. case InstRune1:
  312. bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out))
  313. case InstRuneAny:
  314. bw(b, "any -> ", u32(i.Out))
  315. case InstRuneAnyNotNL:
  316. bw(b, "anynotnl -> ", u32(i.Out))
  317. }
  318. }