match.go 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232
  1. // Copyright 2023 The Rec Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package rec // import "modernc.org/rec/lib"
  5. import (
  6. "fmt"
  7. "strconv"
  8. "strings"
  9. "unicode"
  10. "modernc.org/regexp"
  11. "modernc.org/regexp/syntax"
  12. )
  13. func (c *cfg) match(fn string, args []string, utf8Input, strInput bool) error {
  14. if len(args) != 1 {
  15. return fmt.Errorf("expected 1 non-flag argument: a Go-flavored regular expression")
  16. }
  17. expr := args[0]
  18. dfa, err := regexp.NewMatchDFA(expr)
  19. if err != nil {
  20. return err
  21. }
  22. if *c.oDbg {
  23. if _, err := fmt.Fprintf(c.stderr, "%s\nprefix %q\n", dfa, dfa.Prefix()); err != nil {
  24. return err
  25. }
  26. }
  27. c.prolog(false)
  28. startCond := dfa.Cond()
  29. c.w("// %s matches %s, start condition %v\n", fn, pretty(expr), startCond)
  30. rx := c.rxString()
  31. switch {
  32. case strInput:
  33. c.w("func %s%s(s string) bool {\n", rx, fn)
  34. default:
  35. c.w("func %s%s(s []byte) bool {\n", rx, fn)
  36. }
  37. defer c.w("}\n")
  38. if expr == "" {
  39. c.w("\treturn true\n")
  40. return nil
  41. }
  42. if startCond == ^syntax.EmptyOp(0) {
  43. c.w("\treturn false\n")
  44. return nil
  45. }
  46. c.w("\tconst endOfText = %#0x\n", unicode.MaxRune+1)
  47. c.w("\tconst minInputLen = %d\n", dfa.MinInputLen())
  48. if dfa.Prefix() != "" {
  49. switch pr := dfa.PrefixRune(); {
  50. case utf8Input:
  51. pb := rune([]byte(string(pr))[0])
  52. c.w("\tconst prefixByte = %s\n", strconv.QuoteRuneToASCII(pb))
  53. default:
  54. pb := rune(byte(pr))
  55. c.w("\tconst prefixByte = %s\n", strconv.QuoteRuneToASCII(pb))
  56. }
  57. switch {
  58. case strInput:
  59. c.w("\tprefix := %q\n", dfa.Prefix())
  60. default:
  61. c.w("\tprefix := []byte(%q)\n", dfa.Prefix())
  62. }
  63. }
  64. c.w("\tvar pos, pos0, width, width0, width1 int\n")
  65. if dfa.Prefix() != "" {
  66. c.w("\tvar advance int\n")
  67. }
  68. c.w("\tvar r, r1 rune\n")
  69. c.w("\t_ = r\n")
  70. c.w("\t_ = r1\n")
  71. c.w("\t_ = width1\n")
  72. c.w("\tlastRestartPos := -1\n")
  73. switch {
  74. case strInput:
  75. switch {
  76. case utf8Input:
  77. c.w("\tstep := func(pos int) (r rune, n int) { %sif pos < len(s) { c := s[pos]; if c < utf8.RuneSelf { return rune(c), 1 }; return utf8.DecodeRuneInString(s[pos:]) }; return endOfText, 0 }\n", c.lcase)
  78. default:
  79. c.w("\tstep := func(pos int) (r rune, n int) { %sif pos < len(s) { return rune(s[pos]), 1}; return endOfText, 0 }\n", c.lcase)
  80. }
  81. default:
  82. switch {
  83. case utf8Input:
  84. c.w("\tstep := func(pos int) (r rune, n int) { %sif pos < len(s) { c := s[pos]; if c < utf8.RuneSelf { return rune(c), 1 }; return utf8.DecodeRune(s[pos:]) }; return endOfText, 0 }\n", c.lcase)
  85. default:
  86. c.w("\tstep := func(pos int) (r rune, n int) { %sif pos < len(s) { return rune(s[pos]), 1}; return endOfText, 0 }\n", c.lcase)
  87. }
  88. }
  89. c.w("restart:\n")
  90. if *c.oTrc {
  91. c.trc("==== start/restart: pos %d\n", "pos")
  92. }
  93. c.w("\tr, r1 = endOfText, endOfText\n")
  94. c.w("\twidth, width1 = 0, 0\n")
  95. stepf := func(prefix, r, width, pos string) string {
  96. return fmt.Sprintf("%s\t%s, %s = step(%s);", prefix, r, width, pos)
  97. }
  98. step := func(prefix, r, width, pos string) { c.w("%s", stepf(prefix, r, width, pos)) }
  99. step("", "r", "width", "pos")
  100. c.w("\tif r != endOfText {\n")
  101. step("\t", "r1", "width1", "pos+width")
  102. c.w(" }\n")
  103. c.w("\tif pos == lastRestartPos { return false }; lastRestartPos = pos\n")
  104. pc := dfa.StartPC()
  105. switch {
  106. case startCond&syntax.EmptyBeginText != 0:
  107. c.w("\tif pos != 0 { return false }\n")
  108. if dfa.Prefix() != "" {
  109. c.w("\tif r != %s { return false }\n", strconv.QuoteRuneToASCII(dfa.PrefixRune()))
  110. }
  111. case dfa.Prefix() != "":
  112. pc = dfa.PrefixPC()
  113. switch {
  114. case strInput:
  115. c.w("\tfor {\n")
  116. c.w("\t\tif len(s)-pos < minInputLen { return false }\n")
  117. c.w("\t\tif advance = strings.IndexByte(s[pos:], prefixByte); advance < 0 { return false }\n")
  118. c.w("\t\tpos += advance\n")
  119. c.w("\t\tif len(s)-pos < minInputLen { return false }\n")
  120. c.w("\t\tif s[pos:pos+len(prefix)] == prefix { break }\n")
  121. c.w("\t\tpos++\n")
  122. c.w("\t}\n")
  123. default:
  124. c.w("\tfor {\n")
  125. c.w("\t\tif len(s)-pos < minInputLen { return false }\n")
  126. c.w("\t\tif advance = bytes.IndexByte(s[pos:], prefixByte); advance < 0 { return false }\n")
  127. c.w("\t\tpos += advance\n")
  128. c.w("\t\tif len(s)-pos < minInputLen { return false }\n")
  129. c.w("\t\tif bytes.Equal(s[pos:pos+len(prefix)], prefix) { break }\n")
  130. c.w("\t\tpos++\n")
  131. c.w("\t}\n")
  132. }
  133. default:
  134. c.w("\t\tif len(s)-pos < minInputLen { return false }\n")
  135. }
  136. c.w("\tpos0 = pos; width0 = width\n")
  137. if dfa.Prefix() != "" {
  138. c.w("\tpos += len(prefix)\n")
  139. step("", "r", "width", "pos")
  140. step("", "r1", "width1", "pos+width")
  141. c.w("\twidth0 = width\n")
  142. c.w("\n")
  143. }
  144. prog := dfa.Prog()
  145. gotof := func(pc uint32) string {
  146. switch prog[pc] >> regexp.DFARuneBits {
  147. case regexp.DFAOpAccept:
  148. return "return true"
  149. default:
  150. return fmt.Sprintf("goto l%d", pc)
  151. }
  152. }
  153. consumef := func(pc uint32) string {
  154. s := gotof(pc)
  155. if strings.Contains(s, "return") {
  156. return s
  157. }
  158. return fmt.Sprintf("pos += width; if r, width = r1, width1; r != endOfText { %s }; %s;", stepf("", "r1", "width1", "pos+width"), s)
  159. }
  160. c.w("goto l%d\n", pc)
  161. pc = dfa.StartPC()
  162. for pc < len(prog) {
  163. c.w("goto l%d; l%[1]d:\n", pc)
  164. if *c.oTrc {
  165. c.trc("---- new state\n", "")
  166. }
  167. sym := prog[pc]
  168. pc0 := pc
  169. pc++
  170. if *c.oTrc {
  171. c.trc("%04d: %#U, pos %d, r %#U, %d, r1 %#U, %d, pos0 %d, width0 %d\n", fmt.Sprintf("%d, %d, pos, r, width, r1, width, pos0, width0", pc0, sym))
  172. }
  173. switch op, ch := sym>>regexp.DFARuneBits, sym&regexp.DFARuneMask; op {
  174. case regexp.DFAOpCharClass:
  175. next := prog[pc]
  176. pc++
  177. cls := dfa.CharClass(int(ch))
  178. out := false
  179. for i := 0; i < len(cls); i += 2 {
  180. switch lo, hi := cls[i], cls[i+1]; {
  181. case lo == hi:
  182. c.w("\tif r == %s { %s }\n", strconv.QuoteRuneToASCII(lo), consumef(next))
  183. default:
  184. out = true
  185. c.w("\tif r < %s { goto l%dout }\n", strconv.QuoteRuneToASCII(lo), pc0)
  186. c.w("\tif r <= %s { %s }\n", strconv.QuoteRuneToASCII(hi), consumef(next))
  187. }
  188. }
  189. if out {
  190. c.w("l%dout:\n", pc0)
  191. }
  192. case regexp.DFAOpStop:
  193. c.w("\tpos = pos0 + width0; goto restart\n")
  194. case regexp.DFAOpRune:
  195. next := prog[pc]
  196. pc++
  197. c.w("\tif r == %s { %s }\n", strconv.QuoteRuneToASCII(rune(ch)), consumef(next))
  198. case regexp.DFAOpAccept:
  199. c.w("\treturn true\n")
  200. case regexp.DFAOpBeginText:
  201. next := prog[pc]
  202. pc++
  203. c.w("\tif pos != pos0 { return false }\n")
  204. c.w("\t%s\n", gotof(next))
  205. case regexp.DFAOpEndText:
  206. next := prog[pc]
  207. pc++
  208. c.w("\tif r == endOfText { %s }\n", gotof(next))
  209. default:
  210. c.w("\tpanic(%q)\n", fmt.Sprintf("%04d: %#U", pc0, sym))
  211. }
  212. }
  213. return nil
  214. }