lex.go 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172
  1. // Copyright 2023 The Rec Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package rec // import "modernc.org/rec/lib"
  5. import (
  6. "fmt"
  7. "strconv"
  8. "unicode"
  9. "modernc.org/regexp"
  10. )
  11. func (c *cfg) lex(fn string, args []string, utf8Input, strInput bool) error {
  12. if len(args) == 0 {
  13. return fmt.Errorf("expected 1 or more non-flag argument: a Go-flavored regular expression")
  14. }
  15. dfa, err := regexp.NewLexDFA(args)
  16. if err != nil {
  17. return err
  18. }
  19. if *c.oDbg {
  20. if _, err := fmt.Fprintf(c.stderr, "%s", dfa); err != nil {
  21. return err
  22. }
  23. }
  24. m := map[int]string{}
  25. for id := 0; id < dfa.CharClasses(); id++ {
  26. cls := dfa.CharClass(id)
  27. if s := tables.find(cls); s != "" {
  28. m[id] = s
  29. }
  30. }
  31. c.prolog(true)
  32. kind := "ASCII"
  33. if utf8Input {
  34. kind = "UTF-8"
  35. }
  36. c.w(`// %s recognizes longest %s lexemes. Lower IDs take precedence on same length.
  37. //
  38. `, fn, kind)
  39. for i, v := range args {
  40. c.w("//\tid %3d: %s\n", i, pretty(v))
  41. }
  42. c.w("//\n// ID == -1 is returned when no lexeme was recognized.\n")
  43. rx := c.rxString()
  44. switch {
  45. case strInput:
  46. c.w("func %s%s(s string) (id, length int) {\n", rx, fn)
  47. default:
  48. c.w("func %s%s(s []byte) (id, length int) {\n", rx, fn)
  49. }
  50. defer c.w("}\n")
  51. c.w("\tconst endOfText = %#0x\n", unicode.MaxRune+1)
  52. c.w("\tvar pos, pos0, width, width1 int\n")
  53. c.w("\tid = -1\n")
  54. c.w("\tvar r, r1 rune\n")
  55. c.w("\t_ = pos0\n")
  56. c.w("\t_ = r\n")
  57. c.w("\t_ = r1\n")
  58. c.w("\t_ = width1\n")
  59. switch {
  60. case strInput:
  61. switch {
  62. case utf8Input:
  63. c.w("\tstep := func(pos int) (r rune, n int) { %sif pos < len(s) { c := s[pos]; if c < utf8.RuneSelf { return rune(c), 1 }; return utf8.DecodeRuneInString(s[pos:]) }; return endOfText, 0 }\n", c.lcase)
  64. default:
  65. c.w("\tstep := func(pos int) (r rune, n int) { %sif pos < len(s) { return rune(s[pos]), 1}; return endOfText, 0 }\n", c.lcase)
  66. }
  67. default:
  68. switch {
  69. case utf8Input:
  70. c.w("\tstep := func(pos int) (r rune, n int) { %sif pos < len(s) { c := s[pos]; if c < utf8.RuneSelf { return rune(c), 1 }; return utf8.DecodeRune(s[pos:]) }; return endOfText, 0 }\n", c.lcase)
  71. default:
  72. c.w("\tstep := func(pos int) (r rune, n int) { %sif pos < len(s) { return rune(s[pos]), 1}; return endOfText, 0 }\n", c.lcase)
  73. }
  74. }
  75. stepf := func(prefix, r, width, pos string) string {
  76. return fmt.Sprintf("%s\t%s, %s = step(%s);", prefix, r, width, pos)
  77. }
  78. c.w("\tmove := func() { pos += width; if r, width = r1, width1; r != endOfText { %s }; }\n", stepf("", "r1", "width1", "pos+width"))
  79. c.w("\taccept := func(x rune) bool { if r == x { move(); return true }; return false }\n_ = accept\n")
  80. c.w("\taccept2 := func(x rune) bool { if r <= x { move(); return true }; return false }\n_ = accept2\n")
  81. if *c.oTrc {
  82. c.trc("==== start: pos %d\n", "pos")
  83. }
  84. c.w("\tr, r1 = endOfText, endOfText\n")
  85. c.w("\twidth, width1 = 0, 0\n")
  86. step := func(prefix, r, width, pos string) { c.w("%s", stepf(prefix, r, width, pos)) }
  87. step("", "r", "width", "pos")
  88. c.w("\tif r != endOfText {\n")
  89. step("\t", "r1", "width1", "pos+width")
  90. c.w(" }\n")
  91. prog := dfa.Prog()
  92. gotof := func(pc uint32) string { return fmt.Sprintf("goto l%d", pc) }
  93. consumef := func(pc uint32) string { return fmt.Sprintf("move(); %s;", gotof(pc)) }
  94. for pc := dfa.StartPC(); pc < len(prog); {
  95. if pc != dfa.StartPC() {
  96. c.w("goto l%d\nl%[1]d:\n", pc)
  97. }
  98. if *c.oTrc {
  99. c.trc("---- new state\n", "")
  100. }
  101. sym := prog[pc]
  102. pc0 := pc
  103. pc++
  104. if *c.oTrc {
  105. c.trc("%04d: %#U, pos %d, r %#U, %d, r1 %#U, %d, pos0 %d\n", fmt.Sprintf("%d, %d, pos, r, width, r1, width, pos0", pc0, sym))
  106. }
  107. switch op, ch := sym>>regexp.DFARuneBits, sym&regexp.DFARuneMask; op {
  108. case regexp.DFAOpCharClass:
  109. next := prog[pc]
  110. pc++
  111. cls := dfa.CharClass(int(ch))
  112. switch nm := m[int(ch)]; {
  113. case nm != "":
  114. c.w("\tif unicode.Is(unicode.%s, r) { %s }\n", nm, consumef(next))
  115. default:
  116. for i := 0; i < len(cls); i += 2 {
  117. switch lo, hi := cls[i], cls[i+1]; {
  118. case lo == hi:
  119. c.w("\tif accept(%s) { %s }\n", strconv.QuoteRuneToASCII(lo), gotof(next))
  120. default:
  121. if lo > 0 {
  122. c.w("\tif r < %s { goto l%dout }\n", strconv.QuoteRuneToASCII(lo), pc0)
  123. }
  124. c.w("\tif accept2(%s) { %s }\n", strconv.QuoteRuneToASCII(hi), gotof(next))
  125. }
  126. }
  127. c.w("l%dout:\n", pc0)
  128. }
  129. case regexp.DFAOpStop:
  130. c.w("\treturn id, length\n")
  131. case regexp.DFAOpRune:
  132. next := prog[pc]
  133. pc++
  134. c.w("\tif accept(%s) { %s }\n", strconv.QuoteRuneToASCII(rune(ch)), gotof(next))
  135. case regexp.DFAOpAccept:
  136. if *c.oTrc {
  137. c.trc("accept %d\n", fmt.Sprint(ch))
  138. }
  139. c.w("\tid, length = %[1]d, pos\n", ch)
  140. if *c.oTrc {
  141. c.trc("id %d, length %d\n", "id, length")
  142. }
  143. case regexp.DFAOpBeginText:
  144. next := prog[pc]
  145. pc++
  146. c.w("\tif pos != pos0 { return -1, 0 }\n")
  147. c.w("\t%s\n", gotof(next))
  148. case regexp.DFAOpEndText:
  149. next := prog[pc]
  150. pc++
  151. c.w("\tif r == endOfText { %s }\n", gotof(next))
  152. default:
  153. c.w("\tpanic(%q)\n", fmt.Sprintf("%04d: %#U", pc0, sym))
  154. }
  155. }
  156. return nil
  157. }