svi
/
wartank


			
				
					
						
						
// Copyright 2023 The Rec Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package rec // import "modernc.org/rec/lib"

import (
	"fmt"
	"strconv"
	"unicode"

	"modernc.org/regexp"
)

// lex emits, via c.w, the Go source of a function that recognizes the longest
// lexeme matching any of the regular expressions in args.
//
// fn is the base name of the generated function; it is prefixed with the
// result of c.rxString(). args holds one regular expression per lexeme ID (the
// index in args is the ID reported by the generated function). utf8Input
// selects rune-wise UTF-8 decoding of the input instead of byte-wise stepping,
// and strInput selects a `string` parameter for the generated function instead
// of `[]byte`.
//
// The generated function returns (id, length) and reports id == -1 when no
// lexeme was recognized.
//
// lex returns an error when args is empty, when the DFA cannot be built from
// args, or when writing the debug dump of the DFA to c.stderr fails.
func (c *cfg) lex(fn string, args []string, utf8Input, strInput bool) error {
	if len(args) == 0 {
		return fmt.Errorf("expected 1 or more non-flag argument: a Go-flavored regular expression")
	}

	dfa, err := regexp.NewLexDFA(args)
	if err != nil {
		return err
	}

	if *c.oDbg {
		if _, err := fmt.Fprintf(c.stderr, "%s", dfa); err != nil {
			return err
		}
	}

	// Map character class IDs to the name tables.find reports for them. A
	// non-empty name is later emitted as unicode.<name> in a unicode.Is call.
	m := map[int]string{}
	for id := 0; id < dfa.CharClasses(); id++ {
		cls := dfa.CharClass(id)
		if s := tables.find(cls); s != "" {
			m[id] = s
		}
	}
	// NOTE(review): prolog is defined elsewhere; presumably it writes the
	// generated file's header - confirm.
	c.prolog(true)
	kind := "ASCII"
	if utf8Input {
		kind = "UTF-8"
	}
	c.w(`// %s recognizes longest %s lexemes. Lower IDs take precedence on same length.
//
`, fn, kind)
	for i, v := range args {
		c.w("//\tid %3d: %s\n", i, pretty(v))
	}
	c.w("//\n// ID == -1 is returned when no lexeme was recognized.\n")
	rx := c.rxString()
	switch {
	case strInput:
		c.w("func %s%s(s string) (id, length int) {\n", rx, fn)
	default:
		c.w("func %s%s(s []byte) (id, length int) {\n", rx, fn)
	}

	// The closing brace of the generated function is written on return.
	defer c.w("}\n")

	// endOfText is one past the largest valid rune, so it cannot collide with
	// any input rune.
	c.w("\tconst endOfText = %#0x\n", unicode.MaxRune+1)
	c.w("\tvar pos, pos0, width, width1 int\n")
	c.w("\tid = -1\n")
	c.w("\tvar r, r1 rune\n")
	// Blank assignments keep the generated code compiling when a particular
	// DFA does not reference some of the variables.
	c.w("\t_ = pos0\n")
	c.w("\t_ = r\n")
	c.w("\t_ = r1\n")
	c.w("\t_ = width1\n")
	// Emit the `step` closure: it yields the rune at pos and its width, or
	// (endOfText, 0) past the end of input. c.lcase is spliced in verbatim at
	// the start of the closure body.
	switch {
	case strInput:
		switch {
		case utf8Input:
			c.w("\tstep := func(pos int) (r rune, n int) { %sif pos < len(s) { c := s[pos]; if c < utf8.RuneSelf { return rune(c), 1 }; return utf8.DecodeRuneInString(s[pos:]) }; return endOfText, 0 }\n", c.lcase)
		default:
			c.w("\tstep := func(pos int) (r rune, n int) { %sif pos < len(s) { return rune(s[pos]), 1}; return endOfText, 0 }\n", c.lcase)
		}
	default:
		switch {
		case utf8Input:
			c.w("\tstep := func(pos int) (r rune, n int) { %sif pos < len(s) { c := s[pos]; if c < utf8.RuneSelf { return rune(c), 1 }; return utf8.DecodeRune(s[pos:]) }; return endOfText, 0 }\n", c.lcase)
		default:
			c.w("\tstep := func(pos int) (r rune, n int) { %sif pos < len(s) { return rune(s[pos]), 1}; return endOfText, 0 }\n", c.lcase)
		}
	}

	// stepf renders a call of the generated `step` closure assigning to the
	// named rune/width variables.
	stepf := func(prefix, r, width, pos string) string {
		return fmt.Sprintf("%s\t%s, %s = step(%s);", prefix, r, width, pos)
	}

	// Generated helpers: `move` consumes r and shifts the one-rune lookahead,
	// `accept` consumes r when it equals x, `accept2` consumes r when r <= x.
	c.w("\tmove := func() { pos += width; if r, width = r1, width1; r != endOfText { %s }; }\n", stepf("", "r1", "width1", "pos+width"))
	c.w("\taccept := func(x rune) bool { if r == x { move(); return true }; return false }\n_ = accept\n")
	c.w("\taccept2 := func(x rune) bool { if r <= x { move(); return true }; return false }\n_ = accept2\n")
	if *c.oTrc {
		c.trc("==== start: pos %d\n", "pos")
	}
	c.w("\tr, r1 = endOfText, endOfText\n")
	c.w("\twidth, width1 = 0, 0\n")

	step := func(prefix, r, width, pos string) { c.w("%s", stepf(prefix, r, width, pos)) }

	// Prime r and, unless the input is already exhausted, the lookahead r1.
	step("", "r", "width", "pos")
	c.w("\tif r != endOfText {\n")
	step("\t", "r1", "width1", "pos+width")
	c.w(" }\n")
	prog := dfa.Prog()

	gotof := func(pc uint32) string { return fmt.Sprintf("goto l%d", pc) }
	consumef := func(pc uint32) string { return fmt.Sprintf("move(); %s;", gotof(pc)) }

	// Translate the DFA program. Every instruction starts with a word holding
	// the opcode above DFARuneBits and an operand in the low bits; character
	// class, rune, begin-text and end-text instructions are followed by a
	// second word holding the target pc.
	for pc := dfa.StartPC(); pc < len(prog); {
		if pc != dfa.StartPC() {
			// The explicit goto guarantees the label that follows is used.
			c.w("goto l%d\nl%[1]d:\n", pc)
		}
		if *c.oTrc {
			c.trc("---- new state\n", "")
		}
		sym := prog[pc]
		pc0 := pc
		pc++
		if *c.oTrc {
			// The value printed after r1 is its own width, width1.
			c.trc("%04d: %#U, pos %d, r %#U, %d, r1 %#U, %d, pos0 %d\n", fmt.Sprintf("%d, %d, pos, r, width, r1, width1, pos0", pc0, sym))
		}
		switch op, ch := sym>>regexp.DFARuneBits, sym&regexp.DFARuneMask; op {
		case regexp.DFAOpCharClass:
			next := prog[pc]
			pc++
			cls := dfa.CharClass(int(ch))
			switch nm := m[int(ch)]; {
			case nm != "":
				c.w("\tif unicode.Is(unicode.%s, r) { %s }\n", nm, consumef(next))
			default:
				// cls is read as consecutive (lo, hi) pairs.
				outUsed := false
				for i := 0; i < len(cls); i += 2 {
					switch lo, hi := cls[i], cls[i+1]; {
					case lo == hi:
						c.w("\tif accept(%s) { %s }\n", strconv.QuoteRuneToASCII(lo), gotof(next))
					default:
						if lo > 0 {
							// Below this range: skip the remaining tests of
							// the class.
							c.w("\tif r < %s { goto l%dout }\n", strconv.QuoteRuneToASCII(lo), pc0)
							outUsed = true
						}
						c.w("\tif accept2(%s) { %s }\n", strconv.QuoteRuneToASCII(hi), gotof(next))
					}
				}
				// Go rejects labels that are defined but never used, so the
				// exit label is emitted only when some goto targets it.
				if outUsed {
					c.w("l%dout:\n", pc0)
				}
			}
		case regexp.DFAOpStop:
			c.w("\treturn id, length\n")
		case regexp.DFAOpRune:
			next := prog[pc]
			pc++
			c.w("\tif accept(%s) { %s }\n", strconv.QuoteRuneToASCII(rune(ch)), gotof(next))
		case regexp.DFAOpAccept:
			if *c.oTrc {
				c.trc("accept %d\n", fmt.Sprint(ch))
			}
			// Record the lexeme ID and the length matched so far.
			c.w("\tid, length = %[1]d, pos\n", ch)
			if *c.oTrc {
				c.trc("id %d, length %d\n", "id, length")
			}
		case regexp.DFAOpBeginText:
			next := prog[pc]
			pc++
			c.w("\tif pos != pos0 { return -1, 0 }\n")
			c.w("\t%s\n", gotof(next))
		case regexp.DFAOpEndText:
			next := prog[pc]
			pc++
			c.w("\tif r == endOfText { %s }\n", gotof(next))
		default:
			// Unknown opcode: make the generated code panic with the
			// offending pc and instruction word.
			c.w("\tpanic(%q)\n", fmt.Sprintf("%04d: %#U", pc0, sym))
		}
	}
	return nil
}