// Copyright 2023 The Regexp Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package regexp // modernc.org/regexp import ( "fmt" "sort" "strconv" "strings" "unicode" "unicode/utf8" "modernc.org/fsm" "modernc.org/regexp/syntax" ) const ( smallClass = 64 unicodePrivateFirst = 0xe000 unicodePrivateLast = 0xf8ff ) type nfaLongest struct { *fsm.NFA charClasses register[string] err error clsIndex []*runeSlice sink *fsm.State start *fsm.State capturesAsGroups bool consumes bool hasOpEndText bool } func compileNFALongest(sre *syntax.Regexp, capturesAsGroups, postProcess bool) (r *nfaLongest, err error) { defer func() { if r != nil { if err == nil { err = r.err } return } if err == nil { err = fmt.Errorf("compile NFA longest failed") } }() r = &nfaLongest{ NFA: fsm.NewLimitedNFA(maxFSMStates), capturesAsGroups: capturesAsGroups, } if !r.canCompile(sre) { return nil, r.err } r.start = r.NewState() r.sink = r.NewState() if r.start == nil || r.sink == nil { return nil, r.err } out := r.add(r.start, sre) if out == nil { return nil, r.err } if r.hasOpEndText && !r.consumes { return nil, r.err } s := r.NewState() if s == nil { return nil, r.err } s.IsAccepting = true out.NewEdge(DFAOpAccept<>DFARuneBits, sym&DFARuneMask; op { case DFAOpRune: for _, next := range nexts { infos = append(infos, info{c, next}) } } } var charClasses []*runeSlice for _, sym := range syms { nexts := edges.Get(sym).List() switch op, c := sym>>DFARuneBits, sym&DFARuneMask; op { case DFAOpCharClass: charClass := n.clsIndex[c] for _, v := range charClasses { if !charClass.isDisjoint(*v) { return false } } charClasses = append(charClasses, charClass) for _, info := range infos { c := info.rune if charClass.has(rune(c)) { for _, next := range nexts { state.NewEdge(DFAOpRune<>DFARuneBits, syms[j]>>DFARuneBits soa := opSortOrder[a] sob := opSortOrder[b] if soa < sob { return true } if soa > sob { return false } return syms[i]&DFARuneMask < syms[j]&DFARuneMask }) return syms } type runeInfo struct { rune nexts []*fsm.State } type charClassInfo struct { id int sym int s *runeSlice nexts []*fsm.State } //lint:ignore U1000 debug helper func nfaString(n *fsm.NFA, clsIndex []*runeSlice) string { a := strings.Split(n.String(), "\n") w := 0 for _, v := range a { v0 := v var c uint64 if x := strings.Index(v, "(U+"); x >= 0 { s := v[x+3:] s = s[:strings.Index(s, ")")] c, _ = strconv.ParseUint(s, 16, 32) c = c & DFARuneMask } switch { case strings.Contains(v0, fmt.Sprintf("(U+%X", 2*dfaOpState)): // skip case strings.Contains(v0, fmt.Sprintf("(U+%X", 2*DFAOpAccept)): a[w] = fmt.Sprintf("%s (accept)", v0) w++ case strings.Contains(v0, fmt.Sprintf("(U+%X", 2*DFAOpCharClass)): a[w] = fmt.Sprintf("%s (class[%d]=%s)", v0, c, clsIndex[c]) w++ case strings.Contains(v0, fmt.Sprintf("(U+%X", 2*DFAOpEndText)): a[w] = fmt.Sprintf("%s (EOT)", v0) w++ case strings.Contains(v0, fmt.Sprintf("(U+%X", 2*DFAOpBeginText)): a[w] = fmt.Sprintf("%s (begin text)", v0) w++ default: a[w] = v0 w++ } } a = a[:w] for i, v := range clsIndex { a = append(a, fmt.Sprintf("char class %d: %v", i, v)) } a = append(a, "") return strings.Join(a, "\n") } func optimizeDFA(n *fsm.NFA, clsReg *register[string], clsIndex *[]*runeSlice) (r *fsm.NFA) { const classify = 7 for _, state := range n.List() { edges := state.Transitions() syms := edges.List() m := map[*fsm.State][]int{} // state: sym for _, sym := range syms { if sym == fsm.Epsilon { panic("internal error, not a DFA") } switch op, c := sym>>DFARuneBits, sym&DFARuneMask; op { case DFAOpRune: targets := edges.Get(c).List() if len(targets) != 1 { return nil } target := targets[0] m[target] = append(m[target], sym) } } for next, syms := range m { if len(syms) < classify { continue } var rs runeSlice for _, r := range syms { c := rune(r & DFARuneMask) rs = append(rs, c, c) } rs = rs.normalize() id := registerCharClass(clsReg, clsIndex, &rs) for _, sym := range syms { edges.Delete(sym) } state.NewEdge(DFAOpCharClass<>DFARuneBits, sym&DFARuneMask; op { case DFAOpRune: if c >= unicodePrivateFirst && c <= unicodePrivateLast { break } runeInfos = append(runeInfos, runeInfo{rune(c), edges.Get(c).List()}) } } var charClassInfos []charClassInfo for _, sym := range syms { nexts := edges.Get(sym).List() switch op, sID := sym>>DFARuneBits, sym&DFARuneMask; op { case DFAOpCharClass: cls := (*clsIndex)[sID] if cls.cardinality() <= smallClass { nexts := edges.Get(sym).List() state.Transitions().Delete(sym) runes := cls.expand() for _, next := range nexts { for _, v := range runes { state.NewEdge(DFAOpRune<>DFARuneBits, sym&DFARuneMask; op { case DFAOpRune: if c >= unicodePrivateFirst && c <= unicodePrivateLast { edges.Delete(sym) state.NewEdge(DFAOpAccept<>DFARuneBits, sym&DFARuneMask; op { case dfaOpState: dfaPC2nfaStates[len(d.prog)] = append(dfaPC2nfaStates[len(d.prog)], c) } } instrCnt := 0 hasOpBeginText := false var charClasses []*runeSlice accepted := false for _, sym := range syms { nexts := edges.Get(sym).List() if len(nexts) > 1 { return } next := nexts[0].Id() switch op, c := sym>>DFARuneBits, sym&DFARuneMask; op { case DFAOpRune: d.prog = append(d.prog, uint32(DFAOpRune<>DFARuneBits, sym&DFARuneMask; op { case DFAOpRune: for _, next := range nexts { infos = append(infos, info{c, next}) } case DFAOpCharClass: if hasCharClass { return nil, nil } hasCharClass = true charClass := d.id2charClass[c] for _, info := range infos { c := info.rune if charClass.has(rune(c)) { rebuild = true for _, next := range nexts { state.NewEdge(DFAOpRune<>DFARuneBits, sym&DFARuneMask; op { case dfaOpState: dfaPC2nfaStates[len(d.prog)] = append(dfaPC2nfaStates[len(d.prog)], c) case DFAOpAccept: isAccepting = true } } if isAccepting { d.prog = append(d.prog, uint32(DFAOpAccept< 1 { return } next := nexts[0].Id() switch op, c := sym>>DFARuneBits, sym&DFARuneMask; op { case DFAOpRune: d.prog = append(d.prog, uint32(DFAOpRune<>DFARuneBits, sym&DFARuneMask; op { case DFAOpRune: next := d.prog[pc] pc++ if r == rune(c) { pc = int(next) continue state } case DFAOpBeginText: if pc0 != d.startPC { return false } pc = int(d.prog[pc]) continue instr default: return false } } } d.prefixPC = pc return true } func (n *nfaLongest) String() string { a := strings.Split(n.NFA.String(), "\n") w := 0 for _, v := range a { v0 := v var c uint64 if x := strings.Index(v, "(U+"); x >= 0 { s := v[x+3:] s = s[:strings.Index(s, ")")] c, _ = strconv.ParseUint(s, 16, 32) c = c & DFARuneMask } switch { case strings.Contains(v0, fmt.Sprintf("(U+%X", 2*dfaOpState)): // skip case strings.Contains(v0, fmt.Sprintf("(U+%X", 2*DFAOpAccept)): a[w] = fmt.Sprintf("%s (accept)", v0) w++ case strings.Contains(v0, fmt.Sprintf("(U+%X", 2*DFAOpCharClass)): a[w] = fmt.Sprintf("%s (%s)", v0, n.clsIndex[c]) w++ case strings.Contains(v0, fmt.Sprintf("(U+%X", 2*DFAOpEndText)): a[w] = fmt.Sprintf("%s (EOT)", v0) w++ case strings.Contains(v0, fmt.Sprintf("(U+%X", 2*DFAOpBeginText)): a[w] = fmt.Sprintf("%s (begin text)", v0) w++ default: a[w] = v0 w++ } } a = a[:w] return strings.Join(a, "\n") } func (d *DFA) String() string { var b strings.Builder fmt.Fprintf(&b, "NFA\n%s\n", d.nfa) prog := d.prog fmt.Fprintf(&b, "DFA prog: %4d, start %05d", len(prog), d.startPC) if d.prefixPC > 0 { fmt.Fprintf(&b, ", prefix start %05d", d.prefixPC) } fmt.Fprintln(&b) state := 0 newState := true for pc := 0; pc < len(prog); { if newState { fmt.Fprintf(&b, "[%d] nfa states %v\n", state, d.dfaPC2nfaStates[pc]) state++ newState = false } sym := prog[pc] fmt.Fprintf(&b, "\t%05d:\t%#U\t", pc, sym) pc++ switch op, c := sym>>DFARuneBits, sym&DFARuneMask; op { case DFAOpRune: next := prog[pc] pc++ fmt.Fprintf(&b, "%s -> %05d\n", strconv.QuoteRuneToASCII(rune(c)), next) case DFAOpEndText: next := prog[pc] pc++ fmt.Fprintf(&b, "EOT -> %05d\n", next) case DFAOpCharClass: next := prog[pc] pc++ cls := d.id2charClass[c] fmt.Fprintf(&b, "%s -> %05d\n", cls, next) case DFAOpAccept: fmt.Fprintf(&b, "accept\n") case DFAOpStop: fmt.Fprintf(&b, "stop\n") newState = true case DFAOpBeginText: next := prog[pc] pc++ fmt.Fprintf(&b, "begin text -> %05d\n", next) default: fmt.Fprintf(&b, "???\n") } } return b.String() } // OnePassProg represents a one pass program. // // "One-pass" regexp execution. // // Some regexps can be analyzed to determine that they never need backtracking: // they are guaranteed to run in one pass over the string without bothering to // save all the usual NFA state. type OnePassProg struct { Prog *syntax.Prog numSubexp int prefix string subexpNames []string prefixComplete bool // prefix is the entire regexp } func (p *OnePassProg) String() string { return p.Prog.String() } // NumSubexp returns the number of parenthesized subexpressions in this // OnePassProg. func (p *OnePassProg) NumSubexp() int { return p.numSubexp } // SubexpNames returns the names of the parenthesized subexpressions in this // OnePassProg. The name for the first sub-expression is names[1], so that if m // is a match slice, the name for m[i] is SubexpNames()[i]. Since the // OnePassProg as a whole cannot be named, names[0] is always the empty string. func (p *OnePassProg) SubexpNames() []string { return append([]string(nil), p.subexpNames...) } // LiteralPrefix returns a literal string that must begin any match // of the regular expression p. It returns the boolean true if the // literal string comprises the entire regular expression. func (p *OnePassProg) LiteralPrefix() (prefix string, complete bool) { return p.prefix, p.prefixComplete } // OnePassProg returns a new OnePassProg or an error, if any. func NewOnePassProg(expr string, mode syntax.Flags) (*OnePassProg, error) { re, err := syntax.Parse(expr, mode) if err != nil { return nil, err } maxCap := re.MaxCap() capNames := re.CapNames() re = re.Simplify() prog, err := syntax.Compile(re) if err != nil { return nil, err } onepass := compileOnePass(prog) if onepass == nil { return nil, fmt.Errorf("cannot handle `%s`", expr) } prefix, prefixComplete, _ := onePassPrefix(prog) return &OnePassProg{ Prog: prog, numSubexp: maxCap, subexpNames: capNames, prefix: prefix, prefixComplete: prefixComplete, }, nil }