rec.go 27 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254
  1. // Copyright 2023 The Regexp Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package regexp // modernc.org/regexp
  5. import (
  6. "fmt"
  7. "sort"
  8. "strconv"
  9. "strings"
  10. "unicode"
  11. "unicode/utf8"
  12. "modernc.org/fsm"
  13. "modernc.org/regexp/syntax"
  14. )
  15. const (
  16. smallClass = 64
  17. unicodePrivateFirst = 0xe000
  18. unicodePrivateLast = 0xf8ff
  19. )
  20. type nfaLongest struct {
  21. *fsm.NFA
  22. charClasses register[string]
  23. err error
  24. clsIndex []*runeSlice
  25. sink *fsm.State
  26. start *fsm.State
  27. capturesAsGroups bool
  28. consumes bool
  29. hasOpEndText bool
  30. }
  31. func compileNFALongest(sre *syntax.Regexp, capturesAsGroups, postProcess bool) (r *nfaLongest, err error) {
  32. defer func() {
  33. if r != nil {
  34. if err == nil {
  35. err = r.err
  36. }
  37. return
  38. }
  39. if err == nil {
  40. err = fmt.Errorf("compile NFA longest failed")
  41. }
  42. }()
  43. r = &nfaLongest{
  44. NFA: fsm.NewLimitedNFA(maxFSMStates),
  45. capturesAsGroups: capturesAsGroups,
  46. }
  47. if !r.canCompile(sre) {
  48. return nil, r.err
  49. }
  50. r.start = r.NewState()
  51. r.sink = r.NewState()
  52. if r.start == nil || r.sink == nil {
  53. return nil, r.err
  54. }
  55. out := r.add(r.start, sre)
  56. if out == nil {
  57. return nil, r.err
  58. }
  59. if r.hasOpEndText && !r.consumes {
  60. return nil, r.err
  61. }
  62. s := r.NewState()
  63. if s == nil {
  64. return nil, r.err
  65. }
  66. s.IsAccepting = true
  67. out.NewEdge(DFAOpAccept<<DFARuneBits+out.Id(), s)
  68. r.sink.IsAccepting = true
  69. for _, state := range r.List() {
  70. state.NewEdge(dfaOpState<<DFARuneBits+state.Id(), r.sink)
  71. }
  72. if postProcess && !r.post() {
  73. return nil, r.err
  74. }
  75. return r, nil
  76. }
  77. func (n *nfaLongest) post() bool {
  78. for _, state := range n.List() {
  79. edges := state.Transitions()
  80. syms := edges.List()
  81. type info struct {
  82. rune int
  83. next *fsm.State
  84. }
  85. var infos []info
  86. for _, sym := range syms {
  87. nexts := edges.Get(sym).List()
  88. switch op, c := sym>>DFARuneBits, sym&DFARuneMask; op {
  89. case DFAOpRune:
  90. for _, next := range nexts {
  91. infos = append(infos, info{c, next})
  92. }
  93. }
  94. }
  95. var charClasses []*runeSlice
  96. for _, sym := range syms {
  97. nexts := edges.Get(sym).List()
  98. switch op, c := sym>>DFARuneBits, sym&DFARuneMask; op {
  99. case DFAOpCharClass:
  100. charClass := n.clsIndex[c]
  101. for _, v := range charClasses {
  102. if !charClass.isDisjoint(*v) {
  103. return false
  104. }
  105. }
  106. charClasses = append(charClasses, charClass)
  107. for _, info := range infos {
  108. c := info.rune
  109. if charClass.has(rune(c)) {
  110. for _, next := range nexts {
  111. state.NewEdge(DFAOpRune<<DFARuneBits+c, next)
  112. }
  113. }
  114. }
  115. }
  116. }
  117. }
  118. return true
  119. }
  120. func sortedSyms(edges fsm.Transitions) (syms []int) {
  121. syms = edges.List()
  122. sort.Slice(syms, func(i, j int) bool {
  123. a, b := syms[i]>>DFARuneBits, syms[j]>>DFARuneBits
  124. soa := opSortOrder[a]
  125. sob := opSortOrder[b]
  126. if soa < sob {
  127. return true
  128. }
  129. if soa > sob {
  130. return false
  131. }
  132. return syms[i]&DFARuneMask < syms[j]&DFARuneMask
  133. })
  134. return syms
  135. }
  136. type runeInfo struct {
  137. rune
  138. nexts []*fsm.State
  139. }
  140. type charClassInfo struct {
  141. id int
  142. sym int
  143. s *runeSlice
  144. nexts []*fsm.State
  145. }
  146. //lint:ignore U1000 debug helper
  147. func nfaString(n *fsm.NFA, clsIndex []*runeSlice) string {
  148. a := strings.Split(n.String(), "\n")
  149. w := 0
  150. for _, v := range a {
  151. v0 := v
  152. var c uint64
  153. if x := strings.Index(v, "(U+"); x >= 0 {
  154. s := v[x+3:]
  155. s = s[:strings.Index(s, ")")]
  156. c, _ = strconv.ParseUint(s, 16, 32)
  157. c = c & DFARuneMask
  158. }
  159. switch {
  160. case strings.Contains(v0, fmt.Sprintf("(U+%X", 2*dfaOpState)):
  161. // skip
  162. case strings.Contains(v0, fmt.Sprintf("(U+%X", 2*DFAOpAccept)):
  163. a[w] = fmt.Sprintf("%s (accept)", v0)
  164. w++
  165. case strings.Contains(v0, fmt.Sprintf("(U+%X", 2*DFAOpCharClass)):
  166. a[w] = fmt.Sprintf("%s (class[%d]=%s)", v0, c, clsIndex[c])
  167. w++
  168. case strings.Contains(v0, fmt.Sprintf("(U+%X", 2*DFAOpEndText)):
  169. a[w] = fmt.Sprintf("%s (EOT)", v0)
  170. w++
  171. case strings.Contains(v0, fmt.Sprintf("(U+%X", 2*DFAOpBeginText)):
  172. a[w] = fmt.Sprintf("%s (begin text)", v0)
  173. w++
  174. default:
  175. a[w] = v0
  176. w++
  177. }
  178. }
  179. a = a[:w]
  180. for i, v := range clsIndex {
  181. a = append(a, fmt.Sprintf("char class %d: %v", i, v))
  182. }
  183. a = append(a, "")
  184. return strings.Join(a, "\n")
  185. }
  186. func optimizeDFA(n *fsm.NFA, clsReg *register[string], clsIndex *[]*runeSlice) (r *fsm.NFA) {
  187. const classify = 7
  188. for _, state := range n.List() {
  189. edges := state.Transitions()
  190. syms := edges.List()
  191. m := map[*fsm.State][]int{} // state: sym
  192. for _, sym := range syms {
  193. if sym == fsm.Epsilon {
  194. panic("internal error, not a DFA")
  195. }
  196. switch op, c := sym>>DFARuneBits, sym&DFARuneMask; op {
  197. case DFAOpRune:
  198. targets := edges.Get(c).List()
  199. if len(targets) != 1 {
  200. return nil
  201. }
  202. target := targets[0]
  203. m[target] = append(m[target], sym)
  204. }
  205. }
  206. for next, syms := range m {
  207. if len(syms) < classify {
  208. continue
  209. }
  210. var rs runeSlice
  211. for _, r := range syms {
  212. c := rune(r & DFARuneMask)
  213. rs = append(rs, c, c)
  214. }
  215. rs = rs.normalize()
  216. id := registerCharClass(clsReg, clsIndex, &rs)
  217. for _, sym := range syms {
  218. edges.Delete(sym)
  219. }
  220. state.NewEdge(DFAOpCharClass<<DFARuneBits+id, next)
  221. }
  222. }
  223. return n
  224. }
  225. func resolveCharClasses2(n *fsm.NFA, clsReg *register[string], clsIndex *[]*runeSlice) (r *fsm.NFA, modified bool) {
  226. for _, state := range n.List() {
  227. restart:
  228. edges := state.Transitions()
  229. syms := edges.List()
  230. var runeInfos []runeInfo
  231. for _, sym := range syms {
  232. switch op, c := sym>>DFARuneBits, sym&DFARuneMask; op {
  233. case DFAOpRune:
  234. if c >= unicodePrivateFirst && c <= unicodePrivateLast {
  235. break
  236. }
  237. runeInfos = append(runeInfos, runeInfo{rune(c), edges.Get(c).List()})
  238. }
  239. }
  240. var charClassInfos []charClassInfo
  241. for _, sym := range syms {
  242. nexts := edges.Get(sym).List()
  243. switch op, sID := sym>>DFARuneBits, sym&DFARuneMask; op {
  244. case DFAOpCharClass:
  245. cls := (*clsIndex)[sID]
  246. if cls.cardinality() <= smallClass {
  247. nexts := edges.Get(sym).List()
  248. state.Transitions().Delete(sym)
  249. runes := cls.expand()
  250. for _, next := range nexts {
  251. for _, v := range runes {
  252. state.NewEdge(DFAOpRune<<DFARuneBits+int(v), next)
  253. }
  254. }
  255. modified = true
  256. goto restart
  257. }
  258. si := charClassInfo{sID, sym, cls, nexts}
  259. if len(charClassInfos) == 0 {
  260. charClassInfos = append(charClassInfos, si)
  261. break
  262. }
  263. // resolve any splits
  264. for _, ti := range charClassInfos {
  265. s0 := *si.s
  266. t0 := *ti.s
  267. switch s, b, t := s0.split(t0); {
  268. case len(b) == 0: // s0 and t0 are disjoint
  269. charClassInfos = append(charClassInfos, si)
  270. default:
  271. // s0 ['a', 'b'] ['a'] ['a', 'b']
  272. // t0 ['b', 'c'] ['a', 'b'] ['b']
  273. //
  274. // s ['a'] [] ['a']
  275. // b ['b'] ['a'] ['b']
  276. // t ['c'] ['b'] []
  277. bID := registerCharClass(clsReg, clsIndex, &b)
  278. sNexts := edges.Get(si.sym).List()
  279. tNexts := edges.Get(ti.sym).List()
  280. state.Transitions().Delete(si.sym)
  281. state.Transitions().Delete(ti.sym)
  282. if len(s) != 0 {
  283. sID := registerCharClass(clsReg, clsIndex, &s)
  284. for _, next := range sNexts {
  285. state.NewEdge(DFAOpCharClass<<DFARuneBits+sID, next)
  286. }
  287. }
  288. for _, next := range sNexts {
  289. state.NewEdge(DFAOpCharClass<<DFARuneBits+bID, next)
  290. }
  291. for _, next := range tNexts {
  292. state.NewEdge(DFAOpCharClass<<DFARuneBits+bID, next)
  293. }
  294. if len(t) != 0 {
  295. tID := registerCharClass(clsReg, clsIndex, &t)
  296. for _, next := range tNexts {
  297. state.NewEdge(DFAOpCharClass<<DFARuneBits+tID, next)
  298. }
  299. }
  300. // trc("---- state %d", state.Id())
  301. // trc("s0=%s", s0)
  302. // trc("t0=%s", t0)
  303. // trc("s=%s", s)
  304. // trc("b=%s", b)
  305. // trc("t=%s", t)
  306. modified = true
  307. goto restart
  308. }
  309. }
  310. }
  311. }
  312. for _, runeInfo := range runeInfos {
  313. for _, charClassInfo := range charClassInfos {
  314. cls := *charClassInfo.s
  315. r := runeInfo.rune
  316. if cls.has(r) {
  317. sym := DFAOpRune<<DFARuneBits + int(r)
  318. for _, next := range charClassInfo.nexts {
  319. closure := edges.Get(sym)
  320. if !closure.Has(next) {
  321. modified = true
  322. state.NewEdge(sym, next)
  323. }
  324. }
  325. }
  326. }
  327. }
  328. }
  329. return n, modified
  330. }
  331. func resolveCharClasses(n *fsm.NFA, clsReg *register[string], clsIndex *[]*runeSlice) (r *fsm.NFA) {
  332. for i := 0; i < 100; i++ {
  333. s := n.String()
  334. m, modified := resolveCharClasses2(n, clsReg, clsIndex)
  335. if !modified {
  336. return n
  337. }
  338. n2 := m.MinimalDFA(false)
  339. if n2 == nil {
  340. return nil
  341. }
  342. if s == n2.String() {
  343. return n2
  344. }
  345. n = n2
  346. }
  347. return nil
  348. }
  349. func (n *nfaLongest) canCompile(sre *syntax.Regexp) bool {
  350. switch sre.Op {
  351. case syntax.OpNoMatch:
  352. return false
  353. case syntax.OpEmptyMatch:
  354. return true
  355. case syntax.OpLiteral:
  356. n.consumes = true
  357. return true
  358. case syntax.OpCharClass:
  359. if len(sre.Rune) != 0 {
  360. n.consumes = true
  361. }
  362. return true
  363. case syntax.OpAnyCharNotNL:
  364. n.consumes = true
  365. return true
  366. case syntax.OpAnyChar:
  367. n.consumes = true
  368. return true
  369. case syntax.OpBeginLine:
  370. return false
  371. case syntax.OpEndLine:
  372. return false
  373. case syntax.OpBeginText:
  374. return true
  375. case syntax.OpEndText:
  376. n.hasOpEndText = true
  377. return true
  378. case syntax.OpWordBoundary:
  379. return false
  380. case syntax.OpNoWordBoundary:
  381. return false
  382. case syntax.OpCapture:
  383. if !n.capturesAsGroups {
  384. return false
  385. }
  386. for _, v := range sre.Sub {
  387. if !n.canCompile(v) {
  388. return false
  389. }
  390. }
  391. return true
  392. case syntax.OpStar:
  393. return n.canCompile(sre.Sub[0])
  394. case syntax.OpPlus:
  395. return n.canCompile(sre.Sub[0])
  396. case syntax.OpQuest:
  397. return n.canCompile(sre.Sub[0])
  398. case syntax.OpRepeat:
  399. return false
  400. case syntax.OpConcat:
  401. for _, v := range sre.Sub {
  402. if !n.canCompile(v) {
  403. return false
  404. }
  405. }
  406. return true
  407. case syntax.OpAlternate:
  408. for _, v := range sre.Sub {
  409. if !n.canCompile(v) {
  410. return false
  411. }
  412. }
  413. return true
  414. default:
  415. return false
  416. }
  417. }
  418. func (n *nfaLongest) add(in *fsm.State, sre *syntax.Regexp) (out *fsm.State) {
  419. switch sre.Op {
  420. case syntax.OpLiteral:
  421. for _, v := range sre.Rune {
  422. if out = n.NewState(); out == nil {
  423. return nil
  424. }
  425. in.NewEdge(DFAOpRune<<DFARuneBits+int(v), out)
  426. if sre.Flags&syntax.FoldCase != 0 {
  427. if v2 := unicode.SimpleFold(v); v2 != v {
  428. in.NewEdge(DFAOpRune<<DFARuneBits+int(v2), out)
  429. }
  430. }
  431. in = out
  432. }
  433. return out
  434. case syntax.OpConcat:
  435. for _, v := range sre.Sub {
  436. if out = n.add(in, v); out == nil {
  437. return nil
  438. }
  439. in = out
  440. }
  441. return out
  442. case syntax.OpCapture:
  443. if !n.capturesAsGroups {
  444. return nil
  445. }
  446. for _, v := range sre.Sub {
  447. if out = n.add(in, v); out == nil {
  448. return nil
  449. }
  450. in = out
  451. }
  452. return out
  453. case syntax.OpAlternate:
  454. for _, v := range sre.Sub {
  455. o := n.add(in, v)
  456. if o == nil {
  457. return nil
  458. }
  459. switch {
  460. case out == nil:
  461. out = o
  462. default:
  463. o.NewEdge(fsm.Epsilon, out)
  464. }
  465. }
  466. return out
  467. case syntax.OpQuest:
  468. if out = n.add(in, sre.Sub[0]); out == nil {
  469. return nil
  470. }
  471. in.NewEdge(fsm.Epsilon, out)
  472. return out
  473. case syntax.OpPlus:
  474. if n.hasEOT(sre.Sub[0]) {
  475. return nil
  476. }
  477. s := n.NewState()
  478. if s == nil {
  479. return nil
  480. }
  481. in.NewEdge(fsm.Epsilon, s)
  482. if out = n.add(s, sre.Sub[0]); out == nil {
  483. return nil
  484. }
  485. out.NewEdge(fsm.Epsilon, s)
  486. if s = n.NewState(); s == nil {
  487. return nil
  488. }
  489. out.NewEdge(fsm.Epsilon, s)
  490. return s
  491. case syntax.OpStar:
  492. if n.hasEOT(sre.Sub[0]) {
  493. return nil
  494. }
  495. s := n.NewState()
  496. if s == nil {
  497. return nil
  498. }
  499. in.NewEdge(fsm.Epsilon, s)
  500. if out = n.add(s, sre.Sub[0]); out == nil {
  501. return nil
  502. }
  503. out.NewEdge(fsm.Epsilon, s)
  504. s.NewEdge(fsm.Epsilon, out)
  505. if s = n.NewState(); s == nil {
  506. return nil
  507. }
  508. out.NewEdge(fsm.Epsilon, s)
  509. return s
  510. case syntax.OpBeginText:
  511. if out = n.NewState(); out == nil {
  512. return nil
  513. }
  514. in.NewEdge(DFAOpBeginText<<DFARuneBits, out)
  515. return out
  516. case syntax.OpCharClass:
  517. return n.charClass(in, (*runeSlice)(&sre.Rune))
  518. case syntax.OpAnyCharNotNL:
  519. return n.charClass(in, (*runeSlice)(&anyRuneNotNL))
  520. case syntax.OpAnyChar:
  521. return n.charClass(in, (*runeSlice)(&anyRune))
  522. case syntax.OpEndText:
  523. if out = n.NewState(); out == nil {
  524. return nil
  525. }
  526. in.NewEdge(DFAOpEndText<<DFARuneBits, out)
  527. return out
  528. case syntax.OpEmptyMatch:
  529. if out = n.NewState(); out == nil {
  530. return nil
  531. }
  532. in.NewEdge(fsm.Epsilon, out)
  533. return out
  534. default:
  535. return nil
  536. }
  537. panic("unreachable")
  538. }
  539. func (n *nfaLongest) hasEOT(sre *syntax.Regexp) bool {
  540. switch sre.Op {
  541. case syntax.OpEndText:
  542. return true
  543. case syntax.OpConcat:
  544. for _, v := range sre.Sub {
  545. if n.hasEOT(v) {
  546. return true
  547. }
  548. }
  549. return false
  550. case syntax.OpCapture:
  551. for _, v := range sre.Sub {
  552. if n.hasEOT(v) {
  553. return true
  554. }
  555. }
  556. return false
  557. case syntax.OpAlternate:
  558. for _, v := range sre.Sub {
  559. if n.hasEOT(v) {
  560. return true
  561. }
  562. }
  563. return false
  564. case syntax.OpStar, syntax.OpPlus, syntax.OpQuest:
  565. return n.hasEOT(sre.Sub[0])
  566. default:
  567. return false
  568. }
  569. }
  570. func (n *nfaLongest) charClass(in *fsm.State, p *runeSlice) (out *fsm.State) {
  571. if out = n.NewState(); out == nil {
  572. return nil
  573. }
  574. id := registerCharClass(&n.charClasses, &n.clsIndex, p)
  575. in.NewEdge(DFAOpCharClass<<DFARuneBits+id, out)
  576. return out
  577. }
  578. func registerCharClass(reg *register[string], index *[]*runeSlice, charClass *runeSlice) (id int) {
  579. if id = reg.id(charClass.hashString()); id == len(*index) {
  580. *index = append(*index, charClass)
  581. }
  582. return id
  583. }
  584. // DFA represents a regexp compiled to a DFA. Only a subset of regexps is
  585. // supported.
  586. type DFA struct {
  587. cond syntax.EmptyOp
  588. dfaPC2nfaStates map[int][]int
  589. err error
  590. id2charClass []*runeSlice
  591. minInputLen int
  592. nfa *nfaLongest
  593. prefix string
  594. prefixRune rune
  595. prog []uint32
  596. startPC int
  597. prefixPC int
  598. }
  599. // CharClasses returns the number of distinc character classes used by d.
  600. func (d *DFA) CharClasses() int { return len(d.id2charClass) }
  601. // CharClass returns a character class for id which must come from the
  602. // DFAOpCharClass instruction argument of d.Prog. The returned slice consist of
  603. // pairs describing ranges of members of the set.
  604. func (d *DFA) CharClass(id int) (r []rune) {
  605. return append(r, []rune(*d.id2charClass[id])...)
  606. }
  607. // MinInputLen reports the minimum length of the input in bytes for a match.
  608. func (d *DFA) MinInputLen() int { return d.minInputLen }
  609. // Prefix reports the required prefix in unanchored matches, if any.
  610. func (d *DFA) Prefix() string { return d.prefix }
  611. // PrefixRune reports the first rune in prefix.
  612. func (d *DFA) PrefixRune() rune { return d.prefixRune }
  613. // Cond reports the empty-width conditions required at start of match.
  614. func (d *DFA) Cond() syntax.EmptyOp { return d.cond }
  615. // Prog returns the DFA program.
  616. func (d *DFA) Prog() []uint32 { return d.prog }
  617. // StartPC reports the PC to start the DFA.
  618. func (d *DFA) StartPC() int { return d.startPC }
  619. // PrefixPC reports the PC to start the DFA after the prefix is consumed.
  620. func (d *DFA) PrefixPC() int { return d.prefixPC }
  621. // NewLexDFA returns a DFA suitable for a lexer or an error, if any.
  622. func NewLexDFA(expr []string) (r *DFA, err error) {
  623. var b strings.Builder
  624. for i, v := range expr {
  625. if i != 0 {
  626. b.WriteByte('|')
  627. }
  628. fmt.Fprintf(&b, "(%s)\\x{%04x}", v, unicodePrivateFirst+i)
  629. }
  630. sre, err := syntax.Parse(b.String(), syntax.Perl|syntax.DisableOptimizations)
  631. if err != nil {
  632. return nil, err
  633. }
  634. // sre = sre.Simplify()
  635. prog, err := syntax.Compile(sre)
  636. if err != nil {
  637. return nil, err
  638. }
  639. nfa, err := compileNFALongest(sre, true, false)
  640. if err != nil {
  641. return nil, err
  642. }
  643. if nfa.NFA, _ = resolveCharClasses2(nfa.NFA, &nfa.charClasses, &nfa.clsIndex); nfa.NFA == nil {
  644. return nil, fmt.Errorf("compile NFA longest failed")
  645. }
  646. defer func() {
  647. if r != nil {
  648. if err == nil {
  649. err = r.err
  650. }
  651. return
  652. }
  653. if err == nil {
  654. err = fmt.Errorf("compile DFA longest failed")
  655. }
  656. }()
  657. dfa := nfa.MinimalDFA(false)
  658. if dfa == nil {
  659. return nil, nil
  660. }
  661. if dfa = resolveCharClasses(dfa, &nfa.charClasses, &nfa.clsIndex); dfa == nil {
  662. return nil, nil
  663. }
  664. prefix, _ := prog.Prefix()
  665. d := &DFA{
  666. cond: prog.StartCond(),
  667. id2charClass: nfa.clsIndex,
  668. minInputLen: minInputLen(sre),
  669. nfa: nfa,
  670. prefix: prefix,
  671. prefixPC: -1,
  672. }
  673. if prefix != "" {
  674. d.prefixRune, _ = utf8.DecodeRuneInString(prefix)
  675. }
  676. for _, state := range dfa.List() {
  677. edges := state.Transitions()
  678. syms := sortedSyms(edges)
  679. for _, sym := range syms {
  680. switch op, c := sym>>DFARuneBits, sym&DFARuneMask; op {
  681. case DFAOpRune:
  682. if c >= unicodePrivateFirst && c <= unicodePrivateLast {
  683. edges.Delete(sym)
  684. state.NewEdge(DFAOpAccept<<DFARuneBits|(c-unicodePrivateFirst), state)
  685. }
  686. }
  687. }
  688. }
  689. if dfa = dfa.MinimalDFA(false); dfa == nil {
  690. return nil, nil
  691. }
  692. if dfa = optimizeDFA(dfa, &nfa.charClasses, &nfa.clsIndex); dfa == nil {
  693. return nil, nil
  694. }
  695. d.id2charClass = nfa.clsIndex
  696. state2pc := []int{}
  697. states := dfa.List()
  698. linker := make([][]int, len(states))
  699. dfaPC2nfaStates := map[int][]int{}
  700. d.dfaPC2nfaStates = dfaPC2nfaStates
  701. for _, state := range states {
  702. state2pc = append(state2pc, len(d.prog))
  703. if state.IsAccepting {
  704. continue
  705. }
  706. edges := state.Transitions()
  707. syms := sortedSyms(edges)
  708. for _, sym := range syms {
  709. switch op, c := sym>>DFARuneBits, sym&DFARuneMask; op {
  710. case dfaOpState:
  711. dfaPC2nfaStates[len(d.prog)] = append(dfaPC2nfaStates[len(d.prog)], c)
  712. }
  713. }
  714. instrCnt := 0
  715. hasOpBeginText := false
  716. var charClasses []*runeSlice
  717. accepted := false
  718. for _, sym := range syms {
  719. nexts := edges.Get(sym).List()
  720. if len(nexts) > 1 {
  721. return
  722. }
  723. next := nexts[0].Id()
  724. switch op, c := sym>>DFARuneBits, sym&DFARuneMask; op {
  725. case DFAOpRune:
  726. d.prog = append(d.prog, uint32(DFAOpRune<<DFARuneBits+c), 0)
  727. linker[next] = append(linker[next], len(d.prog)-1)
  728. instrCnt++
  729. case DFAOpEndText:
  730. d.prog = append(d.prog, uint32(DFAOpEndText<<DFARuneBits), 0)
  731. linker[next] = append(linker[next], len(d.prog)-1)
  732. instrCnt++
  733. case DFAOpCharClass:
  734. charClass := d.id2charClass[c]
  735. for _, v := range charClasses {
  736. if !charClass.isDisjoint(*v) {
  737. return
  738. }
  739. }
  740. charClasses = append(charClasses, charClass)
  741. d.prog = append(d.prog, uint32(DFAOpCharClass<<DFARuneBits+c), 0)
  742. linker[next] = append(linker[next], len(d.prog)-1)
  743. instrCnt++
  744. case DFAOpBeginText:
  745. if state != dfa.Start() {
  746. return
  747. }
  748. d.prog = append(d.prog, uint32(DFAOpBeginText<<DFARuneBits), 0)
  749. linker[next] = append(linker[next], len(d.prog)-1)
  750. instrCnt++
  751. hasOpBeginText = true
  752. case DFAOpAccept:
  753. if !accepted {
  754. d.prog = append(d.prog, uint32(sym))
  755. accepted = true
  756. }
  757. case dfaOpState:
  758. // nop
  759. default:
  760. return
  761. }
  762. }
  763. if hasOpBeginText && instrCnt != 1 {
  764. return
  765. }
  766. d.prog = append(d.prog, uint32(DFAOpStop<<DFARuneBits))
  767. }
  768. for _, v := range dfaPC2nfaStates {
  769. sort.Ints(v)
  770. }
  771. for state, links := range linker {
  772. pc := state2pc[state]
  773. for _, ref := range links {
  774. d.prog[ref] = uint32(pc)
  775. }
  776. }
  777. return d, nil
  778. }
  779. // NewMatchDFA returns a DFA suitable for the Match operation or an error, if
  780. // any.
  781. func NewMatchDFA(expr string) (r *DFA, err error) {
  782. sre, err := syntax.Parse(expr, syntax.Perl)
  783. if err != nil {
  784. return nil, err
  785. }
  786. sre = sre.Simplify()
  787. prog, err := syntax.Compile(sre)
  788. if err != nil {
  789. return nil, err
  790. }
  791. nfa, err := compileNFALongest(sre, true, true)
  792. if err != nil {
  793. return nil, err
  794. }
  795. defer func() {
  796. if r != nil {
  797. if err == nil {
  798. err = r.err
  799. }
  800. return
  801. }
  802. if err == nil {
  803. err = fmt.Errorf("compile DFA longest failed")
  804. }
  805. }()
  806. dfa := nfa.MinimalDFA(false)
  807. if dfa == nil {
  808. return nil, nil
  809. }
  810. prefix, _ := prog.Prefix()
  811. d := &DFA{
  812. cond: prog.StartCond(),
  813. id2charClass: nfa.clsIndex,
  814. minInputLen: minInputLen(sre),
  815. nfa: nfa,
  816. prefix: prefix,
  817. prefixPC: -1,
  818. }
  819. if prefix != "" {
  820. d.prefixRune, _ = utf8.DecodeRuneInString(prefix)
  821. }
  822. states := dfa.List()
  823. rebuild := false
  824. for _, state := range states {
  825. edges := state.Transitions()
  826. syms := sortedSyms(edges)
  827. type info struct {
  828. rune int
  829. next *fsm.State
  830. }
  831. var infos []info
  832. hasCharClass := false
  833. for _, sym := range syms {
  834. nexts := edges.Get(sym).List()
  835. switch op, c := sym>>DFARuneBits, sym&DFARuneMask; op {
  836. case DFAOpRune:
  837. for _, next := range nexts {
  838. infos = append(infos, info{c, next})
  839. }
  840. case DFAOpCharClass:
  841. if hasCharClass {
  842. return nil, nil
  843. }
  844. hasCharClass = true
  845. charClass := d.id2charClass[c]
  846. for _, info := range infos {
  847. c := info.rune
  848. if charClass.has(rune(c)) {
  849. rebuild = true
  850. for _, next := range nexts {
  851. state.NewEdge(DFAOpRune<<DFARuneBits+c, next)
  852. }
  853. }
  854. }
  855. }
  856. }
  857. }
  858. if rebuild {
  859. if dfa = dfa.MinimalDFA(false); dfa == nil {
  860. return nil, nil
  861. }
  862. }
  863. state2pc := []int{}
  864. states = dfa.List()
  865. linker := make([][]int, len(states))
  866. dfaPC2nfaStates := map[int][]int{}
  867. d.dfaPC2nfaStates = dfaPC2nfaStates
  868. for _, state := range states {
  869. state2pc = append(state2pc, len(d.prog))
  870. if state.IsAccepting {
  871. continue
  872. }
  873. edges := state.Transitions()
  874. syms := sortedSyms(edges)
  875. hasCharClass := false
  876. isAccepting := false
  877. for _, sym := range syms {
  878. switch op, c := sym>>DFARuneBits, sym&DFARuneMask; op {
  879. case dfaOpState:
  880. dfaPC2nfaStates[len(d.prog)] = append(dfaPC2nfaStates[len(d.prog)], c)
  881. case DFAOpAccept:
  882. isAccepting = true
  883. }
  884. }
  885. if isAccepting {
  886. d.prog = append(d.prog, uint32(DFAOpAccept<<DFARuneBits))
  887. }
  888. instrCnt := 0
  889. hasOpBeginText := false
  890. for _, sym := range syms {
  891. nexts := edges.Get(sym).List()
  892. if len(nexts) > 1 {
  893. return
  894. }
  895. next := nexts[0].Id()
  896. switch op, c := sym>>DFARuneBits, sym&DFARuneMask; op {
  897. case DFAOpRune:
  898. d.prog = append(d.prog, uint32(DFAOpRune<<DFARuneBits+c), 0)
  899. linker[next] = append(linker[next], len(d.prog)-1)
  900. instrCnt++
  901. case DFAOpEndText:
  902. d.prog = append(d.prog, uint32(DFAOpEndText<<DFARuneBits), 0)
  903. linker[next] = append(linker[next], len(d.prog)-1)
  904. instrCnt++
  905. case DFAOpCharClass:
  906. if hasCharClass {
  907. return
  908. }
  909. hasCharClass = true
  910. d.prog = append(d.prog, uint32(DFAOpCharClass<<DFARuneBits+c), 0)
  911. linker[next] = append(linker[next], len(d.prog)-1)
  912. instrCnt++
  913. case DFAOpBeginText:
  914. if state != dfa.Start() {
  915. return
  916. }
  917. d.prog = append(d.prog, uint32(DFAOpBeginText<<DFARuneBits), 0)
  918. linker[next] = append(linker[next], len(d.prog)-1)
  919. instrCnt++
  920. hasOpBeginText = true
  921. case DFAOpAccept, dfaOpState:
  922. // nop
  923. default:
  924. return
  925. }
  926. }
  927. if hasOpBeginText && instrCnt != 1 {
  928. return
  929. }
  930. d.prog = append(d.prog, uint32(DFAOpStop<<DFARuneBits))
  931. }
  932. for _, v := range dfaPC2nfaStates {
  933. sort.Ints(v)
  934. }
  935. for state, links := range linker {
  936. pc := state2pc[state]
  937. for _, ref := range links {
  938. d.prog[ref] = uint32(pc)
  939. }
  940. }
  941. if !d.post() {
  942. return nil, nil
  943. }
  944. return d, nil
  945. }
  946. func (d *DFA) post() bool {
  947. s := d.prefix
  948. if s == "" {
  949. return true
  950. }
  951. pc := d.startPC
  952. state:
  953. for len(s) != 0 {
  954. r, n := utf8.DecodeRuneInString(s)
  955. if n == 0 {
  956. return false
  957. }
  958. s = s[n:]
  959. instr:
  960. for {
  961. pc0 := pc
  962. sym := d.prog[pc]
  963. pc++
  964. switch op, c := sym>>DFARuneBits, sym&DFARuneMask; op {
  965. case DFAOpRune:
  966. next := d.prog[pc]
  967. pc++
  968. if r == rune(c) {
  969. pc = int(next)
  970. continue state
  971. }
  972. case DFAOpBeginText:
  973. if pc0 != d.startPC {
  974. return false
  975. }
  976. pc = int(d.prog[pc])
  977. continue instr
  978. default:
  979. return false
  980. }
  981. }
  982. }
  983. d.prefixPC = pc
  984. return true
  985. }
  986. func (n *nfaLongest) String() string {
  987. a := strings.Split(n.NFA.String(), "\n")
  988. w := 0
  989. for _, v := range a {
  990. v0 := v
  991. var c uint64
  992. if x := strings.Index(v, "(U+"); x >= 0 {
  993. s := v[x+3:]
  994. s = s[:strings.Index(s, ")")]
  995. c, _ = strconv.ParseUint(s, 16, 32)
  996. c = c & DFARuneMask
  997. }
  998. switch {
  999. case strings.Contains(v0, fmt.Sprintf("(U+%X", 2*dfaOpState)):
  1000. // skip
  1001. case strings.Contains(v0, fmt.Sprintf("(U+%X", 2*DFAOpAccept)):
  1002. a[w] = fmt.Sprintf("%s (accept)", v0)
  1003. w++
  1004. case strings.Contains(v0, fmt.Sprintf("(U+%X", 2*DFAOpCharClass)):
  1005. a[w] = fmt.Sprintf("%s (%s)", v0, n.clsIndex[c])
  1006. w++
  1007. case strings.Contains(v0, fmt.Sprintf("(U+%X", 2*DFAOpEndText)):
  1008. a[w] = fmt.Sprintf("%s (EOT)", v0)
  1009. w++
  1010. case strings.Contains(v0, fmt.Sprintf("(U+%X", 2*DFAOpBeginText)):
  1011. a[w] = fmt.Sprintf("%s (begin text)", v0)
  1012. w++
  1013. default:
  1014. a[w] = v0
  1015. w++
  1016. }
  1017. }
  1018. a = a[:w]
  1019. return strings.Join(a, "\n")
  1020. }
  1021. func (d *DFA) String() string {
  1022. var b strings.Builder
  1023. fmt.Fprintf(&b, "NFA\n%s\n", d.nfa)
  1024. prog := d.prog
  1025. fmt.Fprintf(&b, "DFA prog: %4d, start %05d", len(prog), d.startPC)
  1026. if d.prefixPC > 0 {
  1027. fmt.Fprintf(&b, ", prefix start %05d", d.prefixPC)
  1028. }
  1029. fmt.Fprintln(&b)
  1030. state := 0
  1031. newState := true
  1032. for pc := 0; pc < len(prog); {
  1033. if newState {
  1034. fmt.Fprintf(&b, "[%d] nfa states %v\n", state, d.dfaPC2nfaStates[pc])
  1035. state++
  1036. newState = false
  1037. }
  1038. sym := prog[pc]
  1039. fmt.Fprintf(&b, "\t%05d:\t%#U\t", pc, sym)
  1040. pc++
  1041. switch op, c := sym>>DFARuneBits, sym&DFARuneMask; op {
  1042. case DFAOpRune:
  1043. next := prog[pc]
  1044. pc++
  1045. fmt.Fprintf(&b, "%s -> %05d\n", strconv.QuoteRuneToASCII(rune(c)), next)
  1046. case DFAOpEndText:
  1047. next := prog[pc]
  1048. pc++
  1049. fmt.Fprintf(&b, "EOT -> %05d\n", next)
  1050. case DFAOpCharClass:
  1051. next := prog[pc]
  1052. pc++
  1053. cls := d.id2charClass[c]
  1054. fmt.Fprintf(&b, "%s -> %05d\n", cls, next)
  1055. case DFAOpAccept:
  1056. fmt.Fprintf(&b, "accept\n")
  1057. case DFAOpStop:
  1058. fmt.Fprintf(&b, "stop\n")
  1059. newState = true
  1060. case DFAOpBeginText:
  1061. next := prog[pc]
  1062. pc++
  1063. fmt.Fprintf(&b, "begin text -> %05d\n", next)
  1064. default:
  1065. fmt.Fprintf(&b, "???\n")
  1066. }
  1067. }
  1068. return b.String()
  1069. }
  1070. // OnePassProg represents a one pass program.
  1071. //
  1072. // "One-pass" regexp execution.
  1073. //
  1074. // Some regexps can be analyzed to determine that they never need backtracking:
  1075. // they are guaranteed to run in one pass over the string without bothering to
  1076. // save all the usual NFA state.
  1077. type OnePassProg struct {
  1078. Prog *syntax.Prog
  1079. numSubexp int
  1080. prefix string
  1081. subexpNames []string
  1082. prefixComplete bool // prefix is the entire regexp
  1083. }
  1084. func (p *OnePassProg) String() string { return p.Prog.String() }
  1085. // NumSubexp returns the number of parenthesized subexpressions in this
  1086. // OnePassProg.
  1087. func (p *OnePassProg) NumSubexp() int {
  1088. return p.numSubexp
  1089. }
  1090. // SubexpNames returns the names of the parenthesized subexpressions in this
  1091. // OnePassProg. The name for the first sub-expression is names[1], so that if m
  1092. // is a match slice, the name for m[i] is SubexpNames()[i]. Since the
  1093. // OnePassProg as a whole cannot be named, names[0] is always the empty string.
  1094. func (p *OnePassProg) SubexpNames() []string {
  1095. return append([]string(nil), p.subexpNames...)
  1096. }
  1097. // LiteralPrefix returns a literal string that must begin any match
  1098. // of the regular expression p. It returns the boolean true if the
  1099. // literal string comprises the entire regular expression.
  1100. func (p *OnePassProg) LiteralPrefix() (prefix string, complete bool) {
  1101. return p.prefix, p.prefixComplete
  1102. }
  1103. // OnePassProg returns a new OnePassProg or an error, if any.
  1104. func NewOnePassProg(expr string, mode syntax.Flags) (*OnePassProg, error) {
  1105. re, err := syntax.Parse(expr, mode)
  1106. if err != nil {
  1107. return nil, err
  1108. }
  1109. maxCap := re.MaxCap()
  1110. capNames := re.CapNames()
  1111. re = re.Simplify()
  1112. prog, err := syntax.Compile(re)
  1113. if err != nil {
  1114. return nil, err
  1115. }
  1116. onepass := compileOnePass(prog)
  1117. if onepass == nil {
  1118. return nil, fmt.Errorf("cannot handle `%s`", expr)
  1119. }
  1120. prefix, prefixComplete, _ := onePassPrefix(prog)
  1121. return &OnePassProg{
  1122. Prog: prog,
  1123. numSubexp: maxCap,
  1124. subexpNames: capNames,
  1125. prefix: prefix,
  1126. prefixComplete: prefixComplete,
  1127. }, nil
  1128. }