regexp.go 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326
  1. // Copyright 2011 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the GO-LICENSE file.
  4. // Modifications of this file, if any, are
  5. //
  6. // Copyright 2023 The Regexp Authors. All rights reserved.
  7. // Use of this source code is governed by a BSD-style
  8. // license that can be found in the LICENSE file.
  9. package syntax // modernc.org/regexp/syntax
  10. // Note to implementers:
  11. // In this package, re is always a *Regexp and r is always a rune.
  12. import (
  13. "strconv"
  14. "strings"
  15. "unicode"
  16. )
  17. // A Regexp is a node in a regular expression syntax tree.
  18. type Regexp struct {
  19. Op Op // operator
  20. Flags Flags
  21. Sub []*Regexp // subexpressions, if any
  22. Sub0 [1]*Regexp // storage for short Sub
  23. Rune []rune // matched runes, for OpLiteral, OpCharClass
  24. Rune0 [2]rune // storage for short Rune
  25. Min, Max int // min, max for OpRepeat
  26. Cap int // capturing index, for OpCapture
  27. Name string // capturing name, for OpCapture
  28. }
  29. //go:generate stringer -type Op -trimprefix Op
  30. // An Op is a single regular expression operator.
  31. type Op uint8
  32. // Operators are listed in precedence order, tightest binding to weakest.
  33. // Character class operators are listed simplest to most complex
  34. // (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar).
  35. const (
  36. OpNoMatch Op = 1 + iota // matches no strings
  37. OpEmptyMatch // matches empty string
  38. OpLiteral // matches Runes sequence
  39. OpCharClass // matches Runes interpreted as range pair list
  40. OpAnyCharNotNL // matches any character except newline
  41. OpAnyChar // matches any character
  42. OpBeginLine // matches empty string at beginning of line
  43. OpEndLine // matches empty string at end of line
  44. OpBeginText // matches empty string at beginning of text
  45. OpEndText // matches empty string at end of text
  46. OpWordBoundary // matches word boundary `\b`
  47. OpNoWordBoundary // matches word non-boundary `\B`
  48. OpCapture // capturing subexpression with index Cap, optional name Name
  49. OpStar // matches Sub[0] zero or more times
  50. OpPlus // matches Sub[0] one or more times
  51. OpQuest // matches Sub[0] zero or one times
  52. OpRepeat // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit)
  53. OpConcat // matches concatenation of Subs
  54. OpAlternate // matches alternation of Subs
  55. )
  56. const opPseudo Op = 128 // where pseudo-ops start
  57. // Equal reports whether x and y have identical structure.
  58. func (x *Regexp) Equal(y *Regexp) bool {
  59. if x == nil || y == nil {
  60. return x == y
  61. }
  62. if x.Op != y.Op {
  63. return false
  64. }
  65. switch x.Op {
  66. case OpEndText:
  67. // The parse flags remember whether this is \z or \Z.
  68. if x.Flags&WasDollar != y.Flags&WasDollar {
  69. return false
  70. }
  71. case OpLiteral, OpCharClass:
  72. if len(x.Rune) != len(y.Rune) {
  73. return false
  74. }
  75. for i, r := range x.Rune {
  76. if r != y.Rune[i] {
  77. return false
  78. }
  79. }
  80. case OpAlternate, OpConcat:
  81. if len(x.Sub) != len(y.Sub) {
  82. return false
  83. }
  84. for i, sub := range x.Sub {
  85. if !sub.Equal(y.Sub[i]) {
  86. return false
  87. }
  88. }
  89. case OpStar, OpPlus, OpQuest:
  90. if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) {
  91. return false
  92. }
  93. case OpRepeat:
  94. if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) {
  95. return false
  96. }
  97. case OpCapture:
  98. if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) {
  99. return false
  100. }
  101. }
  102. return true
  103. }
  104. // writeRegexp writes the Perl syntax for the regular expression re to b.
  105. func writeRegexp(b *strings.Builder, re *Regexp) {
  106. switch re.Op {
  107. default:
  108. b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
  109. case OpNoMatch:
  110. b.WriteString(`[^\x00-\x{10FFFF}]`)
  111. case OpEmptyMatch:
  112. b.WriteString(`(?:)`)
  113. case OpLiteral:
  114. if re.Flags&FoldCase != 0 {
  115. b.WriteString(`(?i:`)
  116. }
  117. for _, r := range re.Rune {
  118. escape(b, r, false)
  119. }
  120. if re.Flags&FoldCase != 0 {
  121. b.WriteString(`)`)
  122. }
  123. case OpCharClass:
  124. if len(re.Rune)%2 != 0 {
  125. b.WriteString(`[invalid char class]`)
  126. break
  127. }
  128. b.WriteRune('[')
  129. if len(re.Rune) == 0 {
  130. b.WriteString(`^\x00-\x{10FFFF}`)
  131. } else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune && len(re.Rune) > 2 {
  132. // Contains 0 and MaxRune. Probably a negated class.
  133. // Print the gaps.
  134. b.WriteRune('^')
  135. for i := 1; i < len(re.Rune)-1; i += 2 {
  136. lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
  137. escape(b, lo, lo == '-')
  138. if lo != hi {
  139. b.WriteRune('-')
  140. escape(b, hi, hi == '-')
  141. }
  142. }
  143. } else {
  144. for i := 0; i < len(re.Rune); i += 2 {
  145. lo, hi := re.Rune[i], re.Rune[i+1]
  146. escape(b, lo, lo == '-')
  147. if lo != hi {
  148. b.WriteRune('-')
  149. escape(b, hi, hi == '-')
  150. }
  151. }
  152. }
  153. b.WriteRune(']')
  154. case OpAnyCharNotNL:
  155. b.WriteString(`(?-s:.)`)
  156. case OpAnyChar:
  157. b.WriteString(`(?s:.)`)
  158. case OpBeginLine:
  159. b.WriteString(`(?m:^)`)
  160. case OpEndLine:
  161. b.WriteString(`(?m:$)`)
  162. case OpBeginText:
  163. b.WriteString(`\A`)
  164. case OpEndText:
  165. if re.Flags&WasDollar != 0 {
  166. b.WriteString(`(?-m:$)`)
  167. } else {
  168. b.WriteString(`\z`)
  169. }
  170. case OpWordBoundary:
  171. b.WriteString(`\b`)
  172. case OpNoWordBoundary:
  173. b.WriteString(`\B`)
  174. case OpCapture:
  175. if re.Name != "" {
  176. b.WriteString(`(?P<`)
  177. b.WriteString(re.Name)
  178. b.WriteRune('>')
  179. } else {
  180. b.WriteRune('(')
  181. }
  182. if re.Sub[0].Op != OpEmptyMatch {
  183. writeRegexp(b, re.Sub[0])
  184. }
  185. b.WriteRune(')')
  186. case OpStar, OpPlus, OpQuest, OpRepeat:
  187. if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
  188. b.WriteString(`(?:`)
  189. writeRegexp(b, sub)
  190. b.WriteString(`)`)
  191. } else {
  192. writeRegexp(b, sub)
  193. }
  194. switch re.Op {
  195. case OpStar:
  196. b.WriteRune('*')
  197. case OpPlus:
  198. b.WriteRune('+')
  199. case OpQuest:
  200. b.WriteRune('?')
  201. case OpRepeat:
  202. b.WriteRune('{')
  203. b.WriteString(strconv.Itoa(re.Min))
  204. if re.Max != re.Min {
  205. b.WriteRune(',')
  206. if re.Max >= 0 {
  207. b.WriteString(strconv.Itoa(re.Max))
  208. }
  209. }
  210. b.WriteRune('}')
  211. }
  212. if re.Flags&NonGreedy != 0 {
  213. b.WriteRune('?')
  214. }
  215. case OpConcat:
  216. for _, sub := range re.Sub {
  217. if sub.Op == OpAlternate {
  218. b.WriteString(`(?:`)
  219. writeRegexp(b, sub)
  220. b.WriteString(`)`)
  221. } else {
  222. writeRegexp(b, sub)
  223. }
  224. }
  225. case OpAlternate:
  226. for i, sub := range re.Sub {
  227. if i > 0 {
  228. b.WriteRune('|')
  229. }
  230. writeRegexp(b, sub)
  231. }
  232. }
  233. }
  234. func (re *Regexp) String() string {
  235. var b strings.Builder
  236. writeRegexp(&b, re)
  237. return b.String()
  238. }
  239. const meta = `\.+*?()|[]{}^$`
  240. func escape(b *strings.Builder, r rune, force bool) {
  241. if unicode.IsPrint(r) {
  242. if strings.ContainsRune(meta, r) || force {
  243. b.WriteRune('\\')
  244. }
  245. b.WriteRune(r)
  246. return
  247. }
  248. switch r {
  249. case '\a':
  250. b.WriteString(`\a`)
  251. case '\f':
  252. b.WriteString(`\f`)
  253. case '\n':
  254. b.WriteString(`\n`)
  255. case '\r':
  256. b.WriteString(`\r`)
  257. case '\t':
  258. b.WriteString(`\t`)
  259. case '\v':
  260. b.WriteString(`\v`)
  261. default:
  262. if r < 0x100 {
  263. b.WriteString(`\x`)
  264. s := strconv.FormatInt(int64(r), 16)
  265. if len(s) == 1 {
  266. b.WriteRune('0')
  267. }
  268. b.WriteString(s)
  269. break
  270. }
  271. b.WriteString(`\x{`)
  272. b.WriteString(strconv.FormatInt(int64(r), 16))
  273. b.WriteString(`}`)
  274. }
  275. }
  276. // MaxCap walks the regexp to find the maximum capture index.
  277. func (re *Regexp) MaxCap() int {
  278. m := 0
  279. if re.Op == OpCapture {
  280. m = re.Cap
  281. }
  282. for _, sub := range re.Sub {
  283. if n := sub.MaxCap(); m < n {
  284. m = n
  285. }
  286. }
  287. return m
  288. }
  289. // CapNames walks the regexp to find the names of capturing groups.
  290. func (re *Regexp) CapNames() []string {
  291. names := make([]string, re.MaxCap()+1)
  292. re.capNames(names)
  293. return names
  294. }
  295. func (re *Regexp) capNames(names []string) {
  296. if re.Op == OpCapture {
  297. names[re.Cap] = re.Name
  298. }
  299. for _, sub := range re.Sub {
  300. sub.capNames(names)
  301. }
  302. }