parser_decode.go 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524
  1. package ansi
  2. import (
  3. "unicode/utf8"
  4. "github.com/charmbracelet/x/ansi/parser"
  5. "github.com/mattn/go-runewidth"
  6. "github.com/rivo/uniseg"
  7. )
  8. // State represents the state of the ANSI escape sequence parser used by
  9. // [DecodeSequence].
  10. type State = byte
  11. // ANSI escape sequence states used by [DecodeSequence].
  12. const (
  13. NormalState State = iota
  14. PrefixState
  15. ParamsState
  16. IntermedState
  17. EscapeState
  18. StringState
  19. )
  20. // DecodeSequence decodes the first ANSI escape sequence or a printable
  21. // grapheme from the given data. It returns the sequence slice, the number of
  22. // bytes read, the cell width for each sequence, and the new state.
  23. //
  24. // The cell width will always be 0 for control and escape sequences, 1 for
  25. // ASCII printable characters, and the number of cells other Unicode characters
  26. // occupy. It uses the uniseg package to calculate the width of Unicode
  27. // graphemes and characters. This means it will always do grapheme clustering
  28. // (mode 2027).
  29. //
  30. // Passing a non-nil [*Parser] as the last argument will allow the decoder to
  31. // collect sequence parameters, data, and commands. The parser cmd will have
  32. // the packed command value that contains intermediate and prefix characters.
  33. // In the case of a OSC sequence, the cmd will be the OSC command number. Use
  34. // [Cmd] and [Param] types to unpack command intermediates and prefixes as well
  35. // as parameters.
  36. //
  37. // Zero [Cmd] means the CSI, DCS, or ESC sequence is invalid. Moreover, checking the
  38. // validity of other data sequences, OSC, DCS, etc, will require checking for
  39. // the returned sequence terminator bytes such as ST (ESC \\) and BEL).
  40. //
  41. // We store the command byte in [Cmd] in the most significant byte, the
  42. // prefix byte in the next byte, and the intermediate byte in the least
  43. // significant byte. This is done to avoid using a struct to store the command
  44. // and its intermediates and prefixes. The command byte is always the least
  45. // significant byte i.e. [Cmd & 0xff]. Use the [Cmd] type to unpack the
  46. // command, intermediate, and prefix bytes. Note that we only collect the last
  47. // prefix character and intermediate byte.
  48. //
  49. // The [p.Params] slice will contain the parameters of the sequence. Any
  50. // sub-parameter will have the [parser.HasMoreFlag] set. Use the [Param] type
  51. // to unpack the parameters.
  52. //
  53. // Example:
  54. //
  55. // var state byte // the initial state is always zero [NormalState]
  56. // p := NewParser(32, 1024) // create a new parser with a 32 params buffer and 1024 data buffer (optional)
  57. // input := []byte("\x1b[31mHello, World!\x1b[0m")
  58. // for len(input) > 0 {
  59. // seq, width, n, newState := DecodeSequence(input, state, p)
  60. // log.Printf("seq: %q, width: %d", seq, width)
  61. // state = newState
  62. // input = input[n:]
  63. // }
  64. //
  65. // This function treats the text as a sequence of grapheme clusters.
  66. func DecodeSequence[T string | []byte](b T, state byte, p *Parser) (seq T, width int, n int, newState byte) {
  67. return decodeSequence(GraphemeWidth, b, state, p)
  68. }
  69. // DecodeSequenceWc decodes the first ANSI escape sequence or a printable
  70. // grapheme from the given data. It returns the sequence slice, the number of
  71. // bytes read, the cell width for each sequence, and the new state.
  72. //
  73. // The cell width will always be 0 for control and escape sequences, 1 for
  74. // ASCII printable characters, and the number of cells other Unicode characters
  75. // occupy. It uses the uniseg package to calculate the width of Unicode
  76. // graphemes and characters. This means it will always do grapheme clustering
  77. // (mode 2027).
  78. //
  79. // Passing a non-nil [*Parser] as the last argument will allow the decoder to
  80. // collect sequence parameters, data, and commands. The parser cmd will have
  81. // the packed command value that contains intermediate and prefix characters.
  82. // In the case of a OSC sequence, the cmd will be the OSC command number. Use
  83. // [Cmd] and [Param] types to unpack command intermediates and prefixes as well
  84. // as parameters.
  85. //
  86. // Zero [Cmd] means the CSI, DCS, or ESC sequence is invalid. Moreover, checking the
  87. // validity of other data sequences, OSC, DCS, etc, will require checking for
  88. // the returned sequence terminator bytes such as ST (ESC \\) and BEL).
  89. //
  90. // We store the command byte in [Cmd] in the most significant byte, the
  91. // prefix byte in the next byte, and the intermediate byte in the least
  92. // significant byte. This is done to avoid using a struct to store the command
  93. // and its intermediates and prefixes. The command byte is always the least
  94. // significant byte i.e. [Cmd & 0xff]. Use the [Cmd] type to unpack the
  95. // command, intermediate, and prefix bytes. Note that we only collect the last
  96. // prefix character and intermediate byte.
  97. //
  98. // The [p.Params] slice will contain the parameters of the sequence. Any
  99. // sub-parameter will have the [parser.HasMoreFlag] set. Use the [Param] type
  100. // to unpack the parameters.
  101. //
  102. // Example:
  103. //
  104. // var state byte // the initial state is always zero [NormalState]
  105. // p := NewParser(32, 1024) // create a new parser with a 32 params buffer and 1024 data buffer (optional)
  106. // input := []byte("\x1b[31mHello, World!\x1b[0m")
  107. // for len(input) > 0 {
  108. // seq, width, n, newState := DecodeSequenceWc(input, state, p)
  109. // log.Printf("seq: %q, width: %d", seq, width)
  110. // state = newState
  111. // input = input[n:]
  112. // }
  113. //
  114. // This function treats the text as a sequence of wide characters and runes.
  115. func DecodeSequenceWc[T string | []byte](b T, state byte, p *Parser) (seq T, width int, n int, newState byte) {
  116. return decodeSequence(WcWidth, b, state, p)
  117. }
  118. func decodeSequence[T string | []byte](m Method, b T, state State, p *Parser) (seq T, width int, n int, newState byte) {
  119. for i := 0; i < len(b); i++ {
  120. c := b[i]
  121. switch state {
  122. case NormalState:
  123. switch c {
  124. case ESC:
  125. if p != nil {
  126. if len(p.params) > 0 {
  127. p.params[0] = parser.MissingParam
  128. }
  129. p.cmd = 0
  130. p.paramsLen = 0
  131. p.dataLen = 0
  132. }
  133. state = EscapeState
  134. continue
  135. case CSI, DCS:
  136. if p != nil {
  137. if len(p.params) > 0 {
  138. p.params[0] = parser.MissingParam
  139. }
  140. p.cmd = 0
  141. p.paramsLen = 0
  142. p.dataLen = 0
  143. }
  144. state = PrefixState
  145. continue
  146. case OSC, APC, SOS, PM:
  147. if p != nil {
  148. p.cmd = parser.MissingCommand
  149. p.dataLen = 0
  150. }
  151. state = StringState
  152. continue
  153. }
  154. if p != nil {
  155. p.dataLen = 0
  156. p.paramsLen = 0
  157. p.cmd = 0
  158. }
  159. if c > US && c < DEL {
  160. // ASCII printable characters
  161. return b[i : i+1], 1, 1, NormalState
  162. }
  163. if c <= US || c == DEL || c < 0xC0 {
  164. // C0 & C1 control characters & DEL
  165. return b[i : i+1], 0, 1, NormalState
  166. }
  167. if utf8.RuneStart(c) {
  168. seq, _, width, _ = FirstGraphemeCluster(b, -1)
  169. if m == WcWidth {
  170. width = runewidth.StringWidth(string(seq))
  171. }
  172. i += len(seq)
  173. return b[:i], width, i, NormalState
  174. }
  175. // Invalid UTF-8 sequence
  176. return b[:i], 0, i, NormalState
  177. case PrefixState:
  178. if c >= '<' && c <= '?' {
  179. if p != nil {
  180. // We only collect the last prefix character.
  181. p.cmd &^= 0xff << parser.PrefixShift
  182. p.cmd |= int(c) << parser.PrefixShift
  183. }
  184. break
  185. }
  186. state = ParamsState
  187. fallthrough
  188. case ParamsState:
  189. if c >= '0' && c <= '9' {
  190. if p != nil {
  191. if p.params[p.paramsLen] == parser.MissingParam {
  192. p.params[p.paramsLen] = 0
  193. }
  194. p.params[p.paramsLen] *= 10
  195. p.params[p.paramsLen] += int(c - '0')
  196. }
  197. break
  198. }
  199. if c == ':' {
  200. if p != nil {
  201. p.params[p.paramsLen] |= parser.HasMoreFlag
  202. }
  203. }
  204. if c == ';' || c == ':' {
  205. if p != nil {
  206. p.paramsLen++
  207. if p.paramsLen < len(p.params) {
  208. p.params[p.paramsLen] = parser.MissingParam
  209. }
  210. }
  211. break
  212. }
  213. state = IntermedState
  214. fallthrough
  215. case IntermedState:
  216. if c >= ' ' && c <= '/' {
  217. if p != nil {
  218. p.cmd &^= 0xff << parser.IntermedShift
  219. p.cmd |= int(c) << parser.IntermedShift
  220. }
  221. break
  222. }
  223. if p != nil {
  224. // Increment the last parameter
  225. if p.paramsLen > 0 && p.paramsLen < len(p.params)-1 ||
  226. p.paramsLen == 0 && len(p.params) > 0 && p.params[0] != parser.MissingParam {
  227. p.paramsLen++
  228. }
  229. }
  230. if c >= '@' && c <= '~' {
  231. if p != nil {
  232. p.cmd &^= 0xff
  233. p.cmd |= int(c)
  234. }
  235. if HasDcsPrefix(b) {
  236. // Continue to collect DCS data
  237. if p != nil {
  238. p.dataLen = 0
  239. }
  240. state = StringState
  241. continue
  242. }
  243. return b[:i+1], 0, i + 1, NormalState
  244. }
  245. // Invalid CSI/DCS sequence
  246. return b[:i], 0, i, NormalState
  247. case EscapeState:
  248. switch c {
  249. case '[', 'P':
  250. if p != nil {
  251. if len(p.params) > 0 {
  252. p.params[0] = parser.MissingParam
  253. }
  254. p.paramsLen = 0
  255. p.cmd = 0
  256. }
  257. state = PrefixState
  258. continue
  259. case ']', 'X', '^', '_':
  260. if p != nil {
  261. p.cmd = parser.MissingCommand
  262. p.dataLen = 0
  263. }
  264. state = StringState
  265. continue
  266. }
  267. if c >= ' ' && c <= '/' {
  268. if p != nil {
  269. p.cmd &^= 0xff << parser.IntermedShift
  270. p.cmd |= int(c) << parser.IntermedShift
  271. }
  272. continue
  273. } else if c >= '0' && c <= '~' {
  274. if p != nil {
  275. p.cmd &^= 0xff
  276. p.cmd |= int(c)
  277. }
  278. return b[:i+1], 0, i + 1, NormalState
  279. }
  280. // Invalid escape sequence
  281. return b[:i], 0, i, NormalState
  282. case StringState:
  283. switch c {
  284. case BEL:
  285. if HasOscPrefix(b) {
  286. parseOscCmd(p)
  287. return b[:i+1], 0, i + 1, NormalState
  288. }
  289. case CAN, SUB:
  290. if HasOscPrefix(b) {
  291. // Ensure we parse the OSC command number
  292. parseOscCmd(p)
  293. }
  294. // Cancel the sequence
  295. return b[:i], 0, i, NormalState
  296. case ST:
  297. if HasOscPrefix(b) {
  298. // Ensure we parse the OSC command number
  299. parseOscCmd(p)
  300. }
  301. return b[:i+1], 0, i + 1, NormalState
  302. case ESC:
  303. if HasStPrefix(b[i:]) {
  304. if HasOscPrefix(b) {
  305. // Ensure we parse the OSC command number
  306. parseOscCmd(p)
  307. }
  308. // End of string 7-bit (ST)
  309. return b[:i+2], 0, i + 2, NormalState
  310. }
  311. // Otherwise, cancel the sequence
  312. return b[:i], 0, i, NormalState
  313. }
  314. if p != nil && p.dataLen < len(p.data) {
  315. p.data[p.dataLen] = c
  316. p.dataLen++
  317. // Parse the OSC command number
  318. if c == ';' && HasOscPrefix(b) {
  319. parseOscCmd(p)
  320. }
  321. }
  322. }
  323. }
  324. return b, 0, len(b), state
  325. }
  326. func parseOscCmd(p *Parser) {
  327. if p == nil || p.cmd != parser.MissingCommand {
  328. return
  329. }
  330. for j := 0; j < p.dataLen; j++ {
  331. d := p.data[j]
  332. if d < '0' || d > '9' {
  333. break
  334. }
  335. if p.cmd == parser.MissingCommand {
  336. p.cmd = 0
  337. }
  338. p.cmd *= 10
  339. p.cmd += int(d - '0')
  340. }
  341. }
  342. // Equal returns true if the given byte slices are equal.
  343. func Equal[T string | []byte](a, b T) bool {
  344. return string(a) == string(b)
  345. }
  346. // HasPrefix returns true if the given byte slice has prefix.
  347. func HasPrefix[T string | []byte](b, prefix T) bool {
  348. return len(b) >= len(prefix) && Equal(b[0:len(prefix)], prefix)
  349. }
  350. // HasSuffix returns true if the given byte slice has suffix.
  351. func HasSuffix[T string | []byte](b, suffix T) bool {
  352. return len(b) >= len(suffix) && Equal(b[len(b)-len(suffix):], suffix)
  353. }
  354. // HasCsiPrefix returns true if the given byte slice has a CSI prefix.
  355. func HasCsiPrefix[T string | []byte](b T) bool {
  356. return (len(b) > 0 && b[0] == CSI) ||
  357. (len(b) > 1 && b[0] == ESC && b[1] == '[')
  358. }
  359. // HasOscPrefix returns true if the given byte slice has an OSC prefix.
  360. func HasOscPrefix[T string | []byte](b T) bool {
  361. return (len(b) > 0 && b[0] == OSC) ||
  362. (len(b) > 1 && b[0] == ESC && b[1] == ']')
  363. }
  364. // HasApcPrefix returns true if the given byte slice has an APC prefix.
  365. func HasApcPrefix[T string | []byte](b T) bool {
  366. return (len(b) > 0 && b[0] == APC) ||
  367. (len(b) > 1 && b[0] == ESC && b[1] == '_')
  368. }
  369. // HasDcsPrefix returns true if the given byte slice has a DCS prefix.
  370. func HasDcsPrefix[T string | []byte](b T) bool {
  371. return (len(b) > 0 && b[0] == DCS) ||
  372. (len(b) > 1 && b[0] == ESC && b[1] == 'P')
  373. }
  374. // HasSosPrefix returns true if the given byte slice has a SOS prefix.
  375. func HasSosPrefix[T string | []byte](b T) bool {
  376. return (len(b) > 0 && b[0] == SOS) ||
  377. (len(b) > 1 && b[0] == ESC && b[1] == 'X')
  378. }
  379. // HasPmPrefix returns true if the given byte slice has a PM prefix.
  380. func HasPmPrefix[T string | []byte](b T) bool {
  381. return (len(b) > 0 && b[0] == PM) ||
  382. (len(b) > 1 && b[0] == ESC && b[1] == '^')
  383. }
  384. // HasStPrefix returns true if the given byte slice has a ST prefix.
  385. func HasStPrefix[T string | []byte](b T) bool {
  386. return (len(b) > 0 && b[0] == ST) ||
  387. (len(b) > 1 && b[0] == ESC && b[1] == '\\')
  388. }
  389. // HasEscPrefix returns true if the given byte slice has an ESC prefix.
  390. func HasEscPrefix[T string | []byte](b T) bool {
  391. return len(b) > 0 && b[0] == ESC
  392. }
  393. // FirstGraphemeCluster returns the first grapheme cluster in the given string or byte slice.
  394. // This is a syntactic sugar function that wraps
  395. // uniseg.FirstGraphemeClusterInString and uniseg.FirstGraphemeCluster.
  396. func FirstGraphemeCluster[T string | []byte](b T, state int) (T, T, int, int) {
  397. switch b := any(b).(type) {
  398. case string:
  399. cluster, rest, width, newState := uniseg.FirstGraphemeClusterInString(b, state)
  400. return T(cluster), T(rest), width, newState
  401. case []byte:
  402. cluster, rest, width, newState := uniseg.FirstGraphemeCluster(b, state)
  403. return T(cluster), T(rest), width, newState
  404. }
  405. panic("unreachable")
  406. }
  407. // Cmd represents a sequence command. This is used to pack/unpack a sequence
  408. // command with its intermediate and prefix characters. Those are commonly
  409. // found in CSI and DCS sequences.
  410. type Cmd int
  411. // Prefix returns the unpacked prefix byte of the CSI sequence.
  412. // This is always gonna be one of the following '<' '=' '>' '?' and in the
  413. // range of 0x3C-0x3F.
  414. // Zero is returned if the sequence does not have a prefix.
  415. func (c Cmd) Prefix() byte {
  416. return byte(parser.Prefix(int(c)))
  417. }
  418. // Intermediate returns the unpacked intermediate byte of the CSI sequence.
  419. // An intermediate byte is in the range of 0x20-0x2F. This includes these
  420. // characters from ' ', '!', '"', '#', '$', '%', '&', ”', '(', ')', '*', '+',
  421. // ',', '-', '.', '/'.
  422. // Zero is returned if the sequence does not have an intermediate byte.
  423. func (c Cmd) Intermediate() byte {
  424. return byte(parser.Intermediate(int(c)))
  425. }
  426. // Final returns the unpacked command byte of the CSI sequence.
  427. func (c Cmd) Final() byte {
  428. return byte(parser.Command(int(c)))
  429. }
  430. // Command packs a command with the given prefix, intermediate, and final. A
  431. // zero byte means the sequence does not have a prefix or intermediate.
  432. //
  433. // Prefixes are in the range of 0x3C-0x3F that is one of `<=>?`.
  434. //
  435. // Intermediates are in the range of 0x20-0x2F that is anything in
  436. // `!"#$%&'()*+,-./`.
  437. //
  438. // Final bytes are in the range of 0x40-0x7E that is anything in the range
  439. // `@A–Z[\]^_`a–z{|}~`.
  440. func Command(prefix, inter, final byte) (c int) {
  441. c = int(final)
  442. c |= int(prefix) << parser.PrefixShift
  443. c |= int(inter) << parser.IntermedShift
  444. return
  445. }
  446. // Param represents a sequence parameter. Sequence parameters with
  447. // sub-parameters are packed with the HasMoreFlag set. This is used to unpack
  448. // the parameters from a CSI and DCS sequences.
  449. type Param int
  450. // Param returns the unpacked parameter at the given index.
  451. // It returns the default value if the parameter is missing.
  452. func (s Param) Param(def int) int {
  453. p := int(s) & parser.ParamMask
  454. if p == parser.MissingParam {
  455. return def
  456. }
  457. return p
  458. }
  459. // HasMore unpacks the HasMoreFlag from the parameter.
  460. func (s Param) HasMore() bool {
  461. return s&parser.HasMoreFlag != 0
  462. }
  463. // Parameter packs an escape code parameter with the given parameter and
  464. // whether this parameter has following sub-parameters.
  465. func Parameter(p int, hasMore bool) (s int) {
  466. s = p & parser.ParamMask
  467. if hasMore {
  468. s |= parser.HasMoreFlag
  469. }
  470. return
  471. }