parser_decode.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461
  1. package ansi
  2. import (
  3. "unicode/utf8"
  4. "github.com/charmbracelet/x/ansi/parser"
  5. "github.com/rivo/uniseg"
  6. )
// State represents the state of the ANSI escape sequence parser used by
// [DecodeSequence]. The value returned by one [DecodeSequence] call is meant
// to be passed to the next call; the zero value is [NormalState].
type State = byte
// ANSI escape sequence states used by [DecodeSequence].
const (
	// NormalState is the initial (zero) state: the next byte starts a new
	// control character, escape sequence, or printable grapheme.
	NormalState State = iota
	// MarkerState follows a CSI or DCS introducer and collects marker bytes
	// in the range '<'..'?'.
	MarkerState
	// ParamsState collects numeric parameters and their ';' / ':' separators.
	ParamsState
	// IntermedState collects intermediate bytes in the range ' '..'/' and
	// then the final command byte.
	IntermedState
	// EscapeState follows an ESC byte.
	EscapeState
	// StringState collects the payload of OSC, APC, SOS, PM, and DCS
	// sequences until a terminator or cancel byte is seen.
	StringState
)
  19. // DecodeSequence decodes the first ANSI escape sequence or a printable
  20. // grapheme from the given data. It returns the sequence slice, the number of
  21. // bytes read, the cell width for each sequence, and the new state.
  22. //
  23. // The cell width will always be 0 for control and escape sequences, 1 for
  24. // ASCII printable characters, and the number of cells other Unicode characters
  25. // occupy. It uses the uniseg package to calculate the width of Unicode
  26. // graphemes and characters. This means it will always do grapheme clustering
  27. // (mode 2027).
  28. //
  29. // Passing a non-nil [*Parser] as the last argument will allow the decoder to
  30. // collect sequence parameters, data, and commands. The parser cmd will have
  31. // the packed command value that contains intermediate and marker characters.
  32. // In the case of a OSC sequence, the cmd will be the OSC command number. Use
  33. // [Command] and [Parameter] types to unpack command intermediates and markers as well
  34. // as parameters.
  35. //
  36. // Zero [Command] means the CSI, DCS, or ESC sequence is invalid. Moreover, checking the
  37. // validity of other data sequences, OSC, DCS, etc, will require checking for
  38. // the returned sequence terminator bytes such as ST (ESC \\) and BEL).
  39. //
  40. // We store the command byte in [Command] in the most significant byte, the
  41. // marker byte in the next byte, and the intermediate byte in the least
  42. // significant byte. This is done to avoid using a struct to store the command
  43. // and its intermediates and markers. The command byte is always the least
  44. // significant byte i.e. [Cmd & 0xff]. Use the [Command] type to unpack the
  45. // command, intermediate, and marker bytes. Note that we only collect the last
  46. // marker character and intermediate byte.
  47. //
  48. // The [p.Params] slice will contain the parameters of the sequence. Any
  49. // sub-parameter will have the [parser.HasMoreFlag] set. Use the [Parameter] type
  50. // to unpack the parameters.
  51. //
  52. // Example:
  53. //
  54. // var state byte // the initial state is always zero [NormalState]
  55. // p := NewParser(32, 1024) // create a new parser with a 32 params buffer and 1024 data buffer (optional)
  56. // input := []byte("\x1b[31mHello, World!\x1b[0m")
  57. // for len(input) > 0 {
  58. // seq, width, n, newState := DecodeSequence(input, state, p)
  59. // log.Printf("seq: %q, width: %d", seq, width)
  60. // state = newState
  61. // input = input[n:]
  62. // }
  63. func DecodeSequence[T string | []byte](b T, state byte, p *Parser) (seq T, width int, n int, newState byte) {
  64. for i := 0; i < len(b); i++ {
  65. c := b[i]
  66. switch state {
  67. case NormalState:
  68. switch c {
  69. case ESC:
  70. if p != nil {
  71. if len(p.params) > 0 {
  72. p.params[0] = parser.MissingParam
  73. }
  74. p.cmd = 0
  75. p.paramsLen = 0
  76. p.dataLen = 0
  77. }
  78. state = EscapeState
  79. continue
  80. case CSI, DCS:
  81. if p != nil {
  82. if len(p.params) > 0 {
  83. p.params[0] = parser.MissingParam
  84. }
  85. p.cmd = 0
  86. p.paramsLen = 0
  87. p.dataLen = 0
  88. }
  89. state = MarkerState
  90. continue
  91. case OSC, APC, SOS, PM:
  92. if p != nil {
  93. p.cmd = parser.MissingCommand
  94. p.dataLen = 0
  95. }
  96. state = StringState
  97. continue
  98. }
  99. if p != nil {
  100. p.dataLen = 0
  101. p.paramsLen = 0
  102. p.cmd = 0
  103. }
  104. if c > US && c < DEL {
  105. // ASCII printable characters
  106. return b[i : i+1], 1, 1, NormalState
  107. }
  108. if c <= US || c == DEL || c < 0xC0 {
  109. // C0 & C1 control characters & DEL
  110. return b[i : i+1], 0, 1, NormalState
  111. }
  112. if utf8.RuneStart(c) {
  113. seq, _, width, _ = FirstGraphemeCluster(b, -1)
  114. i += len(seq)
  115. return b[:i], width, i, NormalState
  116. }
  117. // Invalid UTF-8 sequence
  118. return b[:i], 0, i, NormalState
  119. case MarkerState:
  120. if c >= '<' && c <= '?' {
  121. if p != nil {
  122. // We only collect the last marker character.
  123. p.cmd &^= 0xff << parser.MarkerShift
  124. p.cmd |= int(c) << parser.MarkerShift
  125. }
  126. break
  127. }
  128. state = ParamsState
  129. fallthrough
  130. case ParamsState:
  131. if c >= '0' && c <= '9' {
  132. if p != nil {
  133. if p.params[p.paramsLen] == parser.MissingParam {
  134. p.params[p.paramsLen] = 0
  135. }
  136. p.params[p.paramsLen] *= 10
  137. p.params[p.paramsLen] += int(c - '0')
  138. }
  139. break
  140. }
  141. if c == ':' {
  142. if p != nil {
  143. p.params[p.paramsLen] |= parser.HasMoreFlag
  144. }
  145. }
  146. if c == ';' || c == ':' {
  147. if p != nil {
  148. p.paramsLen++
  149. if p.paramsLen < len(p.params) {
  150. p.params[p.paramsLen] = parser.MissingParam
  151. }
  152. }
  153. break
  154. }
  155. state = IntermedState
  156. fallthrough
  157. case IntermedState:
  158. if c >= ' ' && c <= '/' {
  159. if p != nil {
  160. p.cmd &^= 0xff << parser.IntermedShift
  161. p.cmd |= int(c) << parser.IntermedShift
  162. }
  163. break
  164. }
  165. if p != nil {
  166. // Increment the last parameter
  167. if p.paramsLen > 0 && p.paramsLen < len(p.params)-1 ||
  168. p.paramsLen == 0 && len(p.params) > 0 && p.params[0] != parser.MissingParam {
  169. p.paramsLen++
  170. }
  171. }
  172. if c >= '@' && c <= '~' {
  173. if p != nil {
  174. p.cmd &^= 0xff
  175. p.cmd |= int(c)
  176. }
  177. if HasDcsPrefix(b) {
  178. // Continue to collect DCS data
  179. if p != nil {
  180. p.dataLen = 0
  181. }
  182. state = StringState
  183. continue
  184. }
  185. return b[:i+1], 0, i + 1, NormalState
  186. }
  187. // Invalid CSI/DCS sequence
  188. return b[:i], 0, i, NormalState
  189. case EscapeState:
  190. switch c {
  191. case '[', 'P':
  192. if p != nil {
  193. if len(p.params) > 0 {
  194. p.params[0] = parser.MissingParam
  195. }
  196. p.paramsLen = 0
  197. p.cmd = 0
  198. }
  199. state = MarkerState
  200. continue
  201. case ']', 'X', '^', '_':
  202. if p != nil {
  203. p.cmd = parser.MissingCommand
  204. p.dataLen = 0
  205. }
  206. state = StringState
  207. continue
  208. }
  209. if c >= ' ' && c <= '/' {
  210. if p != nil {
  211. p.cmd &^= 0xff << parser.IntermedShift
  212. p.cmd |= int(c) << parser.IntermedShift
  213. }
  214. continue
  215. } else if c >= '0' && c <= '~' {
  216. if p != nil {
  217. p.cmd &^= 0xff
  218. p.cmd |= int(c)
  219. }
  220. return b[:i+1], 0, i + 1, NormalState
  221. }
  222. // Invalid escape sequence
  223. return b[:i], 0, i, NormalState
  224. case StringState:
  225. switch c {
  226. case BEL:
  227. if HasOscPrefix(b) {
  228. parseOscCmd(p)
  229. return b[:i+1], 0, i + 1, NormalState
  230. }
  231. case CAN, SUB:
  232. if HasOscPrefix(b) {
  233. // Ensure we parse the OSC command number
  234. parseOscCmd(p)
  235. }
  236. // Cancel the sequence
  237. return b[:i], 0, i, NormalState
  238. case ST:
  239. if HasOscPrefix(b) {
  240. // Ensure we parse the OSC command number
  241. parseOscCmd(p)
  242. }
  243. return b[:i+1], 0, i + 1, NormalState
  244. case ESC:
  245. if HasStPrefix(b[i:]) {
  246. if HasOscPrefix(b) {
  247. // Ensure we parse the OSC command number
  248. parseOscCmd(p)
  249. }
  250. // End of string 7-bit (ST)
  251. return b[:i+2], 0, i + 2, NormalState
  252. }
  253. // Otherwise, cancel the sequence
  254. return b[:i], 0, i, NormalState
  255. }
  256. if p != nil && p.dataLen < len(p.data) {
  257. p.data[p.dataLen] = c
  258. p.dataLen++
  259. // Parse the OSC command number
  260. if c == ';' && HasOscPrefix(b) {
  261. parseOscCmd(p)
  262. }
  263. }
  264. }
  265. }
  266. return b, 0, len(b), state
  267. }
  268. func parseOscCmd(p *Parser) {
  269. if p == nil || p.cmd != parser.MissingCommand {
  270. return
  271. }
  272. for j := 0; j < p.dataLen; j++ {
  273. d := p.data[j]
  274. if d < '0' || d > '9' {
  275. break
  276. }
  277. if p.cmd == parser.MissingCommand {
  278. p.cmd = 0
  279. }
  280. p.cmd *= 10
  281. p.cmd += int(d - '0')
  282. }
  283. }
  284. // Equal returns true if the given byte slices are equal.
  285. func Equal[T string | []byte](a, b T) bool {
  286. return string(a) == string(b)
  287. }
  288. // HasPrefix returns true if the given byte slice has prefix.
  289. func HasPrefix[T string | []byte](b, prefix T) bool {
  290. return len(b) >= len(prefix) && Equal(b[0:len(prefix)], prefix)
  291. }
  292. // HasSuffix returns true if the given byte slice has suffix.
  293. func HasSuffix[T string | []byte](b, suffix T) bool {
  294. return len(b) >= len(suffix) && Equal(b[len(b)-len(suffix):], suffix)
  295. }
  296. // HasCsiPrefix returns true if the given byte slice has a CSI prefix.
  297. func HasCsiPrefix[T string | []byte](b T) bool {
  298. return (len(b) > 0 && b[0] == CSI) ||
  299. (len(b) > 1 && b[0] == ESC && b[1] == '[')
  300. }
  301. // HasOscPrefix returns true if the given byte slice has an OSC prefix.
  302. func HasOscPrefix[T string | []byte](b T) bool {
  303. return (len(b) > 0 && b[0] == OSC) ||
  304. (len(b) > 1 && b[0] == ESC && b[1] == ']')
  305. }
  306. // HasApcPrefix returns true if the given byte slice has an APC prefix.
  307. func HasApcPrefix[T string | []byte](b T) bool {
  308. return (len(b) > 0 && b[0] == APC) ||
  309. (len(b) > 1 && b[0] == ESC && b[1] == '_')
  310. }
  311. // HasDcsPrefix returns true if the given byte slice has a DCS prefix.
  312. func HasDcsPrefix[T string | []byte](b T) bool {
  313. return (len(b) > 0 && b[0] == DCS) ||
  314. (len(b) > 1 && b[0] == ESC && b[1] == 'P')
  315. }
  316. // HasSosPrefix returns true if the given byte slice has a SOS prefix.
  317. func HasSosPrefix[T string | []byte](b T) bool {
  318. return (len(b) > 0 && b[0] == SOS) ||
  319. (len(b) > 1 && b[0] == ESC && b[1] == 'X')
  320. }
  321. // HasPmPrefix returns true if the given byte slice has a PM prefix.
  322. func HasPmPrefix[T string | []byte](b T) bool {
  323. return (len(b) > 0 && b[0] == PM) ||
  324. (len(b) > 1 && b[0] == ESC && b[1] == '^')
  325. }
  326. // HasStPrefix returns true if the given byte slice has a ST prefix.
  327. func HasStPrefix[T string | []byte](b T) bool {
  328. return (len(b) > 0 && b[0] == ST) ||
  329. (len(b) > 1 && b[0] == ESC && b[1] == '\\')
  330. }
  331. // HasEscPrefix returns true if the given byte slice has an ESC prefix.
  332. func HasEscPrefix[T string | []byte](b T) bool {
  333. return len(b) > 0 && b[0] == ESC
  334. }
  335. // FirstGraphemeCluster returns the first grapheme cluster in the given string or byte slice.
  336. // This is a syntactic sugar function that wraps
  337. // uniseg.FirstGraphemeClusterInString and uniseg.FirstGraphemeCluster.
  338. func FirstGraphemeCluster[T string | []byte](b T, state int) (T, T, int, int) {
  339. switch b := any(b).(type) {
  340. case string:
  341. cluster, rest, width, newState := uniseg.FirstGraphemeClusterInString(b, state)
  342. return T(cluster), T(rest), width, newState
  343. case []byte:
  344. cluster, rest, width, newState := uniseg.FirstGraphemeCluster(b, state)
  345. return T(cluster), T(rest), width, newState
  346. }
  347. panic("unreachable")
  348. }
// Command represents a sequence command. This is used to pack/unpack a sequence
// command with its intermediate and marker characters. Those are commonly
// found in CSI and DCS sequences.
//
// Use [Cmd] to build a packed value and the Marker, Intermediate, and Command
// methods to take it apart again.
type Command int
  353. // Marker returns the unpacked marker byte of the CSI sequence.
  354. // This is always gonna be one of the following '<' '=' '>' '?' and in the
  355. // range of 0x3C-0x3F.
  356. // Zero is returned if the sequence does not have a marker.
  357. func (c Command) Marker() int {
  358. return parser.Marker(int(c))
  359. }
  360. // Intermediate returns the unpacked intermediate byte of the CSI sequence.
  361. // An intermediate byte is in the range of 0x20-0x2F. This includes these
  362. // characters from ' ', '!', '"', '#', '$', '%', '&', ”', '(', ')', '*', '+',
  363. // ',', '-', '.', '/'.
  364. // Zero is returned if the sequence does not have an intermediate byte.
  365. func (c Command) Intermediate() int {
  366. return parser.Intermediate(int(c))
  367. }
  368. // Command returns the unpacked command byte of the CSI sequence.
  369. func (c Command) Command() int {
  370. return parser.Command(int(c))
  371. }
  372. // Cmd returns a packed [Command] with the given command, marker, and
  373. // intermediate.
  374. // The first byte is the command, the next shift is the marker, and the next
  375. // shift is the intermediate.
  376. //
  377. // Even though this function takes integers, it only uses the lower 8 bits of
  378. // each integer.
  379. func Cmd(marker, inter, cmd int) (c Command) {
  380. c = Command(cmd & parser.CommandMask)
  381. c |= Command(marker&parser.CommandMask) << parser.MarkerShift
  382. c |= Command(inter&parser.CommandMask) << parser.IntermedShift
  383. return
  384. }
// Parameter represents a sequence parameter. Sequence parameters with
// sub-parameters are packed with the HasMoreFlag set. This is used to unpack
// the parameters from CSI and DCS sequences.
//
// Use [Param] to build a packed value and the Param and HasMore methods to
// read it back.
type Parameter int
  389. // Param returns the unpacked parameter at the given index.
  390. // It returns the default value if the parameter is missing.
  391. func (s Parameter) Param(def int) int {
  392. p := int(s) & parser.ParamMask
  393. if p == parser.MissingParam {
  394. return def
  395. }
  396. return p
  397. }
  398. // HasMore unpacks the HasMoreFlag from the parameter.
  399. func (s Parameter) HasMore() bool {
  400. return s&parser.HasMoreFlag != 0
  401. }
  402. // Param returns a packed [Parameter] with the given parameter and whether this
  403. // parameter has following sub-parameters.
  404. func Param(p int, hasMore bool) (s Parameter) {
  405. s = Parameter(p & parser.ParamMask)
  406. if hasMore {
  407. s |= Parameter(parser.HasMoreFlag)
  408. }
  409. return
  410. }