parser.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413
  1. package ansi
  2. import (
  3. "unicode/utf8"
  4. "unsafe"
  5. "github.com/charmbracelet/x/ansi/parser"
  6. )
// ParserDispatcher is a callback that receives a parsed [Sequence].
// A [Parser] calls its dispatcher, when one is set, for every printable
// rune, control code, and completed escape sequence it recognizes.
type ParserDispatcher func(Sequence)
// Parser represents a DEC ANSI compatible sequence parser.
//
// It uses a state machine to parse ANSI escape sequences and control
// characters. The parser is designed to be used with a terminal emulator or
// similar application that needs to parse ANSI escape sequences and control
// characters.
// See package [parser] for more information.
//
//go:generate go run ./gen.go
type Parser struct {
	// dispatcher is the function called when a sequence is complete.
	// It may be nil, in which case nothing is dispatched.
	dispatcher ParserDispatcher
	// params contains the raw parameters of the sequence.
	// These parameters are used when constructing CSI and DCS sequences.
	params []int
	// data contains the raw data of the sequence.
	// This data is used when constructing OSC, DCS, SOS, PM, and APC
	// sequences.
	data []byte
	// dataLen is the number of bytes currently stored in data.
	// If dataLen is -1, the data buffer is unlimited and grows as needed; in
	// that mode the length of data itself tracks the collected bytes.
	// Otherwise, dataLen is limited by the size of the data buffer.
	dataLen int
	// paramsLen keeps track of the number of parameters.
	// This is limited by the size of the params buffer.
	//
	// This is also used when collecting UTF-8 runes to keep track of the
	// number of rune bytes collected.
	paramsLen int
	// cmd contains the raw command along with the private marker and
	// intermediate bytes of the sequence.
	// The first lower byte contains the command byte, the next byte contains
	// the private marker, and the next byte contains the intermediate byte.
	//
	// This is also used when collecting UTF-8 runes, treating it as a slice
	// of 4 bytes with the first rune byte stored in the lowest 8 bits.
	cmd int
	// state is the current state of the parser.
	state byte
}
  48. // NewParser returns a new parser with an optional [ParserDispatcher].
  49. // The [Parser] uses a default size of 32 for the parameters and 64KB for the
  50. // data buffer. Use [Parser.SetParamsSize] and [Parser.SetDataSize] to set the
  51. // size of the parameters and data buffer respectively.
  52. func NewParser(d ParserDispatcher) *Parser {
  53. p := new(Parser)
  54. p.SetDispatcher(d)
  55. p.SetParamsSize(parser.MaxParamsSize)
  56. p.SetDataSize(1024 * 64) // 64KB data buffer
  57. return p
  58. }
// SetDispatcher sets the dispatcher function to call when a sequence is
// complete. Passing nil turns dispatching off: the parser checks for a nil
// dispatcher before every call.
func (p *Parser) SetDispatcher(d ParserDispatcher) {
	p.dispatcher = d
}
// SetParamsSize sets the size of the parameters buffer.
// This is used when constructing CSI and DCS sequences.
//
// A fresh, zeroed buffer of size entries replaces the previous one, so any
// parameter values collected so far are discarded. The parameter count
// (paramsLen) is not reset here.
//
// NOTE(review): a negative size makes the allocation panic, and shrinking
// the buffer while parameters are being collected leaves paramsLen larger
// than the buffer — confirm callers only resize an idle parser.
func (p *Parser) SetParamsSize(size int) {
	p.params = make([]int, size)
}
  69. // SetDataSize sets the size of the data buffer.
  70. // This is used when constructing OSC, DCS, SOS, PM, and APC sequences.
  71. // If size is less than or equal to 0, the data buffer is unlimited and will
  72. // grow as needed.
  73. func (p *Parser) SetDataSize(size int) {
  74. if size <= 0 {
  75. size = 0
  76. p.dataLen = -1
  77. }
  78. p.data = make([]byte, size)
  79. }
  80. // Params returns the list of parsed packed parameters.
  81. func (p *Parser) Params() []Parameter {
  82. return unsafe.Slice((*Parameter)(unsafe.Pointer(&p.params[0])), p.paramsLen)
  83. }
  84. // Param returns the parameter at the given index and falls back to the default
  85. // value if the parameter is missing. If the index is out of bounds, it returns
  86. // the default value and false.
  87. func (p *Parser) Param(i, def int) (int, bool) {
  88. if i < 0 || i >= p.paramsLen {
  89. return def, false
  90. }
  91. return Parameter(p.params[i]).Param(def), true
  92. }
// Cmd returns the packed command of the last dispatched sequence.
// The value is the parser's internal cmd field converted to a [Command].
func (p *Parser) Cmd() Command {
	return Command(p.cmd)
}
  97. // Rune returns the last dispatched sequence as a rune.
  98. func (p *Parser) Rune() rune {
  99. rw := utf8ByteLen(byte(p.cmd & 0xff))
  100. if rw == -1 {
  101. return utf8.RuneError
  102. }
  103. r, _ := utf8.DecodeRune((*[utf8.UTFMax]byte)(unsafe.Pointer(&p.cmd))[:rw])
  104. return r
  105. }
  106. // Data returns the raw data of the last dispatched sequence.
  107. func (p *Parser) Data() []byte {
  108. return p.data[:p.dataLen]
  109. }
  110. // Reset resets the parser to its initial state.
  111. func (p *Parser) Reset() {
  112. p.clear()
  113. p.state = parser.GroundState
  114. }
  115. // clear clears the parser parameters and command.
  116. func (p *Parser) clear() {
  117. if len(p.params) > 0 {
  118. p.params[0] = parser.MissingParam
  119. }
  120. p.paramsLen = 0
  121. p.cmd = 0
  122. }
// State returns the current state of the parser, i.e. the state reached
// after the most recent byte was processed.
func (p *Parser) State() parser.State {
	return p.state
}
// StateName returns the name of the current state, looked up in
// [parser.StateNames] using the current state value as the key.
func (p *Parser) StateName() string {
	return parser.StateNames[p.state]
}
  131. // Parse parses the given dispatcher and byte buffer.
  132. // Deprecated: Loop over the buffer and call [Parser.Advance] instead.
  133. func (p *Parser) Parse(b []byte) {
  134. for i := 0; i < len(b); i++ {
  135. p.Advance(b[i])
  136. }
  137. }
  138. // Advance advances the parser using the given byte. It returns the action
  139. // performed by the parser.
  140. func (p *Parser) Advance(b byte) parser.Action {
  141. switch p.state {
  142. case parser.Utf8State:
  143. // We handle UTF-8 here.
  144. return p.advanceUtf8(b)
  145. default:
  146. return p.advance(b)
  147. }
  148. }
  149. func (p *Parser) collectRune(b byte) {
  150. if p.paramsLen >= utf8.UTFMax {
  151. return
  152. }
  153. shift := p.paramsLen * 8
  154. p.cmd &^= 0xff << shift
  155. p.cmd |= int(b) << shift
  156. p.paramsLen++
  157. }
  158. func (p *Parser) dispatch(s Sequence) {
  159. if p.dispatcher != nil {
  160. p.dispatcher(s)
  161. }
  162. }
  163. func (p *Parser) advanceUtf8(b byte) parser.Action {
  164. // Collect UTF-8 rune bytes.
  165. p.collectRune(b)
  166. rw := utf8ByteLen(byte(p.cmd & 0xff))
  167. if rw == -1 {
  168. // We panic here because the first byte comes from the state machine,
  169. // if this panics, it means there is a bug in the state machine!
  170. panic("invalid rune") // unreachable
  171. }
  172. if p.paramsLen < rw {
  173. return parser.CollectAction
  174. }
  175. // We have enough bytes to decode the rune using unsafe
  176. p.dispatch(Rune(p.Rune()))
  177. p.state = parser.GroundState
  178. p.paramsLen = 0
  179. return parser.PrintAction
  180. }
  181. func (p *Parser) advance(b byte) parser.Action {
  182. state, action := parser.Table.Transition(p.state, b)
  183. // We need to clear the parser state if the state changes from EscapeState.
  184. // This is because when we enter the EscapeState, we don't get a chance to
  185. // clear the parser state. For example, when a sequence terminates with a
  186. // ST (\x1b\\ or \x9c), we dispatch the current sequence and transition to
  187. // EscapeState. However, the parser state is not cleared in this case and
  188. // we need to clear it here before dispatching the esc sequence.
  189. if p.state != state {
  190. if p.state == parser.EscapeState {
  191. p.performAction(parser.ClearAction, state, b)
  192. }
  193. if action == parser.PutAction &&
  194. p.state == parser.DcsEntryState && state == parser.DcsStringState {
  195. // XXX: This is a special case where we need to start collecting
  196. // non-string parameterized data i.e. doesn't follow the ECMA-48 §
  197. // 5.4.1 string parameters format.
  198. p.performAction(parser.StartAction, state, 0)
  199. }
  200. }
  201. // Handle special cases
  202. switch {
  203. case b == ESC && p.state == parser.EscapeState:
  204. // Two ESCs in a row
  205. p.performAction(parser.ExecuteAction, state, b)
  206. default:
  207. p.performAction(action, state, b)
  208. }
  209. p.state = state
  210. return action
  211. }
  212. func (p *Parser) parseStringCmd() {
  213. // Try to parse the command
  214. datalen := len(p.data)
  215. if p.dataLen >= 0 {
  216. datalen = p.dataLen
  217. }
  218. for i := 0; i < datalen; i++ {
  219. d := p.data[i]
  220. if d < '0' || d > '9' {
  221. break
  222. }
  223. if p.cmd == parser.MissingCommand {
  224. p.cmd = 0
  225. }
  226. p.cmd *= 10
  227. p.cmd += int(d - '0')
  228. }
  229. }
  230. func (p *Parser) performAction(action parser.Action, state parser.State, b byte) {
  231. switch action {
  232. case parser.IgnoreAction:
  233. break
  234. case parser.ClearAction:
  235. p.clear()
  236. case parser.PrintAction:
  237. p.dispatch(Rune(b))
  238. case parser.ExecuteAction:
  239. p.dispatch(ControlCode(b))
  240. case parser.MarkerAction:
  241. // Collect private marker
  242. // we only store the last marker
  243. p.cmd &^= 0xff << parser.MarkerShift
  244. p.cmd |= int(b) << parser.MarkerShift
  245. case parser.CollectAction:
  246. if state == parser.Utf8State {
  247. // Reset the UTF-8 counter
  248. p.paramsLen = 0
  249. p.collectRune(b)
  250. } else {
  251. // Collect intermediate bytes
  252. // we only store the last intermediate byte
  253. p.cmd &^= 0xff << parser.IntermedShift
  254. p.cmd |= int(b) << parser.IntermedShift
  255. }
  256. case parser.ParamAction:
  257. // Collect parameters
  258. if p.paramsLen >= len(p.params) {
  259. break
  260. }
  261. if b >= '0' && b <= '9' {
  262. if p.params[p.paramsLen] == parser.MissingParam {
  263. p.params[p.paramsLen] = 0
  264. }
  265. p.params[p.paramsLen] *= 10
  266. p.params[p.paramsLen] += int(b - '0')
  267. }
  268. if b == ':' {
  269. p.params[p.paramsLen] |= parser.HasMoreFlag
  270. }
  271. if b == ';' || b == ':' {
  272. p.paramsLen++
  273. if p.paramsLen < len(p.params) {
  274. p.params[p.paramsLen] = parser.MissingParam
  275. }
  276. }
  277. case parser.StartAction:
  278. if p.dataLen < 0 && p.data != nil {
  279. p.data = p.data[:0]
  280. } else {
  281. p.dataLen = 0
  282. }
  283. if p.state >= parser.DcsEntryState && p.state <= parser.DcsStringState {
  284. // Collect the command byte for DCS
  285. p.cmd |= int(b)
  286. } else {
  287. p.cmd = parser.MissingCommand
  288. }
  289. case parser.PutAction:
  290. switch p.state {
  291. case parser.OscStringState:
  292. if b == ';' && p.cmd == parser.MissingCommand {
  293. p.parseStringCmd()
  294. }
  295. }
  296. if p.dataLen < 0 {
  297. p.data = append(p.data, b)
  298. } else {
  299. if p.dataLen < len(p.data) {
  300. p.data[p.dataLen] = b
  301. p.dataLen++
  302. }
  303. }
  304. case parser.DispatchAction:
  305. // Increment the last parameter
  306. if p.paramsLen > 0 && p.paramsLen < len(p.params)-1 ||
  307. p.paramsLen == 0 && len(p.params) > 0 && p.params[0] != parser.MissingParam {
  308. p.paramsLen++
  309. }
  310. if p.state == parser.OscStringState && p.cmd == parser.MissingCommand {
  311. // Ensure we have a command for OSC
  312. p.parseStringCmd()
  313. }
  314. if p.dispatcher == nil {
  315. break
  316. }
  317. var seq Sequence
  318. data := p.data
  319. if p.dataLen >= 0 {
  320. data = data[:p.dataLen]
  321. }
  322. switch p.state {
  323. case parser.CsiEntryState, parser.CsiParamState, parser.CsiIntermediateState:
  324. p.cmd |= int(b)
  325. seq = CsiSequence{Cmd: Command(p.cmd), Params: p.Params()}
  326. case parser.EscapeState, parser.EscapeIntermediateState:
  327. p.cmd |= int(b)
  328. seq = EscSequence(p.cmd)
  329. case parser.DcsEntryState, parser.DcsParamState, parser.DcsIntermediateState, parser.DcsStringState:
  330. seq = DcsSequence{Cmd: Command(p.cmd), Params: p.Params(), Data: data}
  331. case parser.OscStringState:
  332. seq = OscSequence{Cmd: p.cmd, Data: data}
  333. case parser.SosStringState:
  334. seq = SosSequence{Data: data}
  335. case parser.PmStringState:
  336. seq = PmSequence{Data: data}
  337. case parser.ApcStringState:
  338. seq = ApcSequence{Data: data}
  339. }
  340. p.dispatch(seq)
  341. }
  342. }
  343. func utf8ByteLen(b byte) int {
  344. if b <= 0b0111_1111 { // 0x00-0x7F
  345. return 1
  346. } else if b >= 0b1100_0000 && b <= 0b1101_1111 { // 0xC0-0xDF
  347. return 2
  348. } else if b >= 0b1110_0000 && b <= 0b1110_1111 { // 0xE0-0xEF
  349. return 3
  350. } else if b >= 0b1111_0000 && b <= 0b1111_0111 { // 0xF0-0xF7
  351. return 4
  352. }
  353. return -1
  354. }