reader.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660
  1. package text
  2. import (
  3. "bytes"
  4. "io"
  5. "regexp"
  6. "unicode/utf8"
  7. "github.com/yuin/goldmark/util"
  8. )
  9. const invalidValue = -1
  10. // EOF indicates the end of file.
  11. const EOF = byte(0xff)
// A Reader interface provides abstracted methods for reading text.
type Reader interface {
	io.RuneReader

	// Source returns the source of the reader.
	Source() []byte

	// ResetPosition resets positions.
	ResetPosition()

	// Peek returns a byte at the current position without advancing the
	// internal pointer.
	Peek() byte

	// PeekLine returns the current line without advancing the internal pointer.
	PeekLine() ([]byte, Segment)

	// PrecendingCharacter returns the character just before the current
	// internal pointer.
	PrecendingCharacter() rune

	// Value returns the value of the given segment.
	Value(Segment) []byte

	// LineOffset returns the distance from the line head to the current
	// position.
	LineOffset() int

	// Position returns the current line number and position.
	Position() (int, Segment)

	// SetPosition sets the current line number and position.
	SetPosition(int, Segment)

	// SetPadding sets padding to the reader.
	SetPadding(int)

	// Advance advances the internal pointer.
	Advance(int)

	// AdvanceAndSetPadding advances the internal pointer and adds padding to
	// the reader.
	AdvanceAndSetPadding(int, int)

	// AdvanceLine advances the internal pointer to the next line head.
	AdvanceLine()

	// SkipSpaces skips space characters and returns a non-blank line.
	// If it reaches EOF, returns false.
	SkipSpaces() (Segment, int, bool)

	// SkipBlankLines skips blank lines and returns a non-blank line.
	// If it reaches EOF, returns false.
	SkipBlankLines() (Segment, int, bool)

	// Match performs regular expression matching against the current line.
	Match(reg *regexp.Regexp) bool

	// FindSubMatch performs regular expression searching against the current
	// line and returns the matched groups.
	FindSubMatch(reg *regexp.Regexp) [][]byte

	// FindClosure finds the corresponding closure.
	FindClosure(opener, closer byte, options FindClosureOptions) (*Segments, bool)
}
// FindClosureOptions is options for Reader.FindClosure.
type FindClosureOptions struct {
	// CodeSpan is a flag for the FindClosure. If this is set to true,
	// FindClosure ignores closers (and openers) that appear inside backtick
	// delimited code spans.
	CodeSpan bool

	// Nesting is a flag for the FindClosure. If this is set to true,
	// FindClosure allows nesting; otherwise meeting another opener before the
	// closer makes the search fail.
	Nesting bool

	// Newline is a flag for the FindClosure. If this is set to true,
	// FindClosure searches for a closer over multiple lines.
	Newline bool

	// Advance is a flag for the FindClosure. If this is set to true,
	// FindClosure advances pointers when closer is found. If it is false the
	// reader position is restored before FindClosure returns.
	Advance bool
}
// reader is the Reader implementation over a single contiguous byte slice.
type reader struct {
	source       []byte  // text being read
	sourceLength int     // len(source), captured by NewReader
	line         int     // current line number; -1 before the first AdvanceLine
	peekedLine   []byte  // cache filled by PeekLine; nil when not cached
	pos          Segment // current position within the current line
	head         int     // offset of the current line head, set by AdvanceLine
	lineOffset   int     // cached LineOffset result; negative means not computed
}
  79. // NewReader return a new Reader that can read UTF-8 bytes .
  80. func NewReader(source []byte) Reader {
  81. r := &reader{
  82. source: source,
  83. sourceLength: len(source),
  84. }
  85. r.ResetPosition()
  86. return r
  87. }
// FindClosure implements Reader.FindClosure by delegating to the shared
// findClosureReader helper.
func (r *reader) FindClosure(opener, closer byte, options FindClosureOptions) (*Segments, bool) {
	return findClosureReader(r, opener, closer, options)
}
  91. func (r *reader) ResetPosition() {
  92. r.line = -1
  93. r.head = 0
  94. r.lineOffset = -1
  95. r.AdvanceLine()
  96. }
// Source returns the byte slice this reader was created with.
func (r *reader) Source() []byte {
	return r.source
}
// Value returns the value of seg evaluated against the reader's source.
func (r *reader) Value(seg Segment) []byte {
	return seg.Value(r.source)
}
  103. func (r *reader) Peek() byte {
  104. if r.pos.Start >= 0 && r.pos.Start < r.sourceLength {
  105. if r.pos.Padding != 0 {
  106. return space[0]
  107. }
  108. return r.source[r.pos.Start]
  109. }
  110. return EOF
  111. }
  112. func (r *reader) PeekLine() ([]byte, Segment) {
  113. if r.pos.Start >= 0 && r.pos.Start < r.sourceLength {
  114. if r.peekedLine == nil {
  115. r.peekedLine = r.pos.Value(r.Source())
  116. }
  117. return r.peekedLine, r.pos
  118. }
  119. return nil, r.pos
  120. }
// ReadRune implements the io.RuneReader interface by delegating to
// readRuneReader.
func (r *reader) ReadRune() (rune, int, error) {
	return readRuneReader(r)
}
  125. func (r *reader) LineOffset() int {
  126. if r.lineOffset < 0 {
  127. v := 0
  128. for i := r.head; i < r.pos.Start; i++ {
  129. if r.source[i] == '\t' {
  130. v += util.TabWidth(v)
  131. } else {
  132. v++
  133. }
  134. }
  135. r.lineOffset = v - r.pos.Padding
  136. }
  137. return r.lineOffset
  138. }
  139. func (r *reader) PrecendingCharacter() rune {
  140. if r.pos.Start <= 0 {
  141. if r.pos.Padding != 0 {
  142. return rune(' ')
  143. }
  144. return rune('\n')
  145. }
  146. i := r.pos.Start - 1
  147. for ; i >= 0; i-- {
  148. if utf8.RuneStart(r.source[i]) {
  149. break
  150. }
  151. }
  152. rn, _ := utf8.DecodeRune(r.source[i:])
  153. return rn
  154. }
  155. func (r *reader) Advance(n int) {
  156. r.lineOffset = -1
  157. if n < len(r.peekedLine) && r.pos.Padding == 0 {
  158. r.pos.Start += n
  159. r.peekedLine = nil
  160. return
  161. }
  162. r.peekedLine = nil
  163. l := r.sourceLength
  164. for ; n > 0 && r.pos.Start < l; n-- {
  165. if r.pos.Padding != 0 {
  166. r.pos.Padding--
  167. continue
  168. }
  169. if r.source[r.pos.Start] == '\n' {
  170. r.AdvanceLine()
  171. continue
  172. }
  173. r.pos.Start++
  174. }
  175. }
  176. func (r *reader) AdvanceAndSetPadding(n, padding int) {
  177. r.Advance(n)
  178. if padding > r.pos.Padding {
  179. r.SetPadding(padding)
  180. }
  181. }
  182. func (r *reader) AdvanceLine() {
  183. r.lineOffset = -1
  184. r.peekedLine = nil
  185. r.pos.Start = r.pos.Stop
  186. r.head = r.pos.Start
  187. if r.pos.Start < 0 {
  188. return
  189. }
  190. r.pos.Stop = r.sourceLength
  191. for i := r.pos.Start; i < r.sourceLength; i++ {
  192. c := r.source[i]
  193. if c == '\n' {
  194. r.pos.Stop = i + 1
  195. break
  196. }
  197. }
  198. r.line++
  199. r.pos.Padding = 0
  200. }
// Position returns the current line number and the current position segment.
func (r *reader) Position() (int, Segment) {
	return r.line, r.pos
}
  204. func (r *reader) SetPosition(line int, pos Segment) {
  205. r.lineOffset = -1
  206. r.line = line
  207. r.pos = pos
  208. }
  209. func (r *reader) SetPadding(v int) {
  210. r.pos.Padding = v
  211. }
// SkipSpaces implements Reader.SkipSpaces by delegating to skipSpacesReader.
func (r *reader) SkipSpaces() (Segment, int, bool) {
	return skipSpacesReader(r)
}
// SkipBlankLines implements Reader.SkipBlankLines by delegating to
// skipBlankLinesReader.
func (r *reader) SkipBlankLines() (Segment, int, bool) {
	return skipBlankLinesReader(r)
}
// Match implements Reader.Match by delegating to matchReader.
func (r *reader) Match(reg *regexp.Regexp) bool {
	return matchReader(r, reg)
}
// FindSubMatch implements Reader.FindSubMatch by delegating to
// findSubMatchReader.
func (r *reader) FindSubMatch(reg *regexp.Regexp) [][]byte {
	return findSubMatchReader(r, reg)
}
// A BlockReader interface is a reader that is optimized for Blocks.
type BlockReader interface {
	Reader

	// Reset resets the current state and sets new segments to the reader.
	Reset(segment *Segments)
}
// blockReader is the BlockReader implementation; it reads source through a
// list of segments, one segment per line.
type blockReader struct {
	source         []byte    // underlying text the segments point into
	segments       *Segments // lines of the block; nil until Reset is called
	segmentsLength int       // segments.Len(), captured by Reset
	line           int       // index of the current segment; -1 before the first AdvanceLine
	pos            Segment   // current position within the current segment
	head           int       // Start of the current line's segment
	last           int       // Stop of the last segment; 0 when there are no segments
	lineOffset     int       // cached LineOffset result; negative means not computed
}
  240. // NewBlockReader returns a new BlockReader.
  241. func NewBlockReader(source []byte, segments *Segments) BlockReader {
  242. r := &blockReader{
  243. source: source,
  244. }
  245. if segments != nil {
  246. r.Reset(segments)
  247. }
  248. return r
  249. }
// FindClosure implements Reader.FindClosure by delegating to the shared
// findClosureReader helper.
func (r *blockReader) FindClosure(opener, closer byte, options FindClosureOptions) (*Segments, bool) {
	return findClosureReader(r, opener, closer, options)
}
  253. func (r *blockReader) ResetPosition() {
  254. r.line = -1
  255. r.head = 0
  256. r.last = 0
  257. r.lineOffset = -1
  258. r.pos.Start = -1
  259. r.pos.Stop = -1
  260. r.pos.Padding = 0
  261. if r.segmentsLength > 0 {
  262. last := r.segments.At(r.segmentsLength - 1)
  263. r.last = last.Stop
  264. }
  265. r.AdvanceLine()
  266. }
// Reset replaces the segments the reader iterates over, records their count
// and rewinds the reader with ResetPosition. segments must not be nil because
// its Len method is called.
func (r *blockReader) Reset(segments *Segments) {
	r.segments = segments
	r.segmentsLength = segments.Len()
	r.ResetPosition()
}
// Source returns the byte slice this reader was created with.
func (r *blockReader) Source() []byte {
	return r.source
}
  275. func (r *blockReader) Value(seg Segment) []byte {
  276. line := r.segmentsLength - 1
  277. ret := make([]byte, 0, seg.Stop-seg.Start+1)
  278. for ; line >= 0; line-- {
  279. if seg.Start >= r.segments.At(line).Start {
  280. break
  281. }
  282. }
  283. i := seg.Start
  284. for ; line < r.segmentsLength; line++ {
  285. s := r.segments.At(line)
  286. if i < 0 {
  287. i = s.Start
  288. }
  289. ret = s.ConcatPadding(ret)
  290. for ; i < seg.Stop && i < s.Stop; i++ {
  291. ret = append(ret, r.source[i])
  292. }
  293. i = -1
  294. if s.Stop > seg.Stop {
  295. break
  296. }
  297. }
  298. return ret
  299. }
// ReadRune implements the io.RuneReader interface by delegating to
// readRuneReader.
func (r *blockReader) ReadRune() (rune, int, error) {
	return readRuneReader(r)
}
  304. func (r *blockReader) PrecendingCharacter() rune {
  305. if r.pos.Padding != 0 {
  306. return rune(' ')
  307. }
  308. if r.segments.Len() < 1 {
  309. return rune('\n')
  310. }
  311. firstSegment := r.segments.At(0)
  312. if r.line == 0 && r.pos.Start <= firstSegment.Start {
  313. return rune('\n')
  314. }
  315. l := len(r.source)
  316. i := r.pos.Start - 1
  317. for ; i < l && i >= 0; i-- {
  318. if utf8.RuneStart(r.source[i]) {
  319. break
  320. }
  321. }
  322. if i < 0 || i >= l {
  323. return rune('\n')
  324. }
  325. rn, _ := utf8.DecodeRune(r.source[i:])
  326. return rn
  327. }
  328. func (r *blockReader) LineOffset() int {
  329. if r.lineOffset < 0 {
  330. v := 0
  331. for i := r.head; i < r.pos.Start; i++ {
  332. if r.source[i] == '\t' {
  333. v += util.TabWidth(v)
  334. } else {
  335. v++
  336. }
  337. }
  338. r.lineOffset = v - r.pos.Padding
  339. }
  340. return r.lineOffset
  341. }
  342. func (r *blockReader) Peek() byte {
  343. if r.line < r.segmentsLength && r.pos.Start >= 0 && r.pos.Start < r.last {
  344. if r.pos.Padding != 0 {
  345. return space[0]
  346. }
  347. return r.source[r.pos.Start]
  348. }
  349. return EOF
  350. }
  351. func (r *blockReader) PeekLine() ([]byte, Segment) {
  352. if r.line < r.segmentsLength && r.pos.Start >= 0 && r.pos.Start < r.last {
  353. return r.pos.Value(r.source), r.pos
  354. }
  355. return nil, r.pos
  356. }
  357. func (r *blockReader) Advance(n int) {
  358. r.lineOffset = -1
  359. if n < r.pos.Stop-r.pos.Start && r.pos.Padding == 0 {
  360. r.pos.Start += n
  361. return
  362. }
  363. for ; n > 0; n-- {
  364. if r.pos.Padding != 0 {
  365. r.pos.Padding--
  366. continue
  367. }
  368. if r.pos.Start >= r.pos.Stop-1 && r.pos.Stop < r.last {
  369. r.AdvanceLine()
  370. continue
  371. }
  372. r.pos.Start++
  373. }
  374. }
  375. func (r *blockReader) AdvanceAndSetPadding(n, padding int) {
  376. r.Advance(n)
  377. if padding > r.pos.Padding {
  378. r.SetPadding(padding)
  379. }
  380. }
  381. func (r *blockReader) AdvanceLine() {
  382. r.SetPosition(r.line+1, NewSegment(invalidValue, invalidValue))
  383. r.head = r.pos.Start
  384. }
// Position returns the current line number and the current position segment.
func (r *blockReader) Position() (int, Segment) {
	return r.line, r.pos
}
  388. func (r *blockReader) SetPosition(line int, pos Segment) {
  389. r.lineOffset = -1
  390. r.line = line
  391. if pos.Start == invalidValue {
  392. if r.line < r.segmentsLength {
  393. s := r.segments.At(line)
  394. r.head = s.Start
  395. r.pos = s
  396. }
  397. } else {
  398. r.pos = pos
  399. if r.line < r.segmentsLength {
  400. s := r.segments.At(line)
  401. r.head = s.Start
  402. }
  403. }
  404. }
// SetPadding sets the padding of the current position and invalidates the
// cached line offset, which depends on the padding.
func (r *blockReader) SetPadding(v int) {
	r.lineOffset = -1
	r.pos.Padding = v
}
// SkipSpaces implements Reader.SkipSpaces by delegating to skipSpacesReader.
func (r *blockReader) SkipSpaces() (Segment, int, bool) {
	return skipSpacesReader(r)
}
// SkipBlankLines implements Reader.SkipBlankLines by delegating to
// skipBlankLinesReader.
func (r *blockReader) SkipBlankLines() (Segment, int, bool) {
	return skipBlankLinesReader(r)
}
// Match implements Reader.Match by delegating to matchReader.
func (r *blockReader) Match(reg *regexp.Regexp) bool {
	return matchReader(r, reg)
}
// FindSubMatch implements Reader.FindSubMatch by delegating to
// findSubMatchReader.
func (r *blockReader) FindSubMatch(reg *regexp.Regexp) [][]byte {
	return findSubMatchReader(r, reg)
}
  421. func skipBlankLinesReader(r Reader) (Segment, int, bool) {
  422. lines := 0
  423. for {
  424. line, seg := r.PeekLine()
  425. if line == nil {
  426. return seg, lines, false
  427. }
  428. if util.IsBlank(line) {
  429. lines++
  430. r.AdvanceLine()
  431. } else {
  432. return seg, lines, true
  433. }
  434. }
  435. }
  436. func skipSpacesReader(r Reader) (Segment, int, bool) {
  437. chars := 0
  438. for {
  439. line, segment := r.PeekLine()
  440. if line == nil {
  441. return segment, chars, false
  442. }
  443. for i, c := range line {
  444. if util.IsSpace(c) {
  445. chars++
  446. r.Advance(1)
  447. continue
  448. }
  449. return segment.WithStart(segment.Start + i + 1), chars, true
  450. }
  451. }
  452. }
  453. func matchReader(r Reader, reg *regexp.Regexp) bool {
  454. oldline, oldseg := r.Position()
  455. match := reg.FindReaderSubmatchIndex(r)
  456. r.SetPosition(oldline, oldseg)
  457. if match == nil {
  458. return false
  459. }
  460. r.Advance(match[1] - match[0])
  461. return true
  462. }
  463. func findSubMatchReader(r Reader, reg *regexp.Regexp) [][]byte {
  464. oldLine, oldSeg := r.Position()
  465. match := reg.FindReaderSubmatchIndex(r)
  466. r.SetPosition(oldLine, oldSeg)
  467. if match == nil {
  468. return nil
  469. }
  470. var bb bytes.Buffer
  471. bb.Grow(match[1] - match[0])
  472. for i := 0; i < match[1]; {
  473. r, size, _ := readRuneReader(r)
  474. i += size
  475. bb.WriteRune(r)
  476. }
  477. bs := bb.Bytes()
  478. var result [][]byte
  479. for i := 0; i < len(match); i += 2 {
  480. if match[i] < 0 {
  481. result = append(result, []byte{})
  482. continue
  483. }
  484. result = append(result, bs[match[i]:match[i+1]])
  485. }
  486. r.SetPosition(oldLine, oldSeg)
  487. r.Advance(match[1] - match[0])
  488. return result
  489. }
  490. func readRuneReader(r Reader) (rune, int, error) {
  491. line, _ := r.PeekLine()
  492. if line == nil {
  493. return 0, 0, io.EOF
  494. }
  495. rn, size := utf8.DecodeRune(line)
  496. if rn == utf8.RuneError {
  497. return 0, 0, io.EOF
  498. }
  499. r.Advance(size)
  500. return rn, size, nil
  501. }
// findClosureReader scans forward from the reader's current position for the
// closer byte that balances an opener which is treated as already consumed
// (the nesting depth starts at 1).
//
// On success it returns the segments lying between the starting position and
// the closer (one per line crossed, the last one stopping just before the
// closer) and true. Otherwise it returns nil and false. Unless opts.Advance is
// set, the reader's position is restored before returning in either case.
//
// Backslash escapes of punctuation are skipped outside code spans. With
// opts.CodeSpan, openers and closers inside backtick runs of matching length
// are ignored. Without opts.Nesting, meeting another opener fails the search.
// Without opts.Newline, only the current line is searched.
func findClosureReader(r Reader, opener, closer byte, opts FindClosureOptions) (*Segments, bool) {
	opened := 1         // current nesting depth
	codeSpanOpener := 0 // length of the backtick run that opened the current code span; 0 when outside one
	closed := false
	orgline, orgpos := r.Position()
	var ret *Segments
	for {
		bs, seg := r.PeekLine()
		if bs == nil {
			goto end
		}
		i := 0
		for i < len(bs) {
			c := bs[i]
			if opts.CodeSpan && codeSpanOpener != 0 && c == '`' {
				// Inside a code span: count this backtick run and close the
				// span only if its length matches the opening run.
				codeSpanCloser := 0
				for ; i < len(bs); i++ {
					if bs[i] == '`' {
						codeSpanCloser++
					} else {
						// Step back so the i++ at the bottom of the outer
						// loop lands on this non-backtick byte.
						i--
						break
					}
				}
				if codeSpanCloser == codeSpanOpener {
					codeSpanOpener = 0
				}
			} else if codeSpanOpener == 0 && c == '\\' && i < len(bs)-1 && util.IsPunct(bs[i+1]) {
				// Escaped punctuation: skip the backslash and the escaped byte.
				i += 2
				continue
			} else if opts.CodeSpan && codeSpanOpener == 0 && c == '`' {
				// Start of a code span: remember the length of the backtick run.
				for ; i < len(bs); i++ {
					if bs[i] == '`' {
						codeSpanOpener++
					} else {
						i--
						break
					}
				}
			} else if (opts.CodeSpan && codeSpanOpener == 0) || !opts.CodeSpan {
				if c == closer {
					opened--
					if opened == 0 {
						if ret == nil {
							ret = NewSegments()
						}
						// Record the text up to (not including) the closer,
						// then move past the closer.
						ret.Append(seg.WithStop(seg.Start + i))
						r.Advance(i + 1)
						closed = true
						goto end
					}
				} else if c == opener {
					if !opts.Nesting {
						goto end
					}
					opened++
				}
			}
			i++
		}
		if !opts.Newline {
			goto end
		}
		// No closer on this line: keep the whole line and continue on the next.
		r.AdvanceLine()
		if ret == nil {
			ret = NewSegments()
		}
		ret.Append(seg)
	}
end:
	if !opts.Advance {
		r.SetPosition(orgline, orgpos)
	}
	if closed {
		return ret, true
	}
	return nil, false
}