reader.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653
  1. package text
  2. import (
  3. "io"
  4. "regexp"
  5. "unicode/utf8"
  6. "github.com/yuin/goldmark/util"
  7. )
// invalidValue is the sentinel used for a Segment Start/Stop that has not
// been set. blockReader.SetPosition treats a Start equal to invalidValue as a
// request to load the segment registered for the given line.
const invalidValue = -1

// EOF indicates the end of file.
const EOF = byte(0xff)
// A Reader interface provides abstracted methods for reading text.
type Reader interface {
	io.RuneReader

	// Source returns the source of the reader.
	Source() []byte

	// ResetPosition resets positions.
	ResetPosition()

	// Peek returns a byte at current position without advancing the internal pointer.
	// It returns EOF when the current position is out of range.
	Peek() byte

	// PeekLine returns the current line without advancing the internal pointer.
	// The returned slice is nil when the current position is out of range.
	PeekLine() ([]byte, Segment)

	// PrecendingCharacter returns a character just before current internal pointer.
	PrecendingCharacter() rune

	// Value returns a value of the given segment.
	Value(Segment) []byte

	// LineOffset returns a distance from the line head to current position.
	LineOffset() int

	// Position returns current line number and position.
	Position() (int, Segment)

	// SetPosition sets current line number and position.
	SetPosition(int, Segment)

	// SetPadding sets padding to the reader.
	SetPadding(int)

	// Advance advances the internal pointer.
	Advance(int)

	// AdvanceAndSetPadding advances the internal pointer and adds padding to the
	// reader.
	AdvanceAndSetPadding(int, int)

	// AdvanceLine advances the internal pointer to the next line head.
	AdvanceLine()

	// SkipSpaces skips space characters and returns a segment, the number of
	// skipped characters and true.
	// If it reaches EOF, returns false.
	SkipSpaces() (Segment, int, bool)

	// SkipBlankLines skips blank lines and returns the segment of the first
	// non-blank line, the number of skipped lines and true.
	// If it reaches EOF, returns false.
	SkipBlankLines() (Segment, int, bool)

	// Match performs regular expression matching to current line.
	// On a match the internal pointer is advanced by the length of the match.
	Match(reg *regexp.Regexp) bool

	// FindSubMatch performs regular expression searching to current line and
	// returns the matched text followed by the submatches, or nil if there is
	// no match.
	FindSubMatch(reg *regexp.Regexp) [][]byte

	// FindClosure finds corresponding closure.
	FindClosure(opener, closer byte, options FindClosureOptions) (*Segments, bool)
}
// FindClosureOptions is a set of options for Reader.FindClosure.
type FindClosureOptions struct {
	// CodeSpan is a flag for the FindClosure. If this is set to true,
	// FindClosure ignores closers in codespans.
	CodeSpan bool

	// Nesting is a flag for the FindClosure. If this is set to true,
	// FindClosure allows nesting. If it is false, finding another opener
	// before the closer makes the search fail.
	Nesting bool

	// Newline is a flag for the FindClosure. If this is set to true,
	// FindClosure searches for a closer over multiple lines.
	Newline bool

	// Advance is a flag for the FindClosure. If this is set to true,
	// FindClosure advances pointers when closer is found. If it is false,
	// the reader position is restored before FindClosure returns.
	Advance bool
}
// reader is the Reader implementation returned by NewReader. It reads
// directly from a single contiguous source buffer.
type reader struct {
	// source is the text being read.
	source []byte
	// sourceLength caches len(source).
	sourceLength int
	// line is the current line number; ResetPosition sets it to -1 and every
	// successful AdvanceLine increments it.
	line int
	// peekedLine caches the value returned by PeekLine for the current
	// position; nil means nothing is cached.
	peekedLine []byte
	// pos is the current position: Start is the read pointer, Stop is the end
	// of the current line as computed by AdvanceLine.
	pos Segment
	// head is the source offset at which the current line starts.
	head int
	// lineOffset caches the result of LineOffset; a negative value means it
	// has to be recomputed.
	lineOffset int
}
  78. // NewReader return a new Reader that can read UTF-8 bytes .
  79. func NewReader(source []byte) Reader {
  80. r := &reader{
  81. source: source,
  82. sourceLength: len(source),
  83. }
  84. r.ResetPosition()
  85. return r
  86. }
// FindClosure implements Reader.FindClosure by delegating to
// findClosureReader.
func (r *reader) FindClosure(opener, closer byte, options FindClosureOptions) (*Segments, bool) {
	return findClosureReader(r, opener, closer, options)
}
  90. func (r *reader) ResetPosition() {
  91. r.line = -1
  92. r.head = 0
  93. r.lineOffset = -1
  94. r.AdvanceLine()
  95. }
// Source returns the underlying source buffer (not a copy).
func (r *reader) Source() []byte {
	return r.source
}
// Value returns the value of seg evaluated against the reader's source.
func (r *reader) Value(seg Segment) []byte {
	return seg.Value(r.source)
}
  102. func (r *reader) Peek() byte {
  103. if r.pos.Start >= 0 && r.pos.Start < r.sourceLength {
  104. if r.pos.Padding != 0 {
  105. return space[0]
  106. }
  107. return r.source[r.pos.Start]
  108. }
  109. return EOF
  110. }
  111. func (r *reader) PeekLine() ([]byte, Segment) {
  112. if r.pos.Start >= 0 && r.pos.Start < r.sourceLength {
  113. if r.peekedLine == nil {
  114. r.peekedLine = r.pos.Value(r.Source())
  115. }
  116. return r.peekedLine, r.pos
  117. }
  118. return nil, r.pos
  119. }
// ReadRune implements io.RuneReader by delegating to readRuneReader.
func (r *reader) ReadRune() (rune, int, error) {
	return readRuneReader(r)
}
  124. func (r *reader) LineOffset() int {
  125. if r.lineOffset < 0 {
  126. v := 0
  127. for i := r.head; i < r.pos.Start; i++ {
  128. if r.source[i] == '\t' {
  129. v += util.TabWidth(v)
  130. } else {
  131. v++
  132. }
  133. }
  134. r.lineOffset = v - r.pos.Padding
  135. }
  136. return r.lineOffset
  137. }
  138. func (r *reader) PrecendingCharacter() rune {
  139. if r.pos.Start <= 0 {
  140. if r.pos.Padding != 0 {
  141. return rune(' ')
  142. }
  143. return rune('\n')
  144. }
  145. i := r.pos.Start - 1
  146. for ; i >= 0; i-- {
  147. if utf8.RuneStart(r.source[i]) {
  148. break
  149. }
  150. }
  151. rn, _ := utf8.DecodeRune(r.source[i:])
  152. return rn
  153. }
  154. func (r *reader) Advance(n int) {
  155. r.lineOffset = -1
  156. if n < len(r.peekedLine) && r.pos.Padding == 0 {
  157. r.pos.Start += n
  158. r.peekedLine = nil
  159. return
  160. }
  161. r.peekedLine = nil
  162. l := r.sourceLength
  163. for ; n > 0 && r.pos.Start < l; n-- {
  164. if r.pos.Padding != 0 {
  165. r.pos.Padding--
  166. continue
  167. }
  168. if r.source[r.pos.Start] == '\n' {
  169. r.AdvanceLine()
  170. continue
  171. }
  172. r.pos.Start++
  173. }
  174. }
  175. func (r *reader) AdvanceAndSetPadding(n, padding int) {
  176. r.Advance(n)
  177. if padding > r.pos.Padding {
  178. r.SetPadding(padding)
  179. }
  180. }
  181. func (r *reader) AdvanceLine() {
  182. r.lineOffset = -1
  183. r.peekedLine = nil
  184. r.pos.Start = r.pos.Stop
  185. r.head = r.pos.Start
  186. if r.pos.Start < 0 {
  187. return
  188. }
  189. r.pos.Stop = r.sourceLength
  190. for i := r.pos.Start; i < r.sourceLength; i++ {
  191. c := r.source[i]
  192. if c == '\n' {
  193. r.pos.Stop = i + 1
  194. break
  195. }
  196. }
  197. r.line++
  198. r.pos.Padding = 0
  199. }
// Position returns the current line number and the current position segment.
func (r *reader) Position() (int, Segment) {
	return r.line, r.pos
}
  203. func (r *reader) SetPosition(line int, pos Segment) {
  204. r.lineOffset = -1
  205. r.line = line
  206. r.pos = pos
  207. }
  208. func (r *reader) SetPadding(v int) {
  209. r.pos.Padding = v
  210. }
// SkipSpaces implements Reader.SkipSpaces by delegating to skipSpacesReader.
func (r *reader) SkipSpaces() (Segment, int, bool) {
	return skipSpacesReader(r)
}
// SkipBlankLines implements Reader.SkipBlankLines by delegating to
// skipBlankLinesReader.
func (r *reader) SkipBlankLines() (Segment, int, bool) {
	return skipBlankLinesReader(r)
}
// Match implements Reader.Match by delegating to matchReader.
func (r *reader) Match(reg *regexp.Regexp) bool {
	return matchReader(r, reg)
}
// FindSubMatch implements Reader.FindSubMatch by delegating to
// findSubMatchReader.
func (r *reader) FindSubMatch(reg *regexp.Regexp) [][]byte {
	return findSubMatchReader(r, reg)
}
// A BlockReader interface is a reader that is optimized for Blocks.
// It reads through a list of segments of the source instead of the whole
// source.
type BlockReader interface {
	Reader

	// Reset resets current state and sets new segments to the reader.
	Reset(segment *Segments)
}
// blockReader is the BlockReader implementation returned by NewBlockReader.
type blockReader struct {
	// source is the full text the segments point into.
	source []byte
	// segments holds one segment per line of the block.
	segments *Segments
	// segmentsLength caches segments.Len() as of the last Reset.
	segmentsLength int
	// line is the index of the current segment; ResetPosition sets it to -1
	// before advancing to the first one.
	line int
	// pos is the current position within the current segment.
	pos Segment
	// head is the source offset at which the current segment starts.
	head int
	// last is the Stop offset of the final segment (0 when there are no
	// segments); positions at or beyond it are treated as EOF.
	last int
	// lineOffset caches the result of LineOffset; a negative value means it
	// has to be recomputed.
	lineOffset int
}
  239. // NewBlockReader returns a new BlockReader.
  240. func NewBlockReader(source []byte, segments *Segments) BlockReader {
  241. r := &blockReader{
  242. source: source,
  243. }
  244. if segments != nil {
  245. r.Reset(segments)
  246. }
  247. return r
  248. }
// FindClosure implements Reader.FindClosure by delegating to
// findClosureReader.
func (r *blockReader) FindClosure(opener, closer byte, options FindClosureOptions) (*Segments, bool) {
	return findClosureReader(r, opener, closer, options)
}
  252. func (r *blockReader) ResetPosition() {
  253. r.line = -1
  254. r.head = 0
  255. r.last = 0
  256. r.lineOffset = -1
  257. r.pos.Start = -1
  258. r.pos.Stop = -1
  259. r.pos.Padding = 0
  260. if r.segmentsLength > 0 {
  261. last := r.segments.At(r.segmentsLength - 1)
  262. r.last = last.Stop
  263. }
  264. r.AdvanceLine()
  265. }
// Reset replaces the segments the reader iterates over, caches their count
// and rewinds the reader to the first segment.
func (r *blockReader) Reset(segments *Segments) {
	r.segments = segments
	r.segmentsLength = segments.Len()
	r.ResetPosition()
}
// Source returns the underlying source buffer (not a copy).
func (r *blockReader) Source() []byte {
	return r.source
}
// Value returns the bytes covered by seg, assembled segment by segment from
// the reader's segments into a newly allocated slice.
//
// It first searches backwards for the last segment whose Start is not greater
// than seg.Start, then walks forward from there. For every visited segment it
// calls ConcatPadding on the result (presumably appending that segment's
// padding - confirm against Segment) and copies source bytes up to the
// smaller of seg.Stop and the segment's Stop. The walk ends after the first
// segment whose Stop exceeds seg.Stop.
//
// NOTE(review): when seg.Start is smaller than the Start of every segment, or
// when there are no segments, line is -1 at the start of the second loop and
// At(-1) is called - confirm callers never pass such a segment.
func (r *blockReader) Value(seg Segment) []byte {
	line := r.segmentsLength - 1
	ret := make([]byte, 0, seg.Stop-seg.Start+1)
	// Find the segment that contains seg.Start.
	for ; line >= 0; line-- {
		if seg.Start >= r.segments.At(line).Start {
			break
		}
	}
	i := seg.Start
	for ; line < r.segmentsLength; line++ {
		s := r.segments.At(line)
		// A negative i means "continue from the start of this segment".
		if i < 0 {
			i = s.Start
		}
		ret = s.ConcatPadding(ret)
		for ; i < seg.Stop && i < s.Stop; i++ {
			ret = append(ret, r.source[i])
		}
		i = -1
		if s.Stop > seg.Stop {
			break
		}
	}
	return ret
}
// ReadRune implements io.RuneReader by delegating to readRuneReader.
func (r *blockReader) ReadRune() (rune, int, error) {
	return readRuneReader(r)
}
  303. func (r *blockReader) PrecendingCharacter() rune {
  304. if r.pos.Padding != 0 {
  305. return rune(' ')
  306. }
  307. if r.segments.Len() < 1 {
  308. return rune('\n')
  309. }
  310. firstSegment := r.segments.At(0)
  311. if r.line == 0 && r.pos.Start <= firstSegment.Start {
  312. return rune('\n')
  313. }
  314. l := len(r.source)
  315. i := r.pos.Start - 1
  316. for ; i < l && i >= 0; i-- {
  317. if utf8.RuneStart(r.source[i]) {
  318. break
  319. }
  320. }
  321. if i < 0 || i >= l {
  322. return rune('\n')
  323. }
  324. rn, _ := utf8.DecodeRune(r.source[i:])
  325. return rn
  326. }
  327. func (r *blockReader) LineOffset() int {
  328. if r.lineOffset < 0 {
  329. v := 0
  330. for i := r.head; i < r.pos.Start; i++ {
  331. if r.source[i] == '\t' {
  332. v += util.TabWidth(v)
  333. } else {
  334. v++
  335. }
  336. }
  337. r.lineOffset = v - r.pos.Padding
  338. }
  339. return r.lineOffset
  340. }
  341. func (r *blockReader) Peek() byte {
  342. if r.line < r.segmentsLength && r.pos.Start >= 0 && r.pos.Start < r.last {
  343. if r.pos.Padding != 0 {
  344. return space[0]
  345. }
  346. return r.source[r.pos.Start]
  347. }
  348. return EOF
  349. }
  350. func (r *blockReader) PeekLine() ([]byte, Segment) {
  351. if r.line < r.segmentsLength && r.pos.Start >= 0 && r.pos.Start < r.last {
  352. return r.pos.Value(r.source), r.pos
  353. }
  354. return nil, r.pos
  355. }
  356. func (r *blockReader) Advance(n int) {
  357. r.lineOffset = -1
  358. if n < r.pos.Stop-r.pos.Start && r.pos.Padding == 0 {
  359. r.pos.Start += n
  360. return
  361. }
  362. for ; n > 0; n-- {
  363. if r.pos.Padding != 0 {
  364. r.pos.Padding--
  365. continue
  366. }
  367. if r.pos.Start >= r.pos.Stop-1 && r.pos.Stop < r.last {
  368. r.AdvanceLine()
  369. continue
  370. }
  371. r.pos.Start++
  372. }
  373. }
  374. func (r *blockReader) AdvanceAndSetPadding(n, padding int) {
  375. r.Advance(n)
  376. if padding > r.pos.Padding {
  377. r.SetPadding(padding)
  378. }
  379. }
// AdvanceLine moves the reader to the next segment.
// Passing a segment built from invalidValue makes SetPosition load the
// segment registered for the new line; head is then set to the resulting
// start offset.
func (r *blockReader) AdvanceLine() {
	r.SetPosition(r.line+1, NewSegment(invalidValue, invalidValue))
	r.head = r.pos.Start
}
// Position returns the current line number and the current position segment.
func (r *blockReader) Position() (int, Segment) {
	return r.line, r.pos
}
  387. func (r *blockReader) SetPosition(line int, pos Segment) {
  388. r.lineOffset = -1
  389. r.line = line
  390. if pos.Start == invalidValue {
  391. if r.line < r.segmentsLength {
  392. s := r.segments.At(line)
  393. r.head = s.Start
  394. r.pos = s
  395. }
  396. } else {
  397. r.pos = pos
  398. if r.line < r.segmentsLength {
  399. s := r.segments.At(line)
  400. r.head = s.Start
  401. }
  402. }
  403. }
// SetPadding sets the padding of the current position and invalidates the
// cached line offset, which LineOffset computes from the padding.
func (r *blockReader) SetPadding(v int) {
	r.lineOffset = -1
	r.pos.Padding = v
}
// SkipSpaces implements Reader.SkipSpaces by delegating to skipSpacesReader.
func (r *blockReader) SkipSpaces() (Segment, int, bool) {
	return skipSpacesReader(r)
}
// SkipBlankLines implements Reader.SkipBlankLines by delegating to
// skipBlankLinesReader.
func (r *blockReader) SkipBlankLines() (Segment, int, bool) {
	return skipBlankLinesReader(r)
}
// Match implements Reader.Match by delegating to matchReader.
func (r *blockReader) Match(reg *regexp.Regexp) bool {
	return matchReader(r, reg)
}
// FindSubMatch implements Reader.FindSubMatch by delegating to
// findSubMatchReader.
func (r *blockReader) FindSubMatch(reg *regexp.Regexp) [][]byte {
	return findSubMatchReader(r, reg)
}
  420. func skipBlankLinesReader(r Reader) (Segment, int, bool) {
  421. lines := 0
  422. for {
  423. line, seg := r.PeekLine()
  424. if line == nil {
  425. return seg, lines, false
  426. }
  427. if util.IsBlank(line) {
  428. lines++
  429. r.AdvanceLine()
  430. } else {
  431. return seg, lines, true
  432. }
  433. }
  434. }
// skipSpacesReader is the shared implementation of Reader.SkipSpaces.
//
// It repeatedly peeks the current line and advances r by one for every
// leading byte that util.IsSpace accepts, counting the skipped bytes. When a
// non-space byte is found it returns a segment derived from the peeked one,
// the count and true. When PeekLine returns nil it returns the peeked
// segment, the count and false. If a whole peeked line consists of spaces,
// the outer loop peeks again at whatever position the reader has reached.
//
// NOTE(review): the returned segment starts at segment.Start + i + 1, i.e.
// one past the index of the first non-space byte in the peeked line - confirm
// this offset is what callers expect.
func skipSpacesReader(r Reader) (Segment, int, bool) {
	chars := 0
	for {
		line, segment := r.PeekLine()
		if line == nil {
			return segment, chars, false
		}
		// line is a snapshot taken before advancing; r moves underneath it.
		for i, c := range line {
			if util.IsSpace(c) {
				chars++
				r.Advance(1)
				continue
			}
			return segment.WithStart(segment.Start + i + 1), chars, true
		}
	}
}
  452. func matchReader(r Reader, reg *regexp.Regexp) bool {
  453. oldline, oldseg := r.Position()
  454. match := reg.FindReaderSubmatchIndex(r)
  455. r.SetPosition(oldline, oldseg)
  456. if match == nil {
  457. return false
  458. }
  459. r.Advance(match[1] - match[0])
  460. return true
  461. }
  462. func findSubMatchReader(r Reader, reg *regexp.Regexp) [][]byte {
  463. oldline, oldseg := r.Position()
  464. match := reg.FindReaderSubmatchIndex(r)
  465. r.SetPosition(oldline, oldseg)
  466. if match == nil {
  467. return nil
  468. }
  469. runes := make([]rune, 0, match[1]-match[0])
  470. for i := 0; i < match[1]; {
  471. r, size, _ := readRuneReader(r)
  472. i += size
  473. runes = append(runes, r)
  474. }
  475. result := [][]byte{}
  476. for i := 0; i < len(match); i += 2 {
  477. result = append(result, []byte(string(runes[match[i]:match[i+1]])))
  478. }
  479. r.SetPosition(oldline, oldseg)
  480. r.Advance(match[1] - match[0])
  481. return result
  482. }
  483. func readRuneReader(r Reader) (rune, int, error) {
  484. line, _ := r.PeekLine()
  485. if line == nil {
  486. return 0, 0, io.EOF
  487. }
  488. rn, size := utf8.DecodeRune(line)
  489. if rn == utf8.RuneError {
  490. return 0, 0, io.EOF
  491. }
  492. r.Advance(size)
  493. return rn, size, nil
  494. }
// findClosureReader is the shared implementation of Reader.FindClosure.
//
// Starting at the current position of r it scans for the closer byte that
// balances the opener. The nesting depth starts at 1, so the scan behaves as
// if one opener has already been seen. While scanning:
//
//   - outside a code span, a backslash followed by a punctuation byte is
//     skipped together with that byte;
//   - if opts.CodeSpan is set, a run of backticks opens a code span that is
//     closed by a backtick run of the same length, and openers and closers
//     inside it are ignored;
//   - an opener increases the depth when opts.Nesting is set and aborts the
//     search otherwise;
//   - at the end of a line the search continues on the next line only when
//     opts.Newline is set.
//
// On success it returns the segments between the starting position and the
// closer (one per line visited, the last one ending just before the closer)
// and true, with the reader advanced past the closer. On failure it returns
// nil and false. Unless opts.Advance is set, the original reader position is
// restored before returning; note that with opts.Advance set the position is
// not restored on failure either.
func findClosureReader(r Reader, opener, closer byte, opts FindClosureOptions) (*Segments, bool) {
	opened := 1
	// codeSpanOpener is the length of the backtick run that opened the
	// current code span, or 0 when not inside a code span.
	codeSpanOpener := 0
	closed := false
	orgline, orgpos := r.Position()
	var ret *Segments

	for {
		bs, seg := r.PeekLine()
		if bs == nil {
			goto end
		}
		i := 0
		for i < len(bs) {
			c := bs[i]
			if opts.CodeSpan && codeSpanOpener != 0 && c == '`' {
				// Inside a code span: measure this backtick run and close the
				// span only if its length matches the opening run.
				codeSpanCloser := 0
				for ; i < len(bs); i++ {
					if bs[i] == '`' {
						codeSpanCloser++
					} else {
						// Step back so the i++ at the bottom of the outer
						// loop lands on this non-backtick byte.
						i--
						break
					}
				}
				if codeSpanCloser == codeSpanOpener {
					codeSpanOpener = 0
				}
			} else if codeSpanOpener == 0 && c == '\\' && i < len(bs)-1 && util.IsPunct(bs[i+1]) {
				// Escaped punctuation: skip the backslash and the next byte.
				i += 2
				continue
			} else if opts.CodeSpan && codeSpanOpener == 0 && c == '`' {
				// Start of a code span: remember the length of the run.
				for ; i < len(bs); i++ {
					if bs[i] == '`' {
						codeSpanOpener++
					} else {
						i--
						break
					}
				}
			} else if (opts.CodeSpan && codeSpanOpener == 0) || !opts.CodeSpan {
				if c == closer {
					opened--
					if opened == 0 {
						if ret == nil {
							ret = NewSegments()
						}
						// Record the text up to (not including) the closer and
						// move the reader just past it.
						ret.Append(seg.WithStop(seg.Start + i))
						r.Advance(i + 1)
						closed = true
						goto end
					}
				} else if c == opener {
					if !opts.Nesting {
						goto end
					}
					opened++
				}
			}
			i++
		}
		if !opts.Newline {
			goto end
		}
		// No closer on this line: keep the whole line and continue.
		r.AdvanceLine()
		if ret == nil {
			ret = NewSegments()
		}
		ret.Append(seg)
	}
end:
	if !opts.Advance {
		r.SetPosition(orgline, orgpos)
	}
	if closed {
		return ret, true
	}
	return nil, false
}