raw_html.go 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162
  1. package parser
  2. import (
  3. "bytes"
  4. "regexp"
  5. "github.com/yuin/goldmark/ast"
  6. "github.com/yuin/goldmark/text"
  7. "github.com/yuin/goldmark/util"
  8. )
  9. type rawHTMLParser struct {
  10. }
  11. var defaultRawHTMLParser = &rawHTMLParser{}
  12. // NewRawHTMLParser return a new InlineParser that can parse
  13. // inline htmls.
  14. func NewRawHTMLParser() InlineParser {
  15. return defaultRawHTMLParser
  16. }
  17. func (s *rawHTMLParser) Trigger() []byte {
  18. return []byte{'<'}
  19. }
  20. func (s *rawHTMLParser) Parse(parent ast.Node, block text.Reader, pc Context) ast.Node {
  21. line, _ := block.PeekLine()
  22. if len(line) > 1 && util.IsAlphaNumeric(line[1]) {
  23. return s.parseMultiLineRegexp(openTagRegexp, block, pc)
  24. }
  25. if len(line) > 2 && line[1] == '/' && util.IsAlphaNumeric(line[2]) {
  26. return s.parseMultiLineRegexp(closeTagRegexp, block, pc)
  27. }
  28. if bytes.HasPrefix(line, openComment) {
  29. return s.parseComment(block, pc)
  30. }
  31. if bytes.HasPrefix(line, openProcessingInstruction) {
  32. return s.parseUntil(block, closeProcessingInstruction, pc)
  33. }
  34. if len(line) > 2 && line[1] == '!' && line[2] >= 'A' && line[2] <= 'Z' {
  35. return s.parseUntil(block, closeDecl, pc)
  36. }
  37. if bytes.HasPrefix(line, openCDATA) {
  38. return s.parseUntil(block, closeCDATA, pc)
  39. }
  40. return nil
  41. }
  42. var tagnamePattern = `([A-Za-z][A-Za-z0-9-]*)`
  43. var spaceOrOneNewline = `(?:[ \t]|(?:\r\n|\n){0,1})`
  44. var attributePattern = `(?:[\r\n \t]+[a-zA-Z_:][a-zA-Z0-9:._-]*(?:[\r\n \t]*=[\r\n \t]*(?:[^\"'=<>` + "`" + `\x00-\x20]+|'[^']*'|"[^"]*"))?)` //nolint:golint,lll
  45. var openTagRegexp = regexp.MustCompile("^<" + tagnamePattern + attributePattern + `*` + spaceOrOneNewline + `*/?>`)
  46. var closeTagRegexp = regexp.MustCompile("^</" + tagnamePattern + spaceOrOneNewline + `*>`)
  47. var openProcessingInstruction = []byte("<?")
  48. var closeProcessingInstruction = []byte("?>")
  49. var openCDATA = []byte("<![CDATA[")
  50. var closeCDATA = []byte("]]>")
  51. var closeDecl = []byte(">")
  52. var emptyComment = []byte("<!---->")
  53. var invalidComment1 = []byte("<!-->")
  54. var invalidComment2 = []byte("<!--->")
  55. var openComment = []byte("<!--")
  56. var closeComment = []byte("-->")
  57. var doubleHyphen = []byte("--")
  58. func (s *rawHTMLParser) parseComment(block text.Reader, pc Context) ast.Node {
  59. savedLine, savedSegment := block.Position()
  60. node := ast.NewRawHTML()
  61. line, segment := block.PeekLine()
  62. if bytes.HasPrefix(line, emptyComment) {
  63. node.Segments.Append(segment.WithStop(segment.Start + len(emptyComment)))
  64. block.Advance(len(emptyComment))
  65. return node
  66. }
  67. if bytes.HasPrefix(line, invalidComment1) || bytes.HasPrefix(line, invalidComment2) {
  68. return nil
  69. }
  70. offset := len(openComment)
  71. line = line[offset:]
  72. for {
  73. hindex := bytes.Index(line, doubleHyphen)
  74. if hindex > -1 {
  75. hindex += offset
  76. }
  77. index := bytes.Index(line, closeComment) + offset
  78. if index > -1 && hindex == index {
  79. if index == 0 || len(line) < 2 || line[index-offset-1] != '-' {
  80. node.Segments.Append(segment.WithStop(segment.Start + index + len(closeComment)))
  81. block.Advance(index + len(closeComment))
  82. return node
  83. }
  84. }
  85. if hindex > 0 {
  86. break
  87. }
  88. node.Segments.Append(segment)
  89. block.AdvanceLine()
  90. line, segment = block.PeekLine()
  91. offset = 0
  92. if line == nil {
  93. break
  94. }
  95. }
  96. block.SetPosition(savedLine, savedSegment)
  97. return nil
  98. }
  99. func (s *rawHTMLParser) parseUntil(block text.Reader, closer []byte, pc Context) ast.Node {
  100. savedLine, savedSegment := block.Position()
  101. node := ast.NewRawHTML()
  102. for {
  103. line, segment := block.PeekLine()
  104. if line == nil {
  105. break
  106. }
  107. index := bytes.Index(line, closer)
  108. if index > -1 {
  109. node.Segments.Append(segment.WithStop(segment.Start + index + len(closer)))
  110. block.Advance(index + len(closer))
  111. return node
  112. }
  113. node.Segments.Append(segment)
  114. block.AdvanceLine()
  115. }
  116. block.SetPosition(savedLine, savedSegment)
  117. return nil
  118. }
  119. func (s *rawHTMLParser) parseMultiLineRegexp(reg *regexp.Regexp, block text.Reader, pc Context) ast.Node {
  120. sline, ssegment := block.Position()
  121. if block.Match(reg) {
  122. node := ast.NewRawHTML()
  123. eline, esegment := block.Position()
  124. block.SetPosition(sline, ssegment)
  125. for {
  126. line, segment := block.PeekLine()
  127. if line == nil {
  128. break
  129. }
  130. l, _ := block.Position()
  131. start := segment.Start
  132. if l == sline {
  133. start = ssegment.Start
  134. }
  135. end := segment.Stop
  136. if l == eline {
  137. end = esegment.Start
  138. }
  139. node.Segments.Append(text.NewSegment(start, end))
  140. if l == eline {
  141. block.Advance(end - start)
  142. break
  143. }
  144. block.AdvanceLine()
  145. }
  146. return node
  147. }
  148. return nil
  149. }