html_block.go 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229
  1. package parser
  2. import (
  3. "bytes"
  4. "regexp"
  5. "strings"
  6. "github.com/yuin/goldmark/ast"
  7. "github.com/yuin/goldmark/text"
  8. "github.com/yuin/goldmark/util"
  9. )
  10. var allowedBlockTags = map[string]bool{
  11. "address": true,
  12. "article": true,
  13. "aside": true,
  14. "base": true,
  15. "basefont": true,
  16. "blockquote": true,
  17. "body": true,
  18. "caption": true,
  19. "center": true,
  20. "col": true,
  21. "colgroup": true,
  22. "dd": true,
  23. "details": true,
  24. "dialog": true,
  25. "dir": true,
  26. "div": true,
  27. "dl": true,
  28. "dt": true,
  29. "fieldset": true,
  30. "figcaption": true,
  31. "figure": true,
  32. "footer": true,
  33. "form": true,
  34. "frame": true,
  35. "frameset": true,
  36. "h1": true,
  37. "h2": true,
  38. "h3": true,
  39. "h4": true,
  40. "h5": true,
  41. "h6": true,
  42. "head": true,
  43. "header": true,
  44. "hr": true,
  45. "html": true,
  46. "iframe": true,
  47. "legend": true,
  48. "li": true,
  49. "link": true,
  50. "main": true,
  51. "menu": true,
  52. "menuitem": true,
  53. "meta": true,
  54. "nav": true,
  55. "noframes": true,
  56. "ol": true,
  57. "optgroup": true,
  58. "option": true,
  59. "p": true,
  60. "param": true,
  61. "section": true,
  62. "source": true,
  63. "summary": true,
  64. "table": true,
  65. "tbody": true,
  66. "td": true,
  67. "tfoot": true,
  68. "th": true,
  69. "thead": true,
  70. "title": true,
  71. "tr": true,
  72. "track": true,
  73. "ul": true,
  74. }
  75. var htmlBlockType1OpenRegexp = regexp.MustCompile(`(?i)^[ ]{0,3}<(script|pre|style|textarea)(?:\s.*|>.*|/>.*|)(?:\r\n|\n)?$`) //nolint:golint,lll
  76. var htmlBlockType1CloseRegexp = regexp.MustCompile(`(?i)^.*</(?:script|pre|style|textarea)>.*`)
  77. var htmlBlockType2OpenRegexp = regexp.MustCompile(`^[ ]{0,3}<!\-\-`)
  78. var htmlBlockType2Close = []byte{'-', '-', '>'}
  79. var htmlBlockType3OpenRegexp = regexp.MustCompile(`^[ ]{0,3}<\?`)
  80. var htmlBlockType3Close = []byte{'?', '>'}
  81. var htmlBlockType4OpenRegexp = regexp.MustCompile(`^[ ]{0,3}<![A-Z]+.*(?:\r\n|\n)?$`)
  82. var htmlBlockType4Close = []byte{'>'}
  83. var htmlBlockType5OpenRegexp = regexp.MustCompile(`^[ ]{0,3}<\!\[CDATA\[`)
  84. var htmlBlockType5Close = []byte{']', ']', '>'}
  85. var htmlBlockType6Regexp = regexp.MustCompile(`^[ ]{0,3}<(?:/[ ]*)?([a-zA-Z]+[a-zA-Z0-9\-]*)(?:[ ].*|>.*|/>.*|)(?:\r\n|\n)?$`) //nolint:golint,lll
  86. var htmlBlockType7Regexp = regexp.MustCompile(`^[ ]{0,3}<(/[ ]*)?([a-zA-Z]+[a-zA-Z0-9\-]*)(` + attributePattern + `*)[ ]*(?:>|/>)[ ]*(?:\r\n|\n)?$`) //nolint:golint,lll
  87. type htmlBlockParser struct {
  88. }
  89. var defaultHTMLBlockParser = &htmlBlockParser{}
  90. // NewHTMLBlockParser return a new BlockParser that can parse html
  91. // blocks.
  92. func NewHTMLBlockParser() BlockParser {
  93. return defaultHTMLBlockParser
  94. }
  95. func (b *htmlBlockParser) Trigger() []byte {
  96. return []byte{'<'}
  97. }
  98. func (b *htmlBlockParser) Open(parent ast.Node, reader text.Reader, pc Context) (ast.Node, State) {
  99. var node *ast.HTMLBlock
  100. line, segment := reader.PeekLine()
  101. last := pc.LastOpenedBlock().Node
  102. if pos := pc.BlockOffset(); pos < 0 || line[pos] != '<' {
  103. return nil, NoChildren
  104. }
  105. if m := htmlBlockType1OpenRegexp.FindSubmatchIndex(line); m != nil {
  106. node = ast.NewHTMLBlock(ast.HTMLBlockType1)
  107. } else if htmlBlockType2OpenRegexp.Match(line) {
  108. node = ast.NewHTMLBlock(ast.HTMLBlockType2)
  109. } else if htmlBlockType3OpenRegexp.Match(line) {
  110. node = ast.NewHTMLBlock(ast.HTMLBlockType3)
  111. } else if htmlBlockType4OpenRegexp.Match(line) {
  112. node = ast.NewHTMLBlock(ast.HTMLBlockType4)
  113. } else if htmlBlockType5OpenRegexp.Match(line) {
  114. node = ast.NewHTMLBlock(ast.HTMLBlockType5)
  115. } else if match := htmlBlockType7Regexp.FindSubmatchIndex(line); match != nil {
  116. isCloseTag := match[2] > -1 && bytes.Equal(line[match[2]:match[3]], []byte("/"))
  117. hasAttr := match[6] != match[7]
  118. tagName := strings.ToLower(string(line[match[4]:match[5]]))
  119. _, ok := allowedBlockTags[tagName]
  120. if ok {
  121. node = ast.NewHTMLBlock(ast.HTMLBlockType6)
  122. } else if tagName != "script" && tagName != "style" &&
  123. tagName != "pre" && !ast.IsParagraph(last) && !(isCloseTag && hasAttr) { // type 7 can not interrupt paragraph
  124. node = ast.NewHTMLBlock(ast.HTMLBlockType7)
  125. }
  126. }
  127. if node == nil {
  128. if match := htmlBlockType6Regexp.FindSubmatchIndex(line); match != nil {
  129. tagName := string(line[match[2]:match[3]])
  130. _, ok := allowedBlockTags[strings.ToLower(tagName)]
  131. if ok {
  132. node = ast.NewHTMLBlock(ast.HTMLBlockType6)
  133. }
  134. }
  135. }
  136. if node != nil {
  137. reader.Advance(segment.Len() - util.TrimRightSpaceLength(line))
  138. node.Lines().Append(segment)
  139. return node, NoChildren
  140. }
  141. return nil, NoChildren
  142. }
  143. func (b *htmlBlockParser) Continue(node ast.Node, reader text.Reader, pc Context) State {
  144. htmlBlock := node.(*ast.HTMLBlock)
  145. lines := htmlBlock.Lines()
  146. line, segment := reader.PeekLine()
  147. var closurePattern []byte
  148. switch htmlBlock.HTMLBlockType {
  149. case ast.HTMLBlockType1:
  150. if lines.Len() == 1 {
  151. firstLine := lines.At(0)
  152. if htmlBlockType1CloseRegexp.Match(firstLine.Value(reader.Source())) {
  153. return Close
  154. }
  155. }
  156. if htmlBlockType1CloseRegexp.Match(line) {
  157. htmlBlock.ClosureLine = segment
  158. reader.Advance(segment.Len() - util.TrimRightSpaceLength(line))
  159. return Close
  160. }
  161. case ast.HTMLBlockType2:
  162. closurePattern = htmlBlockType2Close
  163. fallthrough
  164. case ast.HTMLBlockType3:
  165. if closurePattern == nil {
  166. closurePattern = htmlBlockType3Close
  167. }
  168. fallthrough
  169. case ast.HTMLBlockType4:
  170. if closurePattern == nil {
  171. closurePattern = htmlBlockType4Close
  172. }
  173. fallthrough
  174. case ast.HTMLBlockType5:
  175. if closurePattern == nil {
  176. closurePattern = htmlBlockType5Close
  177. }
  178. if lines.Len() == 1 {
  179. firstLine := lines.At(0)
  180. if bytes.Contains(firstLine.Value(reader.Source()), closurePattern) {
  181. return Close
  182. }
  183. }
  184. if bytes.Contains(line, closurePattern) {
  185. htmlBlock.ClosureLine = segment
  186. reader.Advance(segment.Len())
  187. return Close
  188. }
  189. case ast.HTMLBlockType6, ast.HTMLBlockType7:
  190. if util.IsBlank(line) {
  191. return Close
  192. }
  193. }
  194. node.Lines().Append(segment)
  195. reader.Advance(segment.Len() - util.TrimRightSpaceLength(line))
  196. return Continue | NoChildren
  197. }
  198. func (b *htmlBlockParser) Close(node ast.Node, reader text.Reader, pc Context) {
  199. // nothing to do
  200. }
  201. func (b *htmlBlockParser) CanInterruptParagraph() bool {
  202. return true
  203. }
  204. func (b *htmlBlockParser) CanAcceptIndentedLine() bool {
  205. return false
  206. }