util.go 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998
  1. // Package util provides utility functions for the goldmark.
  2. package util
  3. import (
  4. "bytes"
  5. "io"
  6. "net/url"
  7. "regexp"
  8. "sort"
  9. "strconv"
  10. "unicode"
  11. "unicode/utf8"
  12. )
  13. // A CopyOnWriteBuffer is a byte buffer that copies buffer when
  14. // it need to be changed.
  15. type CopyOnWriteBuffer struct {
  16. buffer []byte
  17. copied bool
  18. }
  19. // NewCopyOnWriteBuffer returns a new CopyOnWriteBuffer.
  20. func NewCopyOnWriteBuffer(buffer []byte) CopyOnWriteBuffer {
  21. return CopyOnWriteBuffer{
  22. buffer: buffer,
  23. copied: false,
  24. }
  25. }
  26. // Write writes given bytes to the buffer.
  27. // Write allocate new buffer and clears it at the first time.
  28. func (b *CopyOnWriteBuffer) Write(value []byte) {
  29. if !b.copied {
  30. b.buffer = make([]byte, 0, len(b.buffer)+20)
  31. b.copied = true
  32. }
  33. b.buffer = append(b.buffer, value...)
  34. }
  35. // WriteString writes given string to the buffer.
  36. // WriteString allocate new buffer and clears it at the first time.
  37. func (b *CopyOnWriteBuffer) WriteString(value string) {
  38. b.Write(StringToReadOnlyBytes(value))
  39. }
  40. // Append appends given bytes to the buffer.
  41. // Append copy buffer at the first time.
  42. func (b *CopyOnWriteBuffer) Append(value []byte) {
  43. if !b.copied {
  44. tmp := make([]byte, len(b.buffer), len(b.buffer)+20)
  45. copy(tmp, b.buffer)
  46. b.buffer = tmp
  47. b.copied = true
  48. }
  49. b.buffer = append(b.buffer, value...)
  50. }
  51. // AppendString appends given string to the buffer.
  52. // AppendString copy buffer at the first time.
  53. func (b *CopyOnWriteBuffer) AppendString(value string) {
  54. b.Append(StringToReadOnlyBytes(value))
  55. }
  56. // WriteByte writes the given byte to the buffer.
  57. // WriteByte allocate new buffer and clears it at the first time.
  58. func (b *CopyOnWriteBuffer) WriteByte(c byte) error {
  59. if !b.copied {
  60. b.buffer = make([]byte, 0, len(b.buffer)+20)
  61. b.copied = true
  62. }
  63. b.buffer = append(b.buffer, c)
  64. return nil
  65. }
  66. // AppendByte appends given bytes to the buffer.
  67. // AppendByte copy buffer at the first time.
  68. func (b *CopyOnWriteBuffer) AppendByte(c byte) {
  69. if !b.copied {
  70. tmp := make([]byte, len(b.buffer), len(b.buffer)+20)
  71. copy(tmp, b.buffer)
  72. b.buffer = tmp
  73. b.copied = true
  74. }
  75. b.buffer = append(b.buffer, c)
  76. }
  77. // Bytes returns bytes of this buffer.
  78. func (b *CopyOnWriteBuffer) Bytes() []byte {
  79. return b.buffer
  80. }
  81. // IsCopied returns true if buffer has been copied, otherwise false.
  82. func (b *CopyOnWriteBuffer) IsCopied() bool {
  83. return b.copied
  84. }
  85. // IsEscapedPunctuation returns true if character at a given index i
  86. // is an escaped punctuation, otherwise false.
  87. func IsEscapedPunctuation(source []byte, i int) bool {
  88. return source[i] == '\\' && i < len(source)-1 && IsPunct(source[i+1])
  89. }
  90. // ReadWhile read the given source while pred is true.
  91. func ReadWhile(source []byte, index [2]int, pred func(byte) bool) (int, bool) {
  92. j := index[0]
  93. ok := false
  94. for ; j < index[1]; j++ {
  95. c1 := source[j]
  96. if pred(c1) {
  97. ok = true
  98. continue
  99. }
  100. break
  101. }
  102. return j, ok
  103. }
  104. // IsBlank returns true if the given string is all space characters.
  105. func IsBlank(bs []byte) bool {
  106. for _, b := range bs {
  107. if !IsSpace(b) {
  108. return false
  109. }
  110. }
  111. return true
  112. }
  113. // VisualizeSpaces visualize invisible space characters.
  114. func VisualizeSpaces(bs []byte) []byte {
  115. bs = bytes.Replace(bs, []byte(" "), []byte("[SPACE]"), -1)
  116. bs = bytes.Replace(bs, []byte("\t"), []byte("[TAB]"), -1)
  117. bs = bytes.Replace(bs, []byte("\n"), []byte("[NEWLINE]\n"), -1)
  118. bs = bytes.Replace(bs, []byte("\r"), []byte("[CR]"), -1)
  119. bs = bytes.Replace(bs, []byte("\v"), []byte("[VTAB]"), -1)
  120. bs = bytes.Replace(bs, []byte("\x00"), []byte("[NUL]"), -1)
  121. bs = bytes.Replace(bs, []byte("\ufffd"), []byte("[U+FFFD]"), -1)
  122. return bs
  123. }
  124. // TabWidth calculates actual width of a tab at the given position.
  125. func TabWidth(currentPos int) int {
  126. return 4 - currentPos%4
  127. }
  128. // IndentPosition searches an indent position with the given width for the given line.
  129. // If the line contains tab characters, paddings may be not zero.
  130. // currentPos==0 and width==2:
  131. //
  132. // position: 0 1
  133. // [TAB]aaaa
  134. // width: 1234 5678
  135. //
  136. // width=2 is in the tab character. In this case, IndentPosition returns
  137. // (pos=1, padding=2).
  138. func IndentPosition(bs []byte, currentPos, width int) (pos, padding int) {
  139. return IndentPositionPadding(bs, currentPos, 0, width)
  140. }
  141. // IndentPositionPadding searches an indent position with the given width for the given line.
  142. // This function is mostly same as IndentPosition except this function
  143. // takes account into additional paddings.
  144. func IndentPositionPadding(bs []byte, currentPos, paddingv, width int) (pos, padding int) {
  145. if width == 0 {
  146. return 0, paddingv
  147. }
  148. w := 0
  149. i := 0
  150. l := len(bs)
  151. for ; i < l; i++ {
  152. if bs[i] == '\t' && w < width {
  153. w += TabWidth(currentPos + w)
  154. } else if bs[i] == ' ' && w < width {
  155. w++
  156. } else {
  157. break
  158. }
  159. }
  160. if w >= width {
  161. return i - paddingv, w - width
  162. }
  163. return -1, -1
  164. }
  165. // DedentPosition dedents lines by the given width.
  166. //
  167. // Deprecated: This function has bugs. Use util.IndentPositionPadding and util.FirstNonSpacePosition.
  168. func DedentPosition(bs []byte, currentPos, width int) (pos, padding int) {
  169. if width == 0 {
  170. return 0, 0
  171. }
  172. w := 0
  173. l := len(bs)
  174. i := 0
  175. for ; i < l; i++ {
  176. if bs[i] == '\t' {
  177. w += TabWidth(currentPos + w)
  178. } else if bs[i] == ' ' {
  179. w++
  180. } else {
  181. break
  182. }
  183. }
  184. if w >= width {
  185. return i, w - width
  186. }
  187. return i, 0
  188. }
  189. // DedentPositionPadding dedents lines by the given width.
  190. // This function is mostly same as DedentPosition except this function
  191. // takes account into additional paddings.
  192. //
  193. // Deprecated: This function has bugs. Use util.IndentPositionPadding and util.FirstNonSpacePosition.
  194. func DedentPositionPadding(bs []byte, currentPos, paddingv, width int) (pos, padding int) {
  195. if width == 0 {
  196. return 0, paddingv
  197. }
  198. w := 0
  199. i := 0
  200. l := len(bs)
  201. for ; i < l; i++ {
  202. if bs[i] == '\t' {
  203. w += TabWidth(currentPos + w)
  204. } else if bs[i] == ' ' {
  205. w++
  206. } else {
  207. break
  208. }
  209. }
  210. if w >= width {
  211. return i - paddingv, w - width
  212. }
  213. return i - paddingv, 0
  214. }
  215. // IndentWidth calculate an indent width for the given line.
  216. func IndentWidth(bs []byte, currentPos int) (width, pos int) {
  217. l := len(bs)
  218. for i := 0; i < l; i++ {
  219. b := bs[i]
  220. if b == ' ' {
  221. width++
  222. pos++
  223. } else if b == '\t' {
  224. width += TabWidth(currentPos + width)
  225. pos++
  226. } else {
  227. break
  228. }
  229. }
  230. return
  231. }
  232. // FirstNonSpacePosition returns a position line that is a first nonspace
  233. // character.
  234. func FirstNonSpacePosition(bs []byte) int {
  235. i := 0
  236. for ; i < len(bs); i++ {
  237. c := bs[i]
  238. if c == ' ' || c == '\t' {
  239. continue
  240. }
  241. if c == '\n' {
  242. return -1
  243. }
  244. return i
  245. }
  246. return -1
  247. }
  248. // FindClosure returns a position that closes the given opener.
  249. // If codeSpan is set true, it ignores characters in code spans.
  250. // If allowNesting is set true, closures correspond to nested opener will be
  251. // ignored.
  252. //
  253. // Deprecated: This function can not handle newlines. Many elements
  254. // can be existed over multiple lines(e.g. link labels).
  255. // Use text.Reader.FindClosure.
  256. func FindClosure(bs []byte, opener, closure byte, codeSpan, allowNesting bool) int {
  257. i := 0
  258. opened := 1
  259. codeSpanOpener := 0
  260. for i < len(bs) {
  261. c := bs[i]
  262. if codeSpan && codeSpanOpener != 0 && c == '`' {
  263. codeSpanCloser := 0
  264. for ; i < len(bs); i++ {
  265. if bs[i] == '`' {
  266. codeSpanCloser++
  267. } else {
  268. i--
  269. break
  270. }
  271. }
  272. if codeSpanCloser == codeSpanOpener {
  273. codeSpanOpener = 0
  274. }
  275. } else if codeSpanOpener == 0 && c == '\\' && i < len(bs)-1 && IsPunct(bs[i+1]) {
  276. i += 2
  277. continue
  278. } else if codeSpan && codeSpanOpener == 0 && c == '`' {
  279. for ; i < len(bs); i++ {
  280. if bs[i] == '`' {
  281. codeSpanOpener++
  282. } else {
  283. i--
  284. break
  285. }
  286. }
  287. } else if (codeSpan && codeSpanOpener == 0) || !codeSpan {
  288. if c == closure {
  289. opened--
  290. if opened == 0 {
  291. return i
  292. }
  293. } else if c == opener {
  294. if !allowNesting {
  295. return -1
  296. }
  297. opened++
  298. }
  299. }
  300. i++
  301. }
  302. return -1
  303. }
  304. // TrimLeft trims characters in the given s from head of the source.
  305. // bytes.TrimLeft offers same functionalities, but bytes.TrimLeft
  306. // allocates new buffer for the result.
  307. func TrimLeft(source, b []byte) []byte {
  308. i := 0
  309. for ; i < len(source); i++ {
  310. c := source[i]
  311. found := false
  312. for j := 0; j < len(b); j++ {
  313. if c == b[j] {
  314. found = true
  315. break
  316. }
  317. }
  318. if !found {
  319. break
  320. }
  321. }
  322. return source[i:]
  323. }
  324. // TrimRight trims characters in the given s from tail of the source.
  325. func TrimRight(source, b []byte) []byte {
  326. i := len(source) - 1
  327. for ; i >= 0; i-- {
  328. c := source[i]
  329. found := false
  330. for j := 0; j < len(b); j++ {
  331. if c == b[j] {
  332. found = true
  333. break
  334. }
  335. }
  336. if !found {
  337. break
  338. }
  339. }
  340. return source[:i+1]
  341. }
  342. // TrimLeftLength returns a length of leading specified characters.
  343. func TrimLeftLength(source, s []byte) int {
  344. return len(source) - len(TrimLeft(source, s))
  345. }
  346. // TrimRightLength returns a length of trailing specified characters.
  347. func TrimRightLength(source, s []byte) int {
  348. return len(source) - len(TrimRight(source, s))
  349. }
  350. // TrimLeftSpaceLength returns a length of leading space characters.
  351. func TrimLeftSpaceLength(source []byte) int {
  352. i := 0
  353. for ; i < len(source); i++ {
  354. if !IsSpace(source[i]) {
  355. break
  356. }
  357. }
  358. return i
  359. }
  360. // TrimRightSpaceLength returns a length of trailing space characters.
  361. func TrimRightSpaceLength(source []byte) int {
  362. l := len(source)
  363. i := l - 1
  364. for ; i >= 0; i-- {
  365. if !IsSpace(source[i]) {
  366. break
  367. }
  368. }
  369. if i < 0 {
  370. return l
  371. }
  372. return l - 1 - i
  373. }
  374. // TrimLeftSpace returns a subslice of the given string by slicing off all leading
  375. // space characters.
  376. func TrimLeftSpace(source []byte) []byte {
  377. return TrimLeft(source, spaces)
  378. }
  379. // TrimRightSpace returns a subslice of the given string by slicing off all trailing
  380. // space characters.
  381. func TrimRightSpace(source []byte) []byte {
  382. return TrimRight(source, spaces)
  383. }
  384. // DoFullUnicodeCaseFolding performs full unicode case folding to given bytes.
  385. func DoFullUnicodeCaseFolding(v []byte) []byte {
  386. var rbuf []byte
  387. cob := NewCopyOnWriteBuffer(v)
  388. n := 0
  389. for i := 0; i < len(v); i++ {
  390. c := v[i]
  391. if c < 0xb5 {
  392. if c >= 0x41 && c <= 0x5a {
  393. // A-Z to a-z
  394. cob.Write(v[n:i])
  395. _ = cob.WriteByte(c + 32)
  396. n = i + 1
  397. }
  398. continue
  399. }
  400. if !utf8.RuneStart(c) {
  401. continue
  402. }
  403. r, length := utf8.DecodeRune(v[i:])
  404. if r == utf8.RuneError {
  405. continue
  406. }
  407. folded, ok := unicodeCaseFoldings[r]
  408. if !ok {
  409. continue
  410. }
  411. cob.Write(v[n:i])
  412. if rbuf == nil {
  413. rbuf = make([]byte, 4)
  414. }
  415. for _, f := range folded {
  416. l := utf8.EncodeRune(rbuf, f)
  417. cob.Write(rbuf[:l])
  418. }
  419. i += length - 1
  420. n = i + 1
  421. }
  422. if cob.IsCopied() {
  423. cob.Write(v[n:])
  424. }
  425. return cob.Bytes()
  426. }
  427. // ReplaceSpaces replaces sequence of spaces with the given repl.
  428. func ReplaceSpaces(source []byte, repl byte) []byte {
  429. var ret []byte
  430. start := -1
  431. for i, c := range source {
  432. iss := IsSpace(c)
  433. if start < 0 && iss {
  434. start = i
  435. continue
  436. } else if start >= 0 && iss {
  437. continue
  438. } else if start >= 0 {
  439. if ret == nil {
  440. ret = make([]byte, 0, len(source))
  441. ret = append(ret, source[:start]...)
  442. }
  443. ret = append(ret, repl)
  444. start = -1
  445. }
  446. if ret != nil {
  447. ret = append(ret, c)
  448. }
  449. }
  450. if start >= 0 && ret != nil {
  451. ret = append(ret, repl)
  452. }
  453. if ret == nil {
  454. return source
  455. }
  456. return ret
  457. }
  458. // ToRune decode given bytes start at pos and returns a rune.
  459. func ToRune(source []byte, pos int) rune {
  460. i := pos
  461. for ; i >= 0; i-- {
  462. if utf8.RuneStart(source[i]) {
  463. break
  464. }
  465. }
  466. r, _ := utf8.DecodeRune(source[i:])
  467. return r
  468. }
  469. // ToValidRune returns 0xFFFD if the given rune is invalid, otherwise v.
  470. func ToValidRune(v rune) rune {
  471. if v == 0 || !utf8.ValidRune(v) {
  472. return rune(0xFFFD)
  473. }
  474. return v
  475. }
  476. // ToLinkReference converts given bytes into a valid link reference string.
  477. // ToLinkReference performs unicode case folding, trims leading and trailing spaces, converts into lower
  478. // case and replace spaces with a single space character.
  479. func ToLinkReference(v []byte) string {
  480. v = TrimLeftSpace(v)
  481. v = TrimRightSpace(v)
  482. v = DoFullUnicodeCaseFolding(v)
  483. return string(ReplaceSpaces(v, ' '))
  484. }
  485. var htmlEscapeTable = [256][]byte{nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, []byte("&quot;"), nil, nil, nil, []byte("&amp;"), nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, []byte("&lt;"), nil, []byte("&gt;"), nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil} //nolint:golint,lll
  486. // EscapeHTMLByte returns HTML escaped bytes if the given byte should be escaped,
  487. // otherwise nil.
  488. func EscapeHTMLByte(b byte) []byte {
  489. return htmlEscapeTable[b]
  490. }
  491. // EscapeHTML escapes characters that should be escaped in HTML text.
  492. func EscapeHTML(v []byte) []byte {
  493. cob := NewCopyOnWriteBuffer(v)
  494. n := 0
  495. for i := 0; i < len(v); i++ {
  496. c := v[i]
  497. escaped := htmlEscapeTable[c]
  498. if escaped != nil {
  499. cob.Write(v[n:i])
  500. cob.Write(escaped)
  501. n = i + 1
  502. }
  503. }
  504. if cob.IsCopied() {
  505. cob.Write(v[n:])
  506. }
  507. return cob.Bytes()
  508. }
  509. // UnescapePunctuations unescapes blackslash escaped punctuations.
  510. func UnescapePunctuations(source []byte) []byte {
  511. cob := NewCopyOnWriteBuffer(source)
  512. limit := len(source)
  513. n := 0
  514. for i := 0; i < limit; {
  515. c := source[i]
  516. if i < limit-1 && c == '\\' && IsPunct(source[i+1]) {
  517. cob.Write(source[n:i])
  518. _ = cob.WriteByte(source[i+1])
  519. i += 2
  520. n = i
  521. continue
  522. }
  523. i++
  524. }
  525. if cob.IsCopied() {
  526. cob.Write(source[n:])
  527. }
  528. return cob.Bytes()
  529. }
  530. // ResolveNumericReferences resolve numeric references like '&#1234;" .
  531. func ResolveNumericReferences(source []byte) []byte {
  532. cob := NewCopyOnWriteBuffer(source)
  533. buf := make([]byte, 6)
  534. limit := len(source)
  535. var ok bool
  536. n := 0
  537. for i := 0; i < limit; i++ {
  538. if source[i] == '&' {
  539. pos := i
  540. next := i + 1
  541. if next < limit && source[next] == '#' {
  542. nnext := next + 1
  543. if nnext < limit {
  544. nc := source[nnext]
  545. // code point like #x22;
  546. if nnext < limit && nc == 'x' || nc == 'X' {
  547. start := nnext + 1
  548. i, ok = ReadWhile(source, [2]int{start, limit}, IsHexDecimal)
  549. if ok && i < limit && source[i] == ';' {
  550. v, _ := strconv.ParseUint(BytesToReadOnlyString(source[start:i]), 16, 32)
  551. cob.Write(source[n:pos])
  552. n = i + 1
  553. runeSize := utf8.EncodeRune(buf, ToValidRune(rune(v)))
  554. cob.Write(buf[:runeSize])
  555. continue
  556. }
  557. // code point like #1234;
  558. } else if nc >= '0' && nc <= '9' {
  559. start := nnext
  560. i, ok = ReadWhile(source, [2]int{start, limit}, IsNumeric)
  561. if ok && i < limit && i-start < 8 && source[i] == ';' {
  562. v, _ := strconv.ParseUint(BytesToReadOnlyString(source[start:i]), 0, 32)
  563. cob.Write(source[n:pos])
  564. n = i + 1
  565. runeSize := utf8.EncodeRune(buf, ToValidRune(rune(v)))
  566. cob.Write(buf[:runeSize])
  567. continue
  568. }
  569. }
  570. }
  571. }
  572. i = next - 1
  573. }
  574. }
  575. if cob.IsCopied() {
  576. cob.Write(source[n:])
  577. }
  578. return cob.Bytes()
  579. }
  580. // ResolveEntityNames resolve entity references like '&ouml;" .
  581. func ResolveEntityNames(source []byte) []byte {
  582. cob := NewCopyOnWriteBuffer(source)
  583. limit := len(source)
  584. var ok bool
  585. n := 0
  586. for i := 0; i < limit; i++ {
  587. if source[i] == '&' {
  588. pos := i
  589. next := i + 1
  590. if !(next < limit && source[next] == '#') {
  591. start := next
  592. i, ok = ReadWhile(source, [2]int{start, limit}, IsAlphaNumeric)
  593. if ok && i < limit && source[i] == ';' {
  594. name := BytesToReadOnlyString(source[start:i])
  595. entity, ok := LookUpHTML5EntityByName(name)
  596. if ok {
  597. cob.Write(source[n:pos])
  598. n = i + 1
  599. cob.Write(entity.Characters)
  600. continue
  601. }
  602. }
  603. }
  604. i = next - 1
  605. }
  606. }
  607. if cob.IsCopied() {
  608. cob.Write(source[n:])
  609. }
  610. return cob.Bytes()
  611. }
  612. var htmlSpace = []byte("%20")
  613. // URLEscape escape the given URL.
  614. // If resolveReference is set true:
  615. // 1. unescape punctuations
  616. // 2. resolve numeric references
  617. // 3. resolve entity references
  618. //
  619. // URL encoded values (%xx) are kept as is.
  620. func URLEscape(v []byte, resolveReference bool) []byte {
  621. if resolveReference {
  622. v = UnescapePunctuations(v)
  623. v = ResolveNumericReferences(v)
  624. v = ResolveEntityNames(v)
  625. }
  626. cob := NewCopyOnWriteBuffer(v)
  627. limit := len(v)
  628. n := 0
  629. for i := 0; i < limit; {
  630. c := v[i]
  631. if urlEscapeTable[c] == 1 {
  632. i++
  633. continue
  634. }
  635. if c == '%' && i+2 < limit && IsHexDecimal(v[i+1]) && IsHexDecimal(v[i+1]) {
  636. i += 3
  637. continue
  638. }
  639. u8len := utf8lenTable[c]
  640. if u8len == 99 { // invalid utf8 leading byte, skip it
  641. i++
  642. continue
  643. }
  644. if c == ' ' {
  645. cob.Write(v[n:i])
  646. cob.Write(htmlSpace)
  647. i++
  648. n = i
  649. continue
  650. }
  651. if int(u8len) > len(v) {
  652. u8len = int8(len(v) - 1)
  653. }
  654. if u8len == 0 {
  655. i++
  656. n = i
  657. continue
  658. }
  659. cob.Write(v[n:i])
  660. stop := i + int(u8len)
  661. if stop > len(v) {
  662. i++
  663. n = i
  664. continue
  665. }
  666. cob.Write(StringToReadOnlyBytes(url.QueryEscape(string(v[i:stop]))))
  667. i += int(u8len)
  668. n = i
  669. }
  670. if cob.IsCopied() && n < limit {
  671. cob.Write(v[n:])
  672. }
  673. return cob.Bytes()
  674. }
  675. // FindURLIndex returns a stop index value if the given bytes seem an URL.
  676. // This function is equivalent to [A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]* .
  677. func FindURLIndex(b []byte) int {
  678. i := 0
  679. if !(len(b) > 0 && urlTable[b[i]]&7 == 7) {
  680. return -1
  681. }
  682. i++
  683. for ; i < len(b); i++ {
  684. c := b[i]
  685. if urlTable[c]&4 != 4 {
  686. break
  687. }
  688. }
  689. if i == 1 || i > 33 || i >= len(b) {
  690. return -1
  691. }
  692. if b[i] != ':' {
  693. return -1
  694. }
  695. i++
  696. for ; i < len(b); i++ {
  697. c := b[i]
  698. if urlTable[c]&1 != 1 {
  699. break
  700. }
  701. }
  702. return i
  703. }
  704. var emailDomainRegexp = regexp.MustCompile(`^[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*`) //nolint:golint,lll
  705. // FindEmailIndex returns a stop index value if the given bytes seem an email address.
  706. func FindEmailIndex(b []byte) int {
  707. // TODO: eliminate regexps
  708. i := 0
  709. for ; i < len(b); i++ {
  710. c := b[i]
  711. if emailTable[c]&1 != 1 {
  712. break
  713. }
  714. }
  715. if i == 0 {
  716. return -1
  717. }
  718. if i >= len(b) || b[i] != '@' {
  719. return -1
  720. }
  721. i++
  722. if i >= len(b) {
  723. return -1
  724. }
  725. match := emailDomainRegexp.FindSubmatchIndex(b[i:])
  726. if match == nil {
  727. return -1
  728. }
  729. return i + match[1]
  730. }
  731. var spaces = []byte(" \t\n\x0b\x0c\x0d")
  732. var spaceTable = [256]int8{0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} //nolint:golint,lll
  733. var punctTable = [256]int8{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} //nolint:golint,lll
// urlEscapeTable marks with 1 the bytes that URLEscape leaves unescaped:
// a-zA-Z0-9 and ;/?:@&=+$,-_.!~*'()#
  735. var urlEscapeTable = [256]int8{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} //nolint:golint,lll
  736. var utf8lenTable = [256]int8{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 99, 99, 99, 99, 99, 99, 99, 99} //nolint:golint,lll
  737. var urlTable = [256]uint8{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 1, 0, 1, 1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 1, 1, 1, 1, 1, 1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1} //nolint:golint,lll
  738. var emailTable = [256]uint8{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} //nolint:golint,lll
  739. // UTF8Len returns a byte length of the utf-8 character.
  740. func UTF8Len(b byte) int8 {
  741. return utf8lenTable[b]
  742. }
  743. // IsPunct returns true if the given character is a punctuation, otherwise false.
  744. func IsPunct(c byte) bool {
  745. return punctTable[c] == 1
  746. }
  747. // IsPunctRune returns true if the given rune is a punctuation, otherwise false.
  748. func IsPunctRune(r rune) bool {
  749. return int32(r) <= 256 && IsPunct(byte(r)) || unicode.IsPunct(r)
  750. }
  751. // IsSpace returns true if the given character is a space, otherwise false.
  752. func IsSpace(c byte) bool {
  753. return spaceTable[c] == 1
  754. }
  755. // IsSpaceRune returns true if the given rune is a space, otherwise false.
  756. func IsSpaceRune(r rune) bool {
  757. return int32(r) <= 256 && IsSpace(byte(r)) || unicode.IsSpace(r)
  758. }
  759. // IsNumeric returns true if the given character is a numeric, otherwise false.
  760. func IsNumeric(c byte) bool {
  761. return c >= '0' && c <= '9'
  762. }
  // IsHexDecimal reports whether c is an ASCII hexadecimal digit: '0'-'9',
  // 'a'-'f' or 'A'-'F'.
  func IsHexDecimal(c byte) bool {
  	switch {
  	case '0' <= c && c <= '9':
  		return true
  	case 'a' <= c && c <= 'f':
  		return true
  	case 'A' <= c && c <= 'F':
  		return true
  	default:
  		return false
  	}
  }
  // IsAlphaNumeric reports whether c is an ASCII letter ('a'-'z', 'A'-'Z') or an
  // ASCII digit ('0'-'9').
  func IsAlphaNumeric(c byte) bool {
  	switch {
  	case 'a' <= c && c <= 'z':
  		return true
  	case 'A' <= c && c <= 'Z':
  		return true
  	case '0' <= c && c <= '9':
  		return true
  	default:
  		return false
  	}
  }
  // IsEastAsianWideRune returns true if the given rune is an east asian wide
  // character, otherwise false.
  //
  // A rune qualifies when it lies in the CJK Symbols and Punctuation block
  // (U+3000..U+303F), belongs to the Hiragana, Katakana, Han or Hangul scripts,
  // or is in the Lm (modifier letter) category.
  func IsEastAsianWideRune(r rune) bool {
  	// https://en.wikipedia.org/wiki/CJK_Symbols_and_Punctuation
  	if 0x3000 <= r && r <= 0x303F {
  		return true
  	}
  	return unicode.In(r,
  		unicode.Hiragana,
  		unicode.Katakana,
  		unicode.Han,
  		unicode.Lm,
  		unicode.Hangul,
  	)
  }
  // A BufWriter is a subset of the bufio.Writer API: the io.Writer contract plus
  // the single-item write helpers and the buffer bookkeeping methods listed
  // below. Each method carries the same signature as the bufio.Writer method of
  // the same name.
  type BufWriter interface {
  	io.Writer

  	// Single-item writes.
  	WriteByte(c byte) error
  	WriteRune(r rune) (size int, err error)
  	WriteString(s string) (int, error)

  	// Buffer state and flushing.
  	Available() int
  	Buffered() int
  	Flush() error
  }
  // A PrioritizedValue pairs an arbitrary value with an integer priority.
  type PrioritizedValue struct {
  	// Value is the arbitrary value being prioritized.
  	Value interface{}
  	// Priority is the priority attached to Value.
  	Priority int
  }

  // PrioritizedSlice is a slice of PrioritizedValues.
  type PrioritizedSlice []PrioritizedValue

  // Sort reorders the slice in place so that Priority values are ascending.
  // It uses sort.Slice, so the relative order of equal priorities is not
  // guaranteed.
  func (s PrioritizedSlice) Sort() {
  	byPriority := func(a, b int) bool {
  		return s[a].Priority < s[b].Priority
  	}
  	sort.Slice(s, byPriority)
  }

  // Remove deletes the first element whose Value equals v and returns the
  // resulting slice. If no element matches, the receiver is returned unchanged.
  // The removal shifts later elements down within the receiver's backing array,
  // so callers should continue with the returned slice.
  func (s PrioritizedSlice) Remove(v interface{}) PrioritizedSlice {
  	for idx := range s {
  		if s[idx].Value == v {
  			return append(s[:idx], s[idx+1:]...)
  		}
  	}
  	return s
  }

  // Prioritized returns a new PrioritizedValue holding v and priority.
  func Prioritized(v interface{}, priority int) PrioritizedValue {
  	return PrioritizedValue{Value: v, Priority: priority}
  }
  // bytesHash computes a djb2-style hash of b: starting from 5381, every byte c
  // updates the state to hash*33 + c (with uint64 wrap-around).
  func bytesHash(b []byte) uint64 {
  	var hash uint64 = 5381
  	for _, c := range b {
  		hash = ((hash << 5) + hash) + uint64(c)
  	}
  	return hash
  }

  // BytesFilter is a efficient data structure for checking whether bytes exist or not.
  //
  // The implementation in this file uses no locks: Add mutates the filter in
  // place, so a filter may be shared between goroutines only while no goroutine
  // is calling Add on it. Extend leaves the receiver untouched and returns an
  // independent filter.
  type BytesFilter interface {
  	// Add adds given bytes to this set.
  	Add([]byte)

  	// Contains return true if this set contains given bytes, otherwise false.
  	Contains([]byte) bool

  	// Extend copies this filter and adds given bytes to new filter.
  	Extend(...[]byte) BytesFilter
  }

  // bytesFilter implements BytesFilter with a per-position byte mask used as a
  // cheap pre-check, backed by hash buckets that hold the actual elements.
  type bytesFilter struct {
  	// chars[c] has bit i set when some added element has byte c at index i,
  	// for i < threshold.
  	chars [256]uint8
  	// threshold is the number of leading bytes recorded in chars.
  	threshold int
  	// slots are the hash buckets, indexed by bytesHash(element) % len(slots).
  	slots [][][]byte
  }

  // NewBytesFilter returns a new BytesFilter containing the given elements.
  // The filter records the first 3 bytes of each element for its pre-check and
  // distributes the elements over 64 hash buckets.
  func NewBytesFilter(elements ...[]byte) BytesFilter {
  	s := &bytesFilter{
  		threshold: 3,
  		slots:     make([][][]byte, 64),
  	}
  	for _, element := range elements {
  		s.Add(element)
  	}
  	return s
  }

  // Add inserts b into the filter. The slice is stored as is, not copied, so
  // the caller must not modify it afterwards.
  func (s *bytesFilter) Add(b []byte) {
  	// Only the first threshold bytes (or fewer, for short input) take part in
  	// the per-position pre-check.
  	m := s.threshold
  	if l := len(b); l < m {
  		m = l
  	}
  	for i := 0; i < m; i++ {
  		s.chars[b[i]] |= 1 << uint8(i)
  	}
  	h := bytesHash(b) % uint64(len(s.slots))
  	// append handles a nil bucket, so no explicit initialisation is needed.
  	s.slots[h] = append(s.slots[h], b)
  }

  // Extend returns a new filter holding every element of s plus bs. The
  // receiver is not modified.
  func (s *bytesFilter) Extend(bs ...[]byte) BytesFilter {
  	newFilter := NewBytesFilter().(*bytesFilter)
  	newFilter.chars = s.chars
  	newFilter.threshold = s.threshold
  	for k, v := range s.slots {
  		// Give the new filter its own bucket storage. Sharing the backing
  		// array would let an Add on one filter overwrite an element that
  		// another filter derived from the same base has already stored.
  		newSlot := make([][]byte, len(v))
  		copy(newSlot, v)
  		newFilter.slots[k] = newSlot
  	}
  	for _, b := range bs {
  		newFilter.Add(b)
  	}
  	return newFilter
  }

  // Contains reports whether b was added to the filter.
  func (s *bytesFilter) Contains(b []byte) bool {
  	m := s.threshold
  	if l := len(b); l < m {
  		m = l
  	}
  	// Fast rejection: each leading byte must have been seen at its position.
  	for i := 0; i < m; i++ {
  		if (s.chars[b[i]] & (1 << uint8(i))) == 0 {
  			return false
  		}
  	}
  	// Exact check against the elements stored in the matching bucket.
  	h := bytesHash(b) % uint64(len(s.slots))
  	for _, element := range s.slots[h] {
  		if bytes.Equal(element, b) {
  			return true
  		}
  	}
  	return false
  }