util.go 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989
  1. // Package util provides utility functions for the goldmark.
  2. package util
  3. import (
  4. "bytes"
  5. "io"
  6. "net/url"
  7. "regexp"
  8. "sort"
  9. "strconv"
  10. "unicode"
  11. "unicode/utf8"
  12. )
  13. // A CopyOnWriteBuffer is a byte buffer that copies buffer when
  14. // it need to be changed.
  15. type CopyOnWriteBuffer struct {
  16. buffer []byte
  17. copied bool
  18. }
  19. // NewCopyOnWriteBuffer returns a new CopyOnWriteBuffer.
  20. func NewCopyOnWriteBuffer(buffer []byte) CopyOnWriteBuffer {
  21. return CopyOnWriteBuffer{
  22. buffer: buffer,
  23. copied: false,
  24. }
  25. }
  26. // Write writes given bytes to the buffer.
  27. // Write allocate new buffer and clears it at the first time.
  28. func (b *CopyOnWriteBuffer) Write(value []byte) {
  29. if !b.copied {
  30. b.buffer = make([]byte, 0, len(b.buffer)+20)
  31. b.copied = true
  32. }
  33. b.buffer = append(b.buffer, value...)
  34. }
  35. // WriteString writes given string to the buffer.
  36. // WriteString allocate new buffer and clears it at the first time.
  37. func (b *CopyOnWriteBuffer) WriteString(value string) {
  38. b.Write(StringToReadOnlyBytes(value))
  39. }
  40. // Append appends given bytes to the buffer.
  41. // Append copy buffer at the first time.
  42. func (b *CopyOnWriteBuffer) Append(value []byte) {
  43. if !b.copied {
  44. tmp := make([]byte, len(b.buffer), len(b.buffer)+20)
  45. copy(tmp, b.buffer)
  46. b.buffer = tmp
  47. b.copied = true
  48. }
  49. b.buffer = append(b.buffer, value...)
  50. }
  51. // AppendString appends given string to the buffer.
  52. // AppendString copy buffer at the first time.
  53. func (b *CopyOnWriteBuffer) AppendString(value string) {
  54. b.Append(StringToReadOnlyBytes(value))
  55. }
  56. // WriteByte writes the given byte to the buffer.
  57. // WriteByte allocate new buffer and clears it at the first time.
  58. func (b *CopyOnWriteBuffer) WriteByte(c byte) {
  59. if !b.copied {
  60. b.buffer = make([]byte, 0, len(b.buffer)+20)
  61. b.copied = true
  62. }
  63. b.buffer = append(b.buffer, c)
  64. }
  65. // AppendByte appends given bytes to the buffer.
  66. // AppendByte copy buffer at the first time.
  67. func (b *CopyOnWriteBuffer) AppendByte(c byte) {
  68. if !b.copied {
  69. tmp := make([]byte, len(b.buffer), len(b.buffer)+20)
  70. copy(tmp, b.buffer)
  71. b.buffer = tmp
  72. b.copied = true
  73. }
  74. b.buffer = append(b.buffer, c)
  75. }
  76. // Bytes returns bytes of this buffer.
  77. func (b *CopyOnWriteBuffer) Bytes() []byte {
  78. return b.buffer
  79. }
  80. // IsCopied returns true if buffer has been copied, otherwise false.
  81. func (b *CopyOnWriteBuffer) IsCopied() bool {
  82. return b.copied
  83. }
  84. // IsEscapedPunctuation returns true if character at a given index i
  85. // is an escaped punctuation, otherwise false.
  86. func IsEscapedPunctuation(source []byte, i int) bool {
  87. return source[i] == '\\' && i < len(source)-1 && IsPunct(source[i+1])
  88. }
  89. // ReadWhile read the given source while pred is true.
  90. func ReadWhile(source []byte, index [2]int, pred func(byte) bool) (int, bool) {
  91. j := index[0]
  92. ok := false
  93. for ; j < index[1]; j++ {
  94. c1 := source[j]
  95. if pred(c1) {
  96. ok = true
  97. continue
  98. }
  99. break
  100. }
  101. return j, ok
  102. }
  103. // IsBlank returns true if the given string is all space characters.
  104. func IsBlank(bs []byte) bool {
  105. for _, b := range bs {
  106. if !IsSpace(b) {
  107. return false
  108. }
  109. }
  110. return true
  111. }
  112. // VisualizeSpaces visualize invisible space characters.
  113. func VisualizeSpaces(bs []byte) []byte {
  114. bs = bytes.Replace(bs, []byte(" "), []byte("[SPACE]"), -1)
  115. bs = bytes.Replace(bs, []byte("\t"), []byte("[TAB]"), -1)
  116. bs = bytes.Replace(bs, []byte("\n"), []byte("[NEWLINE]\n"), -1)
  117. bs = bytes.Replace(bs, []byte("\r"), []byte("[CR]"), -1)
  118. bs = bytes.Replace(bs, []byte("\v"), []byte("[VTAB]"), -1)
  119. bs = bytes.Replace(bs, []byte("\x00"), []byte("[NUL]"), -1)
  120. bs = bytes.Replace(bs, []byte("\ufffd"), []byte("[U+FFFD]"), -1)
  121. return bs
  122. }
  123. // TabWidth calculates actual width of a tab at the given position.
  124. func TabWidth(currentPos int) int {
  125. return 4 - currentPos%4
  126. }
  127. // IndentPosition searches an indent position with the given width for the given line.
  128. // If the line contains tab characters, paddings may be not zero.
  129. // currentPos==0 and width==2:
  130. //
  131. // position: 0 1
  132. // [TAB]aaaa
  133. // width: 1234 5678
  134. //
  135. // width=2 is in the tab character. In this case, IndentPosition returns
  136. // (pos=1, padding=2)
  137. func IndentPosition(bs []byte, currentPos, width int) (pos, padding int) {
  138. return IndentPositionPadding(bs, currentPos, 0, width)
  139. }
  140. // IndentPositionPadding searches an indent position with the given width for the given line.
  141. // This function is mostly same as IndentPosition except this function
  142. // takes account into additional paddings.
  143. func IndentPositionPadding(bs []byte, currentPos, paddingv, width int) (pos, padding int) {
  144. if width == 0 {
  145. return 0, paddingv
  146. }
  147. w := 0
  148. i := 0
  149. l := len(bs)
  150. for ; i < l; i++ {
  151. if bs[i] == '\t' && w < width {
  152. w += TabWidth(currentPos + w)
  153. } else if bs[i] == ' ' && w < width {
  154. w++
  155. } else {
  156. break
  157. }
  158. }
  159. if w >= width {
  160. return i - paddingv, w - width
  161. }
  162. return -1, -1
  163. }
  164. // DedentPosition dedents lines by the given width.
  165. //
  166. // Deprecated: This function has bugs. Use util.IndentPositionPadding and util.FirstNonSpacePosition.
  167. func DedentPosition(bs []byte, currentPos, width int) (pos, padding int) {
  168. if width == 0 {
  169. return 0, 0
  170. }
  171. w := 0
  172. l := len(bs)
  173. i := 0
  174. for ; i < l; i++ {
  175. if bs[i] == '\t' {
  176. w += TabWidth(currentPos + w)
  177. } else if bs[i] == ' ' {
  178. w++
  179. } else {
  180. break
  181. }
  182. }
  183. if w >= width {
  184. return i, w - width
  185. }
  186. return i, 0
  187. }
  188. // DedentPositionPadding dedents lines by the given width.
  189. // This function is mostly same as DedentPosition except this function
  190. // takes account into additional paddings.
  191. //
  192. // Deprecated: This function has bugs. Use util.IndentPositionPadding and util.FirstNonSpacePosition.
  193. func DedentPositionPadding(bs []byte, currentPos, paddingv, width int) (pos, padding int) {
  194. if width == 0 {
  195. return 0, paddingv
  196. }
  197. w := 0
  198. i := 0
  199. l := len(bs)
  200. for ; i < l; i++ {
  201. if bs[i] == '\t' {
  202. w += TabWidth(currentPos + w)
  203. } else if bs[i] == ' ' {
  204. w++
  205. } else {
  206. break
  207. }
  208. }
  209. if w >= width {
  210. return i - paddingv, w - width
  211. }
  212. return i - paddingv, 0
  213. }
  214. // IndentWidth calculate an indent width for the given line.
  215. func IndentWidth(bs []byte, currentPos int) (width, pos int) {
  216. l := len(bs)
  217. for i := 0; i < l; i++ {
  218. b := bs[i]
  219. if b == ' ' {
  220. width++
  221. pos++
  222. } else if b == '\t' {
  223. width += TabWidth(currentPos + width)
  224. pos++
  225. } else {
  226. break
  227. }
  228. }
  229. return
  230. }
  231. // FirstNonSpacePosition returns a position line that is a first nonspace
  232. // character.
  233. func FirstNonSpacePosition(bs []byte) int {
  234. i := 0
  235. for ; i < len(bs); i++ {
  236. c := bs[i]
  237. if c == ' ' || c == '\t' {
  238. continue
  239. }
  240. if c == '\n' {
  241. return -1
  242. }
  243. return i
  244. }
  245. return -1
  246. }
// FindClosure returns the index in bs of the closure byte that closes the
// given opener, or -1 if there is none.
//
// The nesting depth starts at 1, so the scan behaves as if one opener had
// already been consumed before bs[0].
// If codeSpan is true, bytes inside backtick code spans are ignored.
// If allowNesting is true, an opener found in bs increases the nesting depth
// and its matching closure is skipped; if it is false, finding an opener
// makes the function return -1.
// Backslash-escaped punctuation outside of code spans is skipped.
//
// Deprecated: This function can not handle newlines. Many elements
// can span multiple lines (e.g. link labels).
// Use text.Reader.FindClosure.
func FindClosure(bs []byte, opener, closure byte, codeSpan, allowNesting bool) int {
	i := 0
	opened := 1
	// Length of the backtick run that opened the current code span;
	// 0 while not inside a code span.
	codeSpanOpener := 0
	for i < len(bs) {
		c := bs[i]
		if codeSpan && codeSpanOpener != 0 && c == '`' {
			// Inside a code span: count this backtick run. The span ends
			// only if the run has the same length as the opening run.
			codeSpanCloser := 0
			for ; i < len(bs); i++ {
				if bs[i] == '`' {
					codeSpanCloser++
				} else {
					// Step back so that the i++ at the bottom of the outer
					// loop lands on this non-backtick byte.
					i--
					break
				}
			}
			if codeSpanCloser == codeSpanOpener {
				codeSpanOpener = 0
			}
		} else if codeSpanOpener == 0 && c == '\\' && i < len(bs)-1 && IsPunct(bs[i+1]) {
			// Escaped punctuation: skip the backslash and the escaped byte.
			i += 2
			continue
		} else if codeSpan && codeSpanOpener == 0 && c == '`' {
			// Start of a code span: remember the length of the backtick run.
			for ; i < len(bs); i++ {
				if bs[i] == '`' {
					codeSpanOpener++
				} else {
					i--
					break
				}
			}
		} else if (codeSpan && codeSpanOpener == 0) || !codeSpan {
			// Ordinary byte outside of any code span.
			if c == closure {
				opened--
				if opened == 0 {
					return i
				}
			} else if c == opener {
				if !allowNesting {
					return -1
				}
				opened++
			}
		}
		i++
	}
	return -1
}
  303. // TrimLeft trims characters in the given s from head of the source.
  304. // bytes.TrimLeft offers same functionalities, but bytes.TrimLeft
  305. // allocates new buffer for the result.
  306. func TrimLeft(source, b []byte) []byte {
  307. i := 0
  308. for ; i < len(source); i++ {
  309. c := source[i]
  310. found := false
  311. for j := 0; j < len(b); j++ {
  312. if c == b[j] {
  313. found = true
  314. break
  315. }
  316. }
  317. if !found {
  318. break
  319. }
  320. }
  321. return source[i:]
  322. }
  323. // TrimRight trims characters in the given s from tail of the source.
  324. func TrimRight(source, b []byte) []byte {
  325. i := len(source) - 1
  326. for ; i >= 0; i-- {
  327. c := source[i]
  328. found := false
  329. for j := 0; j < len(b); j++ {
  330. if c == b[j] {
  331. found = true
  332. break
  333. }
  334. }
  335. if !found {
  336. break
  337. }
  338. }
  339. return source[:i+1]
  340. }
  341. // TrimLeftLength returns a length of leading specified characters.
  342. func TrimLeftLength(source, s []byte) int {
  343. return len(source) - len(TrimLeft(source, s))
  344. }
  345. // TrimRightLength returns a length of trailing specified characters.
  346. func TrimRightLength(source, s []byte) int {
  347. return len(source) - len(TrimRight(source, s))
  348. }
  349. // TrimLeftSpaceLength returns a length of leading space characters.
  350. func TrimLeftSpaceLength(source []byte) int {
  351. i := 0
  352. for ; i < len(source); i++ {
  353. if !IsSpace(source[i]) {
  354. break
  355. }
  356. }
  357. return i
  358. }
  359. // TrimRightSpaceLength returns a length of trailing space characters.
  360. func TrimRightSpaceLength(source []byte) int {
  361. l := len(source)
  362. i := l - 1
  363. for ; i >= 0; i-- {
  364. if !IsSpace(source[i]) {
  365. break
  366. }
  367. }
  368. if i < 0 {
  369. return l
  370. }
  371. return l - 1 - i
  372. }
  373. // TrimLeftSpace returns a subslice of the given string by slicing off all leading
  374. // space characters.
  375. func TrimLeftSpace(source []byte) []byte {
  376. return TrimLeft(source, spaces)
  377. }
  378. // TrimRightSpace returns a subslice of the given string by slicing off all trailing
  379. // space characters.
  380. func TrimRightSpace(source []byte) []byte {
  381. return TrimRight(source, spaces)
  382. }
  383. // DoFullUnicodeCaseFolding performs full unicode case folding to given bytes.
  384. func DoFullUnicodeCaseFolding(v []byte) []byte {
  385. var rbuf []byte
  386. cob := NewCopyOnWriteBuffer(v)
  387. n := 0
  388. for i := 0; i < len(v); i++ {
  389. c := v[i]
  390. if c < 0xb5 {
  391. if c >= 0x41 && c <= 0x5a {
  392. // A-Z to a-z
  393. cob.Write(v[n:i])
  394. cob.WriteByte(c + 32)
  395. n = i + 1
  396. }
  397. continue
  398. }
  399. if !utf8.RuneStart(c) {
  400. continue
  401. }
  402. r, length := utf8.DecodeRune(v[i:])
  403. if r == utf8.RuneError {
  404. continue
  405. }
  406. folded, ok := unicodeCaseFoldings[r]
  407. if !ok {
  408. continue
  409. }
  410. cob.Write(v[n:i])
  411. if rbuf == nil {
  412. rbuf = make([]byte, 4)
  413. }
  414. for _, f := range folded {
  415. l := utf8.EncodeRune(rbuf, f)
  416. cob.Write(rbuf[:l])
  417. }
  418. i += length - 1
  419. n = i + 1
  420. }
  421. if cob.IsCopied() {
  422. cob.Write(v[n:])
  423. }
  424. return cob.Bytes()
  425. }
  426. // ReplaceSpaces replaces sequence of spaces with the given repl.
  427. func ReplaceSpaces(source []byte, repl byte) []byte {
  428. var ret []byte
  429. start := -1
  430. for i, c := range source {
  431. iss := IsSpace(c)
  432. if start < 0 && iss {
  433. start = i
  434. continue
  435. } else if start >= 0 && iss {
  436. continue
  437. } else if start >= 0 {
  438. if ret == nil {
  439. ret = make([]byte, 0, len(source))
  440. ret = append(ret, source[:start]...)
  441. }
  442. ret = append(ret, repl)
  443. start = -1
  444. }
  445. if ret != nil {
  446. ret = append(ret, c)
  447. }
  448. }
  449. if start >= 0 && ret != nil {
  450. ret = append(ret, repl)
  451. }
  452. if ret == nil {
  453. return source
  454. }
  455. return ret
  456. }
  457. // ToRune decode given bytes start at pos and returns a rune.
  458. func ToRune(source []byte, pos int) rune {
  459. i := pos
  460. for ; i >= 0; i-- {
  461. if utf8.RuneStart(source[i]) {
  462. break
  463. }
  464. }
  465. r, _ := utf8.DecodeRune(source[i:])
  466. return r
  467. }
  468. // ToValidRune returns 0xFFFD if the given rune is invalid, otherwise v.
  469. func ToValidRune(v rune) rune {
  470. if v == 0 || !utf8.ValidRune(v) {
  471. return rune(0xFFFD)
  472. }
  473. return v
  474. }
  475. // ToLinkReference converts given bytes into a valid link reference string.
  476. // ToLinkReference performs unicode case folding, trims leading and trailing spaces, converts into lower
  477. // case and replace spaces with a single space character.
  478. func ToLinkReference(v []byte) string {
  479. v = TrimLeftSpace(v)
  480. v = TrimRightSpace(v)
  481. v = DoFullUnicodeCaseFolding(v)
  482. return string(ReplaceSpaces(v, ' '))
  483. }
  484. var htmlEscapeTable = [256][]byte{nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, []byte("&quot;"), nil, nil, nil, []byte("&amp;"), nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, []byte("&lt;"), nil, []byte("&gt;"), nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil}
  485. // EscapeHTMLByte returns HTML escaped bytes if the given byte should be escaped,
  486. // otherwise nil.
  487. func EscapeHTMLByte(b byte) []byte {
  488. return htmlEscapeTable[b]
  489. }
  490. // EscapeHTML escapes characters that should be escaped in HTML text.
  491. func EscapeHTML(v []byte) []byte {
  492. cob := NewCopyOnWriteBuffer(v)
  493. n := 0
  494. for i := 0; i < len(v); i++ {
  495. c := v[i]
  496. escaped := htmlEscapeTable[c]
  497. if escaped != nil {
  498. cob.Write(v[n:i])
  499. cob.Write(escaped)
  500. n = i + 1
  501. }
  502. }
  503. if cob.IsCopied() {
  504. cob.Write(v[n:])
  505. }
  506. return cob.Bytes()
  507. }
  508. // UnescapePunctuations unescapes blackslash escaped punctuations.
  509. func UnescapePunctuations(source []byte) []byte {
  510. cob := NewCopyOnWriteBuffer(source)
  511. limit := len(source)
  512. n := 0
  513. for i := 0; i < limit; {
  514. c := source[i]
  515. if i < limit-1 && c == '\\' && IsPunct(source[i+1]) {
  516. cob.Write(source[n:i])
  517. cob.WriteByte(source[i+1])
  518. i += 2
  519. n = i
  520. continue
  521. }
  522. i++
  523. }
  524. if cob.IsCopied() {
  525. cob.Write(source[n:])
  526. }
  527. return cob.Bytes()
  528. }
  529. // ResolveNumericReferences resolve numeric references like '&#1234;" .
  530. func ResolveNumericReferences(source []byte) []byte {
  531. cob := NewCopyOnWriteBuffer(source)
  532. buf := make([]byte, 6, 6)
  533. limit := len(source)
  534. ok := false
  535. n := 0
  536. for i := 0; i < limit; i++ {
  537. if source[i] == '&' {
  538. pos := i
  539. next := i + 1
  540. if next < limit && source[next] == '#' {
  541. nnext := next + 1
  542. if nnext < limit {
  543. nc := source[nnext]
  544. // code point like #x22;
  545. if nnext < limit && nc == 'x' || nc == 'X' {
  546. start := nnext + 1
  547. i, ok = ReadWhile(source, [2]int{start, limit}, IsHexDecimal)
  548. if ok && i < limit && source[i] == ';' {
  549. v, _ := strconv.ParseUint(BytesToReadOnlyString(source[start:i]), 16, 32)
  550. cob.Write(source[n:pos])
  551. n = i + 1
  552. runeSize := utf8.EncodeRune(buf, ToValidRune(rune(v)))
  553. cob.Write(buf[:runeSize])
  554. continue
  555. }
  556. // code point like #1234;
  557. } else if nc >= '0' && nc <= '9' {
  558. start := nnext
  559. i, ok = ReadWhile(source, [2]int{start, limit}, IsNumeric)
  560. if ok && i < limit && i-start < 8 && source[i] == ';' {
  561. v, _ := strconv.ParseUint(BytesToReadOnlyString(source[start:i]), 0, 32)
  562. cob.Write(source[n:pos])
  563. n = i + 1
  564. runeSize := utf8.EncodeRune(buf, ToValidRune(rune(v)))
  565. cob.Write(buf[:runeSize])
  566. continue
  567. }
  568. }
  569. }
  570. }
  571. i = next - 1
  572. }
  573. }
  574. if cob.IsCopied() {
  575. cob.Write(source[n:])
  576. }
  577. return cob.Bytes()
  578. }
  579. // ResolveEntityNames resolve entity references like '&ouml;" .
  580. func ResolveEntityNames(source []byte) []byte {
  581. cob := NewCopyOnWriteBuffer(source)
  582. limit := len(source)
  583. ok := false
  584. n := 0
  585. for i := 0; i < limit; i++ {
  586. if source[i] == '&' {
  587. pos := i
  588. next := i + 1
  589. if !(next < limit && source[next] == '#') {
  590. start := next
  591. i, ok = ReadWhile(source, [2]int{start, limit}, IsAlphaNumeric)
  592. if ok && i < limit && source[i] == ';' {
  593. name := BytesToReadOnlyString(source[start:i])
  594. entity, ok := LookUpHTML5EntityByName(name)
  595. if ok {
  596. cob.Write(source[n:pos])
  597. n = i + 1
  598. cob.Write(entity.Characters)
  599. continue
  600. }
  601. }
  602. }
  603. i = next - 1
  604. }
  605. }
  606. if cob.IsCopied() {
  607. cob.Write(source[n:])
  608. }
  609. return cob.Bytes()
  610. }
  611. var htmlSpace = []byte("%20")
  612. // URLEscape escape the given URL.
  613. // If resolveReference is set true:
  614. // 1. unescape punctuations
  615. // 2. resolve numeric references
  616. // 3. resolve entity references
  617. //
  618. // URL encoded values (%xx) are kept as is.
  619. func URLEscape(v []byte, resolveReference bool) []byte {
  620. if resolveReference {
  621. v = UnescapePunctuations(v)
  622. v = ResolveNumericReferences(v)
  623. v = ResolveEntityNames(v)
  624. }
  625. cob := NewCopyOnWriteBuffer(v)
  626. limit := len(v)
  627. n := 0
  628. for i := 0; i < limit; {
  629. c := v[i]
  630. if urlEscapeTable[c] == 1 {
  631. i++
  632. continue
  633. }
  634. if c == '%' && i+2 < limit && IsHexDecimal(v[i+1]) && IsHexDecimal(v[i+1]) {
  635. i += 3
  636. continue
  637. }
  638. u8len := utf8lenTable[c]
  639. if u8len == 99 { // invalid utf8 leading byte, skip it
  640. i++
  641. continue
  642. }
  643. if c == ' ' {
  644. cob.Write(v[n:i])
  645. cob.Write(htmlSpace)
  646. i++
  647. n = i
  648. continue
  649. }
  650. if int(u8len) > len(v) {
  651. u8len = int8(len(v) - 1)
  652. }
  653. if u8len == 0 {
  654. i++
  655. n = i
  656. continue
  657. }
  658. cob.Write(v[n:i])
  659. stop := i + int(u8len)
  660. if stop > len(v) {
  661. i++
  662. n = i
  663. continue
  664. }
  665. cob.Write(StringToReadOnlyBytes(url.QueryEscape(string(v[i:stop]))))
  666. i += int(u8len)
  667. n = i
  668. }
  669. if cob.IsCopied() && n < limit {
  670. cob.Write(v[n:])
  671. }
  672. return cob.Bytes()
  673. }
  674. // FindURLIndex returns a stop index value if the given bytes seem an URL.
  675. // This function is equivalent to [A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]* .
  676. func FindURLIndex(b []byte) int {
  677. i := 0
  678. if !(len(b) > 0 && urlTable[b[i]]&7 == 7) {
  679. return -1
  680. }
  681. i++
  682. for ; i < len(b); i++ {
  683. c := b[i]
  684. if urlTable[c]&4 != 4 {
  685. break
  686. }
  687. }
  688. if i == 1 || i > 33 || i >= len(b) {
  689. return -1
  690. }
  691. if b[i] != ':' {
  692. return -1
  693. }
  694. i++
  695. for ; i < len(b); i++ {
  696. c := b[i]
  697. if urlTable[c]&1 != 1 {
  698. break
  699. }
  700. }
  701. return i
  702. }
  703. var emailDomainRegexp = regexp.MustCompile(`^[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*`)
  704. // FindEmailIndex returns a stop index value if the given bytes seem an email address.
  705. func FindEmailIndex(b []byte) int {
  706. // TODO: eliminate regexps
  707. i := 0
  708. for ; i < len(b); i++ {
  709. c := b[i]
  710. if emailTable[c]&1 != 1 {
  711. break
  712. }
  713. }
  714. if i == 0 {
  715. return -1
  716. }
  717. if i >= len(b) || b[i] != '@' {
  718. return -1
  719. }
  720. i++
  721. if i >= len(b) {
  722. return -1
  723. }
  724. match := emailDomainRegexp.FindSubmatchIndex(b[i:])
  725. if match == nil {
  726. return -1
  727. }
  728. return i + match[1]
  729. }
// spaces is the set of bytes stripped by TrimLeftSpace and TrimRightSpace:
// space, tab, line feed, vertical tab, form feed and carriage return.
// NOTE(review): spaceTable (used by IsSpace) appears to flag only tab, line
// feed, carriage return and space — confirm whether the difference is
// intended.
var spaces = []byte(" \t\n\x0b\x0c\x0d")
  731. var spaceTable = [256]int8{0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
  732. var punctTable = [256]int8{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
  733. // a-zA-Z0-9, ;/?:@&=+$,-_.!~*'()#
  734. var urlEscapeTable = [256]int8{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
  735. var utf8lenTable = [256]int8{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 99, 99, 99, 99, 99, 99, 99, 99}
  736. var urlTable = [256]uint8{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 1, 0, 1, 1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 1, 1, 1, 1, 1, 1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}
  737. var emailTable = [256]uint8{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
  738. // UTF8Len returns a byte length of the utf-8 character.
  739. func UTF8Len(b byte) int8 {
  740. return utf8lenTable[b]
  741. }
  742. // IsPunct returns true if the given character is a punctuation, otherwise false.
  743. func IsPunct(c byte) bool {
  744. return punctTable[c] == 1
  745. }
  746. // IsPunctRune returns true if the given rune is a punctuation, otherwise false.
  747. func IsPunctRune(r rune) bool {
  748. return int32(r) <= 256 && IsPunct(byte(r)) || unicode.IsPunct(r)
  749. }
  750. // IsSpace returns true if the given character is a space, otherwise false.
  751. func IsSpace(c byte) bool {
  752. return spaceTable[c] == 1
  753. }
  754. // IsSpaceRune returns true if the given rune is a space, otherwise false.
  755. func IsSpaceRune(r rune) bool {
  756. return int32(r) <= 256 && IsSpace(byte(r)) || unicode.IsSpace(r)
  757. }
  758. // IsNumeric returns true if the given character is a numeric, otherwise false.
  759. func IsNumeric(c byte) bool {
  760. return c >= '0' && c <= '9'
  761. }
  762. // IsHexDecimal returns true if the given character is a hexdecimal, otherwise false.
  763. func IsHexDecimal(c byte) bool {
  764. return c >= '0' && c <= '9' || c >= 'a' && c <= 'f' || c >= 'A' && c <= 'F'
  765. }
  // IsAlphaNumeric reports whether the byte c is an ASCII letter ('a'-'z',
  // 'A'-'Z') or an ASCII decimal digit ('0'-'9').
  func IsAlphaNumeric(c byte) bool {
  	switch {
  	case 'a' <= c && c <= 'z',
  		'A' <= c && c <= 'Z',
  		'0' <= c && c <= '9':
  		return true
  	}
  	return false
  }
  // IsEastAsianWideRune returns true if the given rune is an east asian wide
  // character, otherwise false. A rune qualifies when it belongs to any of the
  // Hiragana, Katakana, Han or Hangul scripts, or to the Lm (modifier letter)
  // category.
  func IsEastAsianWideRune(r rune) bool {
  	return unicode.In(r,
  		unicode.Hiragana,
  		unicode.Katakana,
  		unicode.Han,
  		unicode.Lm,
  		unicode.Hangul,
  	)
  }
  // BufWriter describes a subset of the method set of bufio.Writer, so code can
  // depend on these operations without naming the concrete type.
  type BufWriter interface {
  	io.Writer

  	// Buffer state queries.
  	Buffered() int
  	Available() int

  	// Write helpers in addition to io.Writer's Write.
  	WriteByte(c byte) error
  	WriteRune(r rune) (size int, err error)
  	WriteString(s string) (int, error)

  	// Flush has the same signature as bufio.Writer.Flush.
  	Flush() error
  }
  type (
  	// PrioritizedValue pairs an arbitrary value with an integer priority.
  	PrioritizedValue struct {
  		// Value is the payload being prioritized; it may be of any type.
  		Value interface{}
  		// Priority is the priority assigned to Value.
  		Priority int
  	}

  	// PrioritizedSlice is a slice of PrioritizedValue elements.
  	PrioritizedSlice []PrioritizedValue
  )
  797. // Sort sorts the PrioritizedSlice in ascending order.
  798. func (s PrioritizedSlice) Sort() {
  799. sort.Slice(s, func(i, j int) bool {
  800. return s[i].Priority < s[j].Priority
  801. })
  802. }
  803. // Remove removes the given value from this slice.
  804. func (s PrioritizedSlice) Remove(v interface{}) PrioritizedSlice {
  805. i := 0
  806. found := false
  807. for ; i < len(s); i++ {
  808. if s[i].Value == v {
  809. found = true
  810. break
  811. }
  812. }
  813. if !found {
  814. return s
  815. }
  816. return append(s[:i], s[i+1:]...)
  817. }
  818. // Prioritized returns a new PrioritizedValue.
  819. func Prioritized(v interface{}, priority int) PrioritizedValue {
  820. return PrioritizedValue{v, priority}
  821. }
  // bytesHash computes a 64-bit hash of b: starting from 5381, each byte
  // updates the state as state*33 + byte, with uint64 wrap-around on overflow.
  func bytesHash(b []byte) uint64 {
  	h := uint64(5381)
  	for i := 0; i < len(b); i++ {
  		h = h*33 + uint64(b[i])
  	}
  	return h
  }
  // BytesFilter is an efficient set-like data structure for checking whether a
  // given byte sequence has been registered.
  //
  // NOTE(review): this type was previously documented as thread-safe, but the
  // bytesFilter implementation in this file takes no lock in Add; confirm
  // before sharing a filter that is still being modified between goroutines.
  type BytesFilter interface {
  	// Add registers the given bytes in this set.
  	Add([]byte)

  	// Contains returns true if this set contains the given bytes, otherwise
  	// false.
  	Contains([]byte) bool

  	// Extend copies this filter, adds the given byte sequences to the copy
  	// and returns the copy as a new filter.
  	Extend(...[]byte) BytesFilter
  }
  // bytesFilter is the BytesFilter implementation returned by NewBytesFilter.
  type bytesFilter struct {
  	// chars is a per-byte bitmask: bit i of chars[c] is set once an added
  	// element has byte c at index i (only indexes below threshold are
  	// recorded).
  	chars [256]uint8
  	// threshold is the number of leading bytes of an element that are
  	// recorded in, and checked against, chars.
  	threshold int
  	// slots holds the added elements in buckets; an element is stored in the
  	// bucket at index bytesHash(element) % len(slots).
  	slots [][][]byte
  }
  844. // NewBytesFilter returns a new BytesFilter.
  845. func NewBytesFilter(elements ...[]byte) BytesFilter {
  846. s := &bytesFilter{
  847. threshold: 3,
  848. slots: make([][][]byte, 64),
  849. }
  850. for _, element := range elements {
  851. s.Add(element)
  852. }
  853. return s
  854. }
  855. func (s *bytesFilter) Add(b []byte) {
  856. l := len(b)
  857. m := s.threshold
  858. if l < s.threshold {
  859. m = l
  860. }
  861. for i := 0; i < m; i++ {
  862. s.chars[b[i]] |= 1 << uint8(i)
  863. }
  864. h := bytesHash(b) % uint64(len(s.slots))
  865. slot := s.slots[h]
  866. if slot == nil {
  867. slot = [][]byte{}
  868. }
  869. s.slots[h] = append(slot, b)
  870. }
  871. func (s *bytesFilter) Extend(bs ...[]byte) BytesFilter {
  872. newFilter := NewBytesFilter().(*bytesFilter)
  873. newFilter.chars = s.chars
  874. newFilter.threshold = s.threshold
  875. for k, v := range s.slots {
  876. newSlot := make([][]byte, len(v))
  877. copy(newSlot, v)
  878. newFilter.slots[k] = v
  879. }
  880. for _, b := range bs {
  881. newFilter.Add(b)
  882. }
  883. return newFilter
  884. }
  885. func (s *bytesFilter) Contains(b []byte) bool {
  886. l := len(b)
  887. m := s.threshold
  888. if l < s.threshold {
  889. m = l
  890. }
  891. for i := 0; i < m; i++ {
  892. if (s.chars[b[i]] & (1 << uint8(i))) == 0 {
  893. return false
  894. }
  895. }
  896. h := bytesHash(b) % uint64(len(s.slots))
  897. slot := s.slots[h]
  898. if slot == nil || len(slot) == 0 {
  899. return false
  900. }
  901. for _, element := range slot {
  902. if bytes.Equal(element, b) {
  903. return true
  904. }
  905. }
  906. return false
  907. }