m4.go 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338
  1. package matchfinder
  2. import (
  3. "bytes"
  4. "encoding/binary"
  5. "math/bits"
  6. "runtime"
  7. )
  8. // M4 is an implementation of the MatchFinder
  9. // interface that uses a hash table to find matches,
  10. // optional match chains,
  11. // and the advanced parsing technique from
  12. // https://fastcompression.blogspot.com/2011/12/advanced-parsing-strategies.html.
  13. type M4 struct {
  14. // MaxDistance is the maximum distance (in bytes) to look back for
  15. // a match. The default is 65535.
  16. MaxDistance int
  17. // MinLength is the length of the shortest match to return.
  18. // The default is 4.
  19. MinLength int
  20. // HashLen is the number of bytes to use to calculate the hashes.
  21. // The maximum is 8 and the default is 6.
  22. HashLen int
  23. // TableBits is the number of bits in the hash table indexes.
  24. // The default is 17 (128K entries).
  25. TableBits int
  26. // ChainLength is how many entries to search on the "match chain" of older
  27. // locations with the same hash as the current location.
  28. ChainLength int
  29. // DistanceBitCost is used when comparing two matches to see
  30. // which is better. The comparison is primarily based on the length
  31. // of the matches, but it can also take the distance into account,
  32. // in terms of the number of bits needed to represent the distance.
  33. // One byte of length is given a score of 256, so 32 (256/8) would
  34. // be a reasonable first guess for the value of one bit.
  35. // (The default is 0, which bases the comparison solely on length.)
  36. DistanceBitCost int
  37. table []uint32
  38. chain []uint32
  39. history []byte
  40. }
  41. func (q *M4) Reset() {
  42. for i := range q.table {
  43. q.table[i] = 0
  44. }
  45. q.history = q.history[:0]
  46. q.chain = q.chain[:0]
  47. }
  48. func (q *M4) score(m absoluteMatch) int {
  49. return (m.End-m.Start)*256 + (bits.LeadingZeros32(uint32(m.Start-m.Match))-32)*q.DistanceBitCost
  50. }
  51. func (q *M4) FindMatches(dst []Match, src []byte) []Match {
  52. if q.MaxDistance == 0 {
  53. q.MaxDistance = 65535
  54. }
  55. if q.MinLength == 0 {
  56. q.MinLength = 4
  57. }
  58. if q.HashLen == 0 {
  59. q.HashLen = 6
  60. }
  61. if q.TableBits == 0 {
  62. q.TableBits = 17
  63. }
  64. if len(q.table) < 1<<q.TableBits {
  65. q.table = make([]uint32, 1<<q.TableBits)
  66. }
  67. e := matchEmitter{Dst: dst}
  68. if len(q.history) > q.MaxDistance*2 {
  69. // Trim down the history buffer.
  70. delta := len(q.history) - q.MaxDistance
  71. copy(q.history, q.history[delta:])
  72. q.history = q.history[:q.MaxDistance]
  73. if q.ChainLength > 0 {
  74. q.chain = q.chain[:q.MaxDistance]
  75. }
  76. for i, v := range q.table {
  77. newV := int(v) - delta
  78. if newV < 0 {
  79. newV = 0
  80. }
  81. q.table[i] = uint32(newV)
  82. }
  83. }
  84. // Append src to the history buffer.
  85. e.NextEmit = len(q.history)
  86. q.history = append(q.history, src...)
  87. if q.ChainLength > 0 {
  88. q.chain = append(q.chain, make([]uint32, len(src))...)
  89. }
  90. src = q.history
  91. // matches stores the matches that have been found but not emitted,
  92. // in reverse order. (matches[0] is the most recent one.)
  93. var matches [3]absoluteMatch
  94. for i := e.NextEmit; i < len(src)-7; i++ {
  95. if matches[0] != (absoluteMatch{}) && i >= matches[0].End {
  96. // We have found some matches, and we're far enough along that we probably
  97. // won't find overlapping matches, so we might as well emit them.
  98. if matches[1] != (absoluteMatch{}) {
  99. if matches[1].End > matches[0].Start {
  100. matches[1].End = matches[0].Start
  101. }
  102. if matches[1].End-matches[1].Start >= q.MinLength && q.score(matches[1]) > 0 {
  103. e.emit(matches[1])
  104. }
  105. }
  106. e.emit(matches[0])
  107. matches = [3]absoluteMatch{}
  108. }
  109. // Look for a repeat match one byte after the current position.
  110. if matches[0] == (absoluteMatch{}) && len(e.Dst) > 0 {
  111. prevDistance := e.Dst[len(e.Dst)-1].Distance
  112. if binary.LittleEndian.Uint32(src[i+1:]) == binary.LittleEndian.Uint32(src[i+1-prevDistance:]) {
  113. // We have a 4-byte match.
  114. m := extendMatch2(src, i+1, i+1-prevDistance, e.NextEmit+1)
  115. if m.End-m.Start >= q.MinLength {
  116. matches[0] = m
  117. }
  118. }
  119. }
  120. // Calculate and store the hash.
  121. h := ((binary.LittleEndian.Uint64(src[i:]) & (1<<(8*q.HashLen) - 1)) * hashMul64) >> (64 - q.TableBits)
  122. candidate := int(q.table[h])
  123. q.table[h] = uint32(i)
  124. if q.ChainLength > 0 && candidate != 0 {
  125. delta := i - candidate
  126. q.chain[i] = uint32(delta)
  127. }
  128. if i < matches[0].End && i != matches[0].End+2-q.HashLen {
  129. continue
  130. }
  131. if candidate == 0 || i-candidate > q.MaxDistance {
  132. continue
  133. }
  134. // Look for a match.
  135. var currentMatch absoluteMatch
  136. if binary.LittleEndian.Uint32(src[candidate:]) == binary.LittleEndian.Uint32(src[i:]) {
  137. m := extendMatch2(src, i, candidate, e.NextEmit)
  138. if m.End-m.Start > q.MinLength && q.score(m) > 0 {
  139. currentMatch = m
  140. }
  141. }
  142. for j := 0; j < q.ChainLength; j++ {
  143. delta := q.chain[candidate]
  144. if delta == 0 {
  145. break
  146. }
  147. candidate -= int(delta)
  148. if candidate <= 0 || i-candidate > q.MaxDistance {
  149. break
  150. }
  151. if binary.LittleEndian.Uint32(src[candidate:]) == binary.LittleEndian.Uint32(src[i:]) {
  152. m := extendMatch2(src, i, candidate, e.NextEmit)
  153. if m.End-m.Start > q.MinLength && q.score(m) > q.score(currentMatch) {
  154. currentMatch = m
  155. }
  156. }
  157. }
  158. if currentMatch.End-currentMatch.Start < q.MinLength {
  159. continue
  160. }
  161. overlapPenalty := 0
  162. if matches[0] != (absoluteMatch{}) {
  163. overlapPenalty = 275
  164. if currentMatch.Start <= matches[1].End {
  165. // This match would completely replace the previous match,
  166. // so there is no penalty for overlap.
  167. overlapPenalty = 0
  168. }
  169. }
  170. if q.score(currentMatch) <= q.score(matches[0])+overlapPenalty {
  171. continue
  172. }
  173. matches = [3]absoluteMatch{
  174. currentMatch,
  175. matches[0],
  176. matches[1],
  177. }
  178. if matches[2] == (absoluteMatch{}) {
  179. continue
  180. }
  181. // We have three matches, so it's time to emit one and/or eliminate one.
  182. switch {
  183. case matches[0].Start < matches[2].End:
  184. // The first and third matches overlap; discard the one in between.
  185. matches = [3]absoluteMatch{
  186. matches[0],
  187. matches[2],
  188. absoluteMatch{},
  189. }
  190. case matches[0].Start < matches[2].End+q.MinLength:
  191. // The first and third matches don't overlap, but there's no room for
  192. // another match between them. Emit the first match and discard the second.
  193. e.emit(matches[2])
  194. matches = [3]absoluteMatch{
  195. matches[0],
  196. absoluteMatch{},
  197. absoluteMatch{},
  198. }
  199. default:
  200. // Emit the first match, shortening it if necessary to avoid overlap with the second.
  201. if matches[2].End > matches[1].Start {
  202. matches[2].End = matches[1].Start
  203. if q.ChainLength > 0 && matches[2].End-matches[2].Start >= q.MinLength {
  204. // Since the match length was trimmed, we may be able to find a closer match
  205. // to replace it.
  206. pos := matches[2].Start
  207. for {
  208. delta := int(q.chain[pos])
  209. if delta == 0 {
  210. break
  211. }
  212. pos -= delta
  213. if pos <= matches[2].Match {
  214. break
  215. }
  216. if bytes.Equal(src[matches[2].Start:matches[2].End], src[pos:pos+matches[2].End-matches[2].Start]) {
  217. matches[2].Match = pos
  218. break
  219. }
  220. }
  221. }
  222. }
  223. if matches[2].End-matches[2].Start >= q.MinLength && q.score(matches[2]) > 0 {
  224. e.emit(matches[2])
  225. }
  226. matches[2] = absoluteMatch{}
  227. }
  228. }
  229. // We've found all the matches now; emit the remaining ones.
  230. if matches[1] != (absoluteMatch{}) {
  231. if matches[1].End > matches[0].Start {
  232. matches[1].End = matches[0].Start
  233. }
  234. if matches[1].End-matches[1].Start >= q.MinLength && q.score(matches[1]) > 0 {
  235. e.emit(matches[1])
  236. }
  237. }
  238. if matches[0] != (absoluteMatch{}) {
  239. e.emit(matches[0])
  240. }
  241. dst = e.Dst
  242. if e.NextEmit < len(src) {
  243. dst = append(dst, Match{
  244. Unmatched: len(src) - e.NextEmit,
  245. })
  246. }
  247. return dst
  248. }
  249. const hashMul64 = 0x1E35A7BD1E35A7BD
  250. // extendMatch returns the largest k such that k <= len(src) and that
  251. // src[i:i+k-j] and src[j:k] have the same contents.
  252. //
  253. // It assumes that:
  254. //
  255. // 0 <= i && i < j && j <= len(src)
  256. func extendMatch(src []byte, i, j int) int {
  257. switch runtime.GOARCH {
  258. case "amd64", "arm64":
  259. // As long as we are 8 or more bytes before the end of src, we can load and
  260. // compare 8 bytes at a time. If those 8 bytes are equal, repeat.
  261. for j+8 < len(src) {
  262. iBytes := binary.LittleEndian.Uint64(src[i:])
  263. jBytes := binary.LittleEndian.Uint64(src[j:])
  264. if iBytes != jBytes {
  265. // If those 8 bytes were not equal, XOR the two 8 byte values, and return
  266. // the index of the first byte that differs. The BSF instruction finds the
  267. // least significant 1 bit, the amd64 architecture is little-endian, and
  268. // the shift by 3 converts a bit index to a byte index.
  269. return j + bits.TrailingZeros64(iBytes^jBytes)>>3
  270. }
  271. i, j = i+8, j+8
  272. }
  273. case "386":
  274. // On a 32-bit CPU, we do it 4 bytes at a time.
  275. for j+4 < len(src) {
  276. iBytes := binary.LittleEndian.Uint32(src[i:])
  277. jBytes := binary.LittleEndian.Uint32(src[j:])
  278. if iBytes != jBytes {
  279. return j + bits.TrailingZeros32(iBytes^jBytes)>>3
  280. }
  281. i, j = i+4, j+4
  282. }
  283. }
  284. for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 {
  285. }
  286. return j
  287. }
  288. // Given a 4-byte match at src[start] and src[candidate], extendMatch2 extends it
  289. // upward as far as possible, and downward no farther than to min.
  290. func extendMatch2(src []byte, start, candidate, min int) absoluteMatch {
  291. end := extendMatch(src, candidate+4, start+4)
  292. for start > min && candidate > 0 && src[start-1] == src[candidate-1] {
  293. start--
  294. candidate--
  295. }
  296. return absoluteMatch{
  297. Start: start,
  298. End: end,
  299. Match: candidate,
  300. }
  301. }