| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328 |
- package matchfinder
- import (
- "encoding/binary"
- "math"
- "math/bits"
- "slices"
- )
- // Pathfinder is a MatchFinder that uses hash chains to find matches, and a
- // shortest-path optimizer to choose which matches to use.
- type Pathfinder struct {
- // MaxDistance is the maximum distance (in bytes) to look back for
- // a match. The default is 65535.
- MaxDistance int
- // MinLength is the length of the shortest match to return.
- // The default is 4.
- MinLength int
- // HashLen is the number of bytes to use to calculate the hashes.
- // The maximum is 8 and the default is 6.
- HashLen int
- // TableBits is the number of bits in the hash table indexes.
- // The default is 17 (128K entries).
- TableBits int
- // ChainLength is how many entries to search on the "match chain" of older
- // locations with the same hash as the current location.
- ChainLength int
- table []uint32
- chain []uint32
- history []byte
- // holding onto buffers to reduce allocations:
- arrivals []arrival
- foundMatches []absoluteMatch
- matches []Match
- }
- func (q *Pathfinder) Reset() {
- for i := range q.table {
- q.table[i] = 0
- }
- q.history = q.history[:0]
- q.chain = q.chain[:0]
- }
- // An arrival represents how we got to a certain byte position.
- // The cost is the total cost to get there from the beginning of the block.
- // If distance > 0, the arrival is with a match.
- // If distance == 0, the arrival is with a run of literals.
- type arrival struct {
- length uint32
- distance uint32
- cost float32
- }
- const (
- baseMatchCost float32 = 4
- )
- func (q *Pathfinder) FindMatches(dst []Match, src []byte) []Match {
- if q.MaxDistance == 0 {
- q.MaxDistance = 65535
- }
- if q.MinLength == 0 {
- q.MinLength = 4
- }
- if q.HashLen == 0 {
- q.HashLen = 6
- }
- if q.TableBits == 0 {
- q.TableBits = 17
- }
- if len(q.table) < 1<<q.TableBits {
- q.table = make([]uint32, 1<<q.TableBits)
- }
- var histogram [256]uint32
- for _, b := range src {
- histogram[b]++
- }
- var byteCost [256]float32
- for b, n := range histogram {
- cost := max(math.Log2(float64(len(src))/float64(n)), 1)
- byteCost[b] = float32(cost)
- }
- // Each element in arrivals corresponds to the position just after
- // the corresponding byte in src.
- arrivals := q.arrivals
- if len(arrivals) < len(src) {
- arrivals = make([]arrival, len(src))
- q.arrivals = arrivals
- } else {
- arrivals = arrivals[:len(src)]
- for i := range arrivals {
- arrivals[i] = arrival{}
- }
- }
- if len(q.history) > q.MaxDistance*2 {
- // Trim down the history buffer.
- delta := len(q.history) - q.MaxDistance
- copy(q.history, q.history[delta:])
- q.history = q.history[:q.MaxDistance]
- q.chain = q.chain[:q.MaxDistance]
- for i, v := range q.table {
- newV := max(int(v)-delta, 0)
- q.table[i] = uint32(newV)
- }
- }
- // Append src to the history buffer.
- historyLen := len(q.history)
- q.history = append(q.history, src...)
- q.chain = append(q.chain, make([]uint32, len(src))...)
- src = q.history
- // Calculate hashes and build the chain.
- for i := historyLen; i < len(src)-7; i++ {
- h := ((binary.LittleEndian.Uint64(src[i:]) & (1<<(8*q.HashLen) - 1)) * hashMul64) >> (64 - q.TableBits)
- candidate := int(q.table[h])
- q.table[h] = uint32(i)
- if candidate != 0 {
- delta := i - candidate
- q.chain[i] = uint32(delta)
- }
- }
- // Look for matches, and collect them in foundMatches. Later we'll figure out
- // which ones to actually use.
- foundMatches := q.foundMatches[:0]
- var prevMatch absoluteMatch
- i := historyLen
- for i < len(src)-7 {
- delta := q.chain[i]
- if delta == 0 {
- i++
- continue
- }
- candidate := i - int(delta)
- if candidate <= 0 || i-candidate > q.MaxDistance {
- i++
- continue
- }
- var currentMatch absoluteMatch
- if i >= prevMatch.End && prevMatch != (absoluteMatch{}) {
- // Look for a repeat match at i+1.
- prevDistance := prevMatch.Start - prevMatch.Match
- if binary.LittleEndian.Uint32(src[i+1:]) == binary.LittleEndian.Uint32(src[i+1-prevDistance:]) {
- m := extendMatch2(src, i+1, i+1-prevDistance, i+1)
- if m.End-m.Start > q.MinLength {
- currentMatch = m
- foundMatches = append(foundMatches, m)
- }
- }
- }
- if binary.LittleEndian.Uint32(src[candidate:]) == binary.LittleEndian.Uint32(src[i:]) {
- m := extendMatch2(src, i, candidate, max(historyLen, prevMatch.Start))
- if m.End-m.Start > q.MinLength {
- currentMatch = m
- foundMatches = append(foundMatches, m)
- }
- }
- for range q.ChainLength {
- delta := q.chain[candidate]
- if delta == 0 {
- break
- }
- candidate -= int(delta)
- if candidate <= 0 || i-candidate > q.MaxDistance {
- break
- }
- if binary.LittleEndian.Uint32(src[candidate:]) == binary.LittleEndian.Uint32(src[i:]) {
- m := extendMatch2(src, i, candidate, max(historyLen, prevMatch.Start))
- if m.End-m.Start > q.MinLength && m.End-m.Start > currentMatch.End-currentMatch.Start {
- currentMatch = m
- foundMatches = append(foundMatches, m)
- }
- }
- }
- if i < prevMatch.End && currentMatch.End-currentMatch.Start <= prevMatch.End-prevMatch.Start {
- // We were looking for an overlapping match, but we didn't find one longer
- // than the previous match. So we'll go back to sequential search,
- // starting right after the previous match.
- i = prevMatch.End
- continue
- }
- if currentMatch == (absoluteMatch{}) {
- // No match found. Continue with sequential search.
- i++
- continue
- }
- // We've found a match; now look for matches overlapping the end of it.
- prevMatch = currentMatch
- i = currentMatch.End + 2 - q.HashLen
- }
- q.foundMatches = foundMatches
- slices.SortFunc(foundMatches, func(a, b absoluteMatch) int { return a.Start - b.Start })
- matchIndex := 0
- var pending absoluteMatch
- for i := historyLen; i < len(src); i++ {
- var arrivedHere arrival
- if i > historyLen {
- arrivedHere = arrivals[i-historyLen-1]
- }
- unmatched := 0
- if arrivedHere.distance == 0 {
- unmatched = int(arrivedHere.length)
- }
- prevDistance := 0
- if i-unmatched > historyLen {
- prevDistance = int(arrivals[i-historyLen-1-unmatched].distance)
- }
- literalCost := byteCost[src[i]]
- nextArrival := &arrivals[i-historyLen]
- if nextArrival.cost == 0 || arrivedHere.cost+literalCost < nextArrival.cost {
- *nextArrival = arrival{
- cost: arrivedHere.cost + literalCost,
- length: uint32(unmatched + 1),
- }
- }
- for matchIndex < len(foundMatches) && foundMatches[matchIndex].Start == i {
- m := foundMatches[matchIndex]
- matchIndex++
- if m.End > pending.End {
- pending = m
- }
- matchCost := baseMatchCost + float32(bits.Len(uint(unmatched)))
- if m.Start-m.Match != prevDistance {
- matchCost += float32(bits.Len(uint(m.Start - m.Match)))
- }
- for j := m.Start + q.MinLength; j <= m.End; j++ {
- adjustedCost := matchCost
- if j-m.Start < 6 {
- // Matches shorter than 6 are comparatively rare, and therefore
- // have longer codes.
- adjustedCost += float32(6-(j-m.Start)) * 2
- }
- a := &arrivals[j-historyLen-1]
- if a.cost == 0 || arrivedHere.cost+adjustedCost < a.cost {
- *a = arrival{
- length: uint32(j - m.Start),
- distance: uint32(m.Start - m.Match),
- cost: arrivedHere.cost + adjustedCost,
- }
- }
- }
- }
- // If a match from an earlier position extends far enough past the current
- // position, try using the tail of it, starting from here.
- if unmatched == 0 && pending.Start != i && pending.End >= i+q.MinLength &&
- !(arrivedHere.length != 0 && arrivedHere.distance == uint32(pending.Start-pending.Match)) {
- matchCost := baseMatchCost + float32(bits.Len(uint(pending.Start-pending.Match)))
- for j := i + q.MinLength; j <= pending.End; j++ {
- adjustedCost := matchCost
- if j-i < 6 {
- // Matches shorter than 6 are comparatively rare, and therefore
- // have longer codes.
- adjustedCost += float32(6-(j-i)) * 2
- }
- a := &arrivals[j-historyLen-1]
- if a.cost == 0 || arrivedHere.cost+adjustedCost < a.cost {
- *a = arrival{
- length: uint32(j - i),
- distance: uint32(pending.Start - pending.Match),
- cost: arrivedHere.cost + adjustedCost,
- }
- }
- }
- }
- delta := q.chain[i]
- if delta == 0 {
- continue
- }
- candidate := i - int(delta)
- if candidate <= 0 || i-candidate > q.MaxDistance {
- continue
- }
- }
- // We've found the shortest path; now walk it backward and store the matches.
- matches := q.matches[:0]
- i = len(arrivals) - 1
- for i >= 0 {
- a := arrivals[i]
- if a.distance > 0 {
- matches = append(matches, Match{
- Length: int(a.length),
- Distance: int(a.distance),
- })
- i -= int(a.length)
- } else {
- if len(matches) == 0 {
- matches = append(matches, Match{})
- }
- matches[len(matches)-1].Unmatched = int(a.length)
- i -= int(a.length)
- }
- }
- q.matches = matches
- slices.Reverse(matches)
- return append(dst, matches...)
- }
|