bloom.go 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170
  1. // Copyright 2013 The LevelDB-Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package y
  5. import "math"
  6. // Filter is an encoded set of []byte keys.
  7. type Filter []byte
  8. func (f Filter) MayContainKey(k []byte) bool {
  9. return f.MayContain(Hash(k))
  10. }
  11. // MayContain returns whether the filter may contain given key. False positives
  12. // are possible, where it returns true for keys not in the original set.
  13. func (f Filter) MayContain(h uint32) bool {
  14. if len(f) < 2 {
  15. return false
  16. }
  17. k := f[len(f)-1]
  18. if k > 30 {
  19. // This is reserved for potentially new encodings for short Bloom filters.
  20. // Consider it a match.
  21. return true
  22. }
  23. nBits := uint32(8 * (len(f) - 1))
  24. delta := h>>17 | h<<15
  25. for j := uint8(0); j < k; j++ {
  26. bitPos := h % nBits
  27. if f[bitPos/8]&(1<<(bitPos%8)) == 0 {
  28. return false
  29. }
  30. h += delta
  31. }
  32. return true
  33. }
  34. // NewFilter returns a new Bloom filter that encodes a set of []byte keys with
  35. // the given number of bits per key, approximately.
  36. //
  37. // A good bitsPerKey value is 10, which yields a filter with ~ 1% false
  38. // positive rate.
  39. func NewFilter(keys []uint32, bitsPerKey int) Filter {
  40. return Filter(appendFilter(nil, keys, bitsPerKey))
  41. }
  42. // BloomBitsPerKey returns the bits per key required by bloomfilter based on
  43. // the false positive rate.
  44. func BloomBitsPerKey(numEntries int, fp float64) int {
  45. size := -1 * float64(numEntries) * math.Log(fp) / math.Pow(float64(0.69314718056), 2)
  46. locs := math.Ceil(float64(0.69314718056) * size / float64(numEntries))
  47. return int(locs)
  48. }
  49. func appendFilter(buf []byte, keys []uint32, bitsPerKey int) []byte {
  50. if bitsPerKey < 0 {
  51. bitsPerKey = 0
  52. }
  53. // 0.69 is approximately ln(2).
  54. k := uint32(float64(bitsPerKey) * 0.69)
  55. if k < 1 {
  56. k = 1
  57. }
  58. if k > 30 {
  59. k = 30
  60. }
  61. nBits := len(keys) * bitsPerKey
  62. // For small len(keys), we can see a very high false positive rate. Fix it
  63. // by enforcing a minimum bloom filter length.
  64. if nBits < 64 {
  65. nBits = 64
  66. }
  67. nBytes := (nBits + 7) / 8
  68. nBits = nBytes * 8
  69. buf, filter := extend(buf, nBytes+1)
  70. for _, h := range keys {
  71. delta := h>>17 | h<<15
  72. for j := uint32(0); j < k; j++ {
  73. bitPos := h % uint32(nBits)
  74. filter[bitPos/8] |= 1 << (bitPos % 8)
  75. h += delta
  76. }
  77. }
  78. filter[nBytes] = uint8(k)
  79. return buf
  80. }
  81. // extend appends n zero bytes to b. It returns the overall slice (of length
  82. // n+len(originalB)) and the slice of n trailing zeroes.
  83. func extend(b []byte, n int) (overall, trailer []byte) {
  84. want := n + len(b)
  85. if want <= cap(b) {
  86. overall = b[:want]
  87. trailer = overall[len(b):]
  88. for i := range trailer {
  89. trailer[i] = 0
  90. }
  91. } else {
  92. // Grow the capacity exponentially, with a 1KiB minimum.
  93. c := 1024
  94. for c < want {
  95. c += c / 4
  96. }
  97. overall = make([]byte, want, c)
  98. trailer = overall[len(b):]
  99. copy(overall, b)
  100. }
  101. return overall, trailer
  102. }
  103. // hash implements a hashing algorithm similar to the Murmur hash.
  104. func Hash(b []byte) uint32 {
  105. const (
  106. seed = 0xbc9f1d34
  107. m = 0xc6a4a793
  108. )
  109. h := uint32(seed) ^ uint32(len(b))*m
  110. for ; len(b) >= 4; b = b[4:] {
  111. h += uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
  112. h *= m
  113. h ^= h >> 16
  114. }
  115. switch len(b) {
  116. case 3:
  117. h += uint32(b[2]) << 16
  118. fallthrough
  119. case 2:
  120. h += uint32(b[1]) << 8
  121. fallthrough
  122. case 1:
  123. h += uint32(b[0])
  124. h *= m
  125. h ^= h >> 24
  126. }
  127. return h
  128. }
  129. // FilterPolicy implements the db.FilterPolicy interface from the leveldb/db
  130. // package.
  131. //
  132. // The integer value is the approximate number of bits used per key. A good
  133. // value is 10, which yields a filter with ~ 1% false positive rate.
  134. //
  135. // It is valid to use the other API in this package (leveldb/bloom) without
  136. // using this type or the leveldb/db package.
  137. // type FilterPolicy int
  138. // // Name implements the db.FilterPolicy interface.
  139. // func (p FilterPolicy) Name() string {
  140. // // This string looks arbitrary, but its value is written to LevelDB .ldb
  141. // // files, and should be this exact value to be compatible with those files
  142. // // and with the C++ LevelDB code.
  143. // return "leveldb.BuiltinBloomFilter2"
  144. // }
  145. // // AppendFilter implements the db.FilterPolicy interface.
  146. // func (p FilterPolicy) AppendFilter(dst []byte, keys [][]byte) []byte {
  147. // return appendFilter(dst, keys, int(p))
  148. // }
  149. // // MayContain implements the db.FilterPolicy interface.
  150. // func (p FilterPolicy) MayContain(filter, key []byte) bool {
  151. // return Filter(filter).MayContain(key)
  152. // }