stateless.go 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325
  1. package flate
  2. import (
  3. "io"
  4. "math"
  5. "sync"
  6. "github.com/klauspost/compress/internal/le"
  7. )
  8. const (
  9. maxStatelessBlock = math.MaxInt16
  10. // dictionary will be taken from maxStatelessBlock, so limit it.
  11. maxStatelessDict = 8 << 10
  12. slTableBits = 13
  13. slTableSize = 1 << slTableBits
  14. slTableShift = 32 - slTableBits
  15. )
  16. type statelessWriter struct {
  17. dst io.Writer
  18. closed bool
  19. }
  20. func (s *statelessWriter) Close() error {
  21. if s.closed {
  22. return nil
  23. }
  24. s.closed = true
  25. // Emit EOF block
  26. return StatelessDeflate(s.dst, nil, true, nil)
  27. }
  28. func (s *statelessWriter) Write(p []byte) (n int, err error) {
  29. err = StatelessDeflate(s.dst, p, false, nil)
  30. if err != nil {
  31. return 0, err
  32. }
  33. return len(p), nil
  34. }
  35. func (s *statelessWriter) Reset(w io.Writer) {
  36. s.dst = w
  37. s.closed = false
  38. }
  39. // NewStatelessWriter will do compression but without maintaining any state
  40. // between Write calls.
  41. // There will be no memory kept between Write calls,
  42. // but compression and speed will be suboptimal.
  43. // Because of this, the size of actual Write calls will affect output size.
  44. func NewStatelessWriter(dst io.Writer) io.WriteCloser {
  45. return &statelessWriter{dst: dst}
  46. }
  47. // bitWriterPool contains bit writers that can be reused.
  48. var bitWriterPool = sync.Pool{
  49. New: func() any {
  50. return newHuffmanBitWriter(nil)
  51. },
  52. }
  53. // tokensPool contains tokens struct objects that can be reused
  54. var tokensPool = sync.Pool{
  55. New: func() any {
  56. return &tokens{}
  57. },
  58. }
  59. // StatelessDeflate allows compressing directly to a Writer without retaining state.
  60. // When returning everything will be flushed.
  61. // Up to 8KB of an optional dictionary can be given which is presumed to precede the block.
  62. // Longer dictionaries will be truncated and will still produce valid output.
  63. // Sending nil dictionary is perfectly fine.
  64. func StatelessDeflate(out io.Writer, in []byte, eof bool, dict []byte) error {
  65. bw := bitWriterPool.Get().(*huffmanBitWriter)
  66. bw.reset(out)
  67. defer func() {
  68. // don't keep a reference to our output
  69. bw.reset(nil)
  70. bitWriterPool.Put(bw)
  71. }()
  72. if eof && len(in) == 0 {
  73. // Just write an EOF block.
  74. // Could be faster...
  75. bw.writeStoredHeader(0, true)
  76. bw.flush()
  77. return bw.err
  78. }
  79. // Truncate dict
  80. if len(dict) > maxStatelessDict {
  81. dict = dict[len(dict)-maxStatelessDict:]
  82. }
  83. // For subsequent loops, keep shallow dict reference to avoid alloc+copy.
  84. var inDict []byte
  85. dst := tokensPool.Get().(*tokens)
  86. dst.Reset()
  87. defer func() {
  88. tokensPool.Put(dst)
  89. }()
  90. for len(in) > 0 {
  91. todo := in
  92. if len(inDict) > 0 {
  93. if len(todo) > maxStatelessBlock-maxStatelessDict {
  94. todo = todo[:maxStatelessBlock-maxStatelessDict]
  95. }
  96. } else if len(todo) > maxStatelessBlock-len(dict) {
  97. todo = todo[:maxStatelessBlock-len(dict)]
  98. }
  99. inOrg := in
  100. in = in[len(todo):]
  101. uncompressed := todo
  102. if len(dict) > 0 {
  103. // combine dict and source
  104. bufLen := len(todo) + len(dict)
  105. combined := make([]byte, bufLen)
  106. copy(combined, dict)
  107. copy(combined[len(dict):], todo)
  108. todo = combined
  109. }
  110. // Compress
  111. if len(inDict) == 0 {
  112. statelessEnc(dst, todo, int16(len(dict)))
  113. } else {
  114. statelessEnc(dst, inDict[:maxStatelessDict+len(todo)], maxStatelessDict)
  115. }
  116. isEof := eof && len(in) == 0
  117. if dst.n == 0 {
  118. bw.writeStoredHeader(len(uncompressed), isEof)
  119. if bw.err != nil {
  120. return bw.err
  121. }
  122. bw.writeBytes(uncompressed)
  123. } else if int(dst.n) > len(uncompressed)-len(uncompressed)>>4 {
  124. // If we removed less than 1/16th, huffman compress the block.
  125. bw.writeBlockHuff(isEof, uncompressed, len(in) == 0)
  126. } else {
  127. bw.writeBlockDynamic(dst, isEof, uncompressed, len(in) == 0)
  128. }
  129. if len(in) > 0 {
  130. // Retain a dict if we have more
  131. inDict = inOrg[len(uncompressed)-maxStatelessDict:]
  132. dict = nil
  133. dst.Reset()
  134. }
  135. if bw.err != nil {
  136. return bw.err
  137. }
  138. }
  139. if !eof {
  140. // Align, only a stored block can do that.
  141. bw.writeStoredHeader(0, false)
  142. }
  143. bw.flush()
  144. return bw.err
  145. }
  146. func hashSL(u uint32) uint32 {
  147. return (u * 0x1e35a7bd) >> slTableShift
  148. }
  149. func load3216(b []byte, i int16) uint32 {
  150. return le.Load32(b, i)
  151. }
  152. func load6416(b []byte, i int16) uint64 {
  153. return le.Load64(b, i)
  154. }
  155. func statelessEnc(dst *tokens, src []byte, startAt int16) {
  156. const (
  157. inputMargin = 12 - 1
  158. minNonLiteralBlockSize = 1 + 1 + inputMargin
  159. )
  160. type tableEntry struct {
  161. offset int16
  162. }
  163. var table [slTableSize]tableEntry
  164. // This check isn't in the Snappy implementation, but there, the caller
  165. // instead of the callee handles this case.
  166. if len(src)-int(startAt) < minNonLiteralBlockSize {
  167. // We do not fill the token table.
  168. // This will be picked up by caller.
  169. dst.n = 0
  170. return
  171. }
  172. // Index until startAt
  173. if startAt > 0 {
  174. cv := load3232(src, 0)
  175. for i := range startAt {
  176. table[hashSL(cv)] = tableEntry{offset: i}
  177. cv = (cv >> 8) | (uint32(src[i+4]) << 24)
  178. }
  179. }
  180. s := startAt + 1
  181. nextEmit := startAt
  182. // sLimit is when to stop looking for offset/length copies. The inputMargin
  183. // lets us use a fast path for emitLiteral in the main loop, while we are
  184. // looking for copies.
  185. sLimit := int16(len(src) - inputMargin)
  186. // nextEmit is where in src the next emitLiteral should start from.
  187. cv := load3216(src, s)
  188. for {
  189. const skipLog = 5
  190. const doEvery = 2
  191. nextS := s
  192. var candidate tableEntry
  193. for {
  194. nextHash := hashSL(cv)
  195. candidate = table[nextHash]
  196. nextS = s + doEvery + (s-nextEmit)>>skipLog
  197. if nextS > sLimit || nextS <= 0 {
  198. goto emitRemainder
  199. }
  200. now := load6416(src, nextS)
  201. table[nextHash] = tableEntry{offset: s}
  202. nextHash = hashSL(uint32(now))
  203. if cv == load3216(src, candidate.offset) {
  204. table[nextHash] = tableEntry{offset: nextS}
  205. break
  206. }
  207. // Do one right away...
  208. cv = uint32(now)
  209. s = nextS
  210. nextS++
  211. candidate = table[nextHash]
  212. now >>= 8
  213. table[nextHash] = tableEntry{offset: s}
  214. if cv == load3216(src, candidate.offset) {
  215. table[nextHash] = tableEntry{offset: nextS}
  216. break
  217. }
  218. cv = uint32(now)
  219. s = nextS
  220. }
  221. // A 4-byte match has been found. We'll later see if more than 4 bytes
  222. // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
  223. // them as literal bytes.
  224. for {
  225. // Invariant: we have a 4-byte match at s, and no need to emit any
  226. // literal bytes prior to s.
  227. // Extend the 4-byte match as long as possible.
  228. t := candidate.offset
  229. l := int16(matchLen(src[s+4:], src[t+4:]) + 4)
  230. // Extend backwards
  231. for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
  232. s--
  233. t--
  234. l++
  235. }
  236. if nextEmit < s {
  237. if false {
  238. emitLiteral(dst, src[nextEmit:s])
  239. } else {
  240. for _, v := range src[nextEmit:s] {
  241. dst.tokens[dst.n] = token(v)
  242. dst.litHist[v]++
  243. dst.n++
  244. }
  245. }
  246. }
  247. // Save the match found
  248. dst.AddMatchLong(int32(l), uint32(s-t-baseMatchOffset))
  249. s += l
  250. nextEmit = s
  251. if nextS >= s {
  252. s = nextS + 1
  253. }
  254. if s >= sLimit {
  255. goto emitRemainder
  256. }
  257. // We could immediately start working at s now, but to improve
  258. // compression we first update the hash table at s-2 and at s. If
  259. // another emitCopy is not our next move, also calculate nextHash
  260. // at s+1. At least on GOARCH=amd64, these three hash calculations
  261. // are faster as one load64 call (with some shifts) instead of
  262. // three load32 calls.
  263. x := load6416(src, s-2)
  264. o := s - 2
  265. prevHash := hashSL(uint32(x))
  266. table[prevHash] = tableEntry{offset: o}
  267. x >>= 16
  268. currHash := hashSL(uint32(x))
  269. candidate = table[currHash]
  270. table[currHash] = tableEntry{offset: o + 2}
  271. if uint32(x) != load3216(src, candidate.offset) {
  272. cv = uint32(x >> 8)
  273. s++
  274. break
  275. }
  276. }
  277. }
  278. emitRemainder:
  279. if int(nextEmit) < len(src) {
  280. // If nothing was added, don't encode literals.
  281. if dst.n == 0 {
  282. return
  283. }
  284. emitLiteral(dst, src[nextEmit:])
  285. }
  286. }