decompress_amd64.go 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223
  1. //go:build amd64 && !appengine && !noasm && gc
  2. // +build amd64,!appengine,!noasm,gc
  3. // This file contains the specialisation of Decoder.Decompress4X
  4. // and Decoder.Decompress1X that use an asm implementation of their main loops.
  5. package huff0
  6. import (
  7. "errors"
  8. "fmt"
  9. "github.com/klauspost/compress/internal/cpuinfo"
  10. )
// decompress4x_main_loop_amd64 is the main decoding loop of Decompress4X,
// declared here without a body because it is implemented outside this Go file
// (in assembly, per the file header). Decompress4X selects it when
// actualTableLog > 8.
//
// The loop takes its inputs from ctx; Decompress4X reads ctx.decoded back
// after the call to find out how much output was produced.
//
//go:noescape
func decompress4x_main_loop_amd64(ctx *decompress4xContext)
// decompress4x_8b_main_loop_amd64 is the main decoding loop of Decompress4X
// for tablelog <= 8, which decodes 4 entries per loop. It is declared here
// without a body because it is implemented outside this Go file (in assembly,
// per the file header).
//
// The loop takes its inputs from ctx; Decompress4X reads ctx.decoded back
// after the call to find out how much output was produced.
//
//go:noescape
func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
// fallback8BitSize is the size where using Go version is faster.
// Decompress4X compares it against cap(dst): when the table log is <= 8 and
// the destination capacity is below this value, decoding is delegated to
// decompress4X8bit instead of the assembly loop.
const fallback8BitSize = 800
// decompress4xContext bundles the arguments passed to the 4X assembly main
// loops and carries their result back in decoded.
//
// NOTE(review): the assembly presumably addresses these fields by offset, so
// field order and types should not be changed without updating it — confirm
// against the .s file.
type decompress4xContext struct {
	// pbr points at the four bit readers, one per encoded stream.
	pbr *[4]bitReaderShifted
	// peekBits is set by Decompress4X to (64 - actualTableLog) & 63;
	// see bitReaderShifted.peekBitsFast().
	peekBits uint8
	// out points at the first byte of the destination buffer.
	out *byte
	// dstEvery is the distance in bytes between the output regions of
	// consecutive streams, computed as (dstSize + 3) / 4.
	dstEvery int
	// tbl points at the first entry of the decoding table.
	tbl *dEntrySingle
	// decoded is read back by Decompress4X after the loop returns and is
	// treated as the total number of bytes produced, decoded/4 per stream.
	decoded int
	// limit is set to &out[dstEvery-4]: always stop decoding when the first
	// buffer gets here to avoid writing out of bounds on the last.
	limit *byte
}
  33. // Decompress4X will decompress a 4X encoded stream.
  34. // The length of the supplied input must match the end of a block exactly.
  35. // The *capacity* of the dst slice must match the destination size of
  36. // the uncompressed data exactly.
  37. func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
  38. if len(d.dt.single) == 0 {
  39. return nil, errors.New("no table loaded")
  40. }
  41. if len(src) < 6+(4*1) {
  42. return nil, errors.New("input too small")
  43. }
  44. use8BitTables := d.actualTableLog <= 8
  45. if cap(dst) < fallback8BitSize && use8BitTables {
  46. return d.decompress4X8bit(dst, src)
  47. }
  48. var br [4]bitReaderShifted
  49. // Decode "jump table"
  50. start := 6
  51. for i := range 3 {
  52. length := int(src[i*2]) | (int(src[i*2+1]) << 8)
  53. if start+length >= len(src) {
  54. return nil, errors.New("truncated input (or invalid offset)")
  55. }
  56. err := br[i].init(src[start : start+length])
  57. if err != nil {
  58. return nil, err
  59. }
  60. start += length
  61. }
  62. err := br[3].init(src[start:])
  63. if err != nil {
  64. return nil, err
  65. }
  66. // destination, offset to match first output
  67. dstSize := cap(dst)
  68. dst = dst[:dstSize]
  69. out := dst
  70. dstEvery := (dstSize + 3) / 4
  71. const tlSize = 1 << tableLogMax
  72. const tlMask = tlSize - 1
  73. single := d.dt.single[:tlSize]
  74. var decoded int
  75. if len(out) > 4*4 && !(br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4) {
  76. ctx := decompress4xContext{
  77. pbr: &br,
  78. peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
  79. out: &out[0],
  80. dstEvery: dstEvery,
  81. tbl: &single[0],
  82. limit: &out[dstEvery-4], // Always stop decoding when first buffer gets here to avoid writing OOB on last.
  83. }
  84. if use8BitTables {
  85. decompress4x_8b_main_loop_amd64(&ctx)
  86. } else {
  87. decompress4x_main_loop_amd64(&ctx)
  88. }
  89. decoded = ctx.decoded
  90. out = out[decoded/4:]
  91. }
  92. // Decode remaining.
  93. remainBytes := dstEvery - (decoded / 4)
  94. for i := range br {
  95. offset := dstEvery * i
  96. endsAt := min(offset+remainBytes, len(out))
  97. br := &br[i]
  98. bitsLeft := br.remaining()
  99. for bitsLeft > 0 {
  100. br.fill()
  101. if offset >= endsAt {
  102. return nil, errors.New("corruption detected: stream overrun 4")
  103. }
  104. // Read value and increment offset.
  105. val := br.peekBitsFast(d.actualTableLog)
  106. v := single[val&tlMask].entry
  107. nBits := uint8(v)
  108. br.advance(nBits)
  109. bitsLeft -= uint(nBits)
  110. out[offset] = uint8(v >> 8)
  111. offset++
  112. }
  113. if offset != endsAt {
  114. return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
  115. }
  116. decoded += offset - dstEvery*i
  117. err = br.close()
  118. if err != nil {
  119. return nil, err
  120. }
  121. }
  122. if dstSize != decoded {
  123. return nil, errors.New("corruption detected: short output block")
  124. }
  125. return dst, nil
  126. }
// decompress1x_main_loop_amd64 is the main decoding loop of Decompress1X,
// declared here without a body because it is implemented outside this Go file
// (in assembly, per the file header). Decompress1X calls it when
// cpuinfo.HasBMI2() reports false, regardless of the table log.
//
// The result is returned through ctx.decoded, which Decompress1X compares
// against error_max_decoded_size_exeeded before using it as the output length.
//
//go:noescape
func decompress1x_main_loop_amd64(ctx *decompress1xContext)
// decompress1x_main_loop_bmi2 is the BMI2 variant of the main decoding loop of
// Decompress1X, declared here without a body because it is implemented
// outside this Go file (in assembly, per the file header). Decompress1X calls
// it when cpuinfo.HasBMI2() reports true, regardless of the table log.
//
// The result is returned through ctx.decoded, which Decompress1X compares
// against error_max_decoded_size_exeeded before using it as the output length.
//
//go:noescape
func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
// decompress1xContext bundles the arguments passed to the 1X assembly main
// loops and carries their result back in decoded.
//
// NOTE(review): the assembly presumably addresses these fields by offset, so
// field order and types should not be changed without updating it — confirm
// against the .s file.
type decompress1xContext struct {
	// pbr points at the bit reader for the single encoded stream.
	pbr *bitReaderShifted
	// peekBits is set by Decompress1X to (64 - actualTableLog) & 63;
	// see bitReaderShifted.peekBitsFast().
	peekBits uint8
	// out points at the first byte of the destination buffer.
	out *byte
	// outCap is the capacity of the destination buffer, i.e. the maximum
	// decompressed size.
	outCap int
	// tbl points at the first entry of the decoding table.
	tbl *dEntrySingle
	// decoded is read back by Decompress1X after the loop returns: either
	// error_max_decoded_size_exeeded, or the length the output is truncated to.
	decoded int
}
// error_max_decoded_size_exeeded is the error value reported by the asm
// implementations through decompress1xContext.decoded. Decompress1X
// translates it into ErrMaxDecodedSizeExceeded.
//
// NOTE(review): the identifier (including its spelling) is presumably also
// referenced from the assembly source — confirm before renaming.
const error_max_decoded_size_exeeded = -1
  147. // Decompress1X will decompress a 1X encoded stream.
  148. // The cap of the output buffer will be the maximum decompressed size.
  149. // The length of the supplied input must match the end of a block exactly.
  150. func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
  151. if len(d.dt.single) == 0 {
  152. return nil, errors.New("no table loaded")
  153. }
  154. var br bitReaderShifted
  155. err := br.init(src)
  156. if err != nil {
  157. return dst, err
  158. }
  159. maxDecodedSize := cap(dst)
  160. dst = dst[:maxDecodedSize]
  161. const tlSize = 1 << tableLogMax
  162. const tlMask = tlSize - 1
  163. if maxDecodedSize >= 4 {
  164. ctx := decompress1xContext{
  165. pbr: &br,
  166. out: &dst[0],
  167. outCap: maxDecodedSize,
  168. peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
  169. tbl: &d.dt.single[0],
  170. }
  171. if cpuinfo.HasBMI2() {
  172. decompress1x_main_loop_bmi2(&ctx)
  173. } else {
  174. decompress1x_main_loop_amd64(&ctx)
  175. }
  176. if ctx.decoded == error_max_decoded_size_exeeded {
  177. return nil, ErrMaxDecodedSizeExceeded
  178. }
  179. dst = dst[:ctx.decoded]
  180. }
  181. // br < 8, so uint8 is fine
  182. bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
  183. for bitsLeft > 0 {
  184. br.fill()
  185. if len(dst) >= maxDecodedSize {
  186. br.close()
  187. return nil, ErrMaxDecodedSizeExceeded
  188. }
  189. v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
  190. nBits := uint8(v.entry)
  191. br.advance(nBits)
  192. bitsLeft -= nBits
  193. dst = append(dst, uint8(v.entry>>8))
  194. }
  195. return dst, br.close()
  196. }