manifest.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500
  1. /*
  2. * Copyright 2017 Dgraph Labs, Inc. and Contributors
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. package badger
  17. import (
  18. "bufio"
  19. "bytes"
  20. "encoding/binary"
  21. stderrors "errors"
  22. "fmt"
  23. "hash/crc32"
  24. "io"
  25. "math"
  26. "os"
  27. "path/filepath"
  28. "sync"
  29. "github.com/pkg/errors"
  30. "google.golang.org/protobuf/proto"
  31. "github.com/dgraph-io/badger/v4/options"
  32. "github.com/dgraph-io/badger/v4/pb"
  33. "github.com/dgraph-io/badger/v4/y"
  34. )
  35. // Manifest represents the contents of the MANIFEST file in a Badger store.
  36. //
  37. // The MANIFEST file describes the startup state of the db -- all LSM files and what level they're
  38. // at.
  39. //
  40. // It consists of a sequence of ManifestChangeSet objects. Each of these is treated atomically,
  41. // and contains a sequence of ManifestChange's (file creations/deletions) which we use to
  42. // reconstruct the manifest at startup.
  43. type Manifest struct {
  44. Levels []levelManifest
  45. Tables map[uint64]TableManifest
  46. // Contains total number of creation and deletion changes in the manifest -- used to compute
  47. // whether it'd be useful to rewrite the manifest.
  48. Creations int
  49. Deletions int
  50. }
  51. func createManifest() Manifest {
  52. levels := make([]levelManifest, 0)
  53. return Manifest{
  54. Levels: levels,
  55. Tables: make(map[uint64]TableManifest),
  56. }
  57. }
  58. // levelManifest contains information about LSM tree levels
  59. // in the MANIFEST file.
  60. type levelManifest struct {
  61. Tables map[uint64]struct{} // Set of table id's
  62. }
  63. // TableManifest contains information about a specific table
  64. // in the LSM tree.
  65. type TableManifest struct {
  66. Level uint8
  67. KeyID uint64
  68. Compression options.CompressionType
  69. }
  70. // manifestFile holds the file pointer (and other info) about the manifest file, which is a log
  71. // file we append to.
  72. type manifestFile struct {
  73. fp *os.File
  74. directory string
  75. // The external magic number used by the application running badger.
  76. externalMagic uint16
  77. // We make this configurable so that unit tests can hit rewrite() code quickly
  78. deletionsRewriteThreshold int
  79. // Guards appends, which includes access to the manifest field.
  80. appendLock sync.Mutex
  81. // Used to track the current state of the manifest, used when rewriting.
  82. manifest Manifest
  83. // Used to indicate if badger was opened in InMemory mode.
  84. inMemory bool
  85. }
  86. const (
  87. // ManifestFilename is the filename for the manifest file.
  88. ManifestFilename = "MANIFEST"
  89. manifestRewriteFilename = "MANIFEST-REWRITE"
  90. manifestDeletionsRewriteThreshold = 10000
  91. manifestDeletionsRatio = 10
  92. )
  93. // asChanges returns a sequence of changes that could be used to recreate the Manifest in its
  94. // present state.
  95. func (m *Manifest) asChanges() []*pb.ManifestChange {
  96. changes := make([]*pb.ManifestChange, 0, len(m.Tables))
  97. for id, tm := range m.Tables {
  98. changes = append(changes, newCreateChange(id, int(tm.Level), tm.KeyID, tm.Compression))
  99. }
  100. return changes
  101. }
  102. func (m *Manifest) clone() Manifest {
  103. changeSet := pb.ManifestChangeSet{Changes: m.asChanges()}
  104. ret := createManifest()
  105. y.Check(applyChangeSet(&ret, &changeSet))
  106. return ret
  107. }
  108. // openOrCreateManifestFile opens a Badger manifest file if it exists, or creates one if
  109. // doesn’t exists.
  110. func openOrCreateManifestFile(opt Options) (
  111. ret *manifestFile, result Manifest, err error) {
  112. if opt.InMemory {
  113. return &manifestFile{inMemory: true}, Manifest{}, nil
  114. }
  115. return helpOpenOrCreateManifestFile(opt.Dir, opt.ReadOnly, opt.ExternalMagicVersion,
  116. manifestDeletionsRewriteThreshold)
  117. }
  118. func helpOpenOrCreateManifestFile(dir string, readOnly bool, extMagic uint16,
  119. deletionsThreshold int) (*manifestFile, Manifest, error) {
  120. path := filepath.Join(dir, ManifestFilename)
  121. var flags y.Flags
  122. if readOnly {
  123. flags |= y.ReadOnly
  124. }
  125. fp, err := y.OpenExistingFile(path, flags) // We explicitly sync in addChanges, outside the lock.
  126. if err != nil {
  127. if !os.IsNotExist(err) {
  128. return nil, Manifest{}, err
  129. }
  130. if readOnly {
  131. return nil, Manifest{}, fmt.Errorf("no manifest found, required for read-only db")
  132. }
  133. m := createManifest()
  134. fp, netCreations, err := helpRewrite(dir, &m, extMagic)
  135. if err != nil {
  136. return nil, Manifest{}, err
  137. }
  138. y.AssertTrue(netCreations == 0)
  139. mf := &manifestFile{
  140. fp: fp,
  141. directory: dir,
  142. externalMagic: extMagic,
  143. manifest: m.clone(),
  144. deletionsRewriteThreshold: deletionsThreshold,
  145. }
  146. return mf, m, nil
  147. }
  148. manifest, truncOffset, err := ReplayManifestFile(fp, extMagic)
  149. if err != nil {
  150. _ = fp.Close()
  151. return nil, Manifest{}, err
  152. }
  153. if !readOnly {
  154. // Truncate file so we don't have a half-written entry at the end.
  155. if err := fp.Truncate(truncOffset); err != nil {
  156. _ = fp.Close()
  157. return nil, Manifest{}, err
  158. }
  159. }
  160. if _, err = fp.Seek(0, io.SeekEnd); err != nil {
  161. _ = fp.Close()
  162. return nil, Manifest{}, err
  163. }
  164. mf := &manifestFile{
  165. fp: fp,
  166. directory: dir,
  167. externalMagic: extMagic,
  168. manifest: manifest.clone(),
  169. deletionsRewriteThreshold: deletionsThreshold,
  170. }
  171. return mf, manifest, nil
  172. }
  173. func (mf *manifestFile) close() error {
  174. if mf.inMemory {
  175. return nil
  176. }
  177. return mf.fp.Close()
  178. }
  179. // addChanges writes a batch of changes, atomically, to the file. By "atomically" that means when
  180. // we replay the MANIFEST file, we'll either replay all the changes or none of them. (The truth of
  181. // this depends on the filesystem -- some might append garbage data if a system crash happens at
  182. // the wrong time.)
  183. func (mf *manifestFile) addChanges(changesParam []*pb.ManifestChange) error {
  184. if mf.inMemory {
  185. return nil
  186. }
  187. changes := pb.ManifestChangeSet{Changes: changesParam}
  188. buf, err := proto.Marshal(&changes)
  189. if err != nil {
  190. return err
  191. }
  192. // Maybe we could use O_APPEND instead (on certain file systems)
  193. mf.appendLock.Lock()
  194. defer mf.appendLock.Unlock()
  195. if err := applyChangeSet(&mf.manifest, &changes); err != nil {
  196. return err
  197. }
  198. // Rewrite manifest if it'd shrink by 1/10 and it's big enough to care
  199. if mf.manifest.Deletions > mf.deletionsRewriteThreshold &&
  200. mf.manifest.Deletions > manifestDeletionsRatio*(mf.manifest.Creations-mf.manifest.Deletions) {
  201. if err := mf.rewrite(); err != nil {
  202. return err
  203. }
  204. } else {
  205. var lenCrcBuf [8]byte
  206. binary.BigEndian.PutUint32(lenCrcBuf[0:4], uint32(len(buf)))
  207. binary.BigEndian.PutUint32(lenCrcBuf[4:8], crc32.Checksum(buf, y.CastagnoliCrcTable))
  208. buf = append(lenCrcBuf[:], buf...)
  209. if _, err := mf.fp.Write(buf); err != nil {
  210. return err
  211. }
  212. }
  213. return syncFunc(mf.fp)
  214. }
  215. // this function is saved here to allow injection of fake filesystem latency at test time.
  216. var syncFunc = func(f *os.File) error { return f.Sync() }
  217. // Has to be 4 bytes. The value can never change, ever, anyway.
  218. var magicText = [4]byte{'B', 'd', 'g', 'r'}
  219. // The magic version number. It is allocated 2 bytes, so it's value must be <= math.MaxUint16
  220. const badgerMagicVersion = 8
  221. func helpRewrite(dir string, m *Manifest, extMagic uint16) (*os.File, int, error) {
  222. rewritePath := filepath.Join(dir, manifestRewriteFilename)
  223. // We explicitly sync.
  224. fp, err := y.OpenTruncFile(rewritePath, false)
  225. if err != nil {
  226. return nil, 0, err
  227. }
  228. // magic bytes are structured as
  229. // +---------------------+-------------------------+-----------------------+
  230. // | magicText (4 bytes) | externalMagic (2 bytes) | badgerMagic (2 bytes) |
  231. // +---------------------+-------------------------+-----------------------+
  232. y.AssertTrue(badgerMagicVersion <= math.MaxUint16)
  233. buf := make([]byte, 8)
  234. copy(buf[0:4], magicText[:])
  235. binary.BigEndian.PutUint16(buf[4:6], extMagic)
  236. binary.BigEndian.PutUint16(buf[6:8], badgerMagicVersion)
  237. netCreations := len(m.Tables)
  238. changes := m.asChanges()
  239. set := pb.ManifestChangeSet{Changes: changes}
  240. changeBuf, err := proto.Marshal(&set)
  241. if err != nil {
  242. fp.Close()
  243. return nil, 0, err
  244. }
  245. var lenCrcBuf [8]byte
  246. binary.BigEndian.PutUint32(lenCrcBuf[0:4], uint32(len(changeBuf)))
  247. binary.BigEndian.PutUint32(lenCrcBuf[4:8], crc32.Checksum(changeBuf, y.CastagnoliCrcTable))
  248. buf = append(buf, lenCrcBuf[:]...)
  249. buf = append(buf, changeBuf...)
  250. if _, err := fp.Write(buf); err != nil {
  251. fp.Close()
  252. return nil, 0, err
  253. }
  254. if err := fp.Sync(); err != nil {
  255. fp.Close()
  256. return nil, 0, err
  257. }
  258. // In Windows the files should be closed before doing a Rename.
  259. if err = fp.Close(); err != nil {
  260. return nil, 0, err
  261. }
  262. manifestPath := filepath.Join(dir, ManifestFilename)
  263. if err := os.Rename(rewritePath, manifestPath); err != nil {
  264. return nil, 0, err
  265. }
  266. fp, err = y.OpenExistingFile(manifestPath, 0)
  267. if err != nil {
  268. return nil, 0, err
  269. }
  270. if _, err := fp.Seek(0, io.SeekEnd); err != nil {
  271. fp.Close()
  272. return nil, 0, err
  273. }
  274. if err := syncDir(dir); err != nil {
  275. fp.Close()
  276. return nil, 0, err
  277. }
  278. return fp, netCreations, nil
  279. }
  280. // Must be called while appendLock is held.
  281. func (mf *manifestFile) rewrite() error {
  282. // In Windows the files should be closed before doing a Rename.
  283. if err := mf.fp.Close(); err != nil {
  284. return err
  285. }
  286. fp, netCreations, err := helpRewrite(mf.directory, &mf.manifest, mf.externalMagic)
  287. if err != nil {
  288. return err
  289. }
  290. mf.fp = fp
  291. mf.manifest.Creations = netCreations
  292. mf.manifest.Deletions = 0
  293. return nil
  294. }
  295. type countingReader struct {
  296. wrapped *bufio.Reader
  297. count int64
  298. }
  299. func (r *countingReader) Read(p []byte) (n int, err error) {
  300. n, err = r.wrapped.Read(p)
  301. r.count += int64(n)
  302. return
  303. }
  304. func (r *countingReader) ReadByte() (b byte, err error) {
  305. b, err = r.wrapped.ReadByte()
  306. if err == nil {
  307. r.count++
  308. }
  309. return
  310. }
  311. var (
  312. errBadMagic = stderrors.New("manifest has bad magic")
  313. errBadChecksum = stderrors.New("manifest has checksum mismatch")
  314. )
  315. // ReplayManifestFile reads the manifest file and constructs two manifest objects. (We need one
  316. // immutable copy and one mutable copy of the manifest. Easiest way is to construct two of them.)
  317. // Also, returns the last offset after a completely read manifest entry -- the file must be
  318. // truncated at that point before further appends are made (if there is a partial entry after
  319. // that). In normal conditions, truncOffset is the file size.
  320. func ReplayManifestFile(fp *os.File, extMagic uint16) (Manifest, int64, error) {
  321. r := countingReader{wrapped: bufio.NewReader(fp)}
  322. var magicBuf [8]byte
  323. if _, err := io.ReadFull(&r, magicBuf[:]); err != nil {
  324. return Manifest{}, 0, errBadMagic
  325. }
  326. if !bytes.Equal(magicBuf[0:4], magicText[:]) {
  327. return Manifest{}, 0, errBadMagic
  328. }
  329. extVersion := y.BytesToU16(magicBuf[4:6])
  330. version := y.BytesToU16(magicBuf[6:8])
  331. if version != badgerMagicVersion {
  332. return Manifest{}, 0,
  333. //nolint:lll
  334. fmt.Errorf("manifest has unsupported version: %d (we support %d).\n"+
  335. "Please see https://dgraph.io/docs/badger/faq/#i-see-manifest-has-unsupported-version-x-we-support-y-error"+
  336. " on how to fix this.",
  337. version, badgerMagicVersion)
  338. }
  339. if extVersion != extMagic {
  340. return Manifest{}, 0,
  341. fmt.Errorf("Cannot open DB because the external magic number doesn't match. "+
  342. "Expected: %d, version present in manifest: %d\n", extMagic, extVersion)
  343. }
  344. stat, err := fp.Stat()
  345. if err != nil {
  346. return Manifest{}, 0, err
  347. }
  348. build := createManifest()
  349. var offset int64
  350. for {
  351. offset = r.count
  352. var lenCrcBuf [8]byte
  353. _, err := io.ReadFull(&r, lenCrcBuf[:])
  354. if err != nil {
  355. if err == io.EOF || err == io.ErrUnexpectedEOF {
  356. break
  357. }
  358. return Manifest{}, 0, err
  359. }
  360. length := y.BytesToU32(lenCrcBuf[0:4])
  361. // Sanity check to ensure we don't over-allocate memory.
  362. if length > uint32(stat.Size()) {
  363. return Manifest{}, 0, errors.Errorf(
  364. "Buffer length: %d greater than file size: %d. Manifest file might be corrupted",
  365. length, stat.Size())
  366. }
  367. var buf = make([]byte, length)
  368. if _, err := io.ReadFull(&r, buf); err != nil {
  369. if err == io.EOF || err == io.ErrUnexpectedEOF {
  370. break
  371. }
  372. return Manifest{}, 0, err
  373. }
  374. if crc32.Checksum(buf, y.CastagnoliCrcTable) != y.BytesToU32(lenCrcBuf[4:8]) {
  375. return Manifest{}, 0, errBadChecksum
  376. }
  377. var changeSet pb.ManifestChangeSet
  378. if err := proto.Unmarshal(buf, &changeSet); err != nil {
  379. return Manifest{}, 0, err
  380. }
  381. if err := applyChangeSet(&build, &changeSet); err != nil {
  382. return Manifest{}, 0, err
  383. }
  384. }
  385. return build, offset, nil
  386. }
  387. func applyManifestChange(build *Manifest, tc *pb.ManifestChange) error {
  388. switch tc.Op {
  389. case pb.ManifestChange_CREATE:
  390. if _, ok := build.Tables[tc.Id]; ok {
  391. return fmt.Errorf("MANIFEST invalid, table %d exists", tc.Id)
  392. }
  393. build.Tables[tc.Id] = TableManifest{
  394. Level: uint8(tc.Level),
  395. KeyID: tc.KeyId,
  396. Compression: options.CompressionType(tc.Compression),
  397. }
  398. for len(build.Levels) <= int(tc.Level) {
  399. build.Levels = append(build.Levels, levelManifest{make(map[uint64]struct{})})
  400. }
  401. build.Levels[tc.Level].Tables[tc.Id] = struct{}{}
  402. build.Creations++
  403. case pb.ManifestChange_DELETE:
  404. tm, ok := build.Tables[tc.Id]
  405. if !ok {
  406. return fmt.Errorf("MANIFEST removes non-existing table %d", tc.Id)
  407. }
  408. delete(build.Levels[tm.Level].Tables, tc.Id)
  409. delete(build.Tables, tc.Id)
  410. build.Deletions++
  411. default:
  412. return fmt.Errorf("MANIFEST file has invalid manifestChange op")
  413. }
  414. return nil
  415. }
  416. // This is not a "recoverable" error -- opening the KV store fails because the MANIFEST file is
  417. // just plain broken.
  418. func applyChangeSet(build *Manifest, changeSet *pb.ManifestChangeSet) error {
  419. for _, change := range changeSet.Changes {
  420. if err := applyManifestChange(build, change); err != nil {
  421. return err
  422. }
  423. }
  424. return nil
  425. }
  426. func newCreateChange(
  427. id uint64, level int, keyID uint64, c options.CompressionType) *pb.ManifestChange {
  428. return &pb.ManifestChange{
  429. Id: id,
  430. Op: pb.ManifestChange_CREATE,
  431. Level: uint32(level),
  432. KeyId: keyID,
  433. // Hard coding it, since we're supporting only AES for now.
  434. EncryptionAlgo: pb.EncryptionAlgo_aes,
  435. Compression: uint32(c),
  436. }
  437. }
  438. func newDeleteChange(id uint64) *pb.ManifestChange {
  439. return &pb.ManifestChange{
  440. Id: id,
  441. Op: pb.ManifestChange_DELETE,
  442. }
  443. }