  1. # Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors
  2. #
  3. # This module is part of GitDB and is released under
  4. # the New BSD License: http://www.opensource.org/licenses/bsd-license.php
  5. """Module containing a database to deal with packs"""
  6. from gitdb.db.base import (
  7. FileDBBase,
  8. ObjectDBR,
  9. CachingDB
  10. )
  11. from gitdb.util import LazyMixin
  12. from gitdb.exc import (
  13. BadObject,
  14. UnsupportedOperation,
  15. AmbiguousObjectName
  16. )
  17. from gitdb.pack import PackEntity
  18. from functools import reduce
  19. import os
  20. import glob
  21. __all__ = ('PackedDB', )
  22. #{ Utilities
  23. class PackedDB(FileDBBase, ObjectDBR, CachingDB, LazyMixin):
  24. """A database operating on a set of object packs"""
  25. # sort the priority list every N queries
  26. # Higher values are better, performance tests don't show this has
  27. # any effect, but it should have one
  28. _sort_interval = 500
  29. def __init__(self, root_path):
  30. super().__init__(root_path)
  31. # list of lists with three items:
  32. # * hits - number of times the pack was hit with a request
  33. # * entity - Pack entity instance
  34. # * sha_to_index - PackIndexFile.sha_to_index method for direct cache query
  35. # self._entities = list() # lazy loaded list
  36. self._hit_count = 0 # amount of hits
  37. self._st_mtime = 0 # last modification data of our root path
  38. def _set_cache_(self, attr):
  39. if attr == '_entities':
  40. self._entities = list()
  41. self.update_cache(force=True)
  42. # END handle entities initialization
  43. def _sort_entities(self):
  44. self._entities.sort(key=lambda l: l[0], reverse=True)
  45. def _pack_info(self, sha):
  46. """:return: tuple(entity, index) for an item at the given sha
  47. :param sha: 20 or 40 byte sha
  48. :raise BadObject:
  49. **Note:** This method is not thread-safe, but may be hit in multi-threaded
  50. operation. The worst thing that can happen though is a counter that
  51. was not incremented, or the list being in wrong order. So we safe
  52. the time for locking here, lets see how that goes"""
  53. # presort ?
  54. if self._hit_count % self._sort_interval == 0:
  55. self._sort_entities()
  56. # END update sorting
  57. for item in self._entities:
  58. index = item[2](sha)
  59. if index is not None:
  60. item[0] += 1 # one hit for you
  61. self._hit_count += 1 # general hit count
  62. return (item[1], index)
  63. # END index found in pack
  64. # END for each item
  65. # no hit, see whether we have to update packs
  66. # NOTE: considering packs don't change very often, we safe this call
  67. # and leave it to the super-caller to trigger that
  68. raise BadObject(sha)
  69. #{ Object DB Read
  70. def has_object(self, sha):
  71. try:
  72. self._pack_info(sha)
  73. return True
  74. except BadObject:
  75. return False
  76. # END exception handling
  77. def info(self, sha):
  78. entity, index = self._pack_info(sha)
  79. return entity.info_at_index(index)
  80. def stream(self, sha):
  81. entity, index = self._pack_info(sha)
  82. return entity.stream_at_index(index)
  83. def sha_iter(self):
  84. for entity in self.entities():
  85. index = entity.index()
  86. sha_by_index = index.sha
  87. for index in range(index.size()):
  88. yield sha_by_index(index)
  89. # END for each index
  90. # END for each entity
  91. def size(self):
  92. sizes = [item[1].index().size() for item in self._entities]
  93. return reduce(lambda x, y: x + y, sizes, 0)
  94. #} END object db read
  95. #{ object db write
  96. def store(self, istream):
  97. """Storing individual objects is not feasible as a pack is designed to
  98. hold multiple objects. Writing or rewriting packs for single objects is
  99. inefficient"""
  100. raise UnsupportedOperation()
  101. #} END object db write
  102. #{ Interface
  103. def update_cache(self, force=False):
  104. """
  105. Update our cache with the actually existing packs on disk. Add new ones,
  106. and remove deleted ones. We keep the unchanged ones
  107. :param force: If True, the cache will be updated even though the directory
  108. does not appear to have changed according to its modification timestamp.
  109. :return: True if the packs have been updated so there is new information,
  110. False if there was no change to the pack database"""
  111. stat = os.stat(self.root_path())
  112. if not force and stat.st_mtime <= self._st_mtime:
  113. return False
  114. # END abort early on no change
  115. self._st_mtime = stat.st_mtime
  116. # packs are supposed to be prefixed with pack- by git-convention
  117. # get all pack files, figure out what changed
  118. pack_files = set(glob.glob(os.path.join(self.root_path(), "pack-*.pack")))
  119. our_pack_files = {item[1].pack().path() for item in self._entities}
  120. # new packs
  121. for pack_file in (pack_files - our_pack_files):
  122. # init the hit-counter/priority with the size, a good measure for hit-
  123. # probability. Its implemented so that only 12 bytes will be read
  124. entity = PackEntity(pack_file)
  125. self._entities.append([entity.pack().size(), entity, entity.index().sha_to_index])
  126. # END for each new packfile
  127. # removed packs
  128. for pack_file in (our_pack_files - pack_files):
  129. del_index = -1
  130. for i, item in enumerate(self._entities):
  131. if item[1].pack().path() == pack_file:
  132. del_index = i
  133. break
  134. # END found index
  135. # END for each entity
  136. assert del_index != -1
  137. del(self._entities[del_index])
  138. # END for each removed pack
  139. # reinitialize prioritiess
  140. self._sort_entities()
  141. return True
  142. def entities(self):
  143. """:return: list of pack entities operated upon by this database"""
  144. return [item[1] for item in self._entities]
  145. def partial_to_complete_sha(self, partial_binsha, canonical_length):
  146. """:return: 20 byte sha as inferred by the given partial binary sha
  147. :param partial_binsha: binary sha with less than 20 bytes
  148. :param canonical_length: length of the corresponding canonical representation.
  149. It is required as binary sha's cannot display whether the original hex sha
  150. had an odd or even number of characters
  151. :raise AmbiguousObjectName:
  152. :raise BadObject: """
  153. candidate = None
  154. for item in self._entities:
  155. item_index = item[1].index().partial_sha_to_index(partial_binsha, canonical_length)
  156. if item_index is not None:
  157. sha = item[1].index().sha(item_index)
  158. if candidate and candidate != sha:
  159. raise AmbiguousObjectName(partial_binsha)
  160. candidate = sha
  161. # END handle full sha could be found
  162. # END for each entity
  163. if candidate:
  164. return candidate
  165. # still not found ?
  166. raise BadObject(partial_binsha)
  167. #} END interface