# basestemmer.py
  1. class BaseStemmer(object):
  2. def __init__(self):
  3. self.set_current("")
  4. def set_current(self, value):
  5. '''
  6. Set the self.current string.
  7. '''
  8. self.current = value
  9. self.cursor = 0
  10. self.limit = len(self.current)
  11. self.limit_backward = 0
  12. self.bra = self.cursor
  13. self.ket = self.limit
  14. def get_current(self):
  15. '''
  16. Get the self.current string.
  17. '''
  18. return self.current
  19. def copy_from(self, other):
  20. self.current = other.current
  21. self.cursor = other.cursor
  22. self.limit = other.limit
  23. self.limit_backward = other.limit_backward
  24. self.bra = other.bra
  25. self.ket = other.ket
  26. def in_grouping(self, s, min, max):
  27. if self.cursor >= self.limit:
  28. return False
  29. ch = ord(self.current[self.cursor])
  30. if ch > max or ch < min:
  31. return False
  32. ch -= min
  33. if (s[ch >> 3] & (0x1 << (ch & 0x7))) == 0:
  34. return False
  35. self.cursor += 1
  36. return True
  37. def go_in_grouping(self, s, min, max):
  38. while self.cursor < self.limit:
  39. ch = ord(self.current[self.cursor])
  40. if ch > max or ch < min:
  41. return True
  42. ch -= min
  43. if (s[ch >> 3] & (0x1 << (ch & 0x7))) == 0:
  44. return True
  45. self.cursor += 1
  46. return False
  47. def in_grouping_b(self, s, min, max):
  48. if self.cursor <= self.limit_backward:
  49. return False
  50. ch = ord(self.current[self.cursor - 1])
  51. if ch > max or ch < min:
  52. return False
  53. ch -= min
  54. if (s[ch >> 3] & (0x1 << (ch & 0x7))) == 0:
  55. return False
  56. self.cursor -= 1
  57. return True
  58. def go_in_grouping_b(self, s, min, max):
  59. while self.cursor > self.limit_backward:
  60. ch = ord(self.current[self.cursor - 1])
  61. if ch > max or ch < min:
  62. return True
  63. ch -= min
  64. if (s[ch >> 3] & (0x1 << (ch & 0x7))) == 0:
  65. return True
  66. self.cursor -= 1
  67. return False
  68. def out_grouping(self, s, min, max):
  69. if self.cursor >= self.limit:
  70. return False
  71. ch = ord(self.current[self.cursor])
  72. if ch > max or ch < min:
  73. self.cursor += 1
  74. return True
  75. ch -= min
  76. if (s[ch >> 3] & (0X1 << (ch & 0x7))) == 0:
  77. self.cursor += 1
  78. return True
  79. return False
  80. def go_out_grouping(self, s, min, max):
  81. while self.cursor < self.limit:
  82. ch = ord(self.current[self.cursor])
  83. if ch <= max and ch >= min:
  84. ch -= min
  85. if (s[ch >> 3] & (0X1 << (ch & 0x7))):
  86. return True
  87. self.cursor += 1
  88. return False
  89. def out_grouping_b(self, s, min, max):
  90. if self.cursor <= self.limit_backward:
  91. return False
  92. ch = ord(self.current[self.cursor - 1])
  93. if ch > max or ch < min:
  94. self.cursor -= 1
  95. return True
  96. ch -= min
  97. if (s[ch >> 3] & (0X1 << (ch & 0x7))) == 0:
  98. self.cursor -= 1
  99. return True
  100. return False
  101. def go_out_grouping_b(self, s, min, max):
  102. while self.cursor > self.limit_backward:
  103. ch = ord(self.current[self.cursor - 1])
  104. if ch <= max and ch >= min:
  105. ch -= min
  106. if (s[ch >> 3] & (0X1 << (ch & 0x7))):
  107. return True
  108. self.cursor -= 1
  109. return False
  110. def eq_s(self, s):
  111. if self.limit - self.cursor < len(s):
  112. return False
  113. if self.current[self.cursor:self.cursor + len(s)] != s:
  114. return False
  115. self.cursor += len(s)
  116. return True
  117. def eq_s_b(self, s):
  118. if self.cursor - self.limit_backward < len(s):
  119. return False
  120. if self.current[self.cursor - len(s):self.cursor] != s:
  121. return False
  122. self.cursor -= len(s)
  123. return True
  124. def find_among(self, v):
  125. i = 0
  126. j = len(v)
  127. c = self.cursor
  128. l = self.limit
  129. common_i = 0
  130. common_j = 0
  131. first_key_inspected = False
  132. while True:
  133. k = i + ((j - i) >> 1)
  134. diff = 0
  135. common = min(common_i, common_j) # smaller
  136. w = v[k]
  137. for i2 in range(common, len(w.s)):
  138. if c + common == l:
  139. diff = -1
  140. break
  141. diff = ord(self.current[c + common]) - ord(w.s[i2])
  142. if diff != 0:
  143. break
  144. common += 1
  145. if diff < 0:
  146. j = k
  147. common_j = common
  148. else:
  149. i = k
  150. common_i = common
  151. if j - i <= 1:
  152. if i > 0:
  153. break # v->s has been inspected
  154. if j == i:
  155. break # only one item in v
  156. # - but now we need to go round once more to get
  157. # v->s inspected. This looks messy, but is actually
  158. # the optimal approach.
  159. if first_key_inspected:
  160. break
  161. first_key_inspected = True
  162. while True:
  163. w = v[i]
  164. if common_i >= len(w.s):
  165. self.cursor = c + len(w.s)
  166. if w.method is None:
  167. return w.result
  168. method = getattr(self, w.method)
  169. res = method()
  170. self.cursor = c + len(w.s)
  171. if res:
  172. return w.result
  173. i = w.substring_i
  174. if i < 0:
  175. return 0
  176. return -1 # not reachable
  177. def find_among_b(self, v):
  178. '''
  179. find_among_b is for backwards processing. Same comments apply
  180. '''
  181. i = 0
  182. j = len(v)
  183. c = self.cursor
  184. lb = self.limit_backward
  185. common_i = 0
  186. common_j = 0
  187. first_key_inspected = False
  188. while True:
  189. k = i + ((j - i) >> 1)
  190. diff = 0
  191. common = min(common_i, common_j)
  192. w = v[k]
  193. for i2 in range(len(w.s) - 1 - common, -1, -1):
  194. if c - common == lb:
  195. diff = -1
  196. break
  197. diff = ord(self.current[c - 1 - common]) - ord(w.s[i2])
  198. if diff != 0:
  199. break
  200. common += 1
  201. if diff < 0:
  202. j = k
  203. common_j = common
  204. else:
  205. i = k
  206. common_i = common
  207. if j - i <= 1:
  208. if i > 0:
  209. break
  210. if j == i:
  211. break
  212. if first_key_inspected:
  213. break
  214. first_key_inspected = True
  215. while True:
  216. w = v[i]
  217. if common_i >= len(w.s):
  218. self.cursor = c - len(w.s)
  219. if w.method is None:
  220. return w.result
  221. method = getattr(self, w.method)
  222. res = method()
  223. self.cursor = c - len(w.s)
  224. if res:
  225. return w.result
  226. i = w.substring_i
  227. if i < 0:
  228. return 0
  229. return -1 # not reachable
  230. def replace_s(self, c_bra, c_ket, s):
  231. '''
  232. to replace chars between c_bra and c_ket in self.current by the
  233. chars in s.
  234. @type c_bra int
  235. @type c_ket int
  236. @type s: string
  237. '''
  238. adjustment = len(s) - (c_ket - c_bra)
  239. self.current = self.current[0:c_bra] + s + self.current[c_ket:]
  240. self.limit += adjustment
  241. if self.cursor >= c_ket:
  242. self.cursor += adjustment
  243. elif self.cursor > c_bra:
  244. self.cursor = c_bra
  245. return adjustment
  246. def slice_check(self):
  247. if self.bra < 0 or self.bra > self.ket or self.ket > self.limit or self.limit > len(self.current):
  248. return False
  249. return True
  250. def slice_from(self, s):
  251. '''
  252. @type s string
  253. '''
  254. result = False
  255. if self.slice_check():
  256. self.replace_s(self.bra, self.ket, s)
  257. result = True
  258. return result
  259. def slice_del(self):
  260. return self.slice_from("")
  261. def insert(self, c_bra, c_ket, s):
  262. '''
  263. @type c_bra int
  264. @type c_ket int
  265. @type s: string
  266. '''
  267. adjustment = self.replace_s(c_bra, c_ket, s)
  268. if c_bra <= self.bra:
  269. self.bra += adjustment
  270. if c_bra <= self.ket:
  271. self.ket += adjustment
  272. def slice_to(self):
  273. '''
  274. Return the slice as a string.
  275. '''
  276. result = ''
  277. if self.slice_check():
  278. result = self.current[self.bra:self.ket]
  279. return result
  280. def assign_to(self):
  281. '''
  282. Return the current string up to the limit.
  283. '''
  284. return self.current[0:self.limit]
  285. def stemWord(self, word):
  286. self.set_current(word)
  287. self._stem()
  288. return self.get_current()
  289. def stemWords(self, words):
  290. return [self.stemWord(word) for word in words]