# porter_stemmer.py
# Generated by Snowball 2.2.0 - https://snowballstem.org/
  2. from .basestemmer import BaseStemmer
  3. from .among import Among
  4. class PorterStemmer(BaseStemmer):
  5. '''
  6. This class implements the stemming algorithm defined by a snowball script.
  7. Generated by Snowball 2.2.0 - https://snowballstem.org/
  8. '''
  9. a_0 = [
  10. Among(u"s", -1, 3),
  11. Among(u"ies", 0, 2),
  12. Among(u"sses", 0, 1),
  13. Among(u"ss", 0, -1)
  14. ]
  15. a_1 = [
  16. Among(u"", -1, 3),
  17. Among(u"bb", 0, 2),
  18. Among(u"dd", 0, 2),
  19. Among(u"ff", 0, 2),
  20. Among(u"gg", 0, 2),
  21. Among(u"bl", 0, 1),
  22. Among(u"mm", 0, 2),
  23. Among(u"nn", 0, 2),
  24. Among(u"pp", 0, 2),
  25. Among(u"rr", 0, 2),
  26. Among(u"at", 0, 1),
  27. Among(u"tt", 0, 2),
  28. Among(u"iz", 0, 1)
  29. ]
  30. a_2 = [
  31. Among(u"ed", -1, 2),
  32. Among(u"eed", 0, 1),
  33. Among(u"ing", -1, 2)
  34. ]
  35. a_3 = [
  36. Among(u"anci", -1, 3),
  37. Among(u"enci", -1, 2),
  38. Among(u"abli", -1, 4),
  39. Among(u"eli", -1, 6),
  40. Among(u"alli", -1, 9),
  41. Among(u"ousli", -1, 11),
  42. Among(u"entli", -1, 5),
  43. Among(u"aliti", -1, 9),
  44. Among(u"biliti", -1, 13),
  45. Among(u"iviti", -1, 12),
  46. Among(u"tional", -1, 1),
  47. Among(u"ational", 10, 8),
  48. Among(u"alism", -1, 9),
  49. Among(u"ation", -1, 8),
  50. Among(u"ization", 13, 7),
  51. Among(u"izer", -1, 7),
  52. Among(u"ator", -1, 8),
  53. Among(u"iveness", -1, 12),
  54. Among(u"fulness", -1, 10),
  55. Among(u"ousness", -1, 11)
  56. ]
  57. a_4 = [
  58. Among(u"icate", -1, 2),
  59. Among(u"ative", -1, 3),
  60. Among(u"alize", -1, 1),
  61. Among(u"iciti", -1, 2),
  62. Among(u"ical", -1, 2),
  63. Among(u"ful", -1, 3),
  64. Among(u"ness", -1, 3)
  65. ]
  66. a_5 = [
  67. Among(u"ic", -1, 1),
  68. Among(u"ance", -1, 1),
  69. Among(u"ence", -1, 1),
  70. Among(u"able", -1, 1),
  71. Among(u"ible", -1, 1),
  72. Among(u"ate", -1, 1),
  73. Among(u"ive", -1, 1),
  74. Among(u"ize", -1, 1),
  75. Among(u"iti", -1, 1),
  76. Among(u"al", -1, 1),
  77. Among(u"ism", -1, 1),
  78. Among(u"ion", -1, 2),
  79. Among(u"er", -1, 1),
  80. Among(u"ous", -1, 1),
  81. Among(u"ant", -1, 1),
  82. Among(u"ent", -1, 1),
  83. Among(u"ment", 15, 1),
  84. Among(u"ement", 16, 1),
  85. Among(u"ou", -1, 1)
  86. ]
  87. g_v = [17, 65, 16, 1]
  88. g_v_WXY = [1, 17, 65, 208, 1]
  89. B_Y_found = False
  90. I_p2 = 0
  91. I_p1 = 0
  92. def __r_shortv(self):
  93. if not self.out_grouping_b(PorterStemmer.g_v_WXY, 89, 121):
  94. return False
  95. if not self.in_grouping_b(PorterStemmer.g_v, 97, 121):
  96. return False
  97. if not self.out_grouping_b(PorterStemmer.g_v, 97, 121):
  98. return False
  99. return True
  100. def __r_R1(self):
  101. if not self.I_p1 <= self.cursor:
  102. return False
  103. return True
  104. def __r_R2(self):
  105. if not self.I_p2 <= self.cursor:
  106. return False
  107. return True
  108. def __r_Step_1a(self):
  109. self.ket = self.cursor
  110. among_var = self.find_among_b(PorterStemmer.a_0)
  111. if among_var == 0:
  112. return False
  113. self.bra = self.cursor
  114. if among_var == 1:
  115. if not self.slice_from(u"ss"):
  116. return False
  117. elif among_var == 2:
  118. if not self.slice_from(u"i"):
  119. return False
  120. elif among_var == 3:
  121. if not self.slice_del():
  122. return False
  123. return True
  124. def __r_Step_1b(self):
  125. self.ket = self.cursor
  126. among_var = self.find_among_b(PorterStemmer.a_2)
  127. if among_var == 0:
  128. return False
  129. self.bra = self.cursor
  130. if among_var == 1:
  131. if not self.__r_R1():
  132. return False
  133. if not self.slice_from(u"ee"):
  134. return False
  135. else:
  136. v_1 = self.limit - self.cursor
  137. if not self.go_out_grouping_b(PorterStemmer.g_v, 97, 121):
  138. return False
  139. self.cursor -= 1
  140. self.cursor = self.limit - v_1
  141. if not self.slice_del():
  142. return False
  143. v_2 = self.limit - self.cursor
  144. among_var = self.find_among_b(PorterStemmer.a_1)
  145. if among_var == 0:
  146. return False
  147. self.cursor = self.limit - v_2
  148. if among_var == 1:
  149. c = self.cursor
  150. self.insert(self.cursor, self.cursor, u"e")
  151. self.cursor = c
  152. elif among_var == 2:
  153. self.ket = self.cursor
  154. if self.cursor <= self.limit_backward:
  155. return False
  156. self.cursor -= 1
  157. self.bra = self.cursor
  158. if not self.slice_del():
  159. return False
  160. else:
  161. if self.cursor != self.I_p1:
  162. return False
  163. v_3 = self.limit - self.cursor
  164. if not self.__r_shortv():
  165. return False
  166. self.cursor = self.limit - v_3
  167. c = self.cursor
  168. self.insert(self.cursor, self.cursor, u"e")
  169. self.cursor = c
  170. return True
  171. def __r_Step_1c(self):
  172. self.ket = self.cursor
  173. try:
  174. v_1 = self.limit - self.cursor
  175. try:
  176. if not self.eq_s_b(u"y"):
  177. raise lab1()
  178. raise lab0()
  179. except lab1: pass
  180. self.cursor = self.limit - v_1
  181. if not self.eq_s_b(u"Y"):
  182. return False
  183. except lab0: pass
  184. self.bra = self.cursor
  185. if not self.go_out_grouping_b(PorterStemmer.g_v, 97, 121):
  186. return False
  187. self.cursor -= 1
  188. if not self.slice_from(u"i"):
  189. return False
  190. return True
  191. def __r_Step_2(self):
  192. self.ket = self.cursor
  193. among_var = self.find_among_b(PorterStemmer.a_3)
  194. if among_var == 0:
  195. return False
  196. self.bra = self.cursor
  197. if not self.__r_R1():
  198. return False
  199. if among_var == 1:
  200. if not self.slice_from(u"tion"):
  201. return False
  202. elif among_var == 2:
  203. if not self.slice_from(u"ence"):
  204. return False
  205. elif among_var == 3:
  206. if not self.slice_from(u"ance"):
  207. return False
  208. elif among_var == 4:
  209. if not self.slice_from(u"able"):
  210. return False
  211. elif among_var == 5:
  212. if not self.slice_from(u"ent"):
  213. return False
  214. elif among_var == 6:
  215. if not self.slice_from(u"e"):
  216. return False
  217. elif among_var == 7:
  218. if not self.slice_from(u"ize"):
  219. return False
  220. elif among_var == 8:
  221. if not self.slice_from(u"ate"):
  222. return False
  223. elif among_var == 9:
  224. if not self.slice_from(u"al"):
  225. return False
  226. elif among_var == 10:
  227. if not self.slice_from(u"ful"):
  228. return False
  229. elif among_var == 11:
  230. if not self.slice_from(u"ous"):
  231. return False
  232. elif among_var == 12:
  233. if not self.slice_from(u"ive"):
  234. return False
  235. else:
  236. if not self.slice_from(u"ble"):
  237. return False
  238. return True
  239. def __r_Step_3(self):
  240. self.ket = self.cursor
  241. among_var = self.find_among_b(PorterStemmer.a_4)
  242. if among_var == 0:
  243. return False
  244. self.bra = self.cursor
  245. if not self.__r_R1():
  246. return False
  247. if among_var == 1:
  248. if not self.slice_from(u"al"):
  249. return False
  250. elif among_var == 2:
  251. if not self.slice_from(u"ic"):
  252. return False
  253. else:
  254. if not self.slice_del():
  255. return False
  256. return True
  257. def __r_Step_4(self):
  258. self.ket = self.cursor
  259. among_var = self.find_among_b(PorterStemmer.a_5)
  260. if among_var == 0:
  261. return False
  262. self.bra = self.cursor
  263. if not self.__r_R2():
  264. return False
  265. if among_var == 1:
  266. if not self.slice_del():
  267. return False
  268. else:
  269. try:
  270. v_1 = self.limit - self.cursor
  271. try:
  272. if not self.eq_s_b(u"s"):
  273. raise lab1()
  274. raise lab0()
  275. except lab1: pass
  276. self.cursor = self.limit - v_1
  277. if not self.eq_s_b(u"t"):
  278. return False
  279. except lab0: pass
  280. if not self.slice_del():
  281. return False
  282. return True
  283. def __r_Step_5a(self):
  284. self.ket = self.cursor
  285. if not self.eq_s_b(u"e"):
  286. return False
  287. self.bra = self.cursor
  288. try:
  289. v_1 = self.limit - self.cursor
  290. try:
  291. if not self.__r_R2():
  292. raise lab1()
  293. raise lab0()
  294. except lab1: pass
  295. self.cursor = self.limit - v_1
  296. if not self.__r_R1():
  297. return False
  298. v_2 = self.limit - self.cursor
  299. try:
  300. if not self.__r_shortv():
  301. raise lab2()
  302. return False
  303. except lab2: pass
  304. self.cursor = self.limit - v_2
  305. except lab0: pass
  306. if not self.slice_del():
  307. return False
  308. return True
  309. def __r_Step_5b(self):
  310. self.ket = self.cursor
  311. if not self.eq_s_b(u"l"):
  312. return False
  313. self.bra = self.cursor
  314. if not self.__r_R2():
  315. return False
  316. if not self.eq_s_b(u"l"):
  317. return False
  318. if not self.slice_del():
  319. return False
  320. return True
  321. def _stem(self):
  322. self.B_Y_found = False
  323. v_1 = self.cursor
  324. try:
  325. self.bra = self.cursor
  326. if not self.eq_s(u"y"):
  327. raise lab0()
  328. self.ket = self.cursor
  329. if not self.slice_from(u"Y"):
  330. return False
  331. self.B_Y_found = True
  332. except lab0: pass
  333. self.cursor = v_1
  334. v_2 = self.cursor
  335. try:
  336. while True:
  337. v_3 = self.cursor
  338. try:
  339. try:
  340. while True:
  341. v_4 = self.cursor
  342. try:
  343. if not self.in_grouping(PorterStemmer.g_v, 97, 121):
  344. raise lab4()
  345. self.bra = self.cursor
  346. if not self.eq_s(u"y"):
  347. raise lab4()
  348. self.ket = self.cursor
  349. self.cursor = v_4
  350. raise lab3()
  351. except lab4: pass
  352. self.cursor = v_4
  353. if self.cursor >= self.limit:
  354. raise lab2()
  355. self.cursor += 1
  356. except lab3: pass
  357. if not self.slice_from(u"Y"):
  358. return False
  359. self.B_Y_found = True
  360. continue
  361. except lab2: pass
  362. self.cursor = v_3
  363. break
  364. except lab1: pass
  365. self.cursor = v_2
  366. self.I_p1 = self.limit
  367. self.I_p2 = self.limit
  368. v_5 = self.cursor
  369. try:
  370. if not self.go_out_grouping(PorterStemmer.g_v, 97, 121):
  371. raise lab5()
  372. self.cursor += 1
  373. if not self.go_in_grouping(PorterStemmer.g_v, 97, 121):
  374. raise lab5()
  375. self.cursor += 1
  376. self.I_p1 = self.cursor
  377. if not self.go_out_grouping(PorterStemmer.g_v, 97, 121):
  378. raise lab5()
  379. self.cursor += 1
  380. if not self.go_in_grouping(PorterStemmer.g_v, 97, 121):
  381. raise lab5()
  382. self.cursor += 1
  383. self.I_p2 = self.cursor
  384. except lab5: pass
  385. self.cursor = v_5
  386. self.limit_backward = self.cursor
  387. self.cursor = self.limit
  388. v_6 = self.limit - self.cursor
  389. self.__r_Step_1a()
  390. self.cursor = self.limit - v_6
  391. v_7 = self.limit - self.cursor
  392. self.__r_Step_1b()
  393. self.cursor = self.limit - v_7
  394. v_8 = self.limit - self.cursor
  395. self.__r_Step_1c()
  396. self.cursor = self.limit - v_8
  397. v_9 = self.limit - self.cursor
  398. self.__r_Step_2()
  399. self.cursor = self.limit - v_9
  400. v_10 = self.limit - self.cursor
  401. self.__r_Step_3()
  402. self.cursor = self.limit - v_10
  403. v_11 = self.limit - self.cursor
  404. self.__r_Step_4()
  405. self.cursor = self.limit - v_11
  406. v_12 = self.limit - self.cursor
  407. self.__r_Step_5a()
  408. self.cursor = self.limit - v_12
  409. v_13 = self.limit - self.cursor
  410. self.__r_Step_5b()
  411. self.cursor = self.limit - v_13
  412. self.cursor = self.limit_backward
  413. v_14 = self.cursor
  414. try:
  415. if not self.B_Y_found:
  416. raise lab6()
  417. while True:
  418. v_15 = self.cursor
  419. try:
  420. try:
  421. while True:
  422. v_16 = self.cursor
  423. try:
  424. self.bra = self.cursor
  425. if not self.eq_s(u"Y"):
  426. raise lab9()
  427. self.ket = self.cursor
  428. self.cursor = v_16
  429. raise lab8()
  430. except lab9: pass
  431. self.cursor = v_16
  432. if self.cursor >= self.limit:
  433. raise lab7()
  434. self.cursor += 1
  435. except lab8: pass
  436. if not self.slice_from(u"y"):
  437. return False
  438. continue
  439. except lab7: pass
  440. self.cursor = v_15
  441. break
  442. except lab6: pass
  443. self.cursor = v_14
  444. return True
  445. class lab0(BaseException): pass
  446. class lab1(BaseException): pass
  447. class lab2(BaseException): pass
  448. class lab3(BaseException): pass
  449. class lab4(BaseException): pass
  450. class lab5(BaseException): pass
  451. class lab6(BaseException): pass
  452. class lab7(BaseException): pass
  453. class lab8(BaseException): pass
  454. class lab9(BaseException): pass