# english_stemmer.py
# Generated by Snowball 2.2.0 - https://snowballstem.org/
from .basestemmer import BaseStemmer
from .among import Among
  4. class EnglishStemmer(BaseStemmer):
  5. '''
  6. This class implements the stemming algorithm defined by a snowball script.
  7. Generated by Snowball 2.2.0 - https://snowballstem.org/
  8. '''
  9. a_0 = [
  10. Among(u"arsen", -1, -1),
  11. Among(u"commun", -1, -1),
  12. Among(u"gener", -1, -1)
  13. ]
  14. a_1 = [
  15. Among(u"'", -1, 1),
  16. Among(u"'s'", 0, 1),
  17. Among(u"'s", -1, 1)
  18. ]
  19. a_2 = [
  20. Among(u"ied", -1, 2),
  21. Among(u"s", -1, 3),
  22. Among(u"ies", 1, 2),
  23. Among(u"sses", 1, 1),
  24. Among(u"ss", 1, -1),
  25. Among(u"us", 1, -1)
  26. ]
  27. a_3 = [
  28. Among(u"", -1, 3),
  29. Among(u"bb", 0, 2),
  30. Among(u"dd", 0, 2),
  31. Among(u"ff", 0, 2),
  32. Among(u"gg", 0, 2),
  33. Among(u"bl", 0, 1),
  34. Among(u"mm", 0, 2),
  35. Among(u"nn", 0, 2),
  36. Among(u"pp", 0, 2),
  37. Among(u"rr", 0, 2),
  38. Among(u"at", 0, 1),
  39. Among(u"tt", 0, 2),
  40. Among(u"iz", 0, 1)
  41. ]
  42. a_4 = [
  43. Among(u"ed", -1, 2),
  44. Among(u"eed", 0, 1),
  45. Among(u"ing", -1, 2),
  46. Among(u"edly", -1, 2),
  47. Among(u"eedly", 3, 1),
  48. Among(u"ingly", -1, 2)
  49. ]
  50. a_5 = [
  51. Among(u"anci", -1, 3),
  52. Among(u"enci", -1, 2),
  53. Among(u"ogi", -1, 13),
  54. Among(u"li", -1, 15),
  55. Among(u"bli", 3, 12),
  56. Among(u"abli", 4, 4),
  57. Among(u"alli", 3, 8),
  58. Among(u"fulli", 3, 9),
  59. Among(u"lessli", 3, 14),
  60. Among(u"ousli", 3, 10),
  61. Among(u"entli", 3, 5),
  62. Among(u"aliti", -1, 8),
  63. Among(u"biliti", -1, 12),
  64. Among(u"iviti", -1, 11),
  65. Among(u"tional", -1, 1),
  66. Among(u"ational", 14, 7),
  67. Among(u"alism", -1, 8),
  68. Among(u"ation", -1, 7),
  69. Among(u"ization", 17, 6),
  70. Among(u"izer", -1, 6),
  71. Among(u"ator", -1, 7),
  72. Among(u"iveness", -1, 11),
  73. Among(u"fulness", -1, 9),
  74. Among(u"ousness", -1, 10)
  75. ]
  76. a_6 = [
  77. Among(u"icate", -1, 4),
  78. Among(u"ative", -1, 6),
  79. Among(u"alize", -1, 3),
  80. Among(u"iciti", -1, 4),
  81. Among(u"ical", -1, 4),
  82. Among(u"tional", -1, 1),
  83. Among(u"ational", 5, 2),
  84. Among(u"ful", -1, 5),
  85. Among(u"ness", -1, 5)
  86. ]
  87. a_7 = [
  88. Among(u"ic", -1, 1),
  89. Among(u"ance", -1, 1),
  90. Among(u"ence", -1, 1),
  91. Among(u"able", -1, 1),
  92. Among(u"ible", -1, 1),
  93. Among(u"ate", -1, 1),
  94. Among(u"ive", -1, 1),
  95. Among(u"ize", -1, 1),
  96. Among(u"iti", -1, 1),
  97. Among(u"al", -1, 1),
  98. Among(u"ism", -1, 1),
  99. Among(u"ion", -1, 2),
  100. Among(u"er", -1, 1),
  101. Among(u"ous", -1, 1),
  102. Among(u"ant", -1, 1),
  103. Among(u"ent", -1, 1),
  104. Among(u"ment", 15, 1),
  105. Among(u"ement", 16, 1)
  106. ]
  107. a_8 = [
  108. Among(u"e", -1, 1),
  109. Among(u"l", -1, 2)
  110. ]
  111. a_9 = [
  112. Among(u"succeed", -1, -1),
  113. Among(u"proceed", -1, -1),
  114. Among(u"exceed", -1, -1),
  115. Among(u"canning", -1, -1),
  116. Among(u"inning", -1, -1),
  117. Among(u"earring", -1, -1),
  118. Among(u"herring", -1, -1),
  119. Among(u"outing", -1, -1)
  120. ]
  121. a_10 = [
  122. Among(u"andes", -1, -1),
  123. Among(u"atlas", -1, -1),
  124. Among(u"bias", -1, -1),
  125. Among(u"cosmos", -1, -1),
  126. Among(u"dying", -1, 3),
  127. Among(u"early", -1, 9),
  128. Among(u"gently", -1, 7),
  129. Among(u"howe", -1, -1),
  130. Among(u"idly", -1, 6),
  131. Among(u"lying", -1, 4),
  132. Among(u"news", -1, -1),
  133. Among(u"only", -1, 10),
  134. Among(u"singly", -1, 11),
  135. Among(u"skies", -1, 2),
  136. Among(u"skis", -1, 1),
  137. Among(u"sky", -1, -1),
  138. Among(u"tying", -1, 5),
  139. Among(u"ugly", -1, 8)
  140. ]
  141. g_v = [17, 65, 16, 1]
  142. g_v_WXY = [1, 17, 65, 208, 1]
  143. g_valid_LI = [55, 141, 2]
  144. B_Y_found = False
  145. I_p2 = 0
  146. I_p1 = 0
  147. def __r_prelude(self):
  148. self.B_Y_found = False
  149. v_1 = self.cursor
  150. try:
  151. self.bra = self.cursor
  152. if not self.eq_s(u"'"):
  153. raise lab0()
  154. self.ket = self.cursor
  155. if not self.slice_del():
  156. return False
  157. except lab0: pass
  158. self.cursor = v_1
  159. v_2 = self.cursor
  160. try:
  161. self.bra = self.cursor
  162. if not self.eq_s(u"y"):
  163. raise lab1()
  164. self.ket = self.cursor
  165. if not self.slice_from(u"Y"):
  166. return False
  167. self.B_Y_found = True
  168. except lab1: pass
  169. self.cursor = v_2
  170. v_3 = self.cursor
  171. try:
  172. while True:
  173. v_4 = self.cursor
  174. try:
  175. try:
  176. while True:
  177. v_5 = self.cursor
  178. try:
  179. if not self.in_grouping(EnglishStemmer.g_v, 97, 121):
  180. raise lab5()
  181. self.bra = self.cursor
  182. if not self.eq_s(u"y"):
  183. raise lab5()
  184. self.ket = self.cursor
  185. self.cursor = v_5
  186. raise lab4()
  187. except lab5: pass
  188. self.cursor = v_5
  189. if self.cursor >= self.limit:
  190. raise lab3()
  191. self.cursor += 1
  192. except lab4: pass
  193. if not self.slice_from(u"Y"):
  194. return False
  195. self.B_Y_found = True
  196. continue
  197. except lab3: pass
  198. self.cursor = v_4
  199. break
  200. except lab2: pass
  201. self.cursor = v_3
  202. return True
  203. def __r_mark_regions(self):
  204. self.I_p1 = self.limit
  205. self.I_p2 = self.limit
  206. v_1 = self.cursor
  207. try:
  208. try:
  209. v_2 = self.cursor
  210. try:
  211. if self.find_among(EnglishStemmer.a_0) == 0:
  212. raise lab2()
  213. raise lab1()
  214. except lab2: pass
  215. self.cursor = v_2
  216. if not self.go_out_grouping(EnglishStemmer.g_v, 97, 121):
  217. raise lab0()
  218. self.cursor += 1
  219. if not self.go_in_grouping(EnglishStemmer.g_v, 97, 121):
  220. raise lab0()
  221. self.cursor += 1
  222. except lab1: pass
  223. self.I_p1 = self.cursor
  224. if not self.go_out_grouping(EnglishStemmer.g_v, 97, 121):
  225. raise lab0()
  226. self.cursor += 1
  227. if not self.go_in_grouping(EnglishStemmer.g_v, 97, 121):
  228. raise lab0()
  229. self.cursor += 1
  230. self.I_p2 = self.cursor
  231. except lab0: pass
  232. self.cursor = v_1
  233. return True
  234. def __r_shortv(self):
  235. try:
  236. v_1 = self.limit - self.cursor
  237. try:
  238. if not self.out_grouping_b(EnglishStemmer.g_v_WXY, 89, 121):
  239. raise lab1()
  240. if not self.in_grouping_b(EnglishStemmer.g_v, 97, 121):
  241. raise lab1()
  242. if not self.out_grouping_b(EnglishStemmer.g_v, 97, 121):
  243. raise lab1()
  244. raise lab0()
  245. except lab1: pass
  246. self.cursor = self.limit - v_1
  247. if not self.out_grouping_b(EnglishStemmer.g_v, 97, 121):
  248. return False
  249. if not self.in_grouping_b(EnglishStemmer.g_v, 97, 121):
  250. return False
  251. if self.cursor > self.limit_backward:
  252. return False
  253. except lab0: pass
  254. return True
  255. def __r_R1(self):
  256. if not self.I_p1 <= self.cursor:
  257. return False
  258. return True
  259. def __r_R2(self):
  260. if not self.I_p2 <= self.cursor:
  261. return False
  262. return True
  263. def __r_Step_1a(self):
  264. v_1 = self.limit - self.cursor
  265. try:
  266. self.ket = self.cursor
  267. if self.find_among_b(EnglishStemmer.a_1) == 0:
  268. self.cursor = self.limit - v_1
  269. raise lab0()
  270. self.bra = self.cursor
  271. if not self.slice_del():
  272. return False
  273. except lab0: pass
  274. self.ket = self.cursor
  275. among_var = self.find_among_b(EnglishStemmer.a_2)
  276. if among_var == 0:
  277. return False
  278. self.bra = self.cursor
  279. if among_var == 1:
  280. if not self.slice_from(u"ss"):
  281. return False
  282. elif among_var == 2:
  283. try:
  284. v_2 = self.limit - self.cursor
  285. try:
  286. c = self.cursor - 2
  287. if c < self.limit_backward:
  288. raise lab2()
  289. self.cursor = c
  290. if not self.slice_from(u"i"):
  291. return False
  292. raise lab1()
  293. except lab2: pass
  294. self.cursor = self.limit - v_2
  295. if not self.slice_from(u"ie"):
  296. return False
  297. except lab1: pass
  298. elif among_var == 3:
  299. if self.cursor <= self.limit_backward:
  300. return False
  301. self.cursor -= 1
  302. if not self.go_out_grouping_b(EnglishStemmer.g_v, 97, 121):
  303. return False
  304. self.cursor -= 1
  305. if not self.slice_del():
  306. return False
  307. return True
  308. def __r_Step_1b(self):
  309. self.ket = self.cursor
  310. among_var = self.find_among_b(EnglishStemmer.a_4)
  311. if among_var == 0:
  312. return False
  313. self.bra = self.cursor
  314. if among_var == 1:
  315. if not self.__r_R1():
  316. return False
  317. if not self.slice_from(u"ee"):
  318. return False
  319. else:
  320. v_1 = self.limit - self.cursor
  321. if not self.go_out_grouping_b(EnglishStemmer.g_v, 97, 121):
  322. return False
  323. self.cursor -= 1
  324. self.cursor = self.limit - v_1
  325. if not self.slice_del():
  326. return False
  327. v_2 = self.limit - self.cursor
  328. among_var = self.find_among_b(EnglishStemmer.a_3)
  329. if among_var == 0:
  330. return False
  331. self.cursor = self.limit - v_2
  332. if among_var == 1:
  333. c = self.cursor
  334. self.insert(self.cursor, self.cursor, u"e")
  335. self.cursor = c
  336. elif among_var == 2:
  337. self.ket = self.cursor
  338. if self.cursor <= self.limit_backward:
  339. return False
  340. self.cursor -= 1
  341. self.bra = self.cursor
  342. if not self.slice_del():
  343. return False
  344. else:
  345. if self.cursor != self.I_p1:
  346. return False
  347. v_3 = self.limit - self.cursor
  348. if not self.__r_shortv():
  349. return False
  350. self.cursor = self.limit - v_3
  351. c = self.cursor
  352. self.insert(self.cursor, self.cursor, u"e")
  353. self.cursor = c
  354. return True
  355. def __r_Step_1c(self):
  356. self.ket = self.cursor
  357. try:
  358. v_1 = self.limit - self.cursor
  359. try:
  360. if not self.eq_s_b(u"y"):
  361. raise lab1()
  362. raise lab0()
  363. except lab1: pass
  364. self.cursor = self.limit - v_1
  365. if not self.eq_s_b(u"Y"):
  366. return False
  367. except lab0: pass
  368. self.bra = self.cursor
  369. if not self.out_grouping_b(EnglishStemmer.g_v, 97, 121):
  370. return False
  371. try:
  372. if self.cursor > self.limit_backward:
  373. raise lab2()
  374. return False
  375. except lab2: pass
  376. if not self.slice_from(u"i"):
  377. return False
  378. return True
  379. def __r_Step_2(self):
  380. self.ket = self.cursor
  381. among_var = self.find_among_b(EnglishStemmer.a_5)
  382. if among_var == 0:
  383. return False
  384. self.bra = self.cursor
  385. if not self.__r_R1():
  386. return False
  387. if among_var == 1:
  388. if not self.slice_from(u"tion"):
  389. return False
  390. elif among_var == 2:
  391. if not self.slice_from(u"ence"):
  392. return False
  393. elif among_var == 3:
  394. if not self.slice_from(u"ance"):
  395. return False
  396. elif among_var == 4:
  397. if not self.slice_from(u"able"):
  398. return False
  399. elif among_var == 5:
  400. if not self.slice_from(u"ent"):
  401. return False
  402. elif among_var == 6:
  403. if not self.slice_from(u"ize"):
  404. return False
  405. elif among_var == 7:
  406. if not self.slice_from(u"ate"):
  407. return False
  408. elif among_var == 8:
  409. if not self.slice_from(u"al"):
  410. return False
  411. elif among_var == 9:
  412. if not self.slice_from(u"ful"):
  413. return False
  414. elif among_var == 10:
  415. if not self.slice_from(u"ous"):
  416. return False
  417. elif among_var == 11:
  418. if not self.slice_from(u"ive"):
  419. return False
  420. elif among_var == 12:
  421. if not self.slice_from(u"ble"):
  422. return False
  423. elif among_var == 13:
  424. if not self.eq_s_b(u"l"):
  425. return False
  426. if not self.slice_from(u"og"):
  427. return False
  428. elif among_var == 14:
  429. if not self.slice_from(u"less"):
  430. return False
  431. else:
  432. if not self.in_grouping_b(EnglishStemmer.g_valid_LI, 99, 116):
  433. return False
  434. if not self.slice_del():
  435. return False
  436. return True
  437. def __r_Step_3(self):
  438. self.ket = self.cursor
  439. among_var = self.find_among_b(EnglishStemmer.a_6)
  440. if among_var == 0:
  441. return False
  442. self.bra = self.cursor
  443. if not self.__r_R1():
  444. return False
  445. if among_var == 1:
  446. if not self.slice_from(u"tion"):
  447. return False
  448. elif among_var == 2:
  449. if not self.slice_from(u"ate"):
  450. return False
  451. elif among_var == 3:
  452. if not self.slice_from(u"al"):
  453. return False
  454. elif among_var == 4:
  455. if not self.slice_from(u"ic"):
  456. return False
  457. elif among_var == 5:
  458. if not self.slice_del():
  459. return False
  460. else:
  461. if not self.__r_R2():
  462. return False
  463. if not self.slice_del():
  464. return False
  465. return True
  466. def __r_Step_4(self):
  467. self.ket = self.cursor
  468. among_var = self.find_among_b(EnglishStemmer.a_7)
  469. if among_var == 0:
  470. return False
  471. self.bra = self.cursor
  472. if not self.__r_R2():
  473. return False
  474. if among_var == 1:
  475. if not self.slice_del():
  476. return False
  477. else:
  478. try:
  479. v_1 = self.limit - self.cursor
  480. try:
  481. if not self.eq_s_b(u"s"):
  482. raise lab1()
  483. raise lab0()
  484. except lab1: pass
  485. self.cursor = self.limit - v_1
  486. if not self.eq_s_b(u"t"):
  487. return False
  488. except lab0: pass
  489. if not self.slice_del():
  490. return False
  491. return True
  492. def __r_Step_5(self):
  493. self.ket = self.cursor
  494. among_var = self.find_among_b(EnglishStemmer.a_8)
  495. if among_var == 0:
  496. return False
  497. self.bra = self.cursor
  498. if among_var == 1:
  499. try:
  500. v_1 = self.limit - self.cursor
  501. try:
  502. if not self.__r_R2():
  503. raise lab1()
  504. raise lab0()
  505. except lab1: pass
  506. self.cursor = self.limit - v_1
  507. if not self.__r_R1():
  508. return False
  509. v_2 = self.limit - self.cursor
  510. try:
  511. if not self.__r_shortv():
  512. raise lab2()
  513. return False
  514. except lab2: pass
  515. self.cursor = self.limit - v_2
  516. except lab0: pass
  517. if not self.slice_del():
  518. return False
  519. else:
  520. if not self.__r_R2():
  521. return False
  522. if not self.eq_s_b(u"l"):
  523. return False
  524. if not self.slice_del():
  525. return False
  526. return True
  527. def __r_exception2(self):
  528. self.ket = self.cursor
  529. if self.find_among_b(EnglishStemmer.a_9) == 0:
  530. return False
  531. self.bra = self.cursor
  532. if self.cursor > self.limit_backward:
  533. return False
  534. return True
  535. def __r_exception1(self):
  536. self.bra = self.cursor
  537. among_var = self.find_among(EnglishStemmer.a_10)
  538. if among_var == 0:
  539. return False
  540. self.ket = self.cursor
  541. if self.cursor < self.limit:
  542. return False
  543. if among_var == 1:
  544. if not self.slice_from(u"ski"):
  545. return False
  546. elif among_var == 2:
  547. if not self.slice_from(u"sky"):
  548. return False
  549. elif among_var == 3:
  550. if not self.slice_from(u"die"):
  551. return False
  552. elif among_var == 4:
  553. if not self.slice_from(u"lie"):
  554. return False
  555. elif among_var == 5:
  556. if not self.slice_from(u"tie"):
  557. return False
  558. elif among_var == 6:
  559. if not self.slice_from(u"idl"):
  560. return False
  561. elif among_var == 7:
  562. if not self.slice_from(u"gentl"):
  563. return False
  564. elif among_var == 8:
  565. if not self.slice_from(u"ugli"):
  566. return False
  567. elif among_var == 9:
  568. if not self.slice_from(u"earli"):
  569. return False
  570. elif among_var == 10:
  571. if not self.slice_from(u"onli"):
  572. return False
  573. elif among_var == 11:
  574. if not self.slice_from(u"singl"):
  575. return False
  576. return True
  577. def __r_postlude(self):
  578. if not self.B_Y_found:
  579. return False
  580. while True:
  581. v_1 = self.cursor
  582. try:
  583. try:
  584. while True:
  585. v_2 = self.cursor
  586. try:
  587. self.bra = self.cursor
  588. if not self.eq_s(u"Y"):
  589. raise lab2()
  590. self.ket = self.cursor
  591. self.cursor = v_2
  592. raise lab1()
  593. except lab2: pass
  594. self.cursor = v_2
  595. if self.cursor >= self.limit:
  596. raise lab0()
  597. self.cursor += 1
  598. except lab1: pass
  599. if not self.slice_from(u"y"):
  600. return False
  601. continue
  602. except lab0: pass
  603. self.cursor = v_1
  604. break
  605. return True
  606. def _stem(self):
  607. try:
  608. v_1 = self.cursor
  609. try:
  610. if not self.__r_exception1():
  611. raise lab1()
  612. raise lab0()
  613. except lab1: pass
  614. self.cursor = v_1
  615. try:
  616. v_2 = self.cursor
  617. try:
  618. c = self.cursor + 3
  619. if c > self.limit:
  620. raise lab3()
  621. self.cursor = c
  622. raise lab2()
  623. except lab3: pass
  624. self.cursor = v_2
  625. raise lab0()
  626. except lab2: pass
  627. self.cursor = v_1
  628. self.__r_prelude()
  629. self.__r_mark_regions()
  630. self.limit_backward = self.cursor
  631. self.cursor = self.limit
  632. v_5 = self.limit - self.cursor
  633. self.__r_Step_1a()
  634. self.cursor = self.limit - v_5
  635. try:
  636. v_6 = self.limit - self.cursor
  637. try:
  638. if not self.__r_exception2():
  639. raise lab5()
  640. raise lab4()
  641. except lab5: pass
  642. self.cursor = self.limit - v_6
  643. v_7 = self.limit - self.cursor
  644. self.__r_Step_1b()
  645. self.cursor = self.limit - v_7
  646. v_8 = self.limit - self.cursor
  647. self.__r_Step_1c()
  648. self.cursor = self.limit - v_8
  649. v_9 = self.limit - self.cursor
  650. self.__r_Step_2()
  651. self.cursor = self.limit - v_9
  652. v_10 = self.limit - self.cursor
  653. self.__r_Step_3()
  654. self.cursor = self.limit - v_10
  655. v_11 = self.limit - self.cursor
  656. self.__r_Step_4()
  657. self.cursor = self.limit - v_11
  658. v_12 = self.limit - self.cursor
  659. self.__r_Step_5()
  660. self.cursor = self.limit - v_12
  661. except lab4: pass
  662. self.cursor = self.limit_backward
  663. v_13 = self.cursor
  664. self.__r_postlude()
  665. self.cursor = v_13
  666. except lab0: pass
  667. return True
  668. class lab0(BaseException): pass
  669. class lab1(BaseException): pass
  670. class lab2(BaseException): pass
  671. class lab3(BaseException): pass
  672. class lab4(BaseException): pass
  673. class lab5(BaseException): pass