| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323 |
class BaseStemmer(object):
    '''
    Runtime support shared by stemmer classes.

    Instance state:
      current         -- the string being processed
      cursor          -- current position inside ``current``
      limit           -- forward boundary (exclusive) for forward scans
      limit_backward  -- boundary for backward scans
      bra, ket        -- start and end of the marked slice

    Subclasses provide ``_stem()``; ``stemWord`` loads a word, calls
    ``_stem()`` and returns the resulting string.
    '''

    def __init__(self):
        self.set_current("")

    def set_current(self, value):
        '''
        Load ``value`` as the working string and reset every position:
        cursor, bra and limit_backward to 0; limit and ket to len(value).
        '''
        self.current = value
        self.cursor = 0
        self.limit = len(value)
        self.limit_backward = 0
        self.bra = 0
        self.ket = self.limit

    def get_current(self):
        '''
        Return the working string.
        '''
        return self.current

    def copy_from(self, other):
        '''
        Copy the working string and all positions from ``other``.
        '''
        self.current = other.current
        self.cursor = other.cursor
        self.limit = other.limit
        self.limit_backward = other.limit_backward
        self.bra = other.bra
        self.ket = other.ket

    # ------------------------------------------------------------------
    # Groupings.
    #
    # A grouping is described by (s, min, max): character codes outside
    # [min, max] are never members; a code inside the range is a member
    # when bit ((code - min) & 7) of s[(code - min) >> 3] is set.
    # The parameter names ``min``/``max`` shadow the builtins inside these
    # methods; they are kept because they are part of the signature.
    # ------------------------------------------------------------------

    def in_grouping(self, s, min, max):
        '''
        If the character at the cursor is in the grouping, step the cursor
        forward by one and return True. Otherwise (or at limit) return
        False and leave the cursor alone.
        '''
        if self.cursor >= self.limit:
            return False
        code = ord(self.current[self.cursor])
        if min <= code <= max:
            offset = code - min
            if s[offset >> 3] & (1 << (offset & 7)):
                self.cursor += 1
                return True
        return False

    def go_in_grouping(self, s, min, max):
        '''
        Advance the cursor over characters that are in the grouping.
        Return True when a character outside the grouping is reached (the
        cursor is left on it), or False if limit is reached first.
        '''
        while self.cursor < self.limit:
            code = ord(self.current[self.cursor])
            if not (min <= code <= max):
                return True
            offset = code - min
            if not (s[offset >> 3] & (1 << (offset & 7))):
                return True
            self.cursor += 1
        return False

    def in_grouping_b(self, s, min, max):
        '''
        Backward form of in_grouping: tests the character just before the
        cursor and, on success, steps the cursor back by one.
        '''
        if self.cursor <= self.limit_backward:
            return False
        code = ord(self.current[self.cursor - 1])
        if min <= code <= max:
            offset = code - min
            if s[offset >> 3] & (1 << (offset & 7)):
                self.cursor -= 1
                return True
        return False

    def go_in_grouping_b(self, s, min, max):
        '''
        Backward form of go_in_grouping: moves the cursor back over
        characters in the grouping. Return True when the character before
        the cursor is outside the grouping, or False if limit_backward is
        reached first.
        '''
        while self.cursor > self.limit_backward:
            code = ord(self.current[self.cursor - 1])
            if not (min <= code <= max):
                return True
            offset = code - min
            if not (s[offset >> 3] & (1 << (offset & 7))):
                return True
            self.cursor -= 1
        return False

    def out_grouping(self, s, min, max):
        '''
        If the character at the cursor is NOT in the grouping, step the
        cursor forward by one and return True. Otherwise (or at limit)
        return False and leave the cursor alone.
        '''
        if self.cursor >= self.limit:
            return False
        code = ord(self.current[self.cursor])
        if min <= code <= max:
            offset = code - min
            if s[offset >> 3] & (1 << (offset & 7)):
                return False
        self.cursor += 1
        return True

    def go_out_grouping(self, s, min, max):
        '''
        Advance the cursor over characters that are not in the grouping.
        Return True when a member of the grouping is reached (the cursor
        is left on it), or False if limit is reached first.
        '''
        while self.cursor < self.limit:
            code = ord(self.current[self.cursor])
            if min <= code <= max:
                offset = code - min
                if s[offset >> 3] & (1 << (offset & 7)):
                    return True
            self.cursor += 1
        return False

    def out_grouping_b(self, s, min, max):
        '''
        Backward form of out_grouping: tests the character just before the
        cursor and, when it is not in the grouping, steps the cursor back
        by one.
        '''
        if self.cursor <= self.limit_backward:
            return False
        code = ord(self.current[self.cursor - 1])
        if min <= code <= max:
            offset = code - min
            if s[offset >> 3] & (1 << (offset & 7)):
                return False
        self.cursor -= 1
        return True

    def go_out_grouping_b(self, s, min, max):
        '''
        Backward form of go_out_grouping: moves the cursor back over
        characters not in the grouping. Return True when the character
        before the cursor is a member, or False if limit_backward is
        reached first.
        '''
        while self.cursor > self.limit_backward:
            code = ord(self.current[self.cursor - 1])
            if min <= code <= max:
                offset = code - min
                if s[offset >> 3] & (1 << (offset & 7)):
                    return True
            self.cursor -= 1
        return False

    # ------------------------------------------------------------------
    # Literal matching.
    # ------------------------------------------------------------------

    def eq_s(self, s):
        '''
        If ``s`` occurs at the cursor (and fits before limit), move the
        cursor past it and return True; otherwise return False.
        '''
        end = self.cursor + len(s)
        if end > self.limit:
            return False
        if self.current[self.cursor:end] != s:
            return False
        self.cursor = end
        return True

    def eq_s_b(self, s):
        '''
        If ``s`` ends at the cursor (and fits after limit_backward), move
        the cursor back to its start and return True; otherwise return
        False.
        '''
        start = self.cursor - len(s)
        if start < self.limit_backward:
            return False
        if self.current[start:self.cursor] != s:
            return False
        self.cursor = start
        return True

    # ------------------------------------------------------------------
    # Table lookup.
    #
    # Each table entry is read through four attributes:
    #   s            -- the string to match
    #   substring_i  -- index of another entry to fall back to, < 0 for none
    #   result       -- value returned when the entry is accepted
    #   method       -- None, or the name of a method on self that must
    #                   return a true value for the entry to be accepted
    # The lookup is a binary search, so it presumably relies on the table
    # being ordered by ``s`` (compared from the end for the backward form).
    # ------------------------------------------------------------------

    def find_among(self, v):
        '''
        Find the longest entry of ``v`` that matches at the cursor.

        On success the cursor is moved past the matched string and the
        entry's ``result`` is returned. Returns 0 when nothing matches
        (including when ``v`` is empty).
        '''
        lo = 0
        hi = len(v)
        if hi == 0:
            # Nothing to search; indexing v[0] below would raise.
            return 0

        start = self.cursor
        limit = self.limit
        text = self.current  # not modified during the search phase

        common_lo = 0
        common_hi = 0
        first_key_inspected = False

        while True:
            mid = lo + ((hi - lo) >> 1)
            diff = 0
            # Characters already known to match at both ends of the range
            # need not be compared again.
            common = min(common_lo, common_hi)
            entry = v[mid]
            for pos in range(common, len(entry.s)):
                if start + common == limit:
                    # Input exhausted: treat it as sorting before entry.s.
                    diff = -1
                    break
                diff = ord(text[start + common]) - ord(entry.s[pos])
                if diff != 0:
                    break
                common += 1
            if diff < 0:
                hi = mid
                common_hi = common
            else:
                lo = mid
                common_lo = common
            if hi - lo <= 1:
                if lo > 0:
                    break  # v[lo].s has been inspected
                if hi == lo:
                    break  # only one item in v
                # - but now we need to go round once more to get
                # v[0].s inspected. This looks messy, but is actually
                # the optimal approach.
                if first_key_inspected:
                    break
                first_key_inspected = True

        # Walk from the candidate through its substring_i chain until an
        # entry is fully matched and accepted.
        while True:
            entry = v[lo]
            if common_lo >= len(entry.s):
                self.cursor = start + len(entry.s)
                if entry.method is None:
                    return entry.result
                accepted = getattr(self, entry.method)()
                # The condition method may have moved the cursor.
                self.cursor = start + len(entry.s)
                if accepted:
                    return entry.result
            lo = entry.substring_i
            if lo < 0:
                return 0

    def find_among_b(self, v):
        '''
        Backward form of find_among: entries are matched against the text
        ending at the cursor, comparing from their last character.

        On success the cursor is moved back to the start of the matched
        string and the entry's ``result`` is returned. Returns 0 when
        nothing matches (including when ``v`` is empty).
        '''
        lo = 0
        hi = len(v)
        if hi == 0:
            # Nothing to search; indexing v[0] below would raise.
            return 0

        start = self.cursor
        limit_backward = self.limit_backward
        text = self.current  # not modified during the search phase

        common_lo = 0
        common_hi = 0
        first_key_inspected = False

        while True:
            mid = lo + ((hi - lo) >> 1)
            diff = 0
            common = min(common_lo, common_hi)
            entry = v[mid]
            for pos in range(len(entry.s) - 1 - common, -1, -1):
                if start - common == limit_backward:
                    # Input exhausted: treat it as sorting before entry.s.
                    diff = -1
                    break
                diff = ord(text[start - 1 - common]) - ord(entry.s[pos])
                if diff != 0:
                    break
                common += 1
            if diff < 0:
                hi = mid
                common_hi = common
            else:
                lo = mid
                common_lo = common
            if hi - lo <= 1:
                if lo > 0:
                    break  # v[lo].s has been inspected
                if hi == lo:
                    break  # only one item in v
                # One extra pass so that v[0].s gets inspected.
                if first_key_inspected:
                    break
                first_key_inspected = True

        while True:
            entry = v[lo]
            if common_lo >= len(entry.s):
                self.cursor = start - len(entry.s)
                if entry.method is None:
                    return entry.result
                accepted = getattr(self, entry.method)()
                # The condition method may have moved the cursor.
                self.cursor = start - len(entry.s)
                if accepted:
                    return entry.result
            lo = entry.substring_i
            if lo < 0:
                return 0

    # ------------------------------------------------------------------
    # Editing the working string.
    # ------------------------------------------------------------------

    def replace_s(self, c_bra, c_ket, s):
        '''
        Replace the characters between c_bra and c_ket in self.current by
        the characters in s.

        limit is shifted by the change in length. A cursor at or after
        c_ket is shifted by the same amount; a cursor strictly inside the
        replaced span is moved to c_bra.

        @type c_bra int
        @type c_ket int
        @type s: string
        @return the change in length (may be negative)
        '''
        adjustment = len(s) - (c_ket - c_bra)
        self.current = self.current[:c_bra] + s + self.current[c_ket:]
        self.limit += adjustment
        if self.cursor >= c_ket:
            self.cursor += adjustment
        elif self.cursor > c_bra:
            self.cursor = c_bra
        return adjustment

    def slice_check(self):
        '''
        Return True when 0 <= bra <= ket <= limit <= len(current).
        '''
        return 0 <= self.bra <= self.ket <= self.limit <= len(self.current)

    def slice_from(self, s):
        '''
        Replace the slice [bra, ket) by s.

        @type s string
        @return True if the slice was valid and replaced, else False
        '''
        if not self.slice_check():
            return False
        self.replace_s(self.bra, self.ket, s)
        return True

    def slice_del(self):
        '''
        Delete the slice [bra, ket); same return value as slice_from.
        '''
        return self.slice_from("")

    def insert(self, c_bra, c_ket, s):
        '''
        Replace current[c_bra:c_ket] by s, then shift bra and/or ket by
        the change in length when c_bra is at or before them.

        @type c_bra int
        @type c_ket int
        @type s: string
        '''
        adjustment = self.replace_s(c_bra, c_ket, s)
        if c_bra <= self.bra:
            self.bra += adjustment
        if c_bra <= self.ket:
            self.ket += adjustment

    def slice_to(self):
        '''
        Return the slice [bra, ket) as a string, or '' if the slice
        positions are not valid.
        '''
        if self.slice_check():
            return self.current[self.bra:self.ket]
        return ''

    def assign_to(self):
        '''
        Return the current string up to the limit.
        '''
        return self.current[0:self.limit]

    # ------------------------------------------------------------------
    # Public entry points.
    # ------------------------------------------------------------------

    def _stem(self):
        '''
        Hook run by stemWord after the word has been loaded.
        Subclasses must override it.
        '''
        raise NotImplementedError(
            "%s must implement _stem()" % type(self).__name__)

    def stemWord(self, word):
        '''
        Load ``word``, run ``_stem()`` and return the resulting string.
        '''
        self.set_current(word)
        self._stem()
        return self.get_current()

    def stemWords(self, words):
        '''
        Apply stemWord to each item of ``words``; return the results as a
        list in the same order.
        '''
        return [self.stemWord(word) for word in words]
|