translit.py 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225
  1. # -*- coding: utf-8 -*-
  2. # -*- test-case-name: pytils.test.test_translit -*-
  3. """
  4. Simple transliteration
  5. """
  6. import re
  7. TRANSTABLE = (
  8. ("'", "'"),
  9. ('"', '"'),
  10. ("‘", "'"),
  11. ("’", "'"),
  12. ("«", '"'),
  13. ("»", '"'),
  14. ("“", '"'),
  15. ("”", '"'),
  16. ("–", "-"), # en dash
  17. ("—", "-"), # em dash
  18. ("‒", "-"), # figure dash
  19. ("−", "-"), # minus
  20. ("…", "..."),
  21. ("№", "#"),
  22. ## upper
  23. # three-symbols replacements
  24. ("Щ", "Sch"),
  25. # on russian->english translation only first replacement will be done
  26. # i.e. Sch
  27. # but on english->russian translation both variants (Sch and SCH) will play
  28. ("Щ", "SCH"),
  29. # two-symbol replacements
  30. ("Ё", "Yo"),
  31. ("Ё", "YO"),
  32. ("Ж", "Zh"),
  33. ("Ж", "ZH"),
  34. ("Ц", "Ts"),
  35. ("Ц", "TS"),
  36. ("Ч", "Ch"),
  37. ("Ч", "CH"),
  38. ("Ш", "Sh"),
  39. ("Ш", "SH"),
  40. ("Ы", "Yi"),
  41. ("Ы", "YI"),
  42. ("Ю", "YU"),
  43. ("Ю", "Yu"),
  44. ("Я", "Ya"),
  45. ("Я", "YA"),
  46. # one-symbol replacements
  47. ("А", "A"),
  48. ("Б", "B"),
  49. ("В", "V"),
  50. ("Г", "G"),
  51. ("Д", "D"),
  52. ("Е", "E"),
  53. ("З", "Z"),
  54. ("И", "I"),
  55. ("Й", "J"),
  56. ("К", "K"),
  57. ("Л", "L"),
  58. ("М", "M"),
  59. ("Н", "N"),
  60. ("О", "O"),
  61. ("П", "P"),
  62. ("Р", "R"),
  63. ("С", "S"),
  64. ("Т", "T"),
  65. ("У", "U"),
  66. ("Ф", "F"),
  67. ("Х", "H"),
  68. ("Э", "E"),
  69. ("Ъ", "`"),
  70. ("Ь", "'"),
  71. ## lower
  72. # three-symbols replacements
  73. ("щ", "sch"),
  74. # two-symbols replacements
  75. ("ё", "yo"),
  76. ("ж", "zh"),
  77. ("ц", "ts"),
  78. ("ч", "ch"),
  79. ("ш", "sh"),
  80. ("ы", "yi"),
  81. ("ю", "yu"),
  82. ("я", "ya"),
  83. # one-symbol replacements
  84. ("а", "a"),
  85. ("б", "b"),
  86. ("в", "v"),
  87. ("г", "g"),
  88. ("д", "d"),
  89. ("е", "e"),
  90. ("з", "z"),
  91. ("и", "i"),
  92. ("й", "j"),
  93. ("к", "k"),
  94. ("л", "l"),
  95. ("м", "m"),
  96. ("н", "n"),
  97. ("о", "o"),
  98. ("п", "p"),
  99. ("р", "r"),
  100. ("с", "s"),
  101. ("т", "t"),
  102. ("у", "u"),
  103. ("ф", "f"),
  104. ("х", "h"),
  105. ("э", "e"),
  106. ("ъ", "`"),
  107. ("ь", "'"),
  108. # Make english alphabet full: append english-english pairs
  109. # for symbols which is not used in russian-english
  110. # translations. Used in slugify.
  111. ("c", "c"),
  112. ("q", "q"),
  113. ("y", "y"),
  114. ("x", "x"),
  115. ("w", "w"),
  116. ("1", "1"),
  117. ("2", "2"),
  118. ("3", "3"),
  119. ("4", "4"),
  120. ("5", "5"),
  121. ("6", "6"),
  122. ("7", "7"),
  123. ("8", "8"),
  124. ("9", "9"),
  125. ("0", "0"),
  126. ) #: Translation table
  127. RU_ALPHABET = [x[0] for x in TRANSTABLE] #: Russian alphabet that we can translate
  128. EN_ALPHABET = [x[1] for x in TRANSTABLE] #: English alphabet that we can detransliterate
  129. ALPHABET = RU_ALPHABET + EN_ALPHABET #: Alphabet that we can (de)transliterate
  130. def translify(in_string, strict=True):
  131. """
  132. Translify russian text
  133. @param in_string: input string
  134. @type in_string: C{str}
  135. @param strict: raise error if transliteration is incomplete.
  136. (True by default)
  137. @type strict: C{bool}
  138. @return: transliterated string
  139. @rtype: C{str}
  140. @raise ValueError: when string doesn't transliterate completely.
  141. Raised only if strict=True
  142. """
  143. translit = in_string
  144. for symb_in, symb_out in TRANSTABLE:
  145. translit = translit.replace(symb_in, symb_out)
  146. if strict and any(ord(symb) > 128 for symb in translit):
  147. raise ValueError("Unicode string doesn't transliterate completely, " + \
  148. "is it russian?")
  149. return translit
  150. def detranslify(in_string):
  151. """
  152. Detranslify
  153. @param in_string: input string
  154. @type in_string: C{basestring}
  155. @return: detransliterated string
  156. @rtype: C{str}
  157. @raise ValueError: if in_string is C{str}, but it isn't ascii
  158. """
  159. try:
  160. russian = str(in_string)
  161. except UnicodeDecodeError:
  162. raise ValueError("We expects if in_string is 8-bit string," + \
  163. "then it consists only ASCII chars, but now it doesn't. " + \
  164. "Use unicode in this case.")
  165. for symb_out, symb_in in TRANSTABLE:
  166. russian = russian.replace(symb_in, symb_out)
  167. # TODO: выбрать правильный регистр для ь и ъ
  168. # твердый и мягкий знак в dentranslify всегда будут в верхнем регистре
  169. # потому что ` и ' не несут информацию о регистре
  170. return russian
  171. def slugify(in_string):
  172. """
  173. Prepare string for slug (i.e. URL or file/dir name)
  174. @param in_string: input string
  175. @type in_string: C{basestring}
  176. @return: slug-string
  177. @rtype: C{str}
  178. @raise ValueError: if in_string is C{str}, but it isn't ascii
  179. """
  180. try:
  181. u_in_string = str(in_string).lower()
  182. except UnicodeDecodeError:
  183. raise ValueError("We expects when in_string is str type," + \
  184. "it is an ascii, but now it isn't. Use unicode " + \
  185. "in this case.")
  186. # convert & to "and"
  187. u_in_string = re.sub('\&amp\;|\&', ' and ', u_in_string)
  188. # replace spaces by hyphen
  189. u_in_string = re.sub('[-\s]+', '-', u_in_string)
  190. # remove symbols that not in alphabet
  191. u_in_string = ''.join([symb for symb in u_in_string if symb in ALPHABET])
  192. # translify it
  193. out_string = translify(u_in_string)
  194. # remove non-alpha
  195. return re.sub('[^\w\s-]', '', out_string).strip().lower()
  196. def dirify(in_string):
  197. """
  198. Alias for L{slugify}
  199. """
  200. slugify(in_string)