util_cjk.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469
  1. package util
  2. import "unicode"
  3. var cjkRadicalsSupplement = &unicode.RangeTable{
  4. R16: []unicode.Range16{
  5. {0x2E80, 0x2EFF, 1},
  6. },
  7. }
  8. var kangxiRadicals = &unicode.RangeTable{
  9. R16: []unicode.Range16{
  10. {0x2F00, 0x2FDF, 1},
  11. },
  12. }
  13. var ideographicDescriptionCharacters = &unicode.RangeTable{
  14. R16: []unicode.Range16{
  15. {0x2FF0, 0x2FFF, 1},
  16. },
  17. }
  18. var cjkSymbolsAndPunctuation = &unicode.RangeTable{
  19. R16: []unicode.Range16{
  20. {0x3000, 0x303F, 1},
  21. },
  22. }
  23. var hiragana = &unicode.RangeTable{
  24. R16: []unicode.Range16{
  25. {0x3040, 0x309F, 1},
  26. },
  27. }
  28. var katakana = &unicode.RangeTable{
  29. R16: []unicode.Range16{
  30. {0x30A0, 0x30FF, 1},
  31. },
  32. }
  33. var kanbun = &unicode.RangeTable{
  34. R16: []unicode.Range16{
  35. {0x3130, 0x318F, 1},
  36. {0x3190, 0x319F, 1},
  37. },
  38. }
  39. var cjkStrokes = &unicode.RangeTable{
  40. R16: []unicode.Range16{
  41. {0x31C0, 0x31EF, 1},
  42. },
  43. }
  44. var katakanaPhoneticExtensions = &unicode.RangeTable{
  45. R16: []unicode.Range16{
  46. {0x31F0, 0x31FF, 1},
  47. },
  48. }
  49. var cjkCompatibility = &unicode.RangeTable{
  50. R16: []unicode.Range16{
  51. {0x3300, 0x33FF, 1},
  52. },
  53. }
  54. var cjkUnifiedIdeographsExtensionA = &unicode.RangeTable{
  55. R16: []unicode.Range16{
  56. {0x3400, 0x4DBF, 1},
  57. },
  58. }
  59. var cjkUnifiedIdeographs = &unicode.RangeTable{
  60. R16: []unicode.Range16{
  61. {0x4E00, 0x9FFF, 1},
  62. },
  63. }
  64. var yiSyllables = &unicode.RangeTable{
  65. R16: []unicode.Range16{
  66. {0xA000, 0xA48F, 1},
  67. },
  68. }
  69. var yiRadicals = &unicode.RangeTable{
  70. R16: []unicode.Range16{
  71. {0xA490, 0xA4CF, 1},
  72. },
  73. }
  74. var cjkCompatibilityIdeographs = &unicode.RangeTable{
  75. R16: []unicode.Range16{
  76. {0xF900, 0xFAFF, 1},
  77. },
  78. }
  79. var verticalForms = &unicode.RangeTable{
  80. R16: []unicode.Range16{
  81. {0xFE10, 0xFE1F, 1},
  82. },
  83. }
  84. var cjkCompatibilityForms = &unicode.RangeTable{
  85. R16: []unicode.Range16{
  86. {0xFE30, 0xFE4F, 1},
  87. },
  88. }
  89. var smallFormVariants = &unicode.RangeTable{
  90. R16: []unicode.Range16{
  91. {0xFE50, 0xFE6F, 1},
  92. },
  93. }
  94. var halfwidthAndFullwidthForms = &unicode.RangeTable{
  95. R16: []unicode.Range16{
  96. {0xFF00, 0xFFEF, 1},
  97. },
  98. }
  99. var kanaSupplement = &unicode.RangeTable{
  100. R32: []unicode.Range32{
  101. {0x1B000, 0x1B0FF, 1},
  102. },
  103. }
  104. var kanaExtendedA = &unicode.RangeTable{
  105. R32: []unicode.Range32{
  106. {0x1B100, 0x1B12F, 1},
  107. },
  108. }
  109. var smallKanaExtension = &unicode.RangeTable{
  110. R32: []unicode.Range32{
  111. {0x1B130, 0x1B16F, 1},
  112. },
  113. }
  114. var cjkUnifiedIdeographsExtensionB = &unicode.RangeTable{
  115. R32: []unicode.Range32{
  116. {0x20000, 0x2A6DF, 1},
  117. },
  118. }
  119. var cjkUnifiedIdeographsExtensionC = &unicode.RangeTable{
  120. R32: []unicode.Range32{
  121. {0x2A700, 0x2B73F, 1},
  122. },
  123. }
  124. var cjkUnifiedIdeographsExtensionD = &unicode.RangeTable{
  125. R32: []unicode.Range32{
  126. {0x2B740, 0x2B81F, 1},
  127. },
  128. }
  129. var cjkUnifiedIdeographsExtensionE = &unicode.RangeTable{
  130. R32: []unicode.Range32{
  131. {0x2B820, 0x2CEAF, 1},
  132. },
  133. }
  134. var cjkUnifiedIdeographsExtensionF = &unicode.RangeTable{
  135. R32: []unicode.Range32{
  136. {0x2CEB0, 0x2EBEF, 1},
  137. },
  138. }
  139. var cjkCompatibilityIdeographsSupplement = &unicode.RangeTable{
  140. R32: []unicode.Range32{
  141. {0x2F800, 0x2FA1F, 1},
  142. },
  143. }
  144. var cjkUnifiedIdeographsExtensionG = &unicode.RangeTable{
  145. R32: []unicode.Range32{
  146. {0x30000, 0x3134F, 1},
  147. },
  148. }
  149. // IsEastAsianWideRune returns trhe if the given rune is an east asian wide character, otherwise false.
  150. func IsEastAsianWideRune(r rune) bool {
  151. return unicode.Is(unicode.Hiragana, r) ||
  152. unicode.Is(unicode.Katakana, r) ||
  153. unicode.Is(unicode.Han, r) ||
  154. unicode.Is(unicode.Lm, r) ||
  155. unicode.Is(unicode.Hangul, r) ||
  156. unicode.Is(cjkSymbolsAndPunctuation, r)
  157. }
  158. // IsSpaceDiscardingUnicodeRune returns true if the given rune is space-discarding unicode character, otherwise false.
  159. // See https://www.w3.org/TR/2020/WD-css-text-3-20200429/#space-discard-set
  160. func IsSpaceDiscardingUnicodeRune(r rune) bool {
  161. return unicode.Is(cjkRadicalsSupplement, r) ||
  162. unicode.Is(kangxiRadicals, r) ||
  163. unicode.Is(ideographicDescriptionCharacters, r) ||
  164. unicode.Is(cjkSymbolsAndPunctuation, r) ||
  165. unicode.Is(hiragana, r) ||
  166. unicode.Is(katakana, r) ||
  167. unicode.Is(kanbun, r) ||
  168. unicode.Is(cjkStrokes, r) ||
  169. unicode.Is(katakanaPhoneticExtensions, r) ||
  170. unicode.Is(cjkCompatibility, r) ||
  171. unicode.Is(cjkUnifiedIdeographsExtensionA, r) ||
  172. unicode.Is(cjkUnifiedIdeographs, r) ||
  173. unicode.Is(yiSyllables, r) ||
  174. unicode.Is(yiRadicals, r) ||
  175. unicode.Is(cjkCompatibilityIdeographs, r) ||
  176. unicode.Is(verticalForms, r) ||
  177. unicode.Is(cjkCompatibilityForms, r) ||
  178. unicode.Is(smallFormVariants, r) ||
  179. unicode.Is(halfwidthAndFullwidthForms, r) ||
  180. unicode.Is(kanaSupplement, r) ||
  181. unicode.Is(kanaExtendedA, r) ||
  182. unicode.Is(smallKanaExtension, r) ||
  183. unicode.Is(cjkUnifiedIdeographsExtensionB, r) ||
  184. unicode.Is(cjkUnifiedIdeographsExtensionC, r) ||
  185. unicode.Is(cjkUnifiedIdeographsExtensionD, r) ||
  186. unicode.Is(cjkUnifiedIdeographsExtensionE, r) ||
  187. unicode.Is(cjkUnifiedIdeographsExtensionF, r) ||
  188. unicode.Is(cjkCompatibilityIdeographsSupplement, r) ||
  189. unicode.Is(cjkUnifiedIdeographsExtensionG, r)
  190. }
  191. // EastAsianWidth returns the east asian width of the given rune.
  192. // See https://www.unicode.org/reports/tr11/tr11-36.html
  193. func EastAsianWidth(r rune) string {
  194. switch {
  195. case r == 0x3000,
  196. (0xFF01 <= r && r <= 0xFF60),
  197. (0xFFE0 <= r && r <= 0xFFE6):
  198. return "F"
  199. case r == 0x20A9,
  200. (0xFF61 <= r && r <= 0xFFBE),
  201. (0xFFC2 <= r && r <= 0xFFC7),
  202. (0xFFCA <= r && r <= 0xFFCF),
  203. (0xFFD2 <= r && r <= 0xFFD7),
  204. (0xFFDA <= r && r <= 0xFFDC),
  205. (0xFFE8 <= r && r <= 0xFFEE):
  206. return "H"
  207. case (0x1100 <= r && r <= 0x115F),
  208. (0x11A3 <= r && r <= 0x11A7),
  209. (0x11FA <= r && r <= 0x11FF),
  210. (0x2329 <= r && r <= 0x232A),
  211. (0x2E80 <= r && r <= 0x2E99),
  212. (0x2E9B <= r && r <= 0x2EF3),
  213. (0x2F00 <= r && r <= 0x2FD5),
  214. (0x2FF0 <= r && r <= 0x2FFB),
  215. (0x3001 <= r && r <= 0x303E),
  216. (0x3041 <= r && r <= 0x3096),
  217. (0x3099 <= r && r <= 0x30FF),
  218. (0x3105 <= r && r <= 0x312D),
  219. (0x3131 <= r && r <= 0x318E),
  220. (0x3190 <= r && r <= 0x31BA),
  221. (0x31C0 <= r && r <= 0x31E3),
  222. (0x31F0 <= r && r <= 0x321E),
  223. (0x3220 <= r && r <= 0x3247),
  224. (0x3250 <= r && r <= 0x32FE),
  225. (0x3300 <= r && r <= 0x4DBF),
  226. (0x4E00 <= r && r <= 0xA48C),
  227. (0xA490 <= r && r <= 0xA4C6),
  228. (0xA960 <= r && r <= 0xA97C),
  229. (0xAC00 <= r && r <= 0xD7A3),
  230. (0xD7B0 <= r && r <= 0xD7C6),
  231. (0xD7CB <= r && r <= 0xD7FB),
  232. (0xF900 <= r && r <= 0xFAFF),
  233. (0xFE10 <= r && r <= 0xFE19),
  234. (0xFE30 <= r && r <= 0xFE52),
  235. (0xFE54 <= r && r <= 0xFE66),
  236. (0xFE68 <= r && r <= 0xFE6B),
  237. (0x1B000 <= r && r <= 0x1B001),
  238. (0x1F200 <= r && r <= 0x1F202),
  239. (0x1F210 <= r && r <= 0x1F23A),
  240. (0x1F240 <= r && r <= 0x1F248),
  241. (0x1F250 <= r && r <= 0x1F251),
  242. (0x20000 <= r && r <= 0x2F73F),
  243. (0x2B740 <= r && r <= 0x2FFFD),
  244. (0x30000 <= r && r <= 0x3FFFD):
  245. return "W"
  246. case (0x0020 <= r && r <= 0x007E),
  247. (0x00A2 <= r && r <= 0x00A3),
  248. (0x00A5 <= r && r <= 0x00A6),
  249. r == 0x00AC,
  250. r == 0x00AF,
  251. (0x27E6 <= r && r <= 0x27ED),
  252. (0x2985 <= r && r <= 0x2986):
  253. return "Na"
  254. case (0x00A1 == r),
  255. (0x00A4 == r),
  256. (0x00A7 <= r && r <= 0x00A8),
  257. (0x00AA == r),
  258. (0x00AD <= r && r <= 0x00AE),
  259. (0x00B0 <= r && r <= 0x00B4),
  260. (0x00B6 <= r && r <= 0x00BA),
  261. (0x00BC <= r && r <= 0x00BF),
  262. (0x00C6 == r),
  263. (0x00D0 == r),
  264. (0x00D7 <= r && r <= 0x00D8),
  265. (0x00DE <= r && r <= 0x00E1),
  266. (0x00E6 == r),
  267. (0x00E8 <= r && r <= 0x00EA),
  268. (0x00EC <= r && r <= 0x00ED),
  269. (0x00F0 == r),
  270. (0x00F2 <= r && r <= 0x00F3),
  271. (0x00F7 <= r && r <= 0x00FA),
  272. (0x00FC == r),
  273. (0x00FE == r),
  274. (0x0101 == r),
  275. (0x0111 == r),
  276. (0x0113 == r),
  277. (0x011B == r),
  278. (0x0126 <= r && r <= 0x0127),
  279. (0x012B == r),
  280. (0x0131 <= r && r <= 0x0133),
  281. (0x0138 == r),
  282. (0x013F <= r && r <= 0x0142),
  283. (0x0144 == r),
  284. (0x0148 <= r && r <= 0x014B),
  285. (0x014D == r),
  286. (0x0152 <= r && r <= 0x0153),
  287. (0x0166 <= r && r <= 0x0167),
  288. (0x016B == r),
  289. (0x01CE == r),
  290. (0x01D0 == r),
  291. (0x01D2 == r),
  292. (0x01D4 == r),
  293. (0x01D6 == r),
  294. (0x01D8 == r),
  295. (0x01DA == r),
  296. (0x01DC == r),
  297. (0x0251 == r),
  298. (0x0261 == r),
  299. (0x02C4 == r),
  300. (0x02C7 == r),
  301. (0x02C9 <= r && r <= 0x02CB),
  302. (0x02CD == r),
  303. (0x02D0 == r),
  304. (0x02D8 <= r && r <= 0x02DB),
  305. (0x02DD == r),
  306. (0x02DF == r),
  307. (0x0300 <= r && r <= 0x036F),
  308. (0x0391 <= r && r <= 0x03A1),
  309. (0x03A3 <= r && r <= 0x03A9),
  310. (0x03B1 <= r && r <= 0x03C1),
  311. (0x03C3 <= r && r <= 0x03C9),
  312. (0x0401 == r),
  313. (0x0410 <= r && r <= 0x044F),
  314. (0x0451 == r),
  315. (0x2010 == r),
  316. (0x2013 <= r && r <= 0x2016),
  317. (0x2018 <= r && r <= 0x2019),
  318. (0x201C <= r && r <= 0x201D),
  319. (0x2020 <= r && r <= 0x2022),
  320. (0x2024 <= r && r <= 0x2027),
  321. (0x2030 == r),
  322. (0x2032 <= r && r <= 0x2033),
  323. (0x2035 == r),
  324. (0x203B == r),
  325. (0x203E == r),
  326. (0x2074 == r),
  327. (0x207F == r),
  328. (0x2081 <= r && r <= 0x2084),
  329. (0x20AC == r),
  330. (0x2103 == r),
  331. (0x2105 == r),
  332. (0x2109 == r),
  333. (0x2113 == r),
  334. (0x2116 == r),
  335. (0x2121 <= r && r <= 0x2122),
  336. (0x2126 == r),
  337. (0x212B == r),
  338. (0x2153 <= r && r <= 0x2154),
  339. (0x215B <= r && r <= 0x215E),
  340. (0x2160 <= r && r <= 0x216B),
  341. (0x2170 <= r && r <= 0x2179),
  342. (0x2189 == r),
  343. (0x2190 <= r && r <= 0x2199),
  344. (0x21B8 <= r && r <= 0x21B9),
  345. (0x21D2 == r),
  346. (0x21D4 == r),
  347. (0x21E7 == r),
  348. (0x2200 == r),
  349. (0x2202 <= r && r <= 0x2203),
  350. (0x2207 <= r && r <= 0x2208),
  351. (0x220B == r),
  352. (0x220F == r),
  353. (0x2211 == r),
  354. (0x2215 == r),
  355. (0x221A == r),
  356. (0x221D <= r && r <= 0x2220),
  357. (0x2223 == r),
  358. (0x2225 == r),
  359. (0x2227 <= r && r <= 0x222C),
  360. (0x222E == r),
  361. (0x2234 <= r && r <= 0x2237),
  362. (0x223C <= r && r <= 0x223D),
  363. (0x2248 == r),
  364. (0x224C == r),
  365. (0x2252 == r),
  366. (0x2260 <= r && r <= 0x2261),
  367. (0x2264 <= r && r <= 0x2267),
  368. (0x226A <= r && r <= 0x226B),
  369. (0x226E <= r && r <= 0x226F),
  370. (0x2282 <= r && r <= 0x2283),
  371. (0x2286 <= r && r <= 0x2287),
  372. (0x2295 == r),
  373. (0x2299 == r),
  374. (0x22A5 == r),
  375. (0x22BF == r),
  376. (0x2312 == r),
  377. (0x2460 <= r && r <= 0x24E9),
  378. (0x24EB <= r && r <= 0x254B),
  379. (0x2550 <= r && r <= 0x2573),
  380. (0x2580 <= r && r <= 0x258F),
  381. (0x2592 <= r && r <= 0x2595),
  382. (0x25A0 <= r && r <= 0x25A1),
  383. (0x25A3 <= r && r <= 0x25A9),
  384. (0x25B2 <= r && r <= 0x25B3),
  385. (0x25B6 <= r && r <= 0x25B7),
  386. (0x25BC <= r && r <= 0x25BD),
  387. (0x25C0 <= r && r <= 0x25C1),
  388. (0x25C6 <= r && r <= 0x25C8),
  389. (0x25CB == r),
  390. (0x25CE <= r && r <= 0x25D1),
  391. (0x25E2 <= r && r <= 0x25E5),
  392. (0x25EF == r),
  393. (0x2605 <= r && r <= 0x2606),
  394. (0x2609 == r),
  395. (0x260E <= r && r <= 0x260F),
  396. (0x2614 <= r && r <= 0x2615),
  397. (0x261C == r),
  398. (0x261E == r),
  399. (0x2640 == r),
  400. (0x2642 == r),
  401. (0x2660 <= r && r <= 0x2661),
  402. (0x2663 <= r && r <= 0x2665),
  403. (0x2667 <= r && r <= 0x266A),
  404. (0x266C <= r && r <= 0x266D),
  405. (0x266F == r),
  406. (0x269E <= r && r <= 0x269F),
  407. (0x26BE <= r && r <= 0x26BF),
  408. (0x26C4 <= r && r <= 0x26CD),
  409. (0x26CF <= r && r <= 0x26E1),
  410. (0x26E3 == r),
  411. (0x26E8 <= r && r <= 0x26FF),
  412. (0x273D == r),
  413. (0x2757 == r),
  414. (0x2776 <= r && r <= 0x277F),
  415. (0x2B55 <= r && r <= 0x2B59),
  416. (0x3248 <= r && r <= 0x324F),
  417. (0xE000 <= r && r <= 0xF8FF),
  418. (0xFE00 <= r && r <= 0xFE0F),
  419. (0xFFFD == r),
  420. (0x1F100 <= r && r <= 0x1F10A),
  421. (0x1F110 <= r && r <= 0x1F12D),
  422. (0x1F130 <= r && r <= 0x1F169),
  423. (0x1F170 <= r && r <= 0x1F19A),
  424. (0xE0100 <= r && r <= 0xE01EF),
  425. (0xF0000 <= r && r <= 0xFFFFD),
  426. (0x100000 <= r && r <= 0x10FFFD):
  427. return "A"
  428. default:
  429. return "N"
  430. }
  431. }