splitfunc.go 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174
  1. package graphemes
  2. import (
  3. "bufio"
  4. "github.com/clipperhouse/stringish"
  5. )
  6. // is determines if lookup intersects propert(ies)
  7. func (lookup property) is(properties property) bool {
  8. return (lookup & properties) != 0
  9. }
  10. const _Ignore = _Extend
  11. // SplitFunc is a bufio.SplitFunc implementation of Unicode grapheme cluster segmentation, for use with bufio.Scanner.
  12. //
  13. // See https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries.
  14. var SplitFunc bufio.SplitFunc = splitFunc[[]byte]
  15. func splitFunc[T stringish.Interface](data T, atEOF bool) (advance int, token T, err error) {
  16. var empty T
  17. if len(data) == 0 {
  18. return 0, empty, nil
  19. }
  20. // These vars are stateful across loop iterations
  21. var pos int
  22. var lastExIgnore property = 0 // "last excluding ignored categories"
  23. var lastLastExIgnore property = 0 // "last one before that"
  24. var regionalIndicatorCount int
  25. // Rules are usually of the form Cat1 × Cat2; "current" refers to the first property
  26. // to the right of the ×, from which we look back or forward
  27. current, w := lookup(data[pos:])
  28. if w == 0 {
  29. if !atEOF {
  30. // Rune extends past current data, request more
  31. return 0, empty, nil
  32. }
  33. pos = len(data)
  34. return pos, data[:pos], nil
  35. }
  36. // https://unicode.org/reports/tr29/#GB1
  37. // Start of text always advances
  38. pos += w
  39. for {
  40. eot := pos == len(data) // "end of text"
  41. if eot {
  42. if !atEOF {
  43. // Token extends past current data, request more
  44. return 0, empty, nil
  45. }
  46. // https://unicode.org/reports/tr29/#GB2
  47. break
  48. }
  49. /*
  50. We've switched the evaluation order of GB1↓ and GB2↑. It's ok:
  51. because we've checked for len(data) at the top of this function,
  52. sot and eot are mutually exclusive, order doesn't matter.
  53. */
  54. // Rules are usually of the form Cat1 × Cat2; "current" refers to the first property
  55. // to the right of the ×, from which we look back or forward
  56. // Remember previous properties to avoid lookups/lookbacks
  57. last := current
  58. if !last.is(_Ignore) {
  59. lastLastExIgnore = lastExIgnore
  60. lastExIgnore = last
  61. }
  62. current, w = lookup(data[pos:])
  63. if w == 0 {
  64. if atEOF {
  65. // Just return the bytes, we can't do anything with them
  66. pos = len(data)
  67. break
  68. }
  69. // Rune extends past current data, request more
  70. return 0, empty, nil
  71. }
  72. // Optimization: no rule can possibly apply
  73. if current|last == 0 { // i.e. both are zero
  74. break
  75. }
  76. // https://unicode.org/reports/tr29/#GB3
  77. if current.is(_LF) && last.is(_CR) {
  78. pos += w
  79. continue
  80. }
  81. // https://unicode.org/reports/tr29/#GB4
  82. // https://unicode.org/reports/tr29/#GB5
  83. if (current | last).is(_Control | _CR | _LF) {
  84. break
  85. }
  86. // https://unicode.org/reports/tr29/#GB6
  87. if current.is(_L|_V|_LV|_LVT) && last.is(_L) {
  88. pos += w
  89. continue
  90. }
  91. // https://unicode.org/reports/tr29/#GB7
  92. if current.is(_V|_T) && last.is(_LV|_V) {
  93. pos += w
  94. continue
  95. }
  96. // https://unicode.org/reports/tr29/#GB8
  97. if current.is(_T) && last.is(_LVT|_T) {
  98. pos += w
  99. continue
  100. }
  101. // https://unicode.org/reports/tr29/#GB9
  102. if current.is(_Extend | _ZWJ) {
  103. pos += w
  104. continue
  105. }
  106. // https://unicode.org/reports/tr29/#GB9a
  107. if current.is(_SpacingMark) {
  108. pos += w
  109. continue
  110. }
  111. // https://unicode.org/reports/tr29/#GB9b
  112. if last.is(_Prepend) {
  113. pos += w
  114. continue
  115. }
  116. // https://unicode.org/reports/tr29/#GB9c
  117. // TODO(clipperhouse):
  118. // It appears to be added in Unicode 15.1.0:
  119. // https://unicode.org/versions/Unicode15.1.0/#Migration
  120. // This package currently supports Unicode 15.0.0, so
  121. // out of scope for now
  122. // https://unicode.org/reports/tr29/#GB11
  123. if current.is(_ExtendedPictographic) && last.is(_ZWJ) && lastLastExIgnore.is(_ExtendedPictographic) {
  124. pos += w
  125. continue
  126. }
  127. // https://unicode.org/reports/tr29/#GB12
  128. // https://unicode.org/reports/tr29/#GB13
  129. if (current & last).is(_RegionalIndicator) {
  130. regionalIndicatorCount++
  131. odd := regionalIndicatorCount%2 == 1
  132. if odd {
  133. pos += w
  134. continue
  135. }
  136. }
  137. // If we fall through all the above rules, it's a grapheme cluster break
  138. break
  139. }
  140. // Return token
  141. return pos, data[:pos], nil
  142. }