| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174 |
- package graphemes
- import (
- "bufio"
- "github.com/clipperhouse/stringish"
- )
- // is determines if lookup intersects propert(ies)
- func (lookup property) is(properties property) bool {
- return (lookup & properties) != 0
- }
- const _Ignore = _Extend
- // SplitFunc is a bufio.SplitFunc implementation of Unicode grapheme cluster segmentation, for use with bufio.Scanner.
- //
- // See https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries.
- var SplitFunc bufio.SplitFunc = splitFunc[[]byte]
- func splitFunc[T stringish.Interface](data T, atEOF bool) (advance int, token T, err error) {
- var empty T
- if len(data) == 0 {
- return 0, empty, nil
- }
- // These vars are stateful across loop iterations
- var pos int
- var lastExIgnore property = 0 // "last excluding ignored categories"
- var lastLastExIgnore property = 0 // "last one before that"
- var regionalIndicatorCount int
- // Rules are usually of the form Cat1 × Cat2; "current" refers to the first property
- // to the right of the ×, from which we look back or forward
- current, w := lookup(data[pos:])
- if w == 0 {
- if !atEOF {
- // Rune extends past current data, request more
- return 0, empty, nil
- }
- pos = len(data)
- return pos, data[:pos], nil
- }
- // https://unicode.org/reports/tr29/#GB1
- // Start of text always advances
- pos += w
- for {
- eot := pos == len(data) // "end of text"
- if eot {
- if !atEOF {
- // Token extends past current data, request more
- return 0, empty, nil
- }
- // https://unicode.org/reports/tr29/#GB2
- break
- }
- /*
- We've switched the evaluation order of GB1↓ and GB2↑. It's ok:
- because we've checked for len(data) at the top of this function,
- sot and eot are mutually exclusive, order doesn't matter.
- */
- // Rules are usually of the form Cat1 × Cat2; "current" refers to the first property
- // to the right of the ×, from which we look back or forward
- // Remember previous properties to avoid lookups/lookbacks
- last := current
- if !last.is(_Ignore) {
- lastLastExIgnore = lastExIgnore
- lastExIgnore = last
- }
- current, w = lookup(data[pos:])
- if w == 0 {
- if atEOF {
- // Just return the bytes, we can't do anything with them
- pos = len(data)
- break
- }
- // Rune extends past current data, request more
- return 0, empty, nil
- }
- // Optimization: no rule can possibly apply
- if current|last == 0 { // i.e. both are zero
- break
- }
- // https://unicode.org/reports/tr29/#GB3
- if current.is(_LF) && last.is(_CR) {
- pos += w
- continue
- }
- // https://unicode.org/reports/tr29/#GB4
- // https://unicode.org/reports/tr29/#GB5
- if (current | last).is(_Control | _CR | _LF) {
- break
- }
- // https://unicode.org/reports/tr29/#GB6
- if current.is(_L|_V|_LV|_LVT) && last.is(_L) {
- pos += w
- continue
- }
- // https://unicode.org/reports/tr29/#GB7
- if current.is(_V|_T) && last.is(_LV|_V) {
- pos += w
- continue
- }
- // https://unicode.org/reports/tr29/#GB8
- if current.is(_T) && last.is(_LVT|_T) {
- pos += w
- continue
- }
- // https://unicode.org/reports/tr29/#GB9
- if current.is(_Extend | _ZWJ) {
- pos += w
- continue
- }
- // https://unicode.org/reports/tr29/#GB9a
- if current.is(_SpacingMark) {
- pos += w
- continue
- }
- // https://unicode.org/reports/tr29/#GB9b
- if last.is(_Prepend) {
- pos += w
- continue
- }
- // https://unicode.org/reports/tr29/#GB9c
- // TODO(clipperhouse):
- // It appears to be added in Unicode 15.1.0:
- // https://unicode.org/versions/Unicode15.1.0/#Migration
- // This package currently supports Unicode 15.0.0, so
- // out of scope for now
- // https://unicode.org/reports/tr29/#GB11
- if current.is(_ExtendedPictographic) && last.is(_ZWJ) && lastLastExIgnore.is(_ExtendedPictographic) {
- pos += w
- continue
- }
- // https://unicode.org/reports/tr29/#GB12
- // https://unicode.org/reports/tr29/#GB13
- if (current & last).is(_RegionalIndicator) {
- regionalIndicatorCount++
- odd := regionalIndicatorCount%2 == 1
- if odd {
- pos += w
- continue
- }
- }
- // If we fall through all the above rules, it's a grapheme cluster break
- break
- }
- // Return token
- return pos, data[:pos], nil
- }
|