@@ -149,3 +149,80 @@ func AppendIfMissing(str string, suffix string) string {
149149 }
150150 return str + suffix
151151}
152+
// minCJKCharacter is the lowest code point that IsAlphabet considers a
// possible CJK ideograph: U+3400, the first code point of CJK Unified
// Ideographs Extension A. Letters below it are accepted without a script
// lookup.
const minCJKCharacter = '\u3400'

// IsAlphabet reports whether r is a letter that is not a CJK ideograph.
//
// Non-letters always yield false. Letters below minCJKCharacter always yield
// true. Letters at or above minCJKCharacter yield true unless they belong to
// the Han script, which covers the unified ideographs, their extensions and
// the compatibility ideographs.
func IsAlphabet(r rune) bool {
	if !unicode.IsLetter(r) {
		return false
	}

	// Fast path: the bulk of real-world alphabetic text sits below the
	// ideograph blocks, so skip the range-table search for it.
	if r < minCJKCharacter {
		return true
	}

	// Use the standard library's Han table instead of hand-maintained ranges:
	// hard-coded bounds go stale as Unicode grows and are easy to mistype,
	// which lets ideographs slip through as "alphabetic".
	return !unicode.Is(unicode.Han, r)
}
180+
181+ // WordSplit splits a string into words. Returns a slice of words.
182+ // If there is no word in a string, return nil.
183+ //
184+ // Word is defined as a locale dependent string containing alphabetic characters,
185+ // which may also contain but not start with `'` and `-` characters.
186+ func WordSplit (str string ) []string {
187+ if DeleteWhitespace (str ) == "" {
188+ return []string {"" }
189+ }
190+
191+ var word string
192+ var words []string
193+ var r rune
194+ var size , pos int
195+
196+ inWord := false
197+
198+ for len (str ) > 0 {
199+ r , size = utf8 .DecodeRuneInString (str )
200+
201+ switch {
202+ case IsAlphabet (r ):
203+ if ! inWord {
204+ inWord = true
205+ word = str
206+ pos = 0
207+ }
208+
209+ case inWord && (r == '\'' || r == '-' ):
210+ // Still in word.
211+
212+ default :
213+ if inWord {
214+ inWord = false
215+ words = append (words , word [:pos ])
216+ }
217+ }
218+
219+ pos += size
220+ str = str [size :]
221+ }
222+
223+ if inWord {
224+ words = append (words , word [:pos ])
225+ }
226+
227+ return words
228+ }
0 commit comments