Skip to content

Commit c14f800

Browse files
committed
WordSplit
1 parent 541864f commit c14f800

2 files changed

Lines changed: 93 additions & 0 deletions

File tree

stringutils.go

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,3 +149,80 @@ func AppendIfMissing(str string, suffix string) string {
149149
}
150150
return str + suffix
151151
}
152+
153+
// minCJKCharacter is the lowest code point that IsAlphabet considers a
// possible CJK ideograph. Every letter below it is classified as alphabetic
// without consulting the Unicode script tables.
const minCJKCharacter = '\u3400'

// IsAlphabet reports whether r is a letter that is not a CJK ideograph.
//
// It returns false for anything that is not a Unicode letter. Letters below
// minCJKCharacter are always alphabetic. Any other letter is alphabetic
// unless it belongs to the Han script, which covers the CJK Unified
// Ideographs block, all of its extensions and the compatibility ideographs.
func IsAlphabet(r rune) bool {
	if !unicode.IsLetter(r) {
		return false
	}

	// Quick check for non-CJK character: no Han ideograph that this function
	// has ever rejected lives below this bound.
	if r < minCJKCharacter {
		return true
	}

	// Rely on the standard Han table rather than hand-maintained ranges,
	// which go stale with every Unicode release.
	return !unicode.Is(unicode.Han, r)
}
180+
181+
// WordSplit splits a string into words. Returns a slice of words.
182+
// If there is no word in a string, return nil.
183+
//
184+
// Word is defined as a locale dependent string containing alphabetic characters,
185+
// which may also contain but not start with `'` and `-` characters.
186+
func WordSplit(str string) []string {
187+
if DeleteWhitespace(str) == "" {
188+
return []string{""}
189+
}
190+
191+
var word string
192+
var words []string
193+
var r rune
194+
var size, pos int
195+
196+
inWord := false
197+
198+
for len(str) > 0 {
199+
r, size = utf8.DecodeRuneInString(str)
200+
201+
switch {
202+
case IsAlphabet(r):
203+
if !inWord {
204+
inWord = true
205+
word = str
206+
pos = 0
207+
}
208+
209+
case inWord && (r == '\'' || r == '-'):
210+
// Still in word.
211+
212+
default:
213+
if inWord {
214+
inWord = false
215+
words = append(words, word[:pos])
216+
}
217+
}
218+
219+
pos += size
220+
str = str[size:]
221+
}
222+
223+
if inWord {
224+
words = append(words, word[:pos])
225+
}
226+
227+
return words
228+
}

stringutils_test.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,3 +221,19 @@ func TestAppendIfMissing(t *testing.T) {
221221
result = AppendIfMissing("abc.exe", ".exe")
222222
assertions.Equal("abc.exe", result)
223223
}
224+
225+
func TestWordSplit(t *testing.T) {
226+
assertions := assert.New(t)
227+
228+
result := WordSplit("abc")
229+
assertions.Equal([]string{"abc"}, result)
230+
231+
result = WordSplit("car house")
232+
assertions.Equal([]string{"car", "house"}, result)
233+
234+
result = WordSplit("car-house")
235+
assertions.Equal([]string{"car-house"}, result)
236+
237+
result = WordSplit("")
238+
assertions.Equal([]string{""}, result)
239+
}

0 commit comments

Comments
 (0)