Skip to content

Commit 469b904

Browse files
committed
fix bad tokenizer
1 parent 9c0f5ba commit 469b904

1 file changed

Lines changed: 2 additions & 3 deletions

File tree

server/preprocessing/other-scripts/features.R

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2,11 +2,10 @@ library(stringr)
22
vflog <- getLogger('vis.features')
33

44
TypeCountTokenizer <- function(x) {
5-
tokens = unlist(lapply(strsplit(words(x), split=";"), paste), use.names = FALSE)
6-
tokens = unlist(lapply(tokens, stri_replace_all, regex='[^[:alnum:]-]', replacement=""))
7-
return(tokens)
5+
unlist(strsplit(as.character(x), "[^[:alnum:]-]"))
86
}
97

8+
109
create_corpus <- function(metadata, text, stops) {
1110
docs <- data.frame(doc_id = text$id, text = text$content)
1211
corpus <- VCorpus(DataframeSource(docs))

0 commit comments

Comments
 (0)