@@ -7,8 +7,6 @@ TypeCountTokenizer <- function(x) {
77
88
99create_corpus <- function (metadata , text , stops ) {
10- # log text example content
11- vflog $ info(paste(" vis_id:" , .GlobalEnv $ VIS_ID , " text example content:" , text $ content [1 ], collapse = " \n " ))
1210 docs <- data.frame (doc_id = text $ id , text = text $ content )
1311 corpus <- VCorpus(DataframeSource(docs ))
1412
@@ -32,15 +30,7 @@ create_corpus <- function(metadata, text, stops) {
3230
3331
3432create_tdm_matrix <- function (corpus , sparsity = 1 ) {
35- # log example content from the corpus object
36- vflog $ info(paste(" vis_id:" , .GlobalEnv $ VIS_ID , " corpus example content:" , corpus [[1 ]]$ content , collapse = " \n " ))
3733 tdm <- TermDocumentMatrix(corpus )
38- # log all available information about tdm
39- vflog $ info(paste(" vis_id:" , .GlobalEnv $ VIS_ID , " tdm dimensions:" , dim(tdm )))
40- vflog $ info(paste(" vis_id:" , .GlobalEnv $ VIS_ID , " tdm sparsity:" , sum(tdm == 0 ) / prod(dim(tdm ))))
41- vflog $ info(paste(" vis_id:" , .GlobalEnv $ VIS_ID , " tdm max value:" , max(tdm )))
42- vflog $ info(paste(" vis_id:" , .GlobalEnv $ VIS_ID , " tdm min value:" , min(tdm )))
43- vflog $ info(paste(" vis_id:" , .GlobalEnv $ VIS_ID , " tdm NA values:" , sum(is.na(tdm ))))
4434 if (sparsity < 1 ) {
4535 tdm <- removeSparseTerms(tdm , sparsity )
4636 }
0 commit comments