@@ -85,44 +85,39 @@ deduplicate_titles <- function(metadata, list_size) {
8585
8686}
8787
88- replace_keywords_if_empty <- function (metadata , stops , service ) {
88+ replace_keywords_if_empty <- function (metadata , stops ) {
8989 metadata $ subject <- unlist(lapply(metadata $ subject , function (x ) {gsub(" +" , " " , x )}))
9090 missing_subjects = which(lapply(metadata $ subject , function (x ) {nchar(x )}) < = 1 )
91- vplog $ info(paste(" vis_id:" , .GlobalEnv $ VIS_ID , " Documents without subjects:" , length(missing_subjects )))
92- if (service == " linkedcat" || service == " linkedcat_authorview" || service == " linkedcat_browseview" ) {
93- metadata $ subject [missing_subjects ] <- metadata $ bkl_caption [missing_subjects ]
94- metadata $ subject [is.na(metadata $ subject )] <- " "
95- } else {
96- candidates = mapply(paste , metadata $ title )
97- candidates = mclapply(candidates , function (x )paste(removeWords(x , stops ), collapse = " " ))
98- candidates = lapply(candidates , function (x ) {gsub(" [^[:alpha:]]" , " " , x )})
99- candidates = lapply(candidates , function (x ) {gsub(" +" , " " , x )})
100- candidates_bigrams = lapply(lapply(candidates , function (x )unlist(lapply(ngrams(unlist(strsplit(x , split = " " )), 2 ), paste , collapse = " _" ))), paste , collapse = " " )
101- # candidates_trigrams = lapply(lapply(candidates, function(x)unlist(lapply(ngrams(unlist(strsplit(x, split=" ")), 3), paste, collapse="_"))), paste, collapse=" ")
102- candidates = mapply(paste , candidates , candidates_bigrams )
103- # candidates = lapply(candidates, function(x) {gsub('\\b\\d+\\s','', x)})
104-
105- nn_corpus = Corpus(VectorSource(candidates ))
106- nn_tfidf = TermDocumentMatrix(nn_corpus , control = list (tokenize = SplitTokenizer , weighting = function (x ) weightSMART(x , spec = " ntn" )))
107- tfidf_top = apply(nn_tfidf , 2 , function (x ) {x2 <- sort(x , TRUE );x2 [x2 > = x2 [3 ]]})
108- tfidf_top_names = lapply(tfidf_top , names )
109- replacement_keywords <- mclapply(tfidf_top_names , function (x ) filter_out_nested_ngrams(x , 3 ))
110- replacement_keywords = lapply(replacement_keywords , FUN = function (x ) {paste(unlist(x ), collapse = " ; " )})
111- replacement_keywords = gsub(" _" , " " , replacement_keywords )
112-
113- metadata $ subject [missing_subjects ] <- replacement_keywords [missing_subjects ]
91+ if (length(missing_subjects ) == 0 ) {
92+ return (metadata )
11493 }
94+ vplog $ info(paste(" vis_id:" , .GlobalEnv $ VIS_ID , " Documents without subjects:" , length(missing_subjects )))
95+ candidates = mapply(paste , metadata $ title )
96+ candidates = mclapply(candidates , function (x )paste(removeWords(x , stops ), collapse = " " ))
97+ candidates = lapply(candidates , function (x ) {gsub(" [^[:alpha:]]" , " " , x )})
98+ candidates = lapply(candidates , function (x ) {gsub(" +" , " " , x )})
99+ candidates_bigrams = lapply(lapply(candidates , expand_ngrams , n = 2 ), paste , collapse = " " )
100+ candidates = mapply(paste , candidates , candidates_bigrams )
101+
102+ nn_corpus = Corpus(VectorSource(candidates ))
103+ nn_tfidf = TermDocumentMatrix(nn_corpus )
104+ tfidf_top = apply(nn_tfidf , 2 , function (x ) {x2 <- sort(x , TRUE );x2 [x2 > = x2 [3 ]]})
105+ tfidf_top_names = lapply(tfidf_top , names )
106+ replacement_keywords <- mclapply(tfidf_top_names , function (x ) filter_out_nested_ngrams(x , 3 ))
107+ replacement_keywords = lapply(replacement_keywords , FUN = function (x ) {paste(unlist(x ), collapse = " ; " )})
108+ replacement_keywords = gsub(" _" , " " , replacement_keywords )
109+
110+ metadata $ subject [missing_subjects ] <- replacement_keywords [missing_subjects ]
115111 missing_subjects = which(lapply(metadata $ subject , function (x ) {nchar(x )}) < = 1 )
112+ vplog $ info(paste(" vis_id:" , .GlobalEnv $ VIS_ID , " Documents without subjects after replacing from title:" , length(missing_subjects )))
116113 if (length(missing_subjects ) > 0 ) {
117- for (i in missing_subjects ) {
114+ foreach (i = missing_subjects ) % dopar % {
118115 candidates = mapply(paste , metadata $ title [i ], metadata $ paper_abstract [i ])
119116 candidates = lapply(candidates , function (x )paste(removeWords(x , stops ), collapse = " " ))
120117 candidates = lapply(candidates , function (x ) {gsub(" [^[:alpha:]]" , " " , x )})
121118 candidates = lapply(candidates , function (x ) {gsub(" +" , " " , x )})
122- candidates_bigrams = lapply(lapply(candidates , function (x )unlist(lapply(ngrams(unlist(strsplit(x , split = " " )), 2 ), paste , collapse = " _" ))), paste , collapse = " " )
123- # candidates_trigrams = lapply(lapply(candidates, function(x)unlist(lapply(ngrams(unlist(strsplit(x, split=" ")), 3), paste, collapse="_"))), paste, collapse=" ")
119+ candidates_bigrams = lapply(lapply(candidates , expand_ngrams , n = 2 ), paste , collapse = " " )
124120 candidates = mapply(paste , candidates , candidates_bigrams )
125- # candidates = lapply(candidates, function(x) {gsub('\\b\\d+\\s','', x)})
126121 nn_count = sort(table(strsplit(candidates , " " )), decreasing = T )
127122 replacement_keywords <- filter_out_nested_ngrams(names(nn_count ), 3 )
128123 replacement_keywords = lapply(replacement_keywords , FUN = function (x ) {paste(unlist(x ), collapse = " ; " )})
0 commit comments