cleanup summarization

chreman · chreman · commit cec92cb7e567 · 2021-12-24T12:21:28.000+01:00
diff --git a/server/preprocessing/other-scripts/preprocess.R b/server/preprocessing/other-scripts/preprocess.R
@@ -85,44 +85,39 @@ deduplicate_titles <- function(metadata, list_size) {
 
 }
 
-replace_keywords_if_empty <- function(metadata, stops, service) {
+replace_keywords_if_empty <- function(metadata, stops) {
   metadata$subject <- unlist(lapply(metadata$subject, function(x) {gsub(" +", " ", x)}))
   missing_subjects = which(lapply(metadata$subject, function(x) {nchar(x)}) <= 1)
-  vplog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Documents without subjects:", length(missing_subjects)))
-  if (service == "linkedcat" || service == "linkedcat_authorview" || service == "linkedcat_browseview") {
-    metadata$subject[missing_subjects] <- metadata$bkl_caption[missing_subjects]
-    metadata$subject[is.na(metadata$subject)] <- ""
-  } else {
-    candidates = mapply(paste, metadata$title)
-    candidates = mclapply(candidates, function(x)paste(removeWords(x, stops), collapse=""))
-    candidates = lapply(candidates, function(x) {gsub("[^[:alpha:]]", " ", x)})
-    candidates = lapply(candidates, function(x) {gsub(" +", " ", x)})
-    candidates_bigrams = lapply(lapply(candidates, function(x)unlist(lapply(ngrams(unlist(strsplit(x, split=" ")), 2), paste, collapse="_"))), paste, collapse=" ")
-    #candidates_trigrams = lapply(lapply(candidates, function(x)unlist(lapply(ngrams(unlist(strsplit(x, split=" ")), 3), paste, collapse="_"))), paste, collapse=" ")
-    candidates = mapply(paste, candidates, candidates_bigrams)
-    #candidates = lapply(candidates, function(x) {gsub('\\b\\d+\\s','', x)})
-
-    nn_corpus = Corpus(VectorSource(candidates))
-    nn_tfidf = TermDocumentMatrix(nn_corpus, control = list(tokenize = SplitTokenizer, weighting = function(x) weightSMART(x, spec="ntn")))
-    tfidf_top = apply(nn_tfidf, 2, function(x) {x2 <- sort(x, TRUE);x2[x2>=x2[3]]})
-    tfidf_top_names = lapply(tfidf_top, names)
-    replacement_keywords <- mclapply(tfidf_top_names, function(x) filter_out_nested_ngrams(x, 3))
-    replacement_keywords = lapply(replacement_keywords, FUN = function(x) {paste(unlist(x), collapse="; ")})
-    replacement_keywords = gsub("_", " ", replacement_keywords)
-
-    metadata$subject[missing_subjects] <- replacement_keywords[missing_subjects]
+  if (length(missing_subjects) == 0) {
+    return(metadata)
   }
+  vplog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Documents without subjects:", length(missing_subjects)))
+  candidates = mapply(paste, metadata$title)
+  candidates = mclapply(candidates, function(x)paste(removeWords(x, stops), collapse=""))
+  candidates = lapply(candidates, function(x) {gsub("[^[:alpha:]]", " ", x)})
+  candidates = lapply(candidates, function(x) {gsub(" +", " ", x)})
+  candidates_bigrams = lapply(lapply(candidates, expand_ngrams, n=2), paste, collapse=" ")
+  candidates = mapply(paste, candidates, candidates_bigrams)
+
+  nn_corpus = Corpus(VectorSource(candidates))
+  nn_tfidf = TermDocumentMatrix(nn_corpus)
+  tfidf_top = apply(nn_tfidf, 2, function(x) {x2 <- sort(x, TRUE);x2[x2>=x2[3]]})
+  tfidf_top_names = lapply(tfidf_top, names)
+  replacement_keywords <- mclapply(tfidf_top_names, function(x) filter_out_nested_ngrams(x, 3))
+  replacement_keywords = lapply(replacement_keywords, FUN = function(x) {paste(unlist(x), collapse="; ")})
+  replacement_keywords = gsub("_", " ", replacement_keywords)
+
+  metadata$subject[missing_subjects] <- replacement_keywords[missing_subjects]
   missing_subjects = which(lapply(metadata$subject, function(x) {nchar(x)}) <= 1)
+  vplog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Documents without subjects after replacing from title:", length(missing_subjects)))
   if (length(missing_subjects) > 0) {
     for (i in missing_subjects) {
       candidates = mapply(paste, metadata$title[i], metadata$paper_abstract[i])
       candidates = lapply(candidates, function(x)paste(removeWords(x, stops), collapse=""))
       candidates = lapply(candidates, function(x) {gsub("[^[:alpha:]]", " ", x)})
       candidates = lapply(candidates, function(x) {gsub(" +", " ", x)})
-      candidates_bigrams = lapply(lapply(candidates, function(x)unlist(lapply(ngrams(unlist(strsplit(x, split=" ")), 2), paste, collapse="_"))), paste, collapse=" ")
-      #candidates_trigrams = lapply(lapply(candidates, function(x)unlist(lapply(ngrams(unlist(strsplit(x, split=" ")), 3), paste, collapse="_"))), paste, collapse=" ")
+      candidates_bigrams = lapply(lapply(candidates, expand_ngrams, n=2), paste, collapse=" ")
       candidates = mapply(paste, candidates, candidates_bigrams)
-      #candidates = lapply(candidates, function(x) {gsub('\\b\\d+\\s','', x)})
       nn_count = sort(table(strsplit(candidates, " ")), decreasing = T)
       replacement_keywords <- filter_out_nested_ngrams(names(nn_count), 3)
       replacement_keywords = lapply(replacement_keywords, FUN = function(x) {paste(unlist(x), collapse="; ")})
diff --git a/server/preprocessing/other-scripts/summarize.R b/server/preprocessing/other-scripts/summarize.R
@@ -9,6 +9,11 @@ SplitTokenizer <- function(x) {
 trim <- function (x) gsub("^\\s+|\\s+$", "", x)
 
 
+expand_ngrams <- function(text, n) {  # text: strings to expand; n: n-gram size handed to ngrams(); returns a list with one string per element of text
+  text <- trimws(text)  # drop leading/trailing whitespace so the split on " " below does not start or end with an empty token
+  lapply(lapply(text, function(x)unlist(lapply(ngrams(unlist(strsplit(x, split = " ")), n), paste, collapse  = "_"))), paste, collapse = " ")  # each item returned by ngrams() is collapsed with "_", then the items of one element are collapsed with " "
+}
+
 prune_ngrams <- function(ngrams, stops){
   ngrams = mapply(strsplit, ngrams, split=" |;")
   tokenized_ngrams = mapply(function(x) {
@@ -64,8 +69,8 @@ create_cluster_labels <- function(clusters, metadata,
       candidates = lapply(candidates, function(x)paste(removeWords(x, stops), collapse=""))
       candidates = lapply(candidates, function(x) {gsub("[^[:alpha:]]", " ", x)})
       candidates = lapply(candidates, function(x) {gsub(" +", " ", x)})
-      candidates_bigrams = lapply(lapply(candidates, function(x)unlist(lapply(ngrams(unlist(strsplit(x, split=" ")), 2), paste, collapse="_"))), paste, collapse=" ")
-      candidates_trigrams = lapply(lapply(candidates, function(x)unlist(lapply(ngrams(unlist(strsplit(x, split=" ")), 3), paste, collapse="_"))), paste, collapse=" ")
+      candidates_bigrams = lapply(lapply(candidates, expand_ngrams, n=2), paste, collapse=" ")
+      candidates_trigrams = lapply(lapply(candidates, expand_ngrams, n=3), paste, collapse=" ")
       candidates = mapply(paste, candidates, candidates_bigrams, candidates_trigrams)
       nn_count = sort(table(strsplit(paste(candidates, collapse=" "), " ")), decreasing = T)
       summary <- filter_out_nested_ngrams(names(nn_count), 3)
@@ -191,10 +196,11 @@ fill_empty_clusters <- function(nn_tfidf, nn_corpus){
   return(replacement_tfidf_top)
 }
 
+
 get_title_ngrams <- function(titles, stops, ngram_lengths) {
   # for ngrams: we have to collapse with "_" or else tokenizers will split ngrams again at that point and we'll be left with unigrams
-  titles_bigrams = prune_ngrams(lapply(lapply(titles, function(x)unlist(lapply(ngrams(unlist(strsplit(x, split = " ")), 2), paste, collapse  = "_"))), paste, collapse = " "), stops)
-  titles_trigrams = prune_ngrams(lapply(lapply(titles, function(x)unlist(lapply(ngrams(unlist(strsplit(x, split = " ")), 3), paste, collapse = "_"))), paste, collapse = " "), stops)
+  titles_bigrams = prune_ngrams(expand_ngrams(titles, 2), stops)  # bigram strings built by expand_ngrams(), then passed through prune_ngrams() with stops
+  titles_trigrams = prune_ngrams(expand_ngrams(titles, 3), stops)  # same for trigrams; note that ngram_lengths is not read anywhere in this body
   return(c(titles_bigrams, titles_trigrams))
 }
 
diff --git a/server/preprocessing/other-scripts/vis_layout.R b/server/preprocessing/other-scripts/vis_layout.R
@@ -65,7 +65,7 @@ vis_layout <- function(text, metadata, service,
     layout <- get_ndms(as.dist(features), mindim=2, maxdim=2)
 
     vlog$debug("get cluster summaries")
-    metadata = replace_keywords_if_empty(metadata, stops, service)
+    metadata = replace_keywords_if_empty(metadata, stops)
     type_counts <- get_type_counts(corpus$unlowered)
     named_clusters <- create_cluster_labels(clusters, metadata,
                                             service, lang,