Skip to content

Commit cec92cb

Browse files
committed
cleanup summarization
1 parent 5247a33 commit cec92cb

3 files changed

Lines changed: 33 additions & 32 deletions

File tree

server/preprocessing/other-scripts/preprocess.R

Lines changed: 22 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -85,44 +85,39 @@ deduplicate_titles <- function(metadata, list_size) {
8585

8686
}
8787

88-
replace_keywords_if_empty <- function(metadata, stops, service) {
88+
replace_keywords_if_empty <- function(metadata, stops) {
8989
metadata$subject <- unlist(lapply(metadata$subject, function(x) {gsub(" +", " ", x)}))
9090
missing_subjects = which(lapply(metadata$subject, function(x) {nchar(x)}) <= 1)
91-
vplog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Documents without subjects:", length(missing_subjects)))
92-
if (service == "linkedcat" || service == "linkedcat_authorview" || service == "linkedcat_browseview") {
93-
metadata$subject[missing_subjects] <- metadata$bkl_caption[missing_subjects]
94-
metadata$subject[is.na(metadata$subject)] <- ""
95-
} else {
96-
candidates = mapply(paste, metadata$title)
97-
candidates = mclapply(candidates, function(x)paste(removeWords(x, stops), collapse=""))
98-
candidates = lapply(candidates, function(x) {gsub("[^[:alpha:]]", " ", x)})
99-
candidates = lapply(candidates, function(x) {gsub(" +", " ", x)})
100-
candidates_bigrams = lapply(lapply(candidates, function(x)unlist(lapply(ngrams(unlist(strsplit(x, split=" ")), 2), paste, collapse="_"))), paste, collapse=" ")
101-
#candidates_trigrams = lapply(lapply(candidates, function(x)unlist(lapply(ngrams(unlist(strsplit(x, split=" ")), 3), paste, collapse="_"))), paste, collapse=" ")
102-
candidates = mapply(paste, candidates, candidates_bigrams)
103-
#candidates = lapply(candidates, function(x) {gsub('\\b\\d+\\s','', x)})
104-
105-
nn_corpus = Corpus(VectorSource(candidates))
106-
nn_tfidf = TermDocumentMatrix(nn_corpus, control = list(tokenize = SplitTokenizer, weighting = function(x) weightSMART(x, spec="ntn")))
107-
tfidf_top = apply(nn_tfidf, 2, function(x) {x2 <- sort(x, TRUE);x2[x2>=x2[3]]})
108-
tfidf_top_names = lapply(tfidf_top, names)
109-
replacement_keywords <- mclapply(tfidf_top_names, function(x) filter_out_nested_ngrams(x, 3))
110-
replacement_keywords = lapply(replacement_keywords, FUN = function(x) {paste(unlist(x), collapse="; ")})
111-
replacement_keywords = gsub("_", " ", replacement_keywords)
112-
113-
metadata$subject[missing_subjects] <- replacement_keywords[missing_subjects]
91+
if (length(missing_subjects) == 0) {
92+
return(metadata)
11493
}
94+
vplog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Documents without subjects:", length(missing_subjects)))
95+
candidates = mapply(paste, metadata$title)
96+
candidates = mclapply(candidates, function(x)paste(removeWords(x, stops), collapse=""))
97+
candidates = lapply(candidates, function(x) {gsub("[^[:alpha:]]", " ", x)})
98+
candidates = lapply(candidates, function(x) {gsub(" +", " ", x)})
99+
candidates_bigrams = lapply(lapply(candidates, expand_ngrams, n=2), paste, collapse=" ")
100+
candidates = mapply(paste, candidates, candidates_bigrams)
101+
102+
nn_corpus = Corpus(VectorSource(candidates))
103+
nn_tfidf = TermDocumentMatrix(nn_corpus)
104+
tfidf_top = apply(nn_tfidf, 2, function(x) {x2 <- sort(x, TRUE);x2[x2>=x2[3]]})
105+
tfidf_top_names = lapply(tfidf_top, names)
106+
replacement_keywords <- mclapply(tfidf_top_names, function(x) filter_out_nested_ngrams(x, 3))
107+
replacement_keywords = lapply(replacement_keywords, FUN = function(x) {paste(unlist(x), collapse="; ")})
108+
replacement_keywords = gsub("_", " ", replacement_keywords)
109+
110+
metadata$subject[missing_subjects] <- replacement_keywords[missing_subjects]
115111
missing_subjects = which(lapply(metadata$subject, function(x) {nchar(x)}) <= 1)
112+
vplog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Documents without subjects after replacing from title:", length(missing_subjects)))
116113
if (length(missing_subjects) > 0) {
117114
for (i in missing_subjects) {
118115
candidates = mapply(paste, metadata$title[i], metadata$paper_abstract[i])
119116
candidates = lapply(candidates, function(x)paste(removeWords(x, stops), collapse=""))
120117
candidates = lapply(candidates, function(x) {gsub("[^[:alpha:]]", " ", x)})
121118
candidates = lapply(candidates, function(x) {gsub(" +", " ", x)})
122-
candidates_bigrams = lapply(lapply(candidates, function(x)unlist(lapply(ngrams(unlist(strsplit(x, split=" ")), 2), paste, collapse="_"))), paste, collapse=" ")
123-
#candidates_trigrams = lapply(lapply(candidates, function(x)unlist(lapply(ngrams(unlist(strsplit(x, split=" ")), 3), paste, collapse="_"))), paste, collapse=" ")
119+
candidates_bigrams = lapply(lapply(candidates, expand_ngrams, n=2), paste, collapse=" ")
124120
candidates = mapply(paste, candidates, candidates_bigrams)
125-
#candidates = lapply(candidates, function(x) {gsub('\\b\\d+\\s','', x)})
126121
nn_count = sort(table(strsplit(candidates, " ")), decreasing = T)
127122
replacement_keywords <- filter_out_nested_ngrams(names(nn_count), 3)
128123
replacement_keywords = lapply(replacement_keywords, FUN = function(x) {paste(unlist(x), collapse="; ")})

server/preprocessing/other-scripts/summarize.R

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,11 @@ SplitTokenizer <- function(x) {
99
trim <- function (x) gsub("^\\s+|\\s+$", "", x)
1010

1111

12+
expand_ngrams <- function(text, n) {
13+
text <- trimws(text)
14+
lapply(lapply(text, function(x)unlist(lapply(ngrams(unlist(strsplit(x, split = " ")), n), paste, collapse = "_"))), paste, collapse = " ")
15+
}
16+
1217
prune_ngrams <- function(ngrams, stops){
1318
ngrams = mapply(strsplit, ngrams, split=" |;")
1419
tokenized_ngrams = mapply(function(x) {
@@ -64,8 +69,8 @@ create_cluster_labels <- function(clusters, metadata,
6469
candidates = lapply(candidates, function(x)paste(removeWords(x, stops), collapse=""))
6570
candidates = lapply(candidates, function(x) {gsub("[^[:alpha:]]", " ", x)})
6671
candidates = lapply(candidates, function(x) {gsub(" +", " ", x)})
67-
candidates_bigrams = lapply(lapply(candidates, function(x)unlist(lapply(ngrams(unlist(strsplit(x, split=" ")), 2), paste, collapse="_"))), paste, collapse=" ")
68-
candidates_trigrams = lapply(lapply(candidates, function(x)unlist(lapply(ngrams(unlist(strsplit(x, split=" ")), 3), paste, collapse="_"))), paste, collapse=" ")
72+
candidates_bigrams = lapply(lapply(candidates, expand_ngrams, n=2), paste, collapse=" ")
73+
candidates_trigrams = lapply(lapply(candidates, expand_ngrams, n=3), paste, collapse=" ")
6974
candidates = mapply(paste, candidates, candidates_bigrams, candidates_trigrams)
7075
nn_count = sort(table(strsplit(paste(candidates, collapse=" "), " ")), decreasing = T)
7176
summary <- filter_out_nested_ngrams(names(nn_count), 3)
@@ -191,10 +196,11 @@ fill_empty_clusters <- function(nn_tfidf, nn_corpus){
191196
return(replacement_tfidf_top)
192197
}
193198

199+
194200
get_title_ngrams <- function(titles, stops, ngram_lengths) {
195201
# for ngrams: we have to collapse with "_" or else tokenizers will split ngrams again at that point and we'll be left with unigrams
196-
titles_bigrams = prune_ngrams(lapply(lapply(titles, function(x)unlist(lapply(ngrams(unlist(strsplit(x, split = " ")), 2), paste, collapse = "_"))), paste, collapse = " "), stops)
197-
titles_trigrams = prune_ngrams(lapply(lapply(titles, function(x)unlist(lapply(ngrams(unlist(strsplit(x, split = " ")), 3), paste, collapse = "_"))), paste, collapse = " "), stops)
202+
titles_bigrams = prune_ngrams(expand_ngrams(titles, 2), stops)
203+
titles_trigrams = prune_ngrams(expand_ngrams(titles, 3), stops)
198204
return(c(titles_bigrams, titles_trigrams))
199205
}
200206

server/preprocessing/other-scripts/vis_layout.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ vis_layout <- function(text, metadata, service,
6565
layout <- get_ndms(as.dist(features), mindim=2, maxdim=2)
6666

6767
vlog$debug("get cluster summaries")
68-
metadata = replace_keywords_if_empty(metadata, stops, service)
68+
metadata = replace_keywords_if_empty(metadata, stops)
6969
type_counts <- get_type_counts(corpus$unlowered)
7070
named_clusters <- create_cluster_labels(clusters, metadata,
7171
service, lang,

0 commit comments

Comments
 (0)