Skip to content

Commit e042e31

Browse files
committed
various bugfixes
1 parent bf13b7d commit e042e31

3 files changed

Lines changed: 5 additions & 4 deletions

File tree

server/preprocessing/other-scripts/base.R

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,7 @@ etl <- function(res, repo, non_public) {
189189
subject_cleaned = gsub("[^:;]+ ?:: ?[^;]+(;|$)?", "", subject_cleaned) #remove classification with separator ::
190190
subject_cleaned = gsub("[^\\[;]+\\[[A-Z,0-9]+\\](;|$)?", "", subject_cleaned) # remove WHO classification
191191
subject_cleaned = gsub("Info:\\w+-(\\w+\\/)+", "", subject_cleaned) # remove Info:eu-repo/classification/
192-
subject_cleaned = gsub("([A-Za-z]+:[A-Za-z0-9][A-Za-z0-9 \\/\\.]+);?", "", subject_cleaned, perl=TRUE) # clean up annotations with prefix e.g. theme:annotation
192+
subject_cleaned = gsub("([A-Za-z]+:[A-Za-z0-9 \\/\\.-]+);?", "", subject_cleaned, perl=TRUE) # clean up annotations with prefix e.g. theme:annotation
193193
if (!is.null(params$vis_type) && params$vis_type == "timeline") {
194194
subject_cleaned = gsub("FOS ", "", subject_cleaned) # remove FOS classification tag, but keep classification name
195195
arxiv_classification_string = "(cs|econ|eess|math|astro-ph|nlin|q-bio|q-fin|stat)\\.[A-Z]{2}|cond-mat\\.[a-z\\-]+|hep-(ex|lat|ph|th)|math-ph|nucl-(ex|th)|physics\\.[a-z\\-]+|(astro-ph|gr-qc|quant-ph|cond-mat)"
@@ -215,6 +215,7 @@ etl <- function(res, repo, non_public) {
215215
subject_cleaned = gsub(": ", "", subject_cleaned) # clean up keyword separation
216216
subject_cleaned = gsub("^; $", "", subject_cleaned) # clean up keyword separation
217217
subject_cleaned = gsub(";+", ";", subject_cleaned) # clean up keyword separation
218+
subject_cleaned = gsub(",+", ",", subject_cleaned) # clean up keyword separation
218219
subject_cleaned = gsub(",", ", ", subject_cleaned) # clean up keyword separation
219220
subject_cleaned = gsub("\\s+", " ", subject_cleaned) # clean up keyword separation
220221
subject_cleaned = stringi::stri_trim(subject_cleaned) # clean up keyword separation

server/preprocessing/other-scripts/summarize.R

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,9 @@ create_cluster_labels <- function(clusters, metadata,
9797

9898
fix_cluster_labels <- function(clusterlabels, type_counts){
9999
unlist(mclapply(clusterlabels, function(x) {
100-
fix_keyword_casing(x, type_counts)
100+
x <- fix_keyword_casing(x, type_counts)
101+
# clean up titles from format issues
102+
x <- gsub(",+", ",", x)
101103
}))
102104
}
103105

server/workers/dataprocessing/src/headstart.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,8 +92,6 @@ def run(self):
9292
self.logger.error("Could not connect to remote Redis server, is the SSH tunnel open?")
9393
try:
9494
if params.get('vis_type') == "timeline":
95-
# the step of create_map can be dropped once deduplication is possible in API backend as well
96-
# TODO: create deduplicate endpoint in service worker and connect to that
9795
metadata = self.execute_search(params, input_data)
9896
sg_data = sg.get_streamgraph_data(json.loads(metadata),
9997
params.get('q'),

0 commit comments

Comments
 (0)