Skip to content

Commit b392500

Browse files
authored
Merge pull request #619 from OpenKnowledgeMaps/scaling-upgrades-M2
Scaling upgrades
2 parents 2ce53a5 + fd96136 commit b392500

67 files changed

Lines changed: 2224 additions & 2596 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ dist/
1818
.Rhistory
1919
.ipynb_checkpoints
2020
.Rprofile
21+
.Rproj*
2122
coverage/
2223

2324
# local deployment files
@@ -33,4 +34,5 @@ coverage/
3334
/lc_browseview_cache.json
3435
/lc_cache.json
3536
/linkedcat.sqlite
36-
.env
37+
.env
38+
.Rproj.user

doc/server_config.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,4 +90,4 @@ Duplicate config.ini in server/preprocessing/conf/ and rename it to config_local
9090
## Logging configuration
9191

9292
In the default setting, Headstart will log behavior only to the console. If you want to log to a file, please add the following environment variable to your Renviron (in local mode) or Renviron.site (if called on a server). Headstart will then log events to a file on the `INFO` loglevel.
93-
* `HEADSTART_LOGFILE`: Path to a logfile, e.g. `/path/to/logfile.log`. Please make sure that the folder structure exists, e.g. `/path/to/`.
93+
* `LOGFILE`: Path to a logfile, e.g. `/path/to/logfile.log`. Please make sure that the folder structure exists, e.g. `/path/to/`.

docker-compose-dataworker.yml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
version: '3.7'
2+
3+
services:
4+
5+
dataprocessing:
6+
image: dataprocessing:${SERVICE_VERSION}
7+
env_file:
8+
- server/workers/dataprocessing/dataprocessing.env
9+
environment:
10+
SERVICE_VERSION: "${SERVICE_VERSION}"
11+
REDIS_HOST: "${REDIS_HOST}"
12+
REDIS_PORT: "${REDIS_PORT}"
13+
REDIS_DB: "${REDIS_DB}"
14+
REDIS_PASSWORD: "${REDIS_PASSWORD}"
15+
REDIS_SSL: "${REDIS_SSL}"
16+
restart: always
17+
volumes:
18+
- /opt/local/renv/cache:/renv/cache
19+
- /var/log/headstart:/var/log/headstart
20+
network_mode: host

docker-compose.yml

Lines changed: 53 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ services:
2626
- headstart
2727

2828
redis:
29-
image: 'redis:4.0-alpine'
29+
image: 'redis:6.0-alpine'
3030
restart: always
3131
hostname: "${REDIS_HOST}"
3232
environment:
@@ -36,6 +36,9 @@ services:
3636
volumes:
3737
- 'redis:/var/lib/redis/data'
3838
- ./server/workers/redis.conf:/etc/redis/redis.conf
39+
- ./server/workers/certs:/etc/certs
40+
ports:
41+
- "127.0.0.1:${REDIS_PORT}:6379"
3942
restart: always
4043
networks:
4144
- headstart
@@ -49,6 +52,11 @@ services:
4952
REDIS_PORT: "${REDIS_PORT}"
5053
REDIS_PASSWORD: "${REDIS_PASSWORD}"
5154
REDIS_DB: "${REDIS_DB}"
55+
REDIS_SSL: "${REDIS_SSL}"
56+
BEHIND_PROXY: "${BEHIND_PROXY}"
57+
DEFAULT_DATABASE: "${DEFAULT_DATABASE}"
58+
DATABASES: "${DATABASES}"
59+
FLASK_ENV: "${FLASK_ENV}"
5260
command: ["gunicorn", "--workers", "10", "--threads", "2", "-b", "0.0.0.0:${API_PORT}", "app:app", "--timeout", "300"]
5361
depends_on:
5462
- redis
@@ -64,6 +72,10 @@ services:
6472
POSTGRES_PASSWORD: "${POSTGRES_PASSWORD}"
6573
POSTGRES_HOST: "${POSTGRES_HOST}"
6674
POSTGRES_PORT: "${POSTGRES_PORT}"
75+
BEHIND_PROXY: "${BEHIND_PROXY}"
76+
DEFAULT_DATABASE: "${DEFAULT_DATABASE}"
77+
DATABASES: "${DATABASES}"
78+
FLASK_ENV: "${FLASK_ENV}"
6779
command: ["gunicorn", "--workers", "10", "--threads", "2", "-b", "0.0.0.0:${API_PORT}", "app:app", "--timeout", "300"]
6880
networks:
6981
- headstart
@@ -78,6 +90,15 @@ services:
7890
REDIS_PORT: "${REDIS_PORT}"
7991
REDIS_DB: "${REDIS_DB}"
8092
REDIS_PASSWORD: "${REDIS_PASSWORD}"
93+
REDIS_SSL: "${REDIS_SSL}"
94+
LOGLEVEL: "${LOGLEVEL}"
95+
TRIPLE_USER: "${TRIPLE_USER}"
96+
TRIPLE_PASS: "${TRIPLE_PASS}"
97+
TRIPLE_HOST: "${TRIPLE_HOST}"
98+
TRIPLE_PORT: "${TRIPLE_PORT}"
99+
TRIPLE_DOCUMENTS_INDEX: "${TRIPLE_DOCUMENTS_INDEX}"
100+
TRIPLE_PROJECTS_INDEX: "${TRIPLE_PROJECTS_INDEX}"
101+
TRIPLE_AUTHORS_INDEX: "${TRIPLE_AUTHORS_INDEX}"
81102
restart: always
82103
depends_on:
83104
- redis
@@ -94,6 +115,8 @@ services:
94115
REDIS_PORT: "${REDIS_PORT}"
95116
REDIS_DB: "${REDIS_DB}"
96117
REDIS_PASSWORD: "${REDIS_PASSWORD}"
118+
REDIS_SSL: "${REDIS_SSL}"
119+
LOGLEVEL: "${LOGLEVEL}"
97120
restart: always
98121
depends_on:
99122
- redis
@@ -110,6 +133,8 @@ services:
110133
REDIS_PORT: "${REDIS_PORT}"
111134
REDIS_DB: "${REDIS_DB}"
112135
REDIS_PASSWORD: "${REDIS_PASSWORD}"
136+
REDIS_SSL: "${REDIS_SSL}"
137+
LOGLEVEL: "${LOGLEVEL}"
113138
restart: always
114139
volumes:
115140
- /opt/local/renv/cache:/renv/cache
@@ -121,14 +146,21 @@ services:
121146

122147
base:
123148
image: base:${SERVICE_VERSION}
124-
env_file:
125-
- server/workers/base/base.env
126149
environment:
127150
SERVICE_VERSION: "${SERVICE_VERSION}"
128151
REDIS_HOST: "${REDIS_HOST}"
129152
REDIS_PORT: "${REDIS_PORT}"
130153
REDIS_DB: "${REDIS_DB}"
131154
REDIS_PASSWORD: "${REDIS_PASSWORD}"
155+
REDIS_SSL: "${REDIS_SSL}"
156+
LOGLEVEL: "${LOGLEVEL}"
157+
LOGFILE: "/var/log/headstart/headstart.log"
158+
RENV_VERSION: 0.14.0-5
159+
CRAN_REPOS: https://cran.wu.ac.at
160+
LC_ALL: "en_US.UTF-8"
161+
LANG: "en_US.UTF-8"
162+
RENV_PATHS_CACHE: /renv/cache
163+
PYTHONIOENCODING: "utf-8"
132164
restart: always
133165
volumes:
134166
- /opt/local/renv/cache:/renv/cache
@@ -140,14 +172,21 @@ services:
140172

141173
pubmed:
142174
image: pubmed:${SERVICE_VERSION}
143-
env_file:
144-
- server/workers/pubmed/pubmed.env
145175
environment:
146176
SERVICE_VERSION: "${SERVICE_VERSION}"
147177
REDIS_HOST: "${REDIS_HOST}"
148178
REDIS_PORT: "${REDIS_PORT}"
149179
REDIS_DB: "${REDIS_DB}"
150180
REDIS_PASSWORD: "${REDIS_PASSWORD}"
181+
REDIS_SSL: "${REDIS_SSL}"
182+
LOGLEVEL: "${LOGLEVEL}"
183+
LOGFILE: "/var/log/headstart/headstart.log"
184+
RENV_VERSION: 0.14.0-5
185+
CRAN_REPOS: https://cran.wu.ac.at
186+
LC_ALL: "en_US.UTF-8"
187+
LANG: "en_US.UTF-8"
188+
RENV_PATHS_CACHE: /renv/cache
189+
PYTHONIOENCODING: "utf-8"
151190
restart: always
152191
volumes:
153192
- /opt/local/renv/cache:/renv/cache
@@ -159,14 +198,21 @@ services:
159198

160199
openaire:
161200
image: openaire:${SERVICE_VERSION}
162-
env_file:
163-
- server/workers/openaire/openaire.env
164201
environment:
165202
SERVICE_VERSION: "${SERVICE_VERSION}"
166203
REDIS_HOST: "${REDIS_HOST}"
167204
REDIS_PORT: "${REDIS_PORT}"
168205
REDIS_DB: "${REDIS_DB}"
169206
REDIS_PASSWORD: "${REDIS_PASSWORD}"
207+
REDIS_SSL: "${REDIS_SSL}"
208+
LOGLEVEL: "${LOGLEVEL}"
209+
LOGFILE: "/var/log/headstart/headstart.log"
210+
RENV_VERSION: 0.14.0-5
211+
CRAN_REPOS: https://cran.wu.ac.at
212+
LC_ALL: "en_US.UTF-8"
213+
LANG: "en_US.UTF-8"
214+
RENV_PATHS_CACHE: /renv/cache
215+
PYTHONIOENCODING: "utf-8"
170216
restart: always
171217
volumes:
172218
- /opt/local/renv/cache:/renv/cache

server/preprocessing/other-scripts/base.R

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
library(rbace)
2+
library(stringr)
23

34
# get_papers
45
#
@@ -173,7 +174,7 @@ find_dois <- function(link) {
173174
|| (startsWith(link, "https://doi.org"))
174175
|| (startsWith(link, "http://dx.doi.org"))
175176
|| (startsWith(link, "https://dx.doi.org"))) {
176-
doi <- stringr::str_replace(link, "http:", "https:")
177+
doi <- str_replace(link, "http:", "https:")
177178
} else {
178179
doi <- ""
179180
}

server/preprocessing/other-scripts/cluster.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ create_clusters <- function(distance_matrix, max_clusters=-1, method="ward.D") {
9191
vclog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Number of Clusters:", num_clusters, sep=" "))
9292
vclog$debug(paste("CutOff-Description:", attributes(cut_off)$description))
9393
}
94-
clusters = list("labels"=labels, "cluster"=cluster, "groups"=groups, "num_clusters"=num_clusters)
94+
clusters = list("labels"=labels, "groups"=groups, "num_clusters"=num_clusters)
9595
return(clusters)
9696

9797
}
Lines changed: 15 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,22 @@
1+
library(stringr)
12
vflog <- getLogger('vis.features')
23

3-
create_corpus <- function(metadata, text, languages=c("en")) {
4-
valid <- getStemLanguages()
5-
text["languages"] <- languages
6-
mapping <- list(content = "content", id = "id", languages = "languages")
7-
myReader <- readTabular(mapping = mapping)
4+
TypeCountTokenizer <- function(x) {
5+
unlist(strsplit(as.character(x), "[^[:alnum:]-]"))
6+
}
7+
88

9-
corpus <- Corpus(DataframeSource(text),
10-
readerControl = list(reader = myReader))
9+
create_corpus <- function(metadata, text, stops) {
10+
docs <- data.frame(doc_id = text$id, text = text$content)
11+
corpus <- VCorpus(DataframeSource(docs))
1112

1213
# Replace non-convertible bytes in with strings showing their hex codes,
1314
# see http://tm.r-forge.r-project.org/faq.html
1415
corpus <- tm_map(corpus, content_transformer(function(x) iconv(enc2utf8(x), sub = "byte")))
1516
unlowered <- corpus
1617
corpus <- tm_map(corpus, removePunctuation)
1718
corpus <- tm_map(corpus, content_transformer(tolower))
18-
corpus <- tm_map(corpus, remove_stop_words)
19+
corpus <- tm_map(corpus, removeWords, stops)
1920
corpus <- tm_map(corpus, stripWhitespace)
2021
unstemmed <- corpus
2122
stemmed <- tm_map(corpus, stemDocument)
@@ -41,22 +42,12 @@ get_distance_matrix <- function(tdm_matrix, method = "cosine") {
4142
return(distance_matrix)
4243
}
4344

45+
get_type_counts <- function(corpus) {
46+
type_counts = apply(TermDocumentMatrix(corpus, control=list(tokenize=TypeCountTokenizer, tolower = FALSE)), 1, sum)
47+
return(type_counts)
48+
}
49+
4450
concatenate_features <- function(...) {
4551
# expects a list of feature matrices which can be extended horizontally
4652
return(cbind(...))
47-
}
48-
49-
remove_stop_words <- function(x, languages) UseMethod("remove_stop_words", x)
50-
remove_stop_words.character <- function(x, languages) {
51-
y <- unlist(strsplit(x, " "))
52-
stops = list()
53-
for (lang in languages) {
54-
stops <- c(stops, get_stopwords(lang, TESTING))
55-
}
56-
stopword <- unlist(lapply(y, function(z) z %in% stops))
57-
doc <- y[which(!stopword)]
58-
doc <- paste(doc, collapse = " ")
59-
}
60-
remove_stop_words.PlainTextDocument <- function(x, languages = meta(x, "languages")) {
61-
content_transformer(remove_stop_words.character)(x, languages)
62-
}
53+
}

server/preprocessing/other-scripts/postprocess.R

Lines changed: 3 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -5,44 +5,23 @@ create_overview_output <- function(named_clusters, layout, metadata) {
55
x = layout$X1
66
y = layout$X2
77
labels = named_clusters$labels
8-
cluster = named_clusters$cluster
98
num_clusters = named_clusters$num_clusters
109
cluster_labels = named_clusters$cluster_labels
1110

1211
# Prepare the output
13-
result = data.frame(cbind(x, y, labels, cluster_labels))
12+
result = data.frame(cbind(x, y, labels, cluster_labels, metadata$id))
13+
names(result)[5] <- "id"
1414
unique_groups = data.frame(unique(result$cluster_labels))
1515
colnames(unique_groups) <- "cluster_labels"
1616
unique_groups$groups <- seq_along(unique_groups$cluster_labels)
1717
result = merge(result, unique_groups, by='cluster_labels')
18-
output = merge(metadata, result, by.x="id", by.y="labels", all=TRUE)
18+
output = merge(metadata, result, by.x="id", by.y="id", all=TRUE)
1919

2020
names(output)[names(output)=="groups"] <- "area_uri"
2121
output["area"] = paste(output$cluster_labels, sep="")
2222

2323
output_json = toJSON(output)
2424

25-
if(exists("DEBUG") && DEBUG == TRUE) {
26-
library(ggplot2)
27-
# Plot results from multidimensional scaling, highlight clusters with symbols
28-
temp <- fromJSON(output_json)
29-
temp$x <- as.numeric(temp$x)
30-
temp$y <- as.numeric(temp$y)
31-
temp$title <- unlist(lapply(temp$title, substr, start=0, stop=15))
32-
g <- ggplot(temp, aes(x, y, label=title)) +
33-
geom_point(aes(colour=area_uri)) +
34-
geom_text(size=2)
35-
ggsave(file = "debug_nmds.svg", plot = g, width = 15, height = 15)
36-
}
37-
38-
# NEEDS FIX
39-
# if(exists("DEBUG") && DEBUG == TRUE) {
40-
# # Write output to file
41-
# file_handle = file("output_file.csv", open="w")
42-
# write.csv(output, file=file_handle, row.names=FALSE)
43-
# close(file_handle)
44-
# }
45-
4625
return(output_json)
4726

4827
}

server/preprocessing/other-scripts/preprocess.R

Lines changed: 23 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -85,44 +85,39 @@ deduplicate_titles <- function(metadata, list_size) {
8585

8686
}
8787

88-
replace_keywords_if_empty <- function(metadata, stops, service) {
88+
replace_keywords_if_empty <- function(metadata, stops) {
8989
metadata$subject <- unlist(lapply(metadata$subject, function(x) {gsub(" +", " ", x)}))
9090
missing_subjects = which(lapply(metadata$subject, function(x) {nchar(x)}) <= 1)
91-
vplog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Documents without subjects:", length(missing_subjects)))
92-
if (service == "linkedcat" || service == "linkedcat_authorview" || service == "linkedcat_browseview") {
93-
metadata$subject[missing_subjects] <- metadata$bkl_caption[missing_subjects]
94-
metadata$subject[is.na(metadata$subject)] <- ""
95-
} else {
96-
candidates = mapply(paste, metadata$title)
97-
candidates = mclapply(candidates, function(x)paste(removeWords(x, stops), collapse=""))
98-
candidates = lapply(candidates, function(x) {gsub("[^[:alpha:]]", " ", x)})
99-
candidates = lapply(candidates, function(x) {gsub(" +", " ", x)})
100-
candidates_bigrams = lapply(lapply(candidates, function(x)unlist(lapply(ngrams(unlist(strsplit(x, split=" ")), 2), paste, collapse="_"))), paste, collapse=" ")
101-
#candidates_trigrams = lapply(lapply(candidates, function(x)unlist(lapply(ngrams(unlist(strsplit(x, split=" ")), 3), paste, collapse="_"))), paste, collapse=" ")
102-
candidates = mapply(paste, candidates, candidates_bigrams)
103-
#candidates = lapply(candidates, function(x) {gsub('\\b\\d+\\s','', x)})
104-
105-
nn_corpus = Corpus(VectorSource(candidates))
106-
nn_tfidf = TermDocumentMatrix(nn_corpus, control = list(tokenize = SplitTokenizer, weighting = function(x) weightSMART(x, spec="ntn")))
107-
tfidf_top = apply(nn_tfidf, 2, function(x) {x2 <- sort(x, TRUE);x2[x2>=x2[3]]})
108-
tfidf_top_names = lapply(tfidf_top, names)
109-
replacement_keywords <- mclapply(tfidf_top_names, function(x) filter_out_nested_ngrams(x, 3))
110-
replacement_keywords = lapply(replacement_keywords, FUN = function(x) {paste(unlist(x), collapse="; ")})
111-
replacement_keywords = gsub("_", " ", replacement_keywords)
112-
113-
metadata$subject[missing_subjects] <- replacement_keywords[missing_subjects]
91+
if (length(missing_subjects) == 0) {
92+
return(metadata)
11493
}
94+
vplog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Documents without subjects:", length(missing_subjects)))
95+
candidates = mapply(paste, metadata$title)
96+
candidates = mclapply(candidates, function(x)paste(removeWords(x, stops), collapse=""))
97+
candidates = lapply(candidates, function(x) {gsub("[^[:alpha:]]", " ", x)})
98+
candidates = lapply(candidates, function(x) {gsub(" +", " ", x)})
99+
candidates_bigrams = lapply(lapply(candidates, expand_ngrams, n=2), paste, collapse=" ")
100+
candidates = mapply(paste, candidates, candidates_bigrams)
101+
102+
nn_corpus = Corpus(VectorSource(candidates))
103+
nn_tfidf = TermDocumentMatrix(nn_corpus)
104+
tfidf_top = apply(nn_tfidf, 2, function(x) {x2 <- sort(x, TRUE);x2[x2>=x2[3]]})
105+
tfidf_top_names = lapply(tfidf_top, names)
106+
replacement_keywords <- mclapply(tfidf_top_names, function(x) filter_out_nested_ngrams(x, 3))
107+
replacement_keywords = lapply(replacement_keywords, FUN = function(x) {paste(unlist(x), collapse="; ")})
108+
replacement_keywords = gsub("_", " ", replacement_keywords)
109+
110+
metadata$subject[missing_subjects] <- replacement_keywords[missing_subjects]
115111
missing_subjects = which(lapply(metadata$subject, function(x) {nchar(x)}) <= 1)
112+
vplog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Documents without subjects after replacing from title:", length(missing_subjects)))
116113
if (length(missing_subjects) > 0) {
117-
for (i in missing_subjects) {
114+
foreach (i = missing_subjects) %dopar% {
118115
candidates = mapply(paste, metadata$title[i], metadata$paper_abstract[i])
119116
candidates = lapply(candidates, function(x)paste(removeWords(x, stops), collapse=""))
120117
candidates = lapply(candidates, function(x) {gsub("[^[:alpha:]]", " ", x)})
121118
candidates = lapply(candidates, function(x) {gsub(" +", " ", x)})
122-
candidates_bigrams = lapply(lapply(candidates, function(x)unlist(lapply(ngrams(unlist(strsplit(x, split=" ")), 2), paste, collapse="_"))), paste, collapse=" ")
123-
#candidates_trigrams = lapply(lapply(candidates, function(x)unlist(lapply(ngrams(unlist(strsplit(x, split=" ")), 3), paste, collapse="_"))), paste, collapse=" ")
119+
candidates_bigrams = lapply(lapply(candidates, expand_ngrams, n=2), paste, collapse=" ")
124120
candidates = mapply(paste, candidates, candidates_bigrams)
125-
#candidates = lapply(candidates, function(x) {gsub('\\b\\d+\\s','', x)})
126121
nn_count = sort(table(strsplit(candidates, " ")), decreasing = T)
127122
replacement_keywords <- filter_out_nested_ngrams(names(nn_count), 3)
128123
replacement_keywords = lapply(replacement_keywords, FUN = function(x) {paste(unlist(x), collapse="; ")})

0 commit comments

Comments (0)