Skip to content

Commit b392500

Browse files
authored
Merge pull request #619 from OpenKnowledgeMaps/scaling-upgrades-M2
Scaling upgrades
2 parents 2ce53a5 + fd96136 commit b392500

67 files changed

Lines changed: 2224 additions & 2596 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ dist/
1818
.Rhistory
1919
.ipynb_checkpoints
2020
.Rprofile
21+
.Rproj*
2122
coverage/
2223

2324
# local deployment files
@@ -33,4 +34,5 @@ coverage/
3334
/lc_browseview_cache.json
3435
/lc_cache.json
3536
/linkedcat.sqlite
36-
.env
37+
.env
38+
.Rproj.user

doc/server_config.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,4 +90,4 @@ Duplicate config.ini in server/preprocessing/conf/ and rename it to config_local
9090
## Logging configuration
9191

9292
In the default setting, Headstart will log behavior only to the console. If you want to log to a file, please add the following environment variable to your Renviron (in local mode) or Renviron.site (if called on a server). Headstart will then log events to a file on the `INFO` loglevel.
93-
* `HEADSTART_LOGFILE`: Path to a logfile, e.g. `/path/to/logfile.log`. Please make sure that the folder structure exists, e.g. `/path/to/`.
93+
* `LOGFILE`: Path to a logfile, e.g. `/path/to/logfile.log`. Please make sure that the folder structure exists, e.g. `/path/to/`.

docker-compose-dataworker.yml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
version: '3.7'
2+
3+
services:
4+
5+
dataprocessing:
6+
image: dataprocessing:${SERVICE_VERSION}
7+
env_file:
8+
- server/workers/dataprocessing/dataprocessing.env
9+
environment:
10+
SERVICE_VERSION: "${SERVICE_VERSION}"
11+
REDIS_HOST: "${REDIS_HOST}"
12+
REDIS_PORT: "${REDIS_PORT}"
13+
REDIS_DB: "${REDIS_DB}"
14+
REDIS_PASSWORD: "${REDIS_PASSWORD}"
15+
REDIS_SSL: "${REDIS_SSL}"
16+
restart: always
17+
volumes:
18+
- /opt/local/renv/cache:/renv/cache
19+
- /var/log/headstart:/var/log/headstart
20+
network_mode: host

docker-compose.yml

Lines changed: 53 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ services:
2626
- headstart
2727

2828
redis:
29-
image: 'redis:4.0-alpine'
29+
image: 'redis:6.0-alpine'
3030
restart: always
3131
hostname: "${REDIS_HOST}"
3232
environment:
@@ -36,6 +36,9 @@ services:
3636
volumes:
3737
- 'redis:/var/lib/redis/data'
3838
- ./server/workers/redis.conf:/etc/redis/redis.conf
39+
- ./server/workers/certs:/etc/certs
40+
ports:
41+
- "127.0.0.1:${REDIS_PORT}:6379"
3942
restart: always
4043
networks:
4144
- headstart
@@ -49,6 +52,11 @@ services:
4952
REDIS_PORT: "${REDIS_PORT}"
5053
REDIS_PASSWORD: "${REDIS_PASSWORD}"
5154
REDIS_DB: "${REDIS_DB}"
55+
REDIS_SSL: "${REDIS_SSL}"
56+
BEHIND_PROXY: "${BEHIND_PROXY}"
57+
DEFAULT_DATABASE: "${DEFAULT_DATABASE}"
58+
DATABASES: "${DATABASES}"
59+
FLASK_ENV: "${FLASK_ENV}"
5260
command: ["gunicorn", "--workers", "10", "--threads", "2", "-b", "0.0.0.0:${API_PORT}", "app:app", "--timeout", "300"]
5361
depends_on:
5462
- redis
@@ -64,6 +72,10 @@ services:
6472
POSTGRES_PASSWORD: "${POSTGRES_PASSWORD}"
6573
POSTGRES_HOST: "${POSTGRES_HOST}"
6674
POSTGRES_PORT: "${POSTGRES_PORT}"
75+
BEHIND_PROXY: "${BEHIND_PROXY}"
76+
DEFAULT_DATABASE: "${DEFAULT_DATABASE}"
77+
DATABASES: "${DATABASES}"
78+
FLASK_ENV: "${FLASK_ENV}"
6779
command: ["gunicorn", "--workers", "10", "--threads", "2", "-b", "0.0.0.0:${API_PORT}", "app:app", "--timeout", "300"]
6880
networks:
6981
- headstart
@@ -78,6 +90,15 @@ services:
7890
REDIS_PORT: "${REDIS_PORT}"
7991
REDIS_DB: "${REDIS_DB}"
8092
REDIS_PASSWORD: "${REDIS_PASSWORD}"
93+
REDIS_SSL: "${REDIS_SSL}"
94+
LOGLEVEL: "${LOGLEVEL}"
95+
TRIPLE_USER: "${TRIPLE_USER}"
96+
TRIPLE_PASS: "${TRIPLE_PASS}"
97+
TRIPLE_HOST: "${TRIPLE_HOST}"
98+
TRIPLE_PORT: "${TRIPLE_PORT}"
99+
TRIPLE_DOCUMENTS_INDEX: "${TRIPLE_DOCUMENTS_INDEX}"
100+
TRIPLE_PROJECTS_INDEX: "${TRIPLE_PROJECTS_INDEX}"
101+
TRIPLE_AUTHORS_INDEX: "${TRIPLE_AUTHORS_INDEX}"
81102
restart: always
82103
depends_on:
83104
- redis
@@ -94,6 +115,8 @@ services:
94115
REDIS_PORT: "${REDIS_PORT}"
95116
REDIS_DB: "${REDIS_DB}"
96117
REDIS_PASSWORD: "${REDIS_PASSWORD}"
118+
REDIS_SSL: "${REDIS_SSL}"
119+
LOGLEVEL: "${LOGLEVEL}"
97120
restart: always
98121
depends_on:
99122
- redis
@@ -110,6 +133,8 @@ services:
110133
REDIS_PORT: "${REDIS_PORT}"
111134
REDIS_DB: "${REDIS_DB}"
112135
REDIS_PASSWORD: "${REDIS_PASSWORD}"
136+
REDIS_SSL: "${REDIS_SSL}"
137+
LOGLEVEL: "${LOGLEVEL}"
113138
restart: always
114139
volumes:
115140
- /opt/local/renv/cache:/renv/cache
@@ -121,14 +146,21 @@ services:
121146

122147
base:
123148
image: base:${SERVICE_VERSION}
124-
env_file:
125-
- server/workers/base/base.env
126149
environment:
127150
SERVICE_VERSION: "${SERVICE_VERSION}"
128151
REDIS_HOST: "${REDIS_HOST}"
129152
REDIS_PORT: "${REDIS_PORT}"
130153
REDIS_DB: "${REDIS_DB}"
131154
REDIS_PASSWORD: "${REDIS_PASSWORD}"
155+
REDIS_SSL: "${REDIS_SSL}"
156+
LOGLEVEL: "${LOGLEVEL}"
157+
LOGFILE: "/var/log/headstart/headstart.log"
158+
RENV_VERSION: 0.14.0-5
159+
CRAN_REPOS: https://cran.wu.ac.at
160+
LC_ALL: "en_US.UTF-8"
161+
LANG: "en_US.UTF-8"
162+
RENV_PATHS_CACHE: /renv/cache
163+
PYTHONIOENCODING: "utf-8"
132164
restart: always
133165
volumes:
134166
- /opt/local/renv/cache:/renv/cache
@@ -140,14 +172,21 @@ services:
140172

141173
pubmed:
142174
image: pubmed:${SERVICE_VERSION}
143-
env_file:
144-
- server/workers/pubmed/pubmed.env
145175
environment:
146176
SERVICE_VERSION: "${SERVICE_VERSION}"
147177
REDIS_HOST: "${REDIS_HOST}"
148178
REDIS_PORT: "${REDIS_PORT}"
149179
REDIS_DB: "${REDIS_DB}"
150180
REDIS_PASSWORD: "${REDIS_PASSWORD}"
181+
REDIS_SSL: "${REDIS_SSL}"
182+
LOGLEVEL: "${LOGLEVEL}"
183+
LOGFILE: "/var/log/headstart/headstart.log"
184+
RENV_VERSION: 0.14.0-5
185+
CRAN_REPOS: https://cran.wu.ac.at
186+
LC_ALL: "en_US.UTF-8"
187+
LANG: "en_US.UTF-8"
188+
RENV_PATHS_CACHE: /renv/cache
189+
PYTHONIOENCODING: "utf-8"
151190
restart: always
152191
volumes:
153192
- /opt/local/renv/cache:/renv/cache
@@ -159,14 +198,21 @@ services:
159198

160199
openaire:
161200
image: openaire:${SERVICE_VERSION}
162-
env_file:
163-
- server/workers/openaire/openaire.env
164201
environment:
165202
SERVICE_VERSION: "${SERVICE_VERSION}"
166203
REDIS_HOST: "${REDIS_HOST}"
167204
REDIS_PORT: "${REDIS_PORT}"
168205
REDIS_DB: "${REDIS_DB}"
169206
REDIS_PASSWORD: "${REDIS_PASSWORD}"
207+
REDIS_SSL: "${REDIS_SSL}"
208+
LOGLEVEL: "${LOGLEVEL}"
209+
LOGFILE: "/var/log/headstart/headstart.log"
210+
RENV_VERSION: 0.14.0-5
211+
CRAN_REPOS: https://cran.wu.ac.at
212+
LC_ALL: "en_US.UTF-8"
213+
LANG: "en_US.UTF-8"
214+
RENV_PATHS_CACHE: /renv/cache
215+
PYTHONIOENCODING: "utf-8"
170216
restart: always
171217
volumes:
172218
- /opt/local/renv/cache:/renv/cache

server/preprocessing/other-scripts/base.R

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
library(rbace)
2+
library(stringr)
23

34
# get_papers
45
#
@@ -173,7 +174,7 @@ find_dois <- function(link) {
173174
|| (startsWith(link, "https://doi.org"))
174175
|| (startsWith(link, "http://dx.doi.org"))
175176
|| (startsWith(link, "https://dx.doi.org"))) {
176-
doi <- stringr::str_replace(link, "http:", "https:")
177+
doi <- str_replace(link, "http:", "https:")
177178
} else {
178179
doi <- ""
179180
}

server/preprocessing/other-scripts/cluster.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ create_clusters <- function(distance_matrix, max_clusters=-1, method="ward.D") {
9191
vclog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Number of Clusters:", num_clusters, sep=" "))
9292
vclog$debug(paste("CutOff-Description:", attributes(cut_off)$description))
9393
}
94-
clusters = list("labels"=labels, "cluster"=cluster, "groups"=groups, "num_clusters"=num_clusters)
94+
clusters = list("labels"=labels, "groups"=groups, "num_clusters"=num_clusters)
9595
return(clusters)
9696

9797
}
Lines changed: 15 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,22 @@
1+
library(stringr)
12
vflog <- getLogger('vis.features')
23

3-
create_corpus <- function(metadata, text, languages=c("en")) {
4-
valid <- getStemLanguages()
5-
text["languages"] <- languages
6-
mapping <- list(content = "content", id = "id", languages = "languages")
7-
myReader <- readTabular(mapping = mapping)
4+
TypeCountTokenizer <- function(x) {
5+
unlist(strsplit(as.character(x), "[^[:alnum:]-]"))
6+
}
7+
88

9-
corpus <- Corpus(DataframeSource(text),
10-
readerControl = list(reader = myReader))
9+
create_corpus <- function(metadata, text, stops) {
10+
docs <- data.frame(doc_id = text$id, text = text$content)
11+
corpus <- VCorpus(DataframeSource(docs))
1112

1213
# Replace non-convertible bytes in with strings showing their hex codes,
1314
# see http://tm.r-forge.r-project.org/faq.html
1415
corpus <- tm_map(corpus, content_transformer(function(x) iconv(enc2utf8(x), sub = "byte")))
1516
unlowered <- corpus
1617
corpus <- tm_map(corpus, removePunctuation)
1718
corpus <- tm_map(corpus, content_transformer(tolower))
18-
corpus <- tm_map(corpus, remove_stop_words)
19+
corpus <- tm_map(corpus, removeWords, stops)
1920
corpus <- tm_map(corpus, stripWhitespace)
2021
unstemmed <- corpus
2122
stemmed <- tm_map(corpus, stemDocument)
@@ -41,22 +42,12 @@ get_distance_matrix <- function(tdm_matrix, method = "cosine") {
4142
return(distance_matrix)
4243
}
4344

45+
get_type_counts <- function(corpus) {
46+
type_counts = apply(TermDocumentMatrix(corpus, control=list(tokenize=TypeCountTokenizer, tolower = FALSE)), 1, sum)
47+
return(type_counts)
48+
}
49+
4450
concatenate_features <- function(...) {
4551
# expects a list of feature matrices which can be extended horizontally
4652
return(cbind(...))
47-
}
48-
49-
remove_stop_words <- function(x, languages) UseMethod("remove_stop_words", x)
50-
remove_stop_words.character <- function(x, languages) {
51-
y <- unlist(strsplit(x, " "))
52-
stops = list()
53-
for (lang in languages) {
54-
stops <- c(stops, get_stopwords(lang, TESTING))
55-
}
56-
stopword <- unlist(lapply(y, function(z) z %in% stops))
57-
doc <- y[which(!stopword)]
58-
doc <- paste(doc, collapse = " ")
59-
}
60-
remove_stop_words.PlainTextDocument <- function(x, languages = meta(x, "languages")) {
61-
content_transformer(remove_stop_words.character)(x, languages)
62-
}
53+
}

server/preprocessing/other-scripts/postprocess.R

Lines changed: 3 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -5,44 +5,23 @@ create_overview_output <- function(named_clusters, layout, metadata) {
55
x = layout$X1
66
y = layout$X2
77
labels = named_clusters$labels
8-
cluster = named_clusters$cluster
98
num_clusters = named_clusters$num_clusters
109
cluster_labels = named_clusters$cluster_labels
1110

1211
# Prepare the output
13-
result = data.frame(cbind(x, y, labels, cluster_labels))
12+
result = data.frame(cbind(x, y, labels, cluster_labels, metadata$id))
13+
names(result)[5] <- "id"
1414
unique_groups = data.frame(unique(result$cluster_labels))
1515
colnames(unique_groups) <- "cluster_labels"
1616
unique_groups$groups <- seq_along(unique_groups$cluster_labels)
1717
result = merge(result, unique_groups, by='cluster_labels')
18-
output = merge(metadata, result, by.x="id", by.y="labels", all=TRUE)
18+
output = merge(metadata, result, by.x="id", by.y="id", all=TRUE)
1919

2020
names(output)[names(output)=="groups"] <- "area_uri"
2121
output["area"] = paste(output$cluster_labels, sep="")
2222

2323
output_json = toJSON(output)
2424

25-
if(exists("DEBUG") && DEBUG == TRUE) {
26-
library(ggplot2)
27-
# Plot results from multidimensional scaling, highlight clusters with symbols
28-
temp <- fromJSON(output_json)
29-
temp$x <- as.numeric(temp$x)
30-
temp$y <- as.numeric(temp$y)
31-
temp$title <- unlist(lapply(temp$title, substr, start=0, stop=15))
32-
g <- ggplot(temp, aes(x, y, label=title)) +
33-
geom_point(aes(colour=area_uri)) +
34-
geom_text(size=2)
35-
ggsave(file = "debug_nmds.svg", plot = g, width = 15, height = 15)
36-
}
37-
38-
# NEEDS FIX
39-
# if(exists("DEBUG") && DEBUG == TRUE) {
40-
# # Write output to file
41-
# file_handle = file("output_file.csv", open="w")
42-
# write.csv(output, file=file_handle, row.names=FALSE)
43-
# close(file_handle)
44-
# }
45-
4625
return(output_json)
4726

4827
}

server/preprocessing/other-scripts/preprocess.R

Lines changed: 23 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -85,44 +85,39 @@ deduplicate_titles <- function(metadata, list_size) {
8585

8686
}
8787

88-
replace_keywords_if_empty <- function(metadata, stops, service) {
88+
replace_keywords_if_empty <- function(metadata, stops) {
8989
metadata$subject <- unlist(lapply(metadata$subject, function(x) {gsub(" +", " ", x)}))
9090
missing_subjects = which(lapply(metadata$subject, function(x) {nchar(x)}) <= 1)
91-
vplog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Documents without subjects:", length(missing_subjects)))
92-
if (service == "linkedcat" || service == "linkedcat_authorview" || service == "linkedcat_browseview") {
93-
metadata$subject[missing_subjects] <- metadata$bkl_caption[missing_subjects]
94-
metadata$subject[is.na(metadata$subject)] <- ""
95-
} else {
96-
candidates = mapply(paste, metadata$title)
97-
candidates = mclapply(candidates, function(x)paste(removeWords(x, stops), collapse=""))
98-
candidates = lapply(candidates, function(x) {gsub("[^[:alpha:]]", " ", x)})
99-
candidates = lapply(candidates, function(x) {gsub(" +", " ", x)})
100-
candidates_bigrams = lapply(lapply(candidates, function(x)unlist(lapply(ngrams(unlist(strsplit(x, split=" ")), 2), paste, collapse="_"))), paste, collapse=" ")
101-
#candidates_trigrams = lapply(lapply(candidates, function(x)unlist(lapply(ngrams(unlist(strsplit(x, split=" ")), 3), paste, collapse="_"))), paste, collapse=" ")
102-
candidates = mapply(paste, candidates, candidates_bigrams)
103-
#candidates = lapply(candidates, function(x) {gsub('\\b\\d+\\s','', x)})
104-
105-
nn_corpus = Corpus(VectorSource(candidates))
106-
nn_tfidf = TermDocumentMatrix(nn_corpus, control = list(tokenize = SplitTokenizer, weighting = function(x) weightSMART(x, spec="ntn")))
107-
tfidf_top = apply(nn_tfidf, 2, function(x) {x2 <- sort(x, TRUE);x2[x2>=x2[3]]})
108-
tfidf_top_names = lapply(tfidf_top, names)
109-
replacement_keywords <- mclapply(tfidf_top_names, function(x) filter_out_nested_ngrams(x, 3))
110-
replacement_keywords = lapply(replacement_keywords, FUN = function(x) {paste(unlist(x), collapse="; ")})
111-
replacement_keywords = gsub("_", " ", replacement_keywords)
112-
113-
metadata$subject[missing_subjects] <- replacement_keywords[missing_subjects]
91+
if (length(missing_subjects) == 0) {
92+
return(metadata)
11493
}
94+
vplog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Documents without subjects:", length(missing_subjects)))
95+
candidates = mapply(paste, metadata$title)
96+
candidates = mclapply(candidates, function(x)paste(removeWords(x, stops), collapse=""))
97+
candidates = lapply(candidates, function(x) {gsub("[^[:alpha:]]", " ", x)})
98+
candidates = lapply(candidates, function(x) {gsub(" +", " ", x)})
99+
candidates_bigrams = lapply(lapply(candidates, expand_ngrams, n=2), paste, collapse=" ")
100+
candidates = mapply(paste, candidates, candidates_bigrams)
101+
102+
nn_corpus = Corpus(VectorSource(candidates))
103+
nn_tfidf = TermDocumentMatrix(nn_corpus)
104+
tfidf_top = apply(nn_tfidf, 2, function(x) {x2 <- sort(x, TRUE);x2[x2>=x2[3]]})
105+
tfidf_top_names = lapply(tfidf_top, names)
106+
replacement_keywords <- mclapply(tfidf_top_names, function(x) filter_out_nested_ngrams(x, 3))
107+
replacement_keywords = lapply(replacement_keywords, FUN = function(x) {paste(unlist(x), collapse="; ")})
108+
replacement_keywords = gsub("_", " ", replacement_keywords)
109+
110+
metadata$subject[missing_subjects] <- replacement_keywords[missing_subjects]
115111
missing_subjects = which(lapply(metadata$subject, function(x) {nchar(x)}) <= 1)
112+
vplog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Documents without subjects after replacing from title:", length(missing_subjects)))
116113
if (length(missing_subjects) > 0) {
117-
for (i in missing_subjects) {
114+
foreach (i = missing_subjects) %dopar% {
118115
candidates = mapply(paste, metadata$title[i], metadata$paper_abstract[i])
119116
candidates = lapply(candidates, function(x)paste(removeWords(x, stops), collapse=""))
120117
candidates = lapply(candidates, function(x) {gsub("[^[:alpha:]]", " ", x)})
121118
candidates = lapply(candidates, function(x) {gsub(" +", " ", x)})
122-
candidates_bigrams = lapply(lapply(candidates, function(x)unlist(lapply(ngrams(unlist(strsplit(x, split=" ")), 2), paste, collapse="_"))), paste, collapse=" ")
123-
#candidates_trigrams = lapply(lapply(candidates, function(x)unlist(lapply(ngrams(unlist(strsplit(x, split=" ")), 3), paste, collapse="_"))), paste, collapse=" ")
119+
candidates_bigrams = lapply(lapply(candidates, expand_ngrams, n=2), paste, collapse=" ")
124120
candidates = mapply(paste, candidates, candidates_bigrams)
125-
#candidates = lapply(candidates, function(x) {gsub('\\b\\d+\\s','', x)})
126121
nn_count = sort(table(strsplit(candidates, " ")), decreasing = T)
127122
replacement_keywords <- filter_out_nested_ngrams(names(nn_count), 3)
128123
replacement_keywords = lapply(replacement_keywords, FUN = function(x) {paste(unlist(x), collapse="; ")})

0 commit comments

Comments (0)