Skip to content

Commit 5ac3366

Browse files
authored
Merge pull request #745 from OpenKnowledgeMaps/dev
Dev
2 parents 7cb2b6a + 95643a2 commit 5ac3366

18 files changed

Lines changed: 290 additions & 213 deletions

File tree

package-lock.json

Lines changed: 193 additions & 176 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

server/preprocessing/other-scripts/base.R

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -188,19 +188,34 @@ etl <- function(res, repo, non_public) {
188188
subject_cleaned = gsub("[^\\(;]+\\(all\\)(;|$)?", "", subject_cleaned) # remove general subjects
189189
subject_cleaned = gsub("[^:;]+ ?:: ?[^;]+(;|$)?", "", subject_cleaned) #remove classification with separator ::
190190
subject_cleaned = gsub("[^\\[;]+\\[[A-Z,0-9]+\\](;|$)?", "", subject_cleaned) # remove WHO classification
191-
subject_cleaned = gsub("([A-Za-z]+:[A-Za-z0-9][A-Za-z0-9 \\/\\.]+);?", "", subject_cleaned, perl=TRUE) # clean up annotations with prefix e.g. theme:annotation
191+
subject_cleaned = gsub("Info:\\w+-(\\w+\\/)+", "", subject_cleaned) # remove Info:eu-repo/classification/
192+
subject_cleaned = gsub("([A-Za-z]+:[A-Za-z0-9 \\/\\.-]+);?", "", subject_cleaned, perl=TRUE) # clean up annotations with prefix e.g. theme:annotation
193+
if (!is.null(params$vis_type) && params$vis_type == "timeline") {
194+
subject_cleaned = gsub("FOS ", "", subject_cleaned) # remove FOS classification tag, but keep classification name
195+
arxiv_classification_string = "(cs|econ|eess|math|astro-ph|nlin|q-bio|q-fin|stat)\\.[A-Z]{2}|cond-mat\\.[a-z\\-]+|hep-(ex|lat|ph|th)|math-ph|nucl-(ex|th)|physics\\.[a-z\\-]+|(astro-ph|gr-qc|quant-ph|cond-mat)"
196+
subject_cleaned = gsub(arxiv_classification_string, "", subject_cleaned, perl=TRUE) # remove arXiv classification short code, but keep classification name
197+
} else {
198+
subject_cleaned = gsub("FOS [A-Za-z ]+", "", subject_cleaned) # remove FOS classifications (Fields of Science and Technology)
199+
arxiv_classification_string = "(([A-Za-z ]+ )?cond-mat\\.[a-z\\-]+)|([\\w ]+ )?(cs|econ|eess|math|astro-ph|nlin|q-bio|q-fin|stat)\\.[A-Z]{2}|cond-mat\\.[a-z\\-]+|hep-(ex|lat|ph|th)|math-ph|nucl-(ex|th)|physics\\.[a-z\\-]+|([\\w ]+ )(astro-ph|gr-qc|quant-ph|cond-mat)"
200+
subject_cleaned = gsub(arxiv_classification_string, "", subject_cleaned, perl=TRUE) # remove arXiv classification, except on streamgraphs
201+
}
202+
subject_cleaned = gsub("([A-Za-z]+:[A-Za-z0-9 \\/\\.]+);?", "", subject_cleaned, perl=TRUE) # clean up annotations with prefix e.g. theme:annotation
192203
subject_cleaned = gsub("(wikidata)?\\.org/entity/[qQ]([\\d]+)?", "", subject_cleaned) # remove wikidata classification
193204
subject_cleaned = gsub("</keyword><keyword>", "", subject_cleaned) # remove </keyword><keyword>
194205
subject_cleaned = gsub("\\[No keyword\\]", "", subject_cleaned)
195206
subject_cleaned = gsub("\\[[^\\[]+\\][^\\;]+(;|$)?", "", subject_cleaned) # remove classification
196207
subject_cleaned = gsub("[0-9]{2,} [A-Z]+[^;]*(;|$)?", "", subject_cleaned) #remove classification
197208
subject_cleaned = gsub(" -- ", "; ", subject_cleaned) #replace inconsistent keyword separation
209+
subject_cleaned = gsub("[-]{2,}", "; ", subject_cleaned) #replace inconsistent keyword separation
210+
subject_cleaned = gsub("[A-Z]\\.\\d\\.\\d+", "", subject_cleaned) # remove classification codes like H.3.1
198211
subject_cleaned = gsub(" \\( ", "; ", subject_cleaned) #replace inconsistent keyword separation
199212
subject_cleaned = gsub("(\\w* \\w*(\\.)( \\w* \\w*)?)", "; ", subject_cleaned) # remove overly broad keywords separated by .
200213
subject_cleaned = gsub("\\. ", "; ", subject_cleaned) # replace inconsistent keyword separation
201214
subject_cleaned = gsub(" ?\\d[:?-?]?(\\d+.)+", "", subject_cleaned) # replace residuals like 5:621.313.323 or '5-76.95'
202-
subject_cleaned = gsub("\\w+:\\w+-(\\w+\\/)+", "", subject_cleaned) # replace residuals like Info:eu-repo/classification/
215+
subject_cleaned = gsub(": ", "", subject_cleaned) # clean up keyword separation
203216
subject_cleaned = gsub("^; $", "", subject_cleaned) # clean up keyword separation
217+
subject_cleaned = gsub(";+", ";", subject_cleaned) # clean up keyword separation
218+
subject_cleaned = gsub(",+", ",", subject_cleaned) # clean up keyword separation
204219
subject_cleaned = gsub(",", ", ", subject_cleaned) # clean up keyword separation
205220
subject_cleaned = gsub("\\s+", " ", subject_cleaned) # clean up keyword separation
206221
subject_cleaned = stringi::stri_trim(subject_cleaned) # clean up keyword separation

server/preprocessing/other-scripts/features.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ create_corpus <- function(metadata, text, stops) {
1919
batch_size <- 1000
2020
total_length <- length(stops)
2121
for (i in seq(1, total_length, batch_size)) {
22-
corpus <- tm_map(corpus, removeWords, stops[i:min(i+batch_size -1, total_length)])
22+
try(corpus <- tm_map(corpus, removeWords, stops[i:min(i+batch_size -1, total_length)]))
2323
}
2424
corpus <- tm_map(corpus, stripWhitespace)
2525
unstemmed <- corpus

server/preprocessing/other-scripts/summarize.R

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,9 @@ create_cluster_labels <- function(clusters, metadata,
9797

9898
fix_cluster_labels <- function(clusterlabels, type_counts){
9999
unlist(mclapply(clusterlabels, function(x) {
100-
fix_keyword_casing(x, type_counts)
100+
x <- fix_keyword_casing(x, type_counts)
101+
# clean up titles from format issues
102+
x <- gsub(",+", ",", x)
101103
}))
102104
}
103105

@@ -179,7 +181,7 @@ another_prune_ngrams <- function(ngrams, stops){
179181
# check if first token of ngrams in stopword list
180182
batch_size <- 1000
181183
total_length <- length(stops)
182-
for (i in seq(1, total_length, batch_size)) {
184+
for (i in seq(1, total_length, batch_size)) try({
183185
tokens = lapply(tokens, function(y){
184186
Filter(function(x){
185187
if (x[1] != "") !any(stri_detect_fixed(stops[i:min(i+batch_size -1, total_length)], x[1]))
@@ -189,7 +191,7 @@ another_prune_ngrams <- function(ngrams, stops){
189191
Filter(function(x){
190192
if (tail(x,1) != "") !any(stri_detect_fixed(stops[i:min(i+batch_size -1, total_length)], tail(x,1)))
191193
}, y)})
192-
}
194+
})
193195
# check that first token is not the same as the last token
194196
tokens = lapply(tokens, function(y){
195197
if(length(y) > 1) {

server/preprocessing/other-scripts/test/params_base.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"document_types":["121"],
33
"from":"1665-01-01",
4-
"to":"2023-07-21",
4+
"to":"2023-09-25",
55
"sorting":"most-relevant",
66
"vis_id": "TEST_ID",
77
"min_descsize": 300,

server/preprocessing/other-scripts/test/test_base.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ options(warn=1)
77
wd <- dirname(dirname(rstudioapi::getActiveDocumentContext()$path))
88
setwd(wd) #Don't forget to set your working directory
99

10-
query <- 'test' #args[2]
10+
query <- 'cond-mat.mtrl-sci' #args[2]
1111
service <- "base"
1212
params <- NULL
1313
params_file <- "test/params_base.json"

server/services/searchBASE.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
$precomputed_id = (isset($_POST["unique_id"]))?($_POST["unique_id"]):(null);
1212

1313
$params_array = array("from", "to", "document_types", "sorting", "min_descsize");
14-
$optional_get_params = ["repo", "coll", "vis_type", "q_advanced", "lang_id"];
14+
$optional_get_params = ["repo", "coll", "vis_type", "q_advanced", "lang_id", "custom_title"];
1515

1616
function filterEmptyString($value)
1717
{

server/workers/api/requirements.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
flask
22
flask-cors
3-
flask_sqlalchemy
43
flask_restx
54
Werkzeug
65
marshmallow

server/workers/api/src/apis/request_validators.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ class SearchParamSchema(Schema):
3232
repo_name = fields.Str()
3333
coll = fields.Str()
3434
list_size = fields.Int()
35+
custom_title = fields.Str()
3536

3637

3738
@pre_load

server/workers/build_docker_images.sh

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
#!/bin/bash
22
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
3-
services=("api" "persistence" "gsheets" "dataprocessing" "base" "pubmed" "openaire")
3+
services=("api" "persistence" "dataprocessing" "base" "pubmed" "openaire")
44
for service in ${services[@]}; do
5+
echo ""
6+
echo "Building $service"
7+
echo ""
58
docker build -f "$SCRIPT_DIR/../workers/$service/Dockerfile" -t "$service:`git rev-parse HEAD`" "$SCRIPT_DIR/../"
69
done
7-

0 commit comments

Comments
 (0)