Skip to content

Commit 86cb34d

Browse files
authored
Merge pull request #743 from OpenKnowledgeMaps/classification-cleanup
Classification cleanup
2 parents bf52d18 + 9cf32bb commit 86cb34d

11 files changed

Lines changed: 42 additions & 18 deletions

File tree

server/preprocessing/other-scripts/base.R

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -188,19 +188,34 @@ etl <- function(res, repo, non_public) {
188188
subject_cleaned = gsub("[^\\(;]+\\(all\\)(;|$)?", "", subject_cleaned) # remove general subjects
189189
subject_cleaned = gsub("[^:;]+ ?:: ?[^;]+(;|$)?", "", subject_cleaned) #remove classification with separator ::
190190
subject_cleaned = gsub("[^\\[;]+\\[[A-Z,0-9]+\\](;|$)?", "", subject_cleaned) # remove WHO classification
191-
subject_cleaned = gsub("([A-Za-z]+:[A-Za-z0-9][A-Za-z0-9 \\/\\.]+);?", "", subject_cleaned, perl=TRUE) # clean up annotations with prefix e.g. theme:annotation
191+
subject_cleaned = gsub("Info:\\w+-(\\w+\\/)+", "", subject_cleaned) # remove Info:eu-repo/classification/
192+
subject_cleaned = gsub("([A-Za-z]+:[A-Za-z0-9 \\/\\.-]+);?", "", subject_cleaned, perl=TRUE) # clean up annotations with prefix e.g. theme:annotation
193+
if (!is.null(params$vis_type) && params$vis_type == "timeline") {
194+
subject_cleaned = gsub("FOS ", "", subject_cleaned) # remove FOS classification tag, but keep classification name
195+
arxiv_classification_string = "(cs|econ|eess|math|astro-ph|nlin|q-bio|q-fin|stat)\\.[A-Z]{2}|cond-mat\\.[a-z\\-]+|hep-(ex|lat|ph|th)|math-ph|nucl-(ex|th)|physics\\.[a-z\\-]+|(astro-ph|gr-qc|quant-ph|cond-mat)"
196+
subject_cleaned = gsub(arxiv_classification_string, "", subject_cleaned, perl=TRUE) # remove arXiv classification short code, but keep classification name
197+
} else {
198+
subject_cleaned = gsub("FOS [A-Za-z ]+", "", subject_cleaned) # remove FOS classifications (Fields of Science and Technology)
199+
arxiv_classification_string = "(([A-Za-z ]+ )?cond-mat\\.[a-z\\-]+)|([\\w ]+ )?(cs|econ|eess|math|astro-ph|nlin|q-bio|q-fin|stat)\\.[A-Z]{2}|cond-mat\\.[a-z\\-]+|hep-(ex|lat|ph|th)|math-ph|nucl-(ex|th)|physics\\.[a-z\\-]+|([\\w ]+ )(astro-ph|gr-qc|quant-ph|cond-mat)"
200+
subject_cleaned = gsub(arxiv_classification_string, "", subject_cleaned, perl=TRUE) # remove arXiv classification, except on streamgraphs
201+
}
202+
subject_cleaned = gsub("([A-Za-z]+:[A-Za-z0-9 \\/\\.]+);?", "", subject_cleaned, perl=TRUE) # clean up annotations with prefix e.g. theme:annotation
192203
subject_cleaned = gsub("(wikidata)?\\.org/entity/[qQ]([\\d]+)?", "", subject_cleaned) # remove wikidata classification
193204
subject_cleaned = gsub("</keyword><keyword>", "", subject_cleaned) # remove </keyword><keyword>
194205
subject_cleaned = gsub("\\[No keyword\\]", "", subject_cleaned)
195206
subject_cleaned = gsub("\\[[^\\[]+\\][^\\;]+(;|$)?", "", subject_cleaned) # remove classification
196207
subject_cleaned = gsub("[0-9]{2,} [A-Z]+[^;]*(;|$)?", "", subject_cleaned) #remove classification
197208
subject_cleaned = gsub(" -- ", "; ", subject_cleaned) #replace inconsistent keyword separation
209+
subject_cleaned = gsub("[-]{2,}", "; ", subject_cleaned) #replace inconsistent keyword separation
210+
subject_cleaned = gsub("[A-Z]\\.\\d\\.\\d+", "", subject_cleaned) # remove classification codes like H.3.3
198211
subject_cleaned = gsub(" \\( ", "; ", subject_cleaned) #replace inconsistent keyword separation
199212
subject_cleaned = gsub("(\\w* \\w*(\\.)( \\w* \\w*)?)", "; ", subject_cleaned) # remove overly broad keywords separated by .
200213
subject_cleaned = gsub("\\. ", "; ", subject_cleaned) # replace inconsistent keyword separation
201214
subject_cleaned = gsub(" ?\\d[:?-?]?(\\d+.)+", "", subject_cleaned) # replace residuals like 5:621.313.323 or '5-76.95'
202-
subject_cleaned = gsub("\\w+:\\w+-(\\w+\\/)+", "", subject_cleaned) # replace residuals like Info:eu-repo/classification/
215+
subject_cleaned = gsub(": ", "", subject_cleaned) # clean up keyword separation
203216
subject_cleaned = gsub("^; $", "", subject_cleaned) # clean up keyword separation
217+
subject_cleaned = gsub(";+", ";", subject_cleaned) # clean up keyword separation
218+
subject_cleaned = gsub(",+", ",", subject_cleaned) # clean up keyword separation
204219
subject_cleaned = gsub(",", ", ", subject_cleaned) # clean up keyword separation
205220
subject_cleaned = gsub("\\s+", " ", subject_cleaned) # clean up keyword separation
206221
subject_cleaned = stringi::stri_trim(subject_cleaned) # clean up keyword separation

server/preprocessing/other-scripts/features.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ create_corpus <- function(metadata, text, stops) {
1919
batch_size <- 1000
2020
total_length <- length(stops)
2121
for (i in seq(1, total_length, batch_size)) {
22-
corpus <- tm_map(corpus, removeWords, stops[i:min(i+batch_size -1, total_length)])
22+
try(corpus <- tm_map(corpus, removeWords, stops[i:min(i+batch_size -1, total_length)]))
2323
}
2424
corpus <- tm_map(corpus, stripWhitespace)
2525
unstemmed <- corpus

server/preprocessing/other-scripts/summarize.R

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,9 @@ create_cluster_labels <- function(clusters, metadata,
9797

9898
fix_cluster_labels <- function(clusterlabels, type_counts){
9999
unlist(mclapply(clusterlabels, function(x) {
100-
fix_keyword_casing(x, type_counts)
100+
x <- fix_keyword_casing(x, type_counts)
101+
# clean up titles from format issues
102+
x <- gsub(",+", ",", x)
101103
}))
102104
}
103105

@@ -179,7 +181,7 @@ another_prune_ngrams <- function(ngrams, stops){
179181
# check if first token of ngrams in stopword list
180182
batch_size <- 1000
181183
total_length <- length(stops)
182-
for (i in seq(1, total_length, batch_size)) {
184+
for (i in seq(1, total_length, batch_size)) try({
183185
tokens = lapply(tokens, function(y){
184186
Filter(function(x){
185187
if (x[1] != "") !any(stri_detect_fixed(stops[i:min(i+batch_size -1, total_length)], x[1]))
@@ -189,7 +191,7 @@ another_prune_ngrams <- function(ngrams, stops){
189191
Filter(function(x){
190192
if (tail(x,1) != "") !any(stri_detect_fixed(stops[i:min(i+batch_size -1, total_length)], tail(x,1)))
191193
}, y)})
192-
}
194+
})
193195
# check that first token is not the same as the last token
194196
tokens = lapply(tokens, function(y){
195197
if(length(y) > 1) {

server/preprocessing/other-scripts/test/params_base.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"document_types":["121"],
33
"from":"1665-01-01",
4-
"to":"2023-07-21",
4+
"to":"2023-09-25",
55
"sorting":"most-relevant",
66
"vis_id": "TEST_ID",
77
"min_descsize": 300,

server/preprocessing/other-scripts/test/test_base.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ options(warn=1)
77
wd <- dirname(dirname(rstudioapi::getActiveDocumentContext()$path))
88
setwd(wd) #Don't forget to set your working directory
99

10-
query <- 'test' #args[2]
10+
query <- 'cond-mat.mtrl-sci' #args[2]
1111
service <- "base"
1212
params <- NULL
1313
params_file <- "test/params_base.json"

server/services/searchOpenAire.php

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
use headstart\library;
99

1010
$post_params = $_POST;
11+
$precomputed_id = (isset($_POST["unique_id"]))?($_POST["unique_id"]):(null);
1112

1213
if (array_key_exists("acronymtitle", $_POST)) {
1314
$q = library\CommUtils::getParameter($_POST, "acronymtitle");
@@ -41,7 +42,7 @@
4142
, $post_params, $param_array
4243
, false
4344
, true, $id_array
44-
, null, true);
45+
, $precomputed_id, true);
4546

4647
echo $result
4748

server/workers/api/requirements.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
flask
22
flask-cors
3-
flask_sqlalchemy
43
flask_restx
54
Werkzeug
65
marshmallow

server/workers/build_docker_images.sh

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
#!/bin/bash
22
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
3-
services=("api" "persistence" "gsheets" "dataprocessing" "base" "pubmed" "openaire")
3+
services=("api" "persistence" "dataprocessing" "base" "pubmed" "openaire")
44
for service in ${services[@]}; do
5+
echo ""
6+
echo "Building $service"
7+
echo ""
58
docker build -f "$SCRIPT_DIR/../workers/$service/Dockerfile" -t "$service:`git rev-parse HEAD`" "$SCRIPT_DIR/../"
69
done
7-

server/workers/dataprocessing/src/headstart.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,8 +92,6 @@ def run(self):
9292
self.logger.error("Could not connect to remote Redis server, is the SSH tunnel open?")
9393
try:
9494
if params.get('vis_type') == "timeline":
95-
# the step of create_map can be dropped once deduplication is possible in API backend as well
96-
# TODO: create deduplicate endpoint in service worker and connect to that
9795
metadata = self.execute_search(params, input_data)
9896
sg_data = sg.get_streamgraph_data(json.loads(metadata),
9997
params.get('q'),

server/workers/dataprocessing/src/streamgraph.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,12 @@ def __init__(self, loglevel="INFO"):
2626

2727
def tokenize(self, s):
2828
#return re.split("; | - |, |: ", s)
29-
s = re.sub(r"[\(\)]", "", s)
30-
return re.split("; ", s)
29+
t = re.sub(r"[\(\)]", "", s)
30+
t = re.split("; ", t)
31+
t = [s for s in t]
32+
t = [s.replace(";", "") for s in t]
33+
t = [s.strip() for s in t]
34+
return t
3135

3236
def get_streamgraph_data(self, metadata, query, n=12, method="count"):
3337
metadata = pd.DataFrame.from_records(metadata)
@@ -36,7 +40,7 @@ def get_streamgraph_data(self, metadata, query, n=12, method="count"):
3640
df.dropna(axis=0, subset=["year"], inplace=True)
3741
df.year = pd.to_datetime(df.year.map(lambda x: x.replace(month=1, day=1).strftime('%Y-%m-%d')))
3842
df = df[df.subject.map(lambda x: x is not None)]
39-
df.subject = df.subject.map(lambda x: [s.lower() for s in self.tokenize(x)] if isinstance(x, str) else "")
43+
df.subject = df.subject.map(lambda x: self.tokenize(x.lower()) if isinstance(x, str) else [])
4044
df = df[df.subject.map(lambda x: x != [])]
4145
df["boundary_label"] = df.year
4246
df = df.explode('subject')
@@ -177,6 +181,8 @@ def reduce_daterange(self, daterange, df):
177181
# 5% which is chosen here is an arbitrary value, could also be higher 10% or lower
178182
min_value = int(yearly_sums.sum() * 0.05)
179183
start_index = yearly_sums_cum[yearly_sums_cum > min_value].index[0]
184+
self.logger.debug(f"Start index: {start_index}")
185+
self.logger.debug(f"Start year: {x[start_index]}")
180186
df.y = df.y.map(lambda x: x[start_index:])
181187
df.ids_timestep = df.ids_timestep.map(lambda x: x[start_index:])
182188
x = x[start_index:]

0 commit comments

Comments
 (0)