Skip to content

Commit 86cb34d

Browse files
authored
Merge pull request #743 from OpenKnowledgeMaps/classification-cleanup
Classification cleanup
2 parents bf52d18 + 9cf32bb commit 86cb34d

11 files changed

Lines changed: 42 additions & 18 deletions

File tree

server/preprocessing/other-scripts/base.R

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -188,19 +188,34 @@ etl <- function(res, repo, non_public) {
188188
subject_cleaned = gsub("[^\\(;]+\\(all\\)(;|$)?", "", subject_cleaned) # remove general subjects
189189
subject_cleaned = gsub("[^:;]+ ?:: ?[^;]+(;|$)?", "", subject_cleaned) #remove classification with separator ::
190190
subject_cleaned = gsub("[^\\[;]+\\[[A-Z,0-9]+\\](;|$)?", "", subject_cleaned) # remove WHO classification
191-
subject_cleaned = gsub("([A-Za-z]+:[A-Za-z0-9][A-Za-z0-9 \\/\\.]+);?", "", subject_cleaned, perl=TRUE) # clean up annotations with prefix e.g. theme:annotation
191+
subject_cleaned = gsub("Info:\\w+-(\\w+\\/)+", "", subject_cleaned) # remove Info:eu-repo/classification/
192+
subject_cleaned = gsub("([A-Za-z]+:[A-Za-z0-9 \\/\\.-]+);?", "", subject_cleaned, perl=TRUE) # clean up annotations with prefix e.g. theme:annotation
193+
if (!is.null(params$vis_type) && params$vis_type == "timeline") {
194+
subject_cleaned = gsub("FOS ", "", subject_cleaned) # remove FOS classification tag, but keep classification name
195+
arxiv_classification_string = "(cs|econ|eess|math|astro-ph|nlin|q-bio|q-fin|stat)\\.[A-Z]{2}|cond-mat\\.[a-z\\-]+|hep-(ex|lat|ph|th)|math-ph|nucl-(ex|th)|physics\\.[a-z\\-]+|(astro-ph|gr-qc|quant-ph|cond-mat)"
196+
subject_cleaned = gsub(arxiv_classification_string, "", subject_cleaned, perl=TRUE) # remove arXiv classification short code, but keep classification name
197+
} else {
198+
subject_cleaned = gsub("FOS [A-Za-z ]+", "", subject_cleaned) # remove FOS classifications (Fields of Science and Technology)
199+
arxiv_classification_string = "(([A-Za-z ]+ )?cond-mat\\.[a-z\\-]+)|([\\w ]+ )?(cs|econ|eess|math|astro-ph|nlin|q-bio|q-fin|stat)\\.[A-Z]{2}|cond-mat\\.[a-z\\-]+|hep-(ex|lat|ph|th)|math-ph|nucl-(ex|th)|physics\\.[a-z\\-]+|([\\w ]+ )(astro-ph|gr-qc|quant-ph|cond-mat)"
200+
subject_cleaned = gsub(arxiv_classification_string, "", subject_cleaned, perl=TRUE) # remove arXiv classification, except on streamgraphs
201+
}
202+
subject_cleaned = gsub("([A-Za-z]+:[A-Za-z0-9 \\/\\.]+);?", "", subject_cleaned, perl=TRUE) # clean up annotations with prefix e.g. theme:annotation
192203
subject_cleaned = gsub("(wikidata)?\\.org/entity/[qQ]([\\d]+)?", "", subject_cleaned) # remove wikidata classification
193204
subject_cleaned = gsub("</keyword><keyword>", "", subject_cleaned) # remove </keyword><keyword>
194205
subject_cleaned = gsub("\\[No keyword\\]", "", subject_cleaned)
195206
subject_cleaned = gsub("\\[[^\\[]+\\][^\\;]+(;|$)?", "", subject_cleaned) # remove classification
196207
subject_cleaned = gsub("[0-9]{2,} [A-Z]+[^;]*(;|$)?", "", subject_cleaned) #remove classification
197208
subject_cleaned = gsub(" -- ", "; ", subject_cleaned) #replace inconsistent keyword separation
209+
subject_cleaned = gsub("[-]{2,}", "; ", subject_cleaned) #replace inconsistent keyword separation
210+
subject_cleaned = gsub("[A-Z]\\.\\d\\.\\d+", "", subject_cleaned) # remove classification codes like H.3.3
198211
subject_cleaned = gsub(" \\( ", "; ", subject_cleaned) #replace inconsistent keyword separation
199212
subject_cleaned = gsub("(\\w* \\w*(\\.)( \\w* \\w*)?)", "; ", subject_cleaned) # remove overly broad keywords separated by .
200213
subject_cleaned = gsub("\\. ", "; ", subject_cleaned) # replace inconsistent keyword separation
201214
subject_cleaned = gsub(" ?\\d[:?-?]?(\\d+.)+", "", subject_cleaned) # replace residuals like 5:621.313.323 or '5-76.95'
202-
subject_cleaned = gsub("\\w+:\\w+-(\\w+\\/)+", "", subject_cleaned) # replace residuals like Info:eu-repo/classification/
215+
subject_cleaned = gsub(": ", "", subject_cleaned) # clean up keyword separation
203216
subject_cleaned = gsub("^; $", "", subject_cleaned) # clean up keyword separation
217+
subject_cleaned = gsub(";+", ";", subject_cleaned) # clean up keyword separation
218+
subject_cleaned = gsub(",+", ",", subject_cleaned) # clean up keyword separation
204219
subject_cleaned = gsub(",", ", ", subject_cleaned) # clean up keyword separation
205220
subject_cleaned = gsub("\\s+", " ", subject_cleaned) # clean up keyword separation
206221
subject_cleaned = stringi::stri_trim(subject_cleaned) # clean up keyword separation

server/preprocessing/other-scripts/features.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ create_corpus <- function(metadata, text, stops) {
1919
batch_size <- 1000
2020
total_length <- length(stops)
2121
for (i in seq(1, total_length, batch_size)) {
22-
corpus <- tm_map(corpus, removeWords, stops[i:min(i+batch_size -1, total_length)])
22+
try(corpus <- tm_map(corpus, removeWords, stops[i:min(i+batch_size -1, total_length)]))
2323
}
2424
corpus <- tm_map(corpus, stripWhitespace)
2525
unstemmed <- corpus

server/preprocessing/other-scripts/summarize.R

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,9 @@ create_cluster_labels <- function(clusters, metadata,
9797

9898
fix_cluster_labels <- function(clusterlabels, type_counts){
9999
unlist(mclapply(clusterlabels, function(x) {
100-
fix_keyword_casing(x, type_counts)
100+
x <- fix_keyword_casing(x, type_counts)
101+
# clean up titles from format issues
102+
x <- gsub(",+", ",", x)
101103
}))
102104
}
103105

@@ -179,7 +181,7 @@ another_prune_ngrams <- function(ngrams, stops){
179181
# check if first token of ngrams in stopword list
180182
batch_size <- 1000
181183
total_length <- length(stops)
182-
for (i in seq(1, total_length, batch_size)) {
184+
for (i in seq(1, total_length, batch_size)) try({
183185
tokens = lapply(tokens, function(y){
184186
Filter(function(x){
185187
if (x[1] != "") !any(stri_detect_fixed(stops[i:min(i+batch_size -1, total_length)], x[1]))
@@ -189,7 +191,7 @@ another_prune_ngrams <- function(ngrams, stops){
189191
Filter(function(x){
190192
if (tail(x,1) != "") !any(stri_detect_fixed(stops[i:min(i+batch_size -1, total_length)], tail(x,1)))
191193
}, y)})
192-
}
194+
})
193195
# check that first token is not the same as the last token
194196
tokens = lapply(tokens, function(y){
195197
if(length(y) > 1) {

server/preprocessing/other-scripts/test/params_base.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"document_types":["121"],
33
"from":"1665-01-01",
4-
"to":"2023-07-21",
4+
"to":"2023-09-25",
55
"sorting":"most-relevant",
66
"vis_id": "TEST_ID",
77
"min_descsize": 300,

server/preprocessing/other-scripts/test/test_base.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ options(warn=1)
77
wd <- dirname(dirname(rstudioapi::getActiveDocumentContext()$path))
88
setwd(wd) #Don't forget to set your working directory
99

10-
query <- 'test' #args[2]
10+
query <- 'cond-mat.mtrl-sci' #args[2]
1111
service <- "base"
1212
params <- NULL
1313
params_file <- "test/params_base.json"

server/services/searchOpenAire.php

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
use headstart\library;
99

1010
$post_params = $_POST;
11+
$precomputed_id = (isset($_POST["unique_id"]))?($_POST["unique_id"]):(null);
1112

1213
if (array_key_exists("acronymtitle", $_POST)) {
1314
$q = library\CommUtils::getParameter($_POST, "acronymtitle");
@@ -41,7 +42,7 @@
4142
, $post_params, $param_array
4243
, false
4344
, true, $id_array
44-
, null, true);
45+
, $precomputed_id, true);
4546

4647
echo $result
4748

server/workers/api/requirements.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
flask
22
flask-cors
3-
flask_sqlalchemy
43
flask_restx
54
Werkzeug
65
marshmallow

server/workers/build_docker_images.sh

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
#!/bin/bash
22
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
3-
services=("api" "persistence" "gsheets" "dataprocessing" "base" "pubmed" "openaire")
3+
services=("api" "persistence" "dataprocessing" "base" "pubmed" "openaire")
44
for service in ${services[@]}; do
5+
echo ""
6+
echo "Building $service"
7+
echo ""
58
docker build -f "$SCRIPT_DIR/../workers/$service/Dockerfile" -t "$service:`git rev-parse HEAD`" "$SCRIPT_DIR/../"
69
done
7-

server/workers/dataprocessing/src/headstart.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,8 +92,6 @@ def run(self):
9292
self.logger.error("Could not connect to remote Redis server, is the SSH tunnel open?")
9393
try:
9494
if params.get('vis_type') == "timeline":
95-
# the step of create_map can be dropped once deduplication is possible in API backend as well
96-
# TODO: create deduplicate endpoint in service worker and connect to that
9795
metadata = self.execute_search(params, input_data)
9896
sg_data = sg.get_streamgraph_data(json.loads(metadata),
9997
params.get('q'),

server/workers/dataprocessing/src/streamgraph.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,12 @@ def __init__(self, loglevel="INFO"):
2626

2727
def tokenize(self, s):
2828
#return re.split("; | - |, |: ", s)
29-
s = re.sub(r"[\(\)]", "", s)
30-
return re.split("; ", s)
29+
t = re.sub(r"[\(\)]", "", s)
30+
t = re.split("; ", t)
31+
t = [s for s in t]
32+
t = [s.replace(";", "") for s in t]
33+
t = [s.strip() for s in t]
34+
return t
3135

3236
def get_streamgraph_data(self, metadata, query, n=12, method="count"):
3337
metadata = pd.DataFrame.from_records(metadata)
@@ -36,7 +40,7 @@ def get_streamgraph_data(self, metadata, query, n=12, method="count"):
3640
df.dropna(axis=0, subset=["year"], inplace=True)
3741
df.year = pd.to_datetime(df.year.map(lambda x: x.replace(month=1, day=1).strftime('%Y-%m-%d')))
3842
df = df[df.subject.map(lambda x: x is not None)]
39-
df.subject = df.subject.map(lambda x: [s.lower() for s in self.tokenize(x)] if isinstance(x, str) else "")
43+
df.subject = df.subject.map(lambda x: self.tokenize(x.lower()) if isinstance(x, str) else [])
4044
df = df[df.subject.map(lambda x: x != [])]
4145
df["boundary_label"] = df.year
4246
df = df.explode('subject')
@@ -177,6 +181,8 @@ def reduce_daterange(self, daterange, df):
177181
# 5% which is chosen here is an arbitrary value, could also be higher 10% or lower
178182
min_value = int(yearly_sums.sum() * 0.05)
179183
start_index = yearly_sums_cum[yearly_sums_cum > min_value].index[0]
184+
self.logger.debug(f"Start index: {start_index}")
185+
self.logger.debug(f"Start year: {x[start_index]}")
180186
df.y = df.y.map(lambda x: x[start_index:])
181187
df.ids_timestep = df.ids_timestep.map(lambda x: x[start_index:])
182188
x = x[start_index:]

0 commit comments

Comments
 (0)