Skip to content

Commit 5ac3366

Browse files
authored
Merge pull request #745 from OpenKnowledgeMaps/dev
Dev
2 parents 7cb2b6a + 95643a2 commit 5ac3366

18 files changed

Lines changed: 290 additions & 213 deletions

File tree

package-lock.json

Lines changed: 193 additions & 176 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

server/preprocessing/other-scripts/base.R

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -188,19 +188,34 @@ etl <- function(res, repo, non_public) {
188188
subject_cleaned = gsub("[^\\(;]+\\(all\\)(;|$)?", "", subject_cleaned) # remove general subjects
189189
subject_cleaned = gsub("[^:;]+ ?:: ?[^;]+(;|$)?", "", subject_cleaned) #remove classification with separator ::
190190
subject_cleaned = gsub("[^\\[;]+\\[[A-Z,0-9]+\\](;|$)?", "", subject_cleaned) # remove WHO classification
191-
subject_cleaned = gsub("([A-Za-z]+:[A-Za-z0-9][A-Za-z0-9 \\/\\.]+);?", "", subject_cleaned, perl=TRUE) # clean up annotations with prefix e.g. theme:annotation
191+
subject_cleaned = gsub("Info:\\w+-(\\w+\\/)+", "", subject_cleaned) # remove Info:eu-repo/classification/
192+
subject_cleaned = gsub("([A-Za-z]+:[A-Za-z0-9 \\/\\.-]+);?", "", subject_cleaned, perl=TRUE) # clean up annotations with prefix e.g. theme:annotation
193+
if (!is.null(params$vis_type) && params$vis_type == "timeline") {
194+
subject_cleaned = gsub("FOS ", "", subject_cleaned) # remove FOS classification tag, but keep classification name
195+
arxiv_classification_string = "(cs|econ|eess|math|astro-ph|nlin|q-bio|q-fin|stat)\\.[A-Z]{2}|cond-mat\\.[a-z\\-]+|hep-(ex|lat|ph|th)|math-ph|nucl-(ex|th)|physics\\.[a-z\\-]+|(astro-ph|gr-qc|quant-ph|cond-mat)"
196+
subject_cleaned = gsub(arxiv_classification_string, "", subject_cleaned, perl=TRUE) # remove arXiv classification short code, but keep classification name
197+
} else {
198+
subject_cleaned = gsub("FOS [A-Za-z ]+", "", subject_cleaned) # remove FOS classifications (Fields of Science and Technology)
199+
arxiv_classification_string = "(([A-Za-z ]+ )?cond-mat\\.[a-z\\-]+)|([\\w ]+ )?(cs|econ|eess|math|astro-ph|nlin|q-bio|q-fin|stat)\\.[A-Z]{2}|cond-mat\\.[a-z\\-]+|hep-(ex|lat|ph|th)|math-ph|nucl-(ex|th)|physics\\.[a-z\\-]+|([\\w ]+ )(astro-ph|gr-qc|quant-ph|cond-mat)"
200+
subject_cleaned = gsub(arxiv_classification_string, "", subject_cleaned, perl=TRUE) # remove arXiv classification, except on streamgraphs
201+
}
202+
subject_cleaned = gsub("([A-Za-z]+:[A-Za-z0-9 \\/\\.]+);?", "", subject_cleaned, perl=TRUE) # clean up annotations with prefix e.g. theme:annotation
192203
subject_cleaned = gsub("(wikidata)?\\.org/entity/[qQ]([\\d]+)?", "", subject_cleaned) # remove wikidata classification
193204
subject_cleaned = gsub("</keyword><keyword>", "", subject_cleaned) # remove </keyword><keyword>
194205
subject_cleaned = gsub("\\[No keyword\\]", "", subject_cleaned)
195206
subject_cleaned = gsub("\\[[^\\[]+\\][^\\;]+(;|$)?", "", subject_cleaned) # remove classification
196207
subject_cleaned = gsub("[0-9]{2,} [A-Z]+[^;]*(;|$)?", "", subject_cleaned) #remove classification
197208
subject_cleaned = gsub(" -- ", "; ", subject_cleaned) #replace inconsistent keyword separation
209+
subject_cleaned = gsub("[-]{2,}", "; ", subject_cleaned) #replace inconsistent keyword separation
210+
subject_cleaned = gsub("[A-Z]\\.\\d\\.\\d+", "", subject_cleaned) # remove classification codes like H.3.1
198211
subject_cleaned = gsub(" \\( ", "; ", subject_cleaned) #replace inconsistent keyword separation
199212
subject_cleaned = gsub("(\\w* \\w*(\\.)( \\w* \\w*)?)", "; ", subject_cleaned) # remove overly broad keywords separated by .
200213
subject_cleaned = gsub("\\. ", "; ", subject_cleaned) # replace inconsistent keyword separation
201214
subject_cleaned = gsub(" ?\\d[:?-?]?(\\d+.)+", "", subject_cleaned) # replace residuals like 5:621.313.323 or '5-76.95'
202-
subject_cleaned = gsub("\\w+:\\w+-(\\w+\\/)+", "", subject_cleaned) # replace residuals like Info:eu-repo/classification/
215+
subject_cleaned = gsub(": ", "", subject_cleaned) # clean up keyword separation
203216
subject_cleaned = gsub("^; $", "", subject_cleaned) # clean up keyword separation
217+
subject_cleaned = gsub(";+", ";", subject_cleaned) # clean up keyword separation
218+
subject_cleaned = gsub(",+", ",", subject_cleaned) # clean up keyword separation
204219
subject_cleaned = gsub(",", ", ", subject_cleaned) # clean up keyword separation
205220
subject_cleaned = gsub("\\s+", " ", subject_cleaned) # clean up keyword separation
206221
subject_cleaned = stringi::stri_trim(subject_cleaned) # clean up keyword separation

server/preprocessing/other-scripts/features.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ create_corpus <- function(metadata, text, stops) {
1919
batch_size <- 1000
2020
total_length <- length(stops)
2121
for (i in seq(1, total_length, batch_size)) {
22-
corpus <- tm_map(corpus, removeWords, stops[i:min(i+batch_size -1, total_length)])
22+
try(corpus <- tm_map(corpus, removeWords, stops[i:min(i+batch_size -1, total_length)]))
2323
}
2424
corpus <- tm_map(corpus, stripWhitespace)
2525
unstemmed <- corpus

server/preprocessing/other-scripts/summarize.R

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,9 @@ create_cluster_labels <- function(clusters, metadata,
9797

9898
fix_cluster_labels <- function(clusterlabels, type_counts){
9999
unlist(mclapply(clusterlabels, function(x) {
100-
fix_keyword_casing(x, type_counts)
100+
x <- fix_keyword_casing(x, type_counts)
101+
# clean up titles from format issues
102+
x <- gsub(",+", ",", x)
101103
}))
102104
}
103105

@@ -179,7 +181,7 @@ another_prune_ngrams <- function(ngrams, stops){
179181
# check if first token of ngrams in stopword list
180182
batch_size <- 1000
181183
total_length <- length(stops)
182-
for (i in seq(1, total_length, batch_size)) {
184+
for (i in seq(1, total_length, batch_size)) try({
183185
tokens = lapply(tokens, function(y){
184186
Filter(function(x){
185187
if (x[1] != "") !any(stri_detect_fixed(stops[i:min(i+batch_size -1, total_length)], x[1]))
@@ -189,7 +191,7 @@ another_prune_ngrams <- function(ngrams, stops){
189191
Filter(function(x){
190192
if (tail(x,1) != "") !any(stri_detect_fixed(stops[i:min(i+batch_size -1, total_length)], tail(x,1)))
191193
}, y)})
192-
}
194+
})
193195
# check that first token is not the same as the last token
194196
tokens = lapply(tokens, function(y){
195197
if(length(y) > 1) {

server/preprocessing/other-scripts/test/params_base.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"document_types":["121"],
33
"from":"1665-01-01",
4-
"to":"2023-07-21",
4+
"to":"2023-09-25",
55
"sorting":"most-relevant",
66
"vis_id": "TEST_ID",
77
"min_descsize": 300,

server/preprocessing/other-scripts/test/test_base.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ options(warn=1)
77
wd <- dirname(dirname(rstudioapi::getActiveDocumentContext()$path))
88
setwd(wd) #Don't forget to set your working directory
99

10-
query <- 'test' #args[2]
10+
query <- 'cond-mat.mtrl-sci' #args[2]
1111
service <- "base"
1212
params <- NULL
1313
params_file <- "test/params_base.json"

server/services/searchBASE.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
$precomputed_id = (isset($_POST["unique_id"]))?($_POST["unique_id"]):(null);
1212

1313
$params_array = array("from", "to", "document_types", "sorting", "min_descsize");
14-
$optional_get_params = ["repo", "coll", "vis_type", "q_advanced", "lang_id"];
14+
$optional_get_params = ["repo", "coll", "vis_type", "q_advanced", "lang_id", "custom_title"];
1515

1616
function filterEmptyString($value)
1717
{

server/workers/api/requirements.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
flask
22
flask-cors
3-
flask_sqlalchemy
43
flask_restx
54
Werkzeug
65
marshmallow

server/workers/api/src/apis/request_validators.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ class SearchParamSchema(Schema):
3232
repo_name = fields.Str()
3333
coll = fields.Str()
3434
list_size = fields.Int()
35+
custom_title = fields.Str()
3536

3637

3738
@pre_load

server/workers/build_docker_images.sh

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
#!/bin/bash
22
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
3-
services=("api" "persistence" "gsheets" "dataprocessing" "base" "pubmed" "openaire")
3+
services=("api" "persistence" "dataprocessing" "base" "pubmed" "openaire")
44
for service in ${services[@]}; do
5+
echo ""
6+
echo "Building $service"
7+
echo ""
58
docker build -f "$SCRIPT_DIR/../workers/$service/Dockerfile" -t "$service:`git rev-parse HEAD`" "$SCRIPT_DIR/../"
69
done
7-

0 commit comments

Comments
 (0)