Merge pull request #733 from OpenKnowledgeMaps/dev

chreman · web-flow · commit b9a40662d85f · 2023-08-04T12:11:42.000+02:00
2023-08-03
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -54,7 +54,7 @@
     "@babel/plugin-transform-runtime": "^7.17.0",
     "@babel/preset-env": "^7.15.6",
     "@babel/preset-react": "^7.14.5",
-    "babel-core": "^6.11.4",
+    "babel-core": "^7.0.0-bridge.0",
     "babel-jest": "^27.3.1",
     "babel-loader": "^8.2.2",
     "babel-preset-es2015": "^6.9.0",
diff --git a/server/preprocessing/other-scripts/preprocess.R b/server/preprocessing/other-scripts/preprocess.R
@@ -143,9 +143,13 @@ replace_keywords_if_empty <- function(metadata, stops) {
   vplog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Documents without subjects after replacing from title:", length(missing_subjects)))
   if (length(missing_subjects) > 0) {
     foreach (i = missing_subjects) %dopar% {
-      candidates = mapply(paste, metadata$title[i], metadata$paper_abstract[i])
+      if (nrow(metadata) == 1) {
+        candidates = mapply(paste, metadata$title, metadata$paper_abstract)
+      } else {
+        candidates = mapply(paste, metadata$title[i,], metadata$paper_abstract[i,])
+      }
       for (i in seq(1, total_length, batch_size)) {
-      candidates = mclapply(candidates, function(x)paste(removeWords(x, stops[i:min(i+batch_size -1, total_length)]), collapse=""))
+        candidates = mclapply(candidates, function(x)paste(removeWords(x, stops[i:min(i+batch_size -1, total_length)]), collapse=""))
       }
       candidates = lapply(candidates, function(x) {gsub("[^[:alpha:]]", " ", x)})
       candidates = lapply(candidates, function(x) {gsub(" +", " ", x)})
@@ -155,7 +159,11 @@ replace_keywords_if_empty <- function(metadata, stops) {
       replacement_keywords <- filter_out_nested_ngrams(names(nn_count), 3)
       replacement_keywords = lapply(replacement_keywords, FUN = function(x) {paste(unlist(x), collapse="; ")})
       replacement_keywords = gsub("_", " ", replacement_keywords)
-      metadata$subject[i] <- paste(replacement_keywords, collapse="; ")
+      if (nrow(metadata) == 1) {
+        metadata$subject <- paste(replacement_keywords, collapse="; ")
+      } else {
+        metadata$subject[i] <- paste(replacement_keywords, collapse="; ")
+      }
     }
   }
   return(metadata)
diff --git a/server/services/displayPDF.php b/server/services/displayPDF.php
@@ -1,6 +1,6 @@
 <html>
 <head>
-<meta http-equiv="refresh" content="0; url=pdf.js-hypothes.is/viewer/web/viewer.html?file=<?php echo $_GET["file"] ?>" />
+<meta http-equiv="refresh" content="0; url=pdf.js-hypothes.is/viewer/web/viewer.html?file=<?php echo htmlspecialchars($_GET["file"]) ?>" />
 </head>
 <body>
 </body>
diff --git a/server/services/snapshot/headstart_snapshot.php b/server/services/snapshot/headstart_snapshot.php
@@ -8,7 +8,7 @@
     <body style="margin:0px; padding:0px">
 
         <div id="visualization"></div>
-        <script type="text/javascript" src="data-config_<?php echo $_GET['service'] ?>.js"></script>
+        <script type="text/javascript" src="data-config_<?php echo htmlspecialchars($_GET['service']) ?>.js"></script>
 		<script src="../../../../js/search_options.js"></script>
         <script>
             data_config.files = [{
@@ -18,7 +18,7 @@
             data_config.server_url = window.location.href.replace(/[^/]*$/, '') + "../../";
             data_config.show_context = true;
             data_config.create_title_from_context= true;
-            data_config.options = options_<?php echo $_GET['service']; ?>.dropdowns;
+            data_config.options = options_<?php echo htmlspecialchars($_GET['service']); ?>.dropdowns;
             if (<?php echo json_encode($_GET['service']) ?> === "linkedcat" ||
                 <?php echo json_encode($_GET['service']) ?> === "linkedcat_authorview" ||
                 <?php echo json_encode($_GET['service']) ?> === "linkedcat_browseview") {
diff --git a/server/workers/base/renv.lock b/server/workers/base/renv.lock
@@ -199,9 +199,7 @@
     "renv": {
       "Package": "renv",
       "Version": "0.14.0",
-      "Source": "Repository",
-      "Repository": "CRAN",
-      "Hash": "30e5eba91b67f7f4d75d31de14bbfbdc"
+      "Source": "Repository"
     },
     "rlang": {
       "Package": "rlang",
diff --git a/server/workers/base/src/base.py b/server/workers/base/src/base.py
@@ -88,12 +88,13 @@ def execute_search(self, params):
                 res = raw_metadata
             else:
                 metadata = pd.DataFrame(raw_metadata)
+                metadata = self.sanitize_metadata(metadata)
                 metadata = filter_duplicates(metadata)
                 metadata = pd.concat([metadata, parse_annotations_for_all(metadata, "subject_orig")], axis=1)
                 metadata = metadata.head(params.get('list_size'))
                 metadata.reset_index(inplace=True, drop=True)
                 metadata = self.enrich_metadata(metadata)
-                text = pd.concat([metadata.id, metadata[["title", "paper_abstract", "subject_orig", "published_in", "authors"]]
+                text = pd.concat([metadata.id, metadata[["title", "paper_abstract", "subject_orig", "published_in", "sanitized_authors"]]
                                          .apply(lambda x: " ".join(x), axis=1)], axis=1)
                 text.columns = ["id", "content"]
                 input_data = {}
@@ -108,6 +109,10 @@ def execute_search(self, params):
             self.logger.error(error)
             raise
 
+    def sanitize_metadata(self, metadata):
+        metadata["sanitized_authors"] = metadata["authors"].map(lambda x: sanitize_authors(x))
+        return metadata
+
     def enrich_metadata(self, metadata):
         metadata["repo"] = metadata["content_provider"].map(lambda x: self.content_providers.get(x, ""))
         enrichment = improved_df_parsing(metadata)
@@ -331,7 +336,7 @@ def parse_annotations_for_all(metadata, field_name):
     parsed_annotations = pd.DataFrame(metadata[field_name].map(lambda x: parse_annotations(x)))
     parsed_annotations.columns = ["annotations"]
     expanded_annotations = expand_dict_columns(parsed_annotations)
-    return parsed_annotations
+    return expanded_annotations
 
 # convert DataFrame with dict columns to DataFrame with columns for each dict key
 def expand_dict_columns(df):
@@ -347,4 +352,10 @@ def expand_dict_columns(df):
 
 def clean_up_annotations(df, field):
     df[field] = df[field].map(lambda x: pattern_annotations.sub("", x).strip())
-    return df
+    return df
+
+def sanitize_authors(authors, n=15):
+    authors = authors.split("; ")
+    if len(authors) > n:
+        authors = authors[:n-1] + authors[-1:]
+    return "; ".join(authors)
diff --git a/server/workers/dataprocessing/src/streamgraph.py b/server/workers/dataprocessing/src/streamgraph.py
@@ -199,11 +199,7 @@ def aggregate_ids(series):
         return "NA"
 
 
-def load_stopwords():
-    stopwords = []
-    if os.path.isfile("../../resources/additional_stopwords.txt"):
-        with open ("../../resources/additional_stopwords.txt") as infile:
-            stopwords = infile.read().splitlines()
-    return stopwords
-
-stopwords = load_stopwords()
+stopwords = ["archeo", "archi", "art", "anthro-bio", "class", "info", "museo", "demo",
+                       "eco", "edu", "envir", "genre", "geo", "hist", "hisphilso", "droit",
+                       "lang", "litt", "manag", "stat", "musiq", "phil", "scipo", "psy",
+                       "relig", "anthro-se", "socio"]
diff --git a/vis/js/dataprocessing/managers/DataManager.js b/vis/js/dataprocessing/managers/DataManager.js
@@ -143,11 +143,21 @@ class DataManager {
   __parseAuthors(paper) {
     paper.authors_objects = extractAuthors(paper.authors);
     paper.authors_list = getAuthorsList(
-      paper.authors,
-      this.config.convert_author_names
+        paper.authors,
+        this.config.convert_author_names
     );
 
-    paper.authors_string = paper.authors_list.join(", ");
+      // previous behavior: authors_string contained all authors, joined with ", "
+    // paper.authors_string = paper.authors_list.join(", ");
+
+    if (paper.authors_list.length > 15) {
+        const firstAuthors = paper.authors_list.slice(0, 14).join(", ");
+      const lastAuthor = paper.authors_list[paper.authors_list.length - 1];
+        // take the first 14 authors, then add "..." and the last author for the visual part of the map
+        paper.authors_string = `${firstAuthors}, ... ${lastAuthor}`;
+        // when there are more than 15 authors, keep an array of 16 authors (the first 15 plus the last one) for further processing in the visual part, with "..." between the authors
+        paper.authors_list = paper.authors_list.slice(0, 15).concat(lastAuthor);
+    }
   }
 
   // migrated from legacy code
diff --git a/vis/js/templates/listentry/Details.jsx b/vis/js/templates/listentry/Details.jsx
@@ -6,31 +6,34 @@ import { useLocalizationContext } from "../../components/LocalizationProvider";
 
 const MAX_AUTHORS_LENGTH = 100;
 
-const Details = ({ authors, source, isSelected }) => {
+const Details = ({authors, source, isSelected}) => {
   const loc = useLocalizationContext();
 
   const authorsString = getAuthorsString(
-    authors,
-    isSelected ? Number.POSITIVE_INFINITY : MAX_AUTHORS_LENGTH
+      authors,
+      isSelected ? Number.POSITIVE_INFINITY : MAX_AUTHORS_LENGTH
   );
 
+  // console.log("Details.jsx: authorsString: ", authorsString);
+  console.log("Details.jsx: loc.default_authors: ", loc.default_authors);
+
   return (
-    // html template starts here
-    <div className="list_details">
-      <div className="list_authors">
-        <Highlight queryHighlight>
-          {authorsString ? authorsString : loc.default_authors}
-        </Highlight>
-      </div>
-      {!!source && (
-        <div className={"list_source" + (isSelected ? "" : " short")}>
+      // html template starts here
+      <div className="list_details">
+        <div className="list_authors">
+          <Highlight queryHighlight>
+            {authorsString ? authorsString : loc.default_authors}
+          </Highlight>
+        </div>
+        {!!source && (
+            <div className={"list_source" + (isSelected ? "" : " short")}>
           <span className="list_published_in">
             <Highlight queryHighlight>{source}</Highlight>
           </span>
-        </div>
-      )}
-    </div>
-    // html template ends here
+            </div>
+        )}
+      </div>
+      // html template ends here
   );
 };
 
@@ -45,15 +48,22 @@ const getAuthorsString = (authorsList, maxLength) => {
     return "";
   }
 
+
   const authorsListCopy = [...authorsList];
 
   const ellipsis = "...";
   const join = ", ";
   let finalString = authorsListCopy.shift();
+  if (authorsList.length > 15) {
+    const first19Authors = authorsList.slice(0, 14).join(", ");
+    const lastAuthor = authorsList[authorsList.length - 1];
+    finalString = `${first19Authors}, ... ${lastAuthor}`;
+    return finalString;
+  }
   while (authorsListCopy.length > 0) {
     const nextAuthor = authorsListCopy.shift();
     let nextPossibleLength =
-      finalString.length + join.length + nextAuthor.length;
+        finalString.length + join.length + nextAuthor.length;
 
     if (authorsListCopy.length !== 0) {
       nextPossibleLength += ellipsis.length;

-Original file line number
+Diff line change
@@ -1,6 +1,6 @@
 <html>
 <head>
 -<meta http-equiv="refresh" content="0; url=pdf.js-hypothes.is/viewer/web/viewer.html?file=<?php echo $_GET["file"] ?>" />
 +<meta http-equiv="refresh" content="0; url=pdf.js-hypothes.is/viewer/web/viewer.html?file=<?php echo htmlspecialchars($_GET["file"]) ?>" />
 </head>
 <body>
 </body>