OpenKnowledgeMaps
diff --git a/.gitignore b/.gitignore
Lines changed: 2 additions & 0 deletions
diff --git a/docker-compose-end2endtest.yml b/docker-compose-end2endtest.yml
Lines changed: 39 additions & 12 deletions
diff --git a/examples/project_website/base.html b/examples/project_website/base.html
Lines changed: 3 additions & 2 deletions
diff --git a/examples/project_website/data/custom_clustering.json b/examples/project_website/data/custom_clustering.json
Lines changed: 10 additions & 0 deletions
diff --git a/server/preprocessing/other-scripts/base.R b/server/preprocessing/other-scripts/base.R
Lines changed: 69 additions & 17 deletions
diff --git a/server/preprocessing/other-scripts/cluster.R b/server/preprocessing/other-scripts/cluster.R
Lines changed: 0 additions & 9 deletions
diff --git a/server/preprocessing/other-scripts/run_vis_layout.R b/server/preprocessing/other-scripts/run_vis_layout.R
Lines changed: 2 additions & 2 deletions
diff --git a/server/preprocessing/other-scripts/summarize.R b/server/preprocessing/other-scripts/summarize.R
Lines changed: 42 additions & 8 deletions
diff --git a/server/preprocessing/other-scripts/test/params_base.json b/server/preprocessing/other-scripts/test/params_base.json
Lines changed: 4 additions & 3 deletions
diff --git a/server/preprocessing/other-scripts/test/test_base.R b/server/preprocessing/other-scripts/test/test_base.R
Lines changed: 1 addition & 1 deletion
@@ -41,8 +41,10 @@ server/preprocessing/other-scripts/renv
 .Rhistory
 .Rprofile
 .Rproj*
+/*.Rproj
 .Rproj.user
 
+
 # python files
 *.pyc
 *.pkl
 
@@ -24,10 +24,9 @@ services:
       - db
       - backend
     restart: "no"
-    entrypoint: [ "pytest", '/app/workers/tests/test_end2end.py', '-s', '-rfA']
+    entrypoint: ["pytest", '/app/workers/tests/test_end2end.py', '-s', '-rfA']
     networks:
-      test:
-        ipv4_address: 172.18.0.2
+      - test
 
   backend:
     container_name: backend
@@ -37,13 +36,47 @@ services:
       dockerfile: ./Dockerfile_backend
     volumes:
       - ./server/:/var/www/html/server
+      - ./server/workers/tests/test_data/test.sqlite:/var/www/localstorage/test.sqlite
     restart: "no"
     networks:
-      test:
-        ipv4_address: 172.18.0.3
+      - test
     ports:
       - "80:80"
 
+  api:
+    build:
+      context: server
+      dockerfile: workers/api/Dockerfile
+    restart: unless-stopped
+    environment:
+      SERVICE_VERSION: "test"
+      BEHIND_PROXY: "false"
+      DEFAULT_DATABASE: "testdb"
+      FLASK_ENV: "development"
+    volumes:
+      - ./server/workers/tests/mock_app.py:/app/mock_app.py
+    command: ["python", "mock_app.py"]
+    networks: 
+      - test
+
+  persistence:
+    # NOTE(review): this container is named "api" (hostname "test_api") while a
+    # separate service called "api" is also defined in this compose file on the
+    # same "test" network. Confirm that both are meant to answer to "api", or
+    # give this container a name matching its "persistence" service.
+    container_name: api
+    hostname: "test_api"
+    build:
+      context: server
+      dockerfile: workers/persistence/Dockerfile
+    restart: "no"
+    environment:
+      SERVICE_VERSION: "test"
+      BEHIND_PROXY: "false"
+      DEFAULT_DATABASE: "testdb"
+      FLASK_ENV: "development"
+    volumes:
+      # The mock application is mounted over the image's own app and run below.
+      - ./server/workers/tests/mock_app.py:/app/mock_app.py
+    command: ["python", "mock_app.py"]
+    networks:
+      - test
+
   db:
     container_name: test_db
     image: 'postgres:12.2-alpine'
@@ -63,16 +96,10 @@ services:
     ports:
       - "5432:5432"
     networks:
-      test:
-        ipv4_address: 172.18.0.4
+      - test
 
 volumes:
   db_data:
 
 networks:
   test:
-    driver: bridge
-    ipam:
-      config:
-        - subnet: 172.18.0.0/16
-          gateway: 172.18.0.1
 
@@ -93,7 +93,8 @@
             //title: "fake news",
             //title: "dotcom",
             //title: "cognitive dissonance",
-            title: "custom_title",
+            // title: "custom_title",
+            title: "custom_clustering",
             // file: "./data/digital-education.json",
             // file: "./data/digital-education-lang.json",
             // file: "./data/digital-education-lang[].json",
@@ -103,7 +104,7 @@
             //file: "./data/fake-news-sg.json",
             //file: "./data/dotcom-sg.json",
             //file: "./data/cognitive-dissonance.json"
-            file: "./data/custom_title.json",
+            file: "./data/custom_clustering.json",
             // other attributes:
             is_streamgraph: false, // set true for streamgraph data
             show_area: true, // set false for streamgraph data
 
@@ -65,7 +65,6 @@ get_papers <- function(query, params,
 
   if (!is.null(exact_query) && exact_query != '') {
     base_query <- paste(paste0("(",exact_query,")"), date_string, document_types, collapse=" ")
-    base_query <- paste(paste0("(",exact_query,")"), date_string, document_types, collapse=" ")
   } else {
     base_query <- paste(date_string, document_types, collapse=" ")
   }
@@ -94,11 +93,24 @@ get_papers <- function(query, params,
     non_public = FALSE
   }
 
+  cc <- params$custom_clustering
+  if (!is.null(cc)) {
+    if (cc %in% names(fieldmapper)) {
+      # this is the generic case for existing metadata
+      custom_clustering_query <- paste(fieldmapper[[cc]], ":", "*", sep="")
+      base_query <- paste(base_query, custom_clustering_query)
+    } else {
+      # this is the speciality case for custom clustering on annotations
+      custom_clustering_query <- paste("dcsubject:", cc, "*", sep="")
+      base_query <- paste(base_query, custom_clustering_query)
+      custom_clustering_query <- paste('textus:', '"', cc, ':"', sep="")
+      base_query <- paste(base_query, custom_clustering_query)
+      custom_clustering_query <- paste(cc, ':*', sep="")
+      base_query <- paste(base_query, custom_clustering_query)
+    }
+  }
+
   blog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "BASE query:", base_query))
-  blog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Sort by:", sortby_string))
-  blog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Min descsize:", min_descsize))
-  blog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Target:", repo))
-  blog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Collection:", coll))
 
   # execute search
   offset = 0
@@ -120,9 +132,15 @@ get_papers <- function(query, params,
   metadata <- sanitize_abstract(metadata)
   metadata <- mark_duplicates(metadata)
   metadata$has_dataset <- unlist(lapply(metadata$resulttype, function(x) "Dataset" %in% x))
-  req_limit <- 9
 
+  req_limit <- 9
   r <- 0
+  # check if custom clustering annotation param is in metadata
+  if (!is.null(cc)) {
+    if (!(cc %in% names(fieldmapper))) {
+      has_custom_clustering_annotation <- unlist(lapply(metadata$subject_orig, function(x) grepl(paste0(cc, ":"), x, fixed=TRUE)))
+      metadata <- metadata[has_custom_clustering_annotation,]
+  }}
   while (nrow(metadata) - sum(metadata$is_duplicate) < limit && attr(res_raw, "numFound") > offset+120 && r < req_limit) {
     offset <- offset+120
     res_raw <- get_raw_data(limit,
@@ -141,17 +159,28 @@ get_papers <- function(query, params,
     metadata <- sanitize_abstract(metadata)
     metadata <- mark_duplicates(metadata)
     metadata$has_dataset <- unlist(lapply(metadata$resulttype, function(x) "Dataset" %in% x))
+    # check if custom clustering annotation param is in metadata
+    if (!is.null(cc)) {
+      if (!(cc %in% names(fieldmapper))) {
+        has_custom_clustering_annotation <- unlist(lapply(metadata$subject_orig, function(x) grepl(paste0(cc, ":"), x, fixed=TRUE)))
+        metadata <- metadata[has_custom_clustering_annotation,]
+    }}
     r <- r+1
   }
+  # check if custom clustering annotation param is in metadata
+  if (!is.null(cc)) {
+    if (!(cc %in% names(fieldmapper))) {
+      has_custom_clustering_annotation <- unlist(lapply(metadata$subject_orig, function(x) grepl(paste0(cc, ":"), x, fixed=TRUE)))
+      metadata <- metadata[has_custom_clustering_annotation,]
+  }}
   blog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Deduplication retrieval requests:", r))
 
   metadata <- unique(metadata, by = "id")
-  text = data.frame(matrix(nrow=length(metadata$id)))
-  text$id = metadata$id
-  # Add all keywords, including classification to text
-  text$content = paste(metadata$title, metadata$paper_abstract,
-                       metadata$subject_orig, metadata$published_in, metadata$authors,
-                       sep=" ")
+  # Add all keywords, including classification to text content for clustering
+  text <- data.frame(id = metadata$id,
+                     content = paste(metadata$title, metadata$paper_abstract,
+                                     metadata$subject_orig, metadata$published_in, metadata$authors,
+                                     sep=" "))
 
 
   input_data=list("metadata" = metadata, "text"=text)
@@ -228,13 +257,13 @@ etl <- function(res, repo, non_public) {
   metadata$url = metadata$id
   metadata$relevance = c(nrow(metadata):1)
   metadata$resulttype = lapply(res$dctypenorm, decode_dctypenorm)
-  metadata$dctype = check_metadata(res$dctype)
-  metadata$dctypenorm = check_metadata(res$dctypenorm)
+  metadata$type = check_metadata(res$dctype)
+  metadata$typenorm = check_metadata(res$dctypenorm)
   metadata$doi = unlist(lapply(metadata$link, find_dois))
-  metadata$dclang = check_metadata(res$dclang)
-  metadata$dclanguage = check_metadata(res$dclanguage)
+  metadata$lang = check_metadata(res$dclang)
+  metadata$language = check_metadata(res$dclanguage)
   metadata$content_provider = check_metadata(res$dcprovider)
-  metadata$dccoverage = check_metadata(res$dccoverage)
+  metadata$coverage = check_metadata(res$dccoverage)
   if(repo=="fttriple" && non_public==TRUE) {
     metadata$content_provider <- "GoTriple"
   }
@@ -339,3 +368,26 @@ dctypenorm_decoder <- list(
   "183"="Thesis: doctoral and postdoctoral",
   "182"="Thesis: master"
 )
+
+# Lookup table: list name = metadata column name, list value = query field name.
+# get_papers() checks params$custom_clustering against names(fieldmapper) and,
+# on a match, appends "<value>:*" to the base query.
+# Note that both "resulttype" and "typenorm" map to "dctypenorm".
+fieldmapper <- list(
+  relation         = "dcrelation",
+  identifier       = "identifier",
+  title            = "dctitle",
+  paper_abstract   = "dcdescription",
+  published_in     = "dcsource",
+  year             = "dcdate",
+  subject          = "dcsubject",
+  authors          = "dccreator",
+  link             = "dclink",
+  oa_state         = "dcoa",
+  url              = "dcdocid",
+  relevance        = "relevance",
+  resulttype       = "dctypenorm",
+  type             = "dctype",
+  typenorm         = "dctypenorm",
+  doi              = "doi",
+  lang             = "dclang",
+  language         = "dclanguage",
+  content_provider = "dcprovider",
+  coverage         = "dccoverage"
+)
@@ -79,15 +79,6 @@ create_clusters <- function(distance_matrix, max_clusters=-1, method="ward.D") {
       groups <- cutree(cluster, k=num_clusters)
     }
 
-    # NEEDS FIX
-    # if(exists("DEBUG") && DEBUG == TRUE) {
-    #   # Plot result of clustering to PDF file
-    #   pdf("clustering.pdf", width=19, height=12)
-    #   plot(cluster, labels=metadata$title, cex=0.6)
-    #   rect.hclust(cluster, k=num_clusters, border="red")
-    #   dev.off()
-    # }
-
     vclog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Number of Clusters:", num_clusters, sep=" "))
     vclog$debug(paste("CutOff-Description:", attributes(cut_off)$description))
   }
 
@@ -55,14 +55,14 @@ metadata <- fromJSON(input_data$metadata)
 
 MAX_CLUSTERS = params$MAX_CLUSTERS
 
-
 failed <- list(params=params)
 tryCatch({
   output_json = vis_layout(text, metadata,
                            service,
                            max_clusters = MAX_CLUSTERS,
                            taxonomy_separator = params$taxonomy_separator,
-                           vis_type=vis_type, list_size = params$list_size)
+                           vis_type=vis_type, list_size = params$list_size,
+                           params=params)
 }, error=function(err){
  tslog$error(gsub("\n", " ", paste("Processing failed", query, paste(params, collapse=" "), err, sep="||")))
  failed$query <<- query
 
@@ -51,14 +51,20 @@ prune_ngrams <- function(ngrams, stops){
 create_cluster_labels <- function(clusters, metadata,
                                   type_counts,
                                   weightingspec,
-                                  top_n, stops, taxonomy_separator="/") {
-  nn_corpus <- get_cluster_corpus(clusters, metadata, stops, taxonomy_separator)
+                                  top_n, stops, taxonomy_separator="/",
+                                  params=NULL) {
+  cc <- params$custom_clustering
+  if (!(is.null(cc)) && (cc %in% names(metadata))) {
+    nn_corpus <- get_custom_cluster_corpus(clusters, metadata, stops, taxonomy_separator, custom_clustering=cc)
+  } else {
+    nn_corpus <- get_cluster_corpus(clusters, metadata, stops, taxonomy_separator)
+  }
   nn_tfidf <- TermDocumentMatrix(nn_corpus, control = list(
-                                      tokenize = SplitTokenizer,
-                                      weighting = function(x) weightSMART(x, spec="ntn"),
-                                      bounds = list(local = c(2, Inf)),
-                                      tolower = TRUE
-                                ))
+    tokenize = SplitTokenizer,
+    weighting = function(x) weightSMART(x, spec="ntn"),
+    bounds = list(local = c(2, Inf)),
+    tolower = TRUE
+  ))
   tfidf_top <- apply(nn_tfidf, 2, function(x) {x2 <- sort(x, TRUE);x2[x2>0]})
   empty_tfidf <- which(apply(nn_tfidf, 2, sum)==0)
   tfidf_top[c(empty_tfidf)] <- fill_empty_clusters(nn_tfidf, nn_corpus)[c(empty_tfidf)]
@@ -90,6 +96,9 @@ create_cluster_labels <- function(clusters, metadata,
     }
     clusters$cluster_labels[c(matches)] = summary
   }
+  if (!(is.null(cc)) && (cc %in% names(metadata$annotations))) {
+    clusters$cluster_labels = metadata$annotations[[cc]]
+  }
   clusters$cluster_labels <- fix_cluster_labels(clusters$cluster_labels, type_counts)
   return(clusters)
 }
@@ -118,8 +127,33 @@ match_keyword_case <- function(x, type_counts) {
   if (!is.na(y)) return(y) else return(x)
 }
 
+# Build a tm corpus holding one document per cluster, assembled from the
+# metadata column named by `custom_clustering`.
+#
+# Args:
+#   clusters: list with `num_clusters` (number of clusters) and `groups`
+#     (cluster id for each metadata entry).
+#   metadata: data frame / list that contains the `custom_clustering` column.
+#   stops: character vector of stop words removed from every entry.
+#   taxonomy_separator, add_title_ngrams: not used by this function; kept so
+#     the signature lines up with get_cluster_corpus().
+#   custom_clustering: name of the metadata column to build the documents from.
+#
+# Returns:
+#   A VCorpus with one document per cluster; each document is the cluster's
+#   entries joined into a single ";"-separated string.
+get_custom_cluster_corpus <- function(clusters, metadata, stops, taxonomy_separator,
+                               add_title_ngrams = TRUE, custom_clustering=NULL) {
+  # Stop words are removed in batches of 1000 instead of in a single call
+  # (presumably to keep the pattern removeWords() builds small - TODO confirm).
+  # The batch layout does not depend on the cluster, so compute it once.
+  batch_size <- 1000
+  total_length <- length(stops)
+  # seq(1, 0, by = 1000) fails with "wrong sign in 'by' argument", so an empty
+  # stop word list has to yield no batches at all.
+  batch_starts <- if (total_length > 0) {
+    seq(1, total_length, by = batch_size)
+  } else {
+    integer(0)
+  }
+
+  # seq_len() iterates zero times for zero clusters, whereas seq(1, 0) would
+  # count down over 1 and 0 and emit two bogus documents.
+  subjectlist <- lapply(seq_len(clusters$num_clusters), function(k) {
+    matches <- which(unname(clusters$groups == k))
+    custom_input <- metadata[[custom_clustering]][matches]
+    for (i in batch_starts) {
+      batch <- stops[i:min(i + batch_size - 1, total_length)]
+      custom_input <- lapply(custom_input, function(x) removeWords(x, batch))
+    }
+    # Tighten "; " to ";" and turn the remaining spaces inside an entry into "_".
+    custom_input <- mapply(gsub, custom_input, pattern = "; ", replacement = ";")
+    custom_input <- mapply(gsub, custom_input, pattern = " ", replacement = "_")
+
+    all_subjects <- paste(custom_input, collapse = " ")
+    # Strip runs of question marks. The pattern is kept exactly as it was; note
+    # that its last alternative ("\\?+ ") is shadowed by the plain "\\?+" before it.
+    all_subjects <- str_replace_all(all_subjects, "\\?+_\\?+|\\?+|\\?+ ", "")
+    # Collapse repeated ";" and drop blanks around them ...
+    all_subjects <- str_replace_all(all_subjects, ";+", ";")
+    all_subjects <- str_replace_all(all_subjects, " ?; ?", ";")
+    # ... then turn the blanks that separated the entries into ";" as well.
+    all_subjects <- str_replace_all(all_subjects, " +", ";")
+    all_subjects
+  })
+  VCorpus(VectorSource(subjectlist))
+}
+
 get_cluster_corpus <- function(clusters, metadata, stops, taxonomy_separator,
-                               add_title_ngrams = T) {
+                               add_title_ngrams = T, custom_clustering=NULL) {
   subjectlist = list()
   for (k in seq(1, clusters$num_clusters)) {
     matches = which(unname(clusters$groups == k) == TRUE)
 
@@ -1,10 +1,11 @@
 {
-  "document_types":["121"],
+  "document_types":["121", "7", "13", "14", "15", "16", "17", "18", "6"],
   "from":"1665-01-01",
-  "to":"2023-09-25",
+  "to":"2023-12-07",
   "sorting":"most-relevant",
   "vis_id": "TEST_ID",
   "min_descsize": 300,
   "limit": 120,
-  "list_size": 100
+  "list_size": 100,
+  "custom_clustering": "mesh"
 }
@@ -7,7 +7,7 @@ options(warn=1)
 wd <- dirname(dirname(rstudioapi::getActiveDocumentContext()$path))
 setwd(wd) #Don't forget to set your working directory
 
-query <- 'cond-mat.mtrl-sci' #args[2]
+query <- "species" #args[2]
 service <- "base"
 params <- NULL
 params_file <- "test/params_base.json"