Skip to content

Commit b83f88a

Browse files
committed
Merge remote-tracking branch 'upstream/master' into database-migration-cleanup
2 parents d60da1f + 3860c29 commit b83f88a

25 files changed

Lines changed: 310 additions & 118 deletions

File tree

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,10 @@ server/preprocessing/other-scripts/renv
4141
.Rhistory
4242
.Rprofile
4343
.Rproj*
44+
/*.Rproj
4445
.Rproj.user
4546

47+
4648
# python files
4749
*.pyc
4850
*.pkl

docker-compose-end2endtest.yml

Lines changed: 39 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,9 @@ services:
2424
- db
2525
- backend
2626
restart: "no"
27-
entrypoint: [ "pytest", '/app/workers/tests/test_end2end.py', '-s', '-rfA']
27+
entrypoint: ["pytest", '/app/workers/tests/test_end2end.py', '-s', '-rfA']
2828
networks:
29-
test:
30-
ipv4_address: 172.18.0.2
29+
- test
3130

3231
backend:
3332
container_name: backend
@@ -37,13 +36,47 @@ services:
3736
dockerfile: ./Dockerfile_backend
3837
volumes:
3938
- ./server/:/var/www/html/server
39+
- ./server/workers/tests/test_data/test.sqlite:/var/www/localstorage/test.sqlite
4040
restart: "no"
4141
networks:
42-
test:
43-
ipv4_address: 172.18.0.3
42+
- test
4443
ports:
4544
- "80:80"
4645

46+
api:
47+
build:
48+
context: server
49+
dockerfile: workers/api/Dockerfile
50+
restart: unless-stopped
51+
environment:
52+
SERVICE_VERSION: "test"
53+
BEHIND_PROXY: "false"
54+
DEFAULT_DATABASE: "testdb"
55+
FLASK_ENV: "development"
56+
volumes:
57+
- ./server/workers/tests/mock_app.py:/app/mock_app.py
58+
command: ["python", "mock_app.py"]
59+
networks:
60+
- test
61+
62+
persistence:
63+
container_name: api
64+
hostname: "test_api"
65+
build:
66+
context: server
67+
dockerfile: workers/persistence/Dockerfile
68+
restart: "no"
69+
environment:
70+
SERVICE_VERSION: "test"
71+
BEHIND_PROXY: "false"
72+
DEFAULT_DATABASE: "testdb"
73+
FLASK_ENV: "development"
74+
volumes:
75+
- ./server/workers/tests/mock_app.py:/app/mock_app.py
76+
command: ["python", "mock_app.py"]
77+
networks:
78+
- test
79+
4780
db:
4881
container_name: test_db
4982
image: 'postgres:12.2-alpine'
@@ -63,16 +96,10 @@ services:
6396
ports:
6497
- "5432:5432"
6598
networks:
66-
test:
67-
ipv4_address: 172.18.0.4
99+
- test
68100

69101
volumes:
70102
db_data:
71103

72104
networks:
73105
test:
74-
driver: bridge
75-
ipam:
76-
config:
77-
- subnet: 172.18.0.0/16
78-
gateway: 172.18.0.1

examples/project_website/base.html

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,8 @@
9393
//title: "fake news",
9494
//title: "dotcom",
9595
//title: "cognitive dissonance",
96-
title: "custom_title",
96+
// title: "custom_title",
97+
title: "custom_clustering",
9798
// file: "./data/digital-education.json",
9899
// file: "./data/digital-education-lang.json",
99100
// file: "./data/digital-education-lang[].json",
@@ -103,7 +104,7 @@
103104
//file: "./data/fake-news-sg.json",
104105
//file: "./data/dotcom-sg.json",
105106
//file: "./data/cognitive-dissonance.json"
106-
file: "./data/custom_title.json",
107+
file: "./data/custom_clustering.json",
107108
// other attributes:
108109
is_streamgraph: false, // set true for streamgraph data
109110
show_area: true, // set false for streamgraph data

examples/project_website/data/custom_clustering.json

Lines changed: 10 additions & 0 deletions
Large diffs are not rendered by default.

server/preprocessing/other-scripts/base.R

Lines changed: 69 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,6 @@ get_papers <- function(query, params,
6565

6666
if (!is.null(exact_query) && exact_query != '') {
6767
base_query <- paste(paste0("(",exact_query,")"), date_string, document_types, collapse=" ")
68-
base_query <- paste(paste0("(",exact_query,")"), date_string, document_types, collapse=" ")
6968
} else {
7069
base_query <- paste(date_string, document_types, collapse=" ")
7170
}
@@ -94,11 +93,24 @@ get_papers <- function(query, params,
9493
non_public = FALSE
9594
}
9695

96+
cc <- params$custom_clustering
97+
if (!is.null(cc)) {
98+
if (cc %in% names(fieldmapper)) {
99+
# this is the generic case for existing metadata
100+
custom_clustering_query <- paste(fieldmapper[[cc]], ":", "*", sep="")
101+
base_query <- paste(base_query, custom_clustering_query)
102+
} else {
103+
# this is the special case for custom clustering on annotations
104+
custom_clustering_query <- paste("dcsubject:", cc, "*", sep="")
105+
base_query <- paste(base_query, custom_clustering_query)
106+
custom_clustering_query <- paste('textus:', '"', cc, ':"', sep="")
107+
base_query <- paste(base_query, custom_clustering_query)
108+
custom_clustering_query <- paste(cc, ':*', sep="")
109+
base_query <- paste(base_query, custom_clustering_query)
110+
}
111+
}
112+
97113
blog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "BASE query:", base_query))
98-
blog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Sort by:", sortby_string))
99-
blog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Min descsize:", min_descsize))
100-
blog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Target:", repo))
101-
blog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Collection:", coll))
102114

103115
# execute search
104116
offset = 0
@@ -120,9 +132,15 @@ get_papers <- function(query, params,
120132
metadata <- sanitize_abstract(metadata)
121133
metadata <- mark_duplicates(metadata)
122134
metadata$has_dataset <- unlist(lapply(metadata$resulttype, function(x) "Dataset" %in% x))
123-
req_limit <- 9
124135

136+
req_limit <- 9
125137
r <- 0
138+
# check if custom clustering annotation param is in metadata
139+
if (!is.null(cc)) {
140+
if (!(cc %in% names(fieldmapper))) {
141+
has_custom_clustering_annotation <- unlist(lapply(metadata$subject_orig, function(x) grepl(paste0(cc, ":"), x, fixed=TRUE)))
142+
metadata <- metadata[has_custom_clustering_annotation,]
143+
}}
126144
while (nrow(metadata) - sum(metadata$is_duplicate) < limit && attr(res_raw, "numFound") > offset+120 && r < req_limit) {
127145
offset <- offset+120
128146
res_raw <- get_raw_data(limit,
@@ -141,17 +159,28 @@ get_papers <- function(query, params,
141159
metadata <- sanitize_abstract(metadata)
142160
metadata <- mark_duplicates(metadata)
143161
metadata$has_dataset <- unlist(lapply(metadata$resulttype, function(x) "Dataset" %in% x))
162+
# check if custom clustering annotation param is in metadata
163+
if (!is.null(cc)) {
164+
if (!(cc %in% names(fieldmapper))) {
165+
has_custom_clustering_annotation <- unlist(lapply(metadata$subject_orig, function(x) grepl(paste0(cc, ":"), x, fixed=TRUE)))
166+
metadata <- metadata[has_custom_clustering_annotation,]
167+
}}
144168
r <- r+1
145169
}
170+
# check if custom clustering annotation param is in metadata
171+
if (!is.null(cc)) {
172+
if (!(cc %in% names(fieldmapper))) {
173+
has_custom_clustering_annotation <- unlist(lapply(metadata$subject_orig, function(x) grepl(paste0(cc, ":"), x, fixed=TRUE)))
174+
metadata <- metadata[has_custom_clustering_annotation,]
175+
}}
146176
blog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Deduplication retrieval requests:", r))
147177

148178
metadata <- unique(metadata, by = "id")
149-
text = data.frame(matrix(nrow=length(metadata$id)))
150-
text$id = metadata$id
151-
# Add all keywords, including classification to text
152-
text$content = paste(metadata$title, metadata$paper_abstract,
153-
metadata$subject_orig, metadata$published_in, metadata$authors,
154-
sep=" ")
179+
# Add all keywords, including classification to text content for clustering
180+
text <- data.frame(id = metadata$id,
181+
content = paste(metadata$title, metadata$paper_abstract,
182+
metadata$subject_orig, metadata$published_in, metadata$authors,
183+
sep=" "))
155184

156185

157186
input_data=list("metadata" = metadata, "text"=text)
@@ -228,13 +257,13 @@ etl <- function(res, repo, non_public) {
228257
metadata$url = metadata$id
229258
metadata$relevance = c(nrow(metadata):1)
230259
metadata$resulttype = lapply(res$dctypenorm, decode_dctypenorm)
231-
metadata$dctype = check_metadata(res$dctype)
232-
metadata$dctypenorm = check_metadata(res$dctypenorm)
260+
metadata$type = check_metadata(res$dctype)
261+
metadata$typenorm = check_metadata(res$dctypenorm)
233262
metadata$doi = unlist(lapply(metadata$link, find_dois))
234-
metadata$dclang = check_metadata(res$dclang)
235-
metadata$dclanguage = check_metadata(res$dclanguage)
263+
metadata$lang = check_metadata(res$dclang)
264+
metadata$language = check_metadata(res$dclanguage)
236265
metadata$content_provider = check_metadata(res$dcprovider)
237-
metadata$dccoverage = check_metadata(res$dccoverage)
266+
metadata$coverage = check_metadata(res$dccoverage)
238267
if(repo=="fttriple" && non_public==TRUE) {
239268
metadata$content_provider <- "GoTriple"
240269
}
@@ -339,3 +368,26 @@ dctypenorm_decoder <- list(
339368
"183"="Thesis: doctoral and postdoctoral",
340369
"182"="Thesis: master"
341370
)
371+
372+
# Lookup table from metadata column names (keys) to the corresponding search
# index field names (values). get_papers() consults it to decide whether a
# requested `custom_clustering` parameter refers to an existing metadata
# field; if so, the mapped field name is used to build a `<field>:*` query
# clause. Note that "resulttype" and "typenorm" both map to "dctypenorm".
fieldmapper <- list(
  # bibliographic core fields
  relation         = "dcrelation",
  identifier       = "identifier",
  title            = "dctitle",
  paper_abstract   = "dcdescription",
  published_in     = "dcsource",
  year             = "dcdate",
  subject          = "dcsubject",
  authors          = "dccreator",
  # access and location
  link             = "dclink",
  oa_state         = "dcoa",
  url              = "dcdocid",
  relevance        = "relevance",
  # document typing
  resulttype       = "dctypenorm",
  type             = "dctype",
  typenorm         = "dctypenorm",
  doi              = "doi",
  # language, provider and coverage
  lang             = "dclang",
  language         = "dclanguage",
  content_provider = "dcprovider",
  coverage         = "dccoverage"
)

server/preprocessing/other-scripts/cluster.R

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -79,15 +79,6 @@ create_clusters <- function(distance_matrix, max_clusters=-1, method="ward.D") {
7979
groups <- cutree(cluster, k=num_clusters)
8080
}
8181

82-
# NEEDS FIX
83-
# if(exists("DEBUG") && DEBUG == TRUE) {
84-
# # Plot result of clustering to PDF file
85-
# pdf("clustering.pdf", width=19, height=12)
86-
# plot(cluster, labels=metadata$title, cex=0.6)
87-
# rect.hclust(cluster, k=num_clusters, border="red")
88-
# dev.off()
89-
# }
90-
9182
vclog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Number of Clusters:", num_clusters, sep=" "))
9283
vclog$debug(paste("CutOff-Description:", attributes(cut_off)$description))
9384
}

server/preprocessing/other-scripts/run_vis_layout.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,14 +55,14 @@ metadata <- fromJSON(input_data$metadata)
5555

5656
MAX_CLUSTERS = params$MAX_CLUSTERS
5757

58-
5958
failed <- list(params=params)
6059
tryCatch({
6160
output_json = vis_layout(text, metadata,
6261
service,
6362
max_clusters = MAX_CLUSTERS,
6463
taxonomy_separator = params$taxonomy_separator,
65-
vis_type=vis_type, list_size = params$list_size)
64+
vis_type=vis_type, list_size = params$list_size,
65+
params=params)
6666
}, error=function(err){
6767
tslog$error(gsub("\n", " ", paste("Processing failed", query, paste(params, collapse=" "), err, sep="||")))
6868
failed$query <<- query

server/preprocessing/other-scripts/summarize.R

Lines changed: 42 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -51,14 +51,20 @@ prune_ngrams <- function(ngrams, stops){
5151
create_cluster_labels <- function(clusters, metadata,
5252
type_counts,
5353
weightingspec,
54-
top_n, stops, taxonomy_separator="/") {
55-
nn_corpus <- get_cluster_corpus(clusters, metadata, stops, taxonomy_separator)
54+
top_n, stops, taxonomy_separator="/",
55+
params=NULL) {
56+
cc <- params$custom_clustering
57+
if (!(is.null(cc)) && (cc %in% names(metadata))) {
58+
nn_corpus <- get_custom_cluster_corpus(clusters, metadata, stops, taxonomy_separator, custom_clustering=cc)
59+
} else {
60+
nn_corpus <- get_cluster_corpus(clusters, metadata, stops, taxonomy_separator)
61+
}
5662
nn_tfidf <- TermDocumentMatrix(nn_corpus, control = list(
57-
tokenize = SplitTokenizer,
58-
weighting = function(x) weightSMART(x, spec="ntn"),
59-
bounds = list(local = c(2, Inf)),
60-
tolower = TRUE
61-
))
63+
tokenize = SplitTokenizer,
64+
weighting = function(x) weightSMART(x, spec="ntn"),
65+
bounds = list(local = c(2, Inf)),
66+
tolower = TRUE
67+
))
6268
tfidf_top <- apply(nn_tfidf, 2, function(x) {x2 <- sort(x, TRUE);x2[x2>0]})
6369
empty_tfidf <- which(apply(nn_tfidf, 2, sum)==0)
6470
tfidf_top[c(empty_tfidf)] <- fill_empty_clusters(nn_tfidf, nn_corpus)[c(empty_tfidf)]
@@ -90,6 +96,9 @@ create_cluster_labels <- function(clusters, metadata,
9096
}
9197
clusters$cluster_labels[c(matches)] = summary
9298
}
99+
if (!(is.null(cc)) && (cc %in% names(metadata$annotations))) {
100+
clusters$cluster_labels = metadata$annotations[[cc]]
101+
}
93102
clusters$cluster_labels <- fix_cluster_labels(clusters$cluster_labels, type_counts)
94103
return(clusters)
95104
}
@@ -118,8 +127,33 @@ match_keyword_case <- function(x, type_counts) {
118127
if (!is.na(y)) return(y) else return(x)
119128
}
120129

130+
#' Build a corpus with one document per cluster from a custom metadata column.
#'
#' For every cluster index `k` in `1..clusters$num_clusters`, the values of
#' `metadata[[custom_clustering]]` belonging to that cluster are stripped of
#' stop words, normalised into a ";"-separated token string (spaces inside a
#' term become "_"), and collected as one document.
#'
#' @param clusters List-like object providing `num_clusters` and `groups`
#'   (cluster index per metadata row).
#' @param metadata Data frame (or list) holding the column named by
#'   `custom_clustering`.
#' @param stops Character vector of stop words; may be empty.
#' @param taxonomy_separator Not used by this function; kept so the signature
#'   parallels get_cluster_corpus().
#' @param add_title_ngrams Not used by this function; kept for signature
#'   parity.
#' @param custom_clustering Name of the metadata column to build the corpus
#'   from. Required despite the NULL default.
#' @return The result of `VCorpus(VectorSource(...))` over the per-cluster
#'   strings (presumably a tm corpus -- confirm against the loaded packages).
get_custom_cluster_corpus <- function(clusters, metadata, stops, taxonomy_separator,
                                      add_title_ngrams = TRUE, custom_clustering = NULL) {
  if (is.null(custom_clustering)) {
    # Previously this surfaced as an opaque subscript error from `[[`.
    stop("custom_clustering must name a metadata column", call. = FALSE)
  }

  # Stop words are removed in batches of 1000, as before. split() yields no
  # batches at all for an empty `stops`, where seq(1, 0, 1000) used to throw.
  batch_size <- 1000
  stop_batches <- split(stops, ceiling(seq_along(stops) / batch_size))

  # Preallocate; seq_len() iterates zero times when there are no clusters.
  subjectlist <- vector("list", clusters$num_clusters)
  for (k in seq_len(clusters$num_clusters)) {
    matches <- which(unname(clusters$groups == k) == TRUE)
    custom_input <- as.list(metadata[[custom_clustering]][matches])
    for (batch in stop_batches) {
      custom_input <- lapply(custom_input, function(x) removeWords(x, batch))
    }
    # Tighten "; " separators first so the blanks that remain are the ones
    # inside multi-word terms; those are then joined with underscores.
    custom_input <- mapply(gsub, custom_input, pattern = "; ", replacement = ";")
    custom_input <- mapply(gsub, custom_input, pattern = " ", replacement = "_")

    all_subjects <- paste(custom_input, collapse = " ")
    # Drop runs of "?" and normalise every separator to a single ";".
    all_subjects <- str_replace_all(all_subjects, "\\?+_\\?+|\\?+|\\?+ ", "")
    all_subjects <- str_replace_all(all_subjects, ";+", ";")
    all_subjects <- str_replace_all(all_subjects, " ?; ?", ";")
    all_subjects <- str_replace_all(all_subjects, " +", ";")
    subjectlist[[k]] <- all_subjects
  }
  VCorpus(VectorSource(subjectlist))
}
154+
121155
get_cluster_corpus <- function(clusters, metadata, stops, taxonomy_separator,
122-
add_title_ngrams = T) {
156+
add_title_ngrams = T, custom_clustering=NULL) {
123157
subjectlist = list()
124158
for (k in seq(1, clusters$num_clusters)) {
125159
matches = which(unname(clusters$groups == k) == TRUE)
Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
{
2-
"document_types":["121"],
2+
"document_types":["121", "7", "13", "14", "15", "16", "17", "18", "6"],
33
"from":"1665-01-01",
4-
"to":"2023-09-25",
4+
"to":"2023-12-07",
55
"sorting":"most-relevant",
66
"vis_id": "TEST_ID",
77
"min_descsize": 300,
88
"limit": 120,
9-
"list_size": 100
9+
"list_size": 100,
10+
"custom_clustering": "mesh"
1011
}

server/preprocessing/other-scripts/test/test_base.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ options(warn=1)
77
wd <- dirname(dirname(rstudioapi::getActiveDocumentContext()$path))
88
setwd(wd) #Don't forget to set your working directory
99

10-
query <- 'cond-mat.mtrl-sci' #args[2]
10+
query <- "species" #args[2]
1111
service <- "base"
1212
params <- NULL
1313
params_file <- "test/params_base.json"

0 commit comments

Comments
 (0)