@@ -65,7 +65,6 @@ get_papers <- function(query, params,
6565
6666 if (! is.null(exact_query ) && exact_query != ' ' ) {
6767 base_query <- paste(paste0(" (" ,exact_query ," )" ), date_string , document_types , collapse = " " )
68- base_query <- paste(paste0(" (" ,exact_query ," )" ), date_string , document_types , collapse = " " )
6968 } else {
7069 base_query <- paste(date_string , document_types , collapse = " " )
7170 }
@@ -94,11 +93,24 @@ get_papers <- function(query, params,
9493 non_public = FALSE
9594 }
9695
96+ cc <- params $ custom_clustering
97+ if (! is.null(cc )) {
98+ if (cc %in% names(fieldmapper )) {
99+ # this is the generic case for existing metadata
100+ custom_clustering_query <- paste(fieldmapper [[cc ]], " :" , " *" , sep = " " )
101+ base_query <- paste(base_query , custom_clustering_query )
102+ } else {
103+ # this is the special case for custom clustering on annotations
104+ custom_clustering_query <- paste(" dcsubject:" , cc , " *" , sep = " " )
105+ base_query <- paste(base_query , custom_clustering_query )
106+ custom_clustering_query <- paste(' textus:' , ' "' , cc , ' :"' , sep = " " )
107+ base_query <- paste(base_query , custom_clustering_query )
108+ custom_clustering_query <- paste(cc , ' :*' , sep = " " )
109+ base_query <- paste(base_query , custom_clustering_query )
110+ }
111+ }
112+
97113 blog $ info(paste(" vis_id:" , .GlobalEnv $ VIS_ID , " BASE query:" , base_query ))
98- blog $ info(paste(" vis_id:" , .GlobalEnv $ VIS_ID , " Sort by:" , sortby_string ))
99- blog $ info(paste(" vis_id:" , .GlobalEnv $ VIS_ID , " Min descsize:" , min_descsize ))
100- blog $ info(paste(" vis_id:" , .GlobalEnv $ VIS_ID , " Target:" , repo ))
101- blog $ info(paste(" vis_id:" , .GlobalEnv $ VIS_ID , " Collection:" , coll ))
102114
103115 # execute search
104116 offset = 0
@@ -120,9 +132,15 @@ get_papers <- function(query, params,
120132 metadata <- sanitize_abstract(metadata )
121133 metadata <- mark_duplicates(metadata )
122134 metadata $ has_dataset <- unlist(lapply(metadata $ resulttype , function (x ) " Dataset" %in% x ))
123- req_limit <- 9
124135
136+ req_limit <- 9
125137 r <- 0
138+ # check if custom clustering annotation param is in metadata
139+ if (! is.null(cc )) {
140+ if (! (cc %in% names(fieldmapper ))) {
141+ has_custom_clustering_annotation <- unlist(lapply(metadata $ subject_orig , function (x ) grepl(paste0(cc , " :" ), x , fixed = TRUE )))
142+ metadata <- metadata [has_custom_clustering_annotation ,]
143+ }}
126144 while (nrow(metadata ) - sum(metadata $ is_duplicate ) < limit && attr(res_raw , " numFound" ) > offset + 120 && r < req_limit ) {
127145 offset <- offset + 120
128146 res_raw <- get_raw_data(limit ,
@@ -141,17 +159,28 @@ get_papers <- function(query, params,
141159 metadata <- sanitize_abstract(metadata )
142160 metadata <- mark_duplicates(metadata )
143161 metadata $ has_dataset <- unlist(lapply(metadata $ resulttype , function (x ) " Dataset" %in% x ))
162+ # check if custom clustering annotation param is in metadata
163+ if (! is.null(cc )) {
164+ if (! (cc %in% names(fieldmapper ))) {
165+ has_custom_clustering_annotation <- unlist(lapply(metadata $ subject_orig , function (x ) grepl(paste0(cc , " :" ), x , fixed = TRUE )))
166+ metadata <- metadata [has_custom_clustering_annotation ,]
167+ }}
144168 r <- r + 1
145169 }
170+ # check if custom clustering annotation param is in metadata
171+ if (! is.null(cc )) {
172+ if (! (cc %in% names(fieldmapper ))) {
173+ has_custom_clustering_annotation <- unlist(lapply(metadata $ subject_orig , function (x ) grepl(paste0(cc , " :" ), x , fixed = TRUE )))
174+ metadata <- metadata [has_custom_clustering_annotation ,]
175+ }}
146176 blog $ info(paste(" vis_id:" , .GlobalEnv $ VIS_ID , " Deduplication retrieval requests:" , r ))
147177
148178 metadata <- unique(metadata , by = " id" )
149- text = data.frame (matrix (nrow = length(metadata $ id )))
150- text $ id = metadata $ id
151- # Add all keywords, including classification to text
152- text $ content = paste(metadata $ title , metadata $ paper_abstract ,
153- metadata $ subject_orig , metadata $ published_in , metadata $ authors ,
154- sep = " " )
179+ # Add all keywords, including classification to text content for clustering
180+ text <- data.frame (id = metadata $ id ,
181+ content = paste(metadata $ title , metadata $ paper_abstract ,
182+ metadata $ subject_orig , metadata $ published_in , metadata $ authors ,
183+ sep = " " ))
155184
156185
157186 input_data = list (" metadata" = metadata , " text" = text )
@@ -228,13 +257,13 @@ etl <- function(res, repo, non_public) {
228257 metadata $ url = metadata $ id
229258 metadata $ relevance = c(nrow(metadata ): 1 )
230259 metadata $ resulttype = lapply(res $ dctypenorm , decode_dctypenorm )
231- metadata $ dctype = check_metadata(res $ dctype )
232- metadata $ dctypenorm = check_metadata(res $ dctypenorm )
260+ metadata $ type = check_metadata(res $ dctype )
261+ metadata $ typenorm = check_metadata(res $ dctypenorm )
233262 metadata $ doi = unlist(lapply(metadata $ link , find_dois ))
234- metadata $ dclang = check_metadata(res $ dclang )
235- metadata $ dclanguage = check_metadata(res $ dclanguage )
263+ metadata $ lang = check_metadata(res $ dclang )
264+ metadata $ language = check_metadata(res $ dclanguage )
236265 metadata $ content_provider = check_metadata(res $ dcprovider )
237- metadata $ dccoverage = check_metadata(res $ dccoverage )
266+ metadata $ coverage = check_metadata(res $ dccoverage )
238267 if (repo == " fttriple" && non_public == TRUE ) {
239268 metadata $ content_provider <- " GoTriple"
240269 }
@@ -339,3 +368,26 @@ dctypenorm_decoder <- list(
339368 " 183" = " Thesis: doctoral and postdoctoral" ,
340369 " 182" = " Thesis: master"
341370)
371+
# fieldmapper: named list of single strings. Each name is a metadata column
# name; each value is the matching search-index field name (mostly the
# "dc"-prefixed fields that etl() reads from the raw result, e.g. type ->
# dctype, lang -> dclang, coverage -> dccoverage).
# get_papers() looks up params$custom_clustering in names(fieldmapper); on a
# hit it appends a "<field>:*" wildcard clause for the mapped field to the base
# query, otherwise it treats the value as an annotation prefix instead.
# Note: "resulttype" and "typenorm" both map to "dctypenorm".
# NOTE(review): presumably the values must match the field names accepted by
# the search backend's query syntax; verify before adding new entries.
372+ fieldmapper <- list (
373+ " relation" = " dcrelation" ,
374+ " identifier" = " identifier" ,
375+ " title" = " dctitle" ,
376+ " paper_abstract" = " dcdescription" ,
377+ " published_in" = " dcsource" ,
378+ " year" = " dcdate" ,
379+ " subject" = " dcsubject" ,
380+ " authors" = " dccreator" ,
381+ " link" = " dclink" ,
382+ " oa_state" = " dcoa" ,
383+ " url" = " dcdocid" ,
384+ " relevance" = " relevance" ,
385+ " resulttype" = " dctypenorm" ,
386+ " type" = " dctype" ,
387+ " typenorm" = " dctypenorm" ,
388+ " doi" = " doi" ,
389+ " lang" = " dclang" ,
390+ " language" = " dclanguage" ,
391+ " content_provider" = " dcprovider" ,
392+ " coverage" = " dccoverage"
393+ )
0 commit comments