@@ -188,19 +188,34 @@ etl <- function(res, repo, non_public) {
188188 subject_cleaned = gsub(" [^\\ (;]+\\ (all\\ )(;|$)?" , " " , subject_cleaned ) # remove general subjects, i.e. entries ending in "(all)"
189189 subject_cleaned = gsub(" [^:;]+ ?:: ?[^;]+(;|$)?" , " " , subject_cleaned ) # remove classification with separator ::
190190 subject_cleaned = gsub(" [^\\ [;]+\\ [[A-Z,0-9]+\\ ](;|$)?" , " " , subject_cleaned ) # remove WHO classification (term followed by a bracketed code of uppercase letters, digits and commas)
191- subject_cleaned = gsub(" ([A-Za-z]+:[A-Za-z0-9][A-Za-z0-9 \\ /\\ .]+);?" , " " , subject_cleaned , perl = TRUE ) # clean up annotations with prefix e.g. theme:annotation
191+ subject_cleaned = gsub(" Info:\\ w+-(\\ w+\\ /)+" , " " , subject_cleaned ) # remove prefixes like Info:eu-repo/classification/ (must run before the generic prefix clean-up below, which would otherwise consume them)
192+ subject_cleaned = gsub(" ([A-Za-z]+:[A-Za-z0-9 \\ /\\ .-]+);?" , " " , subject_cleaned , perl = TRUE ) # clean up annotations with prefix e.g. theme:annotation
193+ if (! is.null(params $ vis_type ) && params $ vis_type == " timeline" ) { # timeline: strip only the classification tags/short codes and keep the names
194+ subject_cleaned = gsub(" FOS " , " " , subject_cleaned ) # remove FOS classification tag, but keep classification name
195+ arxiv_classification_string = " (cs|econ|eess|math|astro-ph|nlin|q-bio|q-fin|stat)\\ .[A-Z]{2}|cond-mat\\ .[a-z\\ -]+|hep-(ex|lat|ph|th)|math-ph|nucl-(ex|th)|physics\\ .[a-z\\ -]+|(astro-ph|gr-qc|quant-ph|cond-mat)" # arXiv category short codes, e.g. cs.AI, hep-th, cond-mat.str-el
196+ subject_cleaned = gsub(arxiv_classification_string , " " , subject_cleaned , perl = TRUE ) # remove arXiv classification short code, but keep classification name
197+ } else {
198+ subject_cleaned = gsub(" FOS [A-Za-z ]+" , " " , subject_cleaned ) # remove FOS classifications (Fields of Science and Technology), tag and name
199+ arxiv_classification_string = " (([A-Za-z ]+ )?cond-mat\\ .[a-z\\ -]+)|([\\ w ]+ )?(cs|econ|eess|math|astro-ph|nlin|q-bio|q-fin|stat)\\ .[A-Z]{2}|cond-mat\\ .[a-z\\ -]+|hep-(ex|lat|ph|th)|math-ph|nucl-(ex|th)|physics\\ .[a-z\\ -]+|([\\ w ]+ )(astro-ph|gr-qc|quant-ph|cond-mat)" # arXiv short codes; several alternatives also capture the words preceding the code
200+ subject_cleaned = gsub(arxiv_classification_string , " " , subject_cleaned , perl = TRUE ) # remove arXiv classification (name and short code); NOTE(review): the original comment said "except on streamgraphs" while the condition tests vis_type "timeline" - presumably the same visualization, confirm
201+ }
202+ subject_cleaned = gsub(" ([A-Za-z]+:[A-Za-z0-9 \\ /\\ .]+);?" , " " , subject_cleaned , perl = TRUE ) # clean up annotations with prefix e.g. theme:annotation; NOTE(review): near-duplicate of the prefix clean-up before the if/else (character class lacks "-") - confirm whether this second pass is intended
192203 subject_cleaned = gsub(" (wikidata)?\\ .org/entity/[qQ]([\\ d]+)?" , " " , subject_cleaned ) # remove wikidata classification
193204 subject_cleaned = gsub(" </keyword><keyword>" , " " , subject_cleaned ) # remove </keyword><keyword>
194205 subject_cleaned = gsub(" \\ [No keyword\\ ]" , " " , subject_cleaned ) # remove the "[No keyword]" placeholder
195206 subject_cleaned = gsub(" \\ [[^\\ []+\\ ][^\\ ;]+(;|$)?" , " " , subject_cleaned ) # remove classification: bracketed tag plus the text following it up to the next ";"
196207 subject_cleaned = gsub(" [0-9]{2,} [A-Z]+[^;]*(;|$)?" , " " , subject_cleaned ) # remove classification: numeric code (2+ digits) followed by uppercase text, up to the next ";"
197208 subject_cleaned = gsub(" -- " , " ; " , subject_cleaned ) # replace inconsistent keyword separation
209+ subject_cleaned = gsub(" [-]{2,}" , " ; " , subject_cleaned ) # replace runs of two or more hyphens with the keyword separator
210+ subject_cleaned = gsub(" [A-Z]\\ .\\ d\\ .\\ d+" , " " , subject_cleaned ) # remove codes of the form letter.digit.digits, e.g. H.3.3 (presumably ACM classification codes - confirm)
198211 subject_cleaned = gsub(" \\ ( " , " ; " , subject_cleaned ) # replace inconsistent keyword separation
199212 subject_cleaned = gsub(" (\\ w* \\ w*(\\ .)( \\ w* \\ w*)?)" , " ; " , subject_cleaned ) # remove overly broad keywords separated by .
200213 subject_cleaned = gsub(" \\ . " , " ; " , subject_cleaned ) # replace inconsistent keyword separation
201214 subject_cleaned = gsub(" ?\\ d[:?-?]?(\\ d+.)+" , " " , subject_cleaned ) # replace residuals like 5:621.313.323 or '5-76.95'
202- subject_cleaned = gsub(" \\ w+: \\ w+-( \\ w+ \\ /)+ " , " " , subject_cleaned ) # replace residuals like Info:eu-repo/classification/
215+ subject_cleaned = gsub(" : " , " " , subject_cleaned ) # remove colons left over from the removals above
203216 subject_cleaned = gsub(" ^; $" , " " , subject_cleaned ) # clean up keyword separation
217+ subject_cleaned = gsub(" ;+" , " ;" , subject_cleaned ) # collapse repeated semicolons into one
218+ subject_cleaned = gsub(" ,+" , " ," , subject_cleaned ) # collapse repeated commas into one
204219 subject_cleaned = gsub(" ," , " , " , subject_cleaned ) # clean up keyword separation
205220 subject_cleaned = gsub(" \\ s+" , " " , subject_cleaned ) # collapse runs of whitespace
206221 subject_cleaned = stringi :: stri_trim(subject_cleaned ) # trim leading and trailing whitespace
0 commit comments