1111import numpy as np
1212from parsers import improved_df_parsing
1313
14+ from datetime import datetime
15+ import sys
16+
1417formatter = logging .Formatter (fmt = '%(asctime)s %(levelname)-8s %(message)s' ,
1518 datefmt = '%Y-%m-%d %H:%M:%S' )
1619
@@ -54,7 +57,7 @@ def base_rate_limit_reached(self):
5457 BASE demands one request per second (1 QPS), per
5558 https://www.base-search.net/about/download/base_interface.pdf
5659 """
57-
60+
5861 t = self .redis_store .time ()[0 ]
5962 self .redis_store .setnx (self .rate_key , 0 )
6063 try :
@@ -122,7 +125,10 @@ def execute_search(self, params):
122125 raise
123126
124127 def sanitize_metadata (self , metadata ):
128+
125129 metadata ["sanitized_authors" ] = metadata ["authors" ].map (lambda x : sanitize_authors (x ))
130+ metadata ["year" ] = metadata ["year" ].map (lambda x : sanitize_year (x ))
131+
126132 return metadata
127133
128134 def enrich_metadata (self , metadata ):
@@ -166,7 +172,7 @@ def run(self):
166172 res = self .execute_search (params )
167173 res ["id" ] = k
168174 if res .get ("status" ) == "error" or params .get ('raw' ) is True :
169- self .redis_store .set (k + "_output" , json .dumps (res ))
175+ self .redis_store .set (k + "_output" , json .dumps (res ))
170176 else :
171177 self .redis_store .rpush ("input_data" , json .dumps (res ).encode ('utf8' ))
172178 q_len = self .redis_store .llen ("input_data" )
@@ -195,7 +201,7 @@ def find_version_in_doi(doi):
195201 return int (m [0 ])
196202 else :
197203 return None
198-
204+
def extract_doi_suffix(doi):
    """Return the ``/``-separated components of *doi* from the fifth one on.

    Yields an empty list when the DOI has four or fewer components.
    """
    components = doi.split("/")
    return components[4:]
201207
@@ -248,7 +254,8 @@ def add_false_negatives(df):
248254 df .loc [df [(~ df .is_duplicate ) & (df .doi_duplicate )].index , "is_duplicate" ] = True
249255 return df
250256
def find_duplicate_indexes(df):
    """Return, for one representative row per duplicate group, the index of
    all rows whose ``duplicates`` string mentions that row's id.

    NOTE(review): ``str.contains`` treats the id as a regex — assumes ids
    contain no regex metacharacters; confirm upstream.
    """
    # For every id, collect the index of all rows listing it as a duplicate.
    id_to_group = df.id.map(
        lambda row_id: df[df.duplicates.str.contains(row_id)].index
    )
    # Stringify the index objects so identical groups collapse to one row.
    representatives = pd.DataFrame(id_to_group).astype(str).drop_duplicates().index
    return id_to_group[representatives]
@@ -269,7 +276,8 @@ def mark_latest_doi(df, dupind):
269276 df .loc [latest .index , "is_latest" ] = True
270277 df .loc [latest .index , "keep" ] = True
271278 return df
272-
279+
280+
273281def remove_textual_duplicates_from_different_sources (df , dupind ):
274282 for _ , idx in dupind .iteritems ():
275283 if len (idx ) > 1 :
@@ -348,7 +356,8 @@ def parse_annotations(field):
348356 return {}
349357 else :
350358 return {}
351-
359+
360+
352361def parse_annotations_for_all (metadata , field_name ):
353362 parsed_annotations = pd .DataFrame (metadata [field_name ].map (lambda x : parse_annotations (x )))
354363 parsed_annotations .columns = ["annotations" ]
@@ -363,4 +372,24 @@ def sanitize_authors(authors, n=15):
363372 authors = authors .split ("; " )
364373 if len (authors ) > n :
365374 authors = authors [:n - 1 ] + authors [- 1 :]
366- return "; " .join (authors )
375+ return "; " .join (authors )
376+
377+
def sanitize_year(year_str):
    """Validate a raw year/date string, returning it unchanged if recognized.

    Accepted inputs are bare year strings ("2019") or dates in any of the
    formats listed below.  Anything else — including non-string values such
    as the float NaN cells pandas produces for missing data — yields ``''``.

    Parameters
    ----------
    year_str : str
        Raw value from the metadata ``year`` column.

    Returns
    -------
    str
        The original string when it parses, ``''`` otherwise.
    """
    # Robustness fix: this function is mapped over a DataFrame column, where
    # missing cells arrive as float NaN; the original raised TypeError /
    # AttributeError on any non-string and aborted the whole map pass.
    if not isinstance(year_str, str):
        return ''

    # "%Y %b %d" uses the locale's abbreviated month name (e.g. "Jan").
    date_formats = ("%Y-%m-%d", "%Y-%m", "%Y-%m-%dT%H:%M:%SZ", "%Y %b %d")
    for fmt in date_formats:
        try:
            datetime.strptime(year_str, fmt)
            return year_str  # recognized: keep the original string
        except ValueError:
            continue

    # Handle bare-year strings like "2019".
    if year_str.isdigit():
        return year_str

    return ''
0 commit comments