1111import time
1212import numpy as np
1313
14+ from datetime import datetime
15+ import sys
16+
# Shared log-record formatter: ISO-style timestamp, fixed-width level name.
formatter = logging.Formatter(
    fmt='%(asctime)s %(levelname)-8s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)
1619
@@ -54,7 +57,7 @@ def base_rate_limit_reached(self):
5457 BASE demands one request per second (1 QPS), per
5558 https://www.base-search.net/about/download/base_interface.pdf
5659 """
57-
60+
5861 t = self .redis_store .time ()[0 ]
5962 self .redis_store .setnx (self .rate_key , 0 )
6063 try :
@@ -110,7 +113,10 @@ def execute_search(self, params):
110113 raise
111114
112115 def sanitize_metadata (self , metadata ):
116+
113117 metadata ["sanitized_authors" ] = metadata ["authors" ].map (lambda x : sanitize_authors (x ))
118+ metadata ["year" ] = metadata ["year" ].map (lambda x : sanitize_year (x ))
119+
114120 return metadata
115121
116122 def enrich_metadata (self , metadata ):
@@ -154,7 +160,7 @@ def run(self):
154160 res = self .execute_search (params )
155161 res ["id" ] = k
156162 if res .get ("status" ) == "error" or params .get ('raw' ) is True :
157- self .redis_store .set (k + "_output" , json .dumps (res ))
163+ self .redis_store .set (k + "_output" , json .dumps (res ))
158164 else :
159165 self .redis_store .rpush ("input_data" , json .dumps (res ).encode ('utf8' ))
160166 q_len = self .redis_store .llen ("input_data" )
@@ -183,7 +189,7 @@ def find_version_in_doi(doi):
183189 return int (m [0 ])
184190 else :
185191 return None
186-
192+
def extract_doi_suffix(doi):
    """Return the '/'-separated components of `doi` after the first four.

    NOTE(review): assumes the DOI prefix occupies the first four segments —
    confirm against the DOI strings actually ingested.
    """
    segments = doi.split("/")
    return segments[4:]
189195
@@ -236,7 +242,8 @@ def add_false_negatives(df):
236242 df .loc [df [(~ df .is_duplicate ) & (df .doi_duplicate )].index , "is_duplicate" ] = True
237243 return df
238244
def find_duplicate_indexes(df):
    """For every record id, collect the row indexes whose `duplicates` field
    mentions that id, then drop entries whose index sets repeat.

    Returns a Series (indexed like `df`) of pandas Index objects.

    NOTE(review): `str.contains` interprets the id as a regex pattern —
    confirm ids never contain regex metacharacters (e.g. '.' in DOIs).
    """
    per_id_matches = df.id.map(
        lambda rec_id: df[df.duplicates.str.contains(rec_id)].index
    )
    # Deduplicate via string representation: keep only the first occurrence
    # of each distinct index set.
    unique_rows = pd.DataFrame(per_id_matches).astype(str).drop_duplicates().index
    return per_id_matches[unique_rows]
@@ -257,7 +264,8 @@ def mark_latest_doi(df, dupind):
257264 df .loc [latest .index , "is_latest" ] = True
258265 df .loc [latest .index , "keep" ] = True
259266 return df
260-
267+
268+
261269def remove_textual_duplicates_from_different_sources (df , dupind ):
262270 for _ , idx in dupind .iteritems ():
263271 if len (idx ) > 1 :
@@ -331,7 +339,8 @@ def parse_annotations(field):
331339 return annotations .to_dict ("list" )
332340 else :
333341 return {}
334-
342+
343+
335344def parse_annotations_for_all (metadata , field_name ):
336345 parsed_annotations = pd .DataFrame (metadata [field_name ].map (lambda x : parse_annotations (x )))
337346 parsed_annotations .columns = ["annotations" ]
@@ -344,7 +353,7 @@ def expand_dict_columns(df):
344353 unique_annotation_keys = set ().union (* unique_annotation_keys .to_list ())
345354 if len (unique_annotation_keys ) > 0 :
346355 for c in df .columns :
347- if type (df [c ].iloc [0 ]) is dict :
356+ if type (df [c ].iloc [0 ]) is dict :
348357 for uk in unique_annotation_keys :
349358 df [c + "_" + uk ] = df [c ].map (lambda x : x [uk ] if uk in x .keys () else [])
350359 df [c + "_" + uk ] = df [c + "_" + uk ].map (lambda x : [uk for uk in x if uk is not np .nan ])
@@ -358,4 +367,24 @@ def sanitize_authors(authors, n=15):
358367 authors = authors .split ("; " )
359368 if len (authors ) > n :
360369 authors = authors [:n - 1 ] + authors [- 1 :]
361- return "; " .join (authors )
370+ return "; " .join (authors )
371+
372+
def sanitize_year(year_str):
    """Validate a year/date string, returning it unchanged or ''.

    Accepts strings matching "%Y-%m-%d" or "%Y-%m-%dT%H:%M:%SZ", and plain
    digit strings such as "2019". Any other value yields ''.

    Parameters
    ----------
    year_str : str
        Candidate year or date string.

    Returns
    -------
    str
        `year_str` if it is an accepted format, otherwise ''.
    """
    for fmt in ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%SZ"):
        try:
            # Parse only to validate; the original string is what we keep.
            datetime.strptime(year_str, fmt)
        except ValueError:
            continue
        return year_str
    # Bare-year strings like "2019" are accepted as-is.
    if year_str.isdigit():
        return year_str
    return ''
0 commit comments