1111import time
1212import numpy as np
1313
14+ from datetime import datetime
15+ import sys
16+
# Shared log-record formatter: ISO-style timestamp, fixed-width level name.
formatter = logging.Formatter(
    fmt='%(asctime)s %(levelname)-8s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)
1619
@@ -54,7 +57,7 @@ def base_rate_limit_reached(self):
5457 BASE demands one request per second (1 QPS), per
5558 https://www.base-search.net/about/download/base_interface.pdf
5659 """
57-
60+
5861 t = self .redis_store .time ()[0 ]
5962 self .redis_store .setnx (self .rate_key , 0 )
6063 try :
@@ -110,7 +113,10 @@ def execute_search(self, params):
110113 raise
111114
112115 def sanitize_metadata (self , metadata ):
116+
113117 metadata ["sanitized_authors" ] = metadata ["authors" ].map (lambda x : sanitize_authors (x ))
118+ metadata ["year" ] = metadata ["year" ].map (lambda x : sanitize_year (x ))
119+
114120 return metadata
115121
116122 def enrich_metadata (self , metadata ):
@@ -154,7 +160,7 @@ def run(self):
154160 res = self .execute_search (params )
155161 res ["id" ] = k
156162 if res .get ("status" ) == "error" or params .get ('raw' ) is True :
157- self .redis_store .set (k + "_output" , json .dumps (res ))
163+ self .redis_store .set (k + "_output" , json .dumps (res ))
158164 else :
159165 self .redis_store .rpush ("input_data" , json .dumps (res ).encode ('utf8' ))
160166 q_len = self .redis_store .llen ("input_data" )
@@ -183,7 +189,7 @@ def find_version_in_doi(doi):
183189 return int (m [0 ])
184190 else :
185191 return None
186-
192+
def extract_doi_suffix(doi):
    """Return the '/'-separated components of `doi` after the first four.

    NOTE(review): assumes the DOI prefix occupies the first four segments —
    confirm against the DOI strings actually ingested.
    """
    segments = doi.split("/")
    return segments[4:]
189195
@@ -236,7 +242,8 @@ def add_false_negatives(df):
236242 df .loc [df [(~ df .is_duplicate ) & (df .doi_duplicate )].index , "is_duplicate" ] = True
237243 return df
238244
def find_duplicate_indexes(df):
    """For every record id, collect the row indexes whose `duplicates` field
    mentions that id, then drop entries whose index sets repeat.

    Returns a Series (indexed like `df`) of pandas Index objects.

    NOTE(review): `str.contains` interprets the id as a regex pattern —
    confirm ids never contain regex metacharacters (e.g. '.' in DOIs).
    """
    per_id_matches = df.id.map(
        lambda rec_id: df[df.duplicates.str.contains(rec_id)].index
    )
    # Deduplicate via string representation: keep only the first occurrence
    # of each distinct index set.
    unique_rows = pd.DataFrame(per_id_matches).astype(str).drop_duplicates().index
    return per_id_matches[unique_rows]
@@ -257,7 +264,8 @@ def mark_latest_doi(df, dupind):
257264 df .loc [latest .index , "is_latest" ] = True
258265 df .loc [latest .index , "keep" ] = True
259266 return df
260-
267+
268+
261269def remove_textual_duplicates_from_different_sources (df , dupind ):
262270 for _ , idx in dupind .iteritems ():
263271 if len (idx ) > 1 :
@@ -331,7 +339,8 @@ def parse_annotations(field):
331339 return annotations .to_dict ("list" )
332340 else :
333341 return {}
334-
342+
343+
335344def parse_annotations_for_all (metadata , field_name ):
336345 parsed_annotations = pd .DataFrame (metadata [field_name ].map (lambda x : parse_annotations (x )))
337346 parsed_annotations .columns = ["annotations" ]
@@ -344,7 +353,7 @@ def expand_dict_columns(df):
344353 unique_annotation_keys = set ().union (* unique_annotation_keys .to_list ())
345354 if len (unique_annotation_keys ) > 0 :
346355 for c in df .columns :
347- if type (df [c ].iloc [0 ]) is dict :
356+ if type (df [c ].iloc [0 ]) is dict :
348357 for uk in unique_annotation_keys :
349358 df [c + "_" + uk ] = df [c ].map (lambda x : x [uk ] if uk in x .keys () else [])
350359 df [c + "_" + uk ] = df [c + "_" + uk ].map (lambda x : [uk for uk in x if uk is not np .nan ])
@@ -358,4 +367,24 @@ def sanitize_authors(authors, n=15):
358367 authors = authors .split ("; " )
359368 if len (authors ) > n :
360369 authors = authors [:n - 1 ] + authors [- 1 :]
361- return "; " .join (authors )
370+ return "; " .join (authors )
371+
372+
def sanitize_year(year_str):
    """Validate a year/date string, returning it unchanged or ''.

    Accepts strings matching "%Y-%m-%d" or "%Y-%m-%dT%H:%M:%SZ", and plain
    digit strings such as "2019". Any other value yields ''.

    Parameters
    ----------
    year_str : str
        Candidate year or date string.

    Returns
    -------
    str
        `year_str` if it is an accepted format, otherwise ''.
    """
    for fmt in ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%SZ"):
        try:
            # Parse only to validate; the original string is what we keep.
            datetime.strptime(year_str, fmt)
        except ValueError:
            continue
        return year_str
    # Bare-year strings like "2019" are accepted as-is.
    if year_str.isdigit():
        return year_str
    return ''
0 commit comments