1111import numpy as np
1212from parsers import improved_df_parsing
1313
14+ from datetime import datetime
15+ import sys
16+
1417formatter = logging .Formatter (fmt = '%(asctime)s %(levelname)-8s %(message)s' ,
1518 datefmt = '%Y-%m-%d %H:%M:%S' )
1619
@@ -54,7 +57,7 @@ def base_rate_limit_reached(self):
5457 BASE demands one request per second (1 QPS), per
5558 https://www.base-search.net/about/download/base_interface.pdf
5659 """
57-
60+
5861 t = self .redis_store .time ()[0 ]
5962 self .redis_store .setnx (self .rate_key , 0 )
6063 try :
@@ -122,7 +125,10 @@ def execute_search(self, params):
122125 raise
123126
124127 def sanitize_metadata (self , metadata ):
128+
125129 metadata ["sanitized_authors" ] = metadata ["authors" ].map (lambda x : sanitize_authors (x ))
130+ metadata ["year" ] = metadata ["year" ].map (lambda x : sanitize_year (x ))
131+
126132 return metadata
127133
128134 def enrich_metadata (self , metadata ):
@@ -166,7 +172,7 @@ def run(self):
166172 res = self .execute_search (params )
167173 res ["id" ] = k
168174 if res .get ("status" ) == "error" or params .get ('raw' ) is True :
169- self .redis_store .set (k + "_output" , json .dumps (res ))
175+ self .redis_store .set (k + "_output" , json .dumps (res ))
170176 else :
171177 self .redis_store .rpush ("input_data" , json .dumps (res ).encode ('utf8' ))
172178 q_len = self .redis_store .llen ("input_data" )
@@ -195,7 +201,7 @@ def find_version_in_doi(doi):
195201 return int (m [0 ])
196202 else :
197203 return None
198-
204+
def extract_doi_suffix(doi):
    """Return the ``/``-separated components of *doi* from the fifth one on.

    Yields an empty list when the DOI has four or fewer components.
    """
    components = doi.split("/")
    return components[4:]
201207
@@ -248,7 +254,8 @@ def add_false_negatives(df):
248254 df .loc [df [(~ df .is_duplicate ) & (df .doi_duplicate )].index , "is_duplicate" ] = True
249255 return df
250256
def find_duplicate_indexes(df):
    """Return, for one representative row per duplicate group, the index of
    all rows whose ``duplicates`` string mentions that row's id.

    NOTE(review): ``str.contains`` treats the id as a regex — assumes ids
    contain no regex metacharacters; confirm upstream.
    """
    # For every id, collect the index of all rows listing it as a duplicate.
    id_to_group = df.id.map(
        lambda row_id: df[df.duplicates.str.contains(row_id)].index
    )
    # Stringify the index objects so identical groups collapse to one row.
    representatives = pd.DataFrame(id_to_group).astype(str).drop_duplicates().index
    return id_to_group[representatives]
@@ -269,7 +276,8 @@ def mark_latest_doi(df, dupind):
269276 df .loc [latest .index , "is_latest" ] = True
270277 df .loc [latest .index , "keep" ] = True
271278 return df
272-
279+
280+
273281def remove_textual_duplicates_from_different_sources (df , dupind ):
274282 for _ , idx in dupind .iteritems ():
275283 if len (idx ) > 1 :
@@ -348,7 +356,8 @@ def parse_annotations(field):
348356 return {}
349357 else :
350358 return {}
351-
359+
360+
352361def parse_annotations_for_all (metadata , field_name ):
353362 parsed_annotations = pd .DataFrame (metadata [field_name ].map (lambda x : parse_annotations (x )))
354363 parsed_annotations .columns = ["annotations" ]
@@ -363,4 +372,24 @@ def sanitize_authors(authors, n=15):
363372 authors = authors .split ("; " )
364373 if len (authors ) > n :
365374 authors = authors [:n - 1 ] + authors [- 1 :]
366- return "; " .join (authors )
375+ return "; " .join (authors )
376+
377+
def sanitize_year(year_str):
    """Validate a raw year/date string, returning it unchanged if recognized.

    Accepted inputs are bare year strings ("2019") or dates in any of the
    formats listed below.  Anything else — including non-string values such
    as the float NaN cells pandas produces for missing data — yields ``''``.

    Parameters
    ----------
    year_str : str
        Raw value from the metadata ``year`` column.

    Returns
    -------
    str
        The original string when it parses, ``''`` otherwise.
    """
    # Robustness fix: this function is mapped over a DataFrame column, where
    # missing cells arrive as float NaN; the original raised TypeError /
    # AttributeError on any non-string and aborted the whole map pass.
    if not isinstance(year_str, str):
        return ''

    # "%Y %b %d" uses the locale's abbreviated month name (e.g. "Jan").
    date_formats = ("%Y-%m-%d", "%Y-%m", "%Y-%m-%dT%H:%M:%SZ", "%Y %b %d")
    for fmt in date_formats:
        try:
            datetime.strptime(year_str, fmt)
            return year_str  # recognized: keep the original string
        except ValueError:
            continue

    # Handle bare-year strings like "2019".
    if year_str.isdigit():
        return year_str

    return ''
0 commit comments