@@ -88,12 +88,13 @@ def execute_search(self, params):
8888 res = raw_metadata
8989 else :
9090 metadata = pd .DataFrame (raw_metadata )
91+ metadata = self .sanitize_metadata (metadata )
9192 metadata = filter_duplicates (metadata )
9293 metadata = pd .concat ([metadata , parse_annotations_for_all (metadata , "subject_orig" )], axis = 1 )
9394 metadata = metadata .head (params .get ('list_size' ))
9495 metadata .reset_index (inplace = True , drop = True )
9596 metadata = self .enrich_metadata (metadata )
96- text = pd .concat ([metadata .id , metadata [["title" , "paper_abstract" , "subject_orig" , "published_in" , "authors " ]]
97+ text = pd .concat ([metadata .id , metadata [["title" , "paper_abstract" , "subject_orig" , "published_in" , "sanitized_authors " ]]
9798 .apply (lambda x : " " .join (x ), axis = 1 )], axis = 1 )
9899 text .columns = ["id" , "content" ]
99100 input_data = {}
@@ -108,6 +109,10 @@ def execute_search(self, params):
108109 self .logger .error (error )
109110 raise
110111
def sanitize_metadata(self, metadata):
    """Attach a ``sanitized_authors`` column computed from ``authors``.

    Every value of the ``authors`` column is passed through
    ``sanitize_authors`` and the results are written to the
    ``sanitized_authors`` column of the same object, which is then
    returned (the input is modified, not copied).
    """
    author_column = metadata["authors"]
    metadata["sanitized_authors"] = author_column.map(sanitize_authors)
    return metadata
115+
111116 def enrich_metadata (self , metadata ):
112117 metadata ["repo" ] = metadata ["content_provider" ].map (lambda x : self .content_providers .get (x , "" ))
113118 enrichment = improved_df_parsing (metadata )
@@ -331,7 +336,7 @@ def parse_annotations_for_all(metadata, field_name):
331336 parsed_annotations = pd .DataFrame (metadata [field_name ].map (lambda x : parse_annotations (x )))
332337 parsed_annotations .columns = ["annotations" ]
333338 expanded_annotations = expand_dict_columns (parsed_annotations )
334- return parsed_annotations
339+ return expanded_annotations
335340
336341# convert DataFrame with dict columns to DataFrame with columns for each dict key
337342def expand_dict_columns (df ):
@@ -347,4 +352,10 @@ def expand_dict_columns(df):
347352
def clean_up_annotations(df, field):
    """Remove annotation matches from one column of ``df``.

    Each value in ``df[field]`` has every match of ``pattern_annotations``
    (defined outside this block) replaced by the empty string and is then
    stripped of leading/trailing whitespace. The column is overwritten on
    the object that was passed in, and that same object is returned.
    """
    def _without_annotations(value):
        return pattern_annotations.sub("", value).strip()

    df[field] = df[field].map(_without_annotations)
    return df
356+
def sanitize_authors(authors, n=15):
    """Shorten a ``"; "``-separated author string to at most ``n`` names.

    When the string holds more than ``n`` names, the first ``n - 1`` names
    and the final name are kept and everything in between is dropped.
    Strings with ``n`` or fewer names are returned unchanged.

    Args:
        authors: Author names joined by ``"; "``. Non-string values (for
            example ``None`` or a float NaN standing in for a missing
            value) are treated as "no authors".
        n: Maximum number of names to keep. For ``n <= 1`` only the final
            name is kept.

    Returns:
        The shortened author string, or ``""`` for non-string input.
    """
    if not isinstance(authors, str):
        # Missing values have no .split(); return an empty string so the
        # result can still be joined with other text fields.
        return ""
    names = authors.split("; ")
    if len(names) > n:
        # Clamp the head length at 0: a negative slice bound would wrap
        # around and silently keep almost the whole list.
        head_count = max(n - 1, 0)
        names = names[:head_count] + names[-1:]
    return "; ".join(names)
0 commit comments