@@ -181,7 +181,7 @@ def dedup_data_transform_candidates(candidates):
181181 return [items [0 ] for _ , items in candidate_groups .items ()]
182182
183183
184- def get_field_summary (field_name , df , field_sample_size ):
184+ def get_field_summary (field_name , df , field_sample_size , max_val_chars = 100 ):
185185 try :
186186 values = sorted ([x for x in list (set (df [field_name ].values )) if x != None ])
187187 except :
@@ -196,11 +196,22 @@ def get_field_summary(field_name, df, field_sample_size):
196196 else :
197197 val_sample = values [:int (sample_size / 2 )] + ["..." ] + values [- (sample_size - int (sample_size / 2 )):]
198198
199- val_str = ', ' .join ([str (s ) if ',' not in str (s ) else f'"{ str (s )} "' for s in val_sample ])
def sample_val_cap(val):
    """Render one sampled value as text for the field summary line.

    The value is stringified once. Text longer than ``max_val_chars``
    (read from the enclosing function's scope) is cut down to that many
    characters and suffixed with "...". Text containing a comma is wrapped
    in double quotes so it stays distinguishable once the samples are
    joined with ", ".
    """
    text = str(val)
    if len(text) > max_val_chars:
        text = text[:max_val_chars] + "..."
    # The comma check runs on the capped text, i.e. on what is emitted.
    return f'"{text}"' if ',' in text else text
209+
210+ val_str = ', ' .join ([sample_val_cap (str (s )) for s in val_sample ])
200211
201212 return f"{ field_name } -- type: { df [field_name ].dtype } , values: { val_str } "
202213
203- def generate_data_summary (input_tables , include_data_samples = True , field_sample_size = 7 ):
214+ def generate_data_summary (input_tables , include_data_samples = True , field_sample_size = 7 , max_val_chars = 140 ):
204215
205216 input_table_names = [f'{ string_to_py_varname (t ["name" ])} ' for t in input_tables ]
206217
@@ -209,7 +220,7 @@ def generate_data_summary(input_tables, include_data_samples=True, field_sample_
209220 field_summaries = []
210221 for input_data in input_tables :
211222 df = pd .DataFrame (input_data ['rows' ])
212- s = '\n \t ' .join ([get_field_summary (fname , df , field_sample_size ) for fname in list (df .columns .values )])
223+ s = '\n \t ' .join ([get_field_summary (fname , df , field_sample_size , max_val_chars ) for fname in list (df .columns .values )])
213224 field_summaries .append (s )
214225
215226 table_field_summaries = [f'table_{ i } ({ input_table_names [i ]} ) fields:\n \t { s } ' for i , s in enumerate (field_summaries )]
0 commit comments