@@ -26,8 +26,12 @@ def __init__(self, loglevel="INFO"):
2626
def tokenize(self, s):
    """Split a subject string into a list of cleaned keyword tokens.

    Parentheses are removed from the input, the text is split on the
    separator "; " (semicolon followed by a space), any semicolon left
    inside a piece is deleted, and surrounding whitespace is stripped.
    Commas, dashes and colons are not separators and stay inside tokens.

    Args:
        s: The raw subject string.

    Returns:
        A list of non-empty token strings in their original order. An
        empty or separator-only input yields an empty list.
    """
    without_parens = re.sub(r"[\(\)]", "", s)
    tokens = []
    for piece in without_parens.split("; "):
        # A ";" that is not followed by a space (e.g. a trailing one) is
        # not a separator; it is simply removed from the token.
        cleaned = piece.replace(";", "").strip()
        # Skip empty tokens so that blank subjects do not survive as ""
        # entries in the returned list.
        if cleaned:
            tokens.append(cleaned)
    return tokens
3135
3236 def get_streamgraph_data (self , metadata , query , n = 12 , method = "count" ):
3337 metadata = pd .DataFrame .from_records (metadata )
@@ -36,7 +40,7 @@ def get_streamgraph_data(self, metadata, query, n=12, method="count"):
3640 df .dropna (axis = 0 , subset = ["year" ], inplace = True )
3741 df .year = pd .to_datetime (df .year .map (lambda x : x .replace (month = 1 , day = 1 ).strftime ('%Y-%m-%d' )))
3842 df = df [df .subject .map (lambda x : x is not None )]
39- df .subject = df .subject .map (lambda x : [ s . lower () for s in self .tokenize (x )] if isinstance (x , str ) else "" )
43+ df .subject = df .subject .map (lambda x : self .tokenize (x . lower ()) if isinstance (x , str ) else [] )
4044 df = df [df .subject .map (lambda x : x != [])]
4145 df ["boundary_label" ] = df .year
4246 df = df .explode ('subject' )
@@ -177,6 +181,8 @@ def reduce_daterange(self, daterange, df):
177181 # 5% which is chosen here is an arbitrary value, could also be higher 10% or lower
178182 min_value = int (yearly_sums .sum () * 0.05 )
179183 start_index = yearly_sums_cum [yearly_sums_cum > min_value ].index [0 ]
184+ self .logger .debug (f"Start index: { start_index } " )
185+ self .logger .debug (f"Start year: { x [start_index ]} " )
180186 df .y = df .y .map (lambda x : x [start_index :])
181187 df .ids_timestep = df .ids_timestep .map (lambda x : x [start_index :])
182188 x = x [start_index :]
0 commit comments