Skip to content

Commit 3b2243c

Browse files
committed
various bugfixes
1 parent e042e31 commit 3b2243c

1 file changed

Lines changed: 9 additions & 3 deletions

File tree

server/workers/dataprocessing/src/streamgraph.py

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -26,8 +26,12 @@ def __init__(self, loglevel="INFO"):
2626

2727
def tokenize(self, s):
2828
#return re.split("; | - |, |: ", s)
29-
s = re.sub(r"[\(\)]", "", s)
30-
return re.split("; ", s)
29+
t = re.sub(r"[\(\)]", "", s)
30+
t = re.split("; ", t)
31+
t = [s for s in t]
32+
t = [s.replace(";", "") for s in t]
33+
t = [s.strip() for s in t]
34+
return t
3135

3236
def get_streamgraph_data(self, metadata, query, n=12, method="count"):
3337
metadata = pd.DataFrame.from_records(metadata)
@@ -36,7 +40,7 @@ def get_streamgraph_data(self, metadata, query, n=12, method="count"):
3640
df.dropna(axis=0, subset=["year"], inplace=True)
3741
df.year = pd.to_datetime(df.year.map(lambda x: x.replace(month=1, day=1).strftime('%Y-%m-%d')))
3842
df = df[df.subject.map(lambda x: x is not None)]
39-
df.subject = df.subject.map(lambda x: [s.lower() for s in self.tokenize(x)] if isinstance(x, str) else "")
43+
df.subject = df.subject.map(lambda x: self.tokenize(x.lower()) if isinstance(x, str) else [])
4044
df = df[df.subject.map(lambda x: x != [])]
4145
df["boundary_label"] = df.year
4246
df = df.explode('subject')
@@ -177,6 +181,8 @@ def reduce_daterange(self, daterange, df):
177181
# 5% which is chosen here is an arbitrary value, could also be higher 10% or lower
178182
min_value = int(yearly_sums.sum() * 0.05)
179183
start_index = yearly_sums_cum[yearly_sums_cum > min_value].index[0]
184+
self.logger.debug(f"Start index: {start_index}")
185+
self.logger.debug(f"Start year: {x[start_index]}")
180186
df.y = df.y.map(lambda x: x[start_index:])
181187
df.ids_timestep = df.ids_timestep.map(lambda x: x[start_index:])
182188
x = x[start_index:]

0 commit comments

Comments (0)