
Commit 4f24766

Merge remote-tracking branch 'upstream/exclude-date-filters-param' into exclude-date-filters-param

2 parents: 353af2f + ee81bfb

3 files changed: 51 additions & 10 deletions


examples/project_website/base.html

Lines changed: 4 additions & 2 deletions
@@ -93,7 +93,8 @@
     //title: "fake news",
     //title: "dotcom",
     //title: "cognitive dissonance",
-    title: "exclude_date_filters",
+    // title: "exclude_date_filters",
+    title: "philosophy_no_dates",
     // file: "./data/digital-education.json",
     // file: "./data/digital-education-lang.json",
     // file: "./data/digital-education-lang[].json",
@@ -104,7 +105,8 @@
     //file: "./data/dotcom-sg.json",
     //file: "./data/cognitive-dissonance.json"
     // file: "./data/custom_title.json",
-    file: "./data/exclude_date_filters.json",
+    // file: "./data/exclude_date_filters.json",
+    file: "./data/philosophy_no_dates.json",
     // other attributes:
     is_streamgraph: false, // set true for streamgraph data
     show_area: true, // set false for streamgraph data

examples/project_website/data/philosophy_no_dates.json

Lines changed: 10 additions & 0 deletions
Large diffs are not rendered by default.

server/workers/base/src/base.py

Lines changed: 37 additions & 8 deletions
@@ -11,6 +11,9 @@
 import time
 import numpy as np
 
+from datetime import datetime
+import sys
+
 formatter = logging.Formatter(fmt='%(asctime)s %(levelname)-8s %(message)s',
                               datefmt='%Y-%m-%d %H:%M:%S')
 
@@ -54,7 +57,7 @@ def base_rate_limit_reached(self):
         BASE demands one request per second (1 QPS), per
         https://www.base-search.net/about/download/base_interface.pdf
         """
-
+
         t = self.redis_store.time()[0]
         self.redis_store.setnx(self.rate_key, 0)
         try:
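
For context: base_rate_limit_reached is what enforces the 1 QPS budget named in the docstring. A minimal sketch of a Redis-backed limiter in that style, assuming a redis-py client; the function name and key name below are illustrative, not the committed code:

import redis

def rate_limit_reached(store: redis.Redis, rate_key: str = "base-ratelimit") -> bool:
    """Return True if a request was already issued during the current second."""
    now = store.time()[0]      # Redis server clock, so all workers agree on "now"
    store.setnx(rate_key, 0)   # initialize the shared timestamp key on first use
    last = int(store.get(rate_key) or 0)
    if now <= last:
        return True            # this second is already taken; the caller should wait
    store.set(rate_key, now)   # claim the current second
    return False

The get/set pair above is not atomic; a production limiter would use GETSET or a Lua script so two workers cannot claim the same second.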
@@ -110,7 +113,10 @@ def execute_search(self, params):
             raise
 
     def sanitize_metadata(self, metadata):
+
         metadata["sanitized_authors"] = metadata["authors"].map(lambda x: sanitize_authors(x))
+        metadata["year"] = metadata["year"].map(lambda x: sanitize_year(x))
+
         return metadata
 
     def enrich_metadata(self, metadata):
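
sanitize_metadata now normalizes the year column alongside the author list. A hedged usage sketch on a made-up frame (the column names follow the diff; sanitize_authors and sanitize_year are the module-level helpers further down):

import pandas as pd

metadata = pd.DataFrame({
    "authors": ["Doe, J.; Roe, R."],
    "year": ["2019-05-13T10:30:00Z"],
})
# The same two mappings the method applies:
metadata["sanitized_authors"] = metadata["authors"].map(sanitize_authors)
metadata["year"] = metadata["year"].map(sanitize_year)
# The ISO timestamp matches "%Y-%m-%dT%H:%M:%SZ", so sanitize_year keeps it verbatim.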
@@ -154,7 +160,7 @@ def run(self):
             res = self.execute_search(params)
             res["id"] = k
             if res.get("status") == "error" or params.get('raw') is True:
-                self.redis_store.set(k+"_output", json.dumps(res))
+                self.redis_store.set(k + "_output", json.dumps(res))
             else:
                 self.redis_store.rpush("input_data", json.dumps(res).encode('utf8'))
                 q_len = self.redis_store.llen("input_data")
@@ -183,7 +189,7 @@ def find_version_in_doi(doi):
         return int(m[0])
     else:
         return None
-
+
 def extract_doi_suffix(doi):
     return doi.split("/")[4:]
 
@@ -236,7 +242,8 @@ def add_false_negatives(df):
     df.loc[df[(~df.is_duplicate) & (df.doi_duplicate)].index, "is_duplicate"] = True
     return df
 
-def find_duplicate_indexes(df):
+
+def find_duplicate_indexes(df):
     dupind = df.id.map(lambda x: df[df.duplicates.str.contains(x)].index)
     tmp = pd.DataFrame(dupind).astype(str).drop_duplicates().index
     return dupind[tmp]
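
For illustration, find_duplicate_indexes groups rows that reference each other as duplicates; on a made-up frame:

import pandas as pd

df = pd.DataFrame({
    "id": ["a", "b", "c"],
    "duplicates": ["a,b", "a,b", "c"],
})
# For each id, collect the row indexes whose duplicates field mentions it,
# then drop repeated index sets so each duplicate group is kept once.
dupind = find_duplicate_indexes(df)
# -> entry 0 holds Index([0, 1]) (the a/b group), entry 2 holds Index([2]).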
@@ -257,7 +264,8 @@ def mark_latest_doi(df, dupind):
     df.loc[latest.index, "is_latest"] = True
     df.loc[latest.index, "keep"] = True
     return df
-
+
+
 def remove_textual_duplicates_from_different_sources(df, dupind):
     for _, idx in dupind.iteritems():
         if len(idx) > 1:
@@ -331,7 +339,8 @@ def parse_annotations(field):
         return annotations.to_dict("list")
     else:
         return {}
-
+
+
 def parse_annotations_for_all(metadata, field_name):
     parsed_annotations = pd.DataFrame(metadata[field_name].map(lambda x: parse_annotations(x)))
     parsed_annotations.columns = ["annotations"]
@@ -344,7 +353,7 @@ def expand_dict_columns(df):
     unique_annotation_keys = set().union(*unique_annotation_keys.to_list())
     if len(unique_annotation_keys) > 0:
         for c in df.columns:
-            if type(df[c].iloc[0]) is dict:
+            if type(df[c].iloc[0]) is dict:
                 for uk in unique_annotation_keys:
                     df[c+"_"+uk] = df[c].map(lambda x: x[uk] if uk in x.keys() else [])
                     df[c+"_"+uk] = df[c+"_"+uk].map(lambda x: [uk for uk in x if uk is not np.nan])
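
The loop above turns each key of a dict-valued column into its own "<column>_<key>" list column. A toy illustration of that move (made-up frame, mirroring the hunk's lambda):

import pandas as pd

df = pd.DataFrame({"annotations": [{"tags": ["x"]}, {"notes": ["n"]}]})
for uk in {"tags", "notes"}:
    # Missing keys default to an empty list, exactly as in the hunk.
    df["annotations_" + uk] = df["annotations"].map(lambda x: x[uk] if uk in x.keys() else [])
# -> new columns annotations_tags and annotations_notes.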
@@ -358,4 +367,24 @@ def sanitize_authors(authors, n=15):
     authors = authors.split("; ")
     if len(authors) > n:
         authors = authors[:n-1] + authors[-1:]
-    return "; ".join(authors)
+    return "; ".join(authors)
+
+
+def sanitize_year(year_str):
+
+    sanitized_year = ''
+    date_formats = ["%Y-%m-%d", "%Y-%m-%dT%H:%M:%SZ"]
+
+    for fmt in date_formats:
+        try:
+            date_time_obj = datetime.strptime(year_str, fmt)
+            sanitized_year = year_str  # here we keep the original string
+            break
+        except ValueError:
+            continue
+
+    # Handle formats like "2019"
+    if year_str.isdigit() and not sanitized_year:  # check sanitized_year to avoid overwriting
+        sanitized_year = year_str  # here we keep the original string
+
+    return sanitized_year
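
The new sanitize_year keeps a value only if it parses under one of the two date formats or is a bare digit string; anything else collapses to the empty string. Illustrative calls (not part of the commit):

assert sanitize_year("2019") == "2019"                                  # bare year
assert sanitize_year("2019-05-13") == "2019-05-13"                      # "%Y-%m-%d"
assert sanitize_year("2019-05-13T10:30:00Z") == "2019-05-13T10:30:00Z"  # full timestamp
assert sanitize_year("May 2019") == ""                                  # unrecognized, dropped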
