Skip to content

Commit 874db5e

Browse files
committed
refactor: better logic of required columns defining
1 parent d31f83f commit 874db5e

3 files changed

Lines changed: 34 additions & 21 deletions

File tree

server/workers/common/common/utils.py

Lines changed: 32 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -139,23 +139,30 @@ def push_metadata_to_queue(
139139
:param source_list: define from which service additional metadata will be received (available values: "crossref", "altmetric").
140140
:return: request_id for the receiving of the request result.
141141
"""
142+
# Checks that valid values are specified in the source array
142143
check_metadata_enrichment_source(source_list)
143144

145+
# Creates a new unique request identifier that will then be used to retrieve the result
144146
request_id = str(uuid.uuid4())
147+
148+
# Specifies from which sources to obtain information
145149
params["metrics_sources"] = source_list
150+
151+
# Payload object creation
146152
task_data = json.dumps({
147153
"id": request_id,
148154
"params": params,
149155
"metadata": metadata.to_json(orient="records"),
150156
})
151157

158+
# Pushing request to Redis and returning request id
152159
redis_store.rpush("metrics", task_data)
153160
return request_id
154161

155162

156163
def check_metadata_enrichment_source(source_list: List[str]) -> None:
157164
"""
158-
Checks that source for metadata enrichment contains correct values.
165+
Checks that valid values are specified in the source array.
159166
160167
:param source_list: List of sources from where metadata will be enriched.
161168
:return: None.
@@ -173,23 +180,29 @@ def fetch_enriched_metadata(redis_store: redis.Redis, request_id: str, timeout:
173180
:param timeout: Results waiting time (default - 600 seconds).
174181
:return: Enriched DataFrame with metadata.
175182
"""
183+
# Getting result of metadata enrichment from Redis
176184
result = get_key(redis_store, request_id, timeout)
177185
return pd.DataFrame(result["input_data"])
178186

179187

180-
def get_metadata_columns_for_integration(integration: Literal["pubmed", "orcid"]):
188+
def get_metadata_columns_for_source(source_list: List[str]) -> List[str]:
181189
"""
182-
Returning required metadata columns for different integrations.
190+
Returns the required metadata columns for the given sources.
183191
184-
:param integration: integration service.
192+
:param source_list: List of sources from which the metadata is received.
185193
:return: list of required metadata columns.
186194
"""
195+
# Checks that valid values are specified in the source array
196+
check_metadata_enrichment_source(source_list)
197+
198+
# Define required metadata columns for different sources and return them
199+
result = []
187200

188-
if integration == 'pubmed':
189-
return ["citation_count"]
190-
elif integration == 'orcid':
191-
return [
192-
"citation_count",
201+
if "crossref" in source_list:
202+
result.extend(["citation_count"])
203+
204+
if "altmetric" in source_list:
205+
result.extend([
193206
"cited_by_wikipedia_count",
194207
"cited_by_msm_count",
195208
"cited_by_policies_count",
@@ -202,21 +215,24 @@ def get_metadata_columns_for_integration(integration: Literal["pubmed", "orcid"]
202215
"cited_by_qna_count",
203216
"cited_by_tweeters_count",
204217
"cited_by_videos_count"
205-
]
218+
])
206219

207-
return []
220+
return result
208221

209222

210-
def ensure_required_columns(metadata: pd.DataFrame, integration: Literal["pubmed", "orcid"]) -> pd.DataFrame:
223+
def ensure_required_columns(metadata: pd.DataFrame, source_list: List[str]) -> pd.DataFrame:
211224
"""
212225
Checks that all necessary columns are present, adding any missing ones filled with NaN values.
213226
214227
:param metadata: DataFrame with metadata.
215-
:param integration: integration service.
228+
:param source_list: List of sources from which the metadata is received.
216229
:return: Updated DataFrame.
217230
"""
231+
# Checks that valid values are specified in the source array
232+
check_metadata_enrichment_source(source_list)
218233

219-
columns = get_metadata_columns_for_integration(integration)
234+
# Gets metadata columns that must be received from source(-s)
235+
columns = get_metadata_columns_for_source(source_list)
220236
for column in columns:
221237
if column not in metadata.columns:
222238
metadata[column] = np.NaN
@@ -229,7 +245,6 @@ def enrich_metadata(
229245
params: Dict[str, Union[str, List[str]]],
230246
metadata: pd.DataFrame,
231247
source_list: List[str],
232-
integration: Literal["pubmed", "orcid"]
233248
) -> pd.DataFrame:
234249
"""
235250
Enriching metadata - adding information about citations from Redis.
@@ -240,7 +255,7 @@ def enrich_metadata(
240255
:param source_list: define from which services additional metadata will be received (available values: "crossref", "altmetric").
241256
:return: Enriched DataFrame with metadata.
242257
"""
243-
# Checks that source list contains valid values
258+
# Checks that valid values are specified in the source array
244259
check_metadata_enrichment_source(source_list)
245260

246261
# Creates a request to metrics for metadata enrichment
@@ -251,5 +266,5 @@ def enrich_metadata(
251266
enriched_metadata = fetch_enriched_metadata(redis, request_id)
252267

253268
# Checks that all necessary columns are available or adding them with NaN value
254-
enriched_metadata = ensure_required_columns(enriched_metadata, integration)
269+
enriched_metadata = ensure_required_columns(enriched_metadata, source_list)
255270
return enriched_metadata

server/workers/orcid/src/orcid_service.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -389,9 +389,8 @@ def _retrieve_author_info_and_metadata(self, orcid: Orcid) -> Tuple[AuthorInfo,
389389
def _process_metadata(self, metadata: pd.DataFrame, author_info: AuthorInfo, params: Dict[str, str]) -> pd.DataFrame:
390390
metadata["authors"] = metadata["authors"].replace("", author_info.author_name)
391391

392-
integration = 'orcid'
393392
source_for_metadata_enrichment = ["crossref", "altmetric"]
394-
metadata = enrich_metadata(self.redis_store, params, metadata, source_for_metadata_enrichment, integration)
393+
metadata = enrich_metadata(self.redis_store, params, metadata, source_for_metadata_enrichment)
395394

396395
self.logger.debug(f'metadata shape after base enrichment: {metadata.shape}')
397396
author_info = self.enrich_author_info(author_info, metadata, params)

server/workers/pubmed/src/pubmed.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,8 @@ def execute_search(self, params):
4242

4343
metadata = pd.DataFrame(raw_metadata)
4444

45-
integration = 'pubmed'
4645
source_for_metadata_enrichment = ["crossref"]
47-
metadata = enrich_metadata(self.redis_store, params, metadata, source_for_metadata_enrichment, integration)
46+
metadata = enrich_metadata(self.redis_store, params, metadata, source_for_metadata_enrichment)
4847

4948
text = pd.DataFrame(raw_text)
5049

0 commit comments

Comments
 (0)