|
8 | 8 | from pyorcid import Orcid, errors as pyorcid_errors |
9 | 9 | from pyorcid.orcid_authentication import OrcidAuthentication |
10 | 10 | from typing import Tuple |
11 | | -from common.utils import get_key, get_nested_value |
| 11 | +from common.utils import get_key, get_nested_value, enrich_metadata |
12 | 12 | from repositories.author_info import AuthorInfoRepository |
13 | 13 | from repositories.works import WorksRepository |
14 | 14 | from redis import StrictRedis |
@@ -79,56 +79,7 @@ def execute_search(self, params: Dict[str, str]) -> Union[SuccessResult, ErrorRe |
79 | 79 | except (pyorcid_errors.Unauthorized, Exception) as e: |
80 | 80 | return self._handle_error(params, "unexpected data processing error", e) |
81 | 81 |
|
82 | | - def enrich_metadata(self, params: Dict[str, str], metadata: pd.DataFrame) -> pd.DataFrame: |
83 | | - """ |
84 | | - This function enriches the metadata DataFrame with additional information |
85 | | - from external sources, in this case crossref and altmetric. |
86 | | - The function will store the enriched metadata in the Redis queue for further |
87 | | - processing, from where it will be picked up by the metrics worker. |
88 | | - Returned data will be the original metadata enriched with additional |
89 | | - metadata columns from the external sources. |
90 | | -
|
91 | | - Parameters: |
92 | | - - params (dict): The parameters for the search endpoint. |
93 | | - - metadata (pd.DataFrame): The metadata DataFrame to enrich. |
94 | 82 |
|
95 | | - Returns: |
96 | | - - pd.DataFrame: The enriched metadata DataFrame. |
97 | | - """ |
98 | | - |
99 | | - self.logger.debug(f"Enriching metadata for ORCID {params.get('orcid')}") |
100 | | - |
101 | | - request_id = str(uuid.uuid4()) |
102 | | - task_data = { |
103 | | - "id": request_id, |
104 | | - "params": params, |
105 | | - "metadata": metadata.to_json(orient="records"), |
106 | | - } |
107 | | - self.redis_store.rpush("metrics", json.dumps(task_data)) |
108 | | - result = get_key(self.redis_store, request_id, 600) |
109 | | - |
110 | | - metadata = pd.DataFrame(result["input_data"]) |
111 | | - |
112 | | - for c in [ |
113 | | - "citation_count", |
114 | | - "cited_by_wikipedia_count", |
115 | | - "cited_by_msm_count", |
116 | | - "cited_by_policies_count", |
117 | | - "cited_by_patents_count", |
118 | | - "cited_by_accounts_count", |
119 | | - "cited_by_fbwalls_count", |
120 | | - "cited_by_feeds_count", |
121 | | - "cited_by_gplus_count", |
122 | | - "cited_by_rdts_count", |
123 | | - "cited_by_qna_count", |
124 | | - "cited_by_tweeters_count", |
125 | | - "cited_by_videos_count" |
126 | | - ]: |
127 | | - if c not in metadata.columns: |
128 | | - metadata[c] = np.NaN |
129 | | - |
130 | | - return metadata |
131 | | - |
132 | 83 | def log_dataframe(self, df: pd.DataFrame, params: Dict[str, str], name: str, ): |
133 | 84 | orcid = params.get('orcid') |
134 | 85 |
|
@@ -437,7 +388,8 @@ def _retrieve_author_info_and_metadata(self, orcid: Orcid) -> Tuple[AuthorInfo, |
437 | 388 |
|
438 | 389 | def _process_metadata(self, metadata: pd.DataFrame, author_info: AuthorInfo, params: Dict[str, str]) -> pd.DataFrame: |
439 | 390 | metadata["authors"] = metadata["authors"].replace("", author_info.author_name) |
440 | | - metadata = self.enrich_metadata(params, metadata) |
| 391 | + source_for_metadata_enrichment = "crossref" |
| 392 | + metadata = enrich_metadata(self.redis_store, params, metadata, source_for_metadata_enrichment) |
441 | 393 | self.logger.debug(f'metadata shape after base enrichment: {metadata.shape}') |
442 | 394 | author_info = self.enrich_author_info(author_info, metadata, params) |
443 | 395 | self.logger.debug(f'metadata shape after enrichment: {metadata.shape}') |
|
0 commit comments