1+ import re
12import os
23import json
34import time
45import uuid
5- from dateutil .parser import parse
6- from datetime import timedelta
7- import re
86import redis
9- import pandas as pd
107import pathlib
8+ import numpy as np
9+ import pandas as pd
10+ from datetime import timedelta
11+ from dateutil .parser import parse
1112
1213
1314redis_config = {
@@ -119,3 +120,91 @@ def get_nested_value(data, keys, default=None):
119120 if data is None :
120121 return default
121122 return data
123+
124+
def push_metadata_to_queue(redis_store, params, metadata):
    """
    Queue metadata for enrichment in Redis and return the id of the request.

    A JSON task is built from a freshly generated UUID4 request id, the
    request params (with ``metrics_sources`` set to ``["crossref"]``) and the
    metadata serialised as a JSON "records" string. The task is appended to
    the ``"metrics"`` Redis list.

    :param redis_store: Object of the Redis store; only ``rpush`` is called.
    :param params: Request params (a mapping). It is copied before
        ``metrics_sources`` is added, so the caller's object is not modified.
    :param metadata: DataFrame with default metadata.
    :return: request_id (string) to be used for receiving the request result.
    """
    request_id = str(uuid.uuid4())

    # Build the task params on a shallow copy: writing "metrics_sources"
    # straight into ``params`` would silently alter the caller's dict.
    # Key order in the serialised payload is the same as with in-place
    # assignment (an existing key keeps its position, a new one goes last).
    task_params = {**params, "metrics_sources": ["crossref"]}

    task_data = json.dumps({
        "id": request_id,
        "params": task_params,
        # The records are embedded as a JSON *string*, not as a nested list.
        "metadata": metadata.to_json(orient="records"),
    })

    redis_store.rpush("metrics", task_data)
    return request_id
145+
146+
def fetch_enriched_metadata(redis_store, request_id, timeout=600):
    """
    Read the enrichment result for a request and return it as a DataFrame.

    The lookup is delegated to ``get_key``; the ``"input_data"`` entry of
    whatever it returns is wrapped in a ``pandas.DataFrame``.

    :param redis_store: Object of the Redis store.
    :param request_id: Unique identifier of the request.
    :param timeout: Passed through to ``get_key`` (default 600); presumably
        the maximum waiting time in seconds -- confirm against ``get_key``.
    :return: Enriched DataFrame with metadata.
    """
    enriched_records = get_key(redis_store, request_id, timeout)["input_data"]
    return pd.DataFrame(enriched_records)
158+
159+
def ensure_required_columns(metadata: pd.DataFrame) -> pd.DataFrame:
    """
    Make sure every required metrics column exists in the DataFrame.

    Columns that are already present are left as they are; each missing one
    is appended (in the order listed below) and filled with NaN. The
    DataFrame is modified in place and the same object is returned.

    :param metadata: DataFrame with metadata.
    :return: The same DataFrame, now containing all required columns.
    """
    required_metadata_columns = (
        "citation_count",
        "cited_by_wikipedia_count",
        "cited_by_msm_count",
        "cited_by_policies_count",
        "cited_by_patents_count",
        "cited_by_accounts_count",
        "cited_by_fbwalls_count",
        "cited_by_feeds_count",
        "cited_by_gplus_count",
        "cited_by_rdts_count",
        "cited_by_qna_count",
        "cited_by_tweeters_count",
        "cited_by_videos_count",
    )

    for column in required_metadata_columns:
        if column not in metadata.columns:
            # ``np.nan`` instead of the ``np.NaN`` alias: the alias was
            # removed in NumPy 2.0 and raises AttributeError there.
            metadata[column] = np.nan

    return metadata
188+
189+
def enrich_metadata(redis_store, params, metadata: pd.DataFrame) -> pd.DataFrame:
    """
    Enrich metadata with the citation information produced via Redis.

    Runs three steps in order: queue the metadata for enrichment, fetch the
    enriched result for that request, and make sure the result carries all
    required columns.

    :param redis_store: store object of Redis.
    :param params: params of the request.
    :param metadata: DataFrame with default metadata.

    :return: Enriched DataFrame with metadata.
    """
    # Step 1: hand the metadata over to the metrics queue; the returned id
    # is what the result is looked up by afterwards.
    pending_request_id = push_metadata_to_queue(redis_store, params, metadata)

    # Step 2: collect the enriched result for that request.
    enriched = fetch_enriched_metadata(redis_store, pending_request_id)

    # Step 3: add any required column the result lacks (filled with NaN).
    return ensure_required_columns(enriched)
0 commit comments