Skip to content

Commit fdd012f

Browse files
committed
Merge remote-tracking branch 'upstream/bugfix/pubmed-citations' into bugfix/pubmed-citations
2 parents 756c52f + 4ea4880 commit fdd012f

4 files changed

Lines changed: 136 additions & 30 deletions

File tree

server/preprocessing/other-scripts/run_metrics.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ if (!is.null(params$lang_id)) {
5050
if (!is.null(params$metrics_sources)) {
5151
metrics_sources <- params$metrics_sources
5252
} else {
53-
metrics_sources <- 'all'
53+
metrics_sources <- c("altmetric", "crossref")
5454
}
5555

5656
source('metrics.R')

server/workers/common/common/utils.py

Lines changed: 93 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
1+
import re
12
import os
23
import json
34
import time
45
import uuid
5-
from dateutil.parser import parse
6-
from datetime import timedelta
7-
import re
86
import redis
9-
import pandas as pd
107
import pathlib
8+
import numpy as np
9+
import pandas as pd
10+
from datetime import timedelta
11+
from dateutil.parser import parse
1112

1213

1314
redis_config = {
@@ -119,3 +120,91 @@ def get_nested_value(data, keys, default=None):
119120
if data is None:
120121
return default
121122
return data
123+
124+
125+
def push_metadata_to_queue(redis_store, params, metadata):
    """
    Queue metadata for enrichment on the Redis "metrics" list.

    :param redis_store: Redis client used to push the task.
    :param params: Request parameters. ``metrics_sources`` is overwritten
        with ``["crossref"]`` before serialization — the caller's dict is
        mutated in place.
    :param metadata: DataFrame with the default metadata to enrich.
    :return: request_id under which the result can later be retrieved.
    """
    request_id = str(uuid.uuid4())

    # NOTE(review): unconditionally forces the crossref source, even if the
    # caller supplied its own metrics_sources — confirm this is intended.
    params["metrics_sources"] = ["crossref"]

    task = {
        "id": request_id,
        "params": params,
        "metadata": metadata.to_json(orient="records"),
    }
    redis_store.rpush("metrics", json.dumps(task))

    return request_id
145+
146+
147+
def fetch_enriched_metadata(redis_store, request_id, timeout=600):
    """
    Wait for and return the enriched metadata for a previously queued request.

    :param redis_store: Redis client to read the result from.
    :param request_id: Unique identifier of the enrichment request.
    :param timeout: Maximum wait time in seconds (default 600).
    :return: DataFrame built from the ``input_data`` field of the result.
    """
    response = get_key(redis_store, request_id, timeout)
    return pd.DataFrame(response["input_data"])
158+
159+
160+
def ensure_required_columns(metadata: pd.DataFrame) -> pd.DataFrame:
    """
    Ensure every metrics column exists, adding missing ones filled with NaN.

    Mutates and returns the same DataFrame so callers can use either style.

    :param metadata: DataFrame with metadata.
    :return: The DataFrame with all required metrics columns present.
    """
    # Citation/altmetric columns downstream consumers expect to find.
    REQUIRED_METADATA_COLUMNS = [
        "citation_count",
        "cited_by_wikipedia_count",
        "cited_by_msm_count",
        "cited_by_policies_count",
        "cited_by_patents_count",
        "cited_by_accounts_count",
        "cited_by_fbwalls_count",
        "cited_by_feeds_count",
        "cited_by_gplus_count",
        "cited_by_rdts_count",
        "cited_by_qna_count",
        "cited_by_tweeters_count",
        "cited_by_videos_count",
    ]

    for column in REQUIRED_METADATA_COLUMNS:
        if column not in metadata.columns:
            # np.nan (lowercase): the np.NaN alias was removed in NumPy 2.0
            # and raises AttributeError on modern installs.
            metadata[column] = np.nan

    return metadata
188+
189+
190+
def enrich_metadata(redis_store, params, metadata: pd.DataFrame) -> pd.DataFrame:
    """
    Enrich metadata with citation information via the Redis metrics worker.

    :param redis_store: Redis store object.
    :param params: Request parameters.
    :param metadata: DataFrame with the default metadata.
    :return: Enriched DataFrame containing every required metrics column.
    """
    # Queue the enrichment task; the worker will publish the result under
    # the returned request_id.
    request_id = push_metadata_to_queue(redis_store, params, metadata)

    # Block until the worker responds with the enriched records.
    enriched = fetch_enriched_metadata(redis_store, request_id)

    # Normalize the column set so downstream code can rely on all
    # metrics columns being present (missing ones become NaN).
    return ensure_required_columns(enriched)

server/workers/metrics/src/metrics.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -34,16 +34,16 @@ def next_item(self):
3434

3535
@error_logging_aspect(log_level=logging.ERROR)
3636
def execute_search(self, params: dict, metadata: str) -> dict:
37+
self.logger.debug(f"execute_search function running in metrics.py")
38+
3739
command = [
38-
self.command,
39-
self.runner,
40-
self.wd,
41-
params.get('q'),
40+
self.command,
41+
self.runner,
42+
self.wd,
43+
params.get('q'),
4244
params.get('service')
4345
]
4446

45-
self.logger.debug(f"Executing command: {command}")
46-
4747
data = {
4848
"params": params,
4949
"metadata": metadata
@@ -59,11 +59,19 @@ def execute_search(self, params: dict, metadata: str) -> dict:
5959
)
6060
stdout, stderr = proc.communicate(json.dumps(data))
6161

62-
self.logger.debug(f"Stdout: {stdout}")
62+
# TODO: Remove after development
63+
self.logger.debug(f"Raw stdout: {stdout}")
64+
self.logger.debug(f"Raw stderr: {stderr}")
6365

6466
output = [line for line in stdout.split('\n') if line]
6567
errors = [line for line in stderr.split('\n') if line]
6668

69+
# TODO: Remove after development
70+
if not output:
71+
raise ValueError("No output received from the subprocess")
72+
if len(output) < 2:
73+
raise ValueError(f"Unexpected output format: {output}")
74+
6775
if not output:
6876
raise ValueError("No output received from the subprocess")
6977

server/workers/pubmed/src/pubmed.py

Lines changed: 27 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import pandas as pd
44
import logging
55
from common.r_wrapper import RWrapper
6+
from common.utils import enrich_metadata
67

78
formatter = logging.Formatter(fmt='%(asctime)s %(levelname)-8s %(message)s',
89
datefmt='%Y-%m-%d %H:%M:%S')
@@ -19,35 +20,43 @@ def next_item(self):
1920
endpoint = msg.get('endpoint')
2021
return k, params, endpoint
2122

23+
2224
def execute_search(self, params):
    """
    Run the R pubmed search, enrich the results with citation metrics,
    and return the combined payload.

    :param params: Request parameters; ``q`` and ``service`` are passed to
        the R subprocess on the command line, the full dict on stdin.
    :return: ``{"input_data": {"metadata": ..., "text": ...}, "params": params}``
        on success; if the R script reports an error, its error dict is
        returned unchanged.
    :raises Exception: re-raises any failure after logging it.
    """
    q = params.get('q')
    service = params.get('service')
    data = {"params": params}
    cmd = [self.command, self.runner, self.wd, q, service]

    # Pre-initialize so the except-block logging cannot raise NameError
    # when the failure happens before stderr was parsed (e.g. Popen fails).
    error = []
    try:
        proc = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                encoding='utf-8')
        stdout, stderr = proc.communicate(json.dumps(data))
        output = [o for o in stdout.split('\n') if o]
        error = [o for o in stderr.split('\n') if o]

        # The R script emits metadata and text as the last two stdout lines.
        raw_metadata = json.loads(output[-2])
        raw_text = json.loads(output[-1])

        # The R side signals failure with a {"status": "error", ...} dict.
        if isinstance(raw_metadata, dict) and raw_metadata.get('status') == "error":
            return raw_metadata

        metadata = pd.DataFrame(raw_metadata)

        metadata = enrich_metadata(self.redis_store, params, metadata)
        # TODO(review): debug-only loop — remove after development.
        for index, row in metadata.iterrows():
            self.logger.debug(f"Title: {row['title']}, DOI: {row['doi']}, Citations: {row.get('citation_count', 'N/A')}")

        text = pd.DataFrame(raw_text)

        input_data = {
            "metadata": metadata.to_json(orient='records'),
            "text": text.to_json(orient='records')
        }

        return {"input_data": input_data, "params": params}

    except Exception as e:
        self.logger.error(f"Error in execute_search: {e}")
        self.logger.error(error)
        raise
5362

0 commit comments

Comments
 (0)