Skip to content

Commit c400f24

Browse files
authored
Merge pull request #797 from OpenKnowledgeMaps/fix/get-PDF-issue
fix: get pdf issue
2 parents 04aefd6 + d1cedce commit c400f24

20 files changed

Lines changed: 916 additions & 129 deletions

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ dist/
1010
/config.js
1111
.cache
1212
coverage/
13+
.vscode/
1314

1415
# local deployment files
1516
/deploy.sh
@@ -31,6 +32,7 @@ server/workers/tests/*.txt
3132
server/workers/tests/testutils/
3233
local_dev/renv/*
3334
local_dev/dev.env
35+
local_dev/paper_preview
3436

3537
# php files
3638
/server/classes/headstart/vendor

docker-compose.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,7 @@ services:
256256
- ../Headstart:/var/www/html/headstart
257257
- ./local_dev/config_local_headstart.ini:/var/www/html/headstart/server/preprocessing/conf/config_local.ini
258258
- ./local_dev/entrypoint.php:/var/www/html/entrypoint.php
259+
- ./local_dev/paper_preview:/var/www/html/headstart/server/paper_preview
259260
ports:
260261
- 127.0.0.1:8085:80
261262
networks:

local_dev/config_local_headstart.ini

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
# Full path to the preprocessing directory
66
preprocessing_dir = "/var/www/html/dev/server/preprocessing/"
77
# Full path to the images directory for the client. Needs to be in the public_html/www directory. Make sure that your webserver has write access to this directory.
8-
images_path = "/var/www/html/dev/server/paper_preview/"
8+
images_path = "/var/www/html/headstart/server/paper_preview/"
99
# Host of the client visualization
1010
host = "dev-searchflow-1/"
1111
# Relative path to the client visualization. Needs to be in the public_html/www directory.

local_dev/searchflow-container/Dockerfile

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,4 +31,6 @@ ENV PATH $NVM_DIR/versions/node/v$NODE_VERSION/bin:$PATH
3131
ENV NPM_PATH $NVM_DIR/versions/node/v$NODE_VERSION/bin/npm
3232
RUN $NPM_PATH install -g puppeteer@^2.0.0 --unsafe-perm=true --allow-root
3333

34-
RUN sed -i 's#AllowOverride [Nn]one#AllowOverride All#' /etc/apache2/apache2.conf
34+
RUN sed -i 's#AllowOverride [Nn]one#AllowOverride All#' /etc/apache2/apache2.conf
35+
COPY local_dev.ini /usr/local/etc/php/conf.d/local_dev.ini
36+
RUN chown root:root /usr/local/etc/php/conf.d/local_dev.ini
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
log_errors = On
Binary file not shown.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"version":1,"defects":{"GetPDFTest::testGetRedirectURL":8,"GetPDFTest::testGetRedirectURLReturnsParsedURL":7,"GetPDFTest::testReturnsSuccessWhenPdfDownloaded":7,"GetPDFTest::testFullScriptReturnsExpectedErrorJson":8,"GetPDFTest::testFullScriptReturnsExpectedSuccessJson":8,"GetPDFTest::testStartsWithReturnsFalse":8,"GetPDFTest::testParsePDFLinkFromMetaCitationTag":5,"GetPDFTest::testParsePDFLinkFromRawPDFLink":5,"GetPDFTest::testGetContentFromURLReturnsExpectedContent":7,"GetPDFTest::testGetRedirectDOAJParsesUrlCorrectly":7,"GetPDFTest::testParsePDFLinkWithBarePDFUrl":7,"GetPDFTest::testParsePDFLinkWithDirectPDFHeader":7,"GetPDFTest::testGetRedirectURLCallsParsePDFLinkCorrectly":8,"GetPDFTest::testGetPDFAndDownloadWritesValidPDF":8,"GetPDFTest::testGetPDFAndDownloadFailsOnNonPDF":7,"GetPDFTest::testGetContentFromURLMockedReturnsExpectedArray":7,"GetPDFTest::testGetRedirectURLWithMockedGetContentFromURL":7},"times":{"GetPDFTest::testStartsWithReturnsTrue":3.34,"GetPDFTest::testStartsWithReturnsFalse":3.118,"GetPDFTest::testStartsWithEmptyNeedle":3.192,"GetPDFTest::testStartsWithEmptyHaystack":4.502,"GetPDFTest::testParsePDFLinkFromMetaTag":0,"GetPDFTest::testGetRedirectURL":0.002,"GetPDFTest::testParsePDFLinkFromMetaCitationTag":0.006,"GetPDFTest::testParsePDFLinkFromRawPDFLink":0.004,"GetPDFTest::testGetRedirectURLReturnsParsedURL":1.35,"GetPDFTest::testGetRedirectURLWithMockedContent":0.001,"GetPDFTest::testReturnsSuccessWhenPdfDownloaded":0.016,"GetPDFTest::testFullScriptReturnsExpectedSuccessJson":2.323,"GetPDFTest::testFullScriptReturnsExpectedErrorJson":0,"GetPDFTest::testGetContentFromURLReturnsExpectedContent":5.187,"GetPDFTest::testGetRedirectDOAJParsesUrlCorrectly":3.814,"GetPDFTest::testParsePDFLinkWithDirectPDFHeader":4.308,"GetPDFTest::testParsePDFLinkWithBarePDFUrl":3.226,"GetPDFTest::testParsePDFLinkWithFullPDFUrl":3.243,"GetPDFTest::testParsePDFLinkNoMatchReturnsFalse":3.658,"GetPDFTest::testFullScriptWorksSuccessfully":7.587,"GetPDFTest::testGetConte
ntFromURLMockedReturnsExpectedArray":0.001,"GetPDFTest::testGetRedirectURLWithMockedGetContentFromURL":0.001,"GetPDFTest::testGetPDFAndDownloadWritesValidPDF":0.001,"GetPDFTest::testGetPDFAndDownloadFailsOnNonPDF":0.01}}

server/services/getPDF.php

Lines changed: 179 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -4,71 +4,195 @@
44

55
require_once dirname(__FILE__) . '/../classes/headstart/library/CommUtils.php';
66
require_once dirname(__FILE__) . '/../classes/headstart/library/Toolkit.php';
7-
87
require 'helper.php';
98

109
use headstart\library;
1110

1211
$INI_DIR = dirname(__FILE__) . "/../preprocessing/conf/";
13-
1412
$ini_array = library\Toolkit::loadIni($INI_DIR);
1513

1614
$url = library\CommUtils::getParameter($_GET, "url");
1715
$filename = library\CommUtils::getParameter($_GET, "filename");
1816
$service = library\CommUtils::getParameter($_GET, "service");
19-
$pdf_urls = library\CommUtils::getParameter($_GET, "pdf_urls");
17+
$paper_id = library\CommUtils::getParameter($_GET, "paper_id");
18+
$vis_id = library\CommUtils::getParameter($_GET, "vis_id");
19+
$vis_type = library\CommUtils::getParameter($_GET, "vis_type");
20+
2021
$images_path = $ini_array["general"]["images_path"];
2122

22-
if ($service == "base" || $service == "openaire") {
23-
$pdf_link = getPDFLinkforBASE($pdf_urls);
24-
if($pdf_link != false) {
25-
getPDFAndDownload($pdf_link, $images_path, $filename);
26-
} else {
27-
library\CommUtils::echoOrCallback(json_encode(array("status" => "error")), $_GET);
28-
exit();
29-
}
23+
// Route the request: services resolved from a list of links go through the
// multi-PDF handler, every other service through the single-URL handler.
$uses_pdf_list = isServiceWithPDFList($service);

if (!$uses_pdf_list) {
    handleSingleUrlService($vis_id, $paper_id, $url, $images_path, $filename, $vis_type);
} else {
    handleMultiPdfService($vis_id, $paper_id, $images_path, $filename, $vis_type);
}

// Reached only when the handler returned normally; report the stored file name.
$success_payload = json_encode(array("status" => "success", "file" => $filename));
library\CommUtils::echoOrCallback($success_payload, $_GET);
3530

36-
function getPDFLinkforBASE($url) {
37-
$link_list = preg_split("/;/", $url);
38-
39-
$matches_pdf = array_filter($link_list, function($item) { return substr($item, -strlen(".pdf")) === ".pdf"; });
40-
if(count($matches_pdf) != 0) {
41-
return array_values($matches_pdf)[0];
42-
}
43-
44-
$matches_doi = array_filter($link_list, function($item) { return strpos($item, "dx.doi.org"); });
45-
if(count($matches_doi) != 0) {
46-
return getRedirectURL(array_values($matches_doi)[0]);
47-
}
48-
49-
$matches_doaj = array_filter($link_list, function($item) { return strpos($item, "doaj.org"); });
50-
if(count($matches_doaj) != 0) {
51-
$url = getRedirectDOAJ(array_values($matches_doaj)[0]);
52-
if($url != false) {
53-
return getRedirectURL($url);
54-
} else {
55-
//Remove all DOAJ entries and all entries that are not URLs
56-
$link_list = array_filter($link_list, function($item) { return !strpos($item, "doaj.org"); });
57-
$link_list = array_filter($link_list, function($item) { return filter_var($item, FILTER_VALIDATE_URL); });
58-
}
59-
}
60-
61-
return getRedirectURL(array_values($link_list)[0]);
31+
/**
 * Checks whether the given service is one of the "PDF list" services.
 *
 * @param string $service Service identifier taken from the request.
 * @return bool True for "base" and "openaire", false for any other value.
 */
function isServiceWithPDFList(string $service): bool {
    // Strict mode: compare as strings only, without PHP's loose type juggling.
    return in_array($service, ["base", "openaire"], true);
}
34+
35+
/**
 * Handles a PDF request for a service whose link is resolved from a list of URLs.
 *
 * The URLs returned by getValidURLs() are joined with ";" and passed to
 * getPDFLinkForBASE(); the link it yields is then fetched with getPDFAndDownload().
 * A falsy link is reported through returnError().
 *
 * @param string $vis_id      Visualization id used to look up the revision.
 * @param string $paper_id    Id of the paper inside that revision.
 * @param string $images_path Directory prefix the PDF is written to.
 * @param string $filename    File name appended to $images_path.
 * @param string $vis_type    Visualization type forwarded to getValidURLs().
 */
function handleMultiPdfService(
    string $vis_id,
    string $paper_id,
    string $images_path,
    string $filename,
    string $vis_type
): void {
    $candidate_urls = getValidURLs($vis_id, $paper_id, $vis_type);
    $resolved_link = getPDFLinkForBASE(implode(";", $candidate_urls));

    if (empty($resolved_link)) {
        returnError("No valid PDF link could be resolved from filtered URLs.");
    }

    getPDFAndDownload($resolved_link, $images_path, $filename);
}
52+
53+
/**
 * Handles a PDF request for a service that supplies one explicit URL.
 *
 * The requested URL is only downloaded when, after URL-decoding both sides,
 * it is strictly equal to one of the URLs returned by getValidURLs() for the
 * paper; otherwise the request is rejected through returnError().
 *
 * @param string $vis_id      Visualization id used to look up the revision.
 * @param string $paper_id    Id of the paper inside that revision.
 * @param string $url         URL requested by the client.
 * @param string $images_path Directory prefix the PDF is written to.
 * @param string $filename    File name appended to $images_path.
 * @param string $vis_type    Visualization type forwarded to getValidURLs().
 */
function handleSingleUrlService(
    string $vis_id,
    string $paper_id,
    string $url,
    string $images_path,
    string $filename,
    string $vis_type
): void {
    $allowed_urls = array_map('urldecode', getValidURLs($vis_id, $paper_id, $vis_type));
    $requested_url = urldecode($url);

    $is_allowed = in_array($requested_url, $allowed_urls, true);
    if ($is_allowed === false) {
        returnError("Provided URL not found in valid paper links");
    }

    getPDFAndDownload($requested_url, $images_path, $filename);
}
72+
73+
/**
 * Collects the URLs recorded for a paper in the latest revision of a visualization.
 *
 * Calls returnError() when fetchLatestRevision() yields nothing usable or when
 * extractValidPdfUrls() finds no URL for the paper.
 *
 * @param string $vis_id   Visualization id passed to fetchLatestRevision().
 * @param string $paper_id Paper id passed to extractValidPdfUrls().
 * @param string $vis_type Visualization type passed to extractValidPdfUrls().
 * @return array URLs returned by extractValidPdfUrls().
 */
function getValidURLs(string $vis_id, string $paper_id, string $vis_type) {
    $revision = fetchLatestRevision($vis_id);
    if (empty($revision)) {
        returnError("There are no revision data for such visualization id");
    }

    $paper_urls = extractValidPdfUrls($revision, $paper_id, $vis_type);
    if (!$paper_urls) {
        returnError("There are no valid PDF URLs from revision");
    }

    return $paper_urls;
}
88+
89+
/**
 * Requests getLatestRevision.php (next to the current script) over HTTP and
 * returns its decoded JSON response.
 *
 * @param string $vis_id Visualization id, URL-encoded into the query string.
 * @return array|null Decoded response, or null when the request fails or the
 *                    body does not decode to an array (both cases are logged).
 */
function fetchLatestRevision(string $vis_id): ?array {
    $service_dir = "http://" . $_SERVER['SERVER_NAME'] . dirname($_SERVER['REQUEST_URI']);
    $endpoint = $service_dir . "/getLatestRevision.php?vis_id=" . urlencode($vis_id) . "&context=true";

    $body = @file_get_contents($endpoint);

    if ($body !== false) {
        $decoded = json_decode($body, true);
        if (is_array($decoded)) {
            return $decoded;
        }

        error_log("Invalid JSON returned from getLatestRevision.php");
        return null;
    }

    error_log("Failed to fetch metadata from getLatestRevision.php");
    return null;
}
106+
107+
/**
 * Extracts every syntactically valid URL stored for one paper in a revision.
 *
 * $revision_data["data"] is expected to be a JSON string. For the "timeline"
 * visualization type the documents are read from its "data" member, for every
 * other type from its "documents" member; in both cases that member is itself
 * a JSON string holding the list of document entries.
 *
 * Missing keys, non-string payloads and malformed JSON are logged and yield an
 * empty result instead of raising warnings or type errors.
 *
 * @param array  $revision_data Decoded response of getLatestRevision.php.
 * @param string $paper_id      Id of the paper whose links are wanted.
 * @param string $vis_type      Visualization type; "timeline" (any case) selects the timeline layout.
 * @return array List (0-indexed, no gaps) of unique URLs in order of first appearance.
 */
function extractValidPdfUrls(array $revision_data, string $paper_id, string $vis_type): array {
    $payload = $revision_data["data"] ?? null;
    $inner_data = is_string($payload) ? json_decode($payload, true) : null;

    // Only decode what is actually a string: json_decode(null) is deprecated
    // and an array here would be a type error.
    $documents_key = strtolower($vis_type) === 'timeline' ? 'data' : 'documents';
    $documents_raw = is_array($inner_data) ? ($inner_data[$documents_key] ?? null) : null;
    $documents = is_string($documents_raw) ? json_decode($documents_raw, true) : null;

    if (!is_array($documents)) {
        error_log("Invalid or missing documents array: " . json_encode($documents_raw));
        return [];
    }

    $url_fields = ['link', 'oa_link', 'identifier', 'relation', 'fulltext'];
    $valid_urls = [];

    foreach ($documents as $entry) {
        if (!is_array($entry) || ($entry["id"] ?? null) !== $paper_id) {
            continue;
        }

        foreach ($url_fields as $field) {
            if (!isset($entry[$field])) {
                continue;
            }

            $value = $entry[$field];
            if (is_array($value)) {
                $candidates = $value;
            } elseif (is_string($value)) {
                // Several links may be packed into one field, separated by ";".
                $candidates = explode(";", $value);
            } else {
                continue;
            }

            foreach ($candidates as $candidate) {
                // Nested arrays / numbers cannot be URLs and must not reach trim().
                if (!is_string($candidate)) {
                    continue;
                }

                $candidate = trim($candidate);
                if (filter_var($candidate, FILTER_VALIDATE_URL)) {
                    $valid_urls[] = $candidate;
                }
            }
        }
    }

    // array_unique() keeps the original keys; re-index so callers get a plain list.
    return array_values(array_unique($valid_urls));
}
150+
151+
/**
 * Picks the most promising PDF link out of a ";"-separated list of links.
 *
 * Resolution order:
 *   1. the first link ending in ".pdf" is returned as is;
 *   2. the first link containing "dx.doi.org" is resolved with getRedirectURL();
 *   3. the first link containing "doaj.org" is turned into a full-text link with
 *      getRedirectDOAJ() and resolved; if that fails, all DOAJ links and all
 *      entries that are not valid URLs are dropped from the list;
 *   4. the remaining links are tried one by one with getRedirectURL().
 *
 * @param string $url Links separated by ";" (surrounding whitespace is ignored).
 * @return string|false|null Direct link or whatever getRedirectURL() returned for
 *                           the chosen link; false when nothing could be resolved.
 */
function getPDFLinkForBASE($url) {
    $link_list = array_filter(array_map('trim', explode(";", (string) $url)));

    foreach ($link_list as $item) {
        if (substr($item, -strlen(".pdf")) === ".pdf") {
            return $item;
        }
    }

    // strpos() returns 0 for a match at the start of the string, so the result
    // has to be compared against false instead of being used as a boolean.
    $matches_doi = array_filter($link_list, function($item) { return strpos($item, "dx.doi.org") !== false; });
    if (count($matches_doi) != 0) {
        return getRedirectURL(array_values($matches_doi)[0]);
    }

    $matches_doaj = array_filter($link_list, function($item) { return strpos($item, "doaj.org") !== false; });
    if (count($matches_doaj) != 0) {
        $fulltext_url = getRedirectDOAJ(array_values($matches_doaj)[0]);
        if ($fulltext_url != false) {
            return getRedirectURL($fulltext_url);
        }

        //Remove all DOAJ entries and all entries that are not URLs
        $link_list = array_filter($link_list, function($item) { return strpos($item, "doaj.org") === false; });
        $link_list = array_filter($link_list, function($item) { return filter_var($item, FILTER_VALIDATE_URL); });
    }

    foreach ($link_list as $fallback_url) {
        $resolved = getRedirectURL($fallback_url);
        if ($resolved) {
            return $resolved;
        }
    }

    // Every candidate has been tried (or there was none): report failure
    // instead of re-fetching the first link or indexing an empty list.
    return false;
}
63187

64188
//Example:
65189
//https://doaj.org/api/v1/search/articles/id%3A90764de0bd144959b1d2727c91285eb3
66190
function getRedirectDOAJ($doaj_url) {
67191
$id = substr(strrchr($doaj_url, '/' ), 1);
68192
$url = "https://doaj.org/api/v1/search/articles/id%3A" . $id;
69-
193+
70194
$response = file_get_contents($url);
71-
195+
72196
$array = json_decode($response, true);
73197
$fulltext_link = null;
74198
if($array["total"] > 0) {
@@ -77,7 +201,7 @@ function getRedirectDOAJ($doaj_url) {
77201
if($link["type"] === "fulltext") {
78202
$fulltext_link = $link["url"];
79203
}
80-
}
204+
}
81205
}
82206
return ($fulltext_link === null)?(false):($fulltext_link);
83207
}
@@ -93,25 +217,24 @@ function getContentFromURL($link) {
93217
$response = curl_exec($ch);
94218
$redir = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
95219
curl_close($ch);
96-
220+
97221
return array($response, $redir);
98222
}
99223

100224
/**
 * Fetches a link and lets parsePDFLink() derive a PDF link from the response.
 *
 * @param string $link URL handed to getContentFromURL().
 * @return mixed Result of parsePDFLink() for the fetched content and its effective URL.
 */
function getRedirectURL($link) {
    $fetched = getContentFromURL($link);
    $content = $fetched[0];
    $effective_url = $fetched[1];

    return parsePDFLink($content, $effective_url);
}
105228

106229
function parsePDFLink($source, $url) {
107230
if(substr($source, 0, 4) === "%PDF") {
108231
return $url;
109232
}
110-
233+
111234
$has_match = preg_match_all('/meta[\s]+content=["\']+([^"\']+)["\']+[\s]+name[\s]*=[\s]*["\']+citation_pdf_url["\']+/i', $source, $matches);
112235
if(!$has_match) {
113236
$has_match = preg_match_all('/meta[\s]+name[\s]*=[\s]*["\']+citation_pdf_url["\']+[\s]+content=["\']+([^"\']+)["\']+/i', $source, $matches);
114-
237+
115238
if(!$has_match) {
116239
$has_match = preg_match_all('/["\']?([^"\'\s>]+(?:\.pdf))["\']?/i', $source, $matches);
117240
}
@@ -125,39 +248,37 @@ function parsePDFLink($source, $url) {
125248
return $best_match;
126249
}
127250
} else {
128-
return false;
251+
return false;
129252
}
130253
}
131254

132255
/**
 * Tells whether $haystack begins with $needle (byte-wise, case-sensitive).
 *
 * @param string $haystack String to inspect.
 * @param string $needle   Expected prefix.
 * @return bool True when the leading strlen($needle) bytes of $haystack are identical to $needle.
 */
function startsWith($haystack, $needle) {
    $prefix = substr($haystack, 0, strlen($needle));

    return $prefix === $needle;
}
136259

137260
/**
 * Downloads the content behind $url into $images_path . $filename and keeps
 * the file only if it is detected as a PDF.
 *
 * Every failure ends the request through returnError().
 *
 * @param string $url         URL fetched with getContentFromURL().
 * @param string $images_path Directory prefix of the target file.
 * @param string $filename    Bare file name (no directory components) of the target file.
 */
function getPDFAndDownload($url, $images_path, $filename) {
    // The file name comes from the request: accept only a plain name so the
    // write can never leave $images_path (e.g. via "../").
    $name = (string) $filename;
    if ($name === '' || $name === '.' || $name === '..' || basename($name) !== $name) {
        returnError("Invalid file name");
    }

    $output_path = $images_path . $name;
    $pdf = getContentFromURL($url)[0];

    if ($pdf === false) {
        returnError("Unable to get PDF from the URL");
    }

    if (file_put_contents($output_path, $pdf) === false) {
        returnError("Unable to write PDF to disk");
    }

    // Verify by content sniffing, not by extension, that a PDF was stored.
    $finfo = finfo_open(FILEINFO_MIME_TYPE);
    if ($finfo === false) {
        unlink($output_path);
        returnError("Unable to determine MIME type");
    }
    $mime_type = finfo_file($finfo, $output_path);
    finfo_close($finfo);

    if (strtolower((string) $mime_type) != "application/pdf") {
        unlink($output_path);
        returnError("MIME type is not application/pdf");
    }
}
162279

163-
280+
/**
 * Logs the reason for a failure, sends the generic error response and stops the script.
 *
 * The reason is written to the error log only; the client receives just
 * {"status": "error"} via CommUtils::echoOrCallback().
 *
 * @param string $reason Message appended to the "Error: " log prefix.
 */
function returnError(string $reason): void {
    error_log("Error: " . $reason);

    $error_payload = json_encode(array("status" => "error"));
    library\CommUtils::echoOrCallback($error_payload, $_GET);

    exit;
}

0 commit comments

Comments
 (0)