44
55require_once dirname (__FILE__ ) . '/../classes/headstart/library/CommUtils.php ' ;
66require_once dirname (__FILE__ ) . '/../classes/headstart/library/Toolkit.php ' ;
7-
87require 'helper.php ' ;
98
109use headstart \library ;
1110
// Load the preprocessing configuration; its "general" section supplies the
// directory that downloaded PDFs are written to.
$INI_DIR = __DIR__ . "/../preprocessing/conf/";
$ini_array = library\Toolkit::loadIni($INI_DIR);

// Request parameters, all taken from the query string.
$url = library\CommUtils::getParameter($_GET, "url");
$filename = library\CommUtils::getParameter($_GET, "filename");
$service = library\CommUtils::getParameter($_GET, "service");
$paper_id = library\CommUtils::getParameter($_GET, "paper_id");
$vis_id = library\CommUtils::getParameter($_GET, "vis_id");
$vis_type = library\CommUtils::getParameter($_GET, "vis_type");

$images_path = $ini_array["general"]["images_path"];

// Both handlers end the request themselves (through returnError) when anything
// goes wrong, so reaching the success response below means the PDF was stored.
if (!isServiceWithPDFList($service)) {
    handleSingleUrlService($vis_id, $paper_id, $url, $images_path, $filename, $vis_type);
} else {
    handleMultiPdfService($vis_id, $paper_id, $images_path, $filename, $vis_type);
}

library\CommUtils::echoOrCallback(json_encode(["status" => "success", "file" => $filename]), $_GET);
3530
36- function getPDFLinkforBASE ($ url ) {
37- $ link_list = preg_split ("/;/ " , $ url );
38-
39- $ matches_pdf = array_filter ($ link_list , function ($ item ) { return substr ($ item , -strlen (".pdf " )) === ".pdf " ; });
40- if (count ($ matches_pdf ) != 0 ) {
41- return array_values ($ matches_pdf )[0 ];
42- }
43-
44- $ matches_doi = array_filter ($ link_list , function ($ item ) { return strpos ($ item , "dx.doi.org " ); });
45- if (count ($ matches_doi ) != 0 ) {
46- return getRedirectURL (array_values ($ matches_doi )[0 ]);
47- }
48-
49- $ matches_doaj = array_filter ($ link_list , function ($ item ) { return strpos ($ item , "doaj.org " ); });
50- if (count ($ matches_doaj ) != 0 ) {
51- $ url = getRedirectDOAJ (array_values ($ matches_doaj )[0 ]);
52- if ($ url != false ) {
53- return getRedirectURL ($ url );
54- } else {
55- //Remove all DOAJ entries and all entries that are not URLs
56- $ link_list = array_filter ($ link_list , function ($ item ) { return !strpos ($ item , "doaj.org " ); });
57- $ link_list = array_filter ($ link_list , function ($ item ) { return filter_var ($ item , FILTER_VALIDATE_URL ); });
58- }
59- }
60-
61- return getRedirectURL (array_values ($ link_list )[0 ]);
/**
 * Tells whether a service is one of those whose PDF link is resolved from a
 * list of candidate URLs ("base" or "openaire") rather than from a single URL.
 *
 * @param string $service Service identifier from the request.
 * @return bool True for "base" and "openaire", false for anything else.
 */
function isServiceWithPDFList(string $service): bool {
    return $service === "base" || $service === "openaire";
}
34+
/**
 * Resolves a PDF link from the URLs stored for a paper and downloads it.
 *
 * The URLs come from getValidURLs(), are joined with ";" and handed to
 * getPDFLinkForBASE(). If no link can be resolved, the request is ended
 * through returnError().
 *
 * @param string $vis_id      Visualization whose latest revision is consulted.
 * @param string $paper_id    Id of the paper inside that revision.
 * @param string $images_path Directory prefix the PDF is written under.
 * @param string $filename    Target file name, appended to $images_path.
 * @param string $vis_type    Visualization type, forwarded to getValidURLs().
 */
function handleMultiPdfService(
    string $vis_id,
    string $paper_id,
    string $images_path,
    string $filename,
    string $vis_type
): void {
    $candidate_urls = getValidURLs($vis_id, $paper_id, $vis_type);
    $pdf_link = getPDFLinkForBASE(implode(";", $candidate_urls));

    if ($pdf_link) {
        getPDFAndDownload($pdf_link, $images_path, $filename);
        return;
    }

    // returnError() exits, so nothing runs after this call.
    returnError("No valid PDF link could be resolved from filtered URLs.");
}
52+
/**
 * Downloads the PDF at a caller-supplied URL, but only if that URL is one of
 * the links stored for the paper in the visualization's latest revision.
 *
 * Both the supplied URL and the stored URLs are url-decoded before they are
 * compared. A URL that is not among the stored ones ends the request through
 * returnError().
 *
 * @param string $vis_id      Visualization whose latest revision is consulted.
 * @param string $paper_id    Id of the paper inside that revision.
 * @param string $url         URL requested by the caller.
 * @param string $images_path Directory prefix the PDF is written under.
 * @param string $filename    Target file name, appended to $images_path.
 * @param string $vis_type    Visualization type, forwarded to getValidURLs().
 */
function handleSingleUrlService(
    string $vis_id,
    string $paper_id,
    string $url,
    string $images_path,
    string $filename,
    string $vis_type
): void {
    $allowed_urls = getValidURLs($vis_id, $paper_id, $vis_type);
    $requested_url = urldecode($url);

    $is_known_url = false;
    foreach ($allowed_urls as $allowed_url) {
        if (urldecode($allowed_url) === $requested_url) {
            $is_known_url = true;
            break;
        }
    }

    if (!$is_known_url) {
        returnError("Provided URL not found in valid paper links");
    }

    getPDFAndDownload($requested_url, $images_path, $filename);
}
72+
/**
 * Returns the URLs stored for a paper in the latest revision of a
 * visualization.
 *
 * Ends the request through returnError() when the revision cannot be fetched
 * or when it holds no valid URL for the paper, so a normal return always
 * yields a non-empty array.
 *
 * @param string $vis_id   Visualization id passed to fetchLatestRevision().
 * @param string $paper_id Id of the paper whose links are wanted.
 * @param string $vis_type Visualization type, forwarded to extractValidPdfUrls().
 * @return array URLs extracted by extractValidPdfUrls().
 */
function getValidURLs(string $vis_id, string $paper_id, string $vis_type) {
    $revision = fetchLatestRevision($vis_id);
    if (!$revision) {
        returnError("There are no revision data for such visualization id");
    }

    $paper_urls = extractValidPdfUrls($revision, $paper_id, $vis_type);
    if (empty($paper_urls)) {
        returnError("There are no valid PDF URLs from revision");
    }

    return $paper_urls;
}
88+
/**
 * Fetches the latest revision of a visualization from the sibling
 * getLatestRevision.php endpoint over HTTP and decodes its JSON response.
 *
 * The endpoint URL is derived from the current request: same server name and
 * the directory of the requested script.
 *
 * @param string $vis_id Visualization id; url-encoded into the query string.
 * @return array|null Decoded response, or null when the request fails or the
 *                    body does not decode to an array (both cases are logged).
 */
function fetchLatestRevision(string $vis_id): ?array {
    // REQUEST_URI still carries the query string. Strip it before dirname(),
    // otherwise any "/" inside a parameter value (e.g. url=http://host/a.pdf)
    // is treated as a path separator and the endpoint URL comes out wrong.
    $request_path = explode("?", $_SERVER['REQUEST_URI'], 2)[0];

    $latest_url = "http://" . $_SERVER['SERVER_NAME'] . dirname($request_path)
        . "/getLatestRevision.php?vis_id=" . urlencode($vis_id) . "&context=true";

    // Warnings are suppressed on purpose: failure is reported via the log and
    // the null return instead.
    $revision_json = @file_get_contents($latest_url);
    if ($revision_json === false) {
        error_log("Failed to fetch metadata from getLatestRevision.php");
        return null;
    }

    $revision_data = json_decode($revision_json, true);
    if (!is_array($revision_data)) {
        error_log("Invalid JSON returned from getLatestRevision.php");
        return null;
    }

    return $revision_data;
}
106+
/**
 * Collects every valid URL stored for one paper in a revision.
 *
 * The revision's "data" entry is a JSON string. Inside it, the list of
 * document entries is itself JSON-encoded: under "documents" normally, or
 * under "data" when $vis_type is "timeline" (case-insensitive). For the entry
 * whose "id" equals $paper_id, the fields link, oa_link, identifier, relation
 * and fulltext are read; each may be an array or a ";"-separated string.
 *
 * @param array  $revision_data Decoded revision as returned by fetchLatestRevision().
 * @param string $paper_id      Id of the paper whose URLs are wanted.
 * @param string $vis_type      Visualization type; only "timeline" is special-cased.
 * @return array De-duplicated list of URLs accepted by FILTER_VALIDATE_URL;
 *               empty when the document list is missing or malformed (logged).
 */
function extractValidPdfUrls(array $revision_data, string $paper_id, string $vis_type): array {
    // Only decode actual strings: a missing or non-string payload is treated
    // as "no data" instead of being passed to json_decode().
    $payload_raw = $revision_data["data"] ?? null;
    $inner_data = is_string($payload_raw) ? json_decode($payload_raw, true) : null;
    if (!is_array($inner_data)) {
        $inner_data = [];
    }

    $documents_key = strtolower($vis_type) === 'timeline' ? "data" : "documents";
    $documents_raw = $inner_data[$documents_key] ?? null;
    $documents = is_string($documents_raw) ? json_decode($documents_raw, true) : null;

    if (!is_array($documents)) {
        error_log("Invalid or missing documents array: " . json_encode($documents_raw));
        return [];
    }

    $url_fields = ['link', 'oa_link', 'identifier', 'relation', 'fulltext'];
    $valid_urls = [];

    foreach ($documents as $entry) {
        if (!is_array($entry) || ($entry["id"] ?? null) !== $paper_id) {
            continue;
        }

        foreach ($url_fields as $field) {
            if (!isset($entry[$field])) {
                continue;
            }

            $candidates = is_array($entry[$field])
                ? $entry[$field]
                : explode(";", (string) $entry[$field]);

            foreach ($candidates as $candidate) {
                // Nested arrays or other non-string values cannot be URLs.
                if (!is_string($candidate)) {
                    continue;
                }

                $candidate = trim($candidate);
                if (filter_var($candidate, FILTER_VALIDATE_URL)) {
                    $valid_urls[] = $candidate;
                }
            }
        }
    }

    // array_unique() keeps the original keys; reindex so callers get a list.
    return array_values(array_unique($valid_urls));
}
150+
/**
 * Picks the most promising PDF link out of a ";"-separated list of links.
 *
 * Order of preference:
 *  1. the first link ending in ".pdf", returned as is;
 *  2. the first link containing "dx.doi.org", resolved via getRedirectURL();
 *  3. the first link containing "doaj.org", resolved via getRedirectDOAJ()
 *     and then getRedirectURL();
 *  4. the first remaining link that getRedirectURL() can resolve. If the DOAJ
 *     lookup failed, DOAJ links and non-URLs are dropped first.
 *
 * @param string $url Links separated by ";"; blanks around entries are ignored.
 * @return string|false The chosen link, or a falsy value when none was found.
 */
function getPDFLinkForBASE($url) {
    $link_list = array_filter(array_map('trim', explode(";", (string) $url)));

    $matches_pdf = array_filter($link_list, function ($item) {
        return substr($item, -strlen(".pdf")) === ".pdf";
    });
    if (count($matches_pdf) !== 0) {
        return array_values($matches_pdf)[0];
    }

    // strpos() returns 0 for a match at the very start, which is falsy, so the
    // result must be compared against false rather than used as a boolean.
    $matches_doi = array_filter($link_list, function ($item) {
        return strpos($item, "dx.doi.org") !== false;
    });
    if (count($matches_doi) !== 0) {
        return getRedirectURL(array_values($matches_doi)[0]);
    }

    $matches_doaj = array_filter($link_list, function ($item) {
        return strpos($item, "doaj.org") !== false;
    });
    if (count($matches_doaj) !== 0) {
        $fulltext_url = getRedirectDOAJ(array_values($matches_doaj)[0]);
        if ($fulltext_url != false) {
            return getRedirectURL($fulltext_url);
        }

        // Remove all DOAJ entries and all entries that are not URLs
        $link_list = array_filter($link_list, function ($item) {
            return strpos($item, "doaj.org") === false
                && filter_var($item, FILTER_VALIDATE_URL);
        });
    }

    foreach ($link_list as $fallback_url) {
        $resolved = getRedirectURL($fallback_url);
        if ($resolved) {
            return $resolved;
        }
    }

    // Every candidate has been tried (or there were none). Report failure
    // instead of re-requesting the first link or indexing an empty list.
    return false;
}
63187
// Example of a DOAJ API lookup URL:
// https://doaj.org/api/v1/search/articles/id%3A90764de0bd144959b1d2727c91285eb3
66190function getRedirectDOAJ ($ doaj_url ) {
67191 $ id = substr (strrchr ($ doaj_url , '/ ' ), 1 );
68192 $ url = "https://doaj.org/api/v1/search/articles/id%3A " . $ id ;
69-
193+
70194 $ response = file_get_contents ($ url );
71-
195+
72196 $ array = json_decode ($ response , true );
73197 $ fulltext_link = null ;
74198 if ($ array ["total " ] > 0 ) {
@@ -77,7 +201,7 @@ function getRedirectDOAJ($doaj_url) {
77201 if ($ link ["type " ] === "fulltext " ) {
78202 $ fulltext_link = $ link ["url " ];
79203 }
80- }
204+ }
81205 }
82206 return ($ fulltext_link === null )?(false ):($ fulltext_link );
83207}
@@ -93,25 +217,24 @@ function getContentFromURL($link) {
93217 $ response = curl_exec ($ ch );
94218 $ redir = curl_getinfo ($ ch , CURLINFO_EFFECTIVE_URL );
95219 curl_close ($ ch );
96-
220+
97221 return array ($ response , $ redir );
98222}
99223
/**
 * Fetches a link and looks for a PDF link in what comes back.
 *
 * @param string $link URL to request.
 * @return mixed Whatever parsePDFLink() returns for the fetched content and
 *               the effective URL reported by getContentFromURL().
 */
function getRedirectURL($link) {
    list($content, $effective_url) = getContentFromURL($link);

    return parsePDFLink($content, $effective_url);
}
105228
106229function parsePDFLink ($ source , $ url ) {
107230 if (substr ($ source , 0 , 4 ) === "%PDF " ) {
108231 return $ url ;
109232 }
110-
233+
111234 $ has_match = preg_match_all ('/meta[\s]+content=[" \']+([^" \']+)[" \']+[\s]+name[\s]*=[\s]*[" \']+citation_pdf_url[" \']+/i ' , $ source , $ matches );
112235 if (!$ has_match ) {
113236 $ has_match = preg_match_all ('/meta[\s]+name[\s]*=[\s]*[" \']+citation_pdf_url[" \']+[\s]+content=[" \']+([^" \']+)[" \']+/i ' , $ source , $ matches );
114-
237+
115238 if (!$ has_match ) {
116239 $ has_match = preg_match_all ('/[" \']?([^" \'\s>]+(?:\.pdf))[" \']?/i ' , $ source , $ matches );
117240 }
@@ -125,39 +248,37 @@ function parsePDFLink($source, $url) {
125248 return $ best_match ;
126249 }
127250 } else {
128- return false ;
251+ return false ;
129252 }
130253}
131254
/**
 * Checks whether a string begins with a given prefix (byte-wise, case-sensitive).
 *
 * @param string $haystack String to inspect.
 * @param string $needle   Prefix to look for.
 * @return bool True when the first strlen($needle) bytes of $haystack equal $needle.
 */
function startsWith($haystack, $needle) {
    return substr($haystack, 0, strlen($needle)) === $needle;
}
136259
/**
 * Downloads a URL to $images_path . $filename and keeps the file only if its
 * detected MIME type is application/pdf.
 *
 * Any failure (download, write, or wrong MIME type) ends the request through
 * returnError(); a file with the wrong type is deleted first.
 *
 * @param string $url         URL to download.
 * @param string $images_path Directory prefix of the target file.
 * @param string $filename    File name appended to $images_path.
 */
function getPDFAndDownload($url, $images_path, $filename) {
    // NOTE(review): $filename is appended unmodified. If callers pass a
    // request-supplied name, confirm it cannot contain path separators or "..".
    $output_path = $images_path . $filename;
    $pdf = getContentFromURL($url)[0];

    if ($pdf === false) {
        returnError("Unable to get PDF from the URL");
    }

    // Stop here on a failed write; otherwise finfo_file() and unlink() below
    // would run against a file that does not exist.
    if (file_put_contents($output_path, $pdf) === false) {
        returnError("Unable to write the downloaded PDF to disk");
    }

    $finfo = finfo_open(FILEINFO_MIME_TYPE);
    $mime_type = finfo_file($finfo, $output_path);
    finfo_close($finfo);

    // finfo_file() returns false on failure; the cast turns that into "".
    if (strtolower((string) $mime_type) !== "application/pdf") {
        unlink($output_path);
        returnError("MIME type is not application/pdf");
    }
}
162279
163-
/**
 * Logs a failure reason, sends the generic error response and ends the request.
 *
 * The reason only goes to the error log; the client receives just
 * {"status":"error"}. This function never returns to its caller.
 *
 * @param string $reason Human-readable cause, written to the error log.
 */
function returnError(string $reason): void {
    error_log("Error: " . $reason);

    $payload = json_encode(["status" => "error"]);
    library\CommUtils::echoOrCallback($payload, $_GET);

    exit();
}
0 commit comments