66class ExportService
77{
88
9-
/**
 * Convert a page to a self-contained HTML file.
 * The export stylesheet is read from the public directory and handed to the
 * 'pages/export' view; the rendered markup is then passed through
 * containHtml(), which embeds images as base64 data and rewrites links.
 * @param Page $page The page to export.
 * @return mixed|string The rendered, self-contained HTML.
 */
public function pageToContainedHtml(Page $page)
{
    // Load the stylesheet contents so the view can inline them.
    $cssContent = file_get_contents(public_path('/css/export-styles.css'));
    $pageHtml = view('pages/export', ['page' => $page, 'css' => $cssContent])->render();
    return $this->containHtml($pageHtml);
}
21+
/**
 * Convert a page to a pdf file.
 * Renders the 'pages/pdf' view with the export stylesheet, bundles the
 * markup via containHtml() and loads the result into the PDF generator.
 * @param Page $page The page to export.
 * @return mixed|string Whatever the PDF object's output() call returns
 *                      (presumably the raw PDF document bytes; confirm
 *                      against the PDF library in use).
 */
public function pageToPdf(Page $page)
{
    // Load the stylesheet contents so the view can inline them.
    $cssContent = file_get_contents(public_path('/css/export-styles.css'));
    $pageHtml = view('pages/pdf', ['page' => $page, 'css' => $cssContent])->render();
    // Bundle the markup first so the generator receives self-contained HTML.
    $containedHtml = $this->containHtml($pageHtml);
    $pdf = \PDF::loadHTML($containedHtml);
    return $pdf->output();
}
2035
36+ /**
37+ * Bundle of the contents of a html file to be self-contained.
38+ * @param $htmlContent
39+ * @return mixed|string
40+ */
41+ protected function containHtml ($ htmlContent )
42+ {
2143 $ imageTagsOutput = [];
22- preg_match_all ("/\<img.*src\=(\'| \")(.*?)(\'| \").*?\>/i " , $ pageHtml , $ imageTagsOutput );
44+ preg_match_all ("/\<img.*src\=(\'| \")(.*?)(\'| \").*?\>/i " , $ htmlContent , $ imageTagsOutput );
2345
2446 // Replace image src with base64 encoded image strings
2547 if (isset ($ imageTagsOutput [0 ]) && count ($ imageTagsOutput [0 ]) > 0 ) {
@@ -34,12 +56,12 @@ public function pageToContainedHtml(Page $page)
3456 $ imageContent = file_get_contents ($ pathString );
3557 $ imageEncoded = 'data:image/ ' . pathinfo ($ pathString , PATHINFO_EXTENSION ) . ';base64, ' . base64_encode ($ imageContent );
3658 $ newImageString = str_replace ($ srcString , $ imageEncoded , $ oldImgString );
37- $ pageHtml = str_replace ($ oldImgString , $ newImageString , $ pageHtml );
59+ $ htmlContent = str_replace ($ oldImgString , $ newImageString , $ htmlContent );
3860 }
3961 }
4062
4163 $ linksOutput = [];
42- preg_match_all ("/\<a.*href\=(\'| \")(.*?)(\'| \").*?\>/i " , $ pageHtml , $ linksOutput );
64+ preg_match_all ("/\<a.*href\=(\'| \")(.*?)(\'| \").*?\>/i " , $ htmlContent , $ linksOutput );
4365
4566 // Replace relative link hrefs with absolute URLs
4567 if (isset ($ linksOutput [0 ]) && count ($ linksOutput [0 ]) > 0 ) {
@@ -49,13 +71,45 @@ public function pageToContainedHtml(Page $page)
4971 if (strpos (trim ($ srcString ), 'http ' ) !== 0 ) {
5072 $ newSrcString = url ($ srcString );
5173 $ newLinkString = str_replace ($ srcString , $ newSrcString , $ oldLinkString );
52- $ pageHtml = str_replace ($ oldLinkString , $ newLinkString , $ pageHtml );
74+ $ htmlContent = str_replace ($ oldLinkString , $ newLinkString , $ htmlContent );
5375 }
5476 }
5577 }
5678
5779 // Replace any relative links with system domain
58- return $ pageHtml ;
80+ return $ htmlContent ;
81+ }
82+
/**
 * Converts the page contents into simple plain text.
 * Collapses runs of spaces, reduces runs of line breaks and non-breaking
 * spaces to a single blank line, decodes HTML entities and prefixes the
 * page name as a title.
 * @param Page $page The page to export; its `text` and `name` are read.
 * @return string The page name, a blank line, then the cleaned text.
 */
public function pageToPlainText(Page $page)
{
    $text = (string) $page->text;

    // Replace multiple spaces with single spaces
    $collapsed = preg_replace('/\ {2,}/', ' ', $text);
    if ($collapsed !== null) {
        $text = $collapsed;
    }

    // Reduce runs of line feeds, carriage returns and non-breaking spaces
    // to one blank line. preg_replace() returns null when the subject is
    // not valid UTF-8 under the /u modifier; keep the text as-is in that
    // case rather than discarding the whole page body.
    $reduced = preg_replace('/[\x{A0}\r\n]{2,}/u', "\n\n", $text);
    if ($reduced !== null) {
        $text = $reduced;
    }

    // Flags and charset are explicit so the result does not depend on the
    // PHP version's defaults; ENT_QUOTES also decodes single-quote entities.
    $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');

    // Add title
    return $page->name . "\n\n" . $text;
}
60102
61- }
103+ }
104+
105+
106+
107+
108+
109+
110+
111+
112+
113+
114+
115+
0 commit comments