|
1 | 1 | package com.zhazhapan.vspider; |
2 | 2 |
|
3 | | -import java.net.URL; |
4 | | -import java.util.regex.Matcher; |
5 | | -import java.util.regex.Pattern; |
6 | | - |
7 | 3 | import com.zhazhapan.util.Checker; |
8 | 4 | import com.zhazhapan.util.Downloader; |
9 | 5 | import com.zhazhapan.vspider.models.CrawlConfig; |
10 | | -import com.zhazhapan.vspider.modules.constant.Values; |
11 | | - |
| 6 | +import com.zhazhapan.vspider.models.MysqlConfig; |
| 7 | +import com.zhazhapan.vspider.modules.constant.DefaultConfigValues; |
| 8 | +import com.zhazhapan.vspider.modules.constant.SpiderValueConsts; |
12 | 9 | import edu.uci.ics.crawler4j.crawler.Page; |
13 | 10 | import edu.uci.ics.crawler4j.crawler.WebCrawler; |
14 | 11 | import edu.uci.ics.crawler4j.parser.HtmlParseData; |
15 | 12 | import edu.uci.ics.crawler4j.url.WebURL; |
16 | 13 | import javafx.application.Platform; |
| 14 | +import javafx.util.Pair; |
| 15 | + |
| 16 | +import java.net.URL; |
| 17 | +import java.sql.SQLException; |
| 18 | +import java.util.regex.Matcher; |
| 19 | +import java.util.regex.Pattern; |
17 | 20 |
|
18 | 21 | /** |
19 | 22 | * @author pantao |
20 | | - * |
21 | 23 | */ |
22 | 24 | public class Crawler extends WebCrawler { |
23 | 25 |
|
24 | | - // private final Pattern FILTER_PATTERN = |
25 | | - // Pattern.compile(".*\\.(js|css)(\\?.*)?$", Pattern.CASE_INSENSITIVE); |
| 26 | + // private final Pattern FILTER_PATTERN = |
| 27 | + // Pattern.compile(".*\\.(js|css)(\\?.*)?$", Pattern.CASE_INSENSITIVE); |
26 | 28 |
|
27 | | - /** |
28 | | - * Matches image URLs: an optional http: or https: scheme, a double slash, a path, a dot plus one of the listed image extensions, and an optional query string. Matching is case-insensitive. |
29 | | - */ |
30 | | - public final Pattern IMAGES_PATTERN = Pattern.compile( |
31 | | - "(https?:)?//[^\\s&;\"':<>]*?\\.(bmp|gif|jpe?g|png|tiff?|pcx|tga|svg|pic)(\\?[^?\\s\"':<>]*)?", |
32 | | - Pattern.CASE_INSENSITIVE); |
| 29 | + /** |
| 30 | + * Matches image URLs: an optional http: or https: scheme, a double slash, a path, a dot plus one of the listed image extensions, and an optional query string. Matching is case-insensitive. The empty string operands in the concatenation below contribute nothing to the regex. |
| 31 | + */ |
| 32 | + public final Pattern IMAGES_PATTERN = Pattern.compile("(https?:)?//[^\\s&;\"':<>]*?\\." + "" + "" + "" + "" + "" |
| 33 | + + "(bmp|gif|jpe?g|png|tiff?|pcx|tga|svg|pic)(\\?[^?\\s\"':<>]*)?", Pattern.CASE_INSENSITIVE); |
33 | 34 |
|
34 | | - /** |
35 | | - * Matches media file URLs (video and audio extensions such as avi, mp4, mp3, wav), with an optional scheme and an optional query string. Matching is case-insensitive. The alternative flac is listed twice; the second occurrence is redundant. |
36 | | - */ |
37 | | - private final Pattern VIDEOS_PATTERN = Pattern.compile( |
38 | | - "(https?:)?//[^\\s&;\"':<>]*\\.(avi|mov|swf|asf|navi|wmv|3gp|mkv|flv|rm(vb)?|webm|mpg|mp4|qsv|mpe?g|mp3|aac|ogg|wav|flac|ape|wma|aif|au|ram|mmf|amr|flac)(\\?[^?\\s\"':<>]*)?", |
39 | | - Pattern.CASE_INSENSITIVE); |
| 35 | + /** |
| 36 | + * Matches media file URLs (video and audio extensions such as avi, mp4, mp3, wav), with an optional scheme and an optional query string. Matching is case-insensitive. The alternative flac is listed twice and the empty string operands add nothing; both are redundant. |
| 37 | + */ |
| 38 | + private final Pattern VIDEOS_PATTERN = Pattern.compile("(https?:)?//[^\\s&;\"':<>]*\\." + "" + "" + "" + "" + "" |
| 39 | + + "(avi|mov|swf|asf|navi|wmv|3gp|mkv|flv|rm(vb)" + |
| 40 | + "?|webm|mpg|mp4|qsv|mpe?g|mp3|aac|ogg|wav|flac|ape|wma|aif|au|ram|mmf|amr|flac)(\\?[^?\\s\"':<>]*)?", |
| 41 | + Pattern.CASE_INSENSITIVE); |
40 | 42 |
|
41 | | - /** |
42 | | - * Matches document URLs (pdf, doc/docx, txt, log, conf and several source and markup extensions), with an optional scheme and an optional query string. Matching is case-insensitive. NOTE(review): the alternative hml may be a typo for htm - confirm. |
43 | | - */ |
44 | | - private final Pattern DOCS_PATTERN = Pattern.compile( |
45 | | - "(https?:)?//[^\\s&;\"':<>]*\\.(pdf|docx?|txt|log|conf|java|xml|json|css|js|html|hml|php|wps|rtf)(\\?[^?\\s\"':<>]*)?", |
46 | | - Pattern.CASE_INSENSITIVE); |
| 43 | + /** |
| 44 | + * Matches document URLs (pdf, doc/docx, txt, log, conf and several source and markup extensions), with an optional scheme and an optional query string. Matching is case-insensitive. NOTE(review): the alternative hml may be a typo for htm - confirm. |
| 45 | + */ |
| 46 | + private final Pattern DOCS_PATTERN = Pattern.compile("(https?:)?//[^\\s&;\"':<>]*\\." + "" + "" + "" + "" + "" + |
| 47 | + "(pdf|docx?|txt|log|conf|java|xml|json|css|js|html|hml|php|wps|rtf)(\\?[^?\\s\"':<>]*)?", Pattern |
| 48 | + .CASE_INSENSITIVE); |
47 | 49 |
|
48 | | - /** |
49 | | - * Matches URLs of other file types (zip, rar, exe, iso, jar, xls/xlsx and more), with an optional scheme and an optional query string. Matching is case-insensitive. The alternative with an optional leading digit or letter before z covers extensions such as 7z and gz. |
50 | | - */ |
51 | | - private final Pattern OTHERS_PATTERN = Pattern.compile( |
52 | | - "(https?:)?//[^\\s&;\"':<>]*\\.(zip|[0-9a-z]?z|exe|dmg|iso|jar|msi|rar|tmp|xlsx?|mdf|com|c|asm|for|lib|lst|msg|obj|pas|wki|bas|map|bak|dot|bat|sh|rpm)(\\?[^?\\s\"':<>]*)?", |
53 | | - Pattern.CASE_INSENSITIVE); |
| 50 | + /** |
| 51 | + * Matches URLs of other file types (zip, rar, exe, iso, jar, xls/xlsx and more), with an optional scheme and an optional query string. Matching is case-insensitive. The alternative with an optional leading digit or letter before z covers extensions such as 7z and gz. |
| 52 | + */ |
| 53 | + private final Pattern OTHERS_PATTERN = Pattern.compile("(https?:)?//[^\\s&;\"':<>]*\\." + "" + "" + "" + "" + "" |
| 54 | + + "(zip|[0-9a-z]?z|exe|dmg|iso|jar|msi|rar|tmp|xlsx?|mdf|com|c|asm|for|lib|lst|msg|obj|pas|wki|bas|map" + |
| 55 | + "|bak" + "|dot|bat|sh|rpm)(\\?[^?\\s\"':<>]*)?", Pattern.CASE_INSENSITIVE); |
54 | 56 |
|
55 | | - /** |
56 | | - * Called by crawler4j; the pre-crawl (crawler) filter is matched here. Returns true, and records the URL in App.visitUrls, only when the URL matches App.crawlFilterPattern, is not already in App.visitUrls, and contains one of App.domains as a substring; otherwise returns false. |
57 | | - */ |
58 | | - @Override |
59 | | - public boolean shouldVisit(Page referringPage, WebURL url) { |
60 | | - String urlStr = url.getURL(); |
61 | | - if (App.crawlFilterPattern.matcher(urlStr).find() && !App.visitUrls.contains(urlStr)) { |
62 | | - for (String domain : App.domains) { |
63 | | - if (urlStr.contains(domain)) { |
64 | | - App.visitUrls.add(urlStr); |
65 | | - return true; |
66 | | - } |
67 | | - } |
68 | | - } |
69 | | - return false; |
70 | | - } |
| 57 | + /** |
| 58 | + * Called by crawler4j; the pre-crawl (crawler) filter is matched here. Returns true, and records the URL in SpiderApplication.visitUrls, only when the URL matches SpiderApplication.crawlFilterPattern, is not already in visitUrls, and contains one of SpiderApplication.domains as a substring; otherwise returns false. NOTE(review): the domain test is a substring check on the whole URL, not a host comparison - confirm this is intended. |
| 59 | + */ |
| 60 | + @Override |
| 61 | + public boolean shouldVisit(Page referringPage, WebURL url) { |
| 62 | + String urlStr = url.getURL(); |
| 63 | + if (SpiderApplication.crawlFilterPattern.matcher(urlStr).find() && !SpiderApplication.visitUrls.contains |
| 64 | + (urlStr)) { |
| 65 | + for (String domain : SpiderApplication.domains) { |
| 66 | + if (urlStr.contains(domain)) { |
| 67 | + SpiderApplication.visitUrls.add(urlStr); |
| 68 | + return true; |
| 69 | + } |
| 70 | + } |
| 71 | + } |
| 72 | + return false; |
| 73 | + } |
71 | 74 |
|
72 | | - /** |
73 | | - * Called by crawler4j; the link (visit) filter is matched here. When the page URL matches App.visitFilterPattern and the parse data is HtmlParseData, schedules a UI update via Platform.runLater (status label text plus a visiting line appended to htmlContent) and then passes the URL and the page HTML to downloadURL. Other pages are ignored. |
74 | | - */ |
75 | | - @Override |
76 | | - public void visit(Page page) { |
77 | | - String url = page.getWebURL().getURL(); |
78 | | - if (App.visitFilterPattern.matcher(url).find() && page.getParseData() instanceof HtmlParseData) { |
79 | | - HtmlParseData htmlParseData = (HtmlParseData) page.getParseData(); |
80 | | - Platform.runLater(() -> { |
81 | | - App.mainController.statusLabel.setText("validating url: " + url); |
82 | | - App.mainController.htmlContent.appendText(Values.VISITING_TIP + url + "\r\n"); |
83 | | - }); |
84 | | - downloadURL(url, htmlParseData.getHtml()); |
85 | | - } |
86 | | - } |
| 75 | + /** |
| 76 | + * Called by crawler4j; the link (visit) filter is matched here. When the page URL matches SpiderApplication.visitFilterPattern and the parse data is HtmlParseData, schedules a UI update via Platform.runLater (status label text plus a visiting line appended to htmlContent) and then passes the URL and the page HTML to downloadURL. Other pages are ignored. |
| 77 | + */ |
| 78 | + @Override |
| 79 | + public void visit(Page page) { |
| 80 | + String url = page.getWebURL().getURL(); |
| 81 | + if (SpiderApplication.visitFilterPattern.matcher(url).find() && page.getParseData() instanceof HtmlParseData) { |
| 82 | + HtmlParseData htmlParseData = (HtmlParseData) page.getParseData(); |
| 83 | + Platform.runLater(() -> { |
| 84 | + SpiderApplication.mainController.statusLabel.setText("validating url: " + url); |
| 85 | + SpiderApplication.mainController.htmlContent.appendText(SpiderValueConsts.VISITING_TIP + url + "\r\n"); |
| 86 | + }); |
| 87 | + downloadURL(url, htmlParseData.getHtml()); |
| 88 | + } |
| 89 | + } |
87 | 90 |
|
88 | | - /** |
89 | | - * Decides which resources need to be downloaded. For each category enabled in CrawlConfig (images, videos, docs, others) the HTML is scanned with the corresponding pattern and the matcher is handed to addURLs with the folder name image, media, document or others. |
90 | | - * If url is not empty and link crawling is enabled, the page URL itself is downloaded into the link folder; a URL starting with a double slash gets the http: scheme prepended. |
91 | | - * @param url |
92 | | - * URL of the visited page, passed as a {@link String} |
93 | | - * @param html |
94 | | - * HTML source of the page as a {@link String}; scanned by the resource patterns |
95 | | - */ |
96 | | - public void downloadURL(String url, String html) { |
97 | | - Matcher matcher; |
98 | | - if (CrawlConfig.getCrawlImages().get()) { |
99 | | - matcher = IMAGES_PATTERN.matcher(html); |
100 | | - addURLs("image", matcher); |
101 | | - } |
102 | | - if (CrawlConfig.getCrawlVideos().get()) { |
103 | | - matcher = VIDEOS_PATTERN.matcher(html); |
104 | | - addURLs("media", matcher); |
105 | | - } |
106 | | - if (CrawlConfig.getCrawlDocs().get()) { |
107 | | - matcher = DOCS_PATTERN.matcher(html); |
108 | | - addURLs("document", matcher); |
109 | | - } |
110 | | - if (CrawlConfig.getCrawlOthers().get()) { |
111 | | - matcher = OTHERS_PATTERN.matcher(html); |
112 | | - addURLs("others", matcher); |
113 | | - } |
114 | | - if (Checker.isNotEmpty(url) && CrawlConfig.getCrawlLinks().get()) { |
115 | | - String path = App.DOWNLOAD_FOLDER + Values.SEPARATOR + "link"; |
116 | | - Downloader.download(path, (url.startsWith("//") ? "http:" : "") + url); |
117 | | - } |
118 | | - } |
| 91 | + /** |
| 92 | + * Decides which resources need to be downloaded. For each category enabled in CrawlConfig (images, videos, docs, others) the HTML is scanned with the corresponding pattern and the matcher is handed to addURLs; if url is not empty and link crawling is enabled, the page URL itself is downloaded into the link folder. When MysqlConfig.isEnableCustom() is true and fields are configured, an insert statement is assembled from the field pairs (pair value is used as the column name, pair key is passed with the HTML to SpiderUtils.evaluate to produce the value), optionally appended to the file at DefaultConfigValues.SQL_PATH, and executed through SpiderApplication.statement. |
| 93 | + * NOTE(review): the statement is built by string concatenation from values derived from crawled HTML, which looks open to SQL injection or broken statements - confirm what SpiderUtils.evaluate returns and consider a PreparedStatement. An SQLException is only printed to standard output. |
| 94 | + * @param url URL of the visited page, passed as a String (the original tag linked {@link URL}) |
| 95 | + * @param html HTML source of the page as a {@link String}; scanned by the resource patterns and passed to SpiderUtils.evaluate |
| 96 | + */ |
| 97 | + public void downloadURL(String url, String html) { |
| 98 | + Matcher matcher; |
| 99 | + if (CrawlConfig.getCrawlImages().get()) { |
| 100 | + matcher = IMAGES_PATTERN.matcher(html); |
| 101 | + addURLs("image", matcher); |
| 102 | + } |
| 103 | + if (CrawlConfig.getCrawlVideos().get()) { |
| 104 | + matcher = VIDEOS_PATTERN.matcher(html); |
| 105 | + addURLs("media", matcher); |
| 106 | + } |
| 107 | + if (CrawlConfig.getCrawlDocs().get()) { |
| 108 | + matcher = DOCS_PATTERN.matcher(html); |
| 109 | + addURLs("document", matcher); |
| 110 | + } |
| 111 | + if (CrawlConfig.getCrawlOthers().get()) { |
| 112 | + matcher = OTHERS_PATTERN.matcher(html); |
| 113 | + addURLs("others", matcher); |
| 114 | + } |
| 115 | + if (Checker.isNotEmpty(url) && CrawlConfig.getCrawlLinks().get()) { |
| 116 | + String path = SpiderApplication.DOWNLOAD_FOLDER + SpiderValueConsts.SEPARATOR + "link"; |
| 117 | + Downloader.download(path, (url.startsWith("//") ? "http:" : "") + url); |
| 118 | + } |
| 119 | + if (MysqlConfig.isEnableCustom()) { |
| 120 | + StringBuilder preSlice = new StringBuilder("insert into " + MysqlConfig.getTableName() + "("); |
| 121 | + StringBuilder postSlice = new StringBuilder(" values("); |
| 122 | + if (Checker.isNotEmpty(MysqlConfig.getFields())) { |
| 123 | + for (Pair<String, String> pair : MysqlConfig.getFields()) { |
| 124 | + preSlice.append(pair.getValue()).append(","); |
| 125 | + postSlice.append(SpiderUtils.evaluate(pair.getKey(), html)).append(","); |
| 126 | + } |
| 127 | + String pre = preSlice.toString(); |
| 128 | + String post = postSlice.toString(); |
| 129 | + String sql = pre.substring(0, pre.length() - 1) + ")" + post.substring(0, post.length() - 1) + ")"; |
| 130 | + if (MysqlConfig.isEnableSql()) { |
| 131 | + SpiderUtils.saveFile(DefaultConfigValues.SQL_PATH, sql + "\r\n", true); |
| 132 | + } |
| 133 | + try { |
| 134 | + SpiderApplication.statement.executeUpdate(sql); |
| 135 | + } catch (SQLException e) { |
| 136 | + System.out.println(e.getMessage()); |
| 137 | + } |
| 138 | + } |
| 139 | + } |
| 140 | + } |
119 | 141 |
|
120 | | - /** |
121 | | - * Extracts downloadable resources from the page source: every match found by the matcher is passed to download together with the target folder. |
122 | | - * The target folder is App.DOWNLOAD_FOLDER, then Values.SEPARATOR, then the given path. |
123 | | - * @param path |
124 | | - * {@link String} folder name appended to the download folder |
125 | | - * @param matcher |
126 | | - * {@link Matcher} whose successive matches are treated as resource URLs |
127 | | - */ |
128 | | - public void addURLs(String path, Matcher matcher) { |
129 | | - path = App.DOWNLOAD_FOLDER + Values.SEPARATOR + path; |
130 | | - while (matcher.find()) { |
131 | | - String url = matcher.group(); |
132 | | - download(path, url); |
133 | | - } |
134 | | - } |
| 142 | + /** |
| 143 | + * Extracts downloadable resources from the page source: every match found by the matcher is passed to download together with the target folder. |
| 144 | + * The target folder is SpiderApplication.DOWNLOAD_FOLDER, then SpiderValueConsts.SEPARATOR, then the given path. |
| 145 | + * @param path {@link String} folder name appended to the download folder |
| 146 | + * @param matcher {@link Matcher} whose successive matches are treated as resource URLs |
| 147 | + */ |
| 148 | + public void addURLs(String path, Matcher matcher) { |
| 149 | + path = SpiderApplication.DOWNLOAD_FOLDER + SpiderValueConsts.SEPARATOR + path; |
| 150 | + while (matcher.find()) { |
| 151 | + String url = matcher.group(); |
| 152 | + download(path, url); |
| 153 | + } |
| 154 | + } |
135 | 155 |
|
136 | | - /** |
137 | | - * Downloads a resource file and adds its link to the download record; the download filter is matched here. The URL up to its first question mark is the de-duplication key in App.downloadUrls. The file goes into a sub-folder of path named after the text following the last dot of the URL, cut at Values.QUESTION_MARK if present. A URL starting with a double slash gets the http: scheme prepended. After the download the thread sleeps for App.crawlingDelay. |
138 | | - * NOTE(review): the extension is taken after the last dot of the full URL, so a dot inside the query string changes the folder name; an InterruptedException during the sleep is logged without restoring the interrupt flag - confirm both are acceptable. |
139 | | - * @param path |
140 | | - * {@link String} base folder for this resource category |
141 | | - * @param url |
142 | | - * resource URL as a String (the original tag linked {@link URL}) |
143 | | - */ |
144 | | - public void download(String path, String url) { |
145 | | - String realUrl = url.split("\\?")[0]; |
146 | | - if (App.downloadFilterPattern.matcher(url).find() && !App.downloadUrls.contains(realUrl)) { |
147 | | - App.downloadUrls.add(realUrl); |
148 | | - Platform.runLater(() -> App.mainController.logOut.appendText(Values.DOWNLOADING_TIP + url + "\r\n")); |
149 | | - path += Values.SEPARATOR + url.substring(url.lastIndexOf(".") + 1); |
150 | | - if (path.contains(Values.QUESTION_MARK)) { |
151 | | - path = path.substring(0, path.indexOf(Values.QUESTION_MARK)); |
152 | | - } |
153 | | - Downloader.download(path, (url.startsWith("//") ? "http:" : "") + url); |
154 | | - try { |
155 | | - Thread.sleep(App.crawlingDelay); |
156 | | - } catch (InterruptedException e) { |
157 | | - logger.error("thread sleep error: " + e.getMessage()); |
158 | | - } |
159 | | - } |
160 | | - } |
| 156 | + /** |
| 157 | + * Downloads a resource file and adds its link to the download record; the download filter is matched here. The URL up to its first question mark is the de-duplication key in SpiderApplication.downloadUrls. The file goes into a sub-folder of path named after the text following the last dot of the URL, cut at SpiderValueConsts.QUESTION_MARK if present. A URL starting with a double slash gets the http: scheme prepended. After the download the thread sleeps for SpiderApplication.crawlingDelay. |
| 158 | + * NOTE(review): the extension is taken after the last dot of the full URL, so a dot inside the query string changes the folder name; an InterruptedException during the sleep is logged without restoring the interrupt flag - confirm both are acceptable. |
| 159 | + * @param path {@link String} base folder for this resource category |
| 160 | + * @param url resource URL as a String (the original tag linked {@link URL}) |
| 161 | + */ |
| 162 | + public void download(String path, String url) { |
| 163 | + String realUrl = url.split("\\?")[0]; |
| 164 | + if (SpiderApplication.downloadFilterPattern.matcher(url).find() && !SpiderApplication.downloadUrls.contains |
| 165 | + (realUrl)) { |
| 166 | + SpiderApplication.downloadUrls.add(realUrl); |
| 167 | + Platform.runLater(() -> SpiderApplication.mainController.logOut.appendText(SpiderValueConsts |
| 168 | + .DOWNLOADING_TIP + url + "\r\n")); |
| 169 | + path += SpiderValueConsts.SEPARATOR + url.substring(url.lastIndexOf(".") + 1); |
| 170 | + if (path.contains(SpiderValueConsts.QUESTION_MARK)) { |
| 171 | + path = path.substring(0, path.indexOf(SpiderValueConsts.QUESTION_MARK)); |
| 172 | + } |
| 173 | + Downloader.download(path, (url.startsWith("//") ? "http:" : "") + url); |
| 174 | + try { |
| 175 | + Thread.sleep(SpiderApplication.crawlingDelay); |
| 176 | + } catch (InterruptedException e) { |
| 177 | + logger.error("thread sleep error: " + e.getMessage()); |
| 178 | + } |
| 179 | + } |
| 180 | + } |
161 | 181 | } |
0 commit comments