Skip to content
This repository was archived by the owner on Mar 6, 2024. It is now read-only.

Commit fb1d6d0

Browse files
committed
新增xpath自定义爬取
1 parent a8923c5 commit fb1d6d0

13 files changed

Lines changed: 654 additions & 280 deletions

File tree

pom.xml

Lines changed: 7 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -40,30 +40,17 @@
4040
<dependency>
4141
<groupId>edu.uci.ics</groupId>
4242
<artifactId>crawler4j</artifactId>
43-
<version>4.3</version>
43+
<version>4.4.0</version>
4444
</dependency>
4545
<dependency>
4646
<groupId>com.zhazhapan</groupId>
4747
<artifactId>util</artifactId>
48-
<version>1.0.6</version>
49-
<exclusions>
50-
<exclusion>
51-
<artifactId>commons-io</artifactId>
52-
<groupId>commons-io</groupId>
53-
</exclusion>
54-
<exclusion>
55-
<artifactId>cglib</artifactId>
56-
<groupId>cglib</groupId>
57-
</exclusion>
58-
<exclusion>
59-
<artifactId>docx4j-ImportXHTML</artifactId>
60-
<groupId>org.docx4j</groupId>
61-
</exclusion>
62-
<exclusion>
63-
<artifactId>poi</artifactId>
64-
<groupId>org.apache.poi</groupId>
65-
</exclusion>
66-
</exclusions>
48+
<version>1.0.7</version>
49+
</dependency>
50+
<dependency>
51+
<groupId>mysql</groupId>
52+
<artifactId>mysql-connector-java</artifactId>
53+
<version>8.0.9-rc</version>
6754
</dependency>
6855
</dependencies>
6956
</project>
Lines changed: 155 additions & 135 deletions
Original file line number | Diff line number | Diff line change
@@ -1,161 +1,181 @@
11
package com.zhazhapan.vspider;
22

3-
import java.net.URL;
4-
import java.util.regex.Matcher;
5-
import java.util.regex.Pattern;
6-
73
import com.zhazhapan.util.Checker;
84
import com.zhazhapan.util.Downloader;
95
import com.zhazhapan.vspider.models.CrawlConfig;
10-
import com.zhazhapan.vspider.modules.constant.Values;
11-
6+
import com.zhazhapan.vspider.models.MysqlConfig;
7+
import com.zhazhapan.vspider.modules.constant.DefaultConfigValues;
8+
import com.zhazhapan.vspider.modules.constant.SpiderValueConsts;
129
import edu.uci.ics.crawler4j.crawler.Page;
1310
import edu.uci.ics.crawler4j.crawler.WebCrawler;
1411
import edu.uci.ics.crawler4j.parser.HtmlParseData;
1512
import edu.uci.ics.crawler4j.url.WebURL;
1613
import javafx.application.Platform;
14+
import javafx.util.Pair;
15+
16+
import java.net.URL;
17+
import java.sql.SQLException;
18+
import java.util.regex.Matcher;
19+
import java.util.regex.Pattern;
1720

1821
/**
1922
* @author pantao
20-
*
2123
*/
2224
public class Crawler extends WebCrawler {
2325

24-
// private final Pattern FILTER_PATTERN =
25-
// Pattern.compile(".*\\.(js|css)(\\?.*)?$", Pattern.CASE_INSENSITIVE);
26+
// private final Pattern FILTER_PATTERN =
27+
// Pattern.compile(".*\\.(js|css)(\\?.*)?$", Pattern.CASE_INSENSITIVE);
2628

27-
/**
28-
* 匹配图片
29-
*/
30-
public final Pattern IMAGES_PATTERN = Pattern.compile(
31-
"(https?:)?//[^\\s&;\"':<>]*?\\.(bmp|gif|jpe?g|png|tiff?|pcx|tga|svg|pic)(\\?[^?\\s\"':<>]*)?",
32-
Pattern.CASE_INSENSITIVE);
29+
/**
30+
* 匹配图片
31+
*/
32+
public final Pattern IMAGES_PATTERN = Pattern.compile("(https?:)?//[^\\s&;\"':<>]*?\\." + "" + "" + "" + "" + ""
33+
+ "(bmp|gif|jpe?g|png|tiff?|pcx|tga|svg|pic)(\\?[^?\\s\"':<>]*)?", Pattern.CASE_INSENSITIVE);
3334

34-
/**
35-
* 匹配媒体文件
36-
*/
37-
private final Pattern VIDEOS_PATTERN = Pattern.compile(
38-
"(https?:)?//[^\\s&;\"':<>]*\\.(avi|mov|swf|asf|navi|wmv|3gp|mkv|flv|rm(vb)?|webm|mpg|mp4|qsv|mpe?g|mp3|aac|ogg|wav|flac|ape|wma|aif|au|ram|mmf|amr|flac)(\\?[^?\\s\"':<>]*)?",
39-
Pattern.CASE_INSENSITIVE);
35+
/**
36+
* 匹配媒体文件
37+
*/
38+
private final Pattern VIDEOS_PATTERN = Pattern.compile("(https?:)?//[^\\s&;\"':<>]*\\." + "" + "" + "" + "" + ""
39+
+ "(avi|mov|swf|asf|navi|wmv|3gp|mkv|flv|rm(vb)" +
40+
"?|webm|mpg|mp4|qsv|mpe?g|mp3|aac|ogg|wav|flac|ape|wma|aif|au|ram|mmf|amr|flac)(\\?[^?\\s\"':<>]*)?",
41+
Pattern.CASE_INSENSITIVE);
4042

41-
/**
42-
* 匹配文档
43-
*/
44-
private final Pattern DOCS_PATTERN = Pattern.compile(
45-
"(https?:)?//[^\\s&;\"':<>]*\\.(pdf|docx?|txt|log|conf|java|xml|json|css|js|html|hml|php|wps|rtf)(\\?[^?\\s\"':<>]*)?",
46-
Pattern.CASE_INSENSITIVE);
43+
/**
44+
* 匹配文档
45+
*/
46+
private final Pattern DOCS_PATTERN = Pattern.compile("(https?:)?//[^\\s&;\"':<>]*\\." + "" + "" + "" + "" + "" +
47+
"(pdf|docx?|txt|log|conf|java|xml|json|css|js|html|hml|php|wps|rtf)(\\?[^?\\s\"':<>]*)?", Pattern
48+
.CASE_INSENSITIVE);
4749

48-
/**
49-
* 匹配其他文件
50-
*/
51-
private final Pattern OTHERS_PATTERN = Pattern.compile(
52-
"(https?:)?//[^\\s&;\"':<>]*\\.(zip|[0-9a-z]?z|exe|dmg|iso|jar|msi|rar|tmp|xlsx?|mdf|com|c|asm|for|lib|lst|msg|obj|pas|wki|bas|map|bak|dot|bat|sh|rpm)(\\?[^?\\s\"':<>]*)?",
53-
Pattern.CASE_INSENSITIVE);
50+
/**
51+
* 匹配其他文件
52+
*/
53+
private final Pattern OTHERS_PATTERN = Pattern.compile("(https?:)?//[^\\s&;\"':<>]*\\." + "" + "" + "" + "" + ""
54+
+ "(zip|[0-9a-z]?z|exe|dmg|iso|jar|msi|rar|tmp|xlsx?|mdf|com|c|asm|for|lib|lst|msg|obj|pas|wki|bas|map" +
55+
"|bak" + "|dot|bat|sh|rpm)(\\?[^?\\s\"':<>]*)?", Pattern.CASE_INSENSITIVE);
5456

55-
/**
56-
* 由crawler4j调用,前置(爬虫)过滤将在这里进行匹配
57-
*/
58-
@Override
59-
public boolean shouldVisit(Page referringPage, WebURL url) {
60-
String urlStr = url.getURL();
61-
if (App.crawlFilterPattern.matcher(urlStr).find() && !App.visitUrls.contains(urlStr)) {
62-
for (String domain : App.domains) {
63-
if (urlStr.contains(domain)) {
64-
App.visitUrls.add(urlStr);
65-
return true;
66-
}
67-
}
68-
}
69-
return false;
70-
}
57+
/**
58+
* 由crawler4j调用,前置(爬虫)过滤将在这里进行匹配
59+
*/
60+
@Override
61+
public boolean shouldVisit(Page referringPage, WebURL url) {
62+
String urlStr = url.getURL();
63+
if (SpiderApplication.crawlFilterPattern.matcher(urlStr).find() && !SpiderApplication.visitUrls.contains
64+
(urlStr)) {
65+
for (String domain : SpiderApplication.domains) {
66+
if (urlStr.contains(domain)) {
67+
SpiderApplication.visitUrls.add(urlStr);
68+
return true;
69+
}
70+
}
71+
}
72+
return false;
73+
}
7174

72-
/**
73-
* 由crawler4j调用,链接(访问)过滤将在这里进行匹配
74-
*/
75-
@Override
76-
public void visit(Page page) {
77-
String url = page.getWebURL().getURL();
78-
if (App.visitFilterPattern.matcher(url).find() && page.getParseData() instanceof HtmlParseData) {
79-
HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
80-
Platform.runLater(() -> {
81-
App.mainController.statusLabel.setText("validating url: " + url);
82-
App.mainController.htmlContent.appendText(Values.VISITING_TIP + url + "\r\n");
83-
});
84-
downloadURL(url, htmlParseData.getHtml());
85-
}
86-
}
75+
/**
76+
* 由crawler4j调用,链接(访问)过滤将在这里进行匹配
77+
*/
78+
@Override
79+
public void visit(Page page) {
80+
String url = page.getWebURL().getURL();
81+
if (SpiderApplication.visitFilterPattern.matcher(url).find() && page.getParseData() instanceof HtmlParseData) {
82+
HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
83+
Platform.runLater(() -> {
84+
SpiderApplication.mainController.statusLabel.setText("validating url: " + url);
85+
SpiderApplication.mainController.htmlContent.appendText(SpiderValueConsts.VISITING_TIP + url + "\r\n");
86+
});
87+
downloadURL(url, htmlParseData.getHtml());
88+
}
89+
}
8790

88-
/**
89-
* 判断需要下载的资源
90-
*
91-
* @param url
92-
* {@link URL}
93-
* @param html
94-
* {@link String}
95-
*/
96-
public void downloadURL(String url, String html) {
97-
Matcher matcher;
98-
if (CrawlConfig.getCrawlImages().get()) {
99-
matcher = IMAGES_PATTERN.matcher(html);
100-
addURLs("image", matcher);
101-
}
102-
if (CrawlConfig.getCrawlVideos().get()) {
103-
matcher = VIDEOS_PATTERN.matcher(html);
104-
addURLs("media", matcher);
105-
}
106-
if (CrawlConfig.getCrawlDocs().get()) {
107-
matcher = DOCS_PATTERN.matcher(html);
108-
addURLs("document", matcher);
109-
}
110-
if (CrawlConfig.getCrawlOthers().get()) {
111-
matcher = OTHERS_PATTERN.matcher(html);
112-
addURLs("others", matcher);
113-
}
114-
if (Checker.isNotEmpty(url) && CrawlConfig.getCrawlLinks().get()) {
115-
String path = App.DOWNLOAD_FOLDER + Values.SEPARATOR + "link";
116-
Downloader.download(path, (url.startsWith("//") ? "http:" : "") + url);
117-
}
118-
}
91+
/**
92+
* 判断需要下载的资源
93+
*
94+
* @param url {@link URL}
95+
* @param html {@link String}
96+
*/
97+
public void downloadURL(String url, String html) {
98+
Matcher matcher;
99+
if (CrawlConfig.getCrawlImages().get()) {
100+
matcher = IMAGES_PATTERN.matcher(html);
101+
addURLs("image", matcher);
102+
}
103+
if (CrawlConfig.getCrawlVideos().get()) {
104+
matcher = VIDEOS_PATTERN.matcher(html);
105+
addURLs("media", matcher);
106+
}
107+
if (CrawlConfig.getCrawlDocs().get()) {
108+
matcher = DOCS_PATTERN.matcher(html);
109+
addURLs("document", matcher);
110+
}
111+
if (CrawlConfig.getCrawlOthers().get()) {
112+
matcher = OTHERS_PATTERN.matcher(html);
113+
addURLs("others", matcher);
114+
}
115+
if (Checker.isNotEmpty(url) && CrawlConfig.getCrawlLinks().get()) {
116+
String path = SpiderApplication.DOWNLOAD_FOLDER + SpiderValueConsts.SEPARATOR + "link";
117+
Downloader.download(path, (url.startsWith("//") ? "http:" : "") + url);
118+
}
119+
if (MysqlConfig.isEnableCustom()) {
120+
StringBuilder preSlice = new StringBuilder("insert into " + MysqlConfig.getTableName() + "(");
121+
StringBuilder postSlice = new StringBuilder(" values(");
122+
if (Checker.isNotEmpty(MysqlConfig.getFields())) {
123+
for (Pair<String, String> pair : MysqlConfig.getFields()) {
124+
preSlice.append(pair.getValue()).append(",");
125+
postSlice.append(SpiderUtils.evaluate(pair.getKey(), html)).append(",");
126+
}
127+
String pre = preSlice.toString();
128+
String post = postSlice.toString();
129+
String sql = pre.substring(0, pre.length() - 1) + ")" + post.substring(0, post.length() - 1) + ")";
130+
if (MysqlConfig.isEnableSql()) {
131+
SpiderUtils.saveFile(DefaultConfigValues.SQL_PATH, sql + "\r\n", true);
132+
}
133+
try {
134+
SpiderApplication.statement.executeUpdate(sql);
135+
} catch (SQLException e) {
136+
System.out.println(e.getMessage());
137+
}
138+
}
139+
}
140+
}
119141

120-
/**
121-
* 从源代码提取可下载的资源
122-
*
123-
* @param path
124-
* {@link String}
125-
* @param matcher
126-
* {@link Matcher}
127-
*/
128-
public void addURLs(String path, Matcher matcher) {
129-
path = App.DOWNLOAD_FOLDER + Values.SEPARATOR + path;
130-
while (matcher.find()) {
131-
String url = matcher.group();
132-
download(path, url);
133-
}
134-
}
142+
/**
143+
* 从源代码提取可下载的资源
144+
*
145+
* @param path {@link String}
146+
* @param matcher {@link Matcher}
147+
*/
148+
public void addURLs(String path, Matcher matcher) {
149+
path = SpiderApplication.DOWNLOAD_FOLDER + SpiderValueConsts.SEPARATOR + path;
150+
while (matcher.find()) {
151+
String url = matcher.group();
152+
download(path, url);
153+
}
154+
}
135155

136-
/**
137-
* 下载资源文件,并将链接添加到下载记录中,下载过滤将这里进行匹配
138-
*
139-
* @param path
140-
* {@link String}
141-
* @param url
142-
* {@link URL}
143-
*/
144-
public void download(String path, String url) {
145-
String realUrl = url.split("\\?")[0];
146-
if (App.downloadFilterPattern.matcher(url).find() && !App.downloadUrls.contains(realUrl)) {
147-
App.downloadUrls.add(realUrl);
148-
Platform.runLater(() -> App.mainController.logOut.appendText(Values.DOWNLOADING_TIP + url + "\r\n"));
149-
path += Values.SEPARATOR + url.substring(url.lastIndexOf(".") + 1);
150-
if (path.contains(Values.QUESTION_MARK)) {
151-
path = path.substring(0, path.indexOf(Values.QUESTION_MARK));
152-
}
153-
Downloader.download(path, (url.startsWith("//") ? "http:" : "") + url);
154-
try {
155-
Thread.sleep(App.crawlingDelay);
156-
} catch (InterruptedException e) {
157-
logger.error("thread sleep error: " + e.getMessage());
158-
}
159-
}
160-
}
156+
/**
157+
* 下载资源文件,并将链接添加到下载记录中,下载过滤将这里进行匹配
158+
*
159+
* @param path {@link String}
160+
* @param url {@link URL}
161+
*/
162+
public void download(String path, String url) {
163+
String realUrl = url.split("\\?")[0];
164+
if (SpiderApplication.downloadFilterPattern.matcher(url).find() && !SpiderApplication.downloadUrls.contains
165+
(realUrl)) {
166+
SpiderApplication.downloadUrls.add(realUrl);
167+
Platform.runLater(() -> SpiderApplication.mainController.logOut.appendText(SpiderValueConsts
168+
.DOWNLOADING_TIP + url + "\r\n"));
169+
path += SpiderValueConsts.SEPARATOR + url.substring(url.lastIndexOf(".") + 1);
170+
if (path.contains(SpiderValueConsts.QUESTION_MARK)) {
171+
path = path.substring(0, path.indexOf(SpiderValueConsts.QUESTION_MARK));
172+
}
173+
Downloader.download(path, (url.startsWith("//") ? "http:" : "") + url);
174+
try {
175+
Thread.sleep(SpiderApplication.crawlingDelay);
176+
} catch (InterruptedException e) {
177+
logger.error("thread sleep error: " + e.getMessage());
178+
}
179+
}
180+
}
161181
}

0 commit comments

Comments (0)