Skip to content
This repository was archived by the owner on Mar 6, 2024. It is now read-only.

Commit af6fc6e

Browse files
committed
添加流程图
1 parent 5642526 commit af6fc6e

8 files changed

Lines changed: 491 additions & 436 deletions

File tree

README.md

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,32 @@
1-
# visual-spider
2-
一个简单、图形化的基于crawler4j的java爬虫
3-
4-
**你可以随时暂停爬虫(也可能是意外退出),程序会保存进度,爬虫会从上次结束的位置继续爬取**
51

62
QQ交流群(573484012)
73

4+
#### 图片爬取
5+
6+
目前支持的图片格式有 bmp,gif,jpeg,png,tiff,pcx,tga,svg,pic
7+
8+
#### 媒体爬取
9+
10+
目前支持的媒体格式有 avi,mov,swf,asf,navi,wmv,3gp,mkv,flv,rmvb,webm,mpg,mp4,qsv,mpeg,mp3,aac,ogg,wav,flac,ape,wma,aif,au,ram,mmf,amr
11+
12+
#### 链接爬取
13+
14+
其实就是下载HTML源代码
15+
16+
#### 文档爬取
17+
18+
目前支持的文档格式有 pdf,docx,txt,log,conf,java,xml,json,css,js,html,hml,php,wps,rtf
19+
20+
#### 其他文件爬取
21+
22+
目前支持的文件格式有 zip,exe,dmg,iso,jar,msi,rar,tmp,xlsx,mdf,com,casm,for,lib,lst,msg,obj,pas,wki,bas,map,bak,dot,bat,sh,rpm
23+
24+
#### 爬虫工作流程
25+
26+
![工作流程](workflow.png)
27+
28+
#### 运行截图
29+
830
![截图](http://oq3iwfipo.bkt.clouddn.com/tutorial/vspider/visualspider.png)
931

1032
[点我下载](http://oq3iwfipo.bkt.clouddn.com/tools/zhazhapan/VisualSpider.jar)

pom.xml

Lines changed: 125 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,51 +1,131 @@
11
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
2-
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3-
<modelVersion>4.0.0</modelVersion>
2+
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3+
<modelVersion>4.0.0</modelVersion>
44

5-
<groupId>com.zhazhapan</groupId>
6-
<artifactId>visual-spider</artifactId>
7-
<version>0.0.1-SNAPSHOT</version>
8-
<build>
9-
<plugins>
10-
<plugin>
11-
<groupId>org.apache.maven.plugins</groupId>
12-
<artifactId>maven-compiler-plugin</artifactId>
13-
<configuration>
14-
<source>1.8</source>
15-
<target>1.8</target>
16-
</configuration>
17-
</plugin>
18-
</plugins>
19-
</build>
20-
<packaging>jar</packaging>
5+
<groupId>com.zhazhapan</groupId>
6+
<artifactId>visual-spider</artifactId>
7+
<version>1.1</version>
8+
<build>
9+
<plugins>
10+
<plugin>
11+
<groupId>org.apache.maven.plugins</groupId>
12+
<artifactId>maven-compiler-plugin</artifactId>
13+
<configuration>
14+
<source>1.8</source>
15+
<target>1.8</target>
16+
</configuration>
17+
</plugin>
18+
</plugins>
19+
</build>
20+
<packaging>jar</packaging>
2121

22-
<name>visual-spider</name>
23-
<url>http://maven.apache.org</url>
22+
<name>visual-spider</name>
23+
<url>http://maven.apache.org</url>
2424

25-
<properties>
26-
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
27-
</properties>
25+
<properties>
26+
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
27+
</properties>
2828

29-
<dependencies>
30-
<dependency>
31-
<groupId>org.junit.jupiter</groupId>
32-
<artifactId>junit-jupiter-migration-support</artifactId>
33-
<version>5.0.0-M4</version>
34-
</dependency>
35-
<dependency>
36-
<groupId>log4j</groupId>
37-
<artifactId>log4j</artifactId>
38-
<version>1.2.16</version>
39-
</dependency>
40-
<dependency>
41-
<groupId>edu.uci.ics</groupId>
42-
<artifactId>crawler4j</artifactId>
43-
<version>4.3</version>
44-
</dependency>
45-
<dependency>
46-
<groupId>com.zhazhapan</groupId>
47-
<artifactId>util</artifactId>
48-
<version>1.0.0</version>
49-
</dependency>
50-
</dependencies>
29+
<dependencies>
30+
<dependency>
31+
<groupId>org.junit.jupiter</groupId>
32+
<artifactId>junit-jupiter-migration-support</artifactId>
33+
<version>5.0.0-M4</version>
34+
</dependency>
35+
<dependency>
36+
<groupId>log4j</groupId>
37+
<artifactId>log4j</artifactId>
38+
<version>1.2.16</version>
39+
</dependency>
40+
<dependency>
41+
<groupId>edu.uci.ics</groupId>
42+
<artifactId>crawler4j</artifactId>
43+
<version>4.3</version>
44+
<exclusions>
45+
<exclusion>
46+
<artifactId>gson</artifactId>
47+
<groupId>com.google.code.gson</groupId>
48+
</exclusion>
49+
<exclusion>
50+
<artifactId>guava</artifactId>
51+
<groupId>com.google.guava</groupId>
52+
</exclusion>
53+
<exclusion>
54+
<artifactId>jackcess</artifactId>
55+
<groupId>com.healthmarketscience.jackcess</groupId>
56+
</exclusion>
57+
<exclusion>
58+
<artifactId>commons-codec</artifactId>
59+
<groupId>commons-codec</groupId>
60+
</exclusion>
61+
<exclusion>
62+
<artifactId>commons-io</artifactId>
63+
<groupId>commons-io</groupId>
64+
</exclusion>
65+
<exclusion>
66+
<artifactId>commons-logging</artifactId>
67+
<groupId>commons-logging</groupId>
68+
</exclusion>
69+
<exclusion>
70+
<artifactId>slf4j-api</artifactId>
71+
<groupId>org.slf4j</groupId>
72+
</exclusion>
73+
<exclusion>
74+
<artifactId>tika-core</artifactId>
75+
<groupId>org.apache.tika</groupId>
76+
</exclusion>
77+
<exclusion>
78+
<artifactId>poi-ooxml</artifactId>
79+
<groupId>org.apache.poi</groupId>
80+
</exclusion>
81+
<exclusion>
82+
<artifactId>poi</artifactId>
83+
<groupId>org.apache.poi</groupId>
84+
</exclusion>
85+
<exclusion>
86+
<artifactId>httpcore</artifactId>
87+
<groupId>org.apache.httpcomponents</groupId>
88+
</exclusion>
89+
<exclusion>
90+
<artifactId>httpclient</artifactId>
91+
<groupId>org.apache.httpcomponents</groupId>
92+
</exclusion>
93+
</exclusions>
94+
</dependency>
95+
<dependency>
96+
<groupId>com.zhazhapan</groupId>
97+
<artifactId>util</artifactId>
98+
<version>1.0.6</version>
99+
<exclusions>
100+
<exclusion>
101+
<artifactId>jackson-core</artifactId>
102+
<groupId>com.fasterxml.jackson.core</groupId>
103+
</exclusion>
104+
<exclusion>
105+
<artifactId>guava</artifactId>
106+
<groupId>com.google.guava</groupId>
107+
</exclusion>
108+
<exclusion>
109+
<artifactId>log4j</artifactId>
110+
<groupId>log4j</groupId>
111+
</exclusion>
112+
<exclusion>
113+
<artifactId>commons-lang3</artifactId>
114+
<groupId>org.apache.commons</groupId>
115+
</exclusion>
116+
<exclusion>
117+
<artifactId>slf4j-api</artifactId>
118+
<groupId>org.slf4j</groupId>
119+
</exclusion>
120+
<exclusion>
121+
<artifactId>asm</artifactId>
122+
<groupId>org.ow2.asm</groupId>
123+
</exclusion>
124+
<exclusion>
125+
<artifactId>httpclient</artifactId>
126+
<groupId>org.apache.httpcomponents</groupId>
127+
</exclusion>
128+
</exclusions>
129+
</dependency>
130+
</dependencies>
51131
</project>

src/main/java/com/zhazhapan/vspider/Crawler.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ public void visit(Page page) {
8181
if (App.visitFilterPattern.matcher(url).find() && page.getParseData() instanceof HtmlParseData) {
8282
HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
8383
Platform.runLater(() -> {
84-
App.mainController.stautsLabel.setText("validating url: " + url);
84+
App.mainController.statusLabel.setText("validating url: " + url);
8585
App.mainController.htmlContent.appendText(Values.VISITING_TIP + url + "\r\n");
8686
});
8787
downloadURL(url, htmlParseData.getHtml());

0 commit comments

Comments
 (0)