Skip to content
This repository was archived by the owner on Mar 6, 2024. It is now read-only.

Commit e261e31

Browse files
committed
重构代码
1 parent af6fc6e commit e261e31

14 files changed

Lines changed: 399 additions & 500 deletions

File tree

pom.xml

Lines changed: 9 additions & 71 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44

55
<groupId>com.zhazhapan</groupId>
66
<artifactId>visual-spider</artifactId>
7-
<version>1.1</version>
7+
<version>1.0</version>
88
<build>
99
<plugins>
1010
<plugin>
@@ -41,89 +41,27 @@
4141
<groupId>edu.uci.ics</groupId>
4242
<artifactId>crawler4j</artifactId>
4343
<version>4.3</version>
44-
<exclusions>
45-
<exclusion>
46-
<artifactId>gson</artifactId>
47-
<groupId>com.google.code.gson</groupId>
48-
</exclusion>
49-
<exclusion>
50-
<artifactId>guava</artifactId>
51-
<groupId>com.google.guava</groupId>
52-
</exclusion>
53-
<exclusion>
54-
<artifactId>jackcess</artifactId>
55-
<groupId>com.healthmarketscience.jackcess</groupId>
56-
</exclusion>
57-
<exclusion>
58-
<artifactId>commons-codec</artifactId>
59-
<groupId>commons-codec</groupId>
60-
</exclusion>
61-
<exclusion>
62-
<artifactId>commons-io</artifactId>
63-
<groupId>commons-io</groupId>
64-
</exclusion>
65-
<exclusion>
66-
<artifactId>commons-logging</artifactId>
67-
<groupId>commons-logging</groupId>
68-
</exclusion>
69-
<exclusion>
70-
<artifactId>slf4j-api</artifactId>
71-
<groupId>org.slf4j</groupId>
72-
</exclusion>
73-
<exclusion>
74-
<artifactId>tika-core</artifactId>
75-
<groupId>org.apache.tika</groupId>
76-
</exclusion>
77-
<exclusion>
78-
<artifactId>poi-ooxml</artifactId>
79-
<groupId>org.apache.poi</groupId>
80-
</exclusion>
81-
<exclusion>
82-
<artifactId>poi</artifactId>
83-
<groupId>org.apache.poi</groupId>
84-
</exclusion>
85-
<exclusion>
86-
<artifactId>httpcore</artifactId>
87-
<groupId>org.apache.httpcomponents</groupId>
88-
</exclusion>
89-
<exclusion>
90-
<artifactId>httpclient</artifactId>
91-
<groupId>org.apache.httpcomponents</groupId>
92-
</exclusion>
93-
</exclusions>
9444
</dependency>
9545
<dependency>
9646
<groupId>com.zhazhapan</groupId>
9747
<artifactId>util</artifactId>
9848
<version>1.0.6</version>
9949
<exclusions>
10050
<exclusion>
101-
<artifactId>jackson-core</artifactId>
102-
<groupId>com.fasterxml.jackson.core</groupId>
103-
</exclusion>
104-
<exclusion>
105-
<artifactId>guava</artifactId>
106-
<groupId>com.google.guava</groupId>
107-
</exclusion>
108-
<exclusion>
109-
<artifactId>log4j</artifactId>
110-
<groupId>log4j</groupId>
111-
</exclusion>
112-
<exclusion>
113-
<artifactId>commons-lang3</artifactId>
114-
<groupId>org.apache.commons</groupId>
51+
<artifactId>commons-io</artifactId>
52+
<groupId>commons-io</groupId>
11553
</exclusion>
11654
<exclusion>
117-
<artifactId>slf4j-api</artifactId>
118-
<groupId>org.slf4j</groupId>
55+
<artifactId>cglib</artifactId>
56+
<groupId>cglib</groupId>
11957
</exclusion>
12058
<exclusion>
121-
<artifactId>asm</artifactId>
122-
<groupId>org.ow2.asm</groupId>
59+
<artifactId>docx4j-ImportXHTML</artifactId>
60+
<groupId>org.docx4j</groupId>
12361
</exclusion>
12462
<exclusion>
125-
<artifactId>httpclient</artifactId>
126-
<groupId>org.apache.httpcomponents</groupId>
63+
<artifactId>poi</artifactId>
64+
<groupId>org.apache.poi</groupId>
12765
</exclusion>
12866
</exclusions>
12967
</dependency>
src/main/java/com/zhazhapan/vspider/App.java

Lines changed: 88 additions & 102 deletions
Original file line number | Diff line number | Diff line change
@@ -1,123 +1,109 @@
11
package com.zhazhapan.vspider;
22

3-
import java.util.ArrayList;
4-
import java.util.Date;
5-
import java.util.concurrent.TimeUnit;
6-
import java.util.regex.Pattern;
7-
8-
import org.apache.log4j.Logger;
9-
103
import com.zhazhapan.util.Formatter;
114
import com.zhazhapan.util.ThreadPool;
125
import com.zhazhapan.vspider.controller.MainController;
136
import com.zhazhapan.vspider.modules.constant.DefaultConfigValues;
147
import com.zhazhapan.vspider.modules.constant.Values;
15-
168
import javafx.application.Application;
179
import javafx.fxml.FXMLLoader;
1810
import javafx.scene.Scene;
1911
import javafx.scene.image.Image;
2012
import javafx.scene.layout.BorderPane;
2113
import javafx.stage.Stage;
2214
import javafx.stage.WindowEvent;
15+
import org.apache.log4j.Logger;
16+
17+
import java.util.ArrayList;
18+
import java.util.Date;
19+
import java.util.concurrent.TimeUnit;
20+
import java.util.regex.Pattern;
2321

2422
/**
2523
* @author pantao
26-
*
2724
*/
2825
public class App extends Application {
2926

30-
private static Logger logger = Logger.getLogger(App.class);
31-
32-
/**
33-
* 界面控制器
34-
*/
35-
public static MainController mainController = null;
36-
37-
/**
38-
* 爬虫控制器
39-
*/
40-
public static VsController controller = new VsController();
41-
42-
/**
43-
* 待爬取的URLs
44-
*/
45-
public static String[] domains;
46-
47-
/**
48-
* 记录访问过的URLs
49-
*/
50-
public static ArrayList<String> visitUrls = new ArrayList<String>();
51-
52-
/**
53-
* 记录下载过的URLs
54-
*/
55-
public static ArrayList<String> downloadUrls = new ArrayList<String>();
56-
57-
/**
58-
* 爬取延迟
59-
*/
60-
public static int crawlingDelay = DefaultConfigValues.POLITENESS_DELAY;
61-
62-
/**
63-
* 爬虫匹配(不匹配的链接将不会爬取,匹配的链接会进入访问状态)
64-
*/
65-
public static Pattern crawlFilterPattern = Pattern.compile(".*");
66-
67-
/**
68-
* 访问匹配(不匹配的链接将不会访问,匹配的链接会将服务器返回的源代码传送到下载模式)
69-
*/
70-
public static Pattern visitFilterPattern = Pattern.compile(".*");
71-
72-
/**
73-
* 下载匹配(从网页源代码获取可以下载的资源,资源链接不匹配的将不会下载)
74-
*/
75-
public static Pattern downloadFilterPattern = Pattern.compile(".*");
76-
77-
/**
78-
* 下载的存储目录
79-
*/
80-
public static String DOWNLOAD_FOLDER = DefaultConfigValues.CRAWL_STORAGE_FOLDER + Values.SEPARATOR + "files"
81-
+ Values.SEPARATOR + Formatter.datetimeToCustomString(new Date(), "yyyyMMdd");
82-
83-
/**
84-
* 主程序入口
85-
*
86-
* @param args
87-
* {@link String}
88-
*/
89-
public static void main(String[] args) {
90-
logger.info("start to run app");
91-
initThreadPool();
92-
// 启动JavaFX,会调用start方法
93-
launch(args);
94-
}
95-
96-
@Override
97-
public void start(Stage stage) throws Exception {
98-
try {
99-
BorderPane root = (BorderPane) FXMLLoader.load(getClass().getResource("view/MainWindow.fxml"));
100-
stage.setScene(new Scene(root));
101-
} catch (Exception e) {
102-
logger.error("load fxml error: " + e.getMessage());
103-
}
104-
stage.setTitle(Values.MAIN_TITLE);
105-
stage.getIcons().add(new Image(getClass().getResourceAsStream("view/spider.jpg")));
106-
stage.show();
107-
stage.setOnCloseRequest((WindowEvent event) -> {
108-
stage.setIconified(true);
109-
event.consume();
110-
});
111-
}
112-
113-
/**
114-
* 初始化线程池
115-
*/
116-
public static void initThreadPool() {
117-
ThreadPool.setCorePoolSize(1);
118-
ThreadPool.setMaximumPoolSize(5);
119-
ThreadPool.setKeepAliveTime(100);
120-
ThreadPool.setTimeUnit(TimeUnit.MILLISECONDS);
121-
ThreadPool.init();
122-
}
27+
/**
28+
* 界面控制器
29+
*/
30+
public static MainController mainController = null;
31+
/**
32+
* 爬虫控制器
33+
*/
34+
public static VsController controller = new VsController();
35+
/**
36+
* 待爬取的URLs
37+
*/
38+
public static String[] domains;
39+
/**
40+
* 记录访问过的URLs
41+
*/
42+
public static ArrayList<String> visitUrls = new ArrayList<>();
43+
/**
44+
* 记录下载过的URLs
45+
*/
46+
public static ArrayList<String> downloadUrls = new ArrayList<>();
47+
/**
48+
* 爬取延迟
49+
*/
50+
public static int crawlingDelay = DefaultConfigValues.POLITENESS_DELAY;
51+
/**
52+
* 爬虫匹配(不匹配的链接将不会爬取,匹配的链接会进入访问状态)
53+
*/
54+
public static Pattern crawlFilterPattern = Pattern.compile(".*");
55+
/**
56+
* 访问匹配(不匹配的链接将不会访问,匹配的链接会将服务器返回的源代码传送到下载模式)
57+
*/
58+
public static Pattern visitFilterPattern = Pattern.compile(".*");
59+
/**
60+
* 下载匹配(从网页源代码获取可以下载的资源,资源链接不匹配的将不会下载)
61+
*/
62+
public static Pattern downloadFilterPattern = Pattern.compile(".*");
63+
/**
64+
* 下载的存储目录
65+
*/
66+
public static String DOWNLOAD_FOLDER = DefaultConfigValues.CRAWL_STORAGE_FOLDER + Values.SEPARATOR + "files" +
67+
Values.SEPARATOR + Formatter.datetimeToCustomString(new Date(), "yyyyMMdd");
68+
private static Logger logger = Logger.getLogger(App.class);
69+
70+
/**
71+
* 主程序入口
72+
*
73+
* @param args {@link String}
74+
*/
75+
public static void main(String[] args) {
76+
logger.info("start to run app");
77+
initThreadPool();
78+
// 启动JavaFX,会调用start方法
79+
launch(args);
80+
}
81+
82+
/**
83+
* 初始化线程池
84+
*/
85+
public static void initThreadPool() {
86+
ThreadPool.setCorePoolSize(1);
87+
ThreadPool.setMaximumPoolSize(5);
88+
ThreadPool.setKeepAliveTime(100);
89+
ThreadPool.setTimeUnit(TimeUnit.MILLISECONDS);
90+
ThreadPool.init();
91+
}
92+
93+
@Override
94+
public void start(Stage stage) {
95+
try {
96+
BorderPane root = FXMLLoader.load(getClass().getResource("/view/MainWindow.fxml"));
97+
stage.setScene(new Scene(root));
98+
} catch (Exception e) {
99+
logger.error("load fxml error: " + e.getMessage());
100+
}
101+
stage.setTitle(Values.MAIN_TITLE);
102+
stage.getIcons().add(new Image(getClass().getResourceAsStream("/image/spider.jpg")));
103+
stage.show();
104+
stage.setOnCloseRequest((WindowEvent event) -> {
105+
stage.setIconified(true);
106+
event.consume();
107+
});
108+
}
123109
}

src/main/java/com/zhazhapan/vspider/Crawler.java

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,3 @@
1-
/**
2-
*
3-
*/
41
package com.zhazhapan.vspider;
52

63
import java.net.URL;
src/main/java/com/zhazhapan/vspider/SpiderUtils.java

Lines changed: 33 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,33 @@
1+
package com.zhazhapan.vspider;
2+
3+
import com.zhazhapan.util.FileExecutor;
4+
import com.zhazhapan.util.Utils;
5+
import com.zhazhapan.util.dialog.Alerts;
6+
import com.zhazhapan.vspider.modules.constant.Values;
7+
8+
import java.io.IOException;
9+
10+
/**
11+
* @author pantao
12+
* @since 2018/4/14
13+
*/
14+
public class SpiderUtils {
15+
16+
private SpiderUtils() {}
17+
18+
public static void saveFile(String file, String content, boolean append) {
19+
try {
20+
FileExecutor.saveFile(file, content, append);
21+
} catch (IOException e) {
22+
Alerts.showError(Values.MAIN_TITLE, e.getMessage());
23+
}
24+
}
25+
26+
public static void openFile(String file) {
27+
try {
28+
Utils.openFile(file);
29+
} catch (IOException e) {
30+
Alerts.showError(Values.MAIN_TITLE, e.getMessage());
31+
}
32+
}
33+
}

0 commit comments

Comments
 (0)