
A Web Crawler in Java

Overview

  • I previously wrote a crawler example in Go; this time the same exercise is done in Java.
  • The main prerequisite is knowing how to issue HTTP requests; a minimal sketch of such a request follows below.
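For reference, the bare bones of a GET request with HttpURLConnection look like this (a minimal sketch; http://example.com is a placeholder URL):

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

public class MinimalGet {
    public static void main(String[] args) throws Exception {
        // Placeholder URL; substitute the page you actually want to fetch
        URL url = new URL("http://example.com");
        HttpURLConnection con = (HttpURLConnection) url.openConnection();
        con.setRequestMethod("GET");
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(con.getInputStream(), "UTF-8"))) {
            String line;
            while ((line = in.readLine()) != null) {
                System.out.println(line);
            }
        }
    }
}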

Code

import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class Craw {
    public static void main(String[] args) throws Exception {
        for (int i = 2; i < 3; i++) {
            String url = "http://www.52desktop.cn/html/DLZM/KPBZ/20191205/15898_" + Integer.toString(i) + ".html";
            String pageStr = String.valueOf(GetPageStr(url));
            // Match image URLs ending in a common image extension
            String regImg = "http://i.52desktop.cn:81/upimg/allimg/[^\\\"]+?(\\.(jpg|png|jpeg|gif|bmp))";
            Pattern pattern = Pattern.compile(regImg);
            Matcher matcher = pattern.matcher(pageStr);
            if (matcher.find()) {
                System.out.println(matcher.groupCount());
                System.out.println(matcher.group(0));
                System.out.println(matcher.group(1));
                if (matcher.group(0).contains("20191204")) {
                    downloadImg("d:\\img\\", matcher.group(0));
                }
            }
        }
    }

    private static StringBuffer GetPageStr(String url) throws Exception {
        URL obj = new URL(url);
        HttpURLConnection con = (HttpURLConnection) obj.openConnection();
        // Send a GET request
        con.setRequestMethod("GET");
        // con.getInputStream() is the stream of data the server sends back.
        // InputStreamReader converts the byte stream to characters; BufferedReader
        // wraps it to add buffering for efficient reads.
        BufferedReader in = new BufferedReader(new InputStreamReader(con.getInputStream(), "UTF-8"));
        String inputLine;
        StringBuffer response = new StringBuffer();
        // Call readLine() repeatedly, appending each line to the StringBuffer.
        // readLine() returns null once all data has been read, ending the loop.
        while ((inputLine = in.readLine()) != null) {
            response.append(inputLine);
        }
        in.close();

        // Print the server's response
        System.out.println("GET Response: " + response.toString());
        return response;
    }

    private static void downloadImg(String savePath, String url) throws IOException {
        URL imageUrl = new URL(url);
        HttpURLConnection con = (HttpURLConnection) imageUrl.openConnection();
        // Send a GET request
        con.setRequestMethod("GET");
        try (InputStream inputStream = con.getInputStream()) {
            // Derive the file name from the last path segment of the URL
            String fileName = url.substring(url.lastIndexOf("/") + 1);
            // Build the full path the image will be saved under
            Path filePath = Paths.get(savePath, fileName);
            // Copy the contents of the input stream to the target file
            Files.copy(inputStream, filePath, StandardCopyOption.REPLACE_EXISTING);
            System.out.println(fileName + " download is successful");
        }
    }
}

The code above does run, but the target site has since become largely unreachable.
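Note that the single matcher.find() above only handles the first image URL on each page. If a page contained several, a small variation (a sketch, reusing pattern and pageStr from the code above) walks through every match instead:

// Iterate over all matches on the page rather than just the first one
Matcher matcher = pattern.matcher(pageStr);
while (matcher.find()) {
    String imgUrl = matcher.group(0); // the full matched image URL
    System.out.println(imgUrl);
    // downloadImg("d:\\img\\", imgUrl); // download each match as before
}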

  • An example that crawls a different target site
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;

public class Craw {
    public static void main(String[] args) throws Exception {
        String url = "https://sj.zol.com.cn/bizhi/detail_12078_130945.html";
        // Fetch the raw HTML of the page
        String pageStr = String.valueOf(GetPageStr(url));
        // Parse it into a DOM with Jsoup so it can be filtered
        Document document = Jsoup.parse(pageStr);
        // CSS selector syntax
        Elements elements = document.select("ul#showImg li");
        // Loop over the image elements
        for (Element li : elements) {
            Element img = li.selectFirst("img");
            String imgUrl;
            if (img.attr("src").length() > 0) {
                imgUrl = img.attr("src");
            } else {
                // Fall back to the "srcs" attribute this page uses for lazy-loaded images
                imgUrl = img.attr("srcs");
            }
            System.out.println(imgUrl);
            downloadImg("d:\\img\\", imgUrl);
        }
    }

    private static StringBuffer GetPageStr(String url) throws Exception {
        URL obj = new URL(url);
        HttpURLConnection con = (HttpURLConnection) obj.openConnection();
        // Send a GET request
        con.setRequestMethod("GET");
        // con.getInputStream() is the stream of data the server sends back.
        // InputStreamReader converts the byte stream to characters; BufferedReader
        // wraps it to add buffering for efficient reads.
        BufferedReader in = new BufferedReader(new InputStreamReader(con.getInputStream(), "UTF-8"));
        String inputLine;
        StringBuffer response = new StringBuffer();
        // Call readLine() repeatedly, appending each line to the StringBuffer.
        // readLine() returns null once all data has been read, ending the loop.
        while ((inputLine = in.readLine()) != null) {
            response.append(inputLine);
        }
        in.close();

        // Print the server's response
        System.out.println("GET Response: " + response.toString());
        return response;
    }

    private static void downloadImg(String savePath, String url) throws IOException {
        URL imageUrl = new URL(url);
        HttpURLConnection con = (HttpURLConnection) imageUrl.openConnection();
        // Send a GET request
        con.setRequestMethod("GET");
        try (InputStream inputStream = con.getInputStream()) {
            // Naming by URL segment: String fileName = url.substring(url.lastIndexOf("/") + 1);
            // Instead, generate a unique file name from the current timestamp
            LocalDateTime now = LocalDateTime.now();
            DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyyMMddHHmmssSSS");
            String fileName = now.format(formatter) + ".jpg";
            // Build the full path the image will be saved under
            Path filePath = Paths.get(savePath, fileName);
            // Copy the contents of the input stream to the target file
            Files.copy(inputStream, filePath, StandardCopyOption.REPLACE_EXISTING);
            System.out.println(fileName + " download is successful");
        }
    }
}

This time the image URLs are extracted from the target site with jsoup, then downloaded to the local disk over HTTP.

The jsoup dependency has to be added to pom.xml:

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.14.3</version>
</dependency>
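As an aside, jsoup can also perform the HTTP request itself, which would make the hand-written GetPageStr unnecessary. A minimal sketch (the user-agent string is an arbitrary choice):

// Fetch and parse the page in one step with jsoup
Document document = Jsoup.connect("https://sj.zol.com.cn/bizhi/detail_12078_130945.html")
        .userAgent("Mozilla/5.0") // some sites reject requests without a browser-like UA
        .get();
Elements elements = document.select("ul#showImg li");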
  • The code below improves on this again by downloading with multiple threads
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.file.*;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

public class ThreadCraw {
    public static void main(String[] args) throws Exception {
        String url = "https://sj.zol.com.cn/bizhi/detail_12078_130945.html";
        // Fetch the raw HTML of the page
        String pageStr = String.valueOf(GetPageStr(url));
        // Parse it into a DOM with Jsoup so it can be filtered
        Document document = Jsoup.parse(pageStr);
        // CSS selector syntax
        Elements elements = document.select("ul#showImg li");
        // Create a thread pool, one thread per image
        ExecutorService executor = Executors.newFixedThreadPool(elements.size());
        // Loop over the image elements
        for (Element li : elements) {
            Element img = li.selectFirst("img");
            String imgUrl;
            if (img.attr("src").length() > 0) {
                imgUrl = img.attr("src");
            } else {
                imgUrl = img.attr("srcs");
            }
            // downloadImg("d:\\img\\", imgUrl);
            // Create the download task and hand it to the pool
            ImageDownloader downloader = new ImageDownloader("d:\\img\\", imgUrl);
            executor.execute(downloader);
        }
        // Shut down the pool once all tasks have been submitted
        executor.shutdown();
    }

    private static StringBuffer GetPageStr(String url) throws Exception {
        URL obj = new URL(url);
        HttpURLConnection con = (HttpURLConnection) obj.openConnection();
        // Send a GET request
        con.setRequestMethod("GET");
        // con.getInputStream() is the stream of data the server sends back.
        // InputStreamReader converts the byte stream to characters; BufferedReader
        // wraps it to add buffering for efficient reads.
        BufferedReader in = new BufferedReader(new InputStreamReader(con.getInputStream(), "UTF-8"));
        String inputLine;
        StringBuffer response = new StringBuffer();
        // Call readLine() repeatedly, appending each line to the StringBuffer.
        // readLine() returns null once all data has been read, ending the loop.
        while ((inputLine = in.readLine()) != null) {
            response.append(inputLine);
        }
        in.close();

        // Print the server's response
        System.out.println("GET Response: " + response.toString());
        return response;
    }

    private static void downloadImg(String savePath, String url) throws IOException {
        URL imageUrl = new URL(url);
        HttpURLConnection con = (HttpURLConnection) imageUrl.openConnection();
        // Send a GET request
        con.setRequestMethod("GET");
        try (InputStream inputStream = con.getInputStream()) {
            // Naming by URL segment: String fileName = url.substring(url.lastIndexOf("/") + 1);
            // Instead, generate a unique file name from the current timestamp
            LocalDateTime now = LocalDateTime.now();
            DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyyMMddHHmmssSSS");
            String fileName = now.format(formatter) + ".jpg";
            // Build the full path the image will be saved under
            Path filePath = Paths.get(savePath, fileName);
            try {
                // Copy the contents of the input stream to the target file
                Files.copy(inputStream, filePath, StandardCopyOption.REPLACE_EXISTING);
                System.out.println(fileName + " download is successful");
            } catch (FileAlreadyExistsException e) {
                // Ignore a file that already exists
            }
        }
    }

    // A download task; Thread implements Runnable, so the pool can schedule it
    static class ImageDownloader extends Thread {

        private String savePath;
        private String imgUrl;

        public ImageDownloader(String savePath, String imgUrl) {
            this.savePath = savePath;
            this.imgUrl = imgUrl;
        }

        @Override
        public void run() {
            try {
                downloadImg(savePath, imgUrl);
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
    }
}
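One caveat with the timestamp file names: two pool threads can call LocalDateTime.now() within the same millisecond, in which case REPLACE_EXISTING silently overwrites the first download. A sketch of a collision-proof alternative using java.util.UUID (not in the original code):

// UUID-based names are unique across threads, so downloads cannot clobber each other
String fileName = java.util.UUID.randomUUID().toString() + ".jpg";
Path filePath = Paths.get(savePath, fileName);
Files.copy(inputStream, filePath, StandardCopyOption.REPLACE_EXISTING);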

Key code

  • Create the thread pool: ExecutorService executor = Executors.newFixedThreadPool(elements.size()); (a note on pool sizing follows the snippet below)

  • Finally, hand the task to the pool for execution:

ImageDownloader downloader = new ImageDownloader("d:\\img\\", imgUrl);
executor.execute(downloader);
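Note that newFixedThreadPool(elements.size()) spawns one thread per image, which is fine for a page of wallpapers but does not scale to larger crawls. A bounded pool (a sketch; the size 4 is an arbitrary choice) keeps the thread count fixed while the remaining tasks queue up automatically:

// Cap the pool at a fixed size; submitted tasks beyond that simply wait in the queue
ExecutorService executor = Executors.newFixedThreadPool(4);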
  • A more elegant approach, though, is the following:
// This version additionally needs java.util.ArrayList, java.util.List,
// java.util.concurrent.Callable and java.util.concurrent.Future.
public static void main(String[] args) throws Exception {
    String url = "https://sj.zol.com.cn/bizhi/detail_12078_130945.html";
    // Fetch the raw HTML of the page
    String pageStr = String.valueOf(GetPageStr(url));
    // Parse it into a DOM with Jsoup so it can be filtered
    Document document = Jsoup.parse(pageStr);
    // CSS selector syntax
    Elements elements = document.select("ul#showImg li");

    // Create the thread pool
    ExecutorService executor = Executors.newFixedThreadPool(elements.size());
    List<Future<Void>> futures = new ArrayList<>();

    // Loop over the image elements
    for (Element li : elements) {
        Element img = li.selectFirst("img");
        String imgUrl;
        if (img.attr("src").length() > 0) {
            imgUrl = img.attr("src");
        } else {
            imgUrl = img.attr("srcs");
        }

        // Submit the download task to the pool
        Callable<Void> downloadTask = () -> {
            downloadImg("d:\\img\\", imgUrl);
            return null;
        };
        Future<Void> future = executor.submit(downloadTask);
        futures.add(future);
    }

    // Wait for all tasks to finish
    for (Future<Void> future : futures) {
        future.get();
    }

    // Shut down the pool
    executor.shutdown();
}
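The same wait-for-completion behavior can also be written with invokeAll, which takes the whole collection of Callables and blocks until every one has finished (a sketch built on the same loop as above):

// Collect the Callables first, then let invokeAll() block until all of them complete
List<Callable<Void>> tasks = new ArrayList<>();
for (Element li : elements) {
    Element img = li.selectFirst("img");
    String imgUrl = img.attr("src").length() > 0 ? img.attr("src") : img.attr("srcs");
    tasks.add(() -> {
        downloadImg("d:\\img\\", imgUrl);
        return null;
    });
}
executor.invokeAll(tasks); // returns only after every task has completed
executor.shutdown();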