说明
- 之前用 Go 写过一个爬虫实例,本次改用 Java 来实现
- 需要掌握 HTTP 请求的编写
代码
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70
| import java.io.*; import java.net.HttpURLConnection; import java.net.MalformedURLException; import java.net.ProtocolException; import java.net.URL; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.StandardCopyOption; import java.util.regex.Matcher; import java.util.regex.Pattern;
/**
 * Minimal regex-based image crawler.
 *
 * <p>Fetches a wallpaper detail page, scans the HTML for image URLs on the image host and
 * downloads every match whose URL contains the expected upload date.
 */
public class Craw {

    /** Directory the downloaded images are written to. */
    private static final String SAVE_DIR = "d:\\img\\";

    /** Connect/read timeout so a dead host cannot block the crawler forever. */
    private static final int TIMEOUT_MILLIS = 15_000;

    /**
     * Image URL pattern: group 0 is the full URL, group 1 the file extension (with dot).
     * Compiled once instead of once per page.
     */
    private static final Pattern IMAGE_URL = Pattern.compile(
            "http://i.52desktop.cn:81/upimg/allimg/[^\\\"]+?(\\.(jpg|png|jpeg|gif|bmp))");

    /**
     * Crawls the configured page range and downloads the matching images.
     *
     * @param args unused
     * @throws Exception if a page or an image cannot be fetched or stored
     */
    public static void main(String[] args) throws Exception {
        for (int i = 2; i < 3; i++) {
            String url = "http://www.52desktop.cn/html/DLZM/KPBZ/20191205/15898_" + i + ".html";
            String pageStr = GetPageStr(url);
            Matcher matcher = IMAGE_URL.matcher(pageStr);
            // Walk every match: the first hit on the page is not necessarily the wanted image,
            // so checking only one match could skip the download entirely.
            while (matcher.find()) {
                String imageUrl = matcher.group(0);
                System.out.println(matcher.groupCount());
                System.out.println(imageUrl);
                System.out.println(matcher.group(1));
                if (imageUrl.contains("20191204")) {
                    downloadImg(SAVE_DIR, imageUrl);
                }
            }
        }
    }

    /**
     * Issues an HTTP GET and returns the response body decoded as UTF-8.
     *
     * @param url absolute URL of the page to fetch
     * @return the page content, one {@code '\n'} after every line read
     * @throws IOException if the URL is malformed or the request fails
     */
    private static String GetPageStr(String url) throws IOException {
        HttpURLConnection con = (HttpURLConnection) new URL(url).openConnection();
        con.setRequestMethod("GET");
        con.setConnectTimeout(TIMEOUT_MILLIS);
        con.setReadTimeout(TIMEOUT_MILLIS);

        StringBuilder response = new StringBuilder();
        // try-with-resources closes the reader even when reading fails half-way.
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(con.getInputStream(), "UTF-8"))) {
            String inputLine;
            while ((inputLine = in.readLine()) != null) {
                // Keep the line break: gluing lines together can merge adjacent tokens.
                response.append(inputLine).append('\n');
            }
        } finally {
            con.disconnect();
        }

        System.out.println("GET Response: " + response);
        return response.toString();
    }

    /**
     * Downloads one image and stores it under its original file name.
     *
     * @param savePath target directory; created if it does not exist yet
     * @param url      absolute URL of the image
     * @throws IOException if the download or the file write fails
     */
    private static void downloadImg(String savePath, String url) throws IOException {
        Path saveDir = Paths.get(savePath);
        // Files.copy does not create parent directories, so make sure the target exists.
        Files.createDirectories(saveDir);
        String fileName = url.substring(url.lastIndexOf("/") + 1);

        HttpURLConnection con = (HttpURLConnection) new URL(url).openConnection();
        con.setRequestMethod("GET");
        con.setConnectTimeout(TIMEOUT_MILLIS);
        con.setReadTimeout(TIMEOUT_MILLIS);
        try (InputStream inputStream = con.getInputStream()) {
            Files.copy(inputStream, saveDir.resolve(fileName), StandardCopyOption.REPLACE_EXISTING);
            System.out.println(fileName + " download is successful");
        } finally {
            con.disconnect();
        }
    }
}
|
以上代码确实可以运行,但目标网站现在基本上无法访问了,因此下面换一个目标网站重新实现。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81
| import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.time.LocalDateTime; import java.time.format.DateTimeFormatter; import java.io.*; import java.net.HttpURLConnection; import java.net.URL; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.StandardCopyOption;
/**
 * Sequential wallpaper crawler built on jsoup.
 *
 * <p>Fetches one detail page, reads the image URL of every entry in the {@code ul#showImg}
 * list and downloads the images one after another.
 */
public class Craw {

    /** Directory the downloaded images are written to. */
    private static final String SAVE_DIR = "d:\\img\\";

    /** Connect/read timeout so a dead host cannot block the crawler forever. */
    private static final int TIMEOUT_MILLIS = 15_000;

    /** Timestamp layout used for generated file names; the formatter is immutable and reusable. */
    private static final DateTimeFormatter FILE_NAME_FORMAT =
            DateTimeFormatter.ofPattern("yyyyMMddHHmmssSSS");

    /**
     * Crawls the configured page and downloads every listed image.
     *
     * @param args unused
     * @throws Exception if the page or an image cannot be fetched or stored
     */
    public static void main(String[] args) throws Exception {
        String url = "https://sj.zol.com.cn/bizhi/detail_12078_130945.html";
        String pageStr = GetPageStr(url);
        // Passing the page URL as base URI lets jsoup resolve relative and
        // protocol-relative ("//host/...") links into absolute ones.
        Document document = Jsoup.parse(pageStr, url);
        Elements elements = document.select("ul#showImg li");
        for (Element li : elements) {
            Element img = li.selectFirst("img");
            if (img == null) {
                // List entry without an image: nothing to download.
                continue;
            }
            // Prefer "src"; fall back to the alternative "srcs" attribute when it is empty.
            String imgUrl = img.absUrl("src");
            if (imgUrl.isEmpty()) {
                imgUrl = img.absUrl("srcs");
            }
            if (imgUrl.isEmpty()) {
                continue;
            }
            System.out.println(imgUrl);
            downloadImg(SAVE_DIR, imgUrl);
        }
    }

    /**
     * Issues an HTTP GET and returns the response body decoded as UTF-8.
     *
     * @param url absolute URL of the page to fetch
     * @return the page content, one {@code '\n'} after every line read
     * @throws IOException if the URL is malformed or the request fails
     */
    private static String GetPageStr(String url) throws IOException {
        HttpURLConnection con = (HttpURLConnection) new URL(url).openConnection();
        con.setRequestMethod("GET");
        con.setConnectTimeout(TIMEOUT_MILLIS);
        con.setReadTimeout(TIMEOUT_MILLIS);

        StringBuilder response = new StringBuilder();
        // try-with-resources closes the reader even when reading fails half-way.
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(con.getInputStream(), "UTF-8"))) {
            String inputLine;
            while ((inputLine = in.readLine()) != null) {
                // Keep the line break: a tag split across lines would otherwise be glued
                // together (e.g. "<img" + "src=...") and no longer parse correctly.
                response.append(inputLine).append('\n');
            }
        } finally {
            con.disconnect();
        }

        System.out.println("GET Response: " + response);
        return response.toString();
    }

    /**
     * Downloads one image and stores it under a timestamp-based {@code .jpg} file name.
     *
     * @param savePath target directory; created if it does not exist yet
     * @param url      absolute URL of the image
     * @throws IOException if the download or the file write fails
     */
    private static void downloadImg(String savePath, String url) throws IOException {
        Path saveDir = Paths.get(savePath);
        // Files.copy does not create parent directories, so make sure the target exists.
        Files.createDirectories(saveDir);

        HttpURLConnection con = (HttpURLConnection) new URL(url).openConnection();
        con.setRequestMethod("GET");
        con.setConnectTimeout(TIMEOUT_MILLIS);
        con.setReadTimeout(TIMEOUT_MILLIS);
        try (InputStream inputStream = con.getInputStream()) {
            String fileName = LocalDateTime.now().format(FILE_NAME_FORMAT) + ".jpg";
            Files.copy(inputStream, saveDir.resolve(fileName), StandardCopyOption.REPLACE_EXISTING);
            System.out.println(fileName + " download is successful");
        } finally {
            con.disconnect();
        }
    }
}
|
本次针对新的目标网站,先用 jsoup 解析页面并读取图片路径,再通过 HTTP 请求把图片下载到本地。
需要在 pom.xml 中加入如下依赖:
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.14.3</version>
</dependency>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114
| import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements;
import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.HttpURLConnection; import java.net.URL; import java.nio.file.*; import java.time.LocalDateTime; import java.time.format.DateTimeFormatter; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors;
/**
 * Multi-threaded wallpaper crawler built on jsoup.
 *
 * <p>Fetches one detail page, reads the image URL of every entry in the {@code ul#showImg}
 * list and downloads the images concurrently on a bounded thread pool.
 */
public class ThreadCraw {

    /** Directory the downloaded images are written to. */
    private static final String SAVE_DIR = "d:\\img\\";

    /** Upper bound for worker threads, so a long image list does not open one connection each. */
    private static final int MAX_THREADS = 8;

    /** Connect/read timeout so a dead host cannot block a worker forever. */
    private static final int TIMEOUT_MILLIS = 15_000;

    /** Timestamp layout used for generated file names; DateTimeFormatter is thread-safe. */
    private static final DateTimeFormatter FILE_NAME_FORMAT =
            DateTimeFormatter.ofPattern("yyyyMMddHHmmssSSS");

    /** Running number appended to file names; only touched inside {@link #nextFileName()}. */
    private static int fileSequence;

    /**
     * Crawls the configured page and queues one download task per image.
     *
     * <p>The pool is shut down after submission; already queued downloads still run to
     * completion on the pool's worker threads.
     *
     * @param args unused
     * @throws Exception if the page cannot be fetched
     */
    public static void main(String[] args) throws Exception {
        String url = "https://sj.zol.com.cn/bizhi/detail_12078_130945.html";
        String pageStr = GetPageStr(url);
        // Passing the page URL as base URI lets jsoup resolve relative and
        // protocol-relative ("//host/...") links into absolute ones.
        Document document = Jsoup.parse(pageStr, url);
        Elements elements = document.select("ul#showImg li");
        if (elements.isEmpty()) {
            // newFixedThreadPool(0) would throw IllegalArgumentException.
            System.out.println("No images found on " + url);
            return;
        }

        ExecutorService executor =
                Executors.newFixedThreadPool(Math.min(elements.size(), MAX_THREADS));
        try {
            for (Element li : elements) {
                Element img = li.selectFirst("img");
                if (img == null) {
                    // List entry without an image: nothing to download.
                    continue;
                }
                // Prefer "src"; fall back to the alternative "srcs" attribute when it is empty.
                String imgUrl = img.absUrl("src");
                if (imgUrl.isEmpty()) {
                    imgUrl = img.absUrl("srcs");
                }
                if (imgUrl.isEmpty()) {
                    continue;
                }

                ImageDownloader downloader = new ImageDownloader(SAVE_DIR, imgUrl);
                executor.execute(downloader);
            }
        } finally {
            // Always release the pool, otherwise its non-daemon workers keep the JVM alive
            // when the submission loop fails.
            executor.shutdown();
        }
    }

    /**
     * Issues an HTTP GET and returns the response body decoded as UTF-8.
     *
     * @param url absolute URL of the page to fetch
     * @return the page content, one {@code '\n'} after every line read
     * @throws IOException if the URL is malformed or the request fails
     */
    private static String GetPageStr(String url) throws IOException {
        HttpURLConnection con = (HttpURLConnection) new URL(url).openConnection();
        con.setRequestMethod("GET");
        con.setConnectTimeout(TIMEOUT_MILLIS);
        con.setReadTimeout(TIMEOUT_MILLIS);

        StringBuilder response = new StringBuilder();
        // try-with-resources closes the reader even when reading fails half-way.
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(con.getInputStream(), "UTF-8"))) {
            String inputLine;
            while ((inputLine = in.readLine()) != null) {
                // Keep the line break: a tag split across lines would otherwise be glued
                // together (e.g. "<img" + "src=...") and no longer parse correctly.
                response.append(inputLine).append('\n');
            }
        } finally {
            con.disconnect();
        }

        System.out.println("GET Response: " + response);
        return response.toString();
    }

    /**
     * Produces a file name that is unique within this run.
     *
     * <p>A bare millisecond timestamp is not enough here: two workers finishing in the same
     * millisecond would get the same name and one image would overwrite the other.
     *
     * @return a name of the form {@code <timestamp>_<sequence>.jpg}
     */
    private static synchronized String nextFileName() {
        fileSequence++;
        return LocalDateTime.now().format(FILE_NAME_FORMAT) + "_" + fileSequence + ".jpg";
    }

    /**
     * Downloads one image and stores it under a generated, unique {@code .jpg} file name.
     *
     * @param savePath target directory; created if it does not exist yet
     * @param url      absolute URL of the image
     * @throws IOException if the download or the file write fails
     */
    private static void downloadImg(String savePath, String url) throws IOException {
        Path saveDir = Paths.get(savePath);
        // Files.copy does not create parent directories, so make sure the target exists.
        Files.createDirectories(saveDir);

        HttpURLConnection con = (HttpURLConnection) new URL(url).openConnection();
        con.setRequestMethod("GET");
        con.setConnectTimeout(TIMEOUT_MILLIS);
        con.setReadTimeout(TIMEOUT_MILLIS);
        try (InputStream inputStream = con.getInputStream()) {
            String fileName = nextFileName();
            Files.copy(inputStream, saveDir.resolve(fileName), StandardCopyOption.REPLACE_EXISTING);
            System.out.println(fileName + " download is successful");
        } finally {
            con.disconnect();
        }
    }

    /**
     * Download task for a single image. It is a plain {@link Runnable} because the executor
     * supplies the thread; the task itself never needs to be a {@code Thread}.
     */
    static class ImageDownloader implements Runnable {

        private final String savePath;
        private final String imgUrl;

        /**
         * @param savePath target directory for the image
         * @param imgUrl   absolute URL of the image to download
         */
        public ImageDownloader(String savePath, String imgUrl) {
            this.savePath = savePath;
            this.imgUrl = imgUrl;
        }

        /**
         * Downloads the image.
         *
         * @throws RuntimeException wrapping the {@link IOException} if the download fails
         */
        @Override
        public void run() {
            try {
                downloadImg(savePath, imgUrl);
            } catch (IOException e) {
                // Name the failing URL so the worker's stack trace is actionable.
                throw new RuntimeException("Failed to download " + imgUrl, e);
            }
        }
    }
}
|
关键代码
1 2
| ImageDownloader downloader = new ImageDownloader("d:\\img\\", imgUrl); executor.execute(downloader);
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
| public static void main(String[] args) throws Exception {
    // Entry point of the Future-based variant: submits one download task per image and
    // waits for all of them, so a failed download surfaces as an exception here.
    String url = "https://sj.zol.com.cn/bizhi/detail_12078_130945.html";
    String pageStr = String.valueOf(GetPageStr(url));
    Document document = Jsoup.parse(pageStr);
    Elements elements = document.select("ul#showImg li");
    if (elements.isEmpty()) {
        // Nothing to download; newFixedThreadPool(0) would throw IllegalArgumentException.
        return;
    }

    // Bound the pool so a long image list does not open one connection per image at once.
    final int maxThreads = 8;
    ExecutorService executor =
            Executors.newFixedThreadPool(Math.min(elements.size(), maxThreads));
    try {
        List<Future<Void>> futures = new ArrayList<>();
        for (Element li : elements) {
            Element img = li.selectFirst("img");
            if (img == null) {
                // List entry without an image: nothing to download.
                continue;
            }
            // Prefer "src"; fall back to the alternative "srcs" attribute when it is empty.
            final String imgUrl =
                    img.attr("src").length() > 0 ? img.attr("src") : img.attr("srcs");
            Callable<Void> downloadTask = () -> {
                downloadImg("d:\\img\\", imgUrl);
                return null;
            };
            futures.add(executor.submit(downloadTask));
        }
        // get() rethrows a task failure wrapped in ExecutionException.
        for (Future<Void> future : futures) {
            future.get();
        }
    } finally {
        // Shut down even when get() throws; otherwise the pool's non-daemon worker
        // threads keep the JVM running after main has failed.
        executor.shutdown();
    }
}
|