
A Web Crawler in Java

Overview

  • I previously wrote a crawler example in Go; this time the same exercise is done in Java.
  • The main prerequisite is knowing how to issue HTTP requests; a minimal sketch of such a request follows below.
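For reference, the bare bones of a GET request with HttpURLConnection look like this (a minimal sketch; http://example.com is a placeholder URL):

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

public class MinimalGet {
    public static void main(String[] args) throws Exception {
        // Placeholder URL; substitute the page you actually want to fetch
        URL url = new URL("http://example.com");
        HttpURLConnection con = (HttpURLConnection) url.openConnection();
        con.setRequestMethod("GET");
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(con.getInputStream(), "UTF-8"))) {
            String line;
            while ((line = in.readLine()) != null) {
                System.out.println(line);
            }
        }
    }
}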

Code

import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class Craw {
    public static void main(String[] args) throws Exception {
        for (int i = 2; i < 3; i++) {
            String url = "http://www.52desktop.cn/html/DLZM/KPBZ/20191205/15898_" + Integer.toString(i) + ".html";
            String pageStr = String.valueOf(GetPageStr(url));
            // Match image URLs ending in a common image extension
            String regImg = "http://i.52desktop.cn:81/upimg/allimg/[^\\\"]+?(\\.(jpg|png|jpeg|gif|bmp))";
            Pattern pattern = Pattern.compile(regImg);
            Matcher matcher = pattern.matcher(pageStr);
            if (matcher.find()) {
                System.out.println(matcher.groupCount());
                System.out.println(matcher.group(0));
                System.out.println(matcher.group(1));
                if (matcher.group(0).contains("20191204")) {
                    downloadImg("d:\\img\\", matcher.group(0));
                }
            }
        }
    }

    private static StringBuffer GetPageStr(String url) throws Exception {
        URL obj = new URL(url);
        HttpURLConnection con = (HttpURLConnection) obj.openConnection();
        // Send a GET request
        con.setRequestMethod("GET");
        // con.getInputStream() is the stream of data the server sends back.
        // InputStreamReader converts the byte stream to characters; BufferedReader
        // wraps it to add buffering for efficient reads.
        BufferedReader in = new BufferedReader(new InputStreamReader(con.getInputStream(), "UTF-8"));
        String inputLine;
        StringBuffer response = new StringBuffer();
        // Call readLine() repeatedly, appending each line to the StringBuffer.
        // readLine() returns null once all data has been read, ending the loop.
        while ((inputLine = in.readLine()) != null) {
            response.append(inputLine);
        }
        in.close();

        // Print the server's response
        System.out.println("GET Response: " + response.toString());
        return response;
    }

    private static void downloadImg(String savePath, String url) throws IOException {
        URL imageUrl = new URL(url);
        HttpURLConnection con = (HttpURLConnection) imageUrl.openConnection();
        // Send a GET request
        con.setRequestMethod("GET");
        try (InputStream inputStream = con.getInputStream()) {
            // Derive the file name from the last path segment of the URL
            String fileName = url.substring(url.lastIndexOf("/") + 1);
            // Build the full path the image will be saved under
            Path filePath = Paths.get(savePath, fileName);
            // Copy the contents of the input stream to the target file
            Files.copy(inputStream, filePath, StandardCopyOption.REPLACE_EXISTING);
            System.out.println(fileName + " download is successful");
        }
    }
}

The code above does run, but the target site has since become largely unreachable.
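Note that the single matcher.find() above only handles the first image URL on each page. If a page contained several, a small variation (a sketch, reusing pattern and pageStr from the code above) walks through every match instead:

// Iterate over all matches on the page rather than just the first one
Matcher matcher = pattern.matcher(pageStr);
while (matcher.find()) {
    String imgUrl = matcher.group(0); // the full matched image URL
    System.out.println(imgUrl);
    // downloadImg("d:\\img\\", imgUrl); // download each match as before
}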

  • An example that crawls a different target site
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;

public class Craw {
    public static void main(String[] args) throws Exception {
        String url = "https://sj.zol.com.cn/bizhi/detail_12078_130945.html";
        // Fetch the raw HTML of the page
        String pageStr = String.valueOf(GetPageStr(url));
        // Parse it into a DOM with Jsoup so it can be filtered
        Document document = Jsoup.parse(pageStr);
        // CSS selector syntax
        Elements elements = document.select("ul#showImg li");
        // Loop over the image elements
        for (Element li : elements) {
            Element img = li.selectFirst("img");
            String imgUrl;
            if (img.attr("src").length() > 0) {
                imgUrl = img.attr("src");
            } else {
                // Fall back to the "srcs" attribute this page uses for lazy-loaded images
                imgUrl = img.attr("srcs");
            }
            System.out.println(imgUrl);
            downloadImg("d:\\img\\", imgUrl);
        }
    }

    private static StringBuffer GetPageStr(String url) throws Exception {
        URL obj = new URL(url);
        HttpURLConnection con = (HttpURLConnection) obj.openConnection();
        // Send a GET request
        con.setRequestMethod("GET");
        // con.getInputStream() is the stream of data the server sends back.
        // InputStreamReader converts the byte stream to characters; BufferedReader
        // wraps it to add buffering for efficient reads.
        BufferedReader in = new BufferedReader(new InputStreamReader(con.getInputStream(), "UTF-8"));
        String inputLine;
        StringBuffer response = new StringBuffer();
        // Call readLine() repeatedly, appending each line to the StringBuffer.
        // readLine() returns null once all data has been read, ending the loop.
        while ((inputLine = in.readLine()) != null) {
            response.append(inputLine);
        }
        in.close();

        // Print the server's response
        System.out.println("GET Response: " + response.toString());
        return response;
    }

    private static void downloadImg(String savePath, String url) throws IOException {
        URL imageUrl = new URL(url);
        HttpURLConnection con = (HttpURLConnection) imageUrl.openConnection();
        // Send a GET request
        con.setRequestMethod("GET");
        try (InputStream inputStream = con.getInputStream()) {
            // Naming by URL segment: String fileName = url.substring(url.lastIndexOf("/") + 1);
            // Instead, generate a unique file name from the current timestamp
            LocalDateTime now = LocalDateTime.now();
            DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyyMMddHHmmssSSS");
            String fileName = now.format(formatter) + ".jpg";
            // Build the full path the image will be saved under
            Path filePath = Paths.get(savePath, fileName);
            // Copy the contents of the input stream to the target file
            Files.copy(inputStream, filePath, StandardCopyOption.REPLACE_EXISTING);
            System.out.println(fileName + " download is successful");
        }
    }
}

This time the image URLs are extracted from the target site with jsoup, then downloaded to the local disk over HTTP.

The jsoup dependency has to be added to pom.xml:

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.14.3</version>
</dependency>
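As an aside, jsoup can also perform the HTTP request itself, which would make the hand-written GetPageStr unnecessary. A minimal sketch (the user-agent string is an arbitrary choice):

// Fetch and parse the page in one step with jsoup
Document document = Jsoup.connect("https://sj.zol.com.cn/bizhi/detail_12078_130945.html")
        .userAgent("Mozilla/5.0") // some sites reject requests without a browser-like UA
        .get();
Elements elements = document.select("ul#showImg li");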
  • The code below improves on this again by downloading with multiple threads
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.file.*;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

public class ThreadCraw {
    public static void main(String[] args) throws Exception {
        String url = "https://sj.zol.com.cn/bizhi/detail_12078_130945.html";
        // Fetch the raw HTML of the page
        String pageStr = String.valueOf(GetPageStr(url));
        // Parse it into a DOM with Jsoup so it can be filtered
        Document document = Jsoup.parse(pageStr);
        // CSS selector syntax
        Elements elements = document.select("ul#showImg li");
        // Create a thread pool, one thread per image
        ExecutorService executor = Executors.newFixedThreadPool(elements.size());
        // Loop over the image elements
        for (Element li : elements) {
            Element img = li.selectFirst("img");
            String imgUrl;
            if (img.attr("src").length() > 0) {
                imgUrl = img.attr("src");
            } else {
                imgUrl = img.attr("srcs");
            }
            // downloadImg("d:\\img\\", imgUrl);
            // Create the download task and hand it to the pool
            ImageDownloader downloader = new ImageDownloader("d:\\img\\", imgUrl);
            executor.execute(downloader);
        }
        // Shut down the pool once all tasks have been submitted
        executor.shutdown();
    }

    private static StringBuffer GetPageStr(String url) throws Exception {
        URL obj = new URL(url);
        HttpURLConnection con = (HttpURLConnection) obj.openConnection();
        // Send a GET request
        con.setRequestMethod("GET");
        // con.getInputStream() is the stream of data the server sends back.
        // InputStreamReader converts the byte stream to characters; BufferedReader
        // wraps it to add buffering for efficient reads.
        BufferedReader in = new BufferedReader(new InputStreamReader(con.getInputStream(), "UTF-8"));
        String inputLine;
        StringBuffer response = new StringBuffer();
        // Call readLine() repeatedly, appending each line to the StringBuffer.
        // readLine() returns null once all data has been read, ending the loop.
        while ((inputLine = in.readLine()) != null) {
            response.append(inputLine);
        }
        in.close();

        // Print the server's response
        System.out.println("GET Response: " + response.toString());
        return response;
    }

    private static void downloadImg(String savePath, String url) throws IOException {
        URL imageUrl = new URL(url);
        HttpURLConnection con = (HttpURLConnection) imageUrl.openConnection();
        // Send a GET request
        con.setRequestMethod("GET");
        try (InputStream inputStream = con.getInputStream()) {
            // Naming by URL segment: String fileName = url.substring(url.lastIndexOf("/") + 1);
            // Instead, generate a unique file name from the current timestamp
            LocalDateTime now = LocalDateTime.now();
            DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyyMMddHHmmssSSS");
            String fileName = now.format(formatter) + ".jpg";
            // Build the full path the image will be saved under
            Path filePath = Paths.get(savePath, fileName);
            try {
                // Copy the contents of the input stream to the target file
                Files.copy(inputStream, filePath, StandardCopyOption.REPLACE_EXISTING);
                System.out.println(fileName + " download is successful");
            } catch (FileAlreadyExistsException e) {
                // Ignore a file that already exists
            }
        }
    }

    // A download task; Thread implements Runnable, so the pool can schedule it
    static class ImageDownloader extends Thread {

        private String savePath;
        private String imgUrl;

        public ImageDownloader(String savePath, String imgUrl) {
            this.savePath = savePath;
            this.imgUrl = imgUrl;
        }

        @Override
        public void run() {
            try {
                downloadImg(savePath, imgUrl);
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
    }
}
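One caveat with the timestamp file names: two pool threads can call LocalDateTime.now() within the same millisecond, in which case REPLACE_EXISTING silently overwrites the first download. A sketch of a collision-proof alternative using java.util.UUID (not in the original code):

// UUID-based names are unique across threads, so downloads cannot clobber each other
String fileName = java.util.UUID.randomUUID().toString() + ".jpg";
Path filePath = Paths.get(savePath, fileName);
Files.copy(inputStream, filePath, StandardCopyOption.REPLACE_EXISTING);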

Key code

  • Create the thread pool: ExecutorService executor = Executors.newFixedThreadPool(elements.size()); (a note on pool sizing follows the snippet below)

  • Finally, hand the task to the pool for execution:

ImageDownloader downloader = new ImageDownloader("d:\\img\\", imgUrl);
executor.execute(downloader);
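Note that newFixedThreadPool(elements.size()) spawns one thread per image, which is fine for a page of wallpapers but does not scale to larger crawls. A bounded pool (a sketch; the size 4 is an arbitrary choice) keeps the thread count fixed while the remaining tasks queue up automatically:

// Cap the pool at a fixed size; submitted tasks beyond that simply wait in the queue
ExecutorService executor = Executors.newFixedThreadPool(4);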
  • A more elegant approach, though, is the following:
// This version additionally needs java.util.ArrayList, java.util.List,
// java.util.concurrent.Callable and java.util.concurrent.Future.
public static void main(String[] args) throws Exception {
    String url = "https://sj.zol.com.cn/bizhi/detail_12078_130945.html";
    // Fetch the raw HTML of the page
    String pageStr = String.valueOf(GetPageStr(url));
    // Parse it into a DOM with Jsoup so it can be filtered
    Document document = Jsoup.parse(pageStr);
    // CSS selector syntax
    Elements elements = document.select("ul#showImg li");

    // Create the thread pool
    ExecutorService executor = Executors.newFixedThreadPool(elements.size());
    List<Future<Void>> futures = new ArrayList<>();

    // Loop over the image elements
    for (Element li : elements) {
        Element img = li.selectFirst("img");
        String imgUrl;
        if (img.attr("src").length() > 0) {
            imgUrl = img.attr("src");
        } else {
            imgUrl = img.attr("srcs");
        }

        // Submit the download task to the pool
        Callable<Void> downloadTask = () -> {
            downloadImg("d:\\img\\", imgUrl);
            return null;
        };
        Future<Void> future = executor.submit(downloadTask);
        futures.add(future);
    }

    // Wait for all tasks to finish
    for (Future<Void> future : futures) {
        future.get();
    }

    // Shut down the pool
    executor.shutdown();
}
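The same wait-for-completion behavior can also be written with invokeAll, which takes the whole collection of Callables and blocks until every one has finished (a sketch built on the same loop as above):

// Collect the Callables first, then let invokeAll() block until all of them complete
List<Callable<Void>> tasks = new ArrayList<>();
for (Element li : elements) {
    Element img = li.selectFirst("img");
    String imgUrl = img.attr("src").length() > 0 ? img.attr("src") : img.attr("srcs");
    tasks.add(() -> {
        downloadImg("d:\\img\\", imgUrl);
        return null;
    });
}
executor.invokeAll(tasks); // returns only after every task has completed
executor.shutdown();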