Java爬虫一个ACG 网站 pic

Naraci · 发表于 2024-3-5 00:42

闲来无事，顺便学习一下 Java 的爬虫和用 IO 流保存文件，便找了一个网站来练练手。
手写了个爬虫，但是单线程爬取太慢，便让 GPT 帮忙加了个多线程。嗯~ 很快，快到被网站拒绝连接了。因为没有好的代理池，所以就没加；有代理池的好兄弟可以加上试试。直接上代码，技术比较菜，希望大佬们指点我。（对了，代码是在 Spring Boot 的单元测试里面写的。）

[Java] 纯文本查看 复制代码

import com.naraci.core.util.StringUtils;
import com.naraci.core.util.UrlUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.jupiter.api.Test;

import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

/**
 * [url=home.php?mod=space&uid=686208]@AuThor[/url] ShenZhaoYu
 * [url=home.php?mod=space&uid=686237]@date[/url] 2024/3/2
 */

public class TestAc {

    // 由于子页面都来自于主url拼接地址 所有定义一下方便循环拼接
    private String localUrl = "https://www.ciyuanjie.cn";[/align]
[align=left]
// 这个是图片的板块 可以更换的 这个网站还有动漫图什么的 另外就是只能从 一个板块的首页爬 因为我这个是遍历的 
    private String jxLocalUrl = "https://www.ciyuanjie.cn/cosplay";

    @Test
    public void Thread() throws IOException {
        TestAc testAc = new TestAc();
        testAc.pImage();
    }

    /**
     * 获取页面的所有选项目录链接
     */
    public void pImage() throws IOException {
        String url = jxLocalUrl();

        Document doc = Jsoup.parse(new URL(url), 30000);
        // 获取总页数
        Elements tag = doc.getElementsByClass("page-numbers");
        Element e = tag.get(tag.size() - 2);
        // 提取文本内容
        String pageText = e.text();
        int pageSum = Integer.parseInt(StringUtils.RequestNumber(pageText));

        // 存储 所页面的url
        List<String> pageUrlList = new ArrayList<>();
        // 循环遍历 所有页
        for (int i = 0; i <= pageSum; i++) {
            String s = url + "/page_" + i + ".html";
            pageUrlList.add(s);
        }
        LocalPageUrl(pageUrlList);
    }

    public void jxLocalUrl(String jxLocalUrl) {
        this.jxLocalUrl = jxLocalUrl;
    }

    public String jxLocalUrl() {
        return this.jxLocalUrl;
    }

    /**
     * 根据页数来遍历图集资源地址
     * [url=home.php?mod=space&uid=952169]@Param[/url] urls
     */
    public void LocalPageUrl(List<String> urls) {
        for (String url : urls) {
            Document doc;
            try {
                doc = Jsoup.parse(new URL(url), 30000);
                // 创建一个存储待抓取链接页面的列表
                List<String> links = new ArrayList<>();
                //  获取所有的链接
                Elements allLink = doc.select("#index_ajax_list");
                Elements allA = allLink.select(".kzpost-data");
                for (int i = 0; i<= allA.size()-1; ++i) {
                    Element text = allA.get(i);
                    Elements a1 = text.select("a");
                    String value = a1.attr("href");
                    links.add(localUrl+value);
                }
                // 创建一个固定大小的线程池
                ExecutorService executor = Executors.newFixedThreadPool(16);
                for (String atlas : links) {
                    executor.execute(new ImageDownloadThread(atlas));
                }
                executor.shutdown();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}

/**
 * Task that downloads every image of one gallery page into a folder named after the page title.
 */
class ImageDownloadThread implements Runnable {

    /** Directory under which one folder per gallery is created. */
    private static final String IMAGE_ROOT =
            "D:\\Zhaoyu\\zhaoyuCode\\YuZiApi\\YuziApi\\boot\\src\\main\\resources\\Images";

    /** Directory that receives the images when the gallery folder cannot be created. */
    private static final String FALLBACK_DIR =
            "D:\\Zhaoyu\\zhaoyuCode\\YuZiApi\\YuziApi\\boot\\src\\main\\resources\\Images\\创建失败的目录存放处";

    /** Timeout, in milliseconds, for page fetches and for image connections. */
    private static final int TIMEOUT_MS = 30000;

    private final String url;

    /**
     * @param url URL of the gallery page to download
     */
    public ImageDownloadThread(String url) {
        this.url = url;
    }

    /** Downloads the gallery; an I/O failure is reported instead of killing the worker thread. */
    @Override
    public void run() {
        try {
            traverse(url);
        } catch (IOException e) {
            System.err.println("Failed to download gallery " + url);
            e.printStackTrace();
        }
    }

    /**
     * Fetches the gallery page, collects the {@code src} of the images selected by
     * {@code .aligncenter} and downloads them into a folder named after the page title.
     *
     * @param url URL of the gallery page
     * @throws IOException if the page cannot be fetched or no target directory can be created
     */
    public void traverse(String url) throws IOException {
        Document dc = Jsoup.parse(new URL(url), TIMEOUT_MS);
        Elements targetClass = dc.getElementsByTag("img").select(".aligncenter");

        List<String> imagesLinks = new ArrayList<>();
        for (Element image : targetClass) {
            String imageUrl = image.attr("src");
            // An image without a src cannot be downloaded; skipping it avoids a malformed URL.
            if (!imageUrl.isEmpty()) {
                imagesLinks.add(imageUrl);
            }
        }

        String title = dc.getElementsByTag("title").text();
        imageDownload(imagesLinks, title);
    }

    /**
     * Saves every image into {@code IMAGE_ROOT/<title>}, falling back to {@code FALLBACK_DIR}
     * when that folder cannot be created. A failing image is reported and skipped so that the
     * remaining images of the gallery are still downloaded.
     *
     * @param imageLinks image URLs to download
     * @param title page title, used (after sanitising) as the folder name
     * @throws IOException if neither the gallery folder nor the fallback folder can be created
     */
    public void imageDownload(List<String> imageLinks, String title) throws IOException {
        // Page titles can contain characters that are illegal in file names.
        String folderName = sanitize(title);
        if (folderName.isEmpty()) {
            folderName = "untitled";
        }

        File targetDir = new File(IMAGE_ROOT, folderName);
        if (targetDir.mkdirs()) {
            System.out.println("文件夹创建成功" + targetDir);
        } else if (!targetDir.isDirectory()) {
            // mkdirs() also returns false when the folder already exists, so only a folder
            // that is really missing counts as a failure.
            System.out.println("文件夹创建失败,当前文件夹名"+ title);
            targetDir = new File(FALLBACK_DIR);
            if (!targetDir.isDirectory() && !targetDir.mkdirs()) {
                throw new IOException("Cannot create fallback directory " + targetDir);
            }
        }

        for (String link : imageLinks) {
            try {
                downloadOne(link, targetDir);
            } catch (IOException e) {
                System.err.println("Failed to download image " + link);
                e.printStackTrace();
            }
        }
    }

    /** Downloads one image into {@code targetDir}, named after the last segment of its URL. */
    private void downloadOne(String link, File targetDir) throws IOException {
        String fileName = sanitize(link.substring(link.lastIndexOf('/') + 1));
        if (fileName.isEmpty()) {
            throw new IOException("No file name in image URL: " + link);
        }

        String redirected = UrlUtils.getRedirectedURL(link);
        HttpURLConnection connection = (HttpURLConnection) new URL(redirected).openConnection();
        connection.setConnectTimeout(TIMEOUT_MS);
        connection.setReadTimeout(TIMEOUT_MS);
        // try-with-resources closes both streams even when the copy fails half-way.
        try (InputStream in = connection.getInputStream();
             OutputStream out = new FileOutputStream(new File(targetDir, fileName))) {
            byte[] buffer = new byte[1024 * 2];
            int len;
            while ((len = in.read(buffer)) != -1) {
                out.write(buffer, 0, len);
            }
            System.out.println("保存成功");
        } finally {
            connection.disconnect();
        }
    }

    /** Replaces the characters that are illegal in Windows file names with underscores. */
    private static String sanitize(String name) {
        return name.replaceAll("[\\\\/:*?\"<>|]", "_").trim();
    }
}

Maiz1888 · 发表于 2024-3-5 09:45

Python版本：

[Python] 纯文本查看 复制代码

import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import os


class TestAc:
    """Crawler that downloads the gallery images of one section of ciyuanjie.cn.

    Listing pages are visited one after another; the galleries of a listing page are
    downloaded in parallel, so at most 16 galleries are fetched at the same time.
    """

    # Characters that are not allowed in Windows file and directory names.
    _INVALID_NAME_CHARS = '\\/:*?"<>|'

    def __init__(self, save_dir="D:\\maiz\\Images", timeout=30):
        """
        :param save_dir: directory under which one folder per gallery is created
        :param timeout: timeout in seconds applied to every HTTP request
        """
        # Base URL of the site; gallery hrefs are appended to it.
        self.local_url = "https://www.ciyuanjie.cn"
        # Landing page of the section to crawl.
        self.jx_local_url = "https://www.ciyuanjie.cn/cosplay"
        self.save_dir = save_dir
        self.timeout = timeout

    @classmethod
    def _safe_name(cls, name):
        """Return ``name`` with illegal file-name characters replaced by underscores."""
        cleaned = ''.join('_' if ch in cls._INVALID_NAME_CHARS else ch for ch in name)
        # Windows rejects names that end with a space or a dot.
        cleaned = cleaned.strip().rstrip('. ')
        return cleaned or "untitled"

    def run_threads(self):
        """Run the whole crawl."""
        self.p_image()

    def p_image(self):
        """Read the section landing page, build every listing-page URL and crawl each one."""
        url = self.jx_local_url
        response = requests.get(url, timeout=self.timeout)
        response.raise_for_status()
        doc = BeautifulSoup(response.text, 'html.parser')

        tag = doc.find_all(class_="page-numbers")
        if len(tag) < 2:
            # No usable pager: crawl the landing page itself instead of raising IndexError.
            page_url_list = [url]
        else:
            # The second-to-last pager entry is read as the total page count.
            page_text = tag[-2].text
            page_sum = int(''.join(filter(str.isdigit, page_text)))
            # NOTE(review): the range starts at 0, so "page_0.html" is requested as well;
            # confirm whether the site numbers its pages from 0 or from 1.
            page_url_list = [f"{url}/page_{i}.html" for i in range(page_sum + 1)]

        # Pages are handled sequentially: local_page_url already downloads in parallel, and
        # nesting two 16-worker pools would allow up to 256 simultaneous requests.
        for page_url in page_url_list:
            try:
                self.local_page_url(page_url)
            except Exception as e:
                print(f"Error processing page {page_url}: {e}")

    def set_jx_local_url(self, jx_local_url):
        """Set the landing page of the section to crawl."""
        self.jx_local_url = jx_local_url

    def get_jx_local_url(self):
        """Return the landing page of the section to crawl."""
        return self.jx_local_url

    def local_page_url(self, url):
        """Download, in parallel, every gallery linked from the listing page ``url``."""
        response = requests.get(url, timeout=self.timeout)
        response.raise_for_status()
        doc = BeautifulSoup(response.text, 'html.parser')

        anchors = doc.select("#index_ajax_list .kzpost-data a")
        links = [f"{self.local_url}{a['href']}" for a in anchors if a.get('href')]
        # A post may contain several anchors to the same gallery; keep each link once.
        links = list(dict.fromkeys(links))

        with ThreadPoolExecutor(max_workers=16) as executor:
            futures = [executor.submit(self.image_download, link) for link in links]
            # Collect every result so that a failing gallery is reported, not silently lost.
            for link, future in zip(links, futures):
                try:
                    future.result()
                except Exception as e:
                    print(f"Error processing gallery {link}: {e}")

    def image_download(self, url):
        """Fetch the gallery page ``url`` and download its ``img.aligncenter`` images."""
        response = requests.get(url, timeout=self.timeout)
        response.raise_for_status()
        doc = BeautifulSoup(response.text, 'html.parser')

        image_links = [img['src'] for img in doc.select("img.aligncenter") if img.get('src')]

        title_tag = doc.find("title")
        title = title_tag.text if title_tag else ""

        self.download_images(image_links, title)

    def download_images(self, image_links, title):
        """Save every image of ``image_links`` into ``<save_dir>/<sanitised title>``.

        A failing image is reported and skipped; the remaining images are still downloaded.
        """
        folder_path = os.path.join(self.save_dir, self._safe_name(title))

        try:
            os.makedirs(folder_path, exist_ok=True)
        except OSError as e:
            print(f"Error downloading images: {e}")
            return

        for url in image_links:
            try:
                redirected = requests.head(url, allow_redirects=True, timeout=self.timeout).url
                # Drop any query string before deriving the file name from the URL.
                base_name = os.path.basename(redirected.split("?", 1)[0])
                file_name = os.path.join(folder_path, self._safe_name(base_name))
                response = requests.get(redirected, timeout=self.timeout)
                # Do not store an HTTP error page as if it were an image.
                response.raise_for_status()
                with open(file_name, 'wb') as file:
                    file.write(response.content)
                print("Saved successfully:", file_name)
            except Exception as e:
                print(f"Error downloading images: {e}")


if __name__ == "__main__":
    # Script entry point: build the crawler and start the crawl.
    TestAc().run_threads()

傻小猪 · 发表于 2024-3-5 15:23

本帖最后由傻小猪于 2024-3-5 15:25 编辑

CoderDream 发表于 2024-3-5 09:09
这三句如果能加上注释就好了，哪位大佬帮忙解释一下，多谢！

这三句代码是用于从 URL 中提取文件名的。下面是每一句代码的作用：
1. String path = url.replaceAll("^https?://[^/]+/", "");
- 这一句代码使用正则表达式将 URL 中的协议和域名部分替换为空字符串，从而得到 URL 的路径部分。
2. String fileName = path.replaceAll(".*/(.*?)$", "$1");
- 这一句代码使用正则表达式从路径中提取文件名。它将路径中最后一个斜杠（/）后的部分提取出来作为文件名。
3. fileName = fileName.replaceAll("[\\\\/:*?\"<>|]", "_");
- 这一句代码将文件名中的特殊字符（包括反斜杠、正斜杠、冒号、星号、问号、双引号、尖括号和竖线）替换为下划线，以确保文件名的合法性。
这三句代码的作用是从给定的 URL 中提取出文件名，并确保文件名中不包含特殊字符，以便后续操作。

milaicai · 发表于 2024-3-5 01:05

好好好我是第一个

chenzanlin · 发表于 2024-3-5 01:23

哈哈，这个不错哦

hhzhb2005 · 发表于 2024-3-5 05:32

这个不错哦

Corgibro · 发表于 2024-3-5 07:36

好好好，这个爬得真好

ltgb · 发表于 2024-3-5 07:48

爬的COS不错

TonyLiu · 发表于 2024-3-5 09:04

真不戳，给你点赞

acbcbo · 发表于 2024-3-5 09:05

怎么搞，我怎么不行呢

acbcbo · 发表于 2024-3-5 09:06

爬的COS不错

CoderDream · 发表于 2024-3-5 09:09

String path = url.replaceAll("^https?://[^/]+/", "");
String fileName = path.replaceAll(".*/(.*?)$", "$1");
fileName = fileName.replaceAll("[\\\\/:*?\"<>|]", "_");

这三句如果能加上注释就好了，哪位大佬帮忙解释一下，多谢！

帐号		自动登录	找回密码
密码			注册[Register]

[Java 原创] Java爬虫一个ACG 网站 pic

免费评分

本帖被以下淘专辑推荐:

免费评分

[Java 原创] Java爬虫 一个ACG 网站 pic

免费评分

本帖被以下淘专辑推荐:

免费评分

[Java 原创] Java爬虫一个ACG 网站 pic