吾爱破解 - LCG - LSG |安卓破解|病毒分析|www.52pojie.cn

 找回密码
 注册[Register]

QQ登录

只需一步,快速开始

查看: 3172|回复: 20
收起左侧

[其他转载] 使用C#语言+HtmlAgilityPack爬取妹子图片

[复制链接]
TZ糖纸 发表于 2020-8-12 10:40
本帖最后由 TZ糖纸 于 2020-8-12 10:43 编辑

这里使用了HtmlAgilityPack以及HttpHelper
HtmlAgilityPack可以直接在nuget下载

HttpHelper请自行百度

Gitee地址:https://gitee.com/TZTZTZ980929/Spider
[C#] 纯文本查看 复制代码
using SufeiUtil;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Threading;

namespace Spider
{
    /// <summary>
    /// Crawls the gallery index at https://www.mzitu.com/all/ and saves the image of
    /// every gallery page as a .jpg file under <see cref="rootPath"/>.
    /// HTML is parsed with HtmlAgilityPack and fetched through HttpHelper.
    /// </summary>
    public class MeiZiTu
    {
        /// <summary>
        /// Output directory for downloaded images. When left null, <see cref="Test1"/>
        /// sets it to a "mzitu" folder under the application base directory.
        /// </summary>
        public string rootPath = null;

        /// <summary>
        /// Page count of the gallery currently being downloaded.
        /// <see cref="Test1"/> resets it to 0 after each gallery.
        /// </summary>
        public int pageCount = 0;

        // Shared parser instance; every LoadHtml call replaces the previously loaded document.
        HtmlAgilityPack.HtmlDocument dc = new HtmlAgilityPack.HtmlDocument();

        /// <summary>
        /// Entry point: reads the index page, then downloads every linked gallery
        /// whose URL does not contain "old".
        /// </summary>
        public void Test1()
        {
            if (rootPath == null)
            {
                // Path.Combine avoids a hard-coded '\\'; the trailing separator is kept
                // so the public field has the same shape as before.
                rootPath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "mzitu") + Path.DirectorySeparatorChar;
            }

            // Ensure the directory exists even when the caller supplied rootPath.
            // An empty rootPath means "current directory" and needs no creation.
            if (rootPath.Length > 0 && !checkDir(rootPath))
            {
                Console.Error.WriteLine("Cannot create output directory: " + rootPath);
                return;
            }

            var allUrl = "https://www.mzitu.com/all/";
            // Fetch the HTML of the daily-update index page.
            var allHtml = GetHtml(allUrl);
            var allList = GetALlUrl(allHtml);
            foreach (var url in allList)
            {
                if (url.Contains("old"))
                {
                    continue;
                }

                var pageHtml = GetHtml(url);
                downloadAllImage(url, pageHtml);
                this.pageCount = 0;
            }
        }

        /// <summary>
        /// Downloads the image of every page of one gallery.
        /// </summary>
        /// <param name="url">URL of the gallery's first page; page N is requested as url + "/" + N.</param>
        /// <param name="pageHtml">Already-fetched HTML of the first page.</param>
        private void downloadAllImage(string url, string pageHtml)
        {
            try
            {
                this.pageCount = ReadPageCount(pageHtml);
            }
            catch (Exception ex)
            {
                Console.Error.WriteLine("Failed to read the page count of " + url + ": " + ex.Message);
                return;
            }

            if (this.pageCount == 0)
            {
                Console.Error.WriteLine("Empty response for " + url + ", gallery skipped.");
                return;
            }

            for (int i = 1; i <= this.pageCount; i++)
            {
                try
                {
                    // The first page was already fetched by the caller; do not request it twice.
                    var html = (i == 1) ? pageHtml : GetHtml(url + "/" + i);
                    downloadImage(html);
                }
                catch (Exception ex)
                {
                    // Best effort: report the failed page and carry on with the next one
                    // instead of silently abandoning the rest of the gallery.
                    Console.Error.WriteLine("Failed to download page " + i + " of " + url + ": " + ex.Message);
                }

                // Pause between requests to throttle the crawl.
                Thread.Sleep(2000);
            }
        }

        /// <summary>
        /// Reads the page count from the second-to-last link inside the
        /// "pagenavi" block of a gallery page.
        /// </summary>
        /// <param name="pageHtml">HTML of a gallery page.</param>
        /// <returns>
        /// 0 when <paramref name="pageHtml"/> is null or empty; 1 when the navigation
        /// block is missing or its text is not a positive integer; otherwise the parsed count.
        /// </returns>
        private int ReadPageCount(string pageHtml)
        {
            if (string.IsNullOrEmpty(pageHtml))
            {
                return 0;
            }

            dc.LoadHtml(pageHtml);
            var navBlocks = dc.DocumentNode.SelectNodes("//div[@class='pagenavi']");
            if (navBlocks == null)
            {
                return 1;
            }

            var links = navBlocks.Descendants("a").ToArray();
            int count;
            if (links.Length >= 2
                && int.TryParse(links[links.Length - 2].InnerText.Trim(), out count)
                && count > 0)
            {
                return count;
            }

            return 1;
        }

        /// <summary>
        /// Extracts the title and the main image URL from one gallery page and
        /// saves the image under a file name derived from the title.
        /// </summary>
        /// <param name="html">HTML of one gallery page.</param>
        /// <exception cref="InvalidOperationException">
        /// The HTML is empty, or the title / main image element is missing.
        /// </exception>
        private void downloadImage(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                throw new InvalidOperationException("The page HTML is empty.");
            }

            dc.LoadHtml(html);
            var titleNode = dc.DocumentNode.SelectSingleNode("//h2[@class='main-title']");
            var imgNode = dc.DocumentNode.SelectSingleNode("//div[@class='main-image']/p/a/img");
            if (titleNode == null || imgNode == null)
            {
                throw new InvalidOperationException("The title or the main image was not found in the page.");
            }

            var imgUrl = imgNode.GetAttributeValue("src", "");
            if (imgUrl.Length == 0)
            {
                throw new InvalidOperationException("The main image has no src attribute.");
            }

            var title = SanitizeFileName(titleNode.InnerText);
            var byteImage = GetBytesFromUrl(imgUrl);
            WriteBytesToFile(title, byteImage);
        }

        /// <summary>
        /// Turns a page title into a usable file name: removes '?' and '!' (as the
        /// original code did) plus every character the platform forbids in file names.
        /// </summary>
        /// <param name="title">Raw title text.</param>
        /// <returns>The cleaned name, or "untitled" when nothing is left.</returns>
        private static string SanitizeFileName(string title)
        {
            var cleaned = title.Replace("?", "").Replace("!", "");
            foreach (var invalid in Path.GetInvalidFileNameChars())
            {
                cleaned = cleaned.Replace(invalid.ToString(), "");
            }

            cleaned = cleaned.Trim();
            return cleaned.Length == 0 ? "untitled" : cleaned;
        }

        /// <summary>
        /// Downloads the complete response body of <paramref name="url"/>.
        /// The request's Referer header is set to the same URL.
        /// </summary>
        /// <param name="url">Address of the resource to download.</param>
        /// <returns>All bytes of the response body.</returns>
        public byte[] GetBytesFromUrl(string url)
        {
            HttpWebRequest myReq = (HttpWebRequest)WebRequest.Create(url);
            myReq.Referer = url;

            using (WebResponse myResp = myReq.GetResponse())
            using (Stream stream = myResp.GetResponseStream())
            using (MemoryStream buffer = new MemoryStream())
            {
                // Copy until end of stream; a fixed-size read truncated larger images.
                stream.CopyTo(buffer);
                return buffer.ToArray();
            }
        }

        /// <summary>
        /// Writes <paramref name="content"/> to "&lt;rootPath&gt;/&lt;fileName&gt;.jpg",
        /// overwriting an existing file, and prints the target path to the console.
        /// </summary>
        /// <param name="fileName">File name without extension.</param>
        /// <param name="content">Bytes to write.</param>
        public void WriteBytesToFile(string fileName, byte[] content)
        {
            // A null rootPath is treated as the current directory.
            var path = Path.Combine(this.rootPath ?? "", fileName + ".jpg");
            Console.WriteLine(path);
            File.WriteAllBytes(path, content);
        }

        /// <summary>
        /// Makes sure the directory <paramref name="url"/> exists, creating it when necessary.
        /// </summary>
        /// <param name="url">Directory path (despite the parameter name, not a URL).</param>
        /// <returns>true when the directory exists or was created; false when creation failed.</returns>
        public static bool checkDir(string url)
        {
            try
            {
                // Create the folder if it does not exist yet.
                if (!Directory.Exists(url))
                {
                    Directory.CreateDirectory(url);
                }

                return true;
            }
            catch (Exception)
            {
                return false;
            }
        }

        /// <summary>
        /// Collects the href of every link found inside the div with class "all".
        /// </summary>
        /// <param name="allHtml">HTML of the index page.</param>
        /// <returns>
        /// The link targets in document order; an empty list when the HTML is empty
        /// or the container is missing. Links without an href are skipped.
        /// </returns>
        private List<string> GetALlUrl(string allHtml)
        {
            List<string> urlList = new List<string>();
            if (string.IsNullOrEmpty(allHtml))
            {
                return urlList;
            }

            dc.LoadHtml(allHtml);
            var all = dc.DocumentNode.SelectNodes("//div[@class='all']");
            if (all == null)
            {
                return urlList;
            }

            foreach (var link in all.Descendants("a"))
            {
                var href = link.GetAttributeValue("href", "");
                if (href.Length > 0)
                {
                    urlList.Add(href);
                }
            }

            return urlList;
        }

        /// <summary>
        /// Performs a GET request through HttpHelper and returns the response body as text.
        /// </summary>
        /// <param name="url">Address of the page to fetch.</param>
        /// <returns>The Html value of the HttpHelper result.</returns>
        public string GetHtml(string url)
        {
            HttpHelper http = new HttpHelper();
            HttpItem item = new HttpItem()
            {
                URL = url,
                Method = "GET",
                Timeout = 100000,
                ReadWriteTimeout = 30000,
                IsToLower = false,
                Cookie = "",
                // Browser type, version and operating system reported to the server (optional; has a default).
                UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0",
                Accept = "text/html, application/xhtml+xml, */*",
                ContentType = "text/html",
                Referer = "",
                Allowautoredirect = false,
                Postdata = "",
                ResultType = ResultType.String,
            };
            HttpResult result = http.GetHtml(item);
            return result.Html;
        }
    }
}

免费评分

参与人数 4吾爱币 +8 热心值 +2 收起 理由
liu8359 + 1 我很赞同!
苏紫方璇 + 5 + 1 欢迎分析讨论交流,吾爱破解论坛有你更精彩!
wyb850 + 1 + 1 用心讨论,共获提升!
Bizhi-1024 + 1 用心讨论,共获提升!

查看全部评分

发帖前要善用论坛搜索功能,那里可能会有你要找的答案或者已经有人发布过相同内容了,请勿重复发帖。

HuskyHappy 发表于 2021-1-22 16:25
之前看Python有Beautiful Soup,一直很羡慕。前两天想起来C#也得有类似的,一搜果然有:HtmlAgilityPack
xu低调 发表于 2020-12-28 20:57
之前看Python有Beautiful Soup,一直很羡慕。前两天想起来C#也得有类似的,一搜果然有:HtmlAgilityPack
老狗丶 发表于 2020-8-12 11:08
bsjasd 发表于 2020-8-12 11:16
学习学习,支持共享
Light紫星 发表于 2020-8-12 11:18
拿了网址走人
chuwenjie161 发表于 2020-8-12 11:20
学习学习,谢谢
迷茫管家 发表于 2020-8-12 11:22
学习学习
零度的轻吻 发表于 2020-8-12 11:31
这个不错,可以好好学学
FIzz001 发表于 2020-8-12 11:44
拿了网址就跑
lamjiarong 发表于 2020-8-12 16:27
remark,谢谢楼主无私的分享!!!谢谢!
al747 发表于 2020-8-12 23:29
不错,学习了!!!!!!!
您需要登录后才可以回帖 登录 | 注册[Register]

本版积分规则 警告:本版块禁止灌水或回复与主题无关内容,违者重罚!

快速回复 收藏帖子 返回列表 搜索

RSS订阅|小黑屋|处罚记录|联系我们|吾爱破解 - LCG - LSG ( 京ICP备16042023号 | 京公网安备 11010502030087号 )

GMT+8, 2024-5-5 05:07

Powered by Discuz!

Copyright © 2001-2020, Tencent Cloud.

快速回复 返回顶部 返回列表