爬虫学习【2】知轩藏书小说下载

Fullmoonbaka · 发表于 2021-7-21 11:18

知轩藏书小说下载爬虫
本爬虫使用 requests 请求网页，并用 XPath 解析网页的 DOM 树，从中采集关键信息。

爬虫功能:
1. 搜索关键字, 并且有翻页功能
2. 获取首页内容并且进行解析, 将解析出的内容进行分类
3. 获取分类列表, 并且有翻页功能
4. 查看小说详情功能
5. 下载功能 [待完善]: 目前通过唤起默认浏览器进行下载

功能图片展示
1. 入口

入口

2. 首页

首页

3. 分类页

分类

4. 详情

详情

5. 搜索

搜索

6. 搜索结果

搜索结果

可优化点:
1. 每个请求里的 headers 其实都一样, 但是重复写了很多次, 可以提取成全局 headers (原因是我以为有的网页会不一样, 结果全都一样)
2. 首页->最新小说->更多这个页面为单独页面, 但是只采集了小说项 (除此之外还有分类项可以采集)
3. 下载方式可以从唤起浏览器下载改为直接在控制台中下载 (会导致一些问题, 还没有解决方案)
4. 搜索结果页和分类页代码重复率太高 (待优化)

欢迎各位感兴趣的朋友一起交流学习，欢迎大佬指出问题

[Python] 纯文本查看 复制代码

001

002

003

004

005

006

007

008

009

010

011

012

013

014

015

016

017

018

019

020

021

022

023

024

025

026

027

028

029

030

031

032

033

034

035

036

037

038

039

040

041

042

043

044

045

046

047

048

049

050

051

052

053

054

055

056

057

058

059

060

061

062

063

064

065

066

067

068

069

070

071

072

073

074

075

076

077

078

079

080

081

082

083

084

085

086

087

088

089

090

091

092

093

094

095

096

097

098

099

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

import requests
import os
from lxml import etree
 
dom = None  # raw text of the home page; assigned by load_page()
# NOTE(review): parsing an empty document may yield None (or raise) depending
# on the lxml version — confirm before relying on this placeholder.
html = etree.HTML("", etree.HTMLParser())  # parsed tree of the home page; replaced by load_page()
 
 
# Clear the screen.
def clear():
    """Clear the console before the next menu is drawn.

    A run of blank lines is printed first, then the platform's clear command
    is executed: ``cls`` on Windows, ``clear`` everywhere else.
    """
    print('\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n')
    # 'cls' only exists on Windows; other platforms use 'clear'.
    os.system('cls' if os.name == 'nt' else 'clear')
 
 
# Convert to int; return -1 when the conversion fails.
def be_int(num):
    """Convert *num* to an int, returning -1 when it cannot be converted.

    The menus compare the result with ``0 <= value``, so -1 doubles as the
    "not a valid choice" marker. Note that the input "-1" is therefore
    indistinguishable from a failed conversion.

    Args:
        num: Any value accepted by ``int()``, typically the user's input text.

    Returns:
        The converted integer, or -1 on failure.
    """
    try:
        return int(num)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are swallowed; anything else (for example
        # KeyboardInterrupt) must still propagate.
        return -1
 
 
# Load the home page.
def load_page():
    """Fetch the site's home page and cache it in the module globals.

    On an HTTP 200 response the body text is stored in ``dom`` and the parsed
    element tree in ``html``.

    Returns:
        True when the page was loaded; False when the request failed or the
        server answered with a status other than 200.
    """
    print('开始读取...')
    global dom, html
    url = "https://www.zxcs.info/"
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36"
    }
    try:
        # A timeout keeps the console from hanging forever on a dead link.
        h = requests.get(url=url, headers=headers, timeout=10)
    except requests.RequestException as err:
        print('请求出错:{}'.format(err))
        return False
    h.encoding = 'utf-8'
    if h.status_code != 200:
        # Report the status of the response itself; ``html`` is the parsed
        # tree and has no ``status_code`` attribute.
        print('请求出错,错误代码:{}'.format(str(h.status_code)))
        return False
    dom = h.text
    html = etree.HTML(dom, etree.HTMLParser())
    print('读取完成!')
    return True
 
 
# Start-up menu.
def loading_menu():
    """Show the start-up menu until the user quits or the home page is loaded.

    Commands: ``q`` quits, ``s`` opens the search prompt, ``0`` downloads the
    home page, ``cls`` just redraws the menu.

    Returns:
        True once the home page has been loaded, False when the user quits.
    """
    while True:
        clear()
        print("####################")
        print("## 小说下载器  v1.0 ##")
        print("####################")
        print("--------------------")
        print("[ q ] 退出")
        print("[ s ] 搜索")
        print("[ 0 ] 获取线上页面")
        print("--------------------")
        command = input("请输入: ")
        clear()
        if command == 'cls':
            print("")
        elif command == 'q':
            print("退出")
            return False
        elif command == 's':
            search_book()
        elif command == '0':
            # load_page() signals failure with False; only continue to the
            # main menu when the page was actually loaded.
            if load_page() is not False:
                return True
            # Pause so the error message is read before the screen is cleared.
            input("页面加载失败, 按回车键返回")
        else:
            input("请输入正确的指令")
 
 
# Search prompt.
def search_book():
    """Ask for a book title repeatedly and show the results for each query.

    Entering ``b`` leaves the prompt; an empty title is rejected with a
    message and the prompt is shown again.

    Returns:
        False when the user enters ``b``.
    """
    banner = (
        "--------------------",
        "[ b ] 返回",
        " 搜索请输入书名 ",
        "--------------------",
    )
    while True:
        clear()
        for line in banner:
            print(line)
        keyword = input("请输入书名: ").strip()
        if keyword == 'b':
            return False
        if not keyword:
            input("书名不能为空!")
            continue
        search_menu(keyword)
 
 
# Main menu.
def main_menu():
    """List the home page's section titles and open the one the user picks.

    The titles are read from the cached home-page tree ``html``. The last
    title is never offered as a choice.
    NOTE(review): presumably the last title has no matching list section —
    confirm against the site markup.
    """
    titles = html.xpath('.//div[@class="wrap"]//div[@class="title"]/strong/text()')
    selectable = len(titles) - 1
    while True:
        clear()
        print("--------------------")
        print("[ q ] 退出")
        for number, title in enumerate(titles[:-1]):
            print("[ {} ] {}".format(number, title))
        print("--------------------")
        command = input("请输入: ")
        clear()
        if command == 'cls':
            print("")
            continue
        if command == 'q':
            print("再见~")
            return
        choice = be_int(command)
        if 0 <= choice < selectable:
            category_menu(choice)
        else:
            input("请输入正确的指令")
 
 
# Sub-list menu (one section of the home page).
def category_menu(index):
    """Show the books of one home-page section and open the chosen one.

    Args:
        index: Position of the section among the ``mlist`` blocks of the
            cached home-page tree ``html``.

    Commands: ``b`` goes back, ``m`` follows the section's "more" link when
    it has one, a number opens that book's detail page.
    """
    sections = html.xpath('.//div[@class="wrap"]/div[contains(@class,"mlist")]')
    if not 0 <= index < len(sections):
        # The caller derives the index from the title list, which may be
        # longer than the list of sections.
        input("该分类没有可显示的列表")
        return
    item_box = sections[index]
    # Read title and link from the same <a> element so the two lists can
    # never get out of step (an anchor may hold several text nodes).
    anchors = item_box.xpath('.//div[@class="box"]/ul/li/a[@href]')
    item_list = ["".join(anchor.xpath('.//text()')) for anchor in anchors]
    item_url_list = [anchor.get('href') for anchor in anchors]
    more_url_list = item_box.xpath('.//div[@class="title"]/a/@href')
    while True:
        clear()
        print("--------------------")
        print("[ b ] 返回")
        # Only offer "more" when the section has such a link.
        if more_url_list:
            print("[ m ] 更多")
        for i, title in enumerate(item_list):
            print("[ {} ] {}".format(i, title))
        print("--------------------")
        command = input("请输入: ")
        clear()
        choice = be_int(command)
        if command == 'cls':
            print("")
        elif command == 'b':
            return
        elif command == 'm' and more_url_list:
            # "map.html" is the "latest" page, which has its own layout.
            if more_url_list[0] == "map.html":
                new_menu(more_url_list[0])
            else:
                more_menu(more_url_list[0])
        elif 0 <= choice < len(item_list):
            detail_menu(item_url_list[choice])
        else:
            input("请输入正确的指令")
 
 
# "Latest" page.
def new_menu(url):
    """Show the site's "latest" list and open the book the user picks.

    Args:
        url: Accepted for interface compatibility; the page address is fixed
            to the site's ``map.html`` and this argument is not used.

    Returns:
        None. The function returns early when the page cannot be fetched.
    """
    print('开始读取...')
    u = "https://www.zxcs.info/map.html"
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36"
    }
    try:
        h = requests.get(url=u, headers=headers, timeout=10)
    except requests.RequestException as err:
        print('请求出错:{}'.format(err))
        return
    h.encoding = 'utf-8'
    if h.status_code != 200:
        # Use the response's own status; the global ``html`` is a parsed tree.
        print('请求出错,错误代码:{}'.format(str(h.status_code)))
        return
    new_dom = h.text

    new_html = etree.HTML(new_dom, etree.HTMLParser())
    # Keep only list items that carry both a title and a link, so that every
    # displayed number can be opened.
    content_list = []
    if new_html is not None:
        for item in new_html.xpath('.//div[@class="wrap"]/div[@id="content"]/ul/li'):
            titles = item.xpath('.//a/text()')
            hrefs = item.xpath('.//a/@href')
            if titles and hrefs:
                content_list.append((titles[0], hrefs[0]))
    print('读取完成!')
    while True:
        clear()
        print("--------------------")
        print("[ b ] 返回")
        print("-------最新列表-------")
        for i, (title, _) in enumerate(content_list):
            print("[ {} ] {}".format(i, title))
        print("--------------------")
        command = input("请输入: ")
        clear()
        choice = be_int(command)
        if command == 'cls':
            print("")
        elif command == 'b':
            return
        elif 0 <= choice < len(content_list):
            detail_menu(content_list[choice][1])
        else:
            input("请输入正确的指令")
 
 
# "More" page (paginated listing of one section).
def more_menu(url):
    """Browse the paginated listing behind a section's "more" link.

    Args:
        url: Base address of the listing; ``/page/<n>`` is appended to it for
            each page that is requested.

    Commands: ``b`` goes back, ``p<n>`` jumps to page n, a number opens that
    book's detail page.

    Returns:
        None when the user goes back, False when a page cannot be fetched.
    """
    print('开始读取...')
    page = 1  # the listing starts on page 1

    def more_page(url, page):
        """Return the HTML text of one listing page, or False on failure."""
        headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36"
        }
        u = url + "/page/{}".format(str(page))
        try:
            h = requests.get(url=u, headers=headers, timeout=10)
        except requests.RequestException as err:
            print('请求出错:{}'.format(err))
            return False
        h.encoding = 'utf-8'
        if h.status_code == 200:
            return h.text
        # Use the response's own status; the global ``html`` is a parsed tree.
        print('请求出错,错误代码:{}'.format(str(h.status_code)))
        return False

    def parse_entries(page_text):
        """Return (title, href) pairs for the books listed in *page_text*."""
        tree = etree.HTML(page_text, etree.HTMLParser())
        entries = []
        if tree is None:
            return entries
        for node in tree.xpath('.//dl[@id="plist"]'):
            titles = node.xpath('.//dt/a/text()')
            hrefs = node.xpath('.//dt/a/@href')
            # Skip entries without a title or link so every number shown on
            # screen can actually be opened.
            if titles and hrefs:
                entries.append((titles[0], hrefs[0]))
        return entries

    more_dom = more_page(url, page)
    if not more_dom:
        return False
    content_list = parse_entries(more_dom)
    print('读取完成!')
    while True:
        clear()
        print("--------------------")
        print("[ b ] 返回")
        for i, (title, _) in enumerate(content_list):
            print("[ {} ] {}".format(i, title))
        if not content_list:
            print("没有更多内容了")
        print("--------------------")
        print("[ p ] p+页码进行翻页 例: p1")
        print("-------page {}-------".format(str(page)))
        command = input("请输入: ")
        clear()
        choice = be_int(command)
        target_page = be_int(command[1:]) if command.startswith('p') else -1
        if command == 'cls':
            print("")
        elif command == 'b':
            return
        elif 0 <= choice < len(content_list):
            detail_menu(content_list[choice][1])
        elif target_page > 0:
            print("正在加载第 {} 页...".format(target_page))
            more_dom = more_page(url, target_page)
            if not more_dom:
                return False
            # Only switch pages once the new page has really been fetched.
            page = target_page
            content_list = parse_entries(more_dom)
        else:
            input("请输入正确的指令")
 
 
# Search results page.
def search_menu(keyword):
    """Browse the paginated search results for *keyword*.

    Args:
        keyword: Text sent to the site's search as the ``keyword`` parameter.

    Commands: ``b`` goes back, ``p<n>`` jumps to page n, a number opens that
    book's detail page.

    Returns:
        None when the user goes back, False when a page cannot be fetched.
    """
    print('开始读取...')
    page = 1  # results start on page 1

    def search_page(keyword, page):
        """Return the HTML text of one result page, or False on failure."""
        headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36"
        }
        params = {
            "keyword": keyword,
            "page": page
        }
        url = "https://www.zxcs.info/index.php"
        try:
            h = requests.get(url=url, headers=headers, params=params, timeout=10)
        except requests.RequestException as err:
            print('请求出错:{}'.format(err))
            return False
        h.encoding = 'utf-8'
        if h.status_code == 200:
            return h.text
        # Use the response's own status; the global ``html`` is a parsed tree.
        print('请求出错,错误代码:{}'.format(str(h.status_code)))
        return False

    def parse_entries(page_text):
        """Return (title, href) pairs for the books listed in *page_text*."""
        tree = etree.HTML(page_text, etree.HTMLParser())
        entries = []
        if tree is None:
            return entries
        for node in tree.xpath('.//dl[@id="plist"]'):
            titles = node.xpath('.//dt/a/text()')
            hrefs = node.xpath('.//dt/a/@href')
            # Skip entries without a title or link so every number shown on
            # screen can actually be opened.
            if titles and hrefs:
                entries.append((titles[0], hrefs[0]))
        return entries

    search_dom = search_page(keyword, page)
    if not search_dom:
        return False
    content_list = parse_entries(search_dom)
    print('读取完成!')
    while True:
        clear()
        print("--------------------")
        print("[ b ] 返回")
        for i, (title, _) in enumerate(content_list):
            print("[ {} ] {}".format(i, title))
        if not content_list:
            print("没有更多内容了")
        print("--------------------")
        print("[ p ] p+页码进行翻页 例: p1")
        print("-------page {}-------".format(str(page)))
        command = input("请输入: ")
        clear()
        choice = be_int(command)
        target_page = be_int(command[1:]) if command.startswith('p') else -1
        if command == 'cls':
            print("")
        elif command == 'b':
            return
        elif 0 <= choice < len(content_list):
            detail_menu(content_list[choice][1])
        elif target_page > 0:
            print("正在加载第 {} 页...".format(target_page))
            search_dom = search_page(keyword, target_page)
            if not search_dom:
                return False
            # Only switch pages once the new page has really been fetched.
            page = target_page
            content_list = parse_entries(search_dom)
        else:
            input("请输入正确的指令")
 
 
# Book detail page.
def detail_menu(url):
    """Show one book's detail page and offer to open its download page.

    Args:
        url: Address of the detail page. A path starting with ``/post`` is
            prefixed with the site's origin.

    Commands: ``b`` goes back, ``d`` opens the download menu.

    Returns:
        None when the user goes back, False when the page cannot be fetched.
    """
    print("正在加载详情...")
    book_url = url
    if book_url.startswith('/post'):
        book_url = "https://www.zxcs.info" + url
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36"
    }
    try:
        h = requests.get(url=book_url, headers=headers, timeout=10)
    except requests.RequestException as err:
        print('请求出错:{}'.format(err))
        return False
    h.encoding = 'utf-8'
    if h.status_code != 200:
        # Use the response's own status; the global ``html`` is a parsed tree.
        print('请求出错,错误代码:{}'.format(str(h.status_code)))
        return False
    detail_dom = h.text
    detail_html = etree.HTML(detail_dom, etree.HTMLParser())
    detail = detail_html.xpath('.//div[@id="content"]/div[2]/p[2]/text()')
    title = "".join(detail_html.xpath('.//h1/text()')).strip()
    # The first text node holds the size after a 9-character prefix.
    # NOTE(review): presumably a fixed label such as the size caption —
    # confirm against the page markup. Guarded so an unexpected layout shows
    # an empty size instead of raising IndexError.
    size = detail[0][9:] if detail else ""
    # Text nodes from the third one on form the description; blanks dropped.
    description = [line.strip() for line in detail[2:] if line.strip() != ""]
    down_links = detail_html.xpath('.//div[@class="pagefujian"]/div[@class="down_2"]/a[@title="点击下载"]/@href')
    while True:
        clear()
        print("--------------------")
        print("[ b ] 返回")
        print("[ d ] 下载")
        print("[ 标题 ] {}".format(title))
        print("[ 大小 ] {}".format(size))
        print("[ 详情 ]")
        for line in description:
            print(line)
        print("--------------------")
        command = input("请输入: ")
        clear()
        if command == 'cls':
            print("")
        elif command == 'b':
            return
        elif command == 'd':
            if down_links:
                download_menu(down_links[0])
            else:
                # No download link on this page: tell the user, don't crash.
                input("未找到下载链接")
        else:
            input("请输入正确的指令")
 
 
# Download page.
def download_menu(url):
    """List a book's download links and open the chosen one in the browser.

    Args:
        url: Address of the download page. A path starting with
            ``/download`` is prefixed with the site's origin.

    Commands: ``b`` goes back, a number opens that link in a new tab of the
    default web browser and then returns.

    Returns:
        None normally, False when the page cannot be fetched.
    """
    # Imported here so the function works without touching the file's
    # top-level imports.
    import webbrowser

    print("正在加载下载页...")
    down_url = url
    if down_url.startswith('/download'):
        down_url = "https://www.zxcs.info" + url
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36"
    }
    try:
        h = requests.get(url=down_url, headers=headers, timeout=10)
    except requests.RequestException as err:
        print('请求出错:{}'.format(err))
        return False
    h.encoding = 'utf-8'
    if h.status_code != 200:
        # Use the response's own status; the global ``html`` is a parsed tree.
        print('请求出错,错误代码:{}'.format(str(h.status_code)))
        return False
    down_dom = h.text
    down_html = etree.HTML(down_dom, etree.HTMLParser())
    # Pair each link's label with its address; entries without an address
    # are skipped so every number shown on screen can be opened.
    down_list = []
    if down_html is not None:
        for node in down_html.xpath('.//div[@class="panel"]/div[@class="panel-body"]/span[@class="downfile"]'):
            hrefs = node.xpath('.//a/@href')
            if hrefs:
                down_list.append(("".join(node.xpath('.//a/text()')), hrefs[0]))
    while True:
        clear()
        print("--------------------")
        print("[ b ] 返回")
        for i, (label, _) in enumerate(down_list):
            # Print the label text itself rather than a list representation.
            print("[ {} ] {}".format(i, label))
        print("--------------------")
        command = input("请输入: ")
        clear()
        choice = be_int(command)
        if command == 'cls':
            print("")
        elif command == 'b':
            return
        elif 0 <= choice < len(down_list):
            # The address comes from a scraped page, so it is handed to the
            # browser directly instead of being interpolated into a shell
            # command (which was Windows-only and broke on '&').
            webbrowser.open_new_tab(down_list[choice][1])
            return
        else:
            input("请输入正确的指令")
 
 
# Entry point: the main menu is shown only after the start-up menu reports
# that the home page has been loaded.
if __name__ == '__main__' and loading_menu():
    main_menu()

hshcompass · 发表于 2021-9-18 22:55

好站，免费下载，别爬了。

wsliangj · 发表于 2021-7-21 11:46

这个可以有，期待中

老婆是加藤惠 · 发表于 2021-7-21 11:31

爬取盗版下盗版

西岭千秋雪 · 发表于 2021-7-21 12:12

感谢分享

不羁的阳光 · 发表于 2021-7-21 12:19

不错啊，条理清晰

yuehanoo · 发表于 2021-7-21 12:39

下来试试，谢谢、

莫影轩 · 发表于 2021-7-21 12:47

牛牛牛，谢谢大佬

qingshi · 发表于 2021-7-21 13:31

可以呀老哥。

lyx98 · 发表于 2021-7-21 13:32

哈哈，爬盗版小说练手爬虫最舒服了！

student123 · 发表于 2021-7-21 13:33

沉得住气，耐得住寂寞

帐号		自动登录	找回密码
密码			注册[Register]

[Python 转载] 爬虫学习【2】知轩藏书小说下载

免费评分

本帖被以下淘专辑推荐:

浏览过的版块