# https://www.biquge.info/wanjiexiaoshuo/ -- full-novel crawler for the Biquge site.
import os
import random
import time
import webbrowser
from urllib.parse import urljoin

import requests
from lxml import etree

# Request headers sent with every HTTP call.
header = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/89.0.4389.128 Safari/537.36 Edg/89.0.774.77"
    )
}

# Characters that strZ() replaces with a space before a title is used as a
# file or directory name:  # / \ : * ? " < > |
noName = ['#', '/', '\\', ':', '*', '?', '"', '<', '>', '|']

# Root directory that receives one sub-directory per novel.
filePath = './保存小说'

# Seconds to wait for any single HTTP request before giving up on it.
REQUEST_TIMEOUT = 15


def strZ(_str):
    """Return ``_str`` with every character listed in ``noName`` replaced by a space."""
    return ''.join(' ' if ch in noName else ch for ch in _str)


def _fetch_tree(url):
    """Download ``url`` and return it parsed as an lxml HTML tree.

    Returns ``None`` (after printing a notice) when the request raises, the
    status code is not 200, or the body is empty, so callers can skip the
    page instead of aborting the whole crawl.
    """
    try:
        resp = requests.get(url=url, headers=header, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print('请求失败, 已跳过: %s (%s)' % (url, exc))
        return None
    if resp.status_code != 200:
        print('请求失败, 已跳过: %s (HTTP %s)' % (url, resp.status_code))
        return None
    resp.encoding = 'utf-8'
    if not resp.text.strip():
        return None
    return etree.HTML(resp.text)


def _download_book(index_url, index_tree, book_dir):
    """Save every chapter linked from a novel's index page into ``book_dir``.

    ``index_url`` is the address of the index page and is used to resolve
    the chapter links; ``index_tree`` is that page already parsed. Each
    chapter is written as ``<chapter title>.txt`` (UTF-8), one text node of
    the ``div#content`` element per line.
    """
    os.makedirs(book_dir, exist_ok=True)
    for entry in index_tree.xpath('//div[@class="box_con"]/div[@id="list"]/dl/dd'):
        hrefs = entry.xpath('./a/@href')
        titles = entry.xpath('./a/@title')
        if not hrefs or not titles:
            # A <dd> without a usable link is not a chapter entry.
            continue
        # urljoin handles relative, root-relative and absolute hrefs alike.
        chapter_url = urljoin(index_url, hrefs[0])
        chapter_path = book_dir + '/' + strZ(titles[0]) + '.txt'
        chapter_tree = _fetch_tree(chapter_url)
        if chapter_tree is not None:
            paragraphs = chapter_tree.xpath('//div[@id="content"]/text()')
            with open(chapter_path, 'w', encoding='utf-8') as fp:
                fp.write(''.join(text + '\n' for text in paragraphs))
            print('%s成功下载到本地' % (chapter_path))
        # Short random pause between chapter requests.
        time.sleep(random.uniform(0.05, 0.2))


def main():
    """Open the site in a browser, ask for the crawl mode and run it."""
    webbrowser.open('https://www.biquwx.la/')
    os.makedirs(filePath, exist_ok=True)
    print('1.爬取指定小说')
    print('2.爬取整个站点')
    if input('使用哪种方式爬取小说? ') == '1':
        appintDown()
    else:
        allDown()
    input("按下任意键退出")


def appintDown():
    """Crawl a single novel whose index-page URL is typed in by the user.

    The URL is expected to be the novel's chapter-index page; nothing is
    downloaded when that page cannot be fetched or has no title heading.
    """
    page_url = input('输入要爬取的小说网站(例如 https://www.biquwx.la/10_10240/) : ').strip()
    page_tree = _fetch_tree(page_url)
    if page_tree is None:
        return
    headings = page_tree.xpath('//div[@id="info"]/h1/text()')
    if not headings:
        print('未找到小说标题, 已跳过: %s' % page_url)
        return
    # Sanitise the title the same way allDown() does before using it as a
    # directory name.
    _download_book(page_url, page_tree, filePath + '/' + strZ(headings[0]))


def allDown():
    """Crawl every novel listed under the site's paginated catalogue."""
    url = 'https://www.biquge.info/wanjiexiaoshuo/'  # catalogue root
    tree = _fetch_tree(url)
    if tree is None:
        return
    last_links = tree.xpath('//div[@class="pagelink"]/a[@class="last"]/text()')
    try:
        page_last = int(last_links[0])
    except (IndexError, ValueError):
        print('未找到总页数, 已跳过: %s' % url)
        return
    # +1 so the final listing page is crawled too (range() excludes its end).
    for page_i in range(1, page_last + 1):
        list_url = 'https://www.biquge.info/wanjiexiaoshuo/' + str(page_i)
        list_tree = _fetch_tree(list_url)
        if list_tree is None:
            continue
        for li in list_tree.xpath('//div[@class="novelslistss"]/ul/li'):
            hrefs = li.xpath('./span[@class="s2"]/a/@href')  # index-page link
            names = li.xpath('./span[@class="s2"]/a/text()')
            if not hrefs or not names:
                continue
            book_url = urljoin(list_url, hrefs[0])
            book_tree = _fetch_tree(book_url)
            if book_tree is None:
                continue
            _download_book(book_url, book_tree, filePath + '/' + strZ(names[0]))


if __name__ == '__main__':
    main()
python教程
笔趣阁小说网Python爬虫分享
- python爬虫
-
Python好看视频地址解析下载代码
#encoding:utf-8# 好看视频下载 import socketfrom urllib.request import urlopenimport urllibimport reimport timefrom pyquery import PyQuery as pqimport requestsfrom tqdm import tqdm # 打印进度条的库import gzip print('程序开始运...
-
python美女写真图库爬虫
import requestsfrom lxml import etreeimport csvfrom time import sleepimport osfrom concurrent.futures import ThreadPoolExecutor headers = { 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit...
-
笔趣阁小说网Python爬虫分享
#[url=https://www.biquge.info/wanjiexiaoshuo/]https://www.biquge.info/wanjiexiaoshuo/[/url] 笔趣阁小说全本爬虫import timeimport requestsimport osimport randomfrom lxml import etreeimport webbrowserheader = { "User-Agent": "Mo...
-
Python爬取站长之家端口扫描接口
import requests,timeimport randomfrom bs4 import BeautifulSoupimport reimport threadingdef ports(hostm,port): url = 'http://tool.chinaz.com/port/' headers = { 'User-Agent':'Mozilla/5.0 (Windows NT ...
-
python爬虫下载抖音用户所有短视频+无水印方法
这次分享下载抖音用户所有短视频方法,python爬虫批量抓取,无水印下载,希望和大家多多交流互相学习!获取用户链接方法1、首先在抖音上随机挑选一个小姐姐,用户主页右上角点开,获取分享链接python下载抖音视频截图得到类似分享链接:在抖音,记录美好生活! https:...
-
01Python获取彩云天气实时天气API源码 3个月前
-
02基于Django的RustDesk Api&Web Server源码分享 3个月前
-
03批量修改照片文件大小Python脚本 4个月前
-
04文本段落自动分隔格式化Python脚本 5个月前
-
05利用ffmpeg提取视频第一帧保存成图片 5个月前
-
01123网盘解析下载python脚本 667热度
-
02Python自动下载歌曲宝音乐和歌词脚本 432热度
-
03Python和彩云自动签到云函数脚本分享 389热度
-
04Python无需认证QQ扫码登录脚本 362热度
-
05基于百度API文字转语音Python示例代码 316热度