欢迎光临 - 我的站长站,本站所有资源仅供学习与参考,禁止用于商业用途或从事违法行为!

python教程

笔趣阁小说网Python爬虫分享

python教程 我的站长站 2021-08-15 共168人阅读
# https://www.biquge.info/wanjiexiaoshuo/   笔趣阁小说全本爬虫
import time
import requests
import os
import random
from lxml import etree
import webbrowser
# HTTP headers passed as ``headers=`` to the requests.get calls in this file;
# only a desktop-browser User-Agent string is set.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36 Edg/89.0.774.77"
}
# Characters that strZ replaces with a space before text is used as a file or
# directory name: '#' plus the characters Windows rejects in file names
# (\ / : * ? " < > |). The backslash entry had been lost, leaving an empty
# string that can never match a single character.
noName = ['#', '/', '\\', ':', '*', '?', '"', '<', '>', '|']
# Root directory under which downloaded novels are stored.
filePath = './保存小说'
def strZ(_str):
    """Return ``_str`` with every character found in ``noName`` replaced by a space.

    Characters that are not listed in ``noName`` are copied through unchanged,
    so the result always has the same length as the input.
    """
    return ''.join(' ' if ch in noName else ch for ch in _str)
def main():
    """Interactive entry point of the crawler.

    Opens https://www.biquwx.la/ through ``webbrowser``, creates the save
    directory ``filePath`` if it is missing, prints the two-item menu and
    dispatches on the answer: ``'1'`` runs ``appintDown``; any other answer
    runs ``allDown``. Finally blocks on ``input`` so the console stays open.
    """
    webbrowser.open('https://www.biquwx.la/')

    save_dir_missing = not os.path.exists(filePath)
    if save_dir_missing:
        os.mkdir(filePath)

    for menu_line in ('1.爬取指定小说', '2.爬取整个站点'):
        print(menu_line)

    choice = input('使用哪种方式爬取小说?  ')
    if choice != '1':
        allDown()
    else:
        appintDown()

    input("按下任意键退出")
def appintDown():
    """Download every chapter of one novel whose index-page URL the user types in.

    Prompts for the URL, creates ``filePath/<sanitized novel title>`` and
    writes each chapter linked from the index page to its own UTF-8 ``.txt``
    file in that directory. The function returns without downloading anything
    when the index page does not answer with HTTP 200 or carries no title
    heading. Chapter entries without a titled link, and chapter pages that do
    not answer with HTTP 200, are skipped.

    Raises:
        requests.RequestException: on a network failure or when the server
            sends no data within the timeout.
    """
    # Function-scope import so the block does not depend on the file header.
    from urllib.parse import urljoin

    timeout = 10  # seconds; without it a stalled connection would hang forever

    page_url = input('输入要爬取的小说网站(例如 https://www.biquwx.la/10_10240/) :  ').strip()
    page = requests.get(url=page_url, headers=header, timeout=timeout)
    if page.status_code != 200:  # only crawl when the index page responds OK
        return
    page.encoding = 'utf-8'
    page_tree = etree.HTML(page.text)

    titles = page_tree.xpath('//div[@id="info"]/h1/text()')
    if not titles:
        # Not a novel index page (or the layout changed): nothing to name the
        # output directory after, so stop instead of raising IndexError.
        print('未找到小说标题,请检查网址是否正确')
        return

    # Sanitize the title the same way allDown does, so characters such as
    # '/' or ':' cannot break the directory name.
    _filePath = filePath + '/' + strZ(titles[0])
    os.makedirs(_filePath, exist_ok=True)

    for chapter in page_tree.xpath('//div[@class="box_con"]/div[@id="list"]/dl/dd'):
        hrefs = chapter.xpath('./a/@href')
        names = chapter.xpath('./a/@title')
        if not hrefs or not names:
            continue  # <dd> without a titled chapter link
        # Resolve against the final index URL so relative and absolute hrefs
        # both work, including after a redirect.
        chapter_url = urljoin(page.url, hrefs[0])
        chapter_file = _filePath + '/' + strZ(names[0]) + '.txt'

        chapter_page = requests.get(chapter_url, headers=header, timeout=timeout)
        if chapter_page.status_code != 200:
            continue
        chapter_page.encoding = 'utf-8'
        chapter_tree = etree.HTML(chapter_page.text)
        paragraphs = chapter_tree.xpath('//div[@id="content"]/text()')
        # Each text node becomes one line, terminated by a real newline
        # (the escape had been lost, leaving a literal 'n').
        fileContent = ''.join(text + '\n' for text in paragraphs)
        with open(chapter_file, 'w', encoding='utf-8') as fp:
            fp.write(fileContent)
        print('%s成功下载到本地' % (chapter_file))
        # Short random pause between chapters to avoid hammering the server.
        time.sleep(random.uniform(0.05, 0.2))
def allDown():
    """Download every novel reachable from the paginated catalogue listing.

    Reads the number of listing pages from the pager's "last" link on
    https://www.biquge.info/wanjiexiaoshuo/ (falling back to a single page when
    that link is absent), then visits listing pages 1 through that number
    inclusive. For each novel entry it creates
    ``filePath/<sanitized novel title>`` and writes every chapter linked from
    the novel's index page to its own UTF-8 ``.txt`` file. Pages that do not
    answer with HTTP 200 and entries without the expected links are skipped.

    Raises:
        requests.RequestException: on a network failure or when the server
            sends no data within the timeout.
        ValueError: if the text of the pager's "last" link is not an integer.
    """
    # Function-scope import so the block does not depend on the file header.
    from urllib.parse import urljoin

    timeout = 10  # seconds; without it a stalled connection would hang forever
    catalogue_url = 'https://www.biquge.info/wanjiexiaoshuo/'  # catalogue

    first_page = requests.get(url=catalogue_url, headers=header, timeout=timeout)
    if first_page.status_code != 200:  # only crawl when the catalogue responds OK
        return
    first_page.encoding = 'utf-8'
    first_tree = etree.HTML(first_page.text)
    last_links = first_tree.xpath('//div[@class="pagelink"]/a[@class="last"]/text()')
    page_last = int(last_links[0]) if last_links else 1

    # range() excludes its stop value, so +1 is needed to visit the last page.
    for page_i in range(1, page_last + 1):
        list_page = requests.get(url=catalogue_url + str(page_i), headers=header, timeout=timeout)
        if list_page.status_code != 200:
            continue
        list_page.encoding = 'utf-8'
        list_tree = etree.HTML(list_page.text)

        for li in list_tree.xpath('//div[@class="novelslistss"]/ul/li'):
            novel_links = li.xpath('./span[@class="s2"]/a/@href')  # novel index link
            novel_names = li.xpath('./span[@class="s2"]/a/text()')
            if not novel_links or not novel_names:
                continue  # list item without a novel link
            page_url = urljoin(list_page.url, novel_links[0])
            page_title = strZ(novel_names[0])

            novel_page = requests.get(url=page_url, headers=header, timeout=timeout)
            if novel_page.status_code != 200:
                continue
            novel_page.encoding = 'utf-8'
            page_tree = etree.HTML(novel_page.text)
            _filePath = filePath + '/' + page_title
            os.makedirs(_filePath, exist_ok=True)

            for chapter in page_tree.xpath('//div[@class="box_con"]/div[@id="list"]/dl/dd'):
                hrefs = chapter.xpath('./a/@href')
                names = chapter.xpath('./a/@title')
                if not hrefs or not names:
                    continue  # <dd> without a titled chapter link
                # Resolve against the final novel-index URL so relative and
                # absolute hrefs both work.
                chapter_url = urljoin(novel_page.url, hrefs[0])
                chapter_file = _filePath + '/' + strZ(names[0]) + '.txt'

                chapter_page = requests.get(chapter_url, headers=header, timeout=timeout)
                if chapter_page.status_code != 200:
                    continue
                chapter_page.encoding = 'utf-8'
                chapter_tree = etree.HTML(chapter_page.text)
                paragraphs = chapter_tree.xpath('//div[@id="content"]/text()')
                # Each text node becomes one line, terminated by a real newline
                # (the escape had been lost, leaving a literal 'n').
                fileContent = ''.join(text + '\n' for text in paragraphs)
                with open(chapter_file, 'w', encoding='utf-8') as fp:
                    fp.write(fileContent)
                print('%s成功下载到本地' % (chapter_file))
                # Short random pause between chapters to avoid hammering the server.
                time.sleep(random.uniform(0.05, 0.2))
# Start the interactive crawler only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()


标签 python爬虫
相关推荐
  • python爬虫
  • Python好看视频地址解析下载代码

    #encoding:utf-8 # 好看视频下载 import socket from urllib.request import urlopen import urllib import re import time from pyquery import PyQuery as pq import requests from tqdm import tqdm # 打印进度条的库 import gzip print('程序开始运...

    python教程 138 3年前
  • python美女写真图库爬虫

    import requests from lxml import etree import csv from time import sleep import os from concurrent.futures import ThreadPoolExecutor headers = { 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit...

    python教程 56 3年前
  • 笔趣阁小说网Python爬虫分享

    # https://www.biquge.info/wanjiexiaoshuo/ 笔趣阁小说全本爬虫 import time import requests import os import random from lxml import etree import webbrowser header = { "User-Agent": "Mo...

    python教程 168 3年前
  • Python爬取站长之家端口扫描接口

    import requests,time import random from bs4 import BeautifulSoup import re import threading def ports(hostm,port): url = 'http://tool.chinaz.com/port/' headers = { 'User-Agent':'Mozilla/5.0 (Windows NT ...

    python教程 90 3年前
  • python爬虫下载抖音用户所有短视频+无水印方法

    这次分享下载抖音用户所有短视频方法,python爬虫批量抓取,无水印下载,希望和大家多多交流互相学习!获取用户链接方法1、首先在抖音上随机挑选一个小姐姐,用户主页右上角点开,获取分享链接python下载抖音视频截图得到类似分享链接:在抖音,记录美好生活! https:...

    python教程 313 3年前