话不多说直接上代码,都有注释,如果有不懂的可以提出来或者有更好方案也可以提出来,大家一起学习。
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Wallpaper scraper for wall.alphacoders.com.

Prompts for a gallery URL and a minimum resolution, then walks the paged
results and downloads every wallpaper into a per-gallery folder.
"""
import os
import re
import urllib.parse

import requests
from lxml import etree


class Error(Exception):
    """Raised when a result page yields no images (e.g. page out of range)."""

    def __init__(self, message):
        # Chain through Exception so str(e) shows the message.
        super().__init__(message)
        self.message = message


class Wallpaper:
    """Scrapes one results page of wall.alphacoders.com.

    min_resolution examples: "0x0" (any resolution), "1920x1080".
    """

    def __init__(self, url, path, page, min_resolution):
        self.url = url
        self.path = path
        # Query-string parameter selecting which result page to fetch.
        self.params = {"page": page}
        self.min_resolution = min_resolution
        # POST form fields understood by the site:
        #   view="paged"          — paginated listing
        #   min_resolution        — resolution filter
        #   resolution_equals="=" — exact match (">=" would allow larger)
        #   sort="newest"         — newest uploads ("rating" = most liked)
        self.data = {
            "view": "paged",
            "min_resolution": min_resolution,
            "resolution_equals": "=",
            "sort": "newest",
        }

    @staticmethod
    def create_dir(path):
        """Create *path* (with parents) if missing; return a note if it exists."""
        if not os.path.exists(path):
            os.makedirs(path)
        else:
            return "文件夹已存在"

    def get_folder_name(self):
        """Derive the download-folder name from the gallery URL.

        Handles four URL shapes:
          tag pages           .../tag/<slug>-wallpapers?lang=Chinese
          sub-category pages  .../by_sub_category.php?id=...&name=...&lang=Chinese
          resolution pages    .../by_resolution.php?w=...&h=...&lang=Chinese
          search pages        .../search.php?search=...&lang=Chinese
        Returns None when the URL matches none of them.
        """
        # Tag URL: folder name is the two slug pieces joined together.
        match = re.search(r"tag/([\w-]+)-(.*)\?lang=Chinese", self.url)
        if match:
            return match.group(1) + match.group(2)

        # No query string -> nothing more we can extract.
        if "?" not in self.url:
            return None

        # Parse the query string by hand; values stay percent-encoded so the
        # original "name.split('+')" tokenisation keeps working unchanged.
        query = {}
        for param in self.url.split("?")[1].split("&"):
            pieces = param.split("=")
            if len(pieces) == 2:
                query[pieces[0]] = pieces[1]

        name = query.get("name")
        if name is not None:
            # Category URL: decode the first "+"-separated token of the name.
            return urllib.parse.unquote(name.split("+")[0])

        search = query.get("search")
        if search is not None:
            # Search URL: use the search term itself.
            # BUGFIX: the original fell through to w + "x" + h with both None
            # and crashed with TypeError; its `except AttributeError` fallback
            # for this case was dead code.
            return search

        w = query.get("w")
        h = query.get("h")
        if w is not None and h is not None:
            # Resolution URL: "<w>x<h>".
            return w + "x" + h
        return None

    def get_image_urls(self):
        """Collect full-size image URLs from the current result page.

        Returns (count, urls).
        Raises Error when the page holds no images; an out-of-range page
        number redirects, which we detect via allow_redirects=False and a
        non-200 status — this prevents re-scraping the last page forever.
        """
        base = "https://wall.alphacoders.com"
        response = requests.post(
            self.url, params=self.params, data=self.data, allow_redirects=False
        )
        # A redirect (non-200) means the requested page exceeds the last page.
        if response.status_code != 200:
            raise Error("获取不到当前页码的图片,请检查页码有否有效!")

        html = etree.HTML(response.text)
        hrefs = html.xpath(
            '//*[@id="page_container"]//div//div[@class="thumb-container"]'
            '//div[@class="boxgrid"]//@href'
        )
        if not hrefs:
            raise Error("获取不到当前页码的图片,请检查页码有否有效!")

        result = []
        for href in hrefs:
            # Each thumbnail links to a detail page that holds the real image.
            detail = etree.HTML(requests.get(base + href).text)
            result.extend(
                detail.xpath("/html/body/div[2]/div[2]/div[2]/div[1]/img//@src")
            )
        return len(result), result

    def download_image(self):
        """Download every image of the current page into the gallery folder.

        Skips files already on disk and URLs that are not numeric-id
        .png/.jpg image URLs.
        """
        downloaded = 0
        errors = 0
        self.create_dir(self.path)
        images_dir_name = self.get_folder_name()
        total, image_urls = self.get_image_urls()
        target_dir = os.path.join(self.path, images_dir_name)
        for image_url in image_urls:
            match = re.search(r'https://[^/]+/[^/]+/(\d+)(\.png|\.jpg)', image_url)
            if match is None:
                # Unexpected URL shape: count it and keep going.
                errors += 1
                print("下载失败 {} 张图片".format(errors))
                continue
            image_name = match.group(1)
            # BUGFIX: keep the real extension — the original saved every
            # file (including .jpg) under a ".png" name.
            file_path = os.path.join(target_dir, image_name + match.group(2))
            # Skip images that were already downloaded.
            if os.path.exists(file_path):
                print("已有图片: {}, 图片地址: {}".format(image_name, image_url))
                continue
            self.create_dir(target_dir)
            with open(file_path, "wb") as f:
                f.write(requests.get(image_url).content)
            print("图片 {} 下载完成,图片地址: {}".format(image_name, image_url))
            downloaded += 1
            if downloaded == total:
                print("当前页面图片下载完成, 一共 {} 张图片".format(downloaded))


if __name__ == '__main__':
    url = input("请输入壁纸url! \n")
    # Root folder that receives the per-gallery sub-folders.
    path = "images"
    print(""" 常用分辨率 1920x1080 2560x1440 2560x1600 3840x2160 5120x2880 7680x4320 """)
    resolution = input("请输入需要下载的分辨率! \n")
    # 200 is an upper bound on pages; Error stops the loop when the site
    # runs out of pages, so larger galleries just stop at the cap.
    for page_num in range(1, 200):
        print("正在下载第 {} 页的图片".format(page_num))
        page_one = Wallpaper(url, path, str(page_num), resolution)
        try:
            # BUGFIX: the original only printed page_one.data and never
            # invoked the download, so the script did nothing useful.
            page_one.download_image()
        except Error as e:
            # Out of pages (or an empty page): stop instead of crashing.
            print(e.message)
            break