Crawling a site's original-resolution images with Python to use as wallpapers

2022-07-23

I have to say, Python really is something magical: after just three days of learning it you can already crawl a website. Hard not to love it.

Full code

# -*- coding: utf-8 -*-
"""
Created on Wed May 26 17:53:13 2021

@author: 19088
"""
import urllib.request
import os
import pickle
import re
import random
import sys


# Helper class for fetching and managing HTTP proxies
class gethttpagents:
    # Constructor
    def __init__(self):
        self.attarray=self.__loadagentlist()
        self.myagent=""
    
    # Note: the returned response body is not decoded
    def openurl(self,url,istry=1):
        response=""
        ip=""
        if(0 != len(self.myagent.strip())):
            ip=self.myagent
        i=1
        if not istry:
            i=99
        while i<100:
            try:
                #print(self.attarray)
                if(0 == len(self.attarray) and 0==len(ip.strip())):
                    req=urllib.request.Request(url)
                    # Set the request header
                    req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36")
                    response=urllib.request.urlopen(req)
                else:
                    if(0 != len(self.attarray)):
                        ip=random.choice(self.attarray)
                    if(0 != len(self.myagent.strip())):
                        ip=self.myagent
                    print("Using proxy {} to access {}".format(ip,url))
                    # Configure the proxy
                    proxy={"http":ip}
                    #print(proxy)
                    # Build a proxy handler
                    proxy_support=urllib.request.ProxyHandler(proxy)

                    # Build an opener
                    opener=urllib.request.build_opener(proxy_support)
                    opener.addheaders=[("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36")]
                    #urllib.request.install_opener(opener)
                    # Fetch the page object
                    response=opener.open(url)
            except:
                if not istry:
                    print("Proxy {} is not usable".format(ip))
                else:
                    print("Connection attempt {} failed!".format(i))
            else:
                break
            finally:
                i+=1
        if not response and istry:
            raise ValueError
        if not response:
            return 
        html=response.read()
        #print(html)
        return html

    # Check the proxy pool and drop proxies that no longer work
    def checkmyippool(self):
        agentsresult=[]
        agentlist=self.attarray
        for iter in agentlist:
            ip=iter
            self.setmyip(ip)
            b=self.__getmyip()
            if not b:
                # Proxy is unusable
                #agentlist.pop(-iter)
                pass
            else:
                agentsresult.append(ip)
                #print(b)
        # Persist the proxies that were confirmed usable
        self.__writeagentlist(agentsresult)
        self.__setagents(agentsresult)
        self.setmyip("")
    
    # Parse all proxy addresses found in a page
    def getagents(self,html):
        #print(html)
        # Regex to match IP addresses
        pattern = re.compile(r'(<td>)\s*((25[0-5]|2[0-4]\d|[0-1]\d\d|\d\d|\d)\.){3}(25[0-5]|2[0-4]\d|[0-1]\d\d|\d\d|\d)\s*</td>')
        iplist=[]
        ip=pattern.finditer(html)
        for ipiter in ip:
            iptext=ipiter.group()
            ipgroup=re.search(r"((25[0-5]|2[0-4]\d|[0-1]\d\d|\d\d|\d)\.){3}(25[0-5]|2[0-4]\d|[0-1]\d\d|\d\d|\d)", iptext)
            iplist.append(ipgroup.group())

        # Regex to match port numbers
        portlist=[]
        pattern = re.compile(r'(<td>)\s*\d+\s*</td>')
        port = pattern.finditer(html) 
        for portiter in port:
            porttext=portiter.group()
            portgroup=re.search(r"\d+", porttext)
            portlist.append(portgroup.group())

        if(len(iplist) != len(portlist)):
            print("Warning: the IP and port lists do not match!")
            return
        ipdict=dict(zip(iplist,portlist))

        agentlist=[]
        for key in ipdict:
            agentlist.append(key+":"+ipdict.get(key))  
        agentsresult=[]
        for iter in agentlist:
            ip=iter
            self.setmyip(ip)
            b=self.__getmyip()
            if not b:
                # Proxy is unusable
                pass
                #agentlist.pop(-iter)
            else:
                agentsresult.append(ip)
                self.__setagents(agentsresult)
                print("{} is usable".format(ip))
        agentsresult.extend(self.attarray)  
        # Persist the proxies that were confirmed usable
        if(0==len(agentsresult)):
            return
        self.__writeagentlist(agentsresult)
        self.__setagents(agentsresult)
        self.setmyip("")
        return agentlist

    
    def __setagents(self,iparray):
        self.attarray=iparray
    def setmyip(self,ip):
        self.myagent=ip
    # Save the crawled proxies to disk
    def __writeagentlist(self, agentlist): 
        if os.path.exists("agent.pkl"):
            os.remove("agent.pkl")          # Regenerate the file each time; otherwise multiple dumps would require multiple loads
        with open("agent.pkl","wb") as f:
            pickle.dump(agentlist, f)
        print("Saved {} proxies".format(len(agentlist)))
    
    # Load previously saved proxies
    def __loadagentlist(self):
        agentlist=[]
        if not os.path.exists("agent.pkl"):
            return agentlist
        with open("agent.pkl","rb") as f:
            agentlist=pickle.load(f)
            print("Loaded {} proxies".format(len(agentlist)))
            return agentlist

    # Test the current proxy by fetching a page; internal method, for internal use only
    def __getmyip(self,ip=""):
        url="https://www.baidu.com/"
        html=""
        try:
            html=self.openurl(url,0).decode("utf-8")
        except:
            return 
        # Match the IP address
        #pattern = re.compile(r'((25[0-5]|2[0-4]\d|[0-1]\d\d|\d\d|\d)\.){3}(25[0-5]|2[0-4]\d|[0-1]\d\d|\d\d|\d)')
        #groupip=pattern.search(html)
        #if groupip:
            #return groupip.group()
        else:
            return html
    
    # Crawl proxies from several different sites
    def crawlingagents(self,index):
        try:
            url ="http://ip.yqie.com/ipproxy.htm"
            print(url)
            html=self.openurl(url) 
            html=html.decode("utf-8") 
            self.setmyip("")                                                # No fixed proxy; pick one at random
            self.getagents(html)
        except Exception as e:
            print("Failed to crawl {}".format(url))
        
        # How many pages to collect in total
        page=index
        
        indexcur=1
        while indexcur<=page:
            try:
                url=r"https://www.89ip.cn/index_{}.html".format(indexcur)
                print(url)
                self.setmyip("") 
                html=self.openurl(url)                                               # No fixed proxy; pick one at random
                html=html.decode("utf-8")
                self.getagents(html)
            except Exception as e:
                print("Failed to crawl {}".format(url))
            finally:
                indexcur+=1
        
        indexcur=1
        while indexcur<=page:
            try:
                url=r"http://www.66ip.cn/{}.html".format(indexcur)
                print(url)
                self.setmyip("") 
                html=self.openurl(url)                                               # No fixed proxy; pick one at random
                html=html.decode("gb2312")
                self.getagents(html)
            except Exception as e:
                print("Failed to crawl {}".format(url))
            finally:
                indexcur+=1                                  

        indexcur=1
        while indexcur<=page:
            try:
                url=r"http://www.ip3366.net/?stype=1&page={}".format(indexcur)
                print(url)
                self.setmyip("") 
                html=self.openurl(url)                                               # No fixed proxy; pick one at random
                html=html.decode("gb2312")
                self.getagents(html)
            except Exception as e:
                print("Failed to crawl {}".format(url))
            finally:
                indexcur+=1  

        indexcur=1
        while indexcur<=page:
            try:
                url=r"http://www.kxdaili.com/dailiip/1/{}.html".format(indexcur)
                print(url)
                self.setmyip("") 
                html=self.openurl(url)                                               # No fixed proxy; pick one at random
                html=html.decode("utf-8")
                self.getagents(html)
            except Exception as e:
                print("Failed to crawl {}".format(url))
            finally:
                indexcur+=1


# Wrapper class for downloading images
class downloadpictures:
    # Constructor
    def __init__(self):
        self.sortkey={}                                 # Dictionary of search keywords
        self.urlload=gethttpagents()
        self.bzmenudict={}                              # Category info (scenery, people, and so on)
        self.sortscreendict={}                          # Categories by screen resolution
        self.littlesigndict={}                          # Sub-categories under each main category
        pass
    
    
    def getpictures(self,url):
        # Step 1: open the page and read its contents
        pagerhtml=self.urlload.openurl(url)
        # Step 2: extract the album links and the category info; returns a collection of album URLs
        folderpictursurl=self.readpages(pagerhtml).values()
        if not folderpictursurl:
            print("Failed to get the image albums!")
            return
        for floderiterurl in folderpictursurl:
            folderurl=str("https://www.ivsky.com/")+floderiterurl
            folderhtml=self.urlload.openurl(folderurl)
            # Step 3: read the album and extract the URL of each individual image page
            pictursurldict=self.readfolders(folderhtml)
            for iterpicturekey in pictursurldict:
                filename=iterpicturekey+".jpg"
                pictureurl=str("https://www.ivsky.com/")+pictursurldict.get(iterpicturekey)
                
                # Read the individual image page
                picturehtml=self.urlload.openurl(pictureurl)
                picturdownurl=self.readpictures(picturehtml)
                picturedownhtml=self.urlload.openurl(picturdownurl)
                if not picturedownhtml:
                    continue
                # Save the image
                with open(filename,"wb+") as f:
                    f.write(picturedownhtml)
        
    
    # Extract all link targets from the matched content
    def gethrefmap(self,html,ispicture=0,isfolder=0):
        hrefdict={}
        pattern=re.compile(r'<a\s*.*?\s*</a>',re.I)
        if ispicture:
            pattern=re.compile(r'<p>\s*?<a\s*.*?</p>',re.I)
        hrefiter=pattern.finditer(html)
        index=0
        for iter in hrefiter:
            hreftext=iter.group()
            # Match the category name
            pattern=re.compile(r'"\s*?>\s*?.*?</a>',re.I)
            name=""
            namegroup=pattern.search(hreftext)
            if namegroup:
                name=namegroup.group()
                if(5==len(namegroup.group().replace(" ", ""))):
                    pattern=re.compile(r'title=".*?"',re.I)
                    namegroup=pattern.search(hreftext)
                    if namegroup:
                        name=namegroup.group()[7:-1]
                name=name[2:-4].replace(" ", '')
            # Match the href
            pattern=re.compile(r'href=".*?"',re.I)
            url=""
            urlgroup=pattern.search(hreftext)
            if urlgroup:
                url=urlgroup.group()[6:-1].replace(" ", '')
            if isfolder:
                index+=1
                name+="_"+str(index)
            hrefdict[name]=url
        return hrefdict
    # Read the index page: category links plus the collection of album URLs
    def readpages(self,html):
        html=html.decode("utf-8")
        # Look up the wallpaper categories
        # Match the wallpaper category block
        pattern=re.compile(r'<ul\s*class="bzmenu".*?</ul>',re.I)
        sortclassgroup=pattern.search(html)
        if sortclassgroup:
            sortmessage=sortclassgroup.group()
            self.bzmenudict=self.gethrefmap(sortmessage)
            #print(self.bzmenudict)
        else:
            print("Failed to match the wallpaper categories!")
            return
        
        # Match the screen-size categories
        pattern=re.compile(r'<ul\s*class="sall_dd".*?</ul>',re.I)
        sortclassgroup=pattern.search(html)
        if sortclassgroup:
            sortmessage=sortclassgroup.group()
            self.sortscreendict=self.gethrefmap(sortmessage)
            #print(self.sortscreendict)
        else:
            print("Failed to match the screen-size categories!")
            return       
       
        # Match the sub-categories
        pattern=re.compile(r'<div\s*class="sline".*?</div>',re.I)
        sortclassgroup=pattern.search(html)
        if sortclassgroup:
            sortmessage=sortclassgroup.group()
            #print(sortmessage)
            self.littlesigndict=self.gethrefmap(sortmessage)
            #print(self.littlesigndict)
        else:
            print("Failed to match the sub-categories")
            return               
        
        picturedict={}
        # Match the album addresses
        pattern=re.compile(r'<ul\s*class="ali".*?</ul>',re.I)
        sortclassgroup=pattern.search(html)
        if sortclassgroup:
            sortmessage=sortclassgroup.group()
            picturedict=self.gethrefmap(sortmessage,1)
            #print(picturedict)
        else:
            print("Failed to match the album addresses!")
            return         
        #print(html)
        return picturedict
    
    # Parse an album page and extract the link to each individual image page
    def readfolders(self,html):
        if not html:
            return
        html=html.decode("utf-8")
        
        # Get the address and name of every image in the album
        # Match the image list
        pattern=re.compile(r'<ul\s*class="pli".*?</ul>',re.I)
        sortclassgroup=pattern.search(html)
        pictureurldict={}
        if sortclassgroup:
            sortmessage=sortclassgroup.group()
            #print(sortmessage)
            pictureurldict=self.gethrefmap(sortmessage,1,1)
            #print(pictureurldict)
        else:
            print("Failed to match the image list")
            return                            
        return pictureurldict
    
    # Parse a single image page and extract the direct download URL of the original image
    def readpictures(self,html):
        if not html:
            return        
        html=html.decode("utf-8")
        # Get the address of the original image on this image page
        # Match the pic block
        pattern=re.compile(r'<div\s*class="pic".*?</div>',re.I)
        sortclassgroup=pattern.search(html)
        pictureurl=""
        if sortclassgroup:
            sortmessage=sortclassgroup.group()
            # Match the src attribute
            pattern=re.compile(u"src='.*?'",re.I)
            url=""
            urlgroup=pattern.search(sortmessage)
            if urlgroup:
                url=urlgroup.group()[5:-1].replace(" ", '')            
            url=url.replace('img-pre', 'img-picdown')
            url=url.replace('pre', 'pic')
            url=str("https:")+url
            #print(sortmessage)
            pictureurl=url
            #print(url)
        else:
            print("Failed to match the image block")
            return
        return pictureurl
        

class urluser:
    
    def __init__(self):
        self.agent=gethttpagents()
        self.downpicture=downloadpictures()   
    
    # Entry point for downloading images
    def downpictures(self):

        #url="https://www.ivsky.com/bizhi"
        #b.getpictures(url)
        # Choose the save path
        dirpath=input("Enter the save path: ")
        if not os.path.exists(dirpath):
            os.mkdir(dirpath)
        if not os.path.isdir(dirpath):
            print("savepath is wrong!")
            sys.exit()
        os.chdir(dirpath)                                       # Switch the working directory
        #url=r"https://www.ivsky.com/bizhi/nvxing_1920x1080/index_{}.html"
        page=input("How many pages of images should be crawled?\n")
        indexre = re.search(r"\d+", page)
        if(not indexre):
            print("Invalid page number!")
            return
        indexre=int(indexre.group())
        indexcur=1
        while indexcur<=indexre:
            try:
                # Note: which type of image gets crawled is determined by the URL; the download class already parses the addresses of every category, so this can be extended as needed
                url=r"https://www.ivsky.com/bizhi/nvxing_1920x1080/index_{}.html".format(indexcur)
                print(url)
                self.downpicture.getpictures(url)
            except:
                print("Failed to open the page!")
                pass
            finally:
                indexcur+=1

    # Crawl proxies
    def downagents(self):
        page=input("How many pages of proxies should be crawled?\n")
        indexre = re.search(r"\d+", page)
        if(not indexre):
            print("Invalid page number!")
            return
        indexre=int(indexre.group())    
        self.agent.crawlingagents(indexre)
    
    # Check whether the current proxy pool is still usable
    def checkpool(self):
        self.agent.checkmyippool() 
        
if __name__ == "__main__":
    print("*"*20)
    print("1. Crawl proxies\n")
    print("2. Check proxies\n")
    print("3. Crawl images")
    print("*"*20)
    mode=input("Enter a number to choose a mode:\n")
    indexre = re.search(r"\d+", mode)
    if(not indexre):
        print("Invalid choice!")
        sys.exit()
    indexre=int(indexre.group())
    # Instantiate a user object
    userobj=urluser()

    if 1 == indexre:
        userobj.downagents()
    elif 2 == indexre:
        userobj.checkpool()
    elif 3 == indexre:
        userobj.downpictures()
    else:
        print("Invalid mode selection!")
        sys.exit()
    print("Crawling finished!")
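
For reference, here is a minimal sketch of driving the two classes directly from a Python shell, bypassing the interactive menu in __main__. It assumes the script above has been saved as crawl_wallpapers.py (a hypothetical file name) in the current directory; images are written to the current working directory.

# Hypothetical direct usage of the classes defined above
from crawl_wallpapers import gethttpagents, downloadpictures

agents = gethttpagents()              # loads any proxies cached in agent.pkl
agents.crawlingagents(1)              # crawl the first page of each proxy site
agents.checkmyippool()                # drop proxies that no longer respond

downloader = downloadpictures()       # creates its own gethttpagents instance internally
downloader.getpictures("https://www.ivsky.com/bizhi/nvxing_1920x1080/index_1.html")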


Result screenshots
