不得不说,Python 真是一个神奇的东西:学三天就能爬网站,真香!
完整代码
# -*- coding: utf-8 -*-
"""
Created on Wed May 26 17:53:13 2021

@author: 19088

HTTP-proxy pool crawler plus a wallpaper downloader for www.ivsky.com.
"""
import urllib.request
import os
import pickle
import re
import random
import sys


# Pool of HTTP proxies: crawls public proxy-list sites, validates each
# address and persists the working ones to agent.pkl.
class gethttpagents:
    def __init__(self):
        # Previously validated proxies reloaded from disk (may be empty).
        self.attarray = self.__loadagentlist()
        # When non-empty, this exact proxy is forced instead of a random one.
        self.myagent = ""

    def openurl(self, url, istry=1):
        """Fetch *url* and return the raw (undecoded) response body.

        With *istry* truthy the request is retried up to 10 times through
        randomly chosen proxies from the pool and ValueError is raised when
        every retry fails; with *istry* falsy a single attempt is made and
        None is returned on failure.
        """
        response = ""
        ip = ""
        if 0 != len(self.myagent.strip()):
            ip = self.myagent
        i = 1
        if not istry:
            i = 99  # single attempt only
        while i < 100:
            try:
                if 0 == len(self.attarray) and 0 == len(ip.strip()):
                    # No proxy available: direct request with a browser UA.
                    # NOTE(review): the source was lowercased by the blog
                    # platform; urllib.request.Request / ProxyHandler casing
                    # restored here.
                    req = urllib.request.Request(url)
                    req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36")
                    response = urllib.request.urlopen(req)
                else:
                    if 0 != len(self.attarray):
                        ip = random.choice(self.attarray)
                    if 0 != len(self.myagent.strip()):
                        ip = self.myagent
                    print("以{}访问 {}".format(ip, url))
                    # Route the request through the chosen proxy.
                    proxy = {"http": ip}
                    proxy_support = urllib.request.ProxyHandler(proxy)
                    opener = urllib.request.build_opener(proxy_support)
                    opener.addheaders = [("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36")]
                    response = opener.open(url)
            except Exception:
                if not istry:
                    print("{} 无法使用".format(ip))
                else:
                    print("第{}次尝试连接!".format(i))
                # Give up after the 10th failed attempt.  (The original
                # raised from `finally`, which also fired after a successful
                # break on attempt 10.)
                if 10 == i and istry:
                    raise ValueError
            else:
                break
            finally:
                i += 1
        if not response:
            return
        return response.read()

    # Re-validate every proxy in the pool and drop the dead ones.
    def checkmyippool(self):
        agentsresult = []
        for ip in self.attarray:
            self.setmyip(ip)
            if self.__getmyip():
                agentsresult.append(ip)
        # Persist the surviving proxies and reset the forced proxy.
        self.__writeagentlist(agentsresult)
        self.__setagents(agentsresult)
        self.setmyip("")

    def getagents(self, html):
        """Parse candidate ip:port proxies out of *html* and keep usable ones.

        Working proxies are merged with the existing pool and persisted.
        Returns the full candidate list, or None when the page layout does
        not match.
        """
        # IP address table cells such as <td> 1.2.3.4 </td>
        pattern = re.compile(r'(<td>)\s*((25[0-5]|2[0-4]\d|[0-1]\d\d|\d\d|\d)\.){3}(25[0-5]|2[0-4]\d|[0-1]\d\d|\d\d|\d)\s*</td>')
        iplist = []
        for ipiter in pattern.finditer(html):
            ipgroup = re.search(r"((25[0-5]|2[0-4]\d|[0-1]\d\d|\d\d|\d)\.){3}(25[0-5]|2[0-4]\d|[0-1]\d\d|\d\d|\d)", ipiter.group())
            iplist.append(ipgroup.group())
        # Port table cells such as <td> 8080 </td>
        portlist = []
        pattern = re.compile(r'(<td>)\s*\d+\s*</td>')
        for portiter in pattern.finditer(html):
            portgroup = re.search(r"\d+", portiter.group())
            portlist.append(portgroup.group())
        # was `is not` (identity) in the original; value comparison intended
        if len(iplist) != len(portlist):
            print("注意: ip和端口参数不匹配!")
            return
        agentlist = [ip + ":" + port for ip, port in zip(iplist, portlist)]
        agentsresult = []
        for ip in agentlist:
            self.setmyip(ip)
            if self.__getmyip():
                agentsresult.append(ip)
                print("{} 可以使用".format(ip))
        # Merge with the proxies we already had.
        agentsresult.extend(self.attarray)
        if 0 == len(agentsresult):
            return
        self.__writeagentlist(agentsresult)
        self.__setagents(agentsresult)
        self.setmyip("")
        return agentlist

    def __setagents(self, iparray):
        self.attarray = iparray

    def setmyip(self, ip):
        self.myagent = ip

    # Persist *agentlist* to agent.pkl, overwriting any previous file so a
    # single pickle.load suffices on reload.
    def __writeagentlist(self, agentlist):
        if os.path.exists("agent.pkl"):
            os.remove("agent.pkl")
        # The original wrote to "agent.pkl." (trailing dot), so the saved
        # pool could never be loaded back.
        with open("agent.pkl", "wb") as f:
            pickle.dump(agentlist, f)
        print("存储{}条代理".format(len(agentlist)))

    # Load the previously persisted proxy pool; empty list when absent.
    def __loadagentlist(self):
        agentlist = []
        if not os.path.exists("agent.pkl"):
            return agentlist
        with open("agent.pkl", "rb") as f:
            agentlist = pickle.load(f)
        print("加载{}条代理".format(len(agentlist)))
        return agentlist

    # Probe the currently selected proxy with a single request; returns the
    # fetched page text on success, None when the proxy cannot connect.
    def __getmyip(self, ip=""):
        url = "https://www.baidu.com/"
        try:
            return self.openurl(url, 0).decode("utf-8")
        except Exception:
            return None

    # Crawl pages 1..page of one proxy-list site; a failing page is
    # reported and skipped.
    def _crawlsite(self, urlfmt, page, encoding):
        indexcur = 1
        while indexcur <= page:
            url = urlfmt.format(indexcur)
            try:
                print(url)
                self.setmyip("")
                html = self.openurl(url).decode(encoding)
                self.getagents(html)
            except Exception:
                print("{} 爬取失败".format(url))
            finally:
                indexcur += 1

    def crawlingagents(self, index):
        """Harvest proxies from several public proxy-list sites, scanning
        the first *index* pages of each."""
        url = "http://ip.yqie.com/ipproxy.htm"
        try:
            print(url)
            html = self.openurl(url).decode("utf-8")
            self.setmyip("")
            self.getagents(html)
        except Exception:
            print("{} 爬取失败".format(url))
        # The original used a stray global `a.openurl(...)` for the last
        # three sites instead of self.
        self._crawlsite(r"https://www.89ip.cn/index_{}.html", index, "utf-8")
        self._crawlsite(r"http://www.66ip.cn/{}.html", index, "gb2312")
        self._crawlsite(r"http://www.ip3366.net/?stype=1&page={}", index, "gb2312")
        self._crawlsite(r"http://www.kxdaili.com/dailiip/1/{}.html", index, "utf-8")


# Wallpaper downloader for www.ivsky.com.
class downloadpictures:
    def __init__(self):
        self.sortkey = {}          # search keywords (unused placeholder)
        self.urlload = gethttpagents()
        self.bzmenudict = {}       # wallpaper categories (scenery, girls, ...)
        self.sortscreendict = {}   # categories by screen resolution
        self.littlesigndict = {}   # sub-categories

    def getpictures(self, url):
        """Download every picture reachable from one listing page *url*."""
        # Step 1: fetch the listing page.
        pagerhtml = self.urlload.openurl(url)
        # Step 2: extract the album links (readpages returns None on a
        # parse failure — guard before .values()).
        pagedict = self.readpages(pagerhtml)
        if not pagedict:
            print("获取图片集失败!")
            return
        for floderiterurl in pagedict.values():
            folderurl = "https://www.ivsky.com/" + floderiterurl
            folderhtml = self.urlload.openurl(folderurl)
            # Step 3: each album page lists the individual pictures.
            pictursurldict = self.readfolders(folderhtml)
            if not pictursurldict:
                continue
            for iterpicturekey in pictursurldict:
                filename = iterpicturekey + ".jpg"
                pictureurl = "https://www.ivsky.com/" + pictursurldict.get(iterpicturekey)
                picturehtml = self.urlload.openurl(pictureurl)
                picturdownurl = self.readpictures(picturehtml)
                picturedownhtml = self.urlload.openurl(picturdownurl)
                if not picturedownhtml:
                    continue
                with open(filename, "wb+") as f:
                    f.write(picturedownhtml)

    def gethrefmap(self, html, ispicture=0, isfolder=0):
        """Return {display name: href} for every anchor found in *html*.

        *ispicture* switches to the <p>-wrapped anchors of album listings;
        *isfolder* appends a running index so duplicate names stay unique.
        """
        hrefdict = {}
        anchorpat = re.compile(r'<a\s*.*?\s*</a>', re.I)
        if ispicture:
            anchorpat = re.compile(r'<p>\s*?<a\s*.*?</p>', re.I)
        index = 0
        for anchor in anchorpat.finditer(html):
            hreftext = anchor.group()
            # The display name sits between  ">  and  </a>.
            name = ""
            namegroup = re.search(r'"\s*?>\s*?.*?</a>', hreftext, re.I)
            if namegroup:
                name = namegroup.group()
                if 5 == len(name.replace(" ", "")):
                    # Anchor text effectively empty: fall back to title="...".
                    titlegroup = re.search(r'title=".*?"', hreftext, re.I)
                    if titlegroup:
                        name = titlegroup.group()[7:-1]
                    else:
                        name = name[2:-4]
                else:
                    name = name[2:-4]
            name = name.replace(" ", '')
            # The original pattern expected blog-injected
            # `rel="external nofollow"` text after the href, which broke the
            # [6:-1] slice; match the bare href attribute instead.
            url = ""
            urlgroup = re.search(r'href=".*?"', hreftext, re.I)
            if urlgroup:
                url = urlgroup.group()[6:-1].replace(" ", '')
            if isfolder:
                index += 1
                name += "_" + str(index)
            hrefdict[name] = url
        return hrefdict

    def readpages(self, html):
        """Parse the wallpaper front page: fill the category dictionaries
        and return {album name: album url}, or None on a parse failure."""
        html = html.decode("utf-8")
        # Wallpaper category menu.
        sortclassgroup = re.search(r'<ul\s*class="bzmenu".*?</ul>', html, re.I)
        if sortclassgroup:
            self.bzmenudict = self.gethrefmap(sortclassgroup.group())
        else:
            print("匹配壁纸分类出错!")
            return
        # Screen-size categories.
        sortclassgroup = re.search(r'<ul\s*class="sall_dd".*?</ul>', html, re.I)
        if sortclassgroup:
            self.sortscreendict = self.gethrefmap(sortclassgroup.group())
        else:
            print("匹配屏幕尺寸分类失败!")
            return
        # Sub-categories.
        sortclassgroup = re.search(r'<div\s*class="sline".*?</div>', html, re.I)
        if sortclassgroup:
            self.littlesigndict = self.gethrefmap(sortclassgroup.group())
        else:
            print("匹配小分类失败")
            return
        # Album links.
        picturedict = {}
        sortclassgroup = re.search(r'<ul\s*class="ali".*?</ul>', html, re.I)
        if sortclassgroup:
            picturedict = self.gethrefmap(sortclassgroup.group(), 1)
        else:
            print("匹配图片集地址失败!")
            return
        return picturedict

    def readfolders(self, html):
        """Return {picture name: picture page url} for one album page,
        or None when *html* is empty / does not match."""
        if not html:
            return
        html = html.decode("utf-8")
        sortclassgroup = re.search(r'<ul\s*class="pli".*?</ul>', html, re.I)
        if not sortclassgroup:
            print("匹配小分类失败")
            return
        return self.gethrefmap(sortclassgroup.group(), 1, 1)

    def readpictures(self, html):
        """Extract the full-resolution download url from one picture page,
        or None when *html* is empty / does not match."""
        if not html:
            return
        html = html.decode("utf-8")
        sortclassgroup = re.search(r'<div\s*class="pic".*?</div>', html, re.I)
        if not sortclassgroup:
            print("匹配小分类失败")
            return
        url = ""
        urlgroup = re.search(r"src='.*?'", sortclassgroup.group(), re.I)
        if urlgroup:
            url = urlgroup.group()[5:-1].replace(" ", '')
            # Rewrite the preview url into the download url.
            url = url.replace('img-pre', 'img-picdown')
            url = url.replace('pre', 'pic')
            url = "https:" + url
        return url


# Small interactive front-end tying the crawler pieces together.
class urluser:
    def __init__(self):
        self.agent = gethttpagents()
        self.downpicture = downloadpictures()

    # Interactive picture download: asks for a save directory and a page
    # count, then crawls that many listing pages.
    def downpictures(self):
        dirpath = input("请输入保存路径:")
        if not os.path.exists(dirpath):
            os.mkdir(dirpath)
        if not os.path.isdir(dirpath):
            print("savepath is wrong!")
            sys.exit()
        os.chdir(dirpath)  # downloaded files land in the chosen directory
        page = input("爬取前多少页的图片?\n")
        indexre = re.search(r"\d+", page)
        if not indexre:
            # The original fell through to int(None.group()) here.
            print("输入页数有误!")
            return
        indexre = int(indexre.group())
        indexcur = 1
        while indexcur <= indexre:
            try:
                # Other categories can be crawled by swapping this url; the
                # downloader already collects all category links.
                url = r"https://www.ivsky.com/bizhi/nvxing_1920x1080/index_{}.html".format(indexcur)
                print(url)
                self.downpicture.getpictures(url)
            except Exception:
                print("打开出错!")
            finally:
                indexcur += 1

    # Interactive proxy harvest.
    def downagents(self):
        page = input("爬取前多少页的代理?\n")
        indexre = re.search(r"\d+", page)
        if not indexre:
            print("输入页数有误!")
            return
        self.agent.crawlingagents(int(indexre.group()))

    # Validate the current proxy pool.
    def checkpool(self):
        self.agent.checkmyippool()


if __name__ == "__main__":
    print("*" * 20)
    print("1.爬取代理\n")
    print("2.检查代理\n")
    print("3.爬取图片")
    print("*" * 20)
    mode = input("请输入数字选择处理模式:\n")
    indexre = re.search(r"\d+", mode)
    if not indexre:
        print("输入页数有误!")
        sys.exit()
    indexre = int(indexre.group())
    uesrobj = urluser()
    if 1 == indexre:
        uesrobj.downagents()
    elif 2 == indexre:
        uesrobj.checkpool()
    elif 3 == indexre:
        uesrobj.downpictures()
    else:
        print("模式选择错误!")
        sys.exit()
    print("爬取完毕!")
效果图