Scraping and parsing gushiwen.org with the requests library and re regular expressions

2022-08-10

# requests + re
# requests: fetch the pages
# re: parse the fetched HTML with regular expressions

import requests
import re

def parse(url):
    # Request headers: send a browser User-Agent so the site serves normal pages
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
    }
    # headers must be passed as a keyword argument; passed positionally it becomes params
    response = requests.get(url, headers=headers)
    text = response.text
    # Core step: parse the HTML with re
    # A ? after a quantifier makes it non-greedy (match as little as possible)
    # re.DOTALL lets the . operator also match \n; by default . does not match newlines
    titles = re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>', text, re.DOTALL)
    dynasties = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    authors = re.findall(r'<p class="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    contents_tags = re.findall(r'<div class="contson" .*?>(.*?)</div>', text, re.DOTALL)
    contents = []
    for content in contents_tags:
        # Strip the remaining inline tags (such as <br />) from the poem body
        content = re.sub(r'<.*?>', '', content)
        contents.append(content.strip())
    poems = []
    for value in zip(titles, dynasties, authors, contents):
        title, dynasty, author, content = value
        poem = {
            'title': title,
            'dynasty': dynasty,
            'author': author,
            'content': content
        }
        # Append each poem dict to the list
        poems.append(poem)
    for poem in poems:
        print(poem)
        print('~'*100)

def main():
    # Crawl the first 50 pages, one request per page
    for page in range(1, 51):
        url = 'https://www.gushiwen.org/default_{}.aspx'.format(page)
        parse(url)

if __name__ == '__main__':
    main()
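
The comments in parse() point out the two ideas that make these patterns work: non-greedy quantifiers and re.DOTALL. The short, self-contained sketch below illustrates both on a hand-written HTML fragment. The fragment, its attribute values, and the placeholder text are invented for illustration only; they simply mirror the class names (cont, source, contson) that the patterns in parse() assume.

# regex_demo.py -- a minimal sketch, independent of gushiwen.org
import re

# Made-up fragment shaped like the markup the patterns above expect
sample = '''
<div class="cont">
    <p><a href="#"><b>Sample Title</b></a></p>
    <p class="source"><a href="#">Tang dynasty</a>:<a href="#">Some Author</a></p>
    <div class="contson" id="contson1">Line one, <br />line two.</div>
</div>
'''

# Greedy vs. non-greedy: .* grabs as much as possible, .*? as little as possible
print(re.findall(r'<a.*?>(.*)</a>', '<a>Tang</a>:<a>Author</a>'))   # ['Tang</a>:<a>Author']
print(re.findall(r'<a.*?>(.*?)</a>', '<a>Tang</a>:<a>Author</a>'))  # ['Tang', 'Author']

# re.DOTALL lets . cross line breaks, so one pattern can span several lines of HTML
title = re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>', sample, re.DOTALL)
dynasty = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>', sample, re.DOTALL)
author = re.findall(r'<p class="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', sample, re.DOTALL)
content = [re.sub(r'<.*?>', '', c).strip()
           for c in re.findall(r'<div class="contson" .*?>(.*?)</div>', sample, re.DOTALL)]
print(title, dynasty, author, content)
# ['Sample Title'] ['Tang dynasty'] ['Some Author'] ['Line one, line two.']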

 

Source: https://blog.csdn.net/qq_39504519/article/details/107084243
