Scraping Toutiao for News, Videos, and Users

2022-07-26

import requests
import time
import random
import pymongo

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
}
base_url = 'https://www.toutiao.com/api/search/content/?aid=24&app_name=web_search&offset={}&format=json&keyword={}&'
video_url = '&from=video&pd=video'
users_url = '&from=media&pd=user'

# Set the search keyword to crawl
keyword = "新闻"  # "news"
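# For reference, the three request URLs are assembled from base_url plus an optional suffix,
# e.g. with offset=0 and keyword="新闻":
#   news:   base_url.format(0, keyword)
#   videos: base_url.format(0, keyword) + video_url
#   users:  base_url.format(0, keyword) + users_url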

def get_user():
    # Page through the user search results and flush them to MongoDB in batches.
    offset = 0
    tmp = []
    while True:
        url = base_url.format(offset, keyword) + users_url
        time.sleep(random.randint(10, 50) * 0.1)
        response = requests.get(url, headers=headers).json()
        if response['data'] is None:
            break
        length = len(response['data'])
        try:
            for l in range(length):
                dict_ = {}
                dict_['name'] = response['data'][l]['name']
                dict_['description'] = response['data'][l]['description']
                tmp.append(dict_)
            offset += 20
            # Write in batches of roughly 200 records
            if len(tmp) > 200:
                set_comments(tmp)
                tmp = []
        except Exception as e:
            print(e)
    if tmp:
        set_comments(tmp)


def get_new():
    # Page through the news search results and print each title with its source.
    offset = 0
    num = 0
    while True:
        url = base_url.format(offset, keyword)
        time.sleep(random.randint(10, 50) * 0.1)
        response = requests.get(url, headers=headers).json()
        if response['data'] is None:
            print('data is empty, reached the end')
            break
        length = len(response['data'])
        for l in range(length):
            tmp = []
            try:
                # The JSON contains placeholder entries; skip items without both a title and a source.
                if response['data'][l]['title'] and response['data'][l]['source']:
                    title = response['data'][l]['title']
                    source = response['data'][l]['source']  # publisher / source
                    tmp.append(title)
                    tmp.append(source)
                    print(tmp)
                    num += 1
                else:
                    continue
            except Exception:
                pass
        offset += 20
    print("Crawled {} news items".format(num))


def get_video():
    # Page through the video search results and print each title with its source.
    offset = 0
    num = 0
    while True:
        url = base_url.format(offset, keyword) + video_url
        time.sleep(random.randint(10, 50) * 0.1)
        response = requests.get(url, headers=headers).json()

        if response['data'] is None:
            print('data is empty, reached the end')
            break
        length = len(response['data'])
        for l in range(length):
            tmp = []
            try:
                # The JSON contains placeholder entries (some are exact duplicates);
                # skip items without both a title and a source.
                if response['data'][l]['title'] and response['data'][l]['source']:
                    video_title = response['data'][l]['title']
                    video_source = response['data'][l]['source']  # publisher / source
                    tmp.append(video_title)
                    tmp.append(video_source)
                    print(tmp)
                    num += 1
                else:
                    continue
            except Exception:
                pass
        offset += 20
    print("Crawled {} videos".format(num))

def set_comments(tmp):
    # Insert a batch of records into MongoDB (the host address is truncated in the original post).
    connection = pymongo.MongoClient('81.69.')
    db = connection.weibo
    try:
        db.comments.insert_many(tmp)
    except Exception as e1:
        print(e1)


if __name__ == '__main__':
    get_video()
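If you want to run all three crawlers in one go and then confirm how many user records were actually written, the __main__ block can be replaced with something like the following sketch (it assumes the truncated MongoDB host above has been filled in and is reachable; count_documents is the standard PyMongo call):

if __name__ == '__main__':
    get_new()
    get_video()
    get_user()
    # Check what set_comments wrote (same truncated host placeholder as above)
    client = pymongo.MongoClient('81.69.')
    print(client.weibo.comments.count_documents({}))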

Original post: https://blog.csdn.net/qq_43751489/article/details/110943688
