【Python之路】异步IO

线程：CPU基本执行单元，可以与同属一个进程的其他线程共享资源，线程是属于进程的。

进程：资源单元，进程一般由程序、数据集、进程控制块三部分组成。一个进程默认有一个主线程。

GIL（全局解释器锁）：用于在进程中对所有线程加锁，同一时刻只有一个线程被CPU调度。线程在等待IO时会释放GIL，因此GIL对IO请求影响小。

在编写爬虫时，性能的消耗主要在IO请求中

当单线程模式下请求URL时必然会引起等待，从而使得请求整体变慢。

import requests

def fetch_async(url):
    """Fetch ``url`` with a blocking GET, print the body text and return the response."""
    resp = requests.get(url)
    print(resp.text)
    return resp

if __name__ == '__main__':
    # Sequential baseline: each request only starts after the previous one returned.
    url_list = ['http://www.github.com', 'http://www.bing.com']
    for target in url_list:
        fetch_async(target)

同步执行

多线程方式运行：

import requests

from concurrent.futures import ThreadPoolExecutor

def fetch_async(url):
    """Issue a blocking GET for ``url`` and hand back the response object."""
    return requests.get(url)

if __name__ == '__main__':
    # Up to five worker threads download the pages concurrently.
    executor = ThreadPoolExecutor(5)
    for target in ['http://www.github.com', 'http://www.bing.com']:
        executor.submit(fetch_async, target)
    # Block until every submitted download has finished.
    executor.shutdown()

多线程

import requests

from concurrent.futures import ThreadPoolExecutor

def fetch_async(url):
    """Download ``url`` with a blocking GET; the response becomes the future's result."""
    return requests.get(url)

def callback(future):
    """Done-callback: print the result held by the completed ``future``."""
    outcome = future.result()
    print(outcome)

if __name__ == '__main__':
    executor = ThreadPoolExecutor(5)
    for target in ['http://www.github.com', 'http://www.bing.com']:
        # The callback is invoked with the future once the download completes.
        pending = executor.submit(fetch_async, target)
        pending.add_done_callback(callback)
    executor.shutdown()

多线程+回调函数

多进程方式运行:

import requests

from concurrent.futures import ProcessPoolExecutor

def fetch_async(url):
    """Perform a blocking GET for ``url`` inside a worker process and return the response."""
    return requests.get(url)

if __name__ == '__main__':
    # Up to five worker processes download the pages in parallel.
    executor = ProcessPoolExecutor(5)
    for target in ['http://www.github.com', 'http://www.bing.com']:
        executor.submit(fetch_async, target)
    executor.shutdown(wait=True)

多进程

import requests

from concurrent.futures import ProcessPoolExecutor

def fetch_async(url):
    """GET ``url`` (blocking) in a worker process; the response is the future's result."""
    return requests.get(url)

def callback(future):
    """Done-callback: print whatever the finished ``future`` produced."""
    outcome = future.result()
    print(outcome)

if __name__ == '__main__':
    executor = ProcessPoolExecutor(5)
    for target in ['http://www.github.com', 'http://www.bing.com']:
        # The callback is invoked with the future once the worker has finished.
        pending = executor.submit(fetch_async, target)
        pending.add_done_callback(callback)
    executor.shutdown(wait=True)

多进程+回调函数

通过上述代码均可以完成对请求性能的提高，但多线程和多进程的缺点是在IO阻塞时会造成线程和进程的浪费，所以异步IO会是首选：

import asyncio

# Native coroutine: the generator-based ``@asyncio.coroutine`` / ``yield from``
# form was removed in Python 3.11.
async def func1():
    """Print a start marker, suspend for five seconds without blocking the loop, then print an end marker."""
    print('before...func1......')
    # Awaiting the sleep yields control so other coroutines can run meanwhile.
    await asyncio.sleep(5)
    print('end...func1......')

async def main():
    """Run two func1() coroutines concurrently and wait until both have finished."""
    tasks = [func1(), func1()]
    await asyncio.gather(*tasks)


# asyncio.run() creates a fresh event loop, drives main() to completion and
# closes the loop; calling get_event_loop()/gather() outside a running loop is
# deprecated.
asyncio.run(main())

asyncio-示例1

异步发送TCP请求:

import asyncio

# Native coroutine: the generator-based ``@asyncio.coroutine`` / ``yield from``
# form was removed in Python 3.11.
async def fetch_async(host, url='/'):
    """Send a plain HTTP/1.0 GET over a raw TCP connection and print the reply.

    host: server name; the connection is made to port 80 on it
    url: request path placed on the GET line
    """
    print(host, url)
    reader, writer = await asyncio.open_connection(host, 80)
    try:
        request_header_content = """GET %s HTTP/1.0\r\nHost: %s\r\n\r\n""" % (url, host,)
        request_header_content = bytes(request_header_content, encoding='utf-8')
        writer.write(request_header_content)
        await writer.drain()
        # read() with no size reads until EOF, i.e. until the server closes.
        text = await reader.read()
        print(host, url, text)
    finally:
        # Always release the connection, even if sending or reading failed.
        writer.close()

async def main():
    """Fetch both pages concurrently and return the list of fetch_async() results."""
    tasks = [
        fetch_async('www.cnblogs.com', '/alex/'),
        fetch_async('dig.chouti.com', '/pic/show?nid=4073644713430508&lid=10273091')
    ]
    return await asyncio.gather(*tasks)


# asyncio.run() owns the loop lifecycle (create, run, close); calling
# get_event_loop()/gather() outside a running loop is deprecated.
results = asyncio.run(main())

asyncio-示例2

import asyncio

from aiohttp import ClientSession

async def func1(url):
    """Print ``url``, GET it through a fresh aiohttp session, read the whole body, then print a confirmation."""
    print(url)
    async with ClientSession() as session, session.get(url) as response:
        # The body is read to completion; its content is not used further.
        await response.read()
        print('Response OK')

async def main():
    """Download the three pages concurrently and wait for all of them."""
    tasks = [
        func1('https://www.cnblogs.com/'),
        func1('https://www.baidu.com/'),
        func1('https://www.python.org/'),
    ]
    # gather() schedules the coroutines as tasks on the running loop itself, so
    # no module-level ensure_future() (deprecated without a running loop) is needed.
    await asyncio.gather(*tasks)


# asyncio.run() also closes the loop afterwards, which the manual
# get_event_loop()/run_until_complete() sequence never did here.
asyncio.run(main())

asyncio + aiohttp

import asyncio

import requests

# Native coroutine: the generator-based ``@asyncio.coroutine`` / ``yield from``
# form was removed in Python 3.11.
async def fetch_async(func, *args):
    """Run the blocking callable ``func(*args)`` in the loop's default executor and print the response.

    func: blocking callable returning an object with ``url`` and ``content`` attributes
    args: positional arguments forwarded to ``func``
    """
    loop = asyncio.get_running_loop()
    # Passing None selects the loop's default executor, so the blocking call
    # does not stall the event loop.
    future = loop.run_in_executor(None, func, *args)
    response = await future
    print(response.url, response.content)

async def main():
    """Run both blocking requests.get calls concurrently via fetch_async and collect the results."""
    tasks = [
        fetch_async(requests.get, 'http://www.cnblogs.com/alex/'),
        fetch_async(requests.get, 'http://dig.chouti.com/pic/show?nid=4073644713430508&lid=10273091')
    ]
    return await asyncio.gather(*tasks)


# asyncio.run() owns the loop lifecycle (create, run, close); calling
# get_event_loop()/gather() outside a running loop is deprecated.
results = asyncio.run(main())

asyncio + requests

# Monkey-patch the standard library BEFORE importing anything that uses it
# (requests pulls in socket/ssl); patching afterwards leaves already-imported
# modules bound to the blocking originals and triggers gevent's
# MonkeyPatchWarning.
from gevent import monkey

monkey.patch_all()

import gevent

import requests

def fetch_async(method, url, req_kwargs):
    """Send one HTTP request and print the final URL and raw body.

    method: HTTP verb passed to requests.request
    url: target URL
    req_kwargs: dict of extra keyword arguments forwarded to requests.request
    """
    print(method, url, req_kwargs)
    resp = requests.request(method=method, url=url, **req_kwargs)
    print(resp.url, resp.content)

# ##### Send the requests #####

# One greenlet per URL; joinall() blocks until all of them have finished.
greenlets = [
    gevent.spawn(fetch_async, method='get', url=target, req_kwargs={})
    for target in ('https://www.python.org/', 'https://www.yahoo.com/', 'https://github.com/')
]
gevent.joinall(greenlets)

# ##### 发送请求（协程池控制最大协程数量） #####

# from gevent.pool import Pool

# pool = Pool(None)

# gevent.joinall([

#     pool.spawn(fetch_async, method='get', url='https://www.python.org/', req_kwargs={}),

#     pool.spawn(fetch_async, method='get', url='https://www.yahoo.com/', req_kwargs={}),

#     pool.spawn(fetch_async, method='get', url='https://www.github.com/', req_kwargs={}),

# ])

gevent + requests

import grequests

# Three prepared requests; nothing in this block sends them.
delayed_request = grequests.get('http://httpbin.org/delay/1', timeout=0.001)
fake_domain_request = grequests.get('http://fakedomain/')
status_500_request = grequests.get('http://httpbin.org/status/500')

request_list = [delayed_request, fake_domain_request, status_500_request]

# ##### 执行并获取响应列表 #####

# response_list = grequests.map(request_list)

# print(response_list)

# ##### 执行并获取响应列表（处理异常） #####

# def exception_handler(request, exception):

#     print(request,exception)

#     print("Request failed")

# response_list = grequests.map(request_list, exception_handler=exception_handler)

# print(response_list)

grequests

from twisted.web.client import getPage

from twisted.internet import reactor

REV_COUNTER = 0  # number of responses received so far
REQ_COUNTER = 0  # number of requests issued


def callback(contents):
    """Print one downloaded page and stop the reactor once every request has been answered."""
    global REV_COUNTER
    print(contents,)
    REV_COUNTER += 1
    if REV_COUNTER != REQ_COUNTER:
        # Still waiting for other responses.
        return
    reactor.stop()

url_list = ['http://www.bing.com', 'http://www.baidu.com', ]

# The callback compares the number of received responses against this total.
REQ_COUNTER = len(url_list)

for url in url_list:
    print(url)
    deferred = getPage(bytes(url, encoding='utf8'))
    # addBoth (not addCallback): a failed request must be counted as well,
    # otherwise a single error means the counter never reaches REQ_COUNTER and
    # reactor.run() below never returns.
    deferred.addBoth(callback)

reactor.run()

twisted-示例1

from twisted.internet import reactor

from twisted.web.client import getPage

import urllib.parse

def one_done(arg):

    """Print ``arg`` (whatever the deferred delivered) and stop the reactor."""

    print(arg)

    reactor.stop()

# Form-encode the body and convert it to bytes for getPage.
form_fields = {'check_data': 'adf'}
post_data = urllib.parse.urlencode(form_fields).encode('utf8')

headers = {b'Content-Type': b'application/x-www-form-urlencoded'}

response = getPage(
    'http://dig.chouti.com/login'.encode('utf8'),
    method='POST'.encode('utf8'),
    postdata=post_data,
    cookies={},
    headers=headers,
)

# addBoth: one_done runs on success and on failure alike.
response.addBoth(one_done)

reactor.run()

twisted-示例2

from tornado.httpclient import AsyncHTTPClient

from tornado.httpclient import HTTPRequest

from tornado import ioloop

def handle_response(response):
    """
    Handle a fetched response by printing its body, or its error if one is set.

    A counter must be maintained to stop the IO loop, by calling
    ioloop.IOLoop.current().stop(); this function does not do so.

    :param response: object exposing ``error`` and ``body`` attributes
    :return: None
    """
    if not response.error:
        print(response.body)
        return
    print("Error:", response.error)

def func():
    """Print each URL and start an asynchronous fetch for it, passing handle_response as the callback."""
    for target in ('http://www.baidu.com', 'http://www.bing.com'):
        print(target)
        AsyncHTTPClient().fetch(HTTPRequest(target), handle_response)

# Schedule func on the current IOLoop, then start that loop.
io_loop = ioloop.IOLoop.current()
io_loop.add_callback(func)
io_loop.start()

tornado

以上均是Python内置以及第三方模块提供的异步IO请求模块，使用简便，可大大提高效率；而异步IO请求的本质则是【非阻塞Socket】+【IO多路复用】：

import select

import socket

import time

class AsyncTimeoutException(TimeoutError):
    """Request-timeout error; the message is also kept on ``self.msg``."""

    def __init__(self, msg):
        super().__init__(msg)
        self.msg = msg

class HttpContext(object):
    """Bundle the socket, request parameters and response buffer of one HTTP request."""

    def __init__(self, sock, host, port, method, url, data, callback, timeout=5):
        """
        sock: client socket object used for the request
        host: host name to request
        port: port to request
        method: HTTP method
        url: URL (path) to request
        data: data placed in the request body
        callback: called as callback(context, response, exc) when the request finishes
        timeout: timeout of the request in seconds
        """
        self.sock = sock
        self.callback = callback
        self.host = host
        self.port = port
        self.method = method
        self.url = url
        self.data = data
        self.timeout = timeout
        # The timeout is measured from the moment the context is created.
        self.__start_time = time.time()
        self.__buffer = []

    def is_timeout(self):
        """Return True if more than ``timeout`` seconds have passed since creation, else False."""
        # Always a real bool (previously the not-timed-out case fell through to None).
        return (self.__start_time + self.timeout) < time.time()

    def fileno(self):
        """Return the socket's file descriptor so the context itself can be passed to select()."""
        return self.sock.fileno()

    def write(self, data):
        """Append a chunk of response bytes to the buffer."""
        self.__buffer.append(data)

    def finish(self, exc=None):
        """Invoke the callback: with the joined response on success, or with None and ``exc`` on failure."""
        if not exc:
            response = b''.join(self.__buffer)
            self.callback(self, response, exc)
        else:
            self.callback(self, None, exc)

    def send_request_data(self):
        """Build the raw HTTP/1.0 request (request line, Host header, blank line, body) as UTF-8 bytes."""
        content = """%s %s HTTP/1.0\r\nHost: %s\r\n\r\n%s""" % (
            self.method.upper(), self.url, self.host, self.data,)
        return content.encode(encoding='utf8')

class AsyncRequest(object):
    """Minimal event loop that drives several non-blocking HTTP requests with select()."""

    def __init__(self):
        # Contexts still in flight; polled for readability.
        self.fds = []
        # Contexts whose request has not been sent yet; polled for writability.
        self.connections = []

    def add_request(self, host, port, method, url, data, callback, timeout=5):
        """Create a non-blocking socket, start connecting and register the request.

        host/port: address to connect to
        method, url, data: request line and body handed to HttpContext
        callback: called as callback(context, response, exc) when the request ends
        timeout: seconds, handed to HttpContext (default 5, matching HttpContext)
        """
        client = socket.socket()
        client.setblocking(False)
        try:
            client.connect((host, port))
        except BlockingIOError:
            # Expected for a non-blocking connect: the connection attempt is
            # under way and select() reports the socket writable when it is done.
            pass
        req = HttpContext(client, host, port, method, url, data, callback, timeout)
        self.connections.append(req)
        self.fds.append(req)

    def _complete(self, context, exc=None):
        """Stop tracking ``context``, run its callback, then close its socket."""
        # Remove from whichever lists still hold it, so a later select() never
        # sees a closed descriptor and no remove() hits a missing element.
        for registry in (self.fds, self.connections):
            if context in registry:
                registry.remove(context)
        try:
            context.finish(exc)
        finally:
            context.sock.close()

    def _drain(self, context):
        """Read everything currently available; finish the request on EOF or on a socket error."""
        sock = context.sock
        while True:
            try:
                data = sock.recv(8096)
            except BlockingIOError:
                # Nothing more to read for now; wait for the next select() round.
                return
            except OSError as exc:
                # Covers TimeoutError as well as refused/reset connections.
                self._complete(context, exc)
                return
            if not data:
                # Empty read: the peer closed the connection, response complete.
                self._complete(context)
                return
            context.write(data)

    def _send(self, context):
        """Send the request of a context whose socket became writable."""
        try:
            context.sock.sendall(context.send_request_data())
        except OSError as exc:
            self._complete(context, exc)
            return
        # Request sent: no need to poll for writability any longer.
        self.connections.remove(context)

    def check_conn_timeout(self):
        """Abort every request in ``connections`` (request not yet sent) whose timeout has expired."""
        timed_out = [context for context in self.connections if context.is_timeout()]
        for context in timed_out:
            self._complete(context, AsyncTimeoutException('请求超时'))

    def running(self):
        """Event loop: poll the sockets and read/send as they become ready, until no request is left."""
        # Checking before select() avoids polling with empty lists.
        while self.fds:
            r, w, _ = select.select(self.fds, self.connections, self.fds, 0.05)
            for context in r:
                self._drain(context)
            for context in w:
                # Skip contexts already completed during the read phase above.
                if context in self.fds:
                    self._send(context)
            self.check_conn_timeout()

if __name__ == '__main__':
    def callback_func(context, response, ex):
        """
        Print the outcome of one request.

        :param context: HttpContext object wrapping the request information
        :param response: response content of the request
        :param ex: the exception object if one occurred, otherwise None
        :return:
        """
        print(context, response, ex)

    obj = AsyncRequest()

    # One GET / on port 80 per host, each with a 5 second timeout.
    url_list = [
        {'host': host_name, 'port': 80, 'method': 'GET', 'url': '/', 'data': '', 'timeout': 5,
         'callback': callback_func}
        for host_name in ('www.google.com', 'www.baidu.com', 'www.bing.com')
    ]

    for item in url_list:
        print(item)
        obj.add_request(**item)

    obj.running()