motor 异步存储实战

原创已于 2022-03-25 23:02:58 修改 · 513 阅读

1 ·

本内容遵循CC 4.0 BY-SA版权协议

标签

#mongodb #python

于 2022-03-25 22:59:14 首次发布

学习笔记专栏收录该内容

16 篇文章

订阅专栏

本文介绍了在使用Motor库进行MongoDB操作时，如何通过upsert参数实现数据更新。当upsert设置为True时，若查询条件未匹配到任何记录，MongoDB将自动插入一条新记录。

"""
    1.爬取所有页面书的信息
    2.取出id,构造url,爬取详情页信息
    3.motor异步存储
"""
import aiohttp
import logging
import asyncio
from motor.motor_asyncio import AsyncIOMotorClient
# Value handed to asyncio.Semaphore in scrape_api.
concurrency = 10
# Shared aiohttp session; stays None until main() assigns a ClientSession.
session = None
logging.basicConfig(level=logging.INFO,format='%(levelname)s-%(message)s')
# Database settings
motor_connect_string = 'mongodb://localhost:27017'
# NOTE(review): "momgodb_name" looks like a typo of "mongodb_name"; the name is
# left as-is because it is referenced below when selecting the database.
momgodb_name = 'books'
mongo_collection_name = 'books'

# The client, database and collection handles are created at import time.
client = AsyncIOMotorClient(motor_connect_string )
db = client[momgodb_name]
collection =db[mongo_collection_name]


# Number of index pages requested by main(): pages 1 through page_num.
page_num = 10
async def index_url(page):
    """Fetch one page of the book index.

    The API is queried with a page size of 18, so ``page`` (1-based) maps to
    an offset of ``18 * (page - 1)``.

    Returns whatever ``scrape_api`` returns for the index URL.
    """
    page_url = f'https://spa5.scrape.center/api/book/?limit=18&offset={18*(page-1)}'
    listing = await scrape_api(page_url)
    return listing


# Single semaphore shared by every scrape_api call; see _get_scrape_semaphore.
_scrape_semaphore = None


def _get_scrape_semaphore():
    """Return the semaphore shared by all ``scrape_api`` calls.

    It is created on first use rather than at import time so that it is
    constructed while the event loop that will use it is already running.
    """
    global _scrape_semaphore
    if _scrape_semaphore is None:
        _scrape_semaphore = asyncio.Semaphore(concurrency)
    return _scrape_semaphore


async def scrape_api(url):
    """GET ``url`` with the shared session and return the decoded JSON body.

    At most ``concurrency`` requests are in flight at once. A semaphore only
    limits concurrency when all callers acquire the *same* instance, so the
    shared one is used here instead of building a new semaphore per call.

    Returns:
        The parsed JSON payload, or ``None`` if the request failed with an
        ``aiohttp.ClientError`` or timed out (the error is logged).
    """
    async with _get_scrape_semaphore():
        try:
            logging.info('scraping %s',url)
            async with session.get(url) as resp:
                return await resp.json()
        except (aiohttp.ClientError,asyncio.TimeoutError):
            logging.error('error occurred while scraping %s',url,exc_info=True)
            return None

async def scrape_detail(id):
    """Fetch the detail record for the book ``id`` and pass it to ``save_data``."""
    book = await scrape_api(f'https://spa5.scrape.center/api/book/{id}/')
    await save_data(book)

async def save_data(data):
    """Upsert one scraped record into the collection, keyed by its ``id``.

    The record is always logged first. Falsy input (``None`` or an empty
    dict) is not written, and ``None`` is returned in that case.

    Returns:
        The result of ``collection.update_one`` when ``data`` is truthy,
        otherwise ``None``.
    """
    logging.info('saving data %s',data)
    if not data:
        return None
    selector = {'id':data.get('id')}
    change = {'$set':data}
    # upsert=True: insert the record when no document matches the selector.
    return await collection.update_one(selector,change,upsert=True)
    

async def main():
    """Crawl every index page, then fetch and store each book's detail record.

    Steps:
        1. Open the shared aiohttp session (total timeout of 600 seconds).
        2. Request index pages 1..page_num concurrently and collect book ids.
        3. Request every detail page concurrently; each one is saved by
           ``scrape_detail``.

    The session is closed in a ``finally`` clause so it is released even when
    one of the gathered tasks raises.
    """
    global session
    timeout = aiohttp.ClientTimeout(total=600)
    session = aiohttp.ClientSession(timeout=timeout)
    try:
        index_tasks = [asyncio.ensure_future(index_url(i)) for i in range(1,page_num+1)]
        results = await asyncio.gather(*index_tasks)

        ids = []
        for index_data in results:
            # scrape_api yields None for a failed request; skip those pages.
            if not index_data:
                continue
            # 'results' may be absent or null; treat that as an empty page.
            for item in index_data.get('results') or []:
                book_id = item.get('id')
                # Without an id there is no valid detail URL to request.
                if book_id is not None:
                    ids.append(book_id)

        detail_tasks = [asyncio.ensure_future(scrape_detail(i)) for i in ids]
        await asyncio.gather(*detail_tasks)
    finally:
        await session.close()


if __name__ == '__main__':
    # Run the crawler to completion on the current event loop.
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())

使用MongoDB更新数据时，如果upsert参数为true且查询条件没有匹配到任何文档，MongoDB就会插入一条新记录。