"""
1.爬取所有页面书的信息
2.取出id,构造url,爬取详情页信息
3.motor异步存储
"""
import aiohttp
import logging
import asyncio
from motor.motor_asyncio import AsyncIOMotorClient
# --- Crawler settings -------------------------------------------------------
concurrency = 10  # value handed to asyncio.Semaphore in scrape_api()
page_num = 10     # index pages 1..page_num are requested by main()
session = None    # shared aiohttp session; assigned inside main()

logging.basicConfig(level=logging.INFO, format='%(levelname)s-%(message)s')

# --- MongoDB (Motor) settings -----------------------------------------------
motor_connect_string = 'mongodb://localhost:27017'
momgodb_name = 'books'  # (sic) spelling kept: it is a module-level name
mongo_collection_name = 'books'

# Client, database and collection handles used by save_data().
client = AsyncIOMotorClient(motor_connect_string)
db = client[momgodb_name]
collection = db[mongo_collection_name]
async def index_url(page, limit=18):
    """Fetch one page of the book index API.

    Args:
        page: 1-based page number; page 1 maps to offset 0.
        limit: Number of books requested per page. It is also the stride
            used to compute the offset. Defaults to 18, the value that was
            previously hard-coded.

    Returns:
        Whatever ``scrape_api`` returns for the index URL.
    """
    offset = limit * (page - 1)
    url = f'https://spa5.scrape.center/api/book/?limit={limit}&offset={offset}'
    return await scrape_api(url)
# Semaphore shared by every scrape_api() call; created lazily on first use.
_scrape_semaphore = None


async def scrape_api(url):
    """GET ``url`` through the shared session and return the decoded JSON.

    At most ``concurrency`` requests are in flight at once. The semaphore
    must be a single shared object: constructing a new one per call (as the
    previous version did) never blocks anybody and therefore limits nothing.

    Args:
        url: Absolute URL of a JSON endpoint.

    Returns:
        The parsed JSON body, or ``None`` if the request failed. Failures
        (client errors, non-2xx status, timeouts) are logged, not raised.
    """
    global _scrape_semaphore
    if _scrape_semaphore is None:
        # Built on first call, i.e. while the event loop is already running,
        # so it is never tied to a different loop. There is no await between
        # the check and the assignment, so concurrent tasks cannot race here.
        _scrape_semaphore = asyncio.Semaphore(concurrency)

    async with _scrape_semaphore:
        try:
            logging.info('scraping %s', url)
            async with session.get(url) as resp:
                # Turn HTTP error statuses into ClientResponseError (a
                # ClientError subclass) so that an error payload is not
                # handed back to callers as if it were real data.
                resp.raise_for_status()
                return await resp.json()
        except (aiohttp.ClientError, asyncio.TimeoutError):
            logging.error('error occurred while scraping %s', url, exc_info=True)
            return None
async def scrape_detail(id):
    """Fetch the detail record of one book and pass it on to ``save_data``.

    Args:
        id: Book identifier inserted into the detail API URL. The name
            shadows the builtin but is kept because it is part of the
            function's signature.
    """
    payload = await scrape_api(f'https://spa5.scrape.center/api/book/{id}/')
    await save_data(payload)
async def save_data(data):
    """Upsert one book record into the MongoDB collection, keyed by ``id``.

    Args:
        data: Mapping describing one book, or a falsy value when scraping
            failed.

    Returns:
        The result of ``collection.update_one`` when a write was issued,
        otherwise ``None`` (nothing to save, or the record has no ``id``).
    """
    if not data:
        logging.info('no data to save, skipping')
        return None

    book_id = data.get('id')
    if book_id is None:
        # Upserting with {'id': None} would make every id-less payload
        # overwrite the same single document, so refuse to write it.
        logging.warning('skipping record without id: %s', data)
        return None

    logging.info('saving data %s', data)
    # upsert=True: insert a new document when no existing one matches the id.
    return await collection.update_one(
        {'id': book_id}, {'$set': data}, upsert=True
    )
async def main():
    """Scrape all index pages, then scrape and store every listed book.

    Steps:
        1. Open the shared aiohttp session (published via the ``session``
           global, which ``scrape_api`` reads).
        2. Fetch index pages 1..``page_num`` concurrently.
        3. Collect the book ids found in the index results.
        4. Fetch and save each book's detail record concurrently.

    The session is closed even if one of the steps raises.
    """
    global session
    timeout = aiohttp.ClientTimeout(total=600)
    session = aiohttp.ClientSession(timeout=timeout)
    try:
        # gather() schedules bare coroutines itself; no ensure_future needed.
        index_pages = await asyncio.gather(
            *(index_url(page) for page in range(1, page_num + 1))
        )

        ids = []
        for index_data in index_pages:
            if not index_data:
                # This page failed to download; scrape_api already logged it.
                continue
            # Tolerate a payload without a 'results' key instead of raising
            # TypeError while iterating None.
            for item in index_data.get('results') or []:
                book_id = item.get('id')
                if book_id is not None:
                    ids.append(book_id)

        await asyncio.gather(*(scrape_detail(book_id) for book_id in ids))
    finally:
        # Always release the connection pool, including on errors.
        await session.close()
if __name__ == '__main__':
    # NOTE(review): asyncio.run() would create a fresh event loop; the
    # module-level Motor client may already be bound to the default loop on
    # older Motor versions -- confirm before switching to asyncio.run().
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
# 使用 MongoDB 更新数据时,如果 upsert 参数为 True 且查询条件没有匹配到任何文档,MongoDB 就会插入一条新记录。
# 本文介绍了在使用 Motor 库进行 MongoDB 操作时,如何通过 upsert 参数实现数据更新。当 upsert 设置为 True 时,若查询条件未匹配到记录,Motor 将自动插入新记录。

962

被折叠的 条评论
为什么被折叠?



