python(字节流、buffer、多进程)

原创已于 2023-07-01 10:51:21 修改 · 2.3k 阅读

10 ·

本内容遵循CC 4.0 BY-SA版权协议

标签

#python

于 2020-09-07 15:29:55 首次发布

python编码专栏收录该内容

6 篇文章

订阅专栏

二进制、字节流

PIL.Image.Image 转二进制流

import io
import base64

def pil_image_byte(image):
    """Serialize an image to PNG and return the PNG data as base64 text.

    Args:
        image: Object exposing ``save(fp, format=...)``; the snippet is
            written for ``PIL.Image.Image``.

    Returns:
        str: The PNG bytes, base64-encoded and decoded to a UTF-8 string.
    """
    # Write the PNG into an in-memory buffer instead of a file on disk.
    img_byte = io.BytesIO()
    image.save(img_byte, format='PNG')
    # getvalue() returns the full buffer regardless of the stream position.
    image_data = img_byte.getvalue()
    image_base64 = base64.b64encode(image_data).decode("utf-8")
    # To turn the string back into an OpenCV image (requires cv2 and numpy):
    #   cv2.imdecode(np.frombuffer(base64.b64decode(image_base64), np.uint8),
    #                cv2.IMREAD_COLOR)
    return image_base64

def byte2image(byte_data):
    """Open an image from its encoded bytes.

    Args:
        byte_data: Encoded image file contents as a bytes-like object.

    Returns:
        The object returned by ``Image.open``.

    Note:
        ``Image`` is presumably ``PIL.Image``; the import is not shown in
        this snippet and must be in scope - TODO confirm.
    """
    # Wrap the bytes in a file-like object so Image.open can read from memory.
    image = Image.open(io.BytesIO(byte_data))
    return image

二进制流处理pdf

from urllib import parse, request
import io
from pdfplumber.pdf import PDF

# Sample document URL; the path contains spaces and non-ASCII characters.
img_url = 'http://apis.v30.edge.customs.dev.amiintellect.com/api/customs/core/storage/preview?id=/uploads/amiintellect-customs/dec/2020/3Q/9/7823107e-7d90-4200-814a-a29d63ed9cbf/gbdw/BG2020092500013/invoice_file/48852核注清单 - 副本.pdf'
# Percent-encode the URL while leaving the delimiters listed in `safe` intact.
image_path = parse.quote(img_url, encoding='utf8', safe='/:?=&')

# The with-block closes the HTTP response once the body has been read.
with request.urlopen(image_path) as resp:
    buffer = resp.read()

# Wrap the downloaded bytes in an in-memory binary stream for PDF().
stream = io.BytesIO(buffer)
pdf = PDF(stream)
for i, page in enumerate(pdf.pages):
    pdf_size = (page.height, page.width)
    page_words = page.extract_words()
    page_tables = page.find_tables()
    # Use a distinct name so the page index `i` is not visually shadowed.
    print([word['text'] for word in page_words])

base64编码与解码

import cv2
import numpy as np
import base64

def np_img2base64(np_img):
    """Encode an image array as PNG and return the base64 of the PNG bytes.

    Args:
        np_img: Image array accepted by ``cv2.imencode``.

    Returns:
        bytes: Base64-encoded PNG data.

    Raises:
        ValueError: If ``cv2.imencode`` reports that encoding failed.
    """
    success, encoded = cv2.imencode('.png', np_img)
    if not success:
        raise ValueError("cv2.imencode failed to encode the image as PNG")
    # Depending on the OpenCV build the encoded buffer is shaped (N, 1) or
    # (N,); tobytes() works for both, whereas squeezing axis 1 raises on the
    # flat form.
    image_code = base64.b64encode(encoded.tobytes())
    return image_code
    
def base642np_image(base64_str):
    """Decode a base64-encoded image file into an OpenCV image array.

    Args:
        base64_str: Base64 data as ``bytes`` or ``str``; trailing ``=``
            padding may be missing.

    Returns:
        The value returned by ``cv2.imdecode`` for the decoded bytes.
    """
    # Accept text input too; appending b'=' to a str would raise TypeError.
    if isinstance(base64_str, str):
        base64_str = base64_str.encode('ascii')
    # -len % 4 is 0 for input that is already a multiple of 4, so aligned
    # data is left untouched (4 - len % 4 would have appended four '=').
    missing_padding = -len(base64_str) % 4
    if missing_padding:
        base64_str += b'=' * missing_padding
    raw_str = base64.b64decode(base64_str)
    # frombuffer replaces the deprecated binary mode of np.fromstring.
    np_img = np.frombuffer(raw_str, dtype=np.uint8)
    # imdecode expects an IMREAD_* flag. IMREAD_ANYCOLOR has the same numeric
    # value as the colour-conversion code COLOR_RGB2BGR that was passed here
    # before, so decoding behaviour is unchanged.
    img = cv2.imdecode(np_img, cv2.IMREAD_ANYCOLOR)
    return img

buffer 与 stream

PDF buffer转stream（二进制转stream）

import io
# Read the whole PDF as bytes; the with-block closes the file handle.
with open('G://1.pdf', 'rb') as pdf_file:
    buffer = pdf_file.read()
# Wrap the raw bytes in an in-memory binary stream.
stream = io.BytesIO(buffer)

图片 buffer转stream

import numpy as np
import cv2
# Read the encoded image file as bytes; the with-block closes the handle.
with open('G://1.png', 'rb') as image_file:
    buffer = image_file.read()
# View the bytes as a uint8 array and let OpenCV decode it as a colour image.
img = cv2.imdecode(np.frombuffer(buffer, np.uint8), cv2.IMREAD_COLOR)

excel buffer处理

import pandas as pd

# `resp` and `suffix` come from surrounding code that this snippet does not
# show.
buffer = resp.body.buffer
# A 'txt' suffix is parsed with read_table; any other suffix is parsed as an
# Excel workbook. Neither treats the first row as a header.
excel_data = (
    pd.read_table(buffer, header=None)
    if suffix == 'txt'
    else pd.read_excel(buffer, header=None, keep_default_na=True)
)

word buffer处理

import io
import docx
# `resp` comes from surrounding code that this snippet does not show.
buffer = resp.body.buffer
# Wrap the bytes in an in-memory stream and hand it to docx.Document.
wordFile = io.BytesIO(buffer)
file = docx.Document(wordFile)
# Collect the text of every paragraph, then print the list in one go.
paragraph_texts = [paragraph.text for paragraph in file.paragraphs]
print(paragraph_texts)

html buffer处理

from bs4 import BeautifulSoup
# `resp` comes from surrounding code that this snippet does not show.
buffer = resp.body.buffer
# Parse the markup with the parser named 'html.parser'.
soup = BeautifulSoup(buffer, features='html.parser')

numpy转bytes

# The NumPy image has to be converted to bytes before it can be base64-encoded.
# `img` is assumed to be an np.ndarray that already exists in scope.
import io
import base64

import cv2

# np.ndarray has no .save() method, so OpenCV produces the PNG bytes instead.
# NOTE(review): cv2.imencode treats 3-channel input as BGR - confirm the
# channel order of `img`.
success, encoded = cv2.imencode('.png', img)
if not success:
    raise ValueError("cv2.imencode failed to encode the image as PNG")
img_byte = io.BytesIO()  # start with an empty in-memory byte stream
# Store the PNG data in the byte stream.
img_byte.write(encoded.tobytes())
# getvalue() ignores the stream position and returns the entire contents,
# turning the io stream into plain bytes.
image_data = img_byte.getvalue()
image_base64 = base64.b64encode(image_data).decode("utf-8")

多进程

多进程代码需要放在 if __name__ == '__main__': 保护块中运行

import time
import os
from glob import glob
from multiprocessing import Pool



def extract_frame(video_path):
    """Print the given path, pause for five seconds and report success.

    Args:
        video_path: Value to print; the caller passes file paths.

    Returns:
        bool: Always ``True``.
    """
    pause_seconds = 5
    print(video_path)
    time.sleep(pause_seconds)
    return True


if __name__ == '__main__':
    # Glob pattern matching every entry two levels below the video directory.
    file_path = 'E:/tumi/material/video/*/*'
    # The context manager shuts the worker processes down when the block
    # exits, so the pool is no longer leaked.
    with Pool(3) as pool:
        for _ in range(3):
            # Re-scan on every pass and order the files by base name.
            file_list = sorted(glob(file_path), key=os.path.basename)
            # map() blocks until every file in the list has been processed.
            pool.map(extract_frame, file_list)