二进制、字节流
- PIL.Image.Image 转二进制流
import io
import base64
def pil_image_byte(image):
    """Serialize an image object to PNG-encoded bytes.

    The previous version built the bytes but never returned them, so callers
    always received ``None``.

    Args:
        image: Object exposing ``save(fp, format=...)``; the section heading
            indicates a ``PIL.Image.Image``.

    Returns:
        bytes: Everything ``image.save`` wrote to the in-memory buffer. Apply
        ``base64.b64encode(result).decode("utf-8")`` for a base64 string.
    """
    # Encode into memory instead of going through a temporary file on disk.
    with io.BytesIO() as img_byte:
        image.save(img_byte, format='PNG')
        return img_byte.getvalue()
def byte2image(byte_data):
    """Open encoded image bytes as an image object.

    Args:
        byte_data: Bytes-like object holding the contents of an image file.

    Returns:
        Whatever ``Image.open`` returns for an in-memory stream over the data.

    NOTE(review): ``Image`` is not imported in the visible code; presumably
    ``from PIL import Image`` is expected -- confirm.
    """
    # Image.open wants a file-like object, so wrap the raw bytes first.
    stream = io.BytesIO(byte_data)
    return Image.open(stream)
- 二进制流处理pdf
from urllib import parse, request
import io
from pdfplumber.pdf import PDF
# Remote PDF to fetch; the path contains non-ASCII characters and spaces,
# so it is percent-encoded before the request is made.
img_url = 'http://apis.v30.edge.customs.dev.amiintellect.com/api/customs/core/storage/preview?id=/uploads/amiintellect-customs/dec/2020/3Q/9/7823107e-7d90-4200-814a-a29d63ed9cbf/gbdw/BG2020092500013/invoice_file/48852核注清单 - 副本.pdf'
# Leave the URL delimiters untouched and escape everything else as UTF-8.
image_path = parse.quote(img_url, encoding='utf8', safe='/:?=&')
# Close the HTTP response as soon as the body has been read.
with request.urlopen(image_path) as resp:
    buffer = bytearray(resp.read())
# Wrap the downloaded bytes in a seekable in-memory stream for the parser.
stream = io.BytesIO(buffer)
pdf = PDF(stream)
for i, page in enumerate(pdf.pages):
    pdf_size = (page.height, page.width)
    page_words = page.extract_words()
    page_tables = page.find_tables()
    # A distinct comprehension variable keeps the page index ``i`` readable.
    print([word['text'] for word in page_words])
- base64编码与解码
import cv2
import numpy as np
import base64
def np_img2base64(np_img):
    """Encode an image array as the base64 form of its PNG representation.

    Args:
        np_img: Image array accepted by ``cv2.imencode``.

    Returns:
        bytes: Base64-encoded PNG data.

    Raises:
        ValueError: If ``cv2.imencode`` reports that encoding failed.
    """
    success, encoded = cv2.imencode('.png', np_img)
    if not success:
        raise ValueError("cv2.imencode failed to encode the image as PNG")
    # tobytes() serialises the buffer whatever its shape. The previous
    # np.squeeze(image, 1) needs a second axis and raises when the encoder
    # hands back a 1-D array (the shape appears to vary by OpenCV version).
    return base64.b64encode(encoded.tobytes())
def base642np_image(base64_str):
    """Decode a base64-encoded image into an OpenCV image array.

    Args:
        base64_str: Base64 payload as ``bytes`` or ``str``; missing trailing
            ``=`` padding is tolerated.

    Returns:
        The result of ``cv2.imdecode`` with ``cv2.IMREAD_COLOR`` for the
        decoded bytes.
    """
    # Accept text input too; appending b'=' to a str would raise TypeError.
    if isinstance(base64_str, str):
        base64_str = base64_str.encode("ascii")
    # -len % 4 is 0 for already-aligned input, whereas 4 - len % 4 gave 4
    # and appended a spurious "====".
    missing_padding = -len(base64_str) % 4
    if missing_padding:
        base64_str += b'=' * missing_padding
    raw_bytes = base64.b64decode(base64_str)
    # np.frombuffer replaces np.fromstring, which is deprecated for binary data.
    np_img = np.frombuffer(raw_bytes, dtype=np.uint8)
    # imdecode takes an IMREAD_* flag; COLOR_RGB2BGR is a cvtColor code that
    # was only being interpreted as one by numeric coincidence.
    img = cv2.imdecode(np_img, cv2.IMREAD_COLOR)
    return img
buffer 与 stream
- PDF buffer转stream(二进制转stream)
import io
# Read the whole file and release the handle right away instead of leaving
# it open until garbage collection.
with open('G://1.pdf', 'rb') as pdf_file:
    buffer = pdf_file.read()
# Seekable, file-like view over the bytes held in memory.
stream = io.BytesIO(buffer)
- 图片 buffer转stream
import numpy as np
import cv2
# Read the encoded image and close the file handle immediately.
with open('G://1.png', 'rb') as image_file:
    buffer = image_file.read()
# View the bytes as a uint8 array and let OpenCV decode them in colour mode.
img = cv2.imdecode(np.frombuffer(buffer, np.uint8), cv2.IMREAD_COLOR)
- excel buffer处理
import pandas as pd
# Payload of the response; ``resp`` and ``suffix`` come from surrounding code
# that is not shown here.
buffer = resp.body.buffer
# 'txt' payloads go through the delimited-text reader; every other suffix is
# handed to the Excel reader. Neither treats the first row as a header.
excel_data = (
    pd.read_table(buffer, header=None)
    if suffix == 'txt'
    else pd.read_excel(buffer, header=None, keep_default_na=True)
)
- word buffer处理
import io
import docx
# Payload of the response; ``resp`` comes from surrounding code not shown here.
buffer = resp.body.buffer
# Expose the payload as a file-like object before handing it to docx.
wordFile = io.BytesIO(buffer)
file = docx.Document(wordFile)
# Gather the text of every paragraph, then print the list in one go.
paragraph_texts = [para.text for para in file.paragraphs]
print(paragraph_texts)
- html buffer处理
from bs4 import BeautifulSoup
# Payload of the response; ``resp`` comes from surrounding code not shown here.
buffer = resp.body.buffer
# Parse the markup with the parser named 'html.parser'.
soup = BeautifulSoup(markup=buffer, features='html.parser')
- numpy转bytes
import io
import base64
# The heading promises numpy input, but ndarray has no ``save(fp, format=...)``
# method, so arrays are PNG-encoded with OpenCV. Any other object keeps the
# original ``save``-based path.
if isinstance(img, np.ndarray):
    success, encoded = cv2.imencode('.png', img)
    if not success:
        raise ValueError("cv2.imencode failed to encode the image as PNG")
    image_data = encoded.tobytes()
    # Keep ``img_byte`` available as a stream over the same PNG bytes.
    img_byte = io.BytesIO(image_data)
else:
    img_byte = io.BytesIO()
    img.save(img_byte, format='PNG')
    image_data = img_byte.getvalue()
# Base64 text form of the PNG bytes.
image_base64 = base64.b64encode(image_data).decode("utf-8")
多进程
- 多进程代码需要放在 `if __name__ == '__main__':` 保护块中运行
import time
import os
from glob import glob
from multiprocessing import Pool
def extract_frame(video_path, delay=5):
    """Placeholder worker: print the path, wait, and report success.

    Args:
        video_path: Value printed to stdout.
        delay: Seconds to sleep before returning. Defaults to 5, the value
            that used to be hard-coded.

    Returns:
        bool: Always ``True``.
    """
    print(video_path)
    # Stands in for the real per-file work.
    time.sleep(delay)
    return True
if __name__ == '__main__':
    # Constants are set once instead of being rebound on every pass.
    dir_ = 'E:/tumi/material/video'
    file_path = 'E:/tumi/material/video/*/*'
    # The context manager shuts the worker processes down on exit; map()
    # blocks until every task is finished, so no work is cut short.
    with Pool(3) as pool:
        for _ in range(3):
            # Re-scan on each pass and order the paths by file name.
            file_list = sorted(glob(file_path), key=os.path.basename)
            pool.map(extract_frame, file_list)