Python调用硅基流动的视觉语言模型

susu1083018911

567人浏览 · 2025-06-03 11:02:17

susu1083018911 · 2025-06-03 11:02:17 发布

参考： https://docs.siliconflow.cn/cn/userguide/capabilities/vision

import base64
import io
import json
import os
from typing import Optional

from openai import OpenAI
from PIL import Image

# Initialise the OpenAI client against the SiliconFlow endpoint.
# The key is taken from the SILICONFLOW_API_KEY environment variable so a real
# secret never has to be written into the source file. The original
# placeholder remains the fallback (also used when the variable is empty), so
# the script behaves exactly as before when the variable is not set.
client = OpenAI(
    api_key=os.environ.get("SILICONFLOW_API_KEY") or "sk-**********",
    base_url="https://api.siliconflow.cn/v1",
)

def convert_image_to_webp_base64(
    input_image_path: str,
    *,
    quality: int = 85,
) -> Optional[str]:
    """Convert a local image file to a Base64-encoded WebP string.

    Args:
        input_image_path: Path of the image file to open.
        quality: Value forwarded as the ``quality`` option when saving the
            image as WebP. Defaults to 85, the value that was previously
            hard-coded.

    Returns:
        The Base64 text of the WebP-encoded image, or ``None`` if opening or
        encoding the image failed. Failures are printed, not raised, so the
        caller must check for ``None``.
    """
    try:
        with Image.open(input_image_path) as img:
            # Encode into an in-memory buffer; nothing is written to disk.
            buffer = io.BytesIO()
            img.save(buffer, format='WEBP', quality=quality)
            return base64.b64encode(buffer.getvalue()).decode('utf-8')
    except Exception as e:
        # Deliberately broad: this is a best-effort helper whose contract is
        # "report the problem and return None" for any failure.
        print(f"图片转换错误: {e}")
        return None

# 1. Convert the local image
input_image_path = "7125e2e3.jpeg"  # replace with the real image path
base64_image = convert_image_to_webp_base64(input_image_path)

if not base64_image:
    print("图片转换失败，请检查路径和格式")
    # Stop with a non-zero status so a shell or calling process can detect the
    # failure. Raising SystemExit also avoids the exit() helper, which is
    # injected by the site module and is intended for interactive sessions.
    raise SystemExit(1)

# 2. Create the streaming request
# The image travels inline as a data URL that declares the WebP MIME type.
image_part = {
    "type": "image_url",
    "image_url": {
        "url": f"data:image/webp;base64,{base64_image}",
        "detail": "high",  # requested image detail level
    },
}
prompt_part = {
    "type": "text",
    "text": "使用ocr识别图片内容并输出",  # replace with your own prompt
}

# A single user turn carrying the image first and the text prompt second.
user_message = {"role": "user", "content": [image_part, prompt_part]}

response = client.chat.completions.create(
    model="Qwen/Qwen2.5-VL-72B-Instruct",
    messages=[user_message],
    stream=True,
    max_tokens=1000,  # limits the length of the reply
)

# 3. Consume the streamed response
print("模型响应：")
response_parts = []
for chunk in response:
    # OpenAI-style streams may contain chunks whose `choices` list is empty
    # (for example a trailing usage-only chunk). Indexing [0] on those would
    # raise IndexError, so skip them.
    if not chunk.choices:
        continue
    text_chunk = chunk.choices[0].delta.content
    if text_chunk:
        # Flush on every piece so the text appears as soon as it arrives.
        print(text_chunk, end='', flush=True)
        response_parts.append(text_chunk)

# Join once at the end instead of growing a string inside the loop.
full_response = "".join(response_parts)

print("\n\n完整响应已接收")