mlc-llm 简介
import set_env
MLC LLM 是专为大型语言模型设计的机器学习编译器和高性能部署引擎。该项目的使命是让每个人都能在自己的平台上原生地开发、优化和部署 AI 模型。
下载模型:
# Upstream alternative (Hugging Face):
# git clone https://huggingface.co/mlc-ai/Llama-3-8B-Instruct-q4f16_1-MLC
# Clone the model repository from the hf-mirror.com mirror into a directory under {temp_dir}.
# NOTE(review): {temp_dir} is a placeholder filled in by the calling environment — confirm where it is defined.
git clone https://hf-mirror.com/mlc-ai/Hermes-3-Llama-3.1-8B-q4f32_1-MLC {temp_dir}/mlc-ai/Hermes-3-Llama-3.1-8B-q4f32_1-MLC
下面是 hello world 的示例:
from mlc_llm import MLCEngine
# Create the engine from the locally cloned model directory.
# Upstream alternative: model = "HF://mlc-ai/Llama-3-8B-Instruct-q4f16_1-MLC"
model = f"{temp_dir}/mlc-ai/Hermes-3-Llama-3.1-8B-q4f32_1-MLC"
engine = MLCEngine(model)

try:
    # Run a streaming chat completion through the OpenAI-style API and
    # print each chunk as soon as it arrives.
    for response in engine.chat.completions.create(
        messages=[{"role": "user", "content": "What is the meaning of life?"}],
        model=model,
        stream=True,
    ):
        for choice in response.choices:
            print(choice.delta.content, end="", flush=True)
    print("\n")
finally:
    # Terminate the engine even when streaming raises, so it is never left
    # running after a failed request.
    engine.terminate()
MLC LLM 也支持通过 AsyncMLCEngine 进行异步操作:
import asyncio
from typing import Dict
from mlc_llm.serve import AsyncMLCEngine
# Upstream alternative: model = "HF://mlc-ai/Llama-3-8B-Instruct-q4f16_1-MLC"
# Location of the locally cloned model directory under temp_dir.
model = f"{temp_dir}/mlc-ai/Hermes-3-Llama-3.1-8B-q4f32_1-MLC"

# User messages to send; each entry becomes its own chat request.
prompts = [
    "Write a three-day travel plan to Pittsburgh.",
    "What is the meaning of life?",
]
async def test_completion():
    """Stream chat completions for every entry in ``prompts`` concurrently.

    Creates one ``AsyncMLCEngine`` for the module-level ``model``, starts one
    asyncio task per prompt, accumulates the streamed text per response id,
    prints every collected output, and terminates the engine.

    The engine is terminated in a ``finally`` clause so it is shut down even
    when one of the generation tasks raises.
    """
    async_engine = AsyncMLCEngine(model=model)
    # Maps each response id to the text streamed for it so far.
    output_texts: Dict[str, str] = {}

    async def generate_task(prompt: str):
        """Stream one completion for *prompt* into ``output_texts``."""
        async for response in await async_engine.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model=model,
            stream=True,
        ):
            # Chunks of one request share a response id; concatenate them.
            output_texts[response.id] = (
                output_texts.get(response.id, "") + response.choices[0].delta.content
            )

    try:
        # One task per prompt so the requests are in flight at the same time.
        tasks = [asyncio.create_task(generate_task(prompt)) for prompt in prompts]
        await asyncio.gather(*tasks)

        # Print output.
        for request_id, output in output_texts.items():
            print(f"Output of request {request_id}:\n{output}\n")
    finally:
        async_engine.terminate()
# In a plain Python script, where no event loop is running yet, use instead:
# asyncio.run(test_completion())
# Top-level await is only valid where the environment supports it
# (e.g. an IPython/Jupyter cell); it is a syntax error in a regular module.
await test_completion()