mlc-llm 简介

mlc-llm 简介

import set_env

MLC LLM 是一个机器学习编译器和高性能部署引擎,专为大型语言模型设计。该项目的使命是让每个人都能在自己的平台上原生地开发、优化和部署 AI 模型。

下载模型:

# Alternative source (a different model, hosted on huggingface.co), kept for reference:
# git clone https://huggingface.co/mlc-ai/Llama-3-8B-Instruct-q4f16_1-MLC
# Clone the Hermes-3 model repository from hf-mirror.com into a local directory.
# NOTE(review): {temp_dir} looks like a placeholder that must be substituted
# with a real directory before this command is run — confirm.
git clone https://hf-mirror.com/mlc-ai/Hermes-3-Llama-3.1-8B-q4f32_1-MLC {temp_dir}/mlc-ai/Hermes-3-Llama-3.1-8B-q4f32_1-MLC

下面是 hello world 的示例:

from mlc_llm import MLCEngine

# Create the engine from a local model directory.
# NOTE(review): temp_dir is not defined in the visible source; presumably it is
# set elsewhere before this runs — confirm.
# Original model address, kept for reference:
# model = "HF://mlc-ai/Llama-3-8B-Instruct-q4f16_1-MLC"
model = f"{temp_dir}/mlc-ai/Hermes-3-Llama-3.1-8B-q4f32_1-MLC"
engine = MLCEngine(model)

try:
    # Run chat completion through the OpenAI-style API, streaming the reply
    # and printing each delta as soon as it arrives.
    for response in engine.chat.completions.create(
        messages=[{"role": "user", "content": "What is the meaning of life?"}],
        model=model,
        stream=True,
    ):
        for choice in response.choices:
            print(choice.delta.content, end="", flush=True)
    print("\n")
finally:
    # Shut the engine down even when streaming raised, so it is never left
    # running after a failure.
    engine.terminate()

也支持异步操作:

import asyncio
from typing import Dict

from mlc_llm.serve import AsyncMLCEngine

# Original model address, kept for reference:
# model = "HF://mlc-ai/Llama-3-8B-Instruct-q4f16_1-MLC"
# NOTE(review): temp_dir is not defined in the visible source; presumably it is
# set elsewhere before this runs — confirm.
model = f"{temp_dir}/mlc-ai/Hermes-3-Llama-3.1-8B-q4f32_1-MLC"
# User messages; test_completion() issues one chat request per entry.
prompts = [
    "Write a three-day travel plan to Pittsburgh.",
    "What is the meaning of life?",
]


async def test_completion():
    """Run one streamed chat completion per prompt concurrently and print the results.

    Reads the module-level ``model`` and ``prompts``. One asyncio task is
    created per prompt; the streamed deltas are accumulated per response id
    and printed once all tasks have finished. The engine is terminated on
    exit, including when a task raises.
    """
    # Create engine
    async_engine = AsyncMLCEngine(model=model)

    # Accumulated streamed text, keyed by response id.
    output_texts: Dict[str, str] = {}

    async def generate_task(prompt: str) -> None:
        """Stream the completion for ``prompt`` and append each delta to ``output_texts``."""
        async for response in await async_engine.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model=model,
            stream=True,
        ):
            if response.id not in output_texts:
                output_texts[response.id] = ""
            output_texts[response.id] += response.choices[0].delta.content

    try:
        # Schedule every request up front so they run concurrently, then wait
        # for all of them.
        tasks = [asyncio.create_task(generate_task(prompt)) for prompt in prompts]
        await asyncio.gather(*tasks)

        # Print output.
        for request_id, output in output_texts.items():
            print(f"Output of request {request_id}:\n{output}\n")
    finally:
        # Terminate the engine even if a request failed, so it is never left
        # running after an error.
        async_engine.terminate()


# Top-level ``await`` is only accepted by interactive environments that
# support it (e.g. IPython/Jupyter). In a plain Python script, use the
# asyncio.run() form below instead:
# asyncio.run(test_completion())
await test_completion()