# python/samples/concepts/local_models/onnx_text_completion.py

# Copyright (c) Microsoft. All rights reserved.


import asyncio

from semantic_kernel.connectors.ai.onnx import OnnxGenAITextCompletion
from semantic_kernel.functions.kernel_arguments import KernelArguments
from semantic_kernel.kernel import Kernel

# This concept sample shows how to use the ONNX connector for text completion
# with a model that runs locally through ONNX.

# Kernel instance that hosts the ONNX text completion service and the chat function.
kernel = Kernel()

# Identifier passed to the connector as `ai_model_id` and reused below to look up
# the prompt execution settings.
# NOTE(review): the settings lookup assumes the connector registers itself under
# this same id when no explicit service id is given — confirm against the connector.
service_id = "phi3"
#############################################
# Prerequisites:
# 1. Download an ONNX model
#    (https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx).
#    - If onnxruntime-genai is used:
#      use the model stored in the /cpu folder.
#    - If onnxruntime-genai-cuda is installed for GPU use:
#      use the model stored in the /cuda folder.
# 2. Set the ONNX_GEN_AI_TEXT_MODEL_FOLDER environment variable
#    to the path of the model folder.
#############################################
# When True, chat() prints the reply chunk by chunk via kernel.invoke_stream;
# when False, it awaits the complete answer via kernel.invoke.
streaming = True

kernel.add_service(OnnxGenAITextCompletion(ai_model_id=service_id))

# Execution settings for the service registered above; attached to the chat function.
settings = kernel.get_prompt_execution_settings_from_service_id(service_id)

# The Phi-3 model uses chat templates to generate responses.
# With the chat template the model understands the context
# and the roles of the conversation better:
# https://huggingface.co/microsoft/Phi-3-mini-4k-instruct#chat-format
# The prompt below wraps the `user_input` argument in the user/assistant markers.
chat_function = kernel.add_function(
    plugin_name="ChatBot",
    function_name="Chat",
    prompt="<|user|>{{$user_input}}<|end|><|assistant|>",
    template_format="semantic-kernel",
    prompt_execution_settings=settings,
)


async def chat() -> bool:
    """Run a single turn of the console chat.

    Reads one line from standard input, invokes the module-level
    ``chat_function`` through ``kernel`` with that line as ``user_input`` and
    prints the reply. Depending on the module-level ``streaming`` flag the
    reply is printed chunk by chunk or as one complete answer.

    Returns:
        ``False`` when the session should end (the user typed ``exit``,
        pressed Ctrl+C, or stdin reached end-of-file); ``True`` after a reply
        has been printed.
    """
    try:
        user_input = input("User:> ")
    except (KeyboardInterrupt, EOFError):
        # Ctrl+C and a closed stdin both end the session the same way.
        print("\n\nExiting chat...")
        return False

    if user_input == "exit":
        print("\n\nExiting chat...")
        return False

    arguments = KernelArguments(user_input=user_input)
    if streaming:
        # Terminals are usually line-buffered: without an explicit flush the
        # partial output (no trailing newline) would stay in the buffer and
        # only show up at the end, defeating the purpose of streaming.
        print("Mosscap:> ", end="", flush=True)
        async for chunk in kernel.invoke_stream(chat_function, arguments):
            print(chunk[0].text, end="", flush=True)
        print("\n")
    else:
        answer = await kernel.invoke(chat_function, arguments)
        print(f"Mosscap:> {answer}")
    return True


async def main() -> None:
    """Keep running chat turns until ``chat()`` reports the session is over."""
    # Each awaited call handles one user turn; a falsy result ends the loop.
    while await chat():
        pass

# Start the interactive chat loop only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())