Skip to content

Python: Allow Kernel Functions from Prompt for image and audio content #11403

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Apr 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions python/samples/concepts/audio/audio_from_prompt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Copyright (c) Microsoft. All rights reserved.

import asyncio

from samples.concepts.audio.audio_player import AudioPlayer
from semantic_kernel import Kernel
from semantic_kernel.connectors.ai import PromptExecutionSettings
from semantic_kernel.connectors.ai.open_ai import OpenAITextToAudio
from semantic_kernel.functions import KernelArguments

"""
This simple sample demonstrates how to use the AzureTextToAudio services
with a prompt and prompt rendering.

Resources required for this sample: an OpenAI text-to-speech model (e.g. tts).

Additional dependencies required for this sample:
- pyaudio: run `pip install pyaudio` or `uv pip install pyaudio` if you are using uv.
"""


async def main():
kernel = Kernel()
kernel.add_service(OpenAITextToAudio(service_id="tts"))

result = await kernel.invoke_prompt(
prompt="speak the following phrase: {{$phrase}}",
arguments=KernelArguments(
phrase="a painting of a flower vase",
settings=PromptExecutionSettings(service_id="tts", voice="coral"),
),
)
if result:
AudioPlayer(audio_content=result.value[0]).play()


if __name__ == "__main__":
asyncio.run(main())
49 changes: 49 additions & 0 deletions python/samples/concepts/images/image_gen_prompt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Copyright (c) Microsoft. All rights reserved.

import asyncio
from urllib.request import urlopen

try:
from PIL import Image

pil_available = True
except ImportError:
pil_available = False

from semantic_kernel import Kernel
from semantic_kernel.connectors.ai import PromptExecutionSettings
from semantic_kernel.connectors.ai.open_ai import OpenAITextToImage
from semantic_kernel.functions import KernelArguments

"""
This sample demonstrates how to use the OpenAI text-to-image service to generate an image from a prompt.
It uses the OpenAITextToImage class to create an image based on the provided prompt and settings.
The generated image is then displayed using the PIL library if available.
"""


async def main():
kernel = Kernel()
kernel.add_service(OpenAITextToImage(service_id="dalle3"))

result = await kernel.invoke_prompt(
prompt="Generate a image of {{$topic}} in the style of a {{$style}}",
arguments=KernelArguments(
topic="a flower vase",
style="painting",
settings=PromptExecutionSettings(
service_id="dalle3",
width=1024,
height=1024,
quality="hd",
style="vivid",
),
),
)
if result and pil_available:
img = Image.open(urlopen(str(result.value[0].uri))) # nosec
img.show()


if __name__ == "__main__":
asyncio.run(main())
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
# Copyright (c) Microsoft. All rights reserved.

import logging
from typing import Literal
from typing import Annotated, Literal

from pydantic import Field, model_validator
from pydantic import Field

from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
from semantic_kernel.exceptions.service_exceptions import ServiceInvalidExecutionSettingsError

logger = logging.getLogger(__name__)

Expand All @@ -18,13 +17,6 @@ class OpenAITextToAudioExecutionSettings(PromptExecutionSettings):
input: str | None = Field(
None, description="Do not set this manually. It is set by the service based on the text content."
)
voice: Literal["alloy", "echo", "fable", "onyx", "nova", "shimmer"] = "alloy"
voice: Literal["alloy", "ash", "ballad", "coral", "echo", "fable", "onyx", "nova", "sage", "shimmer"] = "alloy"
response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] | None = None
speed: float | None = None

@model_validator(mode="after")
def validate_speed(self) -> "OpenAITextToAudioExecutionSettings":
"""Validate the speed parameter."""
if self.speed is not None and (self.speed < 0.25 or self.speed > 4.0):
raise ServiceInvalidExecutionSettingsError("Speed must be between 0.25 and 4.0.")
return self
speed: Annotated[float | None, Field(ge=0.25, le=4.0)] = None
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,26 @@ class OpenAITextToImageExecutionSettings(PromptExecutionSettings):
quality: str | None = None
style: str | None = None

@model_validator(mode="before")
@classmethod
def get_size(cls, data: dict[str, Any]) -> dict[str, Any]:
"""Check that the requested image size is valid."""
if isinstance(data, dict):
if "size" not in data and "width" in data and "height" in data:
data["size"] = ImageSize(width=data["width"], height=data["height"])
elif "extension_data" in data:
extension_data = data["extension_data"]
if (
isinstance(extension_data, dict)
and "size" not in extension_data
and "width" in extension_data
and "height" in extension_data
):
data["extension_data"]["size"] = ImageSize(
width=extension_data["width"], height=extension_data["height"]
)
return data

@model_validator(mode="after")
def check_size(self) -> "OpenAITextToImageExecutionSettings":
"""Check that the requested image size is valid."""
Expand All @@ -51,16 +71,6 @@ def check_size(self) -> "OpenAITextToImageExecutionSettings":

return self

@model_validator(mode="after")
def check_prompt(self) -> "OpenAITextToImageExecutionSettings":
"""Check that the prompt is not empty."""
prompt = self.prompt or self.extension_data.get("prompt")

if not prompt:
raise ServiceInvalidExecutionSettingsError("The prompt is required.")

return self

def prepare_settings_dict(self, **kwargs) -> dict[str, Any]:
"""Prepare the settings dictionary for the OpenAI API."""
settings_dict = super().prepare_settings_dict(**kwargs)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright (c) Microsoft. All rights reserved.

from typing import Any
from warnings import warn

from openai.types.images_response import ImagesResponse

Expand All @@ -11,30 +12,55 @@
from semantic_kernel.connectors.ai.open_ai.services.open_ai_handler import OpenAIHandler
from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
from semantic_kernel.connectors.ai.text_to_image_client_base import TextToImageClientBase
from semantic_kernel.exceptions.service_exceptions import ServiceResponseException
from semantic_kernel.exceptions.service_exceptions import ServiceInvalidRequestError, ServiceResponseException


class OpenAITextToImageBase(OpenAIHandler, TextToImageClientBase):
"""OpenAI text to image client."""

async def generate_image(self, description: str, width: int, height: int, **kwargs: Any) -> bytes | str:
async def generate_image(
self,
description: str,
width: int | None = None,
height: int | None = None,
settings: PromptExecutionSettings | None = None,
**kwargs: Any,
) -> bytes | str:
"""Generate image from text.

Args:
description: Description of the image.
width: Width of the image, check the openai documentation for the supported sizes.
height: Height of the image, check the openai documentation for the supported sizes.
width: Deprecated, use settings instead.
height: Deprecated, use settings instead.
settings: Execution settings for the prompt.
kwargs: Additional arguments, check the openai images.generate documentation for the supported arguments.

Returns:
bytes | str: Image bytes or image URL.
"""
settings = OpenAITextToImageExecutionSettings(
prompt=description,
size=ImageSize(width=width, height=height),
ai_model_id=self.ai_model_id,
**kwargs,
)
if not settings:
settings = OpenAITextToImageExecutionSettings(**kwargs)
if not isinstance(settings, OpenAITextToImageExecutionSettings):
settings = OpenAITextToImageExecutionSettings.from_prompt_execution_settings(settings)
if width:
warn("The 'width' argument is deprecated. Use 'settings.size' instead.", DeprecationWarning)
if settings.size and not settings.size.width:
settings.size.width = width
if height:
warn("The 'height' argument is deprecated. Use 'settings.size' instead.", DeprecationWarning)
if settings.size and not settings.size.height:
settings.size.height = height
if not settings.size and width and height:
settings.size = ImageSize(width=width, height=height)

if not settings.prompt:
settings.prompt = description

if not settings.prompt:
raise ServiceInvalidRequestError("Prompt is required.")

if not settings.ai_model_id:
settings.ai_model_id = self.ai_model_id

response = await self._send_request(settings)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def __init__(self, service_id: str | None = None, **kwargs: Any):
@property
def keys(self):
"""Get the keys of the prompt execution settings."""
return self.model_fields.keys()
return self.__class__.model_fields.keys()

def prepare_settings_dict(self, **kwargs) -> dict[str, Any]:
"""Prepare the settings as a dictionary for sending to the AI service.
Expand All @@ -86,7 +86,7 @@ def prepare_settings_dict(self, **kwargs) -> dict[str, Any]:
by_alias=True,
)

def update_from_prompt_execution_settings(self, config: _T) -> None:
def update_from_prompt_execution_settings(self, config: "PromptExecutionSettings") -> None:
"""Update the prompt execution settings from a completion config."""
if config.service_id is not None:
self.service_id = config.service_id
Expand All @@ -95,7 +95,7 @@ def update_from_prompt_execution_settings(self, config: _T) -> None:
self.unpack_extension_data()

@classmethod
def from_prompt_execution_settings(cls: type[_T], config: _T) -> _T:
def from_prompt_execution_settings(cls: type[_T], config: "PromptExecutionSettings") -> _T:
"""Create a prompt execution settings from a completion config."""
config.pack_extension_data()
return cls(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,54 @@
from abc import ABC, abstractmethod
from typing import Any

from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
from semantic_kernel.contents.image_content import ImageContent
from semantic_kernel.services.ai_service_client_base import AIServiceClientBase


class TextToImageClientBase(AIServiceClientBase, ABC):
"""Base class for text to image client."""

@abstractmethod
async def generate_image(self, description: str, width: int, height: int, **kwargs: Any) -> bytes | str:
async def generate_image(
self,
description: str,
width: int | None = None,
height: int | None = None,
settings: PromptExecutionSettings | None = None,
**kwargs: Any,
) -> bytes | str:
"""Generate image from text.

Args:
description: Description of the image.
width: Width of the image.
height: Height of the image.
width: Deprecated, use settings instead.
height: Deprecated, use settings instead.
settings: Execution settings for the prompt.
kwargs: Additional arguments.

Returns:
bytes | str: Image bytes or image URL.
"""
raise NotImplementedError

async def get_image_content(
self,
description: str,
settings: PromptExecutionSettings,
**kwargs: Any,
) -> ImageContent:
"""Generate an image from prompt and return an ImageContent.

Args:
description: Description of the image.
settings: Execution settings for the prompt.
kwargs: Additional arguments.

Returns:
ImageContent: Image content.
"""
image = await self.generate_image(description=description, settings=settings, **kwargs)
if isinstance(image, str):
return ImageContent(uri=image)
return ImageContent(data=image)
38 changes: 36 additions & 2 deletions python/semantic_kernel/functions/kernel_function_from_prompt.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,13 @@
from semantic_kernel.connectors.ai.chat_completion_client_base import ChatCompletionClientBase
from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
from semantic_kernel.connectors.ai.text_completion_client_base import TextCompletionClientBase
from semantic_kernel.connectors.ai.text_to_audio_client_base import TextToAudioClientBase
from semantic_kernel.connectors.ai.text_to_image_client_base import TextToImageClientBase
from semantic_kernel.const import DEFAULT_SERVICE_NAME
from semantic_kernel.contents.audio_content import AudioContent
from semantic_kernel.contents.chat_history import ChatHistory
from semantic_kernel.contents.chat_message_content import ChatMessageContent
from semantic_kernel.contents.image_content import ImageContent
from semantic_kernel.contents.text_content import TextContent
from semantic_kernel.exceptions import FunctionExecutionException, FunctionInitializationError
from semantic_kernel.exceptions.function_exceptions import PromptRenderingException
Expand Down Expand Up @@ -204,6 +208,34 @@ async def _invoke_internal(self, context: FunctionInvocationContext) -> None:
)
return

if isinstance(prompt_render_result.ai_service, TextToImageClientBase):
try:
images = await prompt_render_result.ai_service.get_image_content(
description=unescape(prompt_render_result.rendered_prompt),
settings=prompt_render_result.execution_settings,
)
except Exception as exc:
raise FunctionExecutionException(f"Error occurred while invoking function {self.name}: {exc}") from exc

context.result = self._create_function_result(
completions=[images], arguments=context.arguments, prompt=prompt_render_result.rendered_prompt
)
return

if isinstance(prompt_render_result.ai_service, TextToAudioClientBase):
try:
audio = await prompt_render_result.ai_service.get_audio_content(
text=unescape(prompt_render_result.rendered_prompt),
settings=prompt_render_result.execution_settings,
)
except Exception as exc:
raise FunctionExecutionException(f"Error occurred while invoking function {self.name}: {exc}") from exc

context.result = self._create_function_result(
completions=[audio], arguments=context.arguments, prompt=prompt_render_result.rendered_prompt
)
return

raise ValueError(f"Service `{type(prompt_render_result.ai_service).__name__}` is not a valid AI service")

async def _invoke_internal_stream(self, context: FunctionInvocationContext) -> None:
Expand Down Expand Up @@ -253,7 +285,9 @@ async def _render_prompt(
if prompt_render_context.rendered_prompt is None:
raise PromptRenderingException("Prompt rendering failed, no rendered prompt was returned.")
selected_service: tuple["AIServiceClientBase", PromptExecutionSettings] = context.kernel.select_ai_service(
function=self, arguments=context.arguments
function=self,
arguments=context.arguments,
type=(TextCompletionClientBase, ChatCompletionClientBase) if prompt_render_context.is_streaming else None,
)
return PromptRenderingResult(
rendered_prompt=prompt_render_context.rendered_prompt,
Expand All @@ -268,7 +302,7 @@ async def _inner_render_prompt(self, context: PromptRenderContext) -> None:

def _create_function_result(
self,
completions: list[ChatMessageContent] | list[TextContent],
completions: list[ChatMessageContent] | list[TextContent] | list[ImageContent] | list[AudioContent],
arguments: KernelArguments,
chat_history: ChatHistory | None = None,
prompt: str | None = None,
Expand Down
Loading
Loading