Skip to content

Commit 385b65e

Browse files
authored
Merge pull request #964 from hchen2020/master
IAudioSynthesis
2 parents 23cbe41 + 2d710d4 commit 385b65e

File tree

35 files changed

+307
-244
lines changed

35 files changed

+307
-244
lines changed

src/Infrastructure/BotSharp.Abstraction/MLTasks/IAudioCompletion.cs

Lines changed: 0 additions & 15 deletions
This file was deleted.
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
namespace BotSharp.Abstraction.MLTasks;
2+
3+
/// <summary>
4+
/// Text to speech synthesis
5+
/// </summary>
6+
public interface IAudioSynthesis
7+
{
8+
/// <summary>
/// Provider name. CompletionProvider.GetAudioSynthesizer compares this value with the
/// requested provider name to pick an implementation.
/// </summary>
string Provider { get; }
9+
10+
/// <summary>
/// Model name exposed by the implementation.
/// NOTE(review): presumably the value last passed to SetModelName; confirm in the implementations.
/// </summary>
string Model { get; }
11+
12+
/// <summary>
/// Sets the model name. CompletionProvider.GetAudioSynthesizer calls this right after
/// resolving the implementation.
/// </summary>
/// <param name="model">Model name to use.</param>
void SetModelName(string model);
13+
14+
/// <summary>
/// Generates audio for the given text.
/// </summary>
/// <param name="text">Text to synthesize.</param>
/// <param name="voice">Voice name; defaults to "alloy".</param>
/// <param name="format">Output audio format; defaults to "mp3".</param>
/// <param name="instructions">Optional extra instructions; how they are used depends on the implementation (not visible here).</param>
/// <returns>The generated audio as <see cref="BinaryData"/>.</returns>
Task<BinaryData> GenerateAudioAsync(string text, string? voice = "alloy", string? format = "mp3", string? instructions = null);
15+
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
using System.IO;
2+
3+
namespace BotSharp.Abstraction.MLTasks;
4+
5+
/// <summary>
6+
/// Audio transcription service
7+
/// </summary>
8+
public interface IAudioTranscription
9+
{
10+
/// <summary>
/// Provider name. CompletionProvider.GetAudioTranscriber compares this value with the
/// requested provider name to pick an implementation.
/// </summary>
string Provider { get; }
11+
12+
/// <summary>
/// Model name exposed by the implementation.
/// NOTE(review): presumably the value last passed to SetModelName; confirm in the implementations.
/// </summary>
string Model { get; }
13+
14+
/// <summary>
/// Transcribes an audio stream to text.
/// </summary>
/// <param name="audio">Audio content to transcribe.</param>
/// <param name="audioFileName">File name for the audio; callers build it as "name.extension".</param>
/// <param name="text">Optional text forwarded by callers; presumably a prompt or hint for the transcription (TODO confirm).</param>
/// <returns>The transcribed text.</returns>
Task<string> TranscriptTextAsync(Stream audio, string audioFileName, string? text = null);
15+
16+
/// <summary>
/// Sets the model name. CompletionProvider.GetAudioTranscriber calls this right after
/// resolving the implementation.
/// </summary>
/// <param name="model">Model name to use.</param>
void SetModelName(string model);
17+
}

src/Infrastructure/BotSharp.Abstraction/MLTasks/Settings/LlmModelSetting.cs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,14 @@ namespace BotSharp.Abstraction.MLTasks.Settings;
33
public class LlmModelSetting
44
{
55
/// <summary>
6-
/// Model Id, like "gpt-3.5" and "gpt-4".
6+
/// Model Id, like "gpt-4", "gpt-4o", "o1".
77
/// </summary>
8-
public string? Id { get; set; }
8+
public string Id { get; set; } = null!;
99

1010
/// <summary>
1111
/// Deployment model name
1212
/// </summary>
13-
public string Name { get; set; }
13+
public string Name { get; set; } = null!;
1414

1515
/// <summary>
1616
/// Model version
@@ -28,8 +28,8 @@ public class LlmModelSetting
2828
/// </summary>
2929
public string? Group { get; set; }
3030

31-
public string ApiKey { get; set; }
32-
public string Endpoint { get; set; }
31+
public string ApiKey { get; set; } = null!;
32+
public string? Endpoint { get; set; }
3333
public LlmModelType Type { get; set; } = LlmModelType.Chat;
3434

3535
/// <summary>

src/Infrastructure/BotSharp.Abstraction/Realtime/Models/ModelTurnDetection.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,6 @@ public class ModelTurnDetection
1111

1212
/// <summary>
/// Audio transcription options: the transcription model name and an optional language.
/// </summary>
public class AudioTranscription
1313
{
14-
public string Model { get; set; } = "whisper-1";
15-
public string Language { get; set; } = "en";
14+
// Transcription model name; defaults to "gpt-4o-mini-transcribe".
public string Model { get; set; } = "gpt-4o-mini-transcribe";
15+
// Optional language; no default, so it stays null unless set.
public string? Language { get; set; }
1616
}

src/Infrastructure/BotSharp.Core.Realtime/Services/RealtimeHub.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ private async Task ConnectToModel(WebSocket userWebSocket)
7474
if (!model.Contains("-realtime-"))
7575
{
7676
var llmProviderService = _services.GetRequiredService<ILlmProviderService>();
77-
model = llmProviderService.GetProviderModel("openai", "gpt-4", realTime: true).Name;
77+
model = llmProviderService.GetProviderModel("openai", "gpt-4o", realTime: true).Name;
7878
}
7979

8080
_completer.SetModelName(model);

src/Infrastructure/BotSharp.Core/Files/Services/Instruct/FileInstructService.Audio.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,14 @@ public partial class FileInstructService
66
{
77
/// <summary>
/// Downloads an audio file and transcribes it to text with the audio transcriber
/// resolved for the given provider and model.
/// </summary>
/// <param name="provider">Provider name; passed as-is to CompletionProvider.GetAudioTranscriber, which applies its own default when null.</param>
/// <param name="model">Model name; passed as-is to CompletionProvider.GetAudioTranscriber, which applies its own default when null.</param>
/// <param name="audio">Audio file to download and transcribe.</param>
/// <param name="text">Optional text forwarded unchanged to the transcriber.</param>
/// <returns>The transcribed text.</returns>
public async Task<string> SpeechToText(string? provider, string? model, InstructFileModel audio, string? text = null)
{
    var completion = CompletionProvider.GetAudioTranscriber(_services, provider: provider, model: model);
    var audioBytes = await DownloadFile(audio);

    // Wrap the downloaded bytes directly: the stream starts at position 0 and the
    // using declaration disposes it, so no manual copy, rewind or Close() is needed.
    using var stream = new MemoryStream(audioBytes);

    // Fall back to "audio.wav" parts when the file carries no name or extension.
    var fileName = $"{audio.FileName ?? "audio"}.{audio.FileExtension ?? "wav"}";
    return await completion.TranscriptTextAsync(stream, fileName, text);
}

src/Infrastructure/BotSharp.Core/Files/Services/Instruct/FileInstructService.Pdf.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ public async Task<string> ReadPdf(string? provider, string? model, string? model
2727

2828
var innerAgentId = agentId ?? Guid.Empty.ToString();
2929
var completion = CompletionProvider.GetChatCompletion(_services, provider: provider ?? "openai",
30-
model: model, modelId: modelId ?? "gpt-4", multiModal: true);
30+
model: model, modelId: modelId ?? "gpt-4o", multiModal: true);
3131
var message = await completion.GetChatCompletions(new Agent()
3232
{
3333
Id = innerAgentId,

src/Infrastructure/BotSharp.Core/Files/Services/Instruct/FileInstructService.SelectFile.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ private async Task<IEnumerable<MessageFileModel>> SelectFiles(IEnumerable<Messag
9393
}
9494

9595
var providerName = options.Provider ?? "openai";
96-
var modelId = options?.ModelId ?? "gpt-4";
96+
var modelId = options?.ModelId ?? "gpt-4o";
9797
var provider = llmProviderService.GetProviders().FirstOrDefault(x => x == providerName);
9898
var model = llmProviderService.GetProviderModel(provider: provider, id: modelId);
9999
var completion = CompletionProvider.GetChatCompletion(_services, provider: provider, model: model.Name);

src/Infrastructure/BotSharp.Core/Infrastructures/CompletionProvider.cs

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ public static object GetCompletion(IServiceProvider services,
3030
}
3131
else if (settings.Type == LlmModelType.Audio)
3232
{
33-
return GetAudioCompletion(services, provider: provider, model: model);
33+
return GetAudioTranscriber(services, provider: provider, model: model);
3434
}
3535
else
3636
{
@@ -126,20 +126,39 @@ public static ITextEmbedding GetTextEmbedding(IServiceProvider services,
126126
return completer;
127127
}
128128

129-
public static IAudioCompletion GetAudioCompletion(
129+
/// <summary>
/// Resolves the registered <see cref="IAudioTranscription"/> implementation for a provider
/// and sets the model it should use.
/// </summary>
/// <param name="services">Service provider used to enumerate the registered transcribers.</param>
/// <param name="provider">Provider name; "openai" is used when null.</param>
/// <param name="model">Model name; "gpt-4o-mini-transcribe" is used when null.</param>
/// <returns>
/// The matching transcriber, or null (after logging an error) when no registered
/// implementation has the requested provider name.
/// </returns>
public static IAudioTranscription GetAudioTranscriber(
    IServiceProvider services,
    string? provider = null,
    string? model = null)
{
    // Apply the default once so the lookup and the error log report the same name.
    var providerName = provider ?? "openai";
    var completer = services.GetServices<IAudioTranscription>()
        .FirstOrDefault(x => x.Provider == providerName);
    if (completer == null)
    {
        var logger = services.GetRequiredService<ILogger<CompletionProvider>>();
        logger.LogError("Can't resolve audio-transcriber provider by {Provider}", providerName);
        return default!;
    }

    completer.SetModelName(model ?? "gpt-4o-mini-transcribe");
    return completer;
}

/// <summary>
/// Resolves the registered <see cref="IAudioSynthesis"/> implementation for a provider
/// and sets the model it should use.
/// </summary>
/// <param name="services">Service provider used to enumerate the registered synthesizers.</param>
/// <param name="provider">Provider name; "openai" is used when null.</param>
/// <param name="model">Model name; "gpt-4o-mini-tts" is used when null.</param>
/// <returns>
/// The matching synthesizer, or null (after logging an error) when no registered
/// implementation has the requested provider name.
/// </returns>
public static IAudioSynthesis GetAudioSynthesizer(
    IServiceProvider services,
    string? provider = null,
    string? model = null)
{
    // Apply the default once so the lookup and the error log report the same name.
    var providerName = provider ?? "openai";
    var completer = services.GetServices<IAudioSynthesis>()
        .FirstOrDefault(x => x.Provider == providerName);
    if (completer == null)
    {
        var logger = services.GetRequiredService<ILogger<CompletionProvider>>();
        logger.LogError("Can't resolve audio-synthesizer provider by {Provider}", providerName);
        return default!;
    }

    completer.SetModelName(model ?? "gpt-4o-mini-tts");
    return completer;
}
145164

src/Infrastructure/BotSharp.OpenAPI/Controllers/InstructModeController.cs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -499,8 +499,8 @@ public async Task<SpeechToTextViewModel> SpeechToText(IFormFile file, [FromForm]
499499
file.CopyTo(stream);
500500
stream.Position = 0;
501501

502-
var completion = CompletionProvider.GetAudioCompletion(_services, provider: provider ?? "openai", model: model ?? "whisper-1");
503-
var content = await completion.GenerateTextFromAudioAsync(stream, file.FileName, text);
502+
var completion = CompletionProvider.GetAudioTranscriber(_services, provider: provider, model: model);
503+
var content = await completion.TranscriptTextAsync(stream, file.FileName, text);
504504
viewModel.Content = content;
505505
stream.Close();
506506
return viewModel;
@@ -520,8 +520,8 @@ public async Task<IActionResult> TextToSpeech([FromBody] TextToSpeechRequest inp
520520
var state = _services.GetRequiredService<IConversationStateService>();
521521
input.States.ForEach(x => state.SetState(x.Key, x.Value, activeRounds: x.ActiveRounds, source: StateSource.External));
522522

523-
var completion = CompletionProvider.GetAudioCompletion(_services, provider: input.Provider ?? "openai", model: input.Model ?? "tts-1");
524-
var binaryData = await completion.GenerateAudioFromTextAsync(input.Text);
523+
var completion = CompletionProvider.GetAudioSynthesizer(_services, provider: input.Provider, model: input.Model);
524+
var binaryData = await completion.GenerateAudioAsync(input.Text);
525525
var stream = binaryData.ToStream();
526526
stream.Position = 0;
527527

src/Infrastructure/BotSharp.OpenAPI/Controllers/RealtimeController.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ public RealtimeController(IServiceProvider services)
2222
[HttpGet("/agent/{agentId}/realtime/session")]
2323
public async Task<RealtimeSession> CreateSession(string agentId)
2424
{
25-
var completion = CompletionProvider.GetRealTimeCompletion(_services, provider: "openai", modelId: "gpt-4");
25+
var completion = CompletionProvider.GetRealTimeCompletion(_services, provider: "openai", modelId: "gpt-4o");
2626

2727
var agentService = _services.GetRequiredService<IAgentService>();
2828
var agent = await agentService.LoadAgent(agentId);

src/Plugins/BotSharp.Plugin.AudioHandler/AudioHandlerPlugin.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ public void RegisterDI(IServiceCollection services, IConfiguration config)
1616
return settingService.Bind<AudioHandlerSettings>("AudioHandler");
1717
});
1818

19-
services.AddScoped<IAudioCompletion, NativeWhisperProvider>();
19+
services.AddScoped<IAudioTranscription, NativeWhisperProvider>();
2020
services.AddScoped<IAgentUtilityHook, AudioHandlerUtilityHook>();
2121
}
2222
}

src/Plugins/BotSharp.Plugin.AudioHandler/Functions/HandleAudioRequestFn.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ private async Task<string> GetResponeFromDialogs(List<RoleDialogModel> dialogs)
9191
using var stream = new MemoryStream(bytes);
9292
stream.Position = 0;
9393

94-
var result = await audioCompletion.GenerateTextFromAudioAsync(stream, fileName);
94+
var result = await audioCompletion.TranscriptTextAsync(stream, fileName);
9595
transcripts.Add(result);
9696
stream.Close();
9797
}
@@ -104,9 +104,9 @@ private async Task<string> GetResponeFromDialogs(List<RoleDialogModel> dialogs)
104104
return string.Join("\r\n\r\n", transcripts);
105105
}
106106

107-
private IAudioCompletion PrepareModel()
107+
/// <summary>
/// Resolves the audio transcriber through CompletionProvider without naming a provider
/// or model, so that method's defaults apply.
/// </summary>
private IAudioTranscription PrepareModel()
108108
{
109-
return CompletionProvider.GetAudioCompletion(_serviceProvider, provider: "openai", model: "whisper-1");
109+
return CompletionProvider.GetAudioTranscriber(_serviceProvider);
110110
}
111111

112112
private bool ParseAudioFileType(string fileName)

src/Plugins/BotSharp.Plugin.AudioHandler/Provider/NativeWhisperProvider.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ namespace BotSharp.Plugin.AudioHandler.Provider;
66
/// <summary>
77
/// Native Whisper provider for speech to text conversion
88
/// </summary>
9-
public class NativeWhisperProvider : IAudioCompletion
9+
public class NativeWhisperProvider : IAudioTranscription
1010
{
1111
private static WhisperProcessor _whisperProcessor;
1212

@@ -29,7 +29,7 @@ public NativeWhisperProvider(
2929
_logger = logger;
3030
}
3131

32-
public async Task<string> GenerateTextFromAudioAsync(Stream audio, string audioFileName, string? text = null)
32+
public async Task<string> TranscriptTextAsync(Stream audio, string audioFileName, string? text = null)
3333
{
3434
var textResult = new List<SegmentData>();
3535

src/Plugins/BotSharp.Plugin.AzureOpenAI/AzureOpenAiPlugin.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,6 @@ public void RegisterDI(IServiceCollection services, IConfiguration config)
3131
services.AddScoped<IChatCompletion, ChatCompletionProvider>();
3232
services.AddScoped<ITextEmbedding, TextEmbeddingProvider>();
3333
services.AddScoped<IImageCompletion, ImageCompletionProvider>();
34-
services.AddScoped<IAudioCompletion, AudioCompletionProvider>();
34+
services.AddScoped<IAudioTranscription, AudioCompletionProvider>();
3535
}
3636
}

src/Plugins/BotSharp.Plugin.AzureOpenAI/Providers/Audio/AudioCompletionProvider.SpeechToText.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ namespace BotSharp.Plugin.AzureOpenAI.Providers.Audio;
44

55
public partial class AudioCompletionProvider
66
{
7-
public async Task<string> GenerateTextFromAudioAsync(Stream audio, string audioFileName, string? text = null)
7+
public async Task<string> TranscriptTextAsync(Stream audio, string audioFileName, string? text = null)
88
{
99
var audioClient = ProviderHelper.GetClient(Provider, _model, _services)
1010
.GetAudioClient(_model);

src/Plugins/BotSharp.Plugin.AzureOpenAI/Providers/Audio/AudioCompletionProvider.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
namespace BotSharp.Plugin.AzureOpenAI.Providers.Audio;
22

3-
public partial class AudioCompletionProvider : IAudioCompletion
3+
public partial class AudioCompletionProvider : IAudioTranscription
44
{
55
private readonly IServiceProvider _services;
66

src/Plugins/BotSharp.Plugin.EmailHandler/Functions/HandleEmailReaderFn.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ public async Task<bool> Execute(RoleDialogModel message)
6767

6868
var llmProviderService = _services.GetRequiredService<ILlmProviderService>();
6969
var provider = llmProviderService.GetProviders().FirstOrDefault(x => x == "openai");
70-
var model = llmProviderService.GetProviderModel(provider: provider ?? "openai", id: "gpt-4");
70+
var model = llmProviderService.GetProviderModel(provider: provider ?? "openai", id: "gpt-4o");
7171
var completion = CompletionProvider.GetChatCompletion(_services, provider: provider, model: model.Name);
7272
var convService = _services.GetRequiredService<IConversationService>();
7373
var conversationId = convService.ConversationId;

src/Plugins/BotSharp.Plugin.FileHandler/Functions/ReadImageFn.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ private async Task<string> GetChatCompletion(Agent agent, List<RoleDialogModel>
100100
{
101101
var llmProviderService = _services.GetRequiredService<ILlmProviderService>();
102102
var provider = llmProviderService.GetProviders().FirstOrDefault(x => x == "openai");
103-
var model = llmProviderService.GetProviderModel(provider: provider, id: "gpt-4", multiModal: true);
103+
var model = llmProviderService.GetProviderModel(provider: provider, id: "gpt-4o", multiModal: true);
104104
var completion = CompletionProvider.GetChatCompletion(_services, provider: provider, model: model.Name);
105105
var response = await completion.GetChatCompletions(agent, dialogs);
106106
return response.Content;

src/Plugins/BotSharp.Plugin.FileHandler/Functions/ReadPdfFn.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ private async Task<string> GetChatCompletion(Agent agent, List<RoleDialogModel>
7878
{
7979
var llmProviderService = _services.GetRequiredService<ILlmProviderService>();
8080
var provider = llmProviderService.GetProviders().FirstOrDefault(x => x == "openai");
81-
var model = llmProviderService.GetProviderModel(provider: provider, id: "gpt-4", multiModal: true);
81+
var model = llmProviderService.GetProviderModel(provider: provider, id: "gpt-4o", multiModal: true);
8282
var completion = CompletionProvider.GetChatCompletion(_services, provider: provider, model: model.Name);
8383
var response = await completion.GetChatCompletions(agent, dialogs);
8484
return response.Content;

src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/RealtimeSessionBody.cs

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -61,26 +61,30 @@ public class RealtimeSessionTurnDetection
6161
/// <summary>
6262
/// Milliseconds
6363
/// </summary>
64-
[JsonPropertyName("prefix_padding_ms")]
64+
/*[JsonPropertyName("prefix_padding_ms")]
6565
public int PrefixPadding { get; set; } = 300;
6666
6767
[JsonPropertyName("silence_duration_ms")]
6868
public int SilenceDuration { get; set; } = 500;
6969
7070
[JsonPropertyName("threshold")]
71-
public float Threshold { get; set; } = 0.5f;
71+
public float Threshold { get; set; } = 0.5f;*/
7272

7373
[JsonPropertyName("type")]
74-
public string Type { get; set; } = "server_vad";
74+
public string Type { get; set; } = "semantic_vad";
75+
76+
[JsonPropertyName("eagerness")]
77+
public string eagerness { get;set; } = "auto";
7578
}
7679

7780
public class InputAudioTranscription
7881
{
7982
[JsonPropertyName("model")]
80-
public string Model { get; set; } = "whisper-1";
83+
public string Model { get; set; } = "gpt-4o-transcribe";
8184

8285
[JsonPropertyName("language")]
83-
public string Language { get; set; } = "en";
86+
[JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
87+
public string? Language { get; set; }
8488

8589
[JsonPropertyName("prompt")]
8690
[JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]

src/Plugins/BotSharp.Plugin.OpenAI/OpenAiPlugin.cs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@ public void RegisterDI(IServiceCollection services, IConfiguration config)
3333
services.AddScoped<IChatCompletion, ChatCompletionProvider>();
3434
services.AddScoped<ITextEmbedding, TextEmbeddingProvider>();
3535
services.AddScoped<IImageCompletion, ImageCompletionProvider>();
36-
services.AddScoped<IAudioCompletion, AudioCompletionProvider>();
36+
services.AddScoped<IAudioTranscription, AudioTranscriptionProvider>();
37+
services.AddScoped<IAudioSynthesis, AudioSynthesisProvider>();
3738
services.AddScoped<IRealTimeCompletion, RealTimeCompletionProvider>();
3839

3940
services.AddRefitClient<IOpenAiRealtimeApi>()

src/Plugins/BotSharp.Plugin.OpenAI/Providers/Audio/AudioCompletionProvider.cs

Lines changed: 0 additions & 21 deletions
This file was deleted.

0 commit comments

Comments
 (0)