Speech Generation
Generate speech audio from text using AI voices.
Overview
Speech Generation allows agents to:
Convert text to speech
Use AI voices (OpenAI TTS, ElevenLabs)
Generate character dialogue
Create voice-overs
Produce audio feedback
Basic Setup
Enable Speech Generation
agent.AddLocalTool(LocalToolType.SpeechGeneration);
Configure Voice
agent.Settings.Voice = new VoiceSettings
{
Provider = VoiceProvider.OpenAI, // or ElevenLabs
Voice = "alloy", // OpenAI voices: alloy, echo, fable, onyx, nova, shimmer
Speed = 1.0f // 0.25 - 4.0
};
Generate Speech
Simple Text-to-Speech
await agent.SpeakAsync("Hello! This is a test of speech generation.");
Generate Without Playing
AudioClip clip = await agent.GenerateSpeechAsync("Text to convert");
// Use clip later
audioSource.clip = clip;
audioSource.Play();
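Putting the setup and generation calls together, a minimal end-to-end component might look like the sketch below. It only uses the calls already shown on this page; the agent reference is assumed to be assigned in the Inspector.

using UnityEngine;
using Glitch9.AIDevKit.Agents;

public class QuickSpeechDemo : MonoBehaviour
{
    [SerializeField] private AgentBehaviour agent;

    async void Start()
    {
        // Enable the local speech generation tool
        agent.AddLocalTool(LocalToolType.SpeechGeneration);

        // Pick a provider, voice, and speed
        agent.Settings.Voice = new VoiceSettings
        {
            Provider = VoiceProvider.OpenAI,
            Voice = "alloy",
            Speed = 1.0f
        };

        // Generate and play speech in one call
        await agent.SpeakAsync("Speech generation is ready.");
    }
}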
Voice Configuration
OpenAI Voices
// Available OpenAI voices
agent.Settings.Voice.Voice = "alloy"; // Neutral, balanced
agent.Settings.Voice.Voice = "echo"; // Male, clear
agent.Settings.Voice.Voice = "fable"; // British accent
agent.Settings.Voice.Voice = "onyx"; // Deep, authoritative
agent.Settings.Voice.Voice = "nova"; // Female, energetic
agent.Settings.Voice.Voice = "shimmer"; // Female, softElevenLabs Voices
agent.Settings.Voice = new VoiceSettings
{
Provider = VoiceProvider.ElevenLabs,
Voice = "21m00Tcm4TlvDq8ikWAM", // Voice ID
Model = "eleven_monolingual_v1",
    Stability = 0.5f,        // 0.0 - 1.0: higher values give a steadier, more consistent delivery
    SimilarityBoost = 0.75f  // 0.0 - 1.0: higher values stay closer to the original voice
};
Speech Speed
// Normal speed
agent.Settings.Voice.Speed = 1.0f;
// Slow (for emphasis)
agent.Settings.Voice.Speed = 0.75f;
// Fast (for excitement)
agent.Settings.Voice.Speed = 1.25f;
Character Dialogue
NPC Speech System
public class NPCDialogueSystem : MonoBehaviour
{
[SerializeField] private AgentBehaviour agent;
[SerializeField] private AudioSource audioSource;
private Dictionary<string, VoiceProfile> characterVoices = new()
{
{ "Warrior", new VoiceProfile { Voice = "onyx", Speed = 1.0f } },
{ "Mage", new VoiceProfile { Voice = "fable", Speed = 0.9f } },
{ "Rogue", new VoiceProfile { Voice = "echo", Speed = 1.1f } },
{ "Princess", new VoiceProfile { Voice = "shimmer", Speed = 0.95f } }
};
public async void SpeakAs(string characterName, string dialogue)
{
if (!characterVoices.TryGetValue(characterName, out var voice))
{
Debug.LogWarning($"No voice profile for {characterName}");
return;
}
// Set voice
agent.Settings.Voice.Voice = voice.Voice;
agent.Settings.Voice.Speed = voice.Speed;
// Generate and play
AudioClip clip = await agent.GenerateSpeechAsync(dialogue);
audioSource.clip = clip;
audioSource.Play();
Debug.Log($"{characterName}: {dialogue}");
}
}
[System.Serializable]
public class VoiceProfile
{
public string Voice;
public float Speed;
}
// Usage
npcSystem.SpeakAs("Warrior", "Stand and fight!");
npcSystem.SpeakAs("Mage", "Let me cast a spell.");Dialogue Queue
public class DialogueQueue : MonoBehaviour
{
[SerializeField] private AgentBehaviour agent;
[SerializeField] private AudioSource audioSource;
private Queue<DialogueLine> queue = new();
private bool isPlaying;
public void AddDialogue(string characterName, string text, string voice)
{
queue.Enqueue(new DialogueLine
{
Character = characterName,
Text = text,
Voice = voice
});
if (!isPlaying)
{
PlayNextLine();
}
}
async void PlayNextLine()
{
if (queue.Count == 0)
{
isPlaying = false;
return;
}
isPlaying = true;
var line = queue.Dequeue();
// Configure voice
agent.Settings.Voice.Voice = line.Voice;
// Generate
AudioClip clip = await agent.GenerateSpeechAsync(line.Text);
// Play
audioSource.clip = clip;
audioSource.Play();
Debug.Log($"{line.Character}: {line.Text}");
// Wait for completion
await UniTask.WaitUntil(() => !audioSource.isPlaying);
// Next line
PlayNextLine();
}
struct DialogueLine
{
public string Character;
public string Text;
public string Voice;
}
}
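As with the NPC system above, lines can be queued from any other script. A hypothetical usage snippet, where dialogueQueue is assumed to reference the component:

// Usage
dialogueQueue.AddDialogue("Warrior", "Hold the gate!", "onyx");
dialogueQueue.AddDialogue("Mage", "The ritual is almost complete.", "fable");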
Audio Management
Save Audio Files
public async void SaveSpeech(string text, string fileName)
{
AudioClip clip = await agent.GenerateSpeechAsync(text);
// Convert to WAV
byte[] wavData = ConvertToWav(clip);
// Save
string path = Path.Combine(Application.persistentDataPath, $"{fileName}.wav");
File.WriteAllBytes(path, wavData);
Debug.Log($"Saved: {path}");
}
byte[] ConvertToWav(AudioClip clip)
{
// WAV conversion implementation
// (Use Unity's AudioClip data or a WAV library)
return new byte[0]; // Placeholder
}
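The ConvertToWav stub above can be filled in without any extra packages. Below is a rough sketch of a 16-bit PCM WAV encoder that relies only on Unity's AudioClip.GetData and BinaryWriter; treat it as an illustration rather than part of the AI Dev Kit API.

byte[] ConvertToWav(AudioClip clip)
{
    // Pull interleaved samples out of the clip
    float[] samples = new float[clip.samples * clip.channels];
    clip.GetData(samples, 0);

    using (var stream = new System.IO.MemoryStream())
    using (var writer = new System.IO.BinaryWriter(stream))
    {
        int dataSize = samples.Length * 2;                  // 16-bit = 2 bytes per sample
        int byteRate = clip.frequency * clip.channels * 2;

        // RIFF header
        writer.Write(System.Text.Encoding.ASCII.GetBytes("RIFF"));
        writer.Write(36 + dataSize);
        writer.Write(System.Text.Encoding.ASCII.GetBytes("WAVE"));

        // fmt chunk (PCM, 16-bit)
        writer.Write(System.Text.Encoding.ASCII.GetBytes("fmt "));
        writer.Write(16);
        writer.Write((short)1);                             // audio format: PCM
        writer.Write((short)clip.channels);
        writer.Write(clip.frequency);
        writer.Write(byteRate);
        writer.Write((short)(clip.channels * 2));           // block align
        writer.Write((short)16);                            // bits per sample

        // data chunk
        writer.Write(System.Text.Encoding.ASCII.GetBytes("data"));
        writer.Write(dataSize);
        foreach (float sample in samples)
            writer.Write((short)(Mathf.Clamp(sample, -1f, 1f) * short.MaxValue));

        writer.Flush();
        return stream.ToArray();
    }
}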
Cache Generated Audio
public class SpeechCache : MonoBehaviour
{
private Dictionary<string, AudioClip> cache = new();
public async UniTask<AudioClip> GetOrGenerate(AgentBehaviour agent, string text)
{
string key = $"{agent.Settings.Voice.Voice}_{text}";
if (cache.TryGetValue(key, out var cached))
{
Debug.Log("Using cached audio");
return cached;
}
AudioClip clip = await agent.GenerateSpeechAsync(text);
cache[key] = clip;
return clip;
}
public void ClearCache()
{
foreach (var clip in cache.Values)
{
Destroy(clip);
}
cache.Clear();
}
}
UI Integration
Speech Button
public class SpeechButton : MonoBehaviour
{
[SerializeField] private AgentBehaviour agent;
[SerializeField] private Button speakButton;
[SerializeField] private TMP_Text textToSpeak;
void Start()
{
speakButton.onClick.AddListener(OnSpeakClicked);
}
async void OnSpeakClicked()
{
speakButton.interactable = false;
await agent.SpeakAsync(textToSpeak.text);
speakButton.interactable = true;
}
}
Voice Selection UI
public class VoiceSelector : MonoBehaviour
{
[SerializeField] private AgentBehaviour agent;
[SerializeField] private TMP_Dropdown voiceDropdown;
void Start()
{
// Populate dropdown
voiceDropdown.options.Clear();
voiceDropdown.options.Add(new TMP_Dropdown.OptionData("Alloy"));
voiceDropdown.options.Add(new TMP_Dropdown.OptionData("Echo"));
voiceDropdown.options.Add(new TMP_Dropdown.OptionData("Fable"));
voiceDropdown.options.Add(new TMP_Dropdown.OptionData("Onyx"));
voiceDropdown.options.Add(new TMP_Dropdown.OptionData("Nova"));
voiceDropdown.options.Add(new TMP_Dropdown.OptionData("Shimmer"));
voiceDropdown.onValueChanged.AddListener(OnVoiceChanged);
}
void OnVoiceChanged(int index)
{
string voice = voiceDropdown.options[index].text.ToLower();
agent.Settings.Voice.Voice = voice;
Debug.Log($"Voice changed to: {voice}");
}
}
Advanced Usage
Multi-Language Support
public async void SpeakInLanguage(string text, string language)
{
    // OpenAI TTS supports multiple languages automatically;
    // just provide text in the target language (the language parameter here is informational only)
await agent.SpeakAsync(text);
}
// Usage
SpeakInLanguage("Bonjour!", "french");
SpeakInLanguage("こんにちは", "japanese");
SpeakInLanguage("Hola!", "spanish");Emotional Speech
public async void SpeakWithEmotion(string text, Emotion emotion)
{
// Modify text with emotion cues
string emotionalText = emotion switch
{
Emotion.Excited => text + "!",
Emotion.Sad => text + "...",
Emotion.Angry => text.ToUpper() + "!",
Emotion.Calm => text + ".",
_ => text
};
// Adjust speed based on emotion
agent.Settings.Voice.Speed = emotion switch
{
Emotion.Excited => 1.2f,
Emotion.Sad => 0.8f,
Emotion.Angry => 1.1f,
Emotion.Calm => 0.9f,
_ => 1.0f
};
await agent.SpeakAsync(emotionalText);
}
public enum Emotion
{
Neutral,
Excited,
Sad,
Angry,
Calm
}
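A hypothetical usage snippet for the emotional speech helper above:

// Usage
SpeakWithEmotion("We won the battle", Emotion.Excited);
SpeakWithEmotion("The village did not survive", Emotion.Sad);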
Batch Generation
public async UniTask<List<AudioClip>> GenerateBatch(string[] lines)
{
List<AudioClip> clips = new();
foreach (var line in lines)
{
var clip = await agent.GenerateSpeechAsync(line);
clips.Add(clip);
await UniTask.Delay(100); // Rate limiting
}
return clips;
}
// Usage
string[] dialogue = {
"Welcome to the game!",
"Let's begin your adventure.",
"Good luck, hero!"
};
var clips = await GenerateBatch(dialogue);
Error Handling
Handle Generation Errors
agent.onSpeechError.AddListener(error =>
{
Debug.LogError($"Speech generation failed: {error}");
if (error.Contains("rate_limit"))
{
ShowMessage("Too many requests. Please wait.");
}
else if (error.Contains("quota"))
{
ShowMessage("Speech quota exceeded.");
}
else
{
ShowMessage("Failed to generate speech.");
}
});
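For transient failures such as rate limits, a simple retry wrapper can help. The sketch below assumes a failed generation surfaces as an exception (as in the try/catch of the Complete Example); SpeakWithRetry is a hypothetical helper, not part of the AI Dev Kit API.

public async UniTask SpeakWithRetry(string text, int maxAttempts = 3)
{
    for (int attempt = 1; attempt <= maxAttempts; attempt++)
    {
        try
        {
            await agent.SpeakAsync(text);
            return;
        }
        catch (Exception ex)
        {
            Debug.LogWarning($"Speech attempt {attempt} failed: {ex.Message}");
            if (attempt == maxAttempts) throw;

            // Back off before retrying; helps with rate_limit errors
            await UniTask.Delay(1000 * attempt);
        }
    }
}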
Complete Example
using System;
using System.Collections.Generic;
using UnityEngine;
using Glitch9.AIDevKit.Agents;
using Cysharp.Threading.Tasks;
public class SpeechGenerator : MonoBehaviour
{
[SerializeField] private AgentBehaviour agent;
[SerializeField] private AudioSource audioSource;
[Header("Voice Settings")]
[SerializeField] private string defaultVoice = "alloy";
[SerializeField] private float defaultSpeed = 1.0f;
private Dictionary<string, AudioClip> cache = new();
async void Start()
{
await SetupSpeechGeneration();
}
async UniTask SetupSpeechGeneration()
{
// Configure voice
agent.Settings.Voice = new VoiceSettings
{
Provider = VoiceProvider.OpenAI,
Voice = defaultVoice,
Speed = defaultSpeed
};
// Add tool
agent.AddLocalTool(LocalToolType.SpeechGeneration);
// Listen for events
agent.onSpeechGenerated.AddListener(OnSpeechGenerated);
agent.onSpeechError.AddListener(OnSpeechError);
Debug.Log("✓ Speech generation ready");
}
public async void Speak(string text, string voice = null, float? speed = null)
{
// Configure voice
if (voice != null)
agent.Settings.Voice.Voice = voice;
if (speed.HasValue)
agent.Settings.Voice.Speed = speed.Value;
Debug.Log($"🔊 Speaking: {text}");
try
{
// Check cache
string cacheKey = $"{agent.Settings.Voice.Voice}_{text}";
if (cache.TryGetValue(cacheKey, out var cached))
{
PlayAudio(cached);
return;
}
// Generate
AudioClip clip = await agent.GenerateSpeechAsync(text);
// Cache
cache[cacheKey] = clip;
// Play
PlayAudio(clip);
}
catch (Exception ex)
{
Debug.LogError($"Speech failed: {ex.Message}");
}
}
void PlayAudio(AudioClip clip)
{
audioSource.clip = clip;
audioSource.Play();
Debug.Log($"▶️ Playing audio: {clip.length:F2}s");
}
void OnSpeechGenerated(AudioClip clip)
{
Debug.Log($"✓ Speech generated: {clip.length:F2}s");
}
void OnSpeechError(string error)
{
Debug.LogError($"Speech error: {error}");
}
public void ClearCache()
{
foreach (var clip in cache.Values)
{
Destroy(clip);
}
cache.Clear();
Debug.Log("✓ Cache cleared");
}
void OnDestroy()
{
ClearCache();
}
}