Text-to-Speech

Generate natural speech from text using OpenAI TTS.

Overview

Text-to-Speech provides:

  • Natural voice synthesis

  • Multiple voice options

  • Speed control

  • Audio streaming

  • Character voice presets

Basic Setup

Generate Speech

// Generate speech for the text, then play the resulting clip on the agent's output player.
AudioClip helloClip = await agent.GenerateSpeechAsync("Hello, world!");
agent.AudioController.OutputPlayer.Play(helloClip);

Voice Selection

Available Voices

OpenAI TTS offers 6 voices:

  • alloy - Neutral, balanced

  • echo - Male, clear

  • fable - British, warm

  • onyx - Deep, authoritative

  • nova - Female, friendly

  • shimmer - Soft, gentle

Set Voice

/// <summary>
/// Sets the agent's voice to "nova" on startup and offers a helper that
/// speaks a single line with an explicitly chosen voice.
/// </summary>
public class VoiceSelector : MonoBehaviour
{
    [SerializeField] private AgentBehaviour agent;

    /// <summary>Applies the startup voice and logs the choice.</summary>
    void Start()
    {
        var parameters = agent.ParametersController;
        parameters.SetVoice("nova");

        Debug.Log("Voice set to: nova");
    }

    /// <summary>
    /// Switches the agent to <paramref name="voice"/>, generates speech for
    /// <paramref name="text"/> and plays the result on the agent's output player.
    /// The voice is not restored afterwards.
    /// </summary>
    /// <param name="text">Line to speak.</param>
    /// <param name="voice">Voice name passed to <c>SetVoice</c>.</param>
    public async UniTask SpeakWithVoice(string text, string voice)
    {
        agent.ParametersController.SetVoice(voice);

        AudioClip spoken = await agent.GenerateSpeechAsync(text);

        var output = agent.AudioController.OutputPlayer;
        output.Play(spoken);
    }
}

// Examples: speak each line with its own voice, one after another
var samples = new (string line, string voice)[]
{
    ("Welcome!", "alloy"),
    ("Warning!", "onyx"),
    ("Thank you!", "nova"),
};

foreach (var (line, voice) in samples)
{
    await SpeakWithVoice(line, voice);
}

Voice Comparison UI

/// <summary>
/// Small UI for auditioning voices: fills a dropdown with the voice names and
/// speaks a short sample line in the selected voice when the button is pressed.
/// </summary>
public class VoiceComparator : MonoBehaviour
{
    [SerializeField] private AgentBehaviour agent;
    [SerializeField] private Dropdown voiceDropdown;
    [SerializeField] private Button testButton;
    
    // Dropdown entries; the dropdown index maps 1:1 onto this array.
    private readonly string[] voices = { "alloy", "echo", "fable", "onyx", "nova", "shimmer" };
    
    /// <summary>Populates the dropdown and hooks up the test button.</summary>
    void Start()
    {
        voiceDropdown.ClearOptions();
        voiceDropdown.AddOptions(voices.ToList());
        
        testButton.onClick.AddListener(TestVoice);
    }
    
    /// <summary>Unhooks the click handler so the button no longer references this component.</summary>
    void OnDestroy()
    {
        if (testButton != null)
        {
            testButton.onClick.RemoveListener(TestVoice);
        }
    }
    
    /// <summary>
    /// Click handler: speaks a sample sentence in the currently selected voice.
    /// It is <c>async void</c> because it is a UI event handler, so every
    /// failure is caught and logged here rather than escaping unobserved.
    /// </summary>
    async void TestVoice()
    {
        string voice = voices[voiceDropdown.value];
        string testText = $"This is the {voice} voice.";
        
        // Block repeated clicks while a request is in flight so samples do not overlap.
        testButton.interactable = false;
        
        try
        {
            agent.ParametersController.SetVoice(voice);
            
            AudioClip clip = await agent.GenerateSpeechAsync(testText);
            agent.AudioController.OutputPlayer.Play(clip);
            
            Debug.Log($"Testing voice: {voice}");
        }
        catch (System.Exception ex)
        {
            Debug.LogError($"TTS error: {ex.Message}");
        }
        finally
        {
            // The button may have been destroyed while the request was pending.
            if (testButton != null)
            {
                testButton.interactable = true;
            }
        }
    }
}

Speed Control

Adjust Speech Speed

/// <summary>
/// Binds a UI slider to the agent's speech speed and mirrors the value in a label.
/// </summary>
public class SpeechSpeedController : MonoBehaviour
{
    [SerializeField] private AgentBehaviour agent;
    [SerializeField] private Slider speedSlider;
    [SerializeField] private TMP_Text speedText;
    
    /// <summary>
    /// Configures the slider range (0.25x to 4.0x, starting at 1.0x), subscribes to
    /// changes and pushes the starting value to the agent and the label.
    /// </summary>
    void Start()
    {
        speedSlider.minValue = 0.25f;
        speedSlider.maxValue = 4.0f;
        speedSlider.value = 1.0f;
        
        speedSlider.onValueChanged.AddListener(SetSpeed);
        
        // The value was assigned before the listener existed, so apply it once
        // explicitly; otherwise the label and the agent stay out of sync with
        // the slider until the user first moves it.
        SetSpeed(speedSlider.value);
    }
    
    /// <summary>Removes the slider subscription when this component is destroyed.</summary>
    void OnDestroy()
    {
        if (speedSlider != null)
        {
            speedSlider.onValueChanged.RemoveListener(SetSpeed);
        }
    }
    
    /// <summary>Applies <paramref name="speed"/> to the agent and updates the label (two decimals).</summary>
    void SetSpeed(float speed)
    {
        agent.ParametersController.SetSpeechSpeed(speed);
        speedText.text = $"{speed:F2}x";
        
        Debug.Log($"Speech speed: {speed}x");
    }
}

Character Voices

Voice Presets

/// <summary>
/// Maps character archetypes to fixed voices and speaks dialogue in the matching voice.
/// </summary>
public class CharacterVoicePresets : MonoBehaviour
{
    [SerializeField] private AgentBehaviour agent;
    
    /// <summary>Archetypes that have a voice preset.</summary>
    public enum CharacterType
    {
        Hero,
        Villain,
        Companion,
        Merchant,
        Elder
    }
    
    // One voice per archetype; looked up by SpeakAsCharacter.
    Dictionary<CharacterType, string> voicePresets = new()
    {
        [CharacterType.Hero] = "echo",
        [CharacterType.Villain] = "onyx",
        [CharacterType.Companion] = "nova",
        [CharacterType.Merchant] = "fable",
        [CharacterType.Elder] = "alloy"
    };
    
    /// <summary>
    /// Looks up the preset voice for <paramref name="character"/>, applies it to the
    /// agent, logs the line, then generates and plays the speech.
    /// </summary>
    /// <param name="character">Archetype whose preset voice is used; it must have an entry in the preset table.</param>
    /// <param name="dialogue">Text to speak.</param>
    public async UniTask SpeakAsCharacter(CharacterType character, string dialogue)
    {
        string presetVoice = voicePresets[character];
        agent.ParametersController.SetVoice(presetVoice);
        
        Debug.Log($"{character} ({presetVoice}): {dialogue}");
        
        AudioClip line = await agent.GenerateSpeechAsync(dialogue);
        
        var output = agent.AudioController.OutputPlayer;
        output.Play(line);
    }
}

// Example usage: fetch the presets component, then speak as two different characters
CharacterVoicePresets characterVoices = GetComponent<CharacterVoicePresets>();

await characterVoices.SpeakAsCharacter(
    CharacterVoicePresets.CharacterType.Hero,
    "I will save the kingdom!");

await characterVoices.SpeakAsCharacter(
    CharacterVoicePresets.CharacterType.Villain,
    "You cannot stop me!");

Dynamic Voice Assignment

/// <summary>
/// Gives every NPC name a voice derived from the name itself, so the same NPC
/// always speaks with the same voice, and remembers the assignment.
/// </summary>
public class DynamicVoiceAssignment : MonoBehaviour
{
    [SerializeField] private AgentBehaviour agent;
    
    // NPC name -> voice already chosen for it.
    private readonly Dictionary<string, string> characterVoices = new();
    private readonly string[] availableVoices = { "alloy", "echo", "fable", "onyx", "nova", "shimmer" };
    
    /// <summary>
    /// Speaks <paramref name="dialogue"/> in the voice assigned to <paramref name="npcName"/>,
    /// assigning one on first use.
    /// </summary>
    /// <param name="npcName">Key identifying the NPC; must not be null.</param>
    /// <param name="dialogue">Text to speak.</param>
    public async UniTask SpeakAsNPC(string npcName, string dialogue)
    {
        // Single lookup; assign and remember a voice on first use.
        if (!characterVoices.TryGetValue(npcName, out string assignedVoice))
        {
            assignedVoice = AssignVoice(npcName);
            characterVoices[npcName] = assignedVoice;
        }
        
        agent.ParametersController.SetVoice(assignedVoice);
        
        Debug.Log($"{npcName} ({assignedVoice}): {dialogue}");
        
        AudioClip clip = await agent.GenerateSpeechAsync(dialogue);
        agent.AudioController.OutputPlayer.Play(clip);
    }
    
    /// <summary>
    /// Maps a name onto one of the available voices using a 32-bit FNV-1a hash of its characters.
    /// </summary>
    /// <remarks>
    /// <c>string.GetHashCode()</c> is deliberately not used: its value is not guaranteed
    /// to be the same across runs or platforms, and taking the absolute value of
    /// <c>int.MinValue</c> overflows. The unsigned hash below depends only on the
    /// characters of the name and needs no absolute value.
    /// </remarks>
    string AssignVoice(string npcName)
    {
        unchecked
        {
            uint hash = 2166136261u;       // FNV-1a offset basis
            foreach (char c in npcName)
            {
                hash ^= c;
                hash *= 16777619u;         // FNV-1a prime
            }
            
            int index = (int)(hash % (uint)availableVoices.Length);
            return availableVoices[index];
        }
    }
}

Dialogue Queue

Sequential Dialogue

/// <summary>
/// Plays queued dialogue lines strictly one after another, each in its own voice.
/// </summary>
public class DialogueQueue : MonoBehaviour
{
    [SerializeField] private AgentBehaviour agent;
    
    private readonly Queue<(string text, string voice)> dialogueQueue = new();
    private bool isPlaying;
    
    // Bumped by ClearQueue so a playback loop that is mid-await can tell that
    // it has been cancelled and must not play or continue.
    private int clearVersion;
    
    /// <summary>
    /// Adds a line to the queue and starts playback if nothing is currently playing.
    /// </summary>
    public void QueueDialogue(string text, string voice)
    {
        dialogueQueue.Enqueue((text, voice));
        
        if (!isPlaying)
        {
            PlayNextDialogue().Forget();
        }
    }
    
    /// <summary>
    /// Drains the queue: for each line, sets the voice, generates the clip, plays it
    /// and waits for the player to finish before moving on.
    /// </summary>
    /// <remarks>
    /// Written as a loop rather than a self-call because a <c>UniTaskVoid</c> method
    /// cannot be awaited, and so a long queue does not build an ever-deeper chain of
    /// continuations. A line that fails to generate is logged and skipped so one bad
    /// line does not stall the rest of the queue.
    /// </remarks>
    async UniTaskVoid PlayNextDialogue()
    {
        isPlaying = true;
        int startedVersion = clearVersion;
        
        try
        {
            while (startedVersion == clearVersion && dialogueQueue.Count > 0)
            {
                var (text, voice) = dialogueQueue.Dequeue();
                
                try
                {
                    agent.ParametersController.SetVoice(voice);
                    
                    AudioClip clip = await agent.GenerateSpeechAsync(text);
                    
                    // The queue was cleared while the clip was being generated: drop it.
                    if (startedVersion != clearVersion)
                    {
                        break;
                    }
                    
                    var player = agent.AudioController.OutputPlayer;
                    player.Play(clip);
                    
                    // Wait for completion
                    await UniTask.WaitUntil(() => !player.IsPlaying);
                }
                catch (System.Exception ex)
                {
                    Debug.LogError($"TTS error: {ex.Message}");
                }
            }
        }
        finally
        {
            // Only the loop that still owns playback may release the flag; after a
            // ClearQueue a newer loop may already be running.
            if (startedVersion == clearVersion)
            {
                isPlaying = false;
            }
        }
    }
    
    /// <summary>
    /// Drops all pending lines, stops the current playback and invalidates any
    /// line that is still being generated.
    /// </summary>
    public void ClearQueue()
    {
        clearVersion++;
        dialogueQueue.Clear();
        agent.AudioController.OutputPlayer.Stop();
        isPlaying = false;
    }
}

// Example usage: lines are enqueued in order and played back sequentially
var conversation = new (string text, string voice)[]
{
    ("Hello traveler!", "nova"),
    ("What brings you here?", "nova"),
    ("I need your help!", "echo"),
};

foreach (var (text, voice) in conversation)
{
    dialogueQueue.QueueDialogue(text, voice);
}

Subtitle Synchronization

Show Subtitles with Speech

/// <summary>
/// Speaks a line and reveals its subtitle word by word, spreading the words
/// evenly over the length of the generated clip.
/// </summary>
public class SubtitleSync : MonoBehaviour
{
    [SerializeField] private AgentBehaviour agent;
    [SerializeField] private TMP_Text subtitleText;
    // Not read anywhere in this class; the reveal timing is derived from the clip length instead.
    [SerializeField] private float wordsPerSecond = 3f;
    
    /// <summary>
    /// Sets the voice, generates the clip, starts playback and animates the subtitle
    /// for the duration of the clip.
    /// </summary>
    /// <param name="text">Line to speak and display.</param>
    /// <param name="voice">Voice name passed to <c>SetVoice</c>.</param>
    public async UniTask SpeakWithSubtitles(string text, string voice)
    {
        agent.ParametersController.SetVoice(voice);
        
        // Generate speech
        AudioClip clip = await agent.GenerateSpeechAsync(text);
        
        // Play audio and show subtitles
        var player = agent.AudioController.OutputPlayer;
        player.Play(clip);
        
        await ShowSubtitlesAnimated(text, clip.length);
    }
    
    /// <summary>
    /// Reveals <paramref name="text"/> one word at a time over <paramref name="duration"/>
    /// seconds, then clears the subtitle two seconds later.
    /// </summary>
    async UniTask ShowSubtitlesAnimated(string text, float duration)
    {
        // Drop empty entries so runs of spaces do not count as words; otherwise each
        // empty "word" consumes a time slot and the reveal drifts from the audio.
        string[] words = (text ?? string.Empty).Split(
            new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
        
        if (words.Length > 0)
        {
            float timePerWord = duration / words.Length;
            
            StringBuilder current = new();
            
            foreach (string word in words)
            {
                current.Append(word).Append(" ");
                subtitleText.text = current.ToString();
                
                await UniTask.Delay(TimeSpan.FromSeconds(timePerWord));
            }
        }
        
        // Clear after a delay
        await UniTask.Delay(TimeSpan.FromSeconds(2));
        subtitleText.text = "";
    }
}

Caching

Cache Generated Speech

/// <summary>
/// Bounded first-in-first-out cache of generated speech clips, keyed by voice and text.
/// </summary>
public class SpeechCache : MonoBehaviour
{
    [SerializeField] private AgentBehaviour agent;
    [SerializeField] private int maxCacheSize = 50;
    
    private readonly Dictionary<string, AudioClip> cache = new();
    // Insertion order of the keys in 'cache'; the front is the next one to evict.
    private readonly Queue<string> cacheKeys = new();
    
    /// <summary>
    /// Returns the cached clip for the voice/text pair, generating and caching it on a miss.
    /// </summary>
    /// <param name="text">Text to speak; must not be null.</param>
    /// <param name="voice">Voice name; part of the cache key.</param>
    /// <returns>The cached or newly generated clip.</returns>
    public async UniTask<AudioClip> GetOrGenerateSpeech(string text, string voice)
    {
        string key = $"{voice}_{text}";
        
        // Check cache (single lookup). A clip that has been destroyed compares
        // equal to null in Unity and is treated as a miss so it gets regenerated.
        if (cache.TryGetValue(key, out AudioClip cached) && cached != null)
        {
            Debug.Log($"✓ Cache hit: {Preview(text)}...");
            return cached;
        }
        
        // Generate
        Debug.Log($"Generating speech: {Preview(text)}...");
        
        agent.ParametersController.SetVoice(voice);
        AudioClip clip = await agent.GenerateSpeechAsync(text);
        
        // Add to cache
        AddToCache(key, clip);
        
        return clip;
    }
    
    /// <summary>Returns at most the first 30 characters of <paramref name="text"/> for logging.</summary>
    static string Preview(string text)
    {
        return text.Substring(0, Math.Min(30, text.Length));
    }
    
    /// <summary>
    /// Stores a clip, evicting the oldest entries first when the cache is full.
    /// </summary>
    void AddToCache(string key, AudioClip clip)
    {
        // A non-positive limit means "do not cache"; without this guard the
        // eviction below would dequeue from an empty queue and throw.
        if (maxCacheSize <= 0)
        {
            return;
        }
        
        // The key can already be present when two overlapping requests missed on
        // the same text. Replace the clip but do not enqueue the key again, or the
        // queue and the dictionary drift apart.
        if (cache.ContainsKey(key))
        {
            cache[key] = clip;
            return;
        }
        
        // Remove oldest until there is room (also covers a limit lowered at runtime).
        while (cache.Count >= maxCacheSize && cacheKeys.Count > 0)
        {
            string oldestKey = cacheKeys.Dequeue();
            cache.Remove(oldestKey);
        }
        
        cache[key] = clip;
        cacheKeys.Enqueue(key);
    }
    
    /// <summary>Forgets every cached clip.</summary>
    public void ClearCache()
    {
        cache.Clear();
        cacheKeys.Clear();
        
        Debug.Log("Speech cache cleared");
    }
}

Emotion and Emphasis

SSML-like Formatting (Punctuation and Casing Hints)

/// <summary>
/// Nudges the delivery of a line by rewriting its punctuation and casing
/// according to a named emotion before it is sent for speech generation.
/// </summary>
public class EmotionalSpeech : MonoBehaviour
{
    [SerializeField] private AgentBehaviour agent;
    
    /// <summary>
    /// Rewrites <paramref name="text"/> for the given <paramref name="emotion"/>,
    /// then generates and plays the speech using the agent's current voice.
    /// </summary>
    /// <param name="text">Line to speak.</param>
    /// <param name="emotion">
    /// One of "excited", "sad", "angry" or "whisper" (case-insensitive).
    /// Any other value, including null, leaves the text unchanged.
    /// </param>
    public async UniTask SpeakWithEmotion(string text, string emotion)
    {
        // Add emotional context
        string emotionalText = AddEmotionalContext(text, emotion);
        
        AudioClip clip = await agent.GenerateSpeechAsync(emotionalText);
        agent.AudioController.OutputPlayer.Play(clip);
    }
    
    /// <summary>Returns <paramref name="text"/> decorated for <paramref name="emotion"/>.</summary>
    string AddEmotionalContext(string text, string emotion)
    {
        // The emotion is an identifier, not user-facing text, so normalise it with
        // the invariant culture; a culture-sensitive ToLower() can change how some
        // keywords match under certain locales. A null emotion falls through to
        // the default branch instead of throwing.
        switch (emotion?.ToLowerInvariant())
        {
            case "excited":
                return text + "!";
            case "sad":
                return text.ToLower() + "...";
            case "angry":
                return text.ToUpper() + "!";
            case "whisper":
                return $"*{text}*";
            default:
                return text;
        }
    }
}

Batch Generation

Pre-generate Multiple Lines

/// <summary>
/// Generates speech clips for a whole set of dialogue lines in one voice.
/// </summary>
public class BatchSpeechGenerator : MonoBehaviour
{
    [SerializeField] private AgentBehaviour agent;
    
    /// <summary>
    /// Generates one clip per entry of <paramref name="dialogues"/>, sequentially,
    /// pausing one second between requests.
    /// </summary>
    /// <param name="dialogues">Map from an identifier to the text to speak.</param>
    /// <param name="voice">Voice applied to the agent before the batch starts.</param>
    /// <returns>Map from each identifier to its generated clip.</returns>
    public async UniTask<Dictionary<string, AudioClip>> GenerateBatch(
        Dictionary<string, string> dialogues,
        string voice)
    {
        agent.ParametersController.SetVoice(voice);
        
        Dictionary<string, AudioClip> results = new();
        
        int count = 0;
        foreach (var kvp in dialogues)
        {
            count++;
            Debug.Log($"Generating {count}/{dialogues.Count}: {kvp.Key}");
            
            AudioClip clip = await agent.GenerateSpeechAsync(kvp.Value);
            results[kvp.Key] = clip;
            
            // Rate limiting: pause only between requests. Waiting after the last
            // clip would just delay the caller by a second for nothing.
            if (count < dialogues.Count)
            {
                await UniTask.Delay(TimeSpan.FromSeconds(1));
            }
        }
        
        Debug.Log($"✓ Batch generation complete: {count} clips");
        return results;
    }
}

// Example usage: identifier -> line to pre-generate
var dialogues = new Dictionary<string, string>
{
    ["greeting"] = "Welcome to the shop!",
    ["thanks"] = "Thank you for your purchase!",
    ["goodbye"] = "Come back soon!"
};

// Every line is generated with the "fable" voice.
var clips = await batchGenerator.GenerateBatch(dialogues, "fable");

Save and Load

Export Audio Files

/// <summary>
/// Writes an <c>AudioClip</c> to disk as a 16-bit PCM WAV file under
/// <c>Application.persistentDataPath</c>.
/// </summary>
public class SpeechExporter : MonoBehaviour
{
    private const int WavHeaderSize = 44;
    private const int BytesPerSample = 2;   // 16-bit PCM
    
    [SerializeField] private string exportPath = "Speech";
    
    /// <summary>
    /// Saves <paramref name="clip"/> as "<paramref name="filename"/>.wav" inside the
    /// export folder, creating the folder if needed.
    /// </summary>
    public void SaveSpeech(AudioClip clip, string filename)
    {
        byte[] wavData = ConvertToWav(clip);
        
        string fullPath = Path.Combine(
            Application.persistentDataPath,
            exportPath,
            $"{filename}.wav"
        );
        
        Directory.CreateDirectory(Path.GetDirectoryName(fullPath));
        File.WriteAllBytes(fullPath, wavData);
        
        Debug.Log($"💾 Saved: {fullPath}");
    }
    
    /// <summary>
    /// Encodes the clip's samples as a 44-byte WAV header followed by interleaved
    /// little-endian 16-bit PCM data.
    /// </summary>
    byte[] ConvertToWav(AudioClip clip)
    {
        float[] samples = new float[clip.samples * clip.channels];
        clip.GetData(samples, 0);
        
        int dataSize = samples.Length * BytesPerSample;
        byte[] wav = new byte[WavHeaderSize + dataSize];
        
        // RIFF header
        WriteTag(wav, 0, "RIFF");
        WriteInt32(wav, 4, wav.Length - 8);                     // size of everything after this field
        WriteTag(wav, 8, "WAVE");
        
        // fmt chunk
        WriteTag(wav, 12, "fmt ");
        WriteInt32(wav, 16, 16);                                // fmt chunk size
        WriteInt16(wav, 20, 1);                                 // format 1 = PCM
        WriteInt16(wav, 22, (short)clip.channels);
        WriteInt32(wav, 24, clip.frequency);                    // sample rate
        WriteInt32(wav, 28, clip.frequency * clip.channels * BytesPerSample);   // byte rate
        WriteInt16(wav, 32, (short)(clip.channels * BytesPerSample));           // block align
        WriteInt16(wav, 34, 16);                                // bits per sample
        
        // data chunk
        WriteTag(wav, 36, "data");
        WriteInt32(wav, 40, dataSize);
        
        // Audio data
        int offset = WavHeaderSize;
        foreach (float sample in samples)
        {
            // Clamp first: a value outside [-1, 1] would overflow the 16-bit
            // cast and come out as a loud click instead of a clipped peak.
            float clamped = Mathf.Clamp(sample, -1f, 1f);
            WriteInt16(wav, offset, (short)(clamped * short.MaxValue));
            offset += BytesPerSample;
        }
        
        return wav;
    }
    
    /// <summary>Writes a four-character ASCII chunk tag at <paramref name="offset"/>.</summary>
    static void WriteTag(byte[] buffer, int offset, string tag)
    {
        for (int i = 0; i < tag.Length; i++)
        {
            buffer[offset + i] = (byte)tag[i];
        }
    }
    
    /// <summary>
    /// Writes a 16-bit value in little-endian order, as WAV requires, independent
    /// of the host byte order and without allocating a temporary array.
    /// </summary>
    static void WriteInt16(byte[] buffer, int offset, short value)
    {
        unchecked
        {
            buffer[offset] = (byte)value;
            buffer[offset + 1] = (byte)(value >> 8);
        }
    }
    
    /// <summary>Writes a 32-bit value in little-endian order.</summary>
    static void WriteInt32(byte[] buffer, int offset, int value)
    {
        unchecked
        {
            buffer[offset] = (byte)value;
            buffer[offset + 1] = (byte)(value >> 8);
            buffer[offset + 2] = (byte)(value >> 16);
            buffer[offset + 3] = (byte)(value >> 24);
        }
    }
}

Error Handling

Handle TTS Errors

try
{
    AudioClip clip = await agent.GenerateSpeechAsync(text);
    agent.AudioController.OutputPlayer.Play(clip);
}
catch (Exception ex)
{
    string detail = ex.Message;
    Debug.LogError($"TTS error: {detail}");
    
    // Translate known markers in the exception message into a user-facing
    // message; the first matching marker wins, anything else gets the generic text.
    string userMessage;
    if (detail.Contains("invalid_text"))
    {
        userMessage = "Invalid text input.";
    }
    else if (detail.Contains("too_long"))
    {
        userMessage = "Text too long (max 4096 characters).";
    }
    else if (detail.Contains("rate_limit"))
    {
        userMessage = "Rate limit exceeded. Please wait.";
    }
    else
    {
        userMessage = "Speech generation failed.";
    }
    
    ShowMessage(userMessage);
}

Complete Example

using System;
using System.Linq;
using System.Text;
using Cysharp.Threading.Tasks;
using Glitch9.AIDevKit.Agents;
using TMPro;
using UnityEngine;
using UnityEngine.UI;

/// <summary>
/// Complete example: reads text from an input field, speaks it in the voice chosen
/// in a dropdown (served through a <c>SpeechCache</c>), and reveals the subtitle word
/// by word while the clip plays.
/// </summary>
public class TTSManager : MonoBehaviour
{
    private const string DefaultVoice = "nova";
    
    [SerializeField] private AgentBehaviour agent;
    [SerializeField] private TMP_InputField textInput;
    [SerializeField] private Dropdown voiceDropdown;
    [SerializeField] private Button speakButton;
    [SerializeField] private TMP_Text subtitleText;
    
    // Dropdown entries; the dropdown index maps 1:1 onto this array.
    private readonly string[] voices = { "alloy", "echo", "fable", "onyx", "nova", "shimmer" };
    private SpeechCache cache;
    
    /// <summary>Resolves the speech cache and wires up the UI.</summary>
    void Start()
    {
        // Prefer a SpeechCache that is already attached to this GameObject and
        // configured in the Inspector.
        // NOTE(review): a SpeechCache created through AddComponent starts with its
        // serialized agent reference unassigned, so the fallback below only works
        // if that reference is provided some other way - confirm for your setup.
        if (!TryGetComponent(out cache))
        {
            cache = gameObject.AddComponent<SpeechCache>();
        }
        
        SetupUI();
        
        Debug.Log("✓ TTS Manager ready");
    }
    
    /// <summary>Removes the button subscription when this component is destroyed.</summary>
    void OnDestroy()
    {
        if (speakButton != null)
        {
            speakButton.onClick.RemoveListener(Speak);
        }
    }
    
    /// <summary>Fills the voice dropdown, preselects the default voice and hooks up the button.</summary>
    void SetupUI()
    {
        voiceDropdown.ClearOptions();
        voiceDropdown.AddOptions(voices.ToList());
        
        // Look the default up by name instead of hard-coding its position.
        voiceDropdown.value = Mathf.Max(0, Array.IndexOf(voices, DefaultVoice));
        
        speakButton.onClick.AddListener(Speak);
    }
    
    /// <summary>
    /// Click handler: speaks the current input text. It is <c>async void</c> because
    /// it is a UI event handler, so all failures are caught and reported here.
    /// </summary>
    async void Speak()
    {
        string text = textInput.text;
        
        // Whitespace-only input has nothing to speak either.
        if (string.IsNullOrWhiteSpace(text))
        {
            Debug.LogWarning("No text to speak");
            return;
        }
        
        string voice = voices[voiceDropdown.value];
        
        // Block repeated clicks while a request is in flight.
        speakButton.interactable = false;
        subtitleText.text = "Generating...";
        
        try
        {
            // Get or generate speech
            AudioClip clip = await cache.GetOrGenerateSpeech(text, voice);
            
            // Play with subtitles
            await SpeakWithSubtitles(text, clip);
            
            Debug.Log($"✓ Spoke: {text}");
        }
        catch (Exception ex)
        {
            Debug.LogError($"TTS error: {ex.Message}");
            subtitleText.text = $"<color=red>Error: {ex.Message}</color>";
            
            // Keep the error visible briefly before the finally block clears it.
            await UniTask.Delay(TimeSpan.FromSeconds(2));
        }
        finally
        {
            speakButton.interactable = true;
            subtitleText.text = "";
        }
    }
    
    /// <summary>
    /// Plays <paramref name="clip"/> and reveals <paramref name="text"/> one word at a
    /// time, spreading the words evenly over the clip length; clears the subtitle one
    /// second after playback has finished.
    /// </summary>
    async UniTask SpeakWithSubtitles(string text, AudioClip clip)
    {
        var player = agent.AudioController.OutputPlayer;
        player.Play(clip);
        
        // Show subtitles. Empty entries are dropped so runs of spaces do not
        // count as words and stretch the reveal.
        string[] words = text.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
        
        if (words.Length > 0)
        {
            float timePerWord = clip.length / words.Length;
            
            StringBuilder current = new();
            
            foreach (string word in words)
            {
                current.Append(word).Append(" ");
                subtitleText.text = current.ToString();
                
                await UniTask.Delay(TimeSpan.FromSeconds(timePerWord));
            }
        }
        
        // Wait for completion
        await UniTask.WaitUntil(() => !player.IsPlaying);
        
        // Clear after delay
        await UniTask.Delay(TimeSpan.FromSeconds(1));
        subtitleText.text = "";
    }
}

Next Steps

Last updated