Audio Input

Send audio messages to your agent using voice input.

Overview

Audio input enables:

  • Voice conversations - Speak to your agent

  • Audio transcription - Convert speech to text

  • Multi-modal interaction - Combine voice and text

Basic Audio Input

Send Audio Clip

// Obtain the clip to send (loaded or recorded elsewhere)
AudioClip recordedClip = GetAudioClip();

// Hand the clip to the agent and wait for its response
Response reply = await agent.SendAsync(recordedClip);

// Log the text carried by the response
Debug.Log($"Transcription: {reply.Text}");

Using Input Audio Recorder

[SerializeField] private AgentBehaviour agent;
[SerializeField] private InputAudioRecorder recorder;

// Hands the recorder to the agent before any audio is requested.
void Start()
{
    agent.InputAudioRecorder = recorder;
}

/// <summary>
/// Asks the agent to record and send audio in a single call.
/// </summary>
/// <remarks>
/// The method is async void, so callers cannot await it or observe its
/// failures; exceptions from the send are therefore caught and logged here.
/// </remarks>
public async void RecordAndSend()
{
    try
    {
        // Agent handles recording automatically
        await agent.SendAudioAsync();
    }
    catch (System.Exception ex)
    {
        Debug.LogError($"Error: {ex.Message}");
    }
}

Recording Audio

Manual Recording

// Both references are used below: the recorder captures the clip, the agent receives it.
[SerializeField] private AgentBehaviour agent;
[SerializeField] private InputAudioRecorder recorder;

/// <summary>
/// Records until the Space key is released, then sends the captured clip to the agent.
/// </summary>
/// <remarks>
/// The method is async void, so callers cannot await it or observe its
/// failures; exceptions are therefore caught and logged here.
/// </remarks>
public async void ManualRecord()
{
    try
    {
        // Start recording
        recorder.StartRecording();

        // Wait for user to finish (e.g., button release)
        await UniTask.WaitUntil(() => Input.GetKeyUp(KeyCode.Space));

        // Stop and get clip
        AudioClip clip = recorder.StopRecording();

        // Send to agent
        await agent.SendAsync(clip);
    }
    catch (System.Exception ex)
    {
        Debug.LogError($"Error: {ex.Message}");
    }
}

Push-to-Talk

/// <summary>
/// Push-to-talk input: records while Space is held and sends the clip to the
/// agent when the key is released.
/// </summary>
public class PushToTalk : MonoBehaviour
{
    [SerializeField] private AgentBehaviour agent;
    [SerializeField] private InputAudioRecorder recorder;

    // True between StartRecording and StopRecording.
    private bool isRecording = false;

    void Update()
    {
        // Hold Space to record
        if (Input.GetKeyDown(KeyCode.Space) && !isRecording)
        {
            StartRecording();
        }
        else if (Input.GetKeyUp(KeyCode.Space) && isRecording)
        {
            StopRecording();
        }
    }

    // Nothing is awaited here, so the method is intentionally not async.
    void StartRecording()
    {
        isRecording = true;
        recorder.StartRecording();
        Debug.Log("🎤 Recording...");
    }

    // async void: nobody can await this, so send failures are handled inside.
    async void StopRecording()
    {
        isRecording = false;
        AudioClip clip = recorder.StopRecording();

        Debug.Log("⏹ Stopped recording");

        try
        {
            // Send to agent
            await agent.SendAsync(clip);
        }
        catch (System.Exception ex)
        {
            Debug.LogError($"Error: {ex.Message}");
        }
    }
}

Voice Activity Detection

/// <summary>
/// Voice-activated input: starts recording when the recorder's reported volume
/// exceeds <c>threshold</c> and sends the clip after <c>silenceDuration</c>
/// seconds below it.
/// </summary>
public class VoiceActivation : MonoBehaviour
{
    [SerializeField] private AgentBehaviour agent;
    [SerializeField] private InputAudioRecorder recorder;

    // Volume above which input counts as speech.
    [SerializeField] private float threshold = 0.02f;
    // Seconds of continuous quiet that end a recording.
    [SerializeField] private float silenceDuration = 2f;

    private bool isRecording = false;
    // Accumulated quiet time while recording; reset whenever speech is heard.
    private float silenceTimer = 0f;

    void Update()
    {
        float volume = GetMicrophoneVolume();

        if (volume > threshold)
        {
            if (!isRecording)
            {
                StartRecording();
            }
            silenceTimer = 0f;
        }
        else if (isRecording)
        {
            silenceTimer += Time.deltaTime;

            if (silenceTimer >= silenceDuration)
            {
                StopRecording();
            }
        }
    }

    float GetMicrophoneVolume()
    {
        // Get volume from recorder
        return recorder.GetCurrentVolume();
    }

    // Nothing is awaited here, so the method is intentionally not async.
    void StartRecording()
    {
        isRecording = true;
        recorder.StartRecording();
        Debug.Log("🎤 Voice detected, recording...");
    }

    // async void: nobody can await this, so send failures are handled inside.
    async void StopRecording()
    {
        isRecording = false;
        silenceTimer = 0f;

        AudioClip clip = recorder.StopRecording();
        Debug.Log("⏹ Silence detected, sending...");

        try
        {
            await agent.SendAsync(clip);
        }
        catch (System.Exception ex)
        {
            Debug.LogError($"Error: {ex.Message}");
        }
    }
}

Audio Events

Recording Lifecycle

// Subscribe to the agent's audio-input events.
void Start()
{
    agent.onInputAudioStarted.AddListener(OnRecordingStarted);
    agent.onInputAudioCompleted.AddListener(OnRecordingCompleted);
    agent.onInputAudioTranscribed.AddListener(OnTranscribed);
}

// Remove the handlers added in Start so the agent stops calling into this
// object once it has been destroyed.
void OnDestroy()
{
    if (agent == null)
    {
        return;
    }

    agent.onInputAudioStarted.RemoveListener(OnRecordingStarted);
    agent.onInputAudioCompleted.RemoveListener(OnRecordingCompleted);
    agent.onInputAudioTranscribed.RemoveListener(OnTranscribed);
}

// Recording began: show the recording indicator.
void OnRecordingStarted()
{
    Debug.Log("🎤 Recording started");
    ShowRecordingIndicator();
}

// Recording finished: swap the recording indicator for the transcribing one.
void OnRecordingCompleted(AudioClip clip)
{
    Debug.Log($"⏹ Recording completed: {clip.length}s");
    HideRecordingIndicator();
    ShowTranscribingIndicator();
}

// Transcription arrived: hide the indicator and display the text.
void OnTranscribed(string text)
{
    Debug.Log($"📝 Transcribed: {text}");
    HideTranscribingIndicator();
    DisplayTranscription(text);
}

UnityEvents (AgentBehaviour)

// In Inspector, bind to UI elements
/// <summary>
/// Drives a recording indicator and a transcription label from the agent's
/// audio-input events.
/// </summary>
public class AudioUI : MonoBehaviour
{
    [SerializeField] private AgentBehaviour agent;
    [SerializeField] private GameObject recordingIndicator;
    [SerializeField] private TMP_Text transcriptionText;

    void Start()
    {
        // Bind in Inspector or code.
        // Named methods are used instead of lambdas so the same delegates can
        // be removed again in OnDestroy.
        agent.onInputAudioStarted.AddListener(HandleRecordingStarted);
        agent.onInputAudioCompleted.AddListener(HandleRecordingCompleted);
        agent.onInputAudioTranscribed.AddListener(HandleTranscribed);
    }

    // Unsubscribe so the agent does not call into destroyed UI objects.
    void OnDestroy()
    {
        if (agent == null)
        {
            return;
        }

        agent.onInputAudioStarted.RemoveListener(HandleRecordingStarted);
        agent.onInputAudioCompleted.RemoveListener(HandleRecordingCompleted);
        agent.onInputAudioTranscribed.RemoveListener(HandleTranscribed);
    }

    void HandleRecordingStarted()
    {
        recordingIndicator.SetActive(true);
    }

    void HandleRecordingCompleted(AudioClip clip)
    {
        recordingIndicator.SetActive(false);
    }

    void HandleTranscribed(string text)
    {
        transcriptionText.text = text;
    }
}

Transcription Settings

Configure Language

// Set in AgentSettings or at runtime.
// The two assignments below are alternatives: each one overwrites the
// agent's InputAudioLanguage, so keep only the one you need.
agent.InputAudioLanguage = SystemLanguage.English;

// Or Japanese
agent.InputAudioLanguage = SystemLanguage.Japanese;

Transcription Parameters

// In AgentSettings.InputAudioParameters
// Replaces the settings' transcription parameters with a new instance.
settings.InputAudioParameters = new TranscriptionParameters
{
    Model = "whisper-1",
    SpokenLanguage = SystemLanguage.English,
    Prompt = "Technical conversation about Unity and game development",
    Temperature = 0.2f  // Lower = less random output, not guaranteed accuracy; NOTE(review): confirm against the transcription API docs
};

Custom Prompt for Context

// Improve accuracy with context
// The verbatim string keeps its line breaks as written, including the one
// right after the opening quote and the one before the closing quote.
agent.Settings.InputAudioParameters.Prompt = @"
This is a conversation about Unity game development.
Common terms: GameObject, MonoBehaviour, coroutine, prefab, shader.
";

Audio Quality

Check Microphone

// Reports at startup whether a microphone is available and, if so, which
// device is listed first.
void Start()
{
    bool hasMicrophone = Microphone.devices.Length != 0;

    if (hasMicrophone)
    {
        Debug.Log($"Using microphone: {Microphone.devices[0]}");
    }
    else
    {
        Debug.LogError("No microphone detected!");
    }
}

Validate Audio Clip

// Returns true when the clip exists and its length is between 0.1 and 60
// seconds inclusive; otherwise logs the reason and returns false.
bool IsAudioClipValid(AudioClip clip)
{
    if (clip == null)
    {
        Debug.LogError("Audio clip is null");
        return false;
    }

    float seconds = clip.length;
    bool tooShort = seconds < 0.1f;
    bool tooLong = seconds > 60f;

    if (tooShort)
    {
        Debug.LogWarning("Audio clip too short");
    }
    else if (tooLong)
    {
        Debug.LogWarning("Audio clip too long");
    }

    return !(tooShort || tooLong);
}

Noise Reduction

/// <summary>
/// Applies a simple noise gate: every sample whose absolute value is below
/// <paramref name="threshold"/> is set to zero, and the result is returned as
/// a new clip named "Filtered".
/// </summary>
/// <param name="original">Clip to filter. It is not modified.</param>
/// <param name="threshold">Gate level; defaults to the original 0.01.</param>
/// <returns>
/// The filtered clip; <c>null</c> when <paramref name="original"/> is null;
/// or <paramref name="original"/> itself when it has no samples or its sample
/// data cannot be read.
/// </returns>
public AudioClip ReduceNoise(AudioClip original, float threshold = 0.01f)
{
    if (original == null)
    {
        Debug.LogError("Audio clip is null");
        return null;
    }

    // Get samples
    float[] samples = new float[original.samples * original.channels];
    if (samples.Length == 0)
    {
        return original;
    }

    // GetData reports failure through its return value; without this check a
    // failed read would produce a clip of pure silence.
    if (!original.GetData(samples, 0))
    {
        Debug.LogWarning("Could not read audio samples; returning original clip");
        return original;
    }

    // Apply simple noise gate
    for (int i = 0; i < samples.Length; i++)
    {
        if (Mathf.Abs(samples[i]) < threshold)
        {
            samples[i] = 0f;
        }
    }

    // Create new clip
    AudioClip filtered = AudioClip.Create(
        "Filtered",
        original.samples,
        original.channels,
        original.frequency,
        false
    );
    filtered.SetData(samples, 0);

    return filtered;
}

Combining Audio and Text

Send Both

// Send audio for transcription, but also include text context
/// <summary>
/// Sends a text message and then an audio clip, so the text is already part
/// of the conversation when the audio arrives.
/// </summary>
/// <param name="audio">Clip sent after the context message.</param>
/// <param name="context">Text sent first; skipped when null or empty.</param>
public async UniTask SendAudioWithContext(AudioClip audio, string context)
{
    // First send context (an empty message would add nothing, so skip it)
    if (!string.IsNullOrEmpty(context))
    {
        await agent.SendAsync(context);
    }

    // Then send audio
    await agent.SendAsync(audio);

    // Agent has context from previous message
}

Pre-transcribe and Edit

/// <summary>
/// Transcribes the clip, lets the user edit the text, and sends the edited
/// text to the agent instead of the audio.
/// </summary>
public async UniTask SendAudioEditable(AudioClip audio)
{
    // Step 1: speech to text
    string draft = await agent.TranscribeAsync(audio);

    // Steps 2 and 3: wait for the user's edit, then send the edited text
    await agent.SendAsync(await ShowEditDialog(draft));
}

Error Handling

Recording Errors

try
{
    // Record, then forward the clip to the agent
    AudioClip recorded = await RecordAudioAsync();
    await agent.SendAsync(recorded);
}
catch (Exception ex)
{
    // Microphone failures get a user-facing message; everything else is only logged
    if (ex is MicrophoneException)
    {
        Debug.LogError($"Microphone error: {ex.Message}");
        ShowError("Microphone access denied");
    }
    else
    {
        Debug.LogError($"Recording failed: {ex.Message}");
    }
}

Transcription Errors

agent.onError.AddListener(OnError);

// Handles agent errors whose message mentions transcription; all other
// errors are ignored by this handler.
void OnError(string error)
{
    if (string.IsNullOrEmpty(error))
    {
        return;
    }

    // Case-insensitive so messages such as "Transcription failed" also match.
    bool isTranscriptionError =
        error.IndexOf("transcription", System.StringComparison.OrdinalIgnoreCase) >= 0;

    if (isTranscriptionError)
    {
        Debug.LogError("Transcription failed");
        ShowError("Could not understand audio. Please try again.");
    }
}

Performance Tips

1. Limit Recording Duration

// Longest allowed recording, in seconds.
private const float maxRecordingDuration = 30f;
// Time.time at the moment the current recording started.
private float recordingStartTime;

void StartRecording()
{
    recorder.StartRecording();
    recordingStartTime = Time.time;

    // Arm the duration check in Update; without this flag the limit never fires.
    // NOTE(review): isRecording is assumed to be a bool field on the enclosing
    // class, cleared by StopRecording as in the earlier examples.
    isRecording = true;
}

void Update()
{
    if (isRecording)
    {
        // Stop automatically once the cap is reached
        if (Time.time - recordingStartTime >= maxRecordingDuration)
        {
            StopRecording();
            Debug.Log("Max recording duration reached");
        }
    }
}

2. Compress Audio

/// <summary>
/// Returns the clip resampled to 16 kHz when its frequency is higher than
/// that; clips at or below 16 kHz are returned unchanged.
/// </summary>
public AudioClip CompressAudio(AudioClip original)
{
    // Downsample to 16kHz (Whisper's native rate)
    const int targetFrequency = 16000;

    bool needsDownsampling = original.frequency > targetFrequency;

    // Resampling itself is delegated to ResampleAudio
    // (implementation depends on audio library)
    return needsDownsampling
        ? ResampleAudio(original, targetFrequency)
        : original;
}

3. Cache Microphone Access

// First microphone device name found so far; stays null until one is found.
private static string cachedMicrophone;

// Returns the cached device name, looking it up only while nothing is cached.
// Returns null when no microphone is available.
string GetMicrophone()
{
    if (cachedMicrophone != null)
    {
        return cachedMicrophone;
    }

    bool hasDevice = Microphone.devices.Length > 0;
    if (hasDevice)
    {
        cachedMicrophone = Microphone.devices[0];
    }

    return cachedMicrophone;
}

Complete Example

using UnityEngine;
using Glitch9.AIDevKit.Agents;
using Cysharp.Threading.Tasks;
using TMPro;

/// <summary>
/// Complete voice-input example: records from a button-driven UI, caps the
/// recording length, sends the clip to the agent, and reflects recording,
/// transcription, and error states in the UI.
/// </summary>
public class VoiceInput : MonoBehaviour
{
    [SerializeField] private AgentBehaviour agent;
    [SerializeField] private InputAudioRecorder recorder;

    [Header("UI")]
    [SerializeField] private GameObject recordButton;
    [SerializeField] private GameObject stopButton;
    [SerializeField] private GameObject recordingIndicator;
    [SerializeField] private TMP_Text transcriptionText;
    [SerializeField] private TMP_Text statusText;

    [Header("Settings")]
    [SerializeField] private float maxRecordingDuration = 30f;

    private bool isRecording = false;
    // Time.time at the moment the current recording started.
    private float recordingStartTime;

    void Start()
    {
        // Setup agent
        agent.InputAudioRecorder = recorder;
        agent.InputAudioLanguage = SystemLanguage.English;

        // Setup events
        agent.onInputAudioStarted.AddListener(OnRecordingStarted);
        agent.onInputAudioCompleted.AddListener(OnRecordingCompleted);
        agent.onInputAudioTranscribed.AddListener(OnTranscribed);
        agent.onError.AddListener(OnError);

        // Check microphone
        if (Microphone.devices.Length == 0)
        {
            statusText.text = "No microphone detected";
            recordButton.SetActive(false);
        }
    }

    // Remove the handlers added in Start so the agent does not call into
    // this object (and its UI references) after it has been destroyed.
    void OnDestroy()
    {
        if (agent == null)
        {
            return;
        }

        agent.onInputAudioStarted.RemoveListener(OnRecordingStarted);
        agent.onInputAudioCompleted.RemoveListener(OnRecordingCompleted);
        agent.onInputAudioTranscribed.RemoveListener(OnTranscribed);
        agent.onError.RemoveListener(OnError);
    }

    void Update()
    {
        // Check max duration
        if (isRecording)
        {
            float duration = Time.time - recordingStartTime;
            statusText.text = $"Recording: {duration:F1}s / {maxRecordingDuration}s";

            if (duration >= maxRecordingDuration)
            {
                StopRecording();
            }
        }
    }

    /// <summary>Starts a recording and switches the UI to its recording state.</summary>
    public void StartRecording()
    {
        // Ignore repeated calls so an in-progress recording is not restarted.
        if (isRecording)
        {
            return;
        }

        isRecording = true;
        recordingStartTime = Time.time;

        recorder.StartRecording();

        recordButton.SetActive(false);
        stopButton.SetActive(true);
        recordingIndicator.SetActive(true);
        transcriptionText.text = "";
    }

    /// <summary>
    /// Stops the current recording, restores the UI, and sends the clip to
    /// the agent. Does nothing when no recording is in progress.
    /// </summary>
    public async void StopRecording()
    {
        if (!isRecording)
        {
            return;
        }

        isRecording = false;

        AudioClip clip = recorder.StopRecording();

        recordButton.SetActive(true);
        stopButton.SetActive(false);
        recordingIndicator.SetActive(false);
        statusText.text = "Processing...";

        try
        {
            await agent.SendAsync(clip);
        }
        // Fully qualified: the file's using directives do not include System.
        catch (System.Exception ex)
        {
            Debug.LogError($"Error: {ex.Message}");
            statusText.text = "Error: " + ex.Message;
        }
    }

    void OnRecordingStarted()
    {
        Debug.Log("🎤 Recording started");
    }

    void OnRecordingCompleted(AudioClip clip)
    {
        Debug.Log($"⏹ Recording completed: {clip.length}s");
        statusText.text = "Transcribing...";
    }

    void OnTranscribed(string text)
    {
        Debug.Log($"📝 Transcribed: {text}");
        transcriptionText.text = $"You: {text}";
        statusText.text = "Ready";
    }

    // Shows the error and returns the UI to its idle (not recording) state.
    void OnError(string error)
    {
        Debug.LogError($"Error: {error}");
        statusText.text = $"Error: {error}";

        isRecording = false;
        recordButton.SetActive(true);
        stopButton.SetActive(false);
        recordingIndicator.SetActive(false);
    }
}

Next Steps

Last updated