Speech to Text
Transcribe audio to text using .GENTranscript() or .SpeechToText().
Basic Usage
AudioClip recording = Microphone.Start(null, false, 10, 44100);
await UniTask.Delay(5000);
Microphone.End(null);
string transcript = await recording
.GENTranscript()
.ExecuteAsync();
Debug.Log($"You said: {transcript}");Input Types
AudioClip Input
AudioClip audio = Resources.Load<AudioClip>("Recording");
string text = await audio
.GENTranscript()
.ExecuteAsync();
File Input
var audioFile = new File<AudioClip>(audioClip, "recording.wav");
string text = await audioFile
.GENTranscript()
.ExecuteAsync();
Alias Method
// SpeechToText() is an alias for GENTranscript()
string text = await audioClip
.SpeechToText()
.ExecuteAsync();
Configuration
Model Selection
// OpenAI Whisper
string text = await audioClip
.GENTranscript()
.SetModel(OpenAIModel.Whisper1)
.ExecuteAsync();
// Google Chirp
string text = await audioClip
.GENTranscript()
.SetModel(GoogleModel.ChirpV2)
.ExecuteAsync();
Language Hint
string text = await audioClip
.GENTranscript()
.SetLanguage(SystemLanguage.English)
.ExecuteAsync();
Supported languages:
English, Spanish, French, German, Italian, Portuguese
Chinese, Japanese, Korean
Arabic, Russian, Turkish
And many more...
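If the device language might fall outside the supported set, a guarded fallback keeps the hint safe. A minimal sketch (the whitelist below is illustrative, not the full supported list):
// Fall back to English when the device language is not in a known-good set.
SystemLanguage hint = Application.systemLanguage;
var knownGood = new HashSet<SystemLanguage> {
SystemLanguage.English, SystemLanguage.Spanish, SystemLanguage.French,
SystemLanguage.German, SystemLanguage.Japanese, SystemLanguage.Korean
};
if (!knownGood.Contains(hint))
hint = SystemLanguage.English;
string text = await audioClip
.GENTranscript()
.SetLanguage(hint)
.ExecuteAsync();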
Context Prompt
Provide context to improve accuracy:
string text = await audioClip
.GENTranscript()
.SetPrompt("Technical terms: Unity, C#, GameObject, MonoBehaviour")
.ExecuteAsync();
Temperature
Control randomness of transcription (0.0-1.0):
string text = await audioClip
.GENTranscript()
.SetTemperature(0.0f) // More deterministic
.ExecuteAsync();
Response Format
// Get detailed response with timestamps
var response = await audioClip
.GENTranscript()
.SetResponseFormat(TranscriptFormat.VerboseJson)
.ExecuteAsync();
Available formats:
TranscriptFormat.Text - Plain text only
TranscriptFormat.Json - JSON with text
TranscriptFormat.VerboseJson - JSON with timestamps and metadata
TranscriptFormat.Srt - SubRip subtitle format
TranscriptFormat.Vtt - WebVTT subtitle format
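The subtitle formats can be written straight to disk. A minimal sketch, assuming ExecuteAsync returns the raw subtitle text when an Srt or Vtt format is selected (the output path is illustrative):
// Request SubRip output and save it as a .srt file.
string srt = await audioClip
.GENTranscript()
.SetResponseFormat(TranscriptFormat.Srt)
.ExecuteAsync();
System.IO.File.WriteAllText(
System.IO.Path.Combine(Application.persistentDataPath, "subtitles.srt"), srt);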
Unity Integration Examples
Example 1: Voice Command System
public class VoiceCommandSystem : MonoBehaviour
{
[SerializeField] private PlayerController player; // hypothetical player controller exposing Jump()/Shoot()
private bool isListening = false;
public async UniTask StartListening()
{
if (isListening) return;
isListening = true;
// Start recording
AudioClip recording = Microphone.Start(null, false, 10, 44100);
// Wait for user to finish speaking
await UniTask.Delay(5000);
// Stop recording
Microphone.End(null);
isListening = false;
// Transcribe
string command = await recording
.GENTranscript()
.SetLanguage(Application.systemLanguage)
.ExecuteAsync();
// Process command
ProcessCommand(command);
}
void ProcessCommand(string command)
{
if (command.Contains("jump"))
player.Jump();
else if (command.Contains("shoot"))
player.Shoot();
}
}
Example 2: Real-time Subtitles
public class RealtimeSubtitles : MonoBehaviour
{
[SerializeField] private TMPro.TextMeshProUGUI subtitleText;
void Update()
{
if (Input.GetKeyDown(KeyCode.Space))
StartRecording().Forget();
}
async UniTaskVoid StartRecording()
{
AudioClip recording = Microphone.Start(null, false, 5, 44100);
await UniTask.Delay(3000);
Microphone.End(null);
string text = await recording
.GENTranscript()
.ExecuteAsync();
subtitleText.text = text;
}
}
Example 3: Dictation System
public class DictationSystem : MonoBehaviour
{
[SerializeField] private TMPro.TMP_InputField inputField;
private List<string> transcripts = new();
public async UniTask RecordDictation()
{
Debug.Log("Recording... Press Space to stop");
AudioClip recording = Microphone.Start(null, false, 60, 44100);
await UniTask.WaitUntil(() => Input.GetKeyDown(KeyCode.Space));
Microphone.End(null);
string text = await recording
.GENTranscript()
.SetLanguage(Application.systemLanguage)
.ExecuteAsync();
transcripts.Add(text);
inputField.text = string.Join(" ", transcripts);
}
}
Example 4: Audio File Transcriber
public class AudioFileTranscriber : MonoBehaviour
{
public async UniTask<string> TranscribeAudioFile(string filePath)
{
// Load audio file
AudioClip audio = await LoadAudioFile(filePath);
// Transcribe
string transcript = await audio
.GENTranscript()
.SetModel(OpenAIModel.Whisper1)
.SetResponseFormat(TranscriptFormat.VerboseJson)
.ExecuteAsync();
// Save to text file
string outputPath = Path.ChangeExtension(filePath, ".txt");
File.WriteAllText(outputPath, transcript);
return transcript;
}
async UniTask<AudioClip> LoadAudioFile(string path)
{
// Runtime loading via UnityWebRequestMultimedia (requires UnityEngine.Networking);
// the AudioType must match the file format.
using var request = UnityWebRequestMultimedia.GetAudioClip("file://" + path, AudioType.WAV);
await request.SendWebRequest();
if (request.result != UnityWebRequest.Result.Success)
throw new System.IO.IOException($"Failed to load audio: {request.error}");
return DownloadHandlerAudioClip.GetContent(request);
}
}
Example 5: Meeting Transcriber
public class MeetingTranscriber : MonoBehaviour
{
private List<string> transcripts = new();
private bool isRecording = false;
public async UniTask StartMeeting()
{
isRecording = true;
while (isRecording)
{
// Record 30-second segments
AudioClip segment = Microphone.Start(null, false, 30, 44100);
await UniTask.Delay(30000);
Microphone.End(null);
// Transcribe segment
string text = await segment
.GENTranscript()
.SetPrompt("Meeting transcript with technical terms")
.ExecuteAsync();
transcripts.Add($"[{DateTime.Now:HH:mm:ss}] {text}");
}
}
public void StopMeeting()
{
isRecording = false;
// Save full transcript
string fullTranscript = string.Join("\n\n", transcripts);
File.WriteAllText("meeting_transcript.txt", fullTranscript);
}
}
Example 6: Multi-Language Support
public class MultiLanguageTranscriber : MonoBehaviour
{
public async UniTask<string> TranscribeWithLanguageDetection(AudioClip audio)
{
// Try transcribing without language hint (auto-detect)
string transcript = await audio
.GENTranscript()
.ExecuteAsync();
return transcript;
}
public async UniTask<string> TranscribeSpecificLanguage(
AudioClip audio,
SystemLanguage language)
{
return await audio
.GENTranscript()
.SetLanguage(language)
.ExecuteAsync();
}
}
Provider Support
OpenAI Whisper
string text = await audioClip
.GENTranscript()
.SetModel(OpenAIModel.Whisper1)
.SetLanguage(SystemLanguage.English)
.SetPrompt("Context for better accuracy")
.SetTemperature(0.0f)
.SetResponseFormat(TranscriptFormat.VerboseJson)
.ExecuteAsync();
Features:
✅ 99+ languages
✅ High accuracy
✅ Speaker diarization (in verbose mode)
✅ Timestamp support
Google Chirp
string text = await audioClip
.GENTranscript()
.SetModel(GoogleModel.ChirpV2)
.SetLanguage(SystemLanguage.English)
.ExecuteAsync();
Features:
✅ Multiple languages
✅ Real-time streaming
✅ Punctuation
✅ Word-level timestamps
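If one provider rejects a request, retrying with the other can salvage the call. A hedged sketch using the AIApiException type shown in the error-handling section below:
// Try Whisper first; fall back to Chirp on an API error.
async UniTask<string> TranscribeWithFallback(AudioClip clip)
{
try
{
return await clip.GENTranscript()
.SetModel(OpenAIModel.Whisper1)
.ExecuteAsync();
}
catch (AIApiException ex)
{
Debug.LogWarning($"Whisper failed ({ex.Message}); retrying with Chirp");
return await clip.GENTranscript()
.SetModel(GoogleModel.ChirpV2)
.ExecuteAsync();
}
}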
Audio Requirements
Format Requirements
Supported formats:
WAV
MP3
M4A
FLAC
OGG
Recommended settings:
Sample rate: 16kHz or higher
Channels: Mono or Stereo
Bit depth: 16-bit or higher
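A quick guard can flag clips that fall below these recommendations before they are sent. A minimal sketch using Unity's AudioClip properties:
// Warn when a clip falls below the recommended capture settings.
void ValidateClip(AudioClip clip)
{
if (clip.frequency < 16000)
Debug.LogWarning($"Sample rate {clip.frequency} Hz is below the recommended 16 kHz");
if (clip.channels > 2)
Debug.LogWarning("More than two channels; downmix to mono or stereo first");
}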
Size Limits
OpenAI:
Max file size: 25 MB
Max duration: ~2 hours (at standard quality)
Google:
Max file size: 10 MB (for sync)
Max duration: 1 minute (for sync)
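To stay under these limits, you can estimate a clip's uncompressed WAV size from its sample count (a sketch assuming 16-bit PCM; the actual upload size depends on the encoding used):
// Estimated 16-bit PCM WAV size: samples * channels * 2 bytes, plus a 44-byte header.
long EstimateWavBytes(AudioClip clip) =>
(long)clip.samples * clip.channels * 2 + 44;
bool FitsOpenAILimit(AudioClip clip) =>
EstimateWavBytes(clip) <= 25L * 1024 * 1024; // 25 MB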
Best Practices for Audio
// ✅ Good - Use appropriate sample rate
AudioClip recording = Microphone.Start(null, false, 10, 16000);
// ✅ Good - Remove silence at start/end
AudioClip trimmed = TrimSilence(recording);
// ✅ Good - Normalize volume
AudioClip normalized = NormalizeVolume(recording);
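// Note: TrimSilence and NormalizeVolume are not part of the package; a minimal
// TrimSilence sketch (simple amplitude thresholding on the raw samples):
AudioClip TrimSilence(AudioClip clip, float threshold = 0.01f)
{
float[] samples = new float[clip.samples * clip.channels];
clip.GetData(samples, 0);
int start = 0, end = samples.Length - 1;
while (start < end && Mathf.Abs(samples[start]) < threshold) start++;
while (end > start && Mathf.Abs(samples[end]) < threshold) end--;
float[] trimmed = new float[end - start + 1];
System.Array.Copy(samples, start, trimmed, 0, trimmed.Length);
var result = AudioClip.Create("trimmed", trimmed.Length / clip.channels,
clip.channels, clip.frequency, false);
result.SetData(trimmed, 0);
return result;
}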
// ❌ Bad - Very low sample rate
AudioClip lowQuality = Microphone.Start(null, false, 10, 8000);
Best Practices
✅ Good Practices
// ✅ Provide context for technical terms
await audio.GENTranscript()
.SetPrompt("Unity game development: GameObject, Transform, Rigidbody")
.ExecuteAsync();
// ✅ Use appropriate language hint
await audio.GENTranscript()
.SetLanguage(Application.systemLanguage)
.ExecuteAsync();
// ✅ Handle long audio by chunking
async UniTask<string> TranscribeLongAudio(AudioClip longAudio)
{
var chunks = SplitAudio(longAudio, maxSeconds: 30);
var tasks = chunks.Select(chunk => chunk.GENTranscript().ExecuteAsync());
string[] results = await UniTask.WhenAll(tasks);
return string.Join(" ", results);
}
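// Note: SplitAudio is not provided by the package; a hypothetical sketch that
// cuts a clip into fixed-length chunks:
List<AudioClip> SplitAudio(AudioClip clip, int maxSeconds)
{
var chunks = new List<AudioClip>();
int samplesPerChunk = maxSeconds * clip.frequency;
for (int offset = 0; offset < clip.samples; offset += samplesPerChunk)
{
int length = Mathf.Min(samplesPerChunk, clip.samples - offset);
float[] data = new float[length * clip.channels];
clip.GetData(data, offset);
var chunk = AudioClip.Create("chunk", length, clip.channels, clip.frequency, false);
chunk.SetData(data, 0);
chunks.Add(chunk);
}
return chunks;
}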
// ✅ Cache common phrases
Dictionary<AudioClip, string> cache = new();
❌ Bad Practices
// ❌ Don't transcribe empty/silent audio
if (audioClip.length == 0) return; // Check first!
// ❌ Don't use very low quality audio
// Use at least 16kHz sample rate
// ❌ Don't forget error handling
try {
string text = await audio.GENTranscript().ExecuteAsync();
} catch (Exception ex) {
Debug.LogError($"Transcription failed: {ex.Message}");
}
// ❌ Don't block main thread
string text = audio.GENTranscript().ExecuteAsync().GetAwaiter().GetResult(); // NO! Blocks the main thread
Error Handling
try
{
string transcript = await audioClip
.GENTranscript()
.ExecuteAsync();
if (string.IsNullOrEmpty(transcript))
Debug.LogWarning("Empty transcript - audio may be silent");
return transcript;
}
catch (AIApiException ex)
{
Debug.LogError($"API Error: {ex.Message}");
// Handle: invalid audio format, file too large, etc.
}
catch (Exception ex)
{
Debug.LogError($"Unexpected error: {ex.Message}");
}
Performance Tips
// ✅ Good - parallel processing
var tasks = audioClips.Select(clip =>
clip.GENTranscript().ExecuteAsync()
);
await UniTask.WhenAll(tasks);
// ✅ Good - use lower quality for non-critical transcription
await audio.GENTranscript()
.SetModel(OpenAIModel.Whisper1) // Faster
.ExecuteAsync();
// ❌ Bad - sequential processing
foreach (var clip in audioClips)
{
await clip.GENTranscript().ExecuteAsync(); // Slow!
}
Next Steps
Speech Translation - Translate speech to English
Text to Speech - Generate speech from text
Voice Change - Modify voice characteristics