feat: add xAI voice bot with Grok integration

- Add pkg/xai WebSocket client for xAI Voice Agent API
- Add cmd/voicebot TeamSpeak bot with voice bridge
- Proper 20ms audio timing with buffered queue
- Clear audio queue on user interruption (VAD)
This commit is contained in:
Jose Luis Montañes Ojados
2026-01-15 22:38:39 +01:00
parent 02318b1490
commit aa8c0dbcbc
7 changed files with 677 additions and 0 deletions

136
pkg/xai/types.go Normal file
View File

@@ -0,0 +1,136 @@
package xai
// Message types for the xAI Voice Agent WebSocket API.
// ClientMessage is the base for messages sent to the server. It carries only
// the "type" discriminator, which every other client message struct in this
// file also declares under the same JSON key.
type ClientMessage struct {
	Type string `json:"type"`
}
// SessionUpdate configures the voice session. Type is expected to be
// "session.update" and Session holds the settings to apply; both fields are
// always present in the encoded JSON (neither tag uses omitempty).
type SessionUpdate struct {
	Type    string  `json:"type"` // "session.update"
	Session Session `json:"session"`
}
// Session holds the voice-session settings embedded in SessionUpdate and
// SessionUpdated. Every field is tagged omitempty, so empty strings and nil
// pointers are left out of the JSON produced by encoding/json.
//
// NOTE(review): Voice is presumably one of the Voice* constants declared in
// this file — confirm against the API reference.
type Session struct {
	Voice         string         `json:"voice,omitempty"`
	Instructions  string         `json:"instructions,omitempty"`
	TurnDetection *TurnDetection `json:"turn_detection,omitempty"`
	Audio         *AudioConfig   `json:"audio,omitempty"`
}
// TurnDetection is the "turn_detection" setting of a Session.
//
// NOTE(review): the field comment lists null as a valid wire value, but Type
// is a plain Go string, so it can never be encoded as JSON null; a zero Type
// marshals as "". Confirm how callers are expected to disable server VAD.
type TurnDetection struct {
	Type string `json:"type"` // "server_vad" or null
}
// AudioConfig is the "audio" setting of a Session. It holds one optional
// format description per direction; a nil Input or Output is omitted from the
// encoded JSON because both tags use omitempty.
type AudioConfig struct {
	Input  *AudioFormatConfig `json:"input,omitempty"`
	Output *AudioFormatConfig `json:"output,omitempty"`
}
// AudioFormatConfig wraps an AudioFormat under the "format" JSON key. It is
// the element type of AudioConfig.Input and AudioConfig.Output.
type AudioFormatConfig struct {
	Format AudioFormat `json:"format"`
}
// AudioFormat describes an audio stream by a type string and a numeric rate.
// Both fields are always encoded (no omitempty), so a zero Rate is sent as 0.
//
// NOTE(review): Rate is presumably the sample rate in Hz — confirm against
// the API reference.
type AudioFormat struct {
	Type string `json:"type"` // "audio/pcm", "audio/pcmu", "audio/pcma"
	Rate int    `json:"rate"` // 8000, 16000, 24000, 48000, etc.
}
// InputAudioBufferAppend sends audio data to the server. Type is expected to
// be "input_audio_buffer.append" and Audio carries the PCM payload as a
// base64-encoded string; the struct itself performs no encoding, so the
// caller must supply Audio already encoded.
type InputAudioBufferAppend struct {
	Type  string `json:"type"`  // "input_audio_buffer.append"
	Audio string `json:"audio"` // Base64 encoded PCM
}
// ResponseCreate requests a response from the model. Type is expected to be
// "response.create"; Response carries the per-response settings and is always
// encoded, even when it is the zero value.
type ResponseCreate struct {
	Type     string           `json:"type"` // "response.create"
	Response ResponseSettings `json:"response"`
}
// ResponseSettings is the "response" payload of a ResponseCreate message.
// Modalities lists the requested output kinds. The tag has no omitempty, so
// encoding/json writes a nil slice as JSON null rather than dropping the key.
type ResponseSettings struct {
	Modalities []string `json:"modalities"` // ["text", "audio"]
}
// ConversationItemCreate creates a new conversation item. Type is expected to
// be "conversation.item.create" and Item is the item to add.
type ConversationItemCreate struct {
	Type string           `json:"type"` // "conversation.item.create"
	Item ConversationItem `json:"item"`
}
// ConversationItem is the "item" payload of a ConversationItemCreate message:
// an item kind, the role it is attributed to, and its content parts. Content
// has no omitempty, so encoding/json writes a nil slice as JSON null.
type ConversationItem struct {
	Type    string        `json:"type"` // "message"
	Role    string        `json:"role"` // "user", "assistant"
	Content []ItemContent `json:"content"`
}
// ItemContent is one content part of a ConversationItem. Text is omitted from
// the encoded JSON when empty.
//
// NOTE(review): the Type comment lists "input_audio", but this struct has no
// field that could carry audio data — confirm whether audio parts are
// intentionally unsupported here.
type ItemContent struct {
	Type string `json:"type"` // "input_text", "input_audio"
	Text string `json:"text,omitempty"`
}
// =============================================================================
// Server Messages
// =============================================================================
// ServerMessage is the base for messages received from the server. It holds
// the "type" discriminator and the optional "event_id" that the concrete
// server message structs in this file also declare.
//
// NOTE(review): presumably incoming frames are decoded into this type first
// to dispatch on Type — confirm against the client's read loop.
type ServerMessage struct {
	Type    string `json:"type"`
	EventID string `json:"event_id,omitempty"`
}
// SessionUpdated confirms session configuration. Type is expected to be
// "session.updated"; Session has the same shape as the settings sent in a
// SessionUpdate.
type SessionUpdated struct {
	Type    string  `json:"type"` // "session.updated"
	EventID string  `json:"event_id"`
	Session Session `json:"session"`
}
// ResponseOutputAudioDelta contains audio data from the model. Type is
// expected to be "response.output_audio.delta" and Delta holds the PCM
// payload as a base64-encoded string, which the receiver must decode itself.
type ResponseOutputAudioDelta struct {
	Type    string `json:"type"` // "response.output_audio.delta"
	EventID string `json:"event_id"`
	Delta   string `json:"delta"` // Base64 encoded PCM
}
// ResponseDone indicates the response is complete. Type is expected to be
// "response.done"; only the type and event ID are decoded by this struct.
type ResponseDone struct {
	Type    string `json:"type"` // "response.done"
	EventID string `json:"event_id"`
}
// InputAudioBufferSpeechStarted indicates that voice activity detection (VAD)
// detected the start of speech. Type is expected to be
// "input_audio_buffer.speech_started".
type InputAudioBufferSpeechStarted struct {
	Type    string `json:"type"` // "input_audio_buffer.speech_started"
	EventID string `json:"event_id"`
}
// InputAudioBufferSpeechStopped indicates that voice activity detection (VAD)
// detected the end of speech. Type is expected to be
// "input_audio_buffer.speech_stopped".
type InputAudioBufferSpeechStopped struct {
	Type    string `json:"type"` // "input_audio_buffer.speech_stopped"
	EventID string `json:"event_id"`
}
// ErrorMessage represents an error from the server. Type is expected to be
// "error"; the details are carried in the nested Error field. Note that Error
// is a struct field, not a method, so ErrorMessage does not implement the
// built-in error interface.
type ErrorMessage struct {
	Type    string    `json:"type"` // "error"
	EventID string    `json:"event_id"`
	Error   ErrorInfo `json:"error"`
}
// ErrorInfo is the "error" payload of an ErrorMessage: three string fields
// decoded from the JSON keys "type", "code" and "message".
type ErrorInfo struct {
	Type    string `json:"type"`
	Code    string `json:"code"`
	Message string `json:"message"`
}
// Available voices. These are untyped string constants, so they can be
// assigned directly to any string field.
//
// NOTE(review): presumably intended as values for Session.Voice — confirm
// the list against the API reference.
const (
	VoiceAra = "Ara"
	VoiceRex = "Rex"
	VoiceSal = "Sal"
	VoiceEve = "Eve"
	VoiceLeo = "Leo"
)