feat: add xAI voice bot with Grok integration
- Add pkg/xai WebSocket client for xAI Voice Agent API - Add cmd/voicebot TeamSpeak bot with voice bridge - Proper 20ms audio timing with buffered queue - Clear audio queue on user interruption (VAD)
This commit is contained in:
277
pkg/xai/client.go
Normal file
277
pkg/xai/client.go
Normal file
@@ -0,0 +1,277 @@
|
||||
package xai
|
||||
|
||||
import (
|
||||
"encoding/base64"
|
||||
"encoding/binary"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"sync"
|
||||
|
||||
"github.com/gorilla/websocket"
|
||||
)
|
||||
|
||||
const (
	// RealtimeURL is the WebSocket endpoint of the xAI realtime
	// (Voice Agent) API that Connect dials.
	RealtimeURL = "wss://api.x.ai/v1/realtime"
)
|
||||
|
||||
// AudioHandler is called when audio is received from xAI. The slice holds
// decoded little-endian int16 PCM samples (48kHz mono per ConfigureSession).
type AudioHandler func(pcm []int16)

// TranscriptHandler is called with each incremental transcript text delta
// received from xAI.
type TranscriptHandler func(text string)

// SpeechHandler is called when server-side VAD detects the user starting
// to speak (used to interrupt in-flight playback).
type SpeechHandler func()
|
||||
|
||||
// Client manages a WebSocket connection to xAI Voice Agent API.
// The zero value is not usable; construct with New. mu guards conn and
// connected. Callbacks are read from the receive goroutine without
// locking, so register them before calling Connect.
type Client struct {
	apiKey string
	conn   *websocket.Conn
	mu     sync.Mutex

	// Callbacks — set via OnAudio/OnTranscript/OnSpeechStarted before Connect.
	onAudio         AudioHandler
	onTranscript    TranscriptHandler
	onSpeechStarted SpeechHandler

	// State
	connected bool          // true while the socket is believed open
	done      chan struct{} // closed by Close to stop receiveLoop
}
|
||||
|
||||
// New creates a new xAI client
|
||||
func New(apiKey string) *Client {
|
||||
return &Client{
|
||||
apiKey: apiKey,
|
||||
done: make(chan struct{}),
|
||||
}
|
||||
}
|
||||
|
||||
// OnAudio sets the callback for received audio. Register before Connect:
// the handler is invoked from the receive goroutine without locking.
func (c *Client) OnAudio(handler AudioHandler) {
	c.onAudio = handler
}
|
||||
|
||||
// OnTranscript sets the callback for received transcript deltas. Register
// before Connect: the handler is invoked from the receive goroutine
// without locking.
func (c *Client) OnTranscript(handler TranscriptHandler) {
	c.onTranscript = handler
}
|
||||
|
||||
// OnSpeechStarted sets the callback for when user starts speaking (for
// interruptions). Register before Connect: the handler is invoked from the
// receive goroutine without locking.
func (c *Client) OnSpeechStarted(handler SpeechHandler) {
	c.onSpeechStarted = handler
}
|
||||
|
||||
// Connect establishes WebSocket connection to xAI
|
||||
func (c *Client) Connect() error {
|
||||
header := make(map[string][]string)
|
||||
header["Authorization"] = []string{"Bearer " + c.apiKey}
|
||||
|
||||
dialer := websocket.Dialer{}
|
||||
conn, _, err := dialer.Dial(RealtimeURL, header)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to connect to xAI: %w", err)
|
||||
}
|
||||
|
||||
c.conn = conn
|
||||
c.connected = true
|
||||
|
||||
// Start message receiver
|
||||
go c.receiveLoop()
|
||||
|
||||
log.Println("[xAI] Connected to Voice Agent API")
|
||||
return nil
|
||||
}
|
||||
|
||||
// ConfigureSession sets up the voice session
|
||||
func (c *Client) ConfigureSession(voice, instructions string) error {
|
||||
msg := SessionUpdate{
|
||||
Type: "session.update",
|
||||
Session: Session{
|
||||
Voice: voice,
|
||||
Instructions: instructions,
|
||||
TurnDetection: &TurnDetection{
|
||||
Type: "server_vad",
|
||||
},
|
||||
Audio: &AudioConfig{
|
||||
Input: &AudioFormatConfig{
|
||||
Format: AudioFormat{Type: "audio/pcm", Rate: 48000},
|
||||
},
|
||||
Output: &AudioFormatConfig{
|
||||
Format: AudioFormat{Type: "audio/pcm", Rate: 48000},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
return c.sendJSON(msg)
|
||||
}
|
||||
|
||||
// SendAudio sends PCM audio data to xAI
|
||||
// pcm should be int16 samples at 48kHz mono
|
||||
func (c *Client) SendAudio(pcm []int16) error {
|
||||
// Convert int16 slice to bytes (little endian)
|
||||
buf := make([]byte, len(pcm)*2)
|
||||
for i, sample := range pcm {
|
||||
binary.LittleEndian.PutUint16(buf[i*2:], uint16(sample))
|
||||
}
|
||||
|
||||
// Encode to base64
|
||||
encoded := base64.StdEncoding.EncodeToString(buf)
|
||||
|
||||
msg := InputAudioBufferAppend{
|
||||
Type: "input_audio_buffer.append",
|
||||
Audio: encoded,
|
||||
}
|
||||
|
||||
return c.sendJSON(msg)
|
||||
}
|
||||
|
||||
// Close closes the WebSocket connection
|
||||
func (c *Client) Close() {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
|
||||
if c.conn != nil {
|
||||
close(c.done)
|
||||
c.conn.Close()
|
||||
c.connected = false
|
||||
log.Println("[xAI] Connection closed")
|
||||
}
|
||||
}
|
||||
|
||||
// IsConnected returns connection status
|
||||
func (c *Client) IsConnected() bool {
|
||||
return c.connected
|
||||
}
|
||||
|
||||
// sendJSON sends a JSON message over WebSocket
|
||||
func (c *Client) sendJSON(v any) error {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
|
||||
if c.conn == nil {
|
||||
return fmt.Errorf("not connected")
|
||||
}
|
||||
|
||||
data, err := json.Marshal(v)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return c.conn.WriteMessage(websocket.TextMessage, data)
|
||||
}
|
||||
|
||||
// receiveLoop handles incoming messages from xAI
|
||||
func (c *Client) receiveLoop() {
|
||||
defer func() {
|
||||
c.connected = false
|
||||
}()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-c.done:
|
||||
return
|
||||
default:
|
||||
}
|
||||
|
||||
_, message, err := c.conn.ReadMessage()
|
||||
if err != nil {
|
||||
if websocket.IsCloseError(err, websocket.CloseNormalClosure) {
|
||||
log.Println("[xAI] Connection closed normally")
|
||||
} else {
|
||||
log.Printf("[xAI] Read error: %v", err)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
c.handleMessage(message)
|
||||
}
|
||||
}
|
||||
|
||||
// handleMessage processes an incoming WebSocket message
|
||||
func (c *Client) handleMessage(data []byte) {
|
||||
// Parse base message to get type
|
||||
var base ServerMessage
|
||||
if err := json.Unmarshal(data, &base); err != nil {
|
||||
log.Printf("[xAI] Failed to parse message: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
switch base.Type {
|
||||
case "session.updated":
|
||||
log.Println("[xAI] Session configured successfully")
|
||||
|
||||
case "session.created":
|
||||
log.Println("[xAI] Session created")
|
||||
|
||||
case "conversation.created":
|
||||
log.Println("[xAI] Conversation created")
|
||||
|
||||
case "response.output_audio.delta":
|
||||
var msg ResponseOutputAudioDelta
|
||||
if err := json.Unmarshal(data, &msg); err != nil {
|
||||
log.Printf("[xAI] Failed to parse audio delta: %v", err)
|
||||
return
|
||||
}
|
||||
c.handleAudioDelta(msg.Delta)
|
||||
|
||||
case "response.output_audio.done":
|
||||
// Audio stream complete for this response
|
||||
log.Println("[xAI] Audio response complete")
|
||||
|
||||
case "response.output_audio_transcript.delta":
|
||||
// Could extract transcript text here
|
||||
var raw map[string]any
|
||||
json.Unmarshal(data, &raw)
|
||||
if delta, ok := raw["delta"].(string); ok && c.onTranscript != nil {
|
||||
c.onTranscript(delta)
|
||||
}
|
||||
|
||||
case "response.done":
|
||||
log.Println("[xAI] Response complete")
|
||||
|
||||
case "input_audio_buffer.speech_started":
|
||||
log.Println("[xAI] Speech started (VAD)")
|
||||
if c.onSpeechStarted != nil {
|
||||
c.onSpeechStarted()
|
||||
}
|
||||
|
||||
case "input_audio_buffer.speech_stopped":
|
||||
log.Println("[xAI] Speech stopped (VAD)")
|
||||
|
||||
case "error":
|
||||
var msg ErrorMessage
|
||||
if err := json.Unmarshal(data, &msg); err == nil {
|
||||
log.Printf("[xAI] Error: %s - %s", msg.Error.Code, msg.Error.Message)
|
||||
}
|
||||
|
||||
default:
|
||||
// Log unhandled message types for debugging
|
||||
log.Printf("[xAI] Received: %s", base.Type)
|
||||
}
|
||||
}
|
||||
|
||||
// handleAudioDelta processes received audio data
|
||||
func (c *Client) handleAudioDelta(base64Audio string) {
|
||||
if c.onAudio == nil {
|
||||
return
|
||||
}
|
||||
|
||||
// Decode base64
|
||||
audioBytes, err := base64.StdEncoding.DecodeString(base64Audio)
|
||||
if err != nil {
|
||||
log.Printf("[xAI] Failed to decode audio: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
// Convert bytes to int16 (little endian)
|
||||
pcm := make([]int16, len(audioBytes)/2)
|
||||
for i := 0; i < len(pcm); i++ {
|
||||
pcm[i] = int16(binary.LittleEndian.Uint16(audioBytes[i*2:]))
|
||||
}
|
||||
|
||||
c.onAudio(pcm)
|
||||
}
|
||||
136
pkg/xai/types.go
Normal file
136
pkg/xai/types.go
Normal file
@@ -0,0 +1,136 @@
|
||||
package xai
|
||||
|
||||
// Message types for xAI Voice Agent WebSocket API.

// ClientMessage is the base for messages sent to the server; Type selects
// the event kind (e.g. "session.update").
type ClientMessage struct {
	Type string `json:"type"`
}
|
||||
|
||||
// SessionUpdate configures the voice session; sent with Type
// "session.update".
type SessionUpdate struct {
	Type    string  `json:"type"` // "session.update"
	Session Session `json:"session"`
}
|
||||
|
||||
// Session holds the configurable properties of a voice session. All fields
// are omitempty, so unset fields are not serialized.
type Session struct {
	Voice         string         `json:"voice,omitempty"`
	Instructions  string         `json:"instructions,omitempty"`
	TurnDetection *TurnDetection `json:"turn_detection,omitempty"`
	Audio         *AudioConfig   `json:"audio,omitempty"`
}
|
||||
|
||||
// TurnDetection selects how the server decides when the user's turn ends.
type TurnDetection struct {
	Type string `json:"type"` // "server_vad" or null
}
|
||||
|
||||
// AudioConfig groups the input and output audio format settings of a
// session.
type AudioConfig struct {
	Input  *AudioFormatConfig `json:"input,omitempty"`
	Output *AudioFormatConfig `json:"output,omitempty"`
}

// AudioFormatConfig wraps a single audio format declaration.
type AudioFormatConfig struct {
	Format AudioFormat `json:"format"`
}
|
||||
|
||||
// AudioFormat describes a PCM encoding and sample rate.
type AudioFormat struct {
	Type string `json:"type"` // "audio/pcm", "audio/pcmu", "audio/pcma"
	Rate int    `json:"rate"` // 8000, 16000, 24000, 48000, etc.
}
|
||||
|
||||
// InputAudioBufferAppend sends audio data to the server.
type InputAudioBufferAppend struct {
	Type  string `json:"type"`  // "input_audio_buffer.append"
	Audio string `json:"audio"` // Base64 encoded PCM
}
|
||||
|
||||
// ResponseCreate requests a response from the model.
type ResponseCreate struct {
	Type     string           `json:"type"` // "response.create"
	Response ResponseSettings `json:"response"`
}

// ResponseSettings controls which modalities the model responds with.
type ResponseSettings struct {
	Modalities []string `json:"modalities"` // ["text", "audio"]
}
|
||||
|
||||
// ConversationItemCreate creates a new conversation item.
type ConversationItemCreate struct {
	Type string           `json:"type"` // "conversation.item.create"
	Item ConversationItem `json:"item"`
}

// ConversationItem is a single message in the conversation.
type ConversationItem struct {
	Type    string        `json:"type"` // "message"
	Role    string        `json:"role"` // "user", "assistant"
	Content []ItemContent `json:"content"`
}

// ItemContent is one content part of a conversation item.
type ItemContent struct {
	Type string `json:"type"` // "input_text", "input_audio"
	Text string `json:"text,omitempty"`
}
|
||||
|
||||
// =============================================================================
// Server Messages
// =============================================================================

// ServerMessage is the base for messages received from the server; only
// Type and EventID are decoded, to dispatch on the event kind.
type ServerMessage struct {
	Type    string `json:"type"`
	EventID string `json:"event_id,omitempty"`
}
|
||||
|
||||
// SessionUpdated confirms session configuration.
type SessionUpdated struct {
	Type    string  `json:"type"` // "session.updated"
	EventID string  `json:"event_id"`
	Session Session `json:"session"`
}
|
||||
|
||||
// ResponseOutputAudioDelta contains one chunk of audio data from the model.
type ResponseOutputAudioDelta struct {
	Type    string `json:"type"` // "response.output_audio.delta"
	EventID string `json:"event_id"`
	Delta   string `json:"delta"` // Base64 encoded PCM
}
|
||||
|
||||
// ResponseDone indicates the response is complete.
type ResponseDone struct {
	Type    string `json:"type"` // "response.done"
	EventID string `json:"event_id"`
}

// InputAudioBufferSpeechStarted indicates VAD detected speech start.
type InputAudioBufferSpeechStarted struct {
	Type    string `json:"type"` // "input_audio_buffer.speech_started"
	EventID string `json:"event_id"`
}

// InputAudioBufferSpeechStopped indicates VAD detected speech stop.
type InputAudioBufferSpeechStopped struct {
	Type    string `json:"type"` // "input_audio_buffer.speech_stopped"
	EventID string `json:"event_id"`
}
|
||||
|
||||
// ErrorMessage represents an error from the server.
type ErrorMessage struct {
	Type    string    `json:"type"` // "error"
	EventID string    `json:"event_id"`
	Error   ErrorInfo `json:"error"`
}

// ErrorInfo carries the server-provided error classification and text.
type ErrorInfo struct {
	Type    string `json:"type"`
	Code    string `json:"code"`
	Message string `json:"message"`
}
|
||||
|
||||
// Available voices accepted in Session.Voice.
const (
	VoiceAra = "Ara"
	VoiceRex = "Rex"
	VoiceSal = "Sal"
	VoiceEve = "Eve"
	VoiceLeo = "Leo"
)
|
||||
Reference in New Issue
Block a user