137 lines
3.8 KiB
Go
137 lines
3.8 KiB
Go
|
|
package xai
|
||
|
|
|
||
|
|
// Message types for the xAI Voice Agent WebSocket API.
|
||
|
|
|
||
|
|
// ClientMessage is the base shape for messages sent to the server. It carries
// only the "type" field.
type ClientMessage struct {
	// Type is serialized as the JSON "type" field and is always emitted,
	// even when empty (the tag has no omitempty).
	Type string `json:"type"`
}
|
||
|
|
|
||
|
|
// SessionUpdate configures the voice session
|
||
|
|
type SessionUpdate struct {
|
||
|
|
Type string `json:"type"` // "session.update"
|
||
|
|
Session Session `json:"session"`
|
||
|
|
}
|
||
|
|
|
||
|
|
type Session struct {
|
||
|
|
Voice string `json:"voice,omitempty"`
|
||
|
|
Instructions string `json:"instructions,omitempty"`
|
||
|
|
TurnDetection *TurnDetection `json:"turn_detection,omitempty"`
|
||
|
|
Audio *AudioConfig `json:"audio,omitempty"`
|
||
|
|
}
|
||
|
|
|
||
|
|
// TurnDetection selects the turn-detection mode for a session.
type TurnDetection struct {
	// Type is the mode name, e.g. "server_vad".
	//
	// NOTE(review): the original annotation reads `"server_vad" or null`, but
	// a plain string field cannot encode JSON null; the zero value is sent as
	// "type":"". Confirm against the API whether null must be expressible.
	Type string `json:"type"`
}
|
||
|
|
|
||
|
|
type AudioConfig struct {
|
||
|
|
Input *AudioFormatConfig `json:"input,omitempty"`
|
||
|
|
Output *AudioFormatConfig `json:"output,omitempty"`
|
||
|
|
}
|
||
|
|
|
||
|
|
type AudioFormatConfig struct {
|
||
|
|
Format AudioFormat `json:"format"`
|
||
|
|
}
|
||
|
|
|
||
|
|
// AudioFormat describes an audio encoding and its sample rate. Both fields
// are always emitted, including their zero values.
type AudioFormat struct {
	// Type is the encoding name: "audio/pcm", "audio/pcmu", or "audio/pcma".
	Type string `json:"type"`

	// Rate is the sample rate, e.g. 8000, 16000, 24000, or 48000.
	Rate int `json:"rate"`
}
|
||
|
|
|
||
|
|
// InputAudioBufferAppend is the client message that sends audio data to the
// server.
type InputAudioBufferAppend struct {
	// Type is the event name; expected to be "input_audio_buffer.append".
	Type string `json:"type"`

	// Audio is the base64-encoded PCM payload. The struct stores the already
	// encoded string; it performs no encoding itself.
	Audio string `json:"audio"`
}
|
||
|
|
|
||
|
|
// ResponseCreate requests a response from the model
|
||
|
|
type ResponseCreate struct {
|
||
|
|
Type string `json:"type"` // "response.create"
|
||
|
|
Response ResponseSettings `json:"response"`
|
||
|
|
}
|
||
|
|
|
||
|
|
// ResponseSettings lists the output modalities for a response.
type ResponseSettings struct {
	// Modalities names the requested modalities, e.g. ["text", "audio"].
	// The tag has no omitempty, so a nil slice is encoded as JSON null and an
	// empty non-nil slice as [].
	Modalities []string `json:"modalities"`
}
|
||
|
|
|
||
|
|
// ConversationItemCreate creates a new conversation item
|
||
|
|
type ConversationItemCreate struct {
|
||
|
|
Type string `json:"type"` // "conversation.item.create"
|
||
|
|
Item ConversationItem `json:"item"`
|
||
|
|
}
|
||
|
|
|
||
|
|
type ConversationItem struct {
|
||
|
|
Type string `json:"type"` // "message"
|
||
|
|
Role string `json:"role"` // "user", "assistant"
|
||
|
|
Content []ItemContent `json:"content"`
|
||
|
|
}
|
||
|
|
|
||
|
|
// ItemContent is one content part of a ConversationItem.
type ItemContent struct {
	// Type is the content kind: "input_text" or "input_audio".
	Type string `json:"type"`

	// Text is the textual content. It is omitted from the JSON when empty.
	Text string `json:"text,omitempty"`
}
|
||
|
|
|
||
|
|
// =============================================================================
|
||
|
|
// Server Messages
|
||
|
|
// =============================================================================
|
||
|
|
|
||
|
|
// ServerMessage is the base shape for messages received from the server. It
// exposes only the "type" and "event_id" fields.
type ServerMessage struct {
	// Type is the event name read from the JSON "type" field.
	Type string `json:"type"`

	// EventID is read from "event_id". When encoding, it is omitted if empty.
	EventID string `json:"event_id,omitempty"`
}
|
||
|
|
|
||
|
|
// SessionUpdated confirms session configuration
|
||
|
|
type SessionUpdated struct {
|
||
|
|
Type string `json:"type"` // "session.updated"
|
||
|
|
EventID string `json:"event_id"`
|
||
|
|
Session Session `json:"session"`
|
||
|
|
}
|
||
|
|
|
||
|
|
// ResponseOutputAudioDelta is the server message that carries audio data
// from the model.
type ResponseOutputAudioDelta struct {
	// Type is the event name; expected to be "response.output_audio.delta".
	Type string `json:"type"`

	// EventID is read from the "event_id" field.
	EventID string `json:"event_id"`

	// Delta is the base64-encoded PCM payload. It is kept as the encoded
	// string; decoding is left to the caller.
	Delta string `json:"delta"`
}
|
||
|
|
|
||
|
|
// ResponseDone is the server message indicating that the response is
// complete.
type ResponseDone struct {
	// Type is the event name; expected to be "response.done".
	Type string `json:"type"`

	// EventID is read from the "event_id" field.
	EventID string `json:"event_id"`
}
|
||
|
|
|
||
|
|
// InputAudioBufferSpeechStarted is the server message indicating that VAD
// detected the start of speech.
type InputAudioBufferSpeechStarted struct {
	// Type is the event name; expected to be
	// "input_audio_buffer.speech_started".
	Type string `json:"type"`

	// EventID is read from the "event_id" field.
	EventID string `json:"event_id"`
}
|
||
|
|
|
||
|
|
// InputAudioBufferSpeechStopped is the server message indicating that VAD
// detected the end of speech.
type InputAudioBufferSpeechStopped struct {
	// Type is the event name; expected to be
	// "input_audio_buffer.speech_stopped".
	Type string `json:"type"`

	// EventID is read from the "event_id" field.
	EventID string `json:"event_id"`
}
|
||
|
|
|
||
|
|
// ErrorMessage represents an error from the server
|
||
|
|
type ErrorMessage struct {
|
||
|
|
Type string `json:"type"` // "error"
|
||
|
|
EventID string `json:"event_id"`
|
||
|
|
Error ErrorInfo `json:"error"`
|
||
|
|
}
|
||
|
|
|
||
|
|
// ErrorInfo holds the details of an error: its type, code, and message.
type ErrorInfo struct {
	// Type is read from the "type" field.
	Type string `json:"type"`

	// Code is read from the "code" field.
	Code string `json:"code"`

	// Message is read from the "message" field.
	Message string `json:"message"`
}
|
||
|
|
|
||
|
|
// Names of the available voices. These are untyped string constants
// (presumably the values intended for Session.Voice; confirm against the API).
const (
	VoiceAra = "Ara"
	VoiceRex = "Rex"
	VoiceSal = "Sal"
	VoiceEve = "Eve"
	VoiceLeo = "Leo"
)
|