API และโปรโตคอล

โมดูล AudioCommon นิยามโปรโตคอลที่ไม่ผูกกับโมเดลใดโมเดลหนึ่ง รวมถึงประเภทข้อมูลที่ใช้ร่วมกัน โมเดลใดก็ตามที่ทำตามโปรโตคอลเหล่านี้สามารถใช้แทนกันได้ผ่าน interface ดังกล่าว

ภาพรวมโปรโตคอล

┌─────────────────────────────────────────────────────────┐
│                    AudioCommon                          │
│                                                         │
│  AudioChunk          SpeechGenerationModel (TTS)        │
│  AlignedWord         SpeechRecognitionModel (STT)       │
│  SpeechSegment       ForcedAlignmentModel               │
│                      SpeechToSpeechModel                │
│                      VoiceActivityDetectionModel (VAD)  │
│                      SpeakerEmbeddingModel              │
│                      SpeakerDiarizationModel            │
│                      SpeakerExtractionCapable           │
└─────────────────────────────────────────────────────────┘

SpeechRecognitionModel

โปรโตคอลสำหรับโมเดลเสียงพูดเป็นข้อความ

public protocol SpeechRecognitionModel: AnyObject {
    var inputSampleRate: Int { get }
    func transcribe(audio: [Float], sampleRate: Int, language: String?) -> String
    func transcribeWithLanguage(audio: [Float], sampleRate: Int, language: String?) -> TranscriptionResult
}

ประเภทที่ทำตาม: Qwen3ASRModel, WhisperASRModel, ParakeetASRModel, ParakeetStreamingASRModel, OmnilingualASRModel (CoreML), OmnilingualASRMLXModel (MLX)

SpeechGenerationModel

โปรโตคอลสำหรับโมเดลข้อความเป็นเสียงพูด

public protocol SpeechGenerationModel: AnyObject {
    var sampleRate: Int { get }
    func generate(text: String, language: String?) async throws -> [Float]
    func generateStream(text: String, language: String?) -> AsyncThrowingStream<AudioChunk, Error>  // has default impl
}

generateStream() มีการ implement เริ่มต้นที่ห่อ generate() ให้เป็น chunk เดียว โมเดลที่มี streaming จริง (เช่น Qwen3-TTS) จะ override มัน

ประเภทที่ทำตาม: Qwen3TTSModel, CosyVoiceTTSModel, VoxCPM2TTSModel, KokoroTTSModel, IndexTTS2TTSModel

IndexTTS2TTSModel เพิ่ม overload ของ generate ที่รับเสียงอ้างอิงสำหรับการโคลน zero-shot และใช้ IndexTTS2SynthesisOptions เพื่อควบคุม speaking rate และ pause

ForcedAlignmentModel

โปรโตคอลสำหรับ forced alignment ซึ่งจับคู่ข้อความกับเสียงเพื่อให้ได้ timestamp ระดับคำ

public protocol ForcedAlignmentModel: AnyObject {
    func align(audio: [Float], text: String, sampleRate: Int, language: String?) -> [AlignedWord]
}

SpeechToSpeechModel

โปรโตคอลสำหรับโมเดลบทสนทนาเสียงพูดสู่เสียงพูด

public protocol SpeechToSpeechModel: AnyObject {
    var sampleRate: Int { get }
    func respond(userAudio: [Float]) -> [Float]
    func respondStream(userAudio: [Float]) -> AsyncThrowingStream<AudioChunk, Error>
}

ประเภทที่ทำตาม: PersonaPlexModel

VoiceActivityDetectionModel

โปรโตคอลสำหรับการตรวจจับกิจกรรมเสียงพูด

public protocol VoiceActivityDetectionModel: AnyObject {
    var inputSampleRate: Int { get }
    func detectSpeech(audio: [Float], sampleRate: Int) -> [SpeechSegment]
}

SpeakerEmbeddingModel

โปรโตคอลสำหรับการสกัด embedding ของผู้พูด

public protocol SpeakerEmbeddingModel: AnyObject {
    var inputSampleRate: Int { get }
    var embeddingDimension: Int { get }
    func embed(audio: [Float], sampleRate: Int) -> [Float]
}

ประเภทที่ทำตาม: WeSpeakerModel

SpeakerDiarizationModel

โปรโตคอลสำหรับโมเดลแยกผู้พูดที่กำกับ label ผู้พูดให้กับช่วงเสียง

public protocol SpeakerDiarizationModel: AnyObject {
    var inputSampleRate: Int { get }
    func diarize(audio: [Float], sampleRate: Int) -> [DiarizedSegment]
}

ประเภทที่ทำตาม: DiarizationPipeline (Pyannote), SortformerDiarizer

SpeakerExtractionCapable

โปรโตคอลแยกผู้พูดแบบขยายสำหรับเอนจินที่รองรับการสกัดช่วงของผู้พูดเป้าหมายโดยใช้ embedding อ้างอิง ไม่ใช่ทุกเอนจินที่รองรับ (Sortformer เป็น end-to-end และไม่สร้าง embedding ของผู้พูด)

public protocol SpeakerExtractionCapable: SpeakerDiarizationModel {
    func extractSpeaker(audio: [Float], sampleRate: Int, targetEmbedding: [Float]) -> [SpeechSegment]
}

ประเภทที่ทำตาม: DiarizationPipeline (เฉพาะ Pyannote)

ประเภทที่ใช้ร่วมกัน

AudioChunk

public struct AudioChunk {
    public let samples: [Float]   // PCM samples
    public let sampleRate: Int    // Sample rate (e.g. 24000)
}

SpeechSegment

public struct SpeechSegment {
    public let startTime: Float   // Start time in seconds
    public let endTime: Float     // End time in seconds
}

AlignedWord

public struct AlignedWord {
    public let text: String       // The word
    public let startTime: Float   // Start time in seconds
    public let endTime: Float     // End time in seconds
}

DiarizedSegment

public struct DiarizedSegment {
    public let startTime: Float   // Start time in seconds
    public let endTime: Float     // End time in seconds
    public let speakerId: Int     // Speaker identifier (0-based)
}

DialogueSegment

ส่วนหนึ่งของข้อความบทสนทนาหลายผู้พูดที่ถูก parse แล้ว โดยแท็กผู้พูดและแท็กอารมณ์จะมีหรือไม่ก็ได้ ใช้กับ DialogueParser และ DialogueSynthesizer สำหรับการสังเคราะห์บทสนทนาของ CosyVoice3

public struct DialogueSegment: Sendable, Equatable {
    public let speaker: String?   // Speaker identifier ("S1", "S2"), nil for untagged
    public let emotion: String?   // Emotion tag ("happy", "whispers"), nil if none
    public let text: String       // Cleaned text to synthesize
}

DialogueParser

Parse ข้อความบทสนทนาหลายผู้พูดที่มีแท็กผู้พูดแบบ inline ([S1]) และแท็กอารมณ์ ((happy))

public enum DialogueParser {
    static func parse(_ text: String) -> [DialogueSegment]
    static func emotionToInstruction(_ emotion: String) -> String
}

อารมณ์ในตัว: happy/excited, sad, angry, whispers/whispering, laughs/laughing, calm, surprised, serious ส่วนแท็กที่ไม่รู้จักจะถูกส่งผ่านไปเป็นคำสั่งแบบข้อความอิสระ

DialogueSynthesizer

ประสานการสังเคราะห์บทสนทนาหลายส่วนพร้อมการโคลนเสียงต่อผู้พูด, ช่องว่างเงียบ และ crossfade

public enum DialogueSynthesizer {
    static func synthesize(
        segments: [DialogueSegment],
        speakerEmbeddings: [String: [Float]],
        model: CosyVoiceTTSModel,
        language: String,
        config: DialogueSynthesisConfig,
        verbose: Bool
    ) -> [Float]
}

DialogueSynthesisConfig

public struct DialogueSynthesisConfig: Sendable {
    public var turnGapSeconds: Float      // Default: 0.2
    public var crossfadeSeconds: Float    // Default: 0.0
    public var defaultInstruction: String // Default: "You are a helpful assistant."
    public var maxTokensPerSegment: Int   // Default: 500
}

PipelineLLM

โปรโตคอลสำหรับการรวมโมเดลภาษาเข้ากับไปป์ไลน์เสียง เชื่อม LLM เข้ากับสายการทำงาน ASR → LLM → TTS ของ VoicePipeline

public protocol PipelineLLM: AnyObject {
    func chat(messages: [(role: MessageRole, content: String)],
              onToken: @escaping (String, Bool) -> Void)
    func cancel()
}

Adapter ในตัว: Qwen3PipelineLLM เชื่อม Qwen35MLXChat เข้ากับโปรโตคอลนี้พร้อมการทำความสะอาด token, การยกเลิก และการสะสมวลีที่รอ

AudioIO

ตัวจัดการ I/O เสียงที่ใช้ซ้ำได้ ขจัด boilerplate ของ AVAudioEngine จัดการการจับเสียงจากไมค์, การ resample, การเล่นกลับ และการวัดระดับเสียง

let audio = AudioIO()
try audio.startMicrophone(targetSampleRate: 16000) { samples in
    pipeline.pushAudio(samples)
}
audio.player.scheduleChunk(ttsOutput)
audio.stopMicrophone()

AudioIO รวม StreamingAudioPlayer สำหรับเอาต์พุต TTS และ AudioRingBuffer สำหรับโอนเสียงระหว่างเธรดการจับและเธรด inference อย่างปลอดภัย

SentencePieceModel

ตัวอ่าน protobuf ที่ใช้ร่วมสำหรับไฟล์ .model ของ SentencePiece อยู่ใน AudioCommon ทุกโมดูลที่ต้องการ decode SentencePiece pieces (PersonaPlex, OmnilingualASR, การ port ASR / TTS ในอนาคต) จะสร้าง decoder ของตัวเองบนตัวอ่านเดียวนี้แทนการ implement รูปแบบ wire ของ protobuf ใหม่

public struct SentencePieceModel: Sendable {
    public struct Piece: Sendable, Equatable {
        public let text: String
        public let score: Float
        public let type: Int32
        public var pieceType: PieceType? { get }
        public var isControlOrUnknown: Bool { get }
    }
    public enum PieceType: Int32 {
        case normal = 1, unknown = 2, control = 3,
             userDefined = 4, unused = 5, byte = 6
    }
    public let pieces: [Piece]
    public var count: Int { get }
    public subscript(_ id: Int) -> Piece? { get }
    public init(contentsOf url: URL) throws
    public init(modelPath: String) throws
    public init(data: Data) throws
}

ใช้โดย: OmnilingualASR.OmnilingualVocabulary และ PersonaPlex.SentencePieceDecoder มี unit test ครอบคลุม 7 รายการใน Tests/AudioCommonTests/SentencePieceModelTests

MLXCommon.SDPA

ตัวช่วย scaled dot-product attention ที่ใช้ร่วมในทุกโมดูล attention ของ MLX (Qwen3-ASR / Qwen3-TTS / Qwen3-Chat / CosyVoice / PersonaPlex / OmnilingualASR) แต่ละโมดูลเก็บ projection ของตัวเอง — SDPA จัดการเฉพาะ boilerplate ของ reshape → attention → merge

public enum SDPA {
    // Flat [B, T, H*D] input (already projected): reshape into heads happens inside
    public static func multiHead(
        q: MLXArray, k: MLXArray, v: MLXArray,
        numHeads: Int, headDim: Int, scale: Float,
        mask: MLXArray? = nil
    ) -> MLXArray

    // GQA / MQA variant with separate query and KV head counts
    public static func multiHead(
        q: MLXArray, k: MLXArray, v: MLXArray,
        numQueryHeads: Int, numKVHeads: Int, headDim: Int, scale: Float,
        mask: MLXArray? = nil
    ) -> MLXArray

    // Already-shaped [B, H, T, D] (RoPE / KV cache paths)
    public static func attendAndMerge(
        qHeads: MLXArray, kHeads: MLXArray, vHeads: MLXArray,
        scale: Float,
        mask: MLXArray? = nil
    ) -> MLXArray

    // Same, with ScaledDotProductAttentionMaskMode enum (newer API)
    public static func attendAndMerge(
        qHeads: MLXArray, kHeads: MLXArray, vHeads: MLXArray,
        scale: Float,
        mask: MLXFast.ScaledDotProductAttentionMaskMode
    ) -> MLXArray

    // Low-level head merge: [B, H, T, D] → [B, T, H*D]
    public static func mergeHeads(_ attn: MLXArray) -> MLXArray
}

ทุกการเรียก reshape ใช้ -1 สำหรับมิติ batch ดังนั้นตัวช่วยเหล่านี้จึงใช้ร่วมกับกราฟ MLX.compile(shapeless:) ที่ขนาด batch เปลี่ยนได้ที่ runtime ได้ (เช่น Qwen3-TTS Talker ที่ decode แบบ autoregressive)

เซิร์ฟเวอร์ HTTP API

ไฟล์ปฏิบัติการ speech-server เปิดทุกโมเดลใน speech-swift เป็น endpoint HTTP REST บวกกับ endpoint WebSocket ที่ implement OpenAI Realtime API โมเดลถูกโหลดแบบ lazy เมื่อมีคำขอครั้งแรก; ใส่ --preload เพื่อ warm ทั้งหมดตอนเริ่มต้น

swift build -c release
.build/release/speech-server --port 8080

# โหลดทุกโมเดลล่วงหน้าตอนเริ่มต้น
.build/release/speech-server --port 8080 --preload

REST Endpoints

Endpoint	Method	คำขอ	การตอบกลับ
`/transcribe`	POST	body `audio/wav`	JSON `{ text }` (Qwen3-ASR)
`/speak`	POST	JSON `{ text, engine?, language?, voice? }`	body `audio/wav` (Qwen3-TTS, CosyVoice, Kokoro)
`/respond`	POST	body `audio/wav`	body `audio/wav` (PersonaPlex)
`/enhance`	POST	body `audio/wav`	body `audio/wav` (DeepFilterNet3)
`/vad`	POST	body `audio/wav`	รายการช่วงเป็น JSON
`/diarize`	POST	body `audio/wav`	รายการ JSON `DiarizedSegment`
`/embed-speaker`	POST	body `audio/wav`	JSON `[Float]` (256 มิติ)

# ถอดเสียงไฟล์เป็นข้อความ
curl -X POST http://localhost:8080/transcribe \
  --data-binary @recording.wav \
  -H "Content-Type: audio/wav"

# สังเคราะห์เสียงพูด
curl -X POST http://localhost:8080/speak \
  -H "Content-Type: application/json" \
  -d '{"text": "Hello world", "engine": "cosyvoice"}' \
  -o output.wav

# วงรอบเสียงพูดสู่เสียงพูดแบบเต็ม
curl -X POST http://localhost:8080/respond \
  --data-binary @question.wav \
  -o response.wav

OpenAI Realtime API (`/v1/realtime`)

Endpoint WebSocket ที่ ws://host:port/v1/realtime implement โปรโตคอล OpenAI Realtime ทุกข้อความเป็น JSON โดยมีฟิลด์ type เป็นตัวระบุชนิดของข้อความ; payload เสียงเป็น PCM16 เข้ารหัส base64 ที่ 24 kHz mono

ระหว่างการโหลดโมเดลแบบ cold start หรือการสร้างผลลัพธ์ที่ใช้เวลานาน เซิร์ฟเวอร์จะส่งเหตุการณ์ JSON ขนาดเบา realtime.keepalive และเฟรมควบคุม WebSocket pong ทุกประมาณ 15 วินาทีจนกว่าเอาต์พุตของโมเดลจะพร้อม ไคลเอนต์สามารถละเว้นเหตุการณ์เหล่านี้หรือใช้เป็นตัวบ่งชี้กิจกรรมได้

เหตุการณ์ Client → Server

เหตุการณ์	วัตถุประสงค์
`session.update`	กำหนดค่าเอนจิน, ภาษา, เสียง และรูปแบบเสียง
`input_audio_buffer.append`	ต่อท้าย chunk PCM16 base64 ลงในบัฟเฟอร์อินพุต
`input_audio_buffer.commit`	commit เสียงที่บัฟเฟอร์ไว้สำหรับการถอดเป็นข้อความ
`input_audio_buffer.clear`	ทิ้งบัฟเฟอร์อินพุตปัจจุบัน
`response.create`	ขอการสังเคราะห์ TTS สำหรับข้อความ/คำสั่งที่ให้

เหตุการณ์ Server → Client

เหตุการณ์	ความหมาย
`session.created`	handshake เสร็จสมบูรณ์ และส่งการตั้งค่าเริ่มต้นกลับมา
`session.updated`	ยืนยัน `session.update` ล่าสุดแล้ว
`input_audio_buffer.committed`	เสียงถูกยอมรับและจัดคิวเพื่อถอดเป็นข้อความ
`conversation.item.input_audio_transcription.completed`	ผลลัพธ์ ASR พร้อมข้อความสุดท้าย
`response.audio.delta`	chunk PCM16 base64 ของเสียงที่สังเคราะห์
`response.audio.done`	ไม่มี chunk เสียงอีกสำหรับ response นี้
`response.done`	response เสร็จสิ้น (metadata + สถิติความหน่วง)
`error`	กรอบ error พร้อม `type` และ `message`

const ws = new WebSocket('ws://localhost:8080/v1/realtime');

// ASR: push audio, request transcription
ws.send(JSON.stringify({ type: 'input_audio_buffer.append', audio: base64PCM16 }));
ws.send(JSON.stringify({ type: 'input_audio_buffer.commit' }));
// → conversation.item.input_audio_transcription.completed

// TTS: request synthesis and stream audio deltas
ws.send(JSON.stringify({
  type: 'response.create',
  response: { modalities: ['audio', 'text'], instructions: 'Hello world' }
}));
// → response.audio.delta (repeated), response.audio.done, response.done

เซิร์ฟเวอร์อยู่ใน SPM product ชื่อ AudioServer มีตัวอย่าง client บนเบราว์เซอร์ให้มาด้วยที่ Examples/websocket-client.html — เปิดควบคู่กับเซิร์ฟเวอร์ที่กำลังรันเพื่อทดลองรอบ ASR + TTS แบบเต็ม

การดาวน์โหลดโมเดล

โมเดลทั้งหมดดาวน์โหลดจาก HuggingFace ในการใช้งานครั้งแรกและเก็บแคชใน ~/Library/Caches/qwen3-speech/ โมดูล AudioCommon มี HuggingFaceDownloader ที่ใช้ร่วมกันสำหรับจัดการการดาวน์โหลด การแคช และการตรวจสอบความสมบูรณ์