跳转到主要内容

模型名称映射

对于所有 CoreML(ANE)模型,需要填写对应的插件 ID;GGUF 格式模型运行于 CPU/GPU,无需填写插件 ID 或模型名称(plugin 参数可省略)。
| 模型名称 | 插件 ID | HuggingFace 仓库名 |
| --- | --- | --- |
| NexaAI/EmbedNeural-ANE | ane | NexaAI/EmbedNeural-ANE |
| NexaAI/parakeet-tdt-0.6b-v3-ane | ane | NexaAI/parakeet-tdt-0.6b-v3-ane |

ASR 用法

端侧语音识别,将音频转写为文本。

基础用法

import NexaSdk
import Foundation

// Load the ASR model from a local HuggingFace repository directory.
// `load(from:)` takes a URL, so build a file URL from the path rather than
// passing a raw string (which would not type-check).
let asr = try Asr()
try await asr.load(from: URL(fileURLWithPath: "<path/to/model/dir>"))

// Transcribe an audio file and print the transcription result.
let result = try await asr.transcribe(options: .init(audioPath: "<your-audio-path>"))
print(result.asrResult)

ASR 流式模式

import NexaSdk
import Foundation

// Load the ASR model; `load(from:)` expects a directory URL.
let asr = try Asr()
try await asr.load(from: URL(fileURLWithPath: "<path/to/model/dir>"))

do {
    // Stop the stream on every exit path (normal completion or throw),
    // instead of duplicating the stop call in both the do and catch bodies.
    defer { asr.stopRecordingStream() }

    // Start recording and consume the streaming transcription output.
    let stream = try asr.startRecordingStream()
    for try await content in stream {
        print(content)
    }
} catch {
    // Best-effort example: errors are ignored here, matching the original
    // sample; the defer above has already stopped the stream.
}

API 说明

核心方法

func load(from repoFolder: URL) async throws
  • 从 HuggingFace 本地仓库目录加载 ASR 模型
  • 参数:repoFolder 模型文件所在目录
  • 返回:无
  • 抛出:加载失败时抛错
  • 说明:异步方法,需要 await
func startRecordingStream(config: ASRStreamConfig = .init(), block tapBlock: AVAudioNodeTapBlock? = nil) throws -> AsyncThrowingStream<String, Error>
  • 启动录音并开启 ASR 流式识别
  • 参数:config 流配置;tapBlock 可选音频采样回调
  • 返回:异步流,持续产出转写文本(见上文流式示例,调用时需 try)
func stopRecordingStream()
  • 停止录音与流式识别
  • 返回:无
func startRecording(block tapBlock: AVAudioNodeTapBlock? = nil) throws
  • 仅开启录音
  • 参数:可选采样回调
  • 抛出:音频会话或引擎启动失败时
func stopRecording()
  • 停止当前录音
func startStream(config: ASRStreamConfig = .init()) throws -> AsyncThrowingStream<String, Error>
  • 启动 ASR 流式模式
  • 返回:异步流,持续产出转写文本
func stopStream(graceful: Bool = true)
  • 停止流式识别
  • 参数:graceful=true 处理缓冲后停止;false 立即停止
func streamPushSamples(samples: [Float]) throws
  • 将原始音频采样推入流式管线
  • 参数:samples PCM 采样数组

AsrResult

/// ASR transcription result returned inside `AsrResponse`.
public struct AsrResult: Codable {
    /// Transcribed text
    public let transcript: String
    /// Confidence scores for each unit
    public let confidences: [Float]
    /// Timestamp pairs: [start, end] for each unit
    /// (flat array — presumably [start0, end0, start1, end1, ...]; confirm with SDK docs)
    public let timestamps: [Float]
}

AsrResponse

/// Response returned by `Asr.transcribe`.
public struct AsrResponse: Codable {
    /// The transcription result (text, confidences, timestamps).
    public let asrResult: AsrResult
    /// Optional performance metrics for the transcription run.
    public let profileData: ProfileData?
}

AsrOptions

/// Options for a transcription request.
/// NOTE(review): the usage example constructs this with `audioPath:`, while
/// the declaration exposes `modelPath` — confirm which field name is current.
public struct AsrOptions: Codable {
    /// Path field (see note above regarding `audioPath` vs `modelPath`).
    public let modelPath: String
    /// Transcription language.
    public let language: Language
}

/// Languages supported for ASR transcription.
public enum Language: String, Codable {
    /// English
    case en
    /// Chinese
    case ch
}

ASR 流式配置

/// Configuration for ASR streaming recognition.
public struct ASRStreamConfig {
    /// Timestamp granularity for streamed results.
    public enum TimestampMode: String {
        case segment
        case word
        case none
    }

    /// Recognition language.
    public var language: Language = .en
    /// Duration of each audio chunk in seconds (default 4.0).
    public var chunkDuration: Float
    /// Overlap between consecutive chunks in seconds (default 3.0).
    public var overlapDuration: Float
    /// Audio sample rate in Hz (default 16000).
    public var sampleRate: Int32
    /// Maximum number of chunks held in the queue (default 10).
    public var maxQueueSize: Int32
    /// Input buffer size (default 512).
    public var bufferSize: Int32
    /// Timestamp mode: "none" | "segment" | "word".
    public var timestamps: TimestampMode
    /// Beam size for decoding (default 5).
    public var beamSize: Int32
}

Embedding 用法

生成向量嵌入,支持语义检索/RAG。

基础用法

import NexaSdk
import Foundation

// Create an Embedder backed by the ANE plugin.
// Use URL(fileURLWithPath:) for a filesystem path: URL(string:) expects a
// scheme-qualified string and force-unwrapping its result can crash.
let repoDir = URL(fileURLWithPath: "path/to/model/dir")
let embedder = try Embedder(from: repoDir, plugin: .ane)

// Embed multiple texts in a single batch.
let texts = ["<your-text1>", "<your-text2>"]
let result = try embedder.embed(texts: texts, config: .init(batchSize: Int32(texts.count)))
for (i, vec) in result.embeddings.enumerated() {
    // Print only the first 10 dimensions of each vector.
    let head = vec.prefix(10)
    print("[\(i)]", Array(head))
}

API 说明

核心方法

convenience init(from repoFolder: URL, plugin: Plugin = .cpu_gpu)
  • 从本地仓库目录初始化
  • 参数:repoFolder 模型目录;plugin 后端插件(默认 cpu_gpu)
func embed(inputIds: [[Int32]], config: EmbeddingConfig) throws -> EmbedResult
  • 对预分词的 token ID 生成向量;仅支持 cpu_gpu
func embed(texts: [String], config: EmbeddingConfig) throws -> EmbedResult
  • 对文本生成向量,支持批量与归一化配置
func embed(imagePaths: [String], config: EmbeddingConfig) throws -> EmbedResult
  • 对图片生成向量
func dim() throws -> Int32
  • 返回嵌入维度

EmbeddingConfig

/// Configuration for embedding generation.
public struct EmbeddingConfig {
    /// Number of inputs processed per batch.
    public var batchSize: Int32

    /// Normalization method, e.g. "l2" or "mean".
    /// `nil` means no normalization is applied.
    public var normalizeMethod: NormalizeMethod?
}

EmbedResult

/// Result of an embedding request.
public struct EmbedResult {
    /// One embedding vector per input, in input order.
    public let embeddings: [[Float]]
    /// Performance metrics for the embedding run.
    public let profileData: ProfileData
}

LLM 用法

用于文本生成与对话。

流式对话(CPU/GPU,GGUF)

import NexaSdk

// Create the LLM and load a GGUF model from a file path.
let llm = try LLM()
try await llm.load(.init(modelPath: "<path/to/model/file>"))

// Multi-turn chat: the running conversation lives in `history`,
// seeded with the system prompt.
var history: [ChatMessage] = [
    .init(role: .system, content: "You are a helpful AI assistant")
]

let prompts = [
    "Tell me a long story, about 100 words",
    "How are you"
]

for prompt in prompts {
    history.append(.init(role: .user, content: prompt))

    // Stream the assistant's reply token by token.
    var reply = ""
    let tokenStream = try await llm.generateAsyncStream(messages: history)
    for try await token in tokenStream {
        print(token, terminator: "")
        reply += token
    }

    // Feed the full reply back into the history for the next turn.
    history.append(.init(role: .assistant, content: reply))
    print("\n\n")
}


多模态用法

VLM 视觉语言模型,用于图像理解与多模态对话。

流式对话(CPU/GPU,GGUF)

import NexaSdk

// Create the VLM and load the GGUF model plus its multimodal projector.
let vlm = try VLM()
try await vlm.load(.init(modelPath: "<path/to/model/file>", mmprojPath: "<path/to/mmproj/file>"))

// The image is attached both to the generation config and to the user message.
let imagePaths = ["<path/to/your/image>"]
var generationConfig = GenerationConfig.default
generationConfig.imagePaths = imagePaths

let userMessage = ChatMessage(role: .user, content: "What do you see in this image", images: imagePaths)

// Stream the generated answer token by token.
let tokenStream = try await vlm.generateAsyncStream(messages: [userMessage], options: .init(config: generationConfig))
for try await token in tokenStream {
    print(token, terminator: "")
}
print()

API 说明

核心方法

func load(_ options: ModelOptions) async throws
  • 按配置加载模型
func load(from repoFolder: URL, modelFileName: String = "", mmprojFileName: String = "") throws
  • 从本地 HuggingFace 仓库目录加载;文件名留空则用默认
func applyChatTemplate(messages: [ChatMessage], options: ChatTemplateOptions) async throws -> String
  • 应用聊天模板并格式化消息
func generateAsyncStream(messages: [ChatMessage], options: GenerationOptions) async throws -> AsyncThrowingStream<String, any Error>
  • 基于对话历史流式生成文本
func generate(prompt: String, config: GenerationConfig) async throws -> GenerateResult
  • 针对单轮 prompt 生成文本
func reset()
  • 重置内部状态
func stopStream()
  • 停止流式生成
func saveKVCache(to path: String) / func loadKVCache(from path: String)
  • 保存/加载 KV cache,仅 LLM 支持

GenerationConfig

/// Configuration controlling text generation.
public struct GenerationConfig {
    /// Maximum number of tokens to generate.
    public var maxTokens: Int32
    /// Stop sequences that end generation when produced.
    public var stop: [String]
    /// Number of past tokens — presumably a KV-cache/context offset; TODO confirm semantics.
    public var nPast: Int32
    /// Sampling parameters (temperature, top-p, penalties, ...).
    public var samplerConfig: SamplerConfig
    /// Image inputs for multimodal generation.
    public var imagePaths: [String]
    /// Audio inputs for multimodal generation.
    public var audioPaths: [String]
}

SamplerConfig

/// Token-sampling parameters used during generation.
public struct SamplerConfig {
    /// Sampling temperature; higher values increase randomness.
    public var temperature: Float
    /// Nucleus (top-p) sampling threshold.
    public var topP: Float
    /// Top-k sampling cutoff.
    public var topK: Int32
    /// Minimum-probability (min-p) cutoff.
    public var minP: Float
    /// Penalty applied to repeated tokens.
    public var repetitionPenalty: Float
    /// Penalty on tokens already present in the output.
    public var presencePenalty: Float
    /// Penalty scaled by how often a token has appeared.
    public var frequencyPenalty: Float
    /// RNG seed for reproducible sampling.
    public var seed: Int32
    /// Optional path to a grammar file constraining output — TODO confirm expected format.
    public var grammarPath: String?
 }

ProfileData

/// Performance metrics reported alongside inference results.
/// NOTE(review): time fields are integer counts — presumably milliseconds or
/// microseconds; confirm units against the SDK documentation.
public struct ProfileData: CustomStringConvertible, Codable {
    /// Time to first token.
    public let ttft: Int64
    /// Time spent processing the prompt.
    public let promptTime: Int64
    /// Time spent decoding/generating.
    public let decodeTime: Int64
    /// Number of tokens in the prompt.
    public let promptTokens: Int64
    /// Number of tokens generated.
    public let generatedTokens: Int64
    /// Duration of the input audio (ASR runs).
    public let audioDuration: Int64
    /// Prefill speed — presumably tokens per second; confirm.
    public let prefillSpeed: Double
    /// Decoding speed — presumably tokens per second; confirm.
    public let decodingSpeed: Double
    /// Real-time factor for ASR (processing time relative to audio duration).
    public let realTimeFactor: Double
    /// Why generation stopped (e.g. stop sequence, token limit).
    public let stopReason: String
}


重排(Rerank)用法

提升检索相关性,对文档列表进行重排序。

基础用法

import NexaSdk
import Foundation

// Load a CPU/GPU rerank model from a local repository directory.
// URL(fileURLWithPath:) builds a valid file URL without force-unwrapping.
let repoDir = URL(fileURLWithPath: "<path/to/model/dir>")
let reranker = try Reranker(from: repoDir)

let query = "What is machine learning?"
// Note: the first sentence was previously split across two array elements
// by accident; it is a single document here.
let documents = [
    "Machine learning is a subset of artificial intelligence that enables computers to learn and make decisions without being explicitly programmed.",
    "Machine learning algorithms build mathematical models based on sample data to make predictions or decisions.",
    "Deep learning is a subset of machine learning that uses neural networks with multiple layers.",
    "Python is a popular programming language for machine learning and data science.",
    "The weather today is sunny and warm."
]

// Score every document against the query; one score per document.
let result = try await reranker.rerank(query, documents: documents)
print(result.scores)

API 说明

核心方法

init(modelPath: String, tokenizerPath: String? = nil, deviceId: String? = nil, plugin: Plugin = .cpu_gpu) throws
  • 从本地文件初始化重排模型,可指定插件与设备 ID
func rerank(_ query: String, documents: [String], config: RerankConfig = .init()) async throws -> RerankerResult
  • 基于查询对文档列表打分排序
convenience init(from repoFolder: URL, plugin: Plugin = .cpu_gpu) throws
  • 从本地 HuggingFace 仓库加载重排模型

RerankerResult

/// Result of a rerank request.
public struct RerankerResult {
    /// Relevance score for each input document, in input order.
    public let scores: [Float]
    /// Performance metrics for the rerank run.
    public let profileData: ProfileData
}

RerankConfig

/// Configuration for reranking.
public struct RerankConfig {
    /// Number of documents scored per batch.
    public var batchSize: Int32

    /// Score normalization: "softmax" | "min-max" | "l2"; `nil` means raw scores.
    public var normalizeMethod: NormalizeMethod?

    /// Available score-normalization methods.
    public enum NormalizeMethod: String, CaseIterable {
        case softmax
        case minMax = "min-max"
        case l2
    }
}

如何选择 CPU/GPU 与 ANE

Nexa iOS / macOS SDK 提供两种加速模式:CPU/GPU 与 ANE。每个模型仅适配特定硬件,请查阅对应 HuggingFace 模型卡。

CPU/GPU 模式

// CPU/GPU embedder — uses a GGUF-format model.
// `Embedder(from:plugin:)` takes a directory URL, not a raw string.
let embedder = try Embedder(from: URL(fileURLWithPath: "<path/to/model/dir>"), plugin: .cpu_gpu)

ANE 模式

// ANE embedder — uses a CoreML-format model.
// `Embedder(from:plugin:)` takes a directory URL, not a raw string.
let embedder = try Embedder(from: URL(fileURLWithPath: "<path/to/model/dir>"), plugin: .ane)

Embedder 模块同时支持 CPU/GPU 与 ANE;LLM、VLM、Reranker 目前仅支持 CPU/GPU;ASR 仅支持 ANE,不支持 CPU/GPU。

需要帮助?

加入社区获取支持、分享项目并与其他开发者交流。