Detailed usage examples for the LLM, VLM, Reranker, and Embedder classes in the Nexa iOS SDK.
import NexaAI
func runLLM() async throws {
    // load model
    let modelPath = try resoucePath(of: "Qwen3-0.6B-Q8_0")
    let llm = LLM(modelPath: modelPath)
    try llm.loadModel()

    print("Generation Stream")
    let prompt = "Once upon a time"
    let config = GenerationConfig.default
    let streamText = try await llm.generationStream(
        prompt: prompt,
        config: config,
        onToken: { token in
            print(token, terminator: "")
            // return false to stop
            return true
        }
    )
    print("full text: \(streamText)")

    // Using AsyncStream for streaming
    var result = ""
    for try await value in await llm.generationAsyncStream(prompt: prompt) {
        result += value
    }
    print("async stream result: \(result)")
}

Task {
    do {
        try await runLLM()
    } catch {
        print(error)
    }
}
import NexaAI
func runVLM() async throws {
    // load model
    let modelPath = try resoucePath(of: "Qwen3-4B-Q4_K_M.F32")
    let mmprojPath = try resoucePath(of: "mmproj-model-f16")
    let vlm = VLM(modelPath: modelPath, mmprojPath: mmprojPath)
    try vlm.loadModel()

    // generate (image case)
    var generationConfig = GenerationConfig()
    let imagePath = try resoucePath(of: "test1", ext: "jpg")
    generationConfig.imagePaths = [imagePath]
    var prompt = "Describe this image"
    let generatedText = try await vlm.generate(prompt: prompt, config: generationConfig)
    print("result: \(generatedText)")

    // generationStream (audio case)
    var config = GenerationConfig()
    let audioPath = try resoucePath(of: "test1", ext: "mp3")
    prompt = "Translate this audio to English"
    config.audioPaths = [audioPath]
    let streamText = try await vlm.generationStream(
        prompt: prompt,
        config: config,
        onToken: { token in
            print(token, terminator: "")
            // return false to stop
            return true
        }
    )
    print("full text: \(streamText)")
}

Task {
    do {
        try await runVLM()
    } catch {
        print(error)
    }
}
// Bundle.main.path returns an optional, so unwrap it before use
guard let modelPath = Bundle.main.path(forResource: "path/to/model", ofType: "gguf") else {
    fatalError("model not found in bundle")
}
do {
    let llm = LLM(modelPath: modelPath)
    try llm.loadModel()
    // do something
} catch {
    print(error)
}
var config = GenerationConfig.default
config.maxTokens = 32
let out = try await llm.generate(prompt: prompt, config: config)
print(out)
// using callback
let streamText = try await llm.generationStream(
    prompt: prompt,
    config: .init(),
    onToken: { token in
        print(token)
        return true
    }
)
print("full text: \(streamText)")
// Using AsyncStream for streaming
var result = ""
for try await value in await llm.generationAsyncStream(prompt: prompt) {
    result += value
}
print(result)
let llm = LLM(modelPath: modelPath)
try llm.loadModel()
let embedResult = try await llm.embed(texts: [prompt])
if embedResult.count > 32 {
    print(embedResult.prefix(32), "... (\(embedResult.count - 32) more)")
} else {
    print(embedResult)
}
// LoRA adapters: add, select, and remove by id
try llm.addLoRA("/path/to/lora.bin")
try llm.setLoRA(1)
try llm.removeLoRA(by: 1)
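// Persist the KV cache to disk and reload it later to resume a session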
try llm.saveKVCache(to: "path/to/kvcache")
try llm.loadKVCache(from: "path/to/kvcache")
try llm.setSampler(config: .init(temperature: 0.8, topP: 0.9, topK: 40, repetitionPenalty: 1.0, presencePenalty: 0.0, seed: 42))
// reset default
try llm.resetSampler()
let messages: [ChatMessage] = [
    ChatMessage(role: .system, content: "You are a helpful assistant."),
    ChatMessage(role: .user, content: "What is the capital of France?"),
]
let applyChatResult = try await llm.applyChatTemplate(messages: messages)
print(applyChatResult)
let genConfig = GenerationConfig()
let chatResult = try await llm.generate(prompt: applyChatResult, config: genConfig)
print(chatResult)
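To continue the conversation, append the model's reply and the next user turn to the message history, then re-apply the chat template. This is a sketch; the .assistant role name is an assumption, so check the ChatMessage role cases in the SDK.

// Multi-turn chat sketch: keep a running history and re-apply the template each turn.
// Assumption: ChatMessage supports an .assistant role for model replies.
var history = messages
history.append(ChatMessage(role: .assistant, content: chatResult))
history.append(ChatMessage(role: .user, content: "And what about Germany?"))
let nextPrompt = try await llm.applyChatTemplate(messages: history)
let nextAnswer = try await llm.generate(prompt: nextPrompt, config: genConfig)
print(nextAnswer)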
// Bundle.main.path returns optionals, so unwrap both paths before use
guard
    let modelPath = Bundle.main.path(forResource: "path/to/model", ofType: "gguf"),
    let mmprojPath = Bundle.main.path(forResource: "path/to/mmproj", ofType: "gguf")
else {
    fatalError("model or mmproj not found in bundle")
}
do {
    let vlm = VLM(modelPath: modelPath, mmprojPath: mmprojPath)
    try vlm.loadModel()
} catch {
    print(error)
}
var generationConfig = GenerationConfig()
let text = "Describe this image"
let generatedText = try await vlm.generate(prompt: text, config: generationConfig)
// using callback
let streamText = try await vlm.generationStream(
    prompt: text,
    config: generationConfig,
    onToken: { token in
        if !token.isEmpty {
            print(token, terminator: "")
        } else {
            print("Stream ended")
        }
        return true
    }
)
print("\nFull text: \(streamText)")
// Using AsyncStream for streaming
var result = ""
for try await value in await vlm.generationAsyncStream(prompt: text) {
    result += value
}
print(result)
var config = GenerationConfig()
config.maxTokens = 32
config.audioPaths = ["path/to/audio"]
config.imagePaths = ["path/to/image"]
let prompt = "Translate this audio to English"
let result = try await vlm.generationStream(prompt: prompt, config: config) { token in
    print(token)
    return true
}
print(result)
let embedResult = try vlm.embed(texts: ["some prompt"])
if embedResult.count > 32 {
    print(embedResult.prefix(32), "... (\(embedResult.count - 32) more)")
} else {
    print(embedResult)
}
let text = "Describe this image"
let tokens = try vlm.encode(text: text)
print("Encoded tokens: \(tokens)")
print("test decode")
let decodedText = try vlm.decode(tokens: tokens)
print("Decoded text: \(decodedText)")
try vlm.setSampler(config: .default)
try vlm.resetSampler()
print("test chat")
let messages: [ChatMessage] = [
    ChatMessage(role: .system, content: "You are a helpful assistant."),
    ChatMessage(role: .user, content: "Hello, who are you?"),
]
let applyChatResult = try await vlm.applyChatTemplate(messages: messages)
print(applyChatResult)
let applyGeneratedText = try await vlm.generate(prompt: applyChatResult, config: generationConfig)
print("apply generated text: \(applyGeneratedText)")
let reranker = try Reranker(modelPath: "path/to/model", tokenizerPath: "path/to/tokenizer.model")
let query = "Today busy"
let documents = [
    "What is the weather today?",
    "What is the weather tomorrow?",
    "I'm a bit busy today."
]
let config = RerankConfig(batchSize: 32, normalize: false, normalizeMethod: .none)
let scores = try await reranker.rerank(query, documents: documents, config: config)
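Each score corresponds to the document at the same index, so pairing and sorting gives a ranking. This sketch assumes rerank returns one Float per document, in document order:

// Pair each document with its score and print them best-first.
// Assumption: `scores` is one value per document, in the same order as `documents`.
let ranked = zip(documents, scores).sorted { $0.1 > $1.1 }
for (doc, score) in ranked {
    print(score, "-", doc)
}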
let embedder = try Embedder(modelPath: "path/to/model", tokenizerPath: "path/to/tokenizer.model")
let text = "Hello World!"
let cfg = EmbeddingConfig(batchSize: 32, normalize: true, normalizeMethod: .l2)
let embeddings = try embedder.embed(texts: [text], config: cfg)
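To compare two texts, embed each one and measure cosine similarity between the resulting vectors. This is a sketch that assumes each single-text embed call yields one flat [Float] vector, matching the embed examples above; adjust the indexing if the SDK returns nested arrays.

// Cosine similarity between two embedding vectors (plain Swift, no SDK calls).
func cosineSimilarity(_ a: [Float], _ b: [Float]) -> Float {
    let dot = zip(a, b).map { $0 * $1 }.reduce(0, +)
    let normA = a.map { $0 * $0 }.reduce(0, +).squareRoot()
    let normB = b.map { $0 * $0 }.reduce(0, +).squareRoot()
    return dot / (normA * normB)
}

// Assumption: each single-text embed call returns one flat [Float] vector.
let v1 = try embedder.embed(texts: ["Hello World!"], config: cfg)
let v2 = try embedder.embed(texts: ["Hi there!"], config: cfg)
print("similarity:", cosineSimilarity(v1, v2))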
public struct ModelConfig {
    // text context size in tokens; 0 = use the model's default
    public var nCtx: Int32
    // number of threads to use for generation
    public var nThreads: Int32
    // number of threads to use for batch processing
    public var nThreadsBatch: Int32
    // logical maximum batch size that can be submitted to llama_decode
    public var nBatch: Int32
    // physical maximum batch size
    public var nUbatch: Int32
    // max number of sequences (i.e. distinct states for recurrent models)
    public var nSeqMax: Int32
    // path to a chat template file, optional
    public var chatTemplatePath: String?
    // content of a chat template file, optional
    public var chatTemplateContent: String?
}
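A sketch of how these fields might be tuned. It assumes ModelConfig exposes a default initializer and that the configured value is passed when creating or loading the model; check the LLM/VLM initializers for the exact parameter.

// Hypothetical setup; ModelConfig() and the way the config is handed to the
// model are assumptions, not confirmed SDK API.
var modelConfig = ModelConfig()
modelConfig.nCtx = 4096          // 4096-token context window (0 = model default)
modelConfig.nThreads = 4         // generation threads
modelConfig.nThreadsBatch = 4    // batch-processing threads
modelConfig.nBatch = 512         // logical max batch size for llama_decode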
public struct SamplerConfig {
    // Sampling temperature (0.0-2.0)
    public var temperature: Float
    // Nucleus sampling parameter (0.0-1.0)
    public var topP: Float
    // Top-k sampling parameter
    public var topK: Int32
    // Minimum probability for nucleus sampling
    public var minP: Float
    // Penalty for repeated tokens
    public var repetitionPenalty: Float
    // Penalty for token presence
    public var presencePenalty: Float
    // Penalty for token frequency
    public var frequencyPenalty: Float
    // Random seed (-1 for random)
    public var seed: Int32
    // Optional grammar file path
    public var grammarPath: String?
}
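These fields map onto the setSampler(config:) call shown earlier; for example, lowering the temperature makes output more deterministic:

// Lower temperature and a mild repetition penalty for more deterministic output.
try llm.setSampler(config: .init(temperature: 0.2, topP: 0.9, topK: 40, repetitionPenalty: 1.1, presencePenalty: 0.0, seed: 42))
// Restore the default sampler when finished.
try llm.resetSampler()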
public struct GenerationConfig {
    // Maximum tokens to generate
    public var maxTokens: Int32
    // Array of stop sequences
    public var stop: [String]
    // Number of past tokens to consider
    public var nPast: Int32
    // Advanced sampling config
    public var samplerConfig: SamplerConfig
    // Array of image paths for VLM, empty for none
    public var imagePaths: [String]
    // Array of audio paths for VLM, empty for none
    public var audioPaths: [String]
}
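Per-request options all live on GenerationConfig; for example, capping output length and stopping early at a blank line (llm is a loaded model as in the examples above):

// Cap generation at 128 tokens and stop early at a blank line.
var requestConfig = GenerationConfig()
requestConfig.maxTokens = 128
requestConfig.stop = ["\n\n"]
let answer = try await llm.generate(prompt: "List three Swift keywords.", config: requestConfig)
print(answer)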
public struct RerankConfig {
    // Processing batch size
    public var batchSize: Int32
    // Whether to normalize scores
    public var normalize: Bool
    // Normalization method
    public var normalizeMethod: RerankConfig.NormalizeMethod

    // Normalization method
    public enum NormalizeMethod: String, CaseIterable {
        case softmax
        case minMax
        case none
    }
}
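With normalize enabled, the scores are rescaled by the chosen method; reusing the reranker, query, and documents from the example above:

// Same rerank call, but with softmax-normalized scores.
let softmaxConfig = RerankConfig(batchSize: 32, normalize: true, normalizeMethod: .softmax)
let normalizedScores = try await reranker.rerank(query, documents: documents, config: softmaxConfig)
print(normalizedScores)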
public struct EmbeddingConfig {
    // Processing batch size
    public var batchSize: Int32
    // Whether to normalize embeddings
    public var normalize: Bool
    // Normalization method
    public var normalizeMethod: EmbeddingConfig.NormalizeMethod

    // Normalization method
    public enum NormalizeMethod: String {
        case l2
        case mean
        case none
    }
}
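The embedding normalization methods differ from the reranker's; for example, mean normalization instead of L2 (embedder, text, and cfg as in the Embedder example above):

// Same embed call as above, but with mean normalization (.none keeps raw values).
let meanCfg = EmbeddingConfig(batchSize: 32, normalize: true, normalizeMethod: .mean)
let meanEmbeddings = try embedder.embed(texts: [text], config: meanCfg)
print(meanEmbeddings.count)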