Usage Examples

LLM

Swift
import NexaAI

func runLLM() async throws {
    // load model (resourcePath is a helper that returns a bundled file's path)
    let modelPath = try resourcePath(of: "Qwen3-0.6B-Q8_0")
    let llm = LLM(modelPath: modelPath)
    try llm.loadModel()

    print("Generation Stream")
    let prompt = "Once upon a time"
    let config = GenerationConfig.default
    let streamText = try await llm.generationStream(
        prompt: prompt,
        config: config,
        onToken: { token in
            print(token, terminator: "")
            return true
        }
    )
    print("full text: \(streamText)")
    
    // Using AsyncStream for streaming
    var result = ""
    for try await value in await llm.generationAsyncStream(prompt: prompt) {
        result += value
    }
    print("async stream text: \(result)")
}

Task {
    do {
        try await runLLM()
    } catch {
        print(error)
    }
}

VLM

Swift
import NexaAI

func runVLM() async throws {
    // load model
    let modelPath = try resourcePath(of: "Qwen3-4B-Q4_K_M.F32")
    let mmprojPath = try resourcePath(of: "mmproj-model-f16")
    let vlm = VLM(modelPath: modelPath, mmprojPath: mmprojPath)
    try vlm.loadModel()

    // generate (image case)
    var generationConfig = GenerationConfig()
    let imagePath = try resourcePath(of: "test1", ext: "jpg")
    generationConfig.imagePaths = [imagePath]
    var prompt = "Describe this image"
    let generatedText = try await vlm.generate(prompt: prompt, config: generationConfig)
    print("result: \(generatedText)")

    
    // generateStream (audio case)
    var config = GenerationConfig()
    let audioPath = try resourcePath(of: "test1", ext: "mp3")
    prompt = "Translate this audio to English"
    config.audioPaths = [audioPath]
    let streamText = try await vlm.generationStream(
        prompt: prompt,
        config: config,
        onToken: { token in
            print(token, terminator: "")
            // return false to stop
            return true
        }
    )
    print("full text: \(streamText)")
}

Task {
    do {
        try await runVLM()
    } catch {
        print(error)
    }
}

LLM

Initialize LLM with model path

Swift
guard let modelPath = Bundle.main.path(forResource: "path/to/model", ofType: "gguf") else {
    fatalError("model not found in bundle")
}
do {
    let llm = LLM(modelPath: modelPath)
    try llm.loadModel()
    // do something
} catch {
    print(error)
}
You can load the model file directly from the app bundle, or download it to the app sandbox first and then load it from there.
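
If you stage the model in the sandbox, a sketch like the following works; the file names and the copy-from-bundle step are placeholders, not part of the NexaAI API.

Swift
// Sketch only: stage a model file in the app sandbox, then load it.
// File names are placeholders.
func loadModelFromSandbox() throws -> LLM {
    let fm = FileManager.default
    let documents = try fm.url(
        for: .documentDirectory,
        in: .userDomainMask,
        appropriateFor: nil,
        create: true
    )
    let destination = documents.appendingPathComponent("model.gguf")
    // Copy from the bundle on first launch; a real app might
    // download the file here instead.
    if !fm.fileExists(atPath: destination.path),
       let bundled = Bundle.main.url(forResource: "model", withExtension: "gguf") {
        try fm.copyItem(at: bundled, to: destination)
    }
    let llm = LLM(modelPath: destination.path)
    try llm.loadModel()
    return llm
}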

API Use cases

Generate text from prompt

Swift
var config = GenerationConfig.default
config.maxTokens = 32
let out = try await llm.generate(prompt: prompt, config: config)
print(out)

Generate text by streaming

Swift
// using callback
let streamText = try await llm.generationStream(
    prompt: prompt,
    config: .init(),
    onToken: { token in
        print(token)
        return true
    }
)
print("full text: \(streamText)")

// Using AsyncStream for streaming
var result = ""
for try await value in await llm.generationAsyncStream(prompt: prompt) {
    result += value
}

Generate embeddings for input texts

Swift
let llm = try LLM(modelPath: path)
let embedResult = try await llm.embed(texts: [prompt])
if embedResult.count > 32 {
    print(embedResult.prefix(32), "... (\(embedResult.count - 32) more)")
} else {
    print(embedResult)
}

LoRA Model Management

Swift
// add a LoRA adapter from file
try llm.addLoRA("/path/to/lora.bin")
// switch the active adapter by id
try llm.setLoRA(1)
// remove the adapter with the given id
try llm.removeLoRA(by: 1)

KV Cache

Swift
// persist the current KV cache to disk
try llm.saveKVCache(to: "path/to/kvcache")
// restore a previously saved KV cache
try llm.loadKVCache(from: "path/to/kvcache")
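
A typical use is to pay the prompt-processing cost only once: generate against a long shared prefix, save the cache, and restore it in a later session. A sketch built on these calls, with the path and prompt as placeholders:

Swift
// Sketch: persist the KV cache after processing a long prompt,
// then restore it later to skip re-processing. Placeholder values.
let cachePath = NSTemporaryDirectory() + "session.kvcache"
let systemPrompt = "You are a helpful assistant. ..." // placeholder
_ = try await llm.generate(prompt: systemPrompt, config: .default)
try llm.saveKVCache(to: cachePath)

// ... in a later session, after loading the same model ...
try llm.loadKVCache(from: cachePath)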

Sampling Configuration

Swift
try llm.setSampler(config: .init(
    temperature: 0.8,
    topP: 0.9,
    topK: 40,
    repetitionPenalty: 1.0,
    presencePenalty: 0.0,
    seed: 42
))

// reset default
try llm.resetSampler()

Chat Template

Swift
let messages: [ChatMessage] = [
    ChatMessage(role: .system, content: "You are a helpful assistant."),
    ChatMessage(role: .user, content: "What is the capital of France?"),
]
let applyChatResult = try await llm.applyChatTemplate(messages: messages)
print(applyChatResult)

let genConfig = GenerationConfig()
let chatResult = try await llm.generate(prompt: applyChatResult, config: genConfig)
print(chatResult)
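
Multi-turn chat is a matter of appending each exchange to the messages array and re-applying the template. A minimal sketch, assuming the role enum also provides an .assistant case:

Swift
// Sketch of a multi-turn loop built on applyChatTemplate/generate.
var history: [ChatMessage] = [
    ChatMessage(role: .system, content: "You are a helpful assistant.")
]

func ask(_ question: String) async throws -> String {
    history.append(ChatMessage(role: .user, content: question))
    let rendered = try await llm.applyChatTemplate(messages: history)
    let answer = try await llm.generate(prompt: rendered, config: GenerationConfig())
    // .assistant is assumed here; check the actual role cases
    history.append(ChatMessage(role: .assistant, content: answer))
    return answer
}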

VLM

Initialize VLM from model files

Swift
guard
    let modelPath = Bundle.main.path(forResource: "path/to/model", ofType: "gguf"),
    let mmprojPath = Bundle.main.path(forResource: "path/to/mmproj", ofType: "gguf")
else {
    fatalError("model files not found in bundle")
}
do {
    let vlm = VLM(modelPath: modelPath, mmprojPath: mmprojPath)
    try vlm.loadModel()
} catch {
    print(error)
}

API Use cases

Generate text from prompt

Swift
var generationConfig = GenerationConfig()
let generatedText = try await vlm.generate(prompt: text, config: generationConfig)

Generate text by streaming

Swift
// using callback
let streamText = try await vlm.generationStream(
    prompt: text,
    config: generationConfig,
    onToken: { token in
        if !token.isEmpty {
            print(token, terminator: "")
        } else {
            print("Stream ended")
        }
        return true
    }
)
print("\nFull text: \(streamText)")

// Using AsyncStream for streaming
var result = ""
for try await value in await vlm.generationAsyncStream(prompt: text) {
    result += value
}

Audio/Image

Swift
var config = GenerationConfig()
config.maxTokens = 32
config.audioPaths = ["path/to/audio"]
config.imagePaths = ["path/to/image"]
let prompt = "Translate this audio to English"
let result = try await vlm.generationStream(prompt: prompt, config: config) { token in
    print(token)
    return true
}
print(result)

Generate embeddings for input texts

Swift
let embedResult = try await vlm.embed(texts: ["some prompt"])
if embedResult.count > 32 {
    print(embedResult.prefix(32), "... (\(embedResult.count - 32) more)")
} else {
    print(embedResult)
}

Encode/Decode

Swift
let text = "Describe this image"
let tokens = try vlm.encode(text: text)
print("Encoded tokens: \(tokens)")

print("test decode")
let decodedText = try vlm.decode(tokens: tokens)
print("Decoded text: \(decodedText)")

Sampling Configuration

Swift
try vlm.setSampler(config: .default)
try vlm.resetSampler()

Chat Template

Swift
print("test chat")
let messages: [ChatMessage] = [
    ChatMessage(role: .system, content: "You are a helpful assistant."),
    ChatMessage(role: .user, content: "Hello, who are you?"),
]
let applyChatResult = try await vlm.applyChatTemplate(messages: messages)
print(applyChatResult)

let applyGeneratedText = try await vlm.generate(prompt: applyChatResult, config: GenerationConfig())
print("apply generated text: \(applyGeneratedText)")

Reranker

Initialize

Swift
let reranker = try Reranker(modelPath: "path/to/model", tokenizerPath: "path/to/tokenizer.model")

Rerank documents against a query

Swift
let query = "Today busy"
let documents = [
    "What is the weather today?",
    "What is the weather tomorrow?",
    "I'm a bit busy today."
]
let config = RerankConfig(batchSize: 32, normalize: false, normalizeMethod: .none)
let scores = try await reranker.rerank(query, documents: documents, config: config)
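
The scores come back as one value per document, so ranking is a zip-and-sort (assuming scores are returned in document order):

Swift
// Pair each document with its score and sort best-first.
let ranked = zip(documents, scores).sorted { $0.1 > $1.1 }
for (doc, score) in ranked {
    print(score, doc)
}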

Embedder

Initialize

Swift
let embedder = try Embedder(modelPath: "path/to/model", tokenizerPath: "path/to/tokenizer.model")

Generate embeddings for input texts

Swift
let text = "Hello World!"
let cfg = EmbeddingConfig(batchSize: 32, normalize: true, normalizeMethod: .l2)
let embeddings = try embedder.embed(texts: [text], config: cfg)
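
With normalize: true and .l2, the dot product of two embeddings equals their cosine similarity. A sketch, assuming embed returns one flat [Float] with the per-text vectors concatenated back-to-back (check the actual return type):

Swift
// Sketch: cosine similarity between two embedded texts.
let texts = ["Hello World!", "Goodbye World!"]
let flat = try embedder.embed(texts: texts, config: cfg)
let dim = flat.count / texts.count
let a = flat[0..<dim]
let b = flat[dim..<(2 * dim)]
// L2-normalized vectors: dot product == cosine similarity
let similarity = zip(a, b).map(*).reduce(0, +)
print("cosine similarity: \(similarity)")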

Configuration

ModelConfig

Swift
public struct ModelConfig {
    // text context, 0 = from model
    public var nCtx: Int32
    // number of threads to use for generation
    public var nThreads: Int32
    // number of threads to use for batch processing
    public var nThreadsBatch: Int32
    // logical maximum batch size that can be submitted to llama_decode
    public var nBatch: Int32
    // physical maximum batch size
    public var nUbatch: Int32
    // max number of sequences (i.e. distinct states for recurrent models)
    public var nSeqMax: Int32
    // path to chat template file, optional
    public var chatTemplatePath: String?
    // content of chat template file, optional
    public var chatTemplateContent: String?
}
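
For illustration, a configuration tuned for a longer context might look like the following; the default initializer and how the config is handed to the model are assumptions here, so check the actual signatures.

Swift
// Illustrative values only; ModelConfig() is assumed to provide defaults.
var modelConfig = ModelConfig()
modelConfig.nCtx = 4096            // 4K-token context window (0 = from model)
modelConfig.nThreads = 4           // generation threads
modelConfig.nBatch = 512           // logical batch size for decode
modelConfig.chatTemplatePath = nil // fall back to the model's own template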

SamplerConfig

Swift
public struct SamplerConfig {
    // Sampling temperature (0.0-2.0)
    public var temperature: Float
    // Nucleus sampling parameter (0.0-1.0)
    public var topP: Float
    // Top-k sampling parameter
    public var topK: Int32
    // Minimum probability for nucleus sampling
    public var minP: Float
    // Penalty for repeated tokens
    public var repetitionPenalty: Float
    // Penalty for token presence
    public var presencePenalty: Float
    // Penalty for token frequency
    public var frequencyPenalty: Float
    // Random seed (-1 for random)
    public var seed: Int32
    // Optional grammar file path
    public var grammarPath: String?
}
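
As a concrete illustration of these fields, a near-deterministic setup applied through the setSampler call shown earlier; SamplerConfig() is assumed to provide defaults for the remaining fields.

Swift
var sampler = SamplerConfig()
sampler.temperature = 0.0   // greedy: always pick the most likely token
sampler.topK = 1
sampler.seed = 42           // fixed seed for reproducibility
try llm.setSampler(config: sampler)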

GenerationConfig

Swift
public struct GenerationConfig {
    // Maximum tokens to generate
    public var maxTokens: Int32
    // Array of stop sequences
    public var stop: [String]
    // Number of past tokens to consider
    public var nPast: Int32
    // Advanced sampling config
    public var samplerConfig: SamplerConfig
    // Array of image paths for VLM, empty for none
    public var imagePaths: [String]
    // Array of audio paths for VLM, empty for none
    public var audioPaths: [String]
}
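
For example, capping output length and stopping at a delimiter (values are illustrative, with prompt defined as before):

Swift
var config = GenerationConfig()
config.maxTokens = 128                 // hard cap on generated tokens
config.stop = ["\n\n", "###"]          // stop at the first matching sequence
config.samplerConfig.temperature = 0.7 // tweak the nested sampler
let summary = try await llm.generate(prompt: prompt, config: config)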

RerankConfig

Swift
public struct RerankConfig {
    // Processing batch size
    public var batchSize: Int32
    // Whether to normalize scores
    public var normalize: Bool
    // Normalization method
    public var normalizeMethod: RerankConfig.NormalizeMethod
}

// Normalization method (nested in RerankConfig)
public enum NormalizeMethod: String, CaseIterable {
    case softmax
    case minMax
    case none
}

EmbeddingConfig

Swift
public struct EmbeddingConfig {
    // Processing batch size
    public var batchSize: Int32
    // Whether to normalize embeddings
    public var normalize: Bool
    // Normalization method
    public var normalizeMethod: EmbeddingConfig.NormalizeMethod
}

// Normalization method (nested in EmbeddingConfig)
public enum NormalizeMethod: String {
    case l2
    case mean
    case none
}