
Usage Examples

LLM

Python
import os
from nexaai.llm import LLM, GenerationConfig
from nexaai.common import ModelConfig, ChatMessage

# Initialize model
model_path = "~/.cache/nexa.ai/nexa_sdk/models/Qwen/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf"
m_cfg = ModelConfig()
llm = LLM.from_(model_path, plugin_id="cpu_gpu", device_id="cpu", m_cfg=m_cfg)

# Create conversation
conversation = [ChatMessage(role="system", content="You are a helpful assistant.")]
conversation.append(ChatMessage(role="user", content="Hello, how are you?"))

# Apply chat template and generate
prompt = llm.apply_chat_template(conversation)
for token in llm.generate_stream(prompt, g_cfg=GenerationConfig(max_tokens=100)):
    print(token, end="", flush=True)

VLM

Python
import os
from nexaai.vlm import VLM, GenerationConfig
from nexaai.common import ModelConfig, MultiModalMessage, MultiModalMessageContent

# Initialize model
model_path = "~/.cache/nexa.ai/nexa_sdk/models/NexaAI/gemma-3n-E4B-it-4bit-MLX/model-00001-of-00002.safetensors"
m_cfg = ModelConfig()
vlm = VLM.from_(name_or_path=model_path, m_cfg=m_cfg, plugin_id="cpu_gpu", device_id="")

# Create multimodal conversation
conversation = [MultiModalMessage(role="system", 
                                content=[MultiModalMessageContent(type="text", text="You are a helpful assistant.")])]

# Add user message with image
contents = [
    MultiModalMessageContent(type="text", text="Describe this image"),
    MultiModalMessageContent(type="image", text="path/to/image.jpg")
]
conversation.append(MultiModalMessage(role="user", content=contents))

# Apply chat template and generate
prompt = vlm.apply_chat_template(conversation)
for token in vlm.generate_stream(prompt, g_cfg=GenerationConfig(max_tokens=100, image_paths=["path/to/image.jpg"])):
    print(token, end="", flush=True)

LLM

Initialize LLM with model path

Python
import os
from nexaai.llm import LLM
from nexaai.common import ModelConfig

model_path = os.path.expanduser("~/.cache/nexa.ai/nexa_sdk/models/Qwen/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf")
m_cfg = ModelConfig()
llm = LLM.from_(model_path, plugin_id="cpu_gpu", device_id="cpu", m_cfg=m_cfg)

API Use Cases

Generate text from prompt

Python
from nexaai.llm import GenerationConfig

config = GenerationConfig(max_tokens=100)
prompt = "Once upon a time"
for token in llm.generate_stream(prompt, g_cfg=config):
    print(token, end="", flush=True)

Stream and collect the full output

Python
import io

strbuff = io.StringIO()
for token in llm.generate_stream(prompt, g_cfg=GenerationConfig(max_tokens=100)):
    print(token, end="", flush=True)
    strbuff.write(token)

full_text = strbuff.getvalue()
print(f"\nFull text: {full_text}")

Chat Template

Python
from nexaai.common import ChatMessage

conversation = [
    ChatMessage(role="system", content="You are a helpful assistant."),
    ChatMessage(role="user", content="What is the capital of France?"),
]
prompt = llm.apply_chat_template(conversation)
for token in llm.generate_stream(prompt, g_cfg=GenerationConfig(max_tokens=100)):
    print(token, end="", flush=True)
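
To carry the exchange over multiple turns, one option is to collect the streamed reply, append it back to the conversation, and re-apply the chat template. This is a minimal sketch using only the calls shown above; it assumes the reply can be appended as a ChatMessage with role="assistant".

Python
import io

# Collect the streamed assistant reply
reply = io.StringIO()
for token in llm.generate_stream(prompt, g_cfg=GenerationConfig(max_tokens=100)):
    reply.write(token)

# Append the assistant reply and the next user turn, then rebuild the prompt
conversation.append(ChatMessage(role="assistant", content=reply.getvalue()))
conversation.append(ChatMessage(role="user", content="And how many people live there?"))
prompt = llm.apply_chat_template(conversation)
for token in llm.generate_stream(prompt, g_cfg=GenerationConfig(max_tokens=100)):
    print(token, end="", flush=True)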

KV Cache

Python
# Save KV cache
llm.save_kv_cache("path/to/kvcache")

# Load KV cache
llm.load_kv_cache("path/to/kvcache")

# Reset conversation
llm.reset()
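
A typical flow is to persist state after processing a long prompt and restore it later instead of re-prefilling. The sketch below only combines the calls shown above; the exact cache semantics and supported path format depend on the backend.

Python
# Run a generation once, then persist the model's KV state
prompt = llm.apply_chat_template(conversation)
for token in llm.generate_stream(prompt, g_cfg=GenerationConfig(max_tokens=100)):
    print(token, end="", flush=True)
llm.save_kv_cache("path/to/kvcache")

# Later: clear the in-memory state and restore the saved cache
llm.reset()
llm.load_kv_cache("path/to/kvcache")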

Profiling Data

Python
# Get profiling information
profiling_data = llm.get_profiling_data()
if profiling_data is not None:
    print(profiling_data)

VLM

Initialize VLM from model files

Python
import os
from nexaai.vlm import VLM
from nexaai.common import ModelConfig

model_path = os.path.expanduser("~/.cache/nexa.ai/nexa_sdk/models/NexaAI/gemma-3n-E4B-it-4bit-MLX/model-00001-of-00002.safetensors")
m_cfg = ModelConfig()
vlm = VLM.from_(name_or_path=model_path, m_cfg=m_cfg, plugin_id="cpu_gpu", device_id="")

API Use Cases

Generate text from prompt

Python
from nexaai.vlm import GenerationConfig

config = GenerationConfig(max_tokens=100)
prompt = "Describe this image"
for token in vlm.generate_stream(prompt, g_cfg=config):
    print(token, end="", flush=True)

Stream and collect the full output

Python
import io

strbuff = io.StringIO()
for token in vlm.generate_stream(prompt, g_cfg=GenerationConfig(max_tokens=100)):
    print(token, end="", flush=True)
    strbuff.write(token)

full_text = strbuff.getvalue()
print(f"\nFull text: {full_text}")

Audio/Image Processing

Python
from nexaai.vlm import GenerationConfig
from nexaai.common import MultiModalMessage, MultiModalMessageContent

# Image processing
contents = [
    MultiModalMessageContent(type="text", text="Describe this image"),
    MultiModalMessageContent(type="image", text="path/to/image.jpg")
]
conversation = [MultiModalMessage(role="user", content=contents)]
prompt = vlm.apply_chat_template(conversation)

config = GenerationConfig(max_tokens=100, image_paths=["path/to/image.jpg"])
for token in vlm.generate_stream(prompt, g_cfg=config):
    print(token, end="", flush=True)

# Audio processing
contents = [
    MultiModalMessageContent(type="text", text="Translate this audio to English"),
    MultiModalMessageContent(type="audio", text="path/to/audio.mp3")
]
conversation = [MultiModalMessage(role="user", content=contents)]
prompt = vlm.apply_chat_template(conversation)

config = GenerationConfig(max_tokens=100, audio_paths=["path/to/audio.mp3"])
for token in vlm.generate_stream(prompt, g_cfg=config):
    print(token, end="", flush=True)

Profiling Data

Python
# Get profiling information
profiling_data = vlm.get_profiling_data()
if profiling_data is not None:
    print(profiling_data)

Embedder

Initialize

Python
import os
from nexaai.embedder import Embedder

model_path = os.path.expanduser("~/.cache/nexa.ai/nexa_sdk/models/NexaAI/jina-v2-fp16-mlx/model.safetensors")
embedder = Embedder.from_(name_or_path=model_path, plugin_id="cpu_gpu")

Generate embeddings for input texts

Python
from nexaai.embedder import EmbeddingConfig
import numpy as np

texts = [
    "On-device AI is a type of AI that is processed on the device itself, rather than in the cloud.",
    "Nexa AI allows you to run state-of-the-art AI models locally on CPU, GPU, or NPU.",
    "A ragdoll is a breed of cat that is known for its long, flowing hair and gentle personality.",
    "The capital of France is Paris."
]

config = EmbeddingConfig(batch_size=4)
embeddings = embedder.generate(texts=texts, config=config)

# Get embedding dimension
dim = embedder.get_embedding_dim()
print(f"Embedding dimension: {dim}")

# Process each embedding
for i, (text, embedding) in enumerate(zip(texts, embeddings)):
    print(f"Text {i+1}: {text}")
    print(f"Embedding shape: {len(embedding)} dimensions")
    print(f"First 10 elements: {embedding[:10]}")

Similarity Analysis

Python
# Generate query embedding
query = "what is on device AI"
query_embedding = embedder.generate(texts=[query], config=EmbeddingConfig(batch_size=1))[0]

# Calculate similarities
for i, (text, embedding) in enumerate(zip(texts, embeddings)):
    query_vec = np.array(query_embedding)
    text_vec = np.array(embedding)
    inner_product = np.dot(query_vec, text_vec)
    
    print(f"Text {i+1}: {text}")
    print(f"Inner product with query: {inner_product:.6f}")

Reranker

Initialize

Python
import os
from nexaai.rerank import Reranker

model_path = os.path.expanduser("~/.cache/nexa.ai/nexa_sdk/models/NexaAI/jina-v2-rerank-mlx/jina-reranker-v2-base-multilingual-f16.safetensors")
reranker = Reranker.from_(name_or_path=model_path, plugin_id="cpu_gpu")

Rerank documents against a query

Python
from nexaai.rerank import RerankConfig

query = "Where is on-device AI?"
documents = [
    "On-device AI is a type of AI that is processed on the device itself, rather than in the cloud.",
    "edge computing",
    "A ragdoll is a breed of cat that is known for its long, flowing hair and gentle personality.",
    "The capital of France is Paris."
]

config = RerankConfig(batch_size=4)
scores = reranker.rerank(query=query, documents=documents, config=config)

print(f"Query: {query}")
print(f"Documents: {len(documents)} documents")
print("-" * 50)
for i, score in enumerate(scores):
    print(f"[{score:.4f}] : {documents[i]}")

CV (Computer Vision)

OCR (Optical Character Recognition)

Python
import os
from nexaai.cv import CVCapabilities, CVModel, CVModelConfig

# Initialize OCR model
det_model_path = os.path.expanduser("~/.cache/nexa.ai/nexa_sdk/models/NexaAI/paddle-ocr-mlx/ch_ptocr_v4_det_infer.safetensors")
rec_model_path = os.path.expanduser("~/.cache/nexa.ai/nexa_sdk/models/NexaAI/paddle-ocr-mlx/ch_ptocr_v4_rec_infer.safetensors")
image_path = os.path.expanduser("~/.cache/nexa.ai/nexa_sdk/models/NexaAI/paddle-ocr-mlx/test_input.jpg")

config = CVModelConfig(capabilities=CVCapabilities.OCR,
                       det_model_path=det_model_path, 
                       rec_model_path=rec_model_path)

cv = CVModel.from_(name_or_path=det_model_path, config=config, plugin_id="cpu_gpu")
results = cv.infer(image_path)

print(f"Number of results: {results.result_count}")
for result in results.results:
    print(f"[{result.confidence:.2f}] {result.text}")

ASR (Automatic Speech Recognition)

Initialize

Python
import os
from nexaai.asr import ASR, ASRConfig

model_path = "NexaAI/parakeet-npu"
audio_path = "path/to/audio.wav"

asr = ASR.from_(name_or_path=model_path, plugin_id="npu", device_id="npu")

Transcribe audio file

Python
config = ASRConfig(timestamps="segment", beam_size=5, stream=False)
result = asr.transcribe(audio_path=audio_path, language="en", config=config)
print(result.transcript)

ASR Configuration Options

Python
# Available timestamp options
config = ASRConfig(timestamps="none")      # No timestamps
config = ASRConfig(timestamps="segment")   # Segment-level timestamps
config = ASRConfig(timestamps="word")      # Word-level timestamps (if supported)

# Beam size for decoding
config = ASRConfig(beam_size=5)

# Language specification
result = asr.transcribe(audio_path=audio_path, language="en", config=config)  # English
result = asr.transcribe(audio_path=audio_path, language="zh", config=config)  # Chinese
result = asr.transcribe(audio_path=audio_path, language="", config=config)    # Auto-detect

Configuration

ModelConfig

Python
from nexaai.common import ModelConfig

config = ModelConfig()
# Default configuration is usually sufficient for most use cases

GenerationConfig

Python
from nexaai.llm import GenerationConfig

config = GenerationConfig(
    max_tokens=100,           # Maximum tokens to generate
    # Additional parameters available based on model type
)

EmbeddingConfig

Python
from nexaai.embedder import EmbeddingConfig

config = EmbeddingConfig(
    batch_size=4,             # Batch size for processing
    # Additional parameters available
)

RerankConfig

Python
from nexaai.rerank import RerankConfig

config = RerankConfig(
    batch_size=4,             # Batch size for processing
    # Additional parameters available
)

ASRConfig

Python
from nexaai.asr import ASRConfig

config = ASRConfig(
    timestamps="segment",     # Timestamp granularity: none|segment|word
    beam_size=5,             # Beam size for decoding
    stream=False             # Whether to use streaming mode
)

Plugin ID Options

The plugin_id parameter supports different backends:
  • cpu_gpu: Default, supports both CPU and GPU
  • mlx: Apple Silicon optimized (for supported models)
  • llama_cpp: For GGUF format models
  • onnx: ONNX runtime backend
  • npu: For NPU acceleration (Windows ARM64, Snapdragon X Elite)

Example Usage with Different Backends

Python
# Using MLX backend (macOS Apple Silicon)
llm = LLM.from_(model_path, plugin_id="mlx", device_id="")

# Using NPU backend (Windows ARM64, Snapdragon X Elite)
llm = LLM.from_(model_path, plugin_id="npu", device_id="npu")

# Using ONNX backend
llm = LLM.from_(model_path, plugin_id="onnx", device_id="cpu")

# Using llama_cpp for GGUF models
llm = LLM.from_(model_path, plugin_id="llama_cpp", device_id="cpu")

Example Scripts

The NexaAI SDK repository includes example scripts that demonstrate various use cases. These scripts are located in the /bindings/python/ directory and can be used as reference implementations:
  • llm.py - LLM usage examples with command line interface
  • vlm.py - VLM usage examples with multimodal support
  • embedder.py - Text embedding generation examples
  • rerank.py - Document reranking examples
  • cv_ocr.py - OCR (Optical Character Recognition) examples
  • asr.py - Automatic Speech Recognition examples
These scripts support various command line arguments and can be run directly from the SDK repository for testing and experimentation.