NexaAI macOS Setup Guide
This guide demonstrates how to use the NexaAI SDK for various AI inference tasks on macOS, including:
- LLM (Large Language Model): Text generation and conversation
- VLM (Vision Language Model): Multimodal understanding and generation
- Embedder: Text vectorization and similarity computation
- Reranker: Document reranking
- ASR (Automatic Speech Recognition): Speech-to-text transcription
- CV (Computer Vision): OCR/text recognition
Prerequisites
1. Install the correct Python version
NexaAI requires Python 3.10 on macOS. Verify the installation:
python -c "import sys, platform; print(f'Python version: {sys.version}')"
Python version: 3.10.18 (main, Jun 3 2025, 18:23:41) [Clang 17.0.0 (clang-1700.0.13.5)]
The expected output must contain version 3.10.x.
Here are suggested ways to install Python 3.10:
Option 1: Using Homebrew
If you don’t have Homebrew, first install it from https://brew.sh/. Then, in your Terminal:
brew install python@3.10
Option 2: Using Conda
conda create -n nexaai python=3.10
conda activate nexaai
After installation, verify that the python3.10 interpreter is available:
python3.10 --version
2. Create and activate a virtual environment
python3.10 -m venv nexaai-env
source nexaai-env/bin/activate
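To confirm the virtual environment is active, check which interpreter is now on your PATH; it should point inside nexaai-env and report Python 3.10.x:
which python
python --version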
3. Install the NexaAI SDK
pip install 'nexaai[mlx]'
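As a quick sanity check that the SDK installed correctly, you can confirm the package is visible to pip and that its modules import (the import path below mirrors the LLM example later in this guide):
pip show nexaai
python -c "from nexaai.llm import LLM; print('NexaAI SDK import OK')"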
4. Verify Your Environment
Run the following code to ensure you have the right environment:
import sys
import platform

current_ver = sys.version_info
arch = platform.machine()

if current_ver.major != 3 or current_ver.minor != 10:
    print(f"❌ Error: Python {current_ver.major}.{current_ver.minor} detected")
    print("Required: Python 3.10")
    print("Please install Python 3.10 and restart the kernel.")
    sys.exit(1)
elif arch != "arm64":
    print(f"⚠️ Warning: Python 3.10 detected, but running on {arch} rather than Apple Silicon (arm64).")
else:
    print("✅ Python 3.10 running natively on Apple Silicon - Ready to proceed!")
Authentication Setup
Before running any examples, you need to set up your NexaAI authentication token from https://sdk.nexa.ai/.
Set Token in Environment
Replace "YOUR_NEXA_TOKEN_HERE" with your actual NexaAI token:
export NEXA_TOKEN="YOUR_NEXA_TOKEN_HERE"
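The export above only applies to the current shell session. If you want the token available in new terminals as well, you can append it to your shell profile (macOS uses zsh by default):
echo 'export NEXA_TOKEN="YOUR_NEXA_TOKEN_HERE"' >> ~/.zshrc
source ~/.zshrc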
Verify the Token is Set
echo $NEXA_TOKEN
Your NEXA_TOKEN must start with key/. If it doesn't, please check your token from the dashboard.
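If you prefer to check the token from Python (for example, at the top of a notebook), here is a minimal sketch; it only reads the environment variable and verifies the key/ prefix mentioned above:
import os

token = os.environ.get("NEXA_TOKEN")
if not token:
    print('❌ NEXA_TOKEN is not set. Run: export NEXA_TOKEN="YOUR_NEXA_TOKEN_HERE"')
elif not token.startswith("key/"):
    print("❌ NEXA_TOKEN does not start with 'key/'. Please re-copy it from the dashboard.")
else:
    print("✅ NEXA_TOKEN is set and looks valid.")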
1. LLM (Large Language Model) Inference
Using MLX-accelerated large language models for text generation and conversation.
import io
import os
from nexaai.common import GenerationConfig, ModelConfig, ChatMessage
from nexaai.llm import LLM

def llm_example():
    """LLM Inference example"""
    print("=== LLM Inference Example ===")

    # Model configuration
    model_name = "NexaAI/Qwen3-1.7B-4bit-MLX"
    max_tokens = 100
    system_message = "You are a helpful assistant."

    print(f"Loading model: {model_name}")

    # Create model instance
    m_cfg = ModelConfig()
    llm = LLM.from_(name_or_path=model_name, m_cfg=m_cfg)

    # Create conversation history
    conversation = [ChatMessage(role="system", content=system_message)]

    # Example conversations
    test_prompts = [
        "What is artificial intelligence?",
        "Explain the benefits of on-device AI processing.",
        "How does MLX acceleration work?"
    ]

    for i, prompt in enumerate(test_prompts, 1):
        print(f"\n--- Conversation {i} ---")
        print(f"User: {prompt}")

        # Add user message
        conversation.append(ChatMessage(role="user", content=prompt))

        # Apply chat template
        formatted_prompt = llm.apply_chat_template(conversation)

        # Generate response
        print("Assistant: ", end="", flush=True)
        response_buffer = io.StringIO()
        for token in llm.generate_stream(formatted_prompt, g_cfg=GenerationConfig(max_tokens=max_tokens)):
            print(token, end="", flush=True)
            response_buffer.write(token)

        # Get profiling data
        profiling_data = llm.get_profiling_data()
        if profiling_data:
            print(f"\nProfiling data: {profiling_data}")

        # Add assistant response to conversation history
        conversation.append(ChatMessage(role="assistant", content=response_buffer.getvalue()))

        print("\n" + "=" * 50)

llm_example()
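The example above replays a fixed list of prompts. If you would rather chat interactively, the same calls can be wrapped in a simple input loop; the sketch below reuses LLM, ChatMessage, and GenerationConfig exactly as in the example, with the same model name and token limit assumed:
from nexaai.common import GenerationConfig, ModelConfig, ChatMessage
from nexaai.llm import LLM

def chat_loop():
    # Load the model once and keep a growing conversation history
    llm = LLM.from_(name_or_path="NexaAI/Qwen3-1.7B-4bit-MLX", m_cfg=ModelConfig())
    conversation = [ChatMessage(role="system", content="You are a helpful assistant.")]

    while True:
        user_input = input("\nUser (empty line to quit): ").strip()
        if not user_input:
            break
        conversation.append(ChatMessage(role="user", content=user_input))
        prompt = llm.apply_chat_template(conversation)

        print("Assistant: ", end="", flush=True)
        reply = []
        for token in llm.generate_stream(prompt, g_cfg=GenerationConfig(max_tokens=100)):
            print(token, end="", flush=True)
            reply.append(token)
        conversation.append(ChatMessage(role="assistant", content="".join(reply)))

chat_loop()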
2. VLM (Vision Language Model) Inference
Using MLX-accelerated vision language models for multimodal understanding and generation.
import os
import io
from nexaai.vlm import VLM
from nexaai.common import GenerationConfig, ModelConfig, MultiModalMessage, MultiModalMessageContent

def vlm_example():
    """VLM Inference example"""
    print("=== VLM Inference Example ===")

    # Model configuration
    model_name = "NexaAI/gemma-3n-E2B-it-4bit-MLX"
    plugin_id = "metal"
    max_tokens = 100
    system_message = "You are a helpful assistant that can understand images and text."
    image_path = '/your/image/path'  # Replace with actual image path if available

    print(f"Loading model: {model_name}")
    print(f"Using plugin: {plugin_id}")

    # Check for image existence
    if not (image_path and os.path.exists(image_path)):
        print(f"WARNING: The specified image_path ('{image_path}') does not exist or was not provided. Multimodal prompts will not include image input.")

    # Create model instance
    m_cfg = ModelConfig()
    vlm = VLM.from_(name_or_path=model_name, m_cfg=m_cfg, plugin_id=plugin_id)

    # Create conversation history
    conversation = [MultiModalMessage(role="system",
                                      content=[MultiModalMessageContent(type="text", text=system_message)])]

    # Example multimodal conversations
    test_cases = [
        {
            "text": "What do you see in this image?",
            "image_path": image_path
        }
    ]

    for i, case in enumerate(test_cases, 1):
        print(f"\n--- Multimodal Conversation {i} ---")
        print(f"User: {case['text']}")

        # Build message content
        contents = [MultiModalMessageContent(type="text", text=case['text'])]

        # Add image content if available
        if case['image_path'] and os.path.exists(case['image_path']):
            contents.append(MultiModalMessageContent(type="image", path=case['image_path']))
            print(f"Including image: {case['image_path']}")

        # Add user message
        conversation.append(MultiModalMessage(role="user", content=contents))

        # Apply chat template
        formatted_prompt = vlm.apply_chat_template(conversation)

        # Generate response
        print("Assistant: ", end="", flush=True)
        response_buffer = io.StringIO()

        # Prepare image and audio paths
        image_paths = [case['image_path']] if case['image_path'] and os.path.exists(case['image_path']) else None
        audio_paths = None

        for token in vlm.generate_stream(formatted_prompt,
                                         g_cfg=GenerationConfig(max_tokens=max_tokens,
                                                                image_paths=image_paths,
                                                                audio_paths=audio_paths)):
            print(token, end="", flush=True)
            response_buffer.write(token)

        # Get profiling data
        profiling_data = vlm.get_profiling_data()
        if profiling_data:
            print(f"\nProfiling data: {profiling_data}")

        # Add assistant response to conversation history
        conversation.append(MultiModalMessage(role="assistant",
                                              content=[MultiModalMessageContent(type="text", text=response_buffer.getvalue())]))

        print("\n" + "=" * 50)

vlm_example()
3. Embedder Inference
Using MLX-accelerated embedding models for text vectorization and similarity computation.
import numpy as np
from nexaai.embedder import Embedder, EmbeddingConfig

def embedder_example():
    """Embedder Inference example"""
    print("=== Embedder Inference Example ===")

    # Model configuration
    model_name = "mlx-community/embeddinggemma-300m-bf16"
    plugin_id = "metal"
    batch_size = 2

    print(f"Loading model: {model_name}")
    print(f"Using plugin: {plugin_id}")
    print(f"Batch size: {batch_size}")

    # Create embedder instance
    embedder = Embedder.from_(name_or_path=model_name, plugin_id=plugin_id)
    print('Embedder loaded successfully!')

    # Get embedding dimension
    dim = embedder.get_embedding_dim()
    print(f"Embedding dimension: {dim}")

    # Example texts
    texts = [
        "On-device AI is a type of AI that is processed on the device itself, rather than in the cloud.",
        "Nexa AI allows you to run state-of-the-art AI models locally on CPU, GPU, or NPU.",
        "A ragdoll is a breed of cat that is known for its long, flowing hair and gentle personality.",
        "The capital of France is Paris.",
        "MLX acceleration provides significant performance improvements for AI workloads on Apple Silicon."
    ]
    query = "what is on device AI"

    print(f"\n=== Generating Embeddings ===")
    print(f"Processing {len(texts)} texts...")

    # Generate embeddings
    embeddings = embedder.generate(
        texts=texts,
        config=EmbeddingConfig(batch_size=batch_size)
    )
    print(f"Successfully generated {len(embeddings)} embeddings")

    # Display embedding information
    print(f"\n=== Embedding Details ===")
    for i, (text, embedding) in enumerate(zip(texts, embeddings)):
        print(f"\nText {i + 1}:")
        print(f"  Content: {text}")
        print(f"  Embedding dimension: {len(embedding)}")
        print(f"  First 10 elements: {embedding[:10]}")
        print("-" * 70)

    # Query processing
    print(f"\n=== Query Processing ===")
    print(f"Query: '{query}'")
    query_embedding = embedder.generate(
        texts=[query],
        config=EmbeddingConfig(batch_size=1)
    )[0]
    print(f"Query embedding dimension: {len(query_embedding)}")

    # Similarity analysis
    print(f"\n=== Similarity Analysis (Inner Product) ===")
    similarities = []
    for i, (text, embedding) in enumerate(zip(texts, embeddings)):
        query_vec = np.array(query_embedding)
        text_vec = np.array(embedding)
        inner_product = np.dot(query_vec, text_vec)
        similarities.append((i, text, inner_product))
        print(f"\nText {i + 1}:")
        print(f"  Content: {text}")
        print(f"  Inner product with query: {inner_product:.6f}")
        print("-" * 70)

    # Sort and display most similar texts
    similarities.sort(key=lambda x: x[2], reverse=True)
    print(f"\n=== Similarity Ranking Results ===")
    for rank, (idx, text, score) in enumerate(similarities, 1):
        print(f"Rank {rank}: [{score:.6f}] {text}")

    return embeddings, query_embedding, similarities

embeddings, query_emb, similarities = embedder_example()
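The inner product above is sensitive to vector magnitude. If you want scores bounded in [-1, 1] regardless of embedding norm, cosine similarity is a common alternative; the sketch below uses only NumPy and the embeddings and query_emb values returned by embedder_example():
import numpy as np

def cosine_similarity(a, b):
    # Normalize both vectors so the score depends only on direction
    a = np.asarray(a, dtype=np.float32)
    b = np.asarray(b, dtype=np.float32)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

cosine_scores = [cosine_similarity(query_emb, e) for e in embeddings]
for i, score in enumerate(cosine_scores, 1):
    print(f"Text {i}: cosine similarity = {score:.6f}")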
4. ASR (Automatic Speech Recognition) Inference
Using MLX-accelerated speech recognition models for speech-to-text transcription.
import os
import time
from nexaai.asr import ASR, ASRConfig

def asr_example():
    """ASR Inference example"""
    print("=== ASR Inference Example ===")

    # Model configuration
    model_name = "NexaAI/parakeet-tdt-0.6b-v2-MLX"
    plugin_id = "metal"
    audio_file = r"path/to/audio"  # Replace with actual audio file path

    print(f"Loading model: {model_name}")
    print(f"Using plugin: {plugin_id}")

    # Check if audio file exists
    if not os.path.exists(audio_file):
        print(f"Error: Audio file not found: {audio_file}")
        print("Please provide a valid audio file path to test ASR functionality.")
        return None

    # Create ASR instance
    asr = ASR.from_(name_or_path=model_name, plugin_id=plugin_id)
    print('ASR model loaded successfully!')

    # Basic ASR configuration
    config = ASRConfig(
        timestamps="segment",  # Get segment-level timestamps
        beam_size=5,
        stream=False
    )

    print(f"\n=== Starting Transcription ===")
    start_time = time.time()

    # Perform transcription
    result = asr.transcribe(audio_path=audio_file, language="en", config=config)

    end_time = time.time()
    transcription_time = end_time - start_time

    # Display results
    print(f"\n=== Transcription Results ===")
    print(f"Transcription: {result.transcript}")
    print(f"Processing time: {transcription_time:.2f} seconds")

    # Display segment information if available
    if hasattr(result, 'segments') and result.segments:
        print(f"\nSegments ({len(result.segments)}):")
        for i, segment in enumerate(result.segments[:3]):  # Show first 3 segments
            start_time = segment.get('start', 'N/A')
            end_time = segment.get('end', 'N/A')
            text = segment.get('text', '').strip()
            print(f"  {i + 1}. [{start_time:.2f}s - {end_time:.2f}s] {text}")
        if len(result.segments) > 3:
            print(f"  ... and {len(result.segments) - 3} more segments")

    # Get profiling data
    profiling_data = asr.get_profiling_data()
    if profiling_data:
        print(f"\nProfiling data: {profiling_data}")

    return result

result = asr_example()
5. Reranker Inference
Using MLX-accelerated reranking models for document reranking.
from nexaai.rerank import Reranker, RerankConfig

def reranker_example():
    """Reranker Inference example"""
    print("=== Reranker Inference Example ===")

    # Model configuration
    model_name = "NexaAI/jina-v2-rerank-mlx"
    plugin_id = "metal"
    batch_size = 4

    print(f"Loading model: {model_name}")
    print(f"Using plugin: {plugin_id}")
    print(f"Batch size: {batch_size}")

    # Create reranker instance
    reranker = Reranker.from_(name_or_path=model_name, plugin_id=plugin_id)
    print('Reranker loaded successfully!')

    # Example queries and documents
    queries = [
        "Where is on-device AI?",
        "What is MLX acceleration?",
        "How does machine learning work?",
        "Tell me about computer vision"
    ]
    documents = [
        "On-device AI is a type of AI that is processed on the device itself, rather than in the cloud.",
        "MLX acceleration provides significant performance improvements for AI workloads on Apple Silicon.",
        "Edge computing brings computation and data storage closer to the sources of data.",
        "A ragdoll is a breed of cat that is known for its long, flowing hair and gentle personality.",
        "The capital of France is Paris, a beautiful city known for its art and culture.",
        "Machine learning is a subset of artificial intelligence that enables computers to learn without being explicitly programmed.",
        "Computer vision is a field of artificial intelligence that trains computers to interpret and understand visual information.",
        "Deep learning uses neural networks with multiple layers to model and understand complex patterns in data."
    ]

    print(f"\n=== Document Reranking Test ===")
    print(f"Number of documents: {len(documents)}")

    # Rerank for each query
    for i, query in enumerate(queries, 1):
        print(f"\n--- Query {i} ---")
        print(f"Query: '{query}'")
        print("-" * 50)

        # Perform reranking
        scores = reranker.rerank(
            query=query,
            documents=documents,
            config=RerankConfig(batch_size=batch_size)
        )

        # Create (document, score) pairs and sort
        doc_scores = list(zip(documents, scores))
        doc_scores.sort(key=lambda x: x[1], reverse=True)

        # Display ranking results
        print("Reranking results:")
        for rank, (doc, score) in enumerate(doc_scores, 1):
            print(f"  {rank:2d}. [{score:.4f}] {doc}")

        # Display most relevant documents
        print(f"\nMost relevant documents (top 3):")
        for rank, (doc, score) in enumerate(doc_scores[:3], 1):
            print(f"  {rank}. {doc}")

        print("=" * 80)

    return reranker

reranker = reranker_example()
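In a retrieval pipeline, the embedder and reranker are typically combined: embeddings pre-select a small candidate set cheaply, and the reranker re-scores only those candidates. A minimal sketch is shown below; it assumes embedder and reranker instances created as in sections 3 and 5, and the helper name retrieve_then_rerank is hypothetical:
import numpy as np
from nexaai.embedder import EmbeddingConfig
from nexaai.rerank import RerankConfig

def retrieve_then_rerank(embedder, reranker, query, documents, top_k=4):
    # Stage 1: cheap candidate selection via embeddings (inner product, as in section 3)
    doc_embs = embedder.generate(texts=documents, config=EmbeddingConfig(batch_size=4))
    q_emb = np.asarray(embedder.generate(texts=[query], config=EmbeddingConfig(batch_size=1))[0])
    scores = [float(np.dot(q_emb, np.asarray(d))) for d in doc_embs]
    candidates = [documents[i] for i in np.argsort(scores)[::-1][:top_k]]

    # Stage 2: precise re-scoring of the small candidate set
    rerank_scores = reranker.rerank(query=query, documents=candidates,
                                    config=RerankConfig(batch_size=4))
    return sorted(zip(candidates, rerank_scores), key=lambda x: x[1], reverse=True)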
6. Computer Vision (CV) Inference
Using MLX-accelerated computer vision tasks (e.g., OCR/text recognition) on images.
import os
from nexaai.cv import CVCapabilities, CVModel, CVModelConfig

def cv_ocr_example():
    """CV OCR Inference example"""
    print("=== CV OCR Inference Example ===")

    # Model configuration
    model_name = "NexaAI/paddleocr-mlx"
    image_path = r"path/to/image"  # Replace with actual image file path

    # Check if image file exists
    if not os.path.exists(image_path):
        print(f"Error: Image file not found: {image_path}")
        print("Please provide a valid image file path to test CV functionality.")
        return None

    print(f"Loading model: {model_name}")
    config = CVModelConfig(capabilities=CVCapabilities.OCR)
    cv = CVModel.from_(name_or_path=model_name, config=config, plugin_id='metal')
    print("OCR model loaded successfully!")

    print(f"\n=== Processing Image ===")
    results = cv.infer(image_path)

    print(f"\n=== OCR Results ===")
    print(f"Number of text regions detected: {results.result_count}")
    for i, result in enumerate(results.results, 1):
        print(f"{i}. [{result.confidence:.2f}] {result.text}")

cv_ocr_example()
Next Steps
- Explore the API Reference for comprehensive documentation
- Check out the Windows x64 Guide for cross-platform development
- Visit the Windows ARM64 Guide for NPU optimization