NexaAI Windows x64 Setup Guide
This guide demonstrates how to use the NexaAI SDK for various AI inference tasks on Windows x64, including:
- LLM (Large Language Model): Text generation and conversation
- VLM (Vision Language Model): Multimodal understanding and generation
- Embedder: Text vectorization and similarity computation
- Reranker: Document reranking
- ASR (Automatic Speech Recognition): Speech-to-text transcription
- CV (Computer Vision): OCR/text recognition
Prerequisites
1. Install the correct Python version
NexaAI requires Python 3.10 on Windows x64. Verify the installation:
python -c "import sys, platform; print(f'Python version: {sys.version}')"
Python version: 3.10.11 (tags/v3.10.11:7d4cc5a, Apr 5 2023, 00:38:17) [MSC v.1929 64 bit (AMD64)]
The expected output must contain version 3.10.x and architecture AMD64.
Here are suggested ways to install Python 3.10:
Using Anaconda
conda create -n nexaai python=3.10
conda activate nexaai
After installation, verify with python3.10:
python3.10 --version
2. Create and activate a virtual environment
python -m venv nexaai-env
nexaai-env\Scripts\activate
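To confirm the virtual environment is active, you can run a quick standard-library check with the `python` from the activated environment (a minimal sketch, not part of the SDK):

```python
import sys

# In an active venv, sys.prefix points to the environment, not the base interpreter.
print("Virtual environment active:", sys.prefix != sys.base_prefix)
print("Environment location:", sys.prefix)
```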
3. Install the NexaAI SDK
pip install nexaai
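After installation, you can sanity-check that the package imports from the environment you just created (a minimal check; it does not load any models):

```python
# If this raises ImportError, the SDK is not installed in the active environment.
import nexaai
print("nexaai imported successfully")
```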
4. Verify Your Environment
Run the following code to ensure you have the right environment:
import sys
import platform

current_ver = sys.version_info
arch = platform.machine()

if current_ver.major != 3 or current_ver.minor != 10:
    print(f"❌ Error: Python {current_ver.major}.{current_ver.minor} detected")
    print("✅ Required: Python 3.10")
    print("Please install Python 3.10 and restart the kernel.")
    sys.exit(1)
elif arch != "AMD64":
    # The Windows x64 build requires a 64-bit (AMD64) Python interpreter.
    print(f"❌ Error: {arch} architecture detected; AMD64 is required.")
    sys.exit(1)
else:
    print("✅ Python 3.10 ready to proceed!")
Authentication Setup
Before running any examples, you need to set up your NexaAI authentication token from https://sdk.nexa.ai/.
Set Token in Environment
Replace"YOUR_NEXA_TOKEN_HERE" with your actual NexaAI token:
set NEXA_TOKEN=YOUR_NEXA_TOKEN_HERE
Verify the Token is Set
echo %NEXA_TOKEN%
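You can also confirm the token from Python before loading any models (a minimal sketch using only the standard library; the key/ prefix check mirrors the note below):

```python
import os

token = os.environ.get("NEXA_TOKEN", "")
if token.startswith("key/"):
    print("NEXA_TOKEN is set and has the expected key/ prefix.")
else:
    print("NEXA_TOKEN is missing or malformed; set it before running the examples.")
```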
Your NEXA_TOKEN must start with key/. If it doesn't, please check your token from the dashboard.
1. LLM (Large Language Model) Inference
Using CPU/GPU-accelerated large language models for text generation and conversation.
import io
import os
from nexaai.common import GenerationConfig, ModelConfig, ChatMessage
from nexaai.llm import LLM

def llm_example():
    """LLM Inference example"""
    print("=== LLM Inference Example ===")

    # Model configuration
    model_name = "Qwen/Qwen3-1.7B-GGUF/Qwen3-1.7B-Q8_0.gguf"
    plugin_id = "cpu_gpu"
    max_tokens = 100
    system_message = "You are a helpful assistant."

    print(f"Loading model: {model_name}")
    print(f"Using plugin: {plugin_id}")

    # Create model instance
    m_cfg = ModelConfig()
    llm = LLM.from_(name_or_path=model_name, m_cfg=m_cfg, plugin_id=plugin_id)

    # Create conversation history
    conversation = [ChatMessage(role="system", content=system_message)]

    # Example conversations
    test_prompts = [
        "What is artificial intelligence?",
        "Explain the benefits of on-device AI processing.",
        "How does CPU/GPU acceleration work?"
    ]

    for i, prompt in enumerate(test_prompts, 1):
        print(f"\n--- Conversation {i} ---")
        print(f"User: {prompt}")

        # Add user message
        conversation.append(ChatMessage(role="user", content=prompt))

        # Apply chat template
        formatted_prompt = llm.apply_chat_template(conversation)

        # Generate response
        print("Assistant: ", end="", flush=True)
        response_buffer = io.StringIO()
        for token in llm.generate_stream(formatted_prompt, g_cfg=GenerationConfig(max_tokens=max_tokens)):
            print(token, end="", flush=True)
            response_buffer.write(token)

        # Get profiling data
        profiling_data = llm.get_profiling_data()
        if profiling_data:
            print(f"\nProfiling data: {profiling_data}")

        # Add assistant response to conversation history
        conversation.append(ChatMessage(role="assistant", content=response_buffer.getvalue()))
        print("\n" + "=" * 50)

llm_example()
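For longer sessions, the conversation list above grows with every turn. One simple way to bound prompt size is to keep the system message plus only the most recent exchanges before calling apply_chat_template. The sketch below assumes ChatMessage exposes the role attribute it was constructed with, and max_turns is an arbitrary illustration, not an SDK parameter:

```python
def trim_conversation(conversation, max_turns=3):
    """Keep the system message plus the last `max_turns` user/assistant pairs."""
    system_messages = [m for m in conversation if m.role == "system"]
    others = [m for m in conversation if m.role != "system"]
    return system_messages + others[-2 * max_turns:]

# Usage inside the loop above, before applying the chat template:
# conversation = trim_conversation(conversation)
# formatted_prompt = llm.apply_chat_template(conversation)
```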
2. VLM (Vision Language Model) Inference
Using CPU/GPU-accelerated vision language models for multimodal understanding and generation.
import os
import io
from nexaai.vlm import VLM
from nexaai.common import GenerationConfig, ModelConfig, MultiModalMessage, MultiModalMessageContent

def vlm_example():
    """VLM Inference example"""
    print("=== VLM Inference Example ===")

    # Model configuration
    model_name = "ggml-org/gemma-3-4b-it-GGUF/gemma-3-4b-it-Q4_K_M.gguf"
    plugin_id = "cpu_gpu"
    max_tokens = 100
    system_message = "You are a helpful assistant that can understand images and text."
    image_path = r'path\to\image.jpg'  # Replace with actual image path if available

    print(f"Loading model: {model_name}")
    print(f"Using plugin: {plugin_id}")

    # Check for image existence
    if not (image_path and os.path.exists(image_path)):
        print(f"WARNING: The specified image_path ('{image_path}') does not exist or was not provided. Multimodal prompts will not include image input.")

    # Create model instance
    m_cfg = ModelConfig()
    vlm = VLM.from_(name_or_path=model_name, m_cfg=m_cfg, plugin_id=plugin_id)

    # Create conversation history
    conversation = [MultiModalMessage(role="system",
                                      content=[MultiModalMessageContent(type="text", text=system_message)])]

    # Example multimodal conversations
    test_cases = [
        {
            "text": "What do you see in this image?",
            "image_path": image_path
        }
    ]

    for i, case in enumerate(test_cases, 1):
        print(f"\n--- Multimodal Conversation {i} ---")
        print(f"User: {case['text']}")

        # Build message content
        contents = [MultiModalMessageContent(type="text", text=case['text'])]

        # Add image content if available
        if case['image_path'] and os.path.exists(case['image_path']):
            contents.append(MultiModalMessageContent(type="image", path=case['image_path']))
            print(f"Including image: {case['image_path']}")

        # Add user message
        conversation.append(MultiModalMessage(role="user", content=contents))

        # Apply chat template
        formatted_prompt = vlm.apply_chat_template(conversation)

        # Generate response
        print("Assistant: ", end="", flush=True)
        response_buffer = io.StringIO()

        # Prepare image and audio paths
        image_paths = [case['image_path']] if case['image_path'] and os.path.exists(case['image_path']) else None
        audio_paths = None

        for token in vlm.generate_stream(formatted_prompt,
                                         g_cfg=GenerationConfig(max_tokens=max_tokens,
                                                                image_paths=image_paths,
                                                                audio_paths=audio_paths)):
            print(token, end="", flush=True)
            response_buffer.write(token)

        # Get profiling data
        profiling_data = vlm.get_profiling_data()
        if profiling_data:
            print(f"\nProfiling data: {profiling_data}")

        # Add assistant response to conversation history
        conversation.append(MultiModalMessage(role="assistant",
                                              content=[MultiModalMessageContent(type="text", text=response_buffer.getvalue())]))
        print("\n" + "=" * 50)

vlm_example()
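The message structure above can also carry more than one image per turn: the plural image_paths parameter and the list of MultiModalMessageContent entries suggest multi-image prompts are supported, though you should confirm against the API Reference. A minimal sketch reusing the loaded vlm instance from the example (the two file paths are placeholders):

```python
from nexaai.common import GenerationConfig, MultiModalMessage, MultiModalMessageContent

image_a = r"path\to\first.jpg"   # placeholder path; replace with a real file
image_b = r"path\to\second.jpg"  # placeholder path; replace with a real file

contents = [
    MultiModalMessageContent(type="text", text="What differs between these two images?"),
    MultiModalMessageContent(type="image", path=image_a),
    MultiModalMessageContent(type="image", path=image_b),
]
conversation = [MultiModalMessage(role="user", content=contents)]
formatted_prompt = vlm.apply_chat_template(conversation)

for token in vlm.generate_stream(formatted_prompt,
                                 g_cfg=GenerationConfig(max_tokens=100,
                                                        image_paths=[image_a, image_b])):
    print(token, end="", flush=True)
```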
3. Embedder Inference
Using CPU/GPU-accelerated embedding models for text vectorization and similarity computation.
import numpy as np
from nexaai.embedder import Embedder, EmbeddingConfig

def embedder_example():
    """Embedder Inference example"""
    print("=== Embedder Inference Example ===")

    # Model configuration
    model_name = "NexaAI/jina-v2-fp16-mlx"
    plugin_id = "cpu_gpu"
    batch_size = 2

    print(f"Loading model: {model_name}")
    print(f"Using plugin: {plugin_id}")
    print(f"Batch size: {batch_size}")

    # Create embedder instance
    embedder = Embedder.from_(name_or_path=model_name, plugin_id=plugin_id)
    print('Embedder loaded successfully!')

    # Get embedding dimension
    dim = embedder.get_embedding_dim()
    print(f"Embedding dimension: {dim}")

    # Example texts
    texts = [
        "On-device AI is a type of AI that is processed on the device itself, rather than in the cloud.",
        "Nexa AI allows you to run state-of-the-art AI models locally on CPU, GPU, or NPU.",
        "A ragdoll is a breed of cat that is known for its long, flowing hair and gentle personality.",
        "The capital of France is Paris.",
        "CPU/GPU acceleration provides significant performance improvements for AI workloads."
    ]
    query = "what is on device AI"

    print(f"\n=== Generating Embeddings ===")
    print(f"Processing {len(texts)} texts...")

    # Generate embeddings
    embeddings = embedder.generate(
        texts=texts,
        config=EmbeddingConfig(batch_size=batch_size)
    )
    print(f"Successfully generated {len(embeddings)} embeddings")

    # Display embedding information
    print(f"\n=== Embedding Details ===")
    for i, (text, embedding) in enumerate(zip(texts, embeddings)):
        print(f"\nText {i + 1}:")
        print(f"  Content: {text}")
        print(f"  Embedding dimension: {len(embedding)}")
        print(f"  First 10 elements: {embedding[:10]}")
        print("-" * 70)

    # Query processing
    print(f"\n=== Query Processing ===")
    print(f"Query: '{query}'")
    query_embedding = embedder.generate(
        texts=[query],
        config=EmbeddingConfig(batch_size=1)
    )[0]
    print(f"Query embedding dimension: {len(query_embedding)}")

    # Similarity analysis
    print(f"\n=== Similarity Analysis (Inner Product) ===")
    similarities = []
    for i, (text, embedding) in enumerate(zip(texts, embeddings)):
        query_vec = np.array(query_embedding)
        text_vec = np.array(embedding)
        inner_product = np.dot(query_vec, text_vec)
        similarities.append((i, text, inner_product))
        print(f"\nText {i + 1}:")
        print(f"  Content: {text}")
        print(f"  Inner product with query: {inner_product:.6f}")
        print("-" * 70)

    # Sort and display most similar texts
    similarities.sort(key=lambda x: x[2], reverse=True)
    print(f"\n=== Similarity Ranking Results ===")
    for rank, (idx, text, score) in enumerate(similarities, 1):
        print(f"Rank {rank}: [{score:.6f}] {text}")

    return embeddings, query_embedding, similarities

embeddings, query_emb, similarities = embedder_example()
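The ranking above uses raw inner products. If you prefer scores normalized to the [-1, 1] range, you can compute cosine similarity from the same vectors without any additional SDK calls (a minimal NumPy sketch reusing the values returned by embedder_example()):

```python
import numpy as np

def cosine_similarity(a, b):
    """Cosine similarity between two embedding vectors."""
    a, b = np.asarray(a), np.asarray(b)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# Pair each text with its original embedding via the index stored in `similarities`.
cosine_ranked = sorted(
    ((text, cosine_similarity(query_emb, embeddings[idx])) for idx, text, _ in similarities),
    key=lambda pair: pair[1],
    reverse=True,
)
for text, score in cosine_ranked:
    print(f"[{score:.4f}] {text}")
```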
4. ASR (Automatic Speech Recognition) Inference
Using CPU/GPU-accelerated speech recognition models for speech-to-text transcription.
import os
import time
from nexaai.asr import ASR, ASRConfig

def asr_example():
    """ASR Inference example"""
    print("=== ASR Inference Example ===")

    # Model configuration
    model_name = "NexaAI/parakeet-tdt-0.6b-v2"
    plugin_id = "cpu_gpu"
    audio_file = r"path\to\audio.wav"  # Replace with actual audio file path

    print(f"Loading model: {model_name}")
    print(f"Using plugin: {plugin_id}")

    # Check if audio file exists
    if not os.path.exists(audio_file):
        print(f"Error: Audio file not found: {audio_file}")
        print("Please provide a valid audio file path to test ASR functionality.")
        return None

    # Create ASR instance
    asr = ASR.from_(name_or_path=model_name, plugin_id=plugin_id)
    print('ASR model loaded successfully!')

    # Basic ASR configuration
    config = ASRConfig(
        timestamps="segment",  # Get segment-level timestamps
        beam_size=5,
        stream=False
    )

    print(f"\n=== Starting Transcription ===")
    start_time = time.time()

    # Perform transcription
    result = asr.transcribe(audio_path=audio_file, language="en", config=config)

    end_time = time.time()
    transcription_time = end_time - start_time

    # Display results
    print(f"\n=== Transcription Results ===")
    print(f"Transcription: {result.transcript}")
    print(f"Processing time: {transcription_time:.2f} seconds")

    # Display segment information if available
    if hasattr(result, 'segments') and result.segments:
        print(f"\nSegments ({len(result.segments)}):")
        for i, segment in enumerate(result.segments[:3]):  # Show first 3 segments
            seg_start = segment.get('start')
            seg_end = segment.get('end')
            text = segment.get('text', '').strip()
            # Only apply numeric formatting when timestamps are present
            start_str = f"{seg_start:.2f}s" if isinstance(seg_start, (int, float)) else "N/A"
            end_str = f"{seg_end:.2f}s" if isinstance(seg_end, (int, float)) else "N/A"
            print(f"  {i + 1}. [{start_str} - {end_str}] {text}")
        if len(result.segments) > 3:
            print(f"  ... and {len(result.segments) - 3} more segments")

    # Get profiling data
    profiling_data = asr.get_profiling_data()
    if profiling_data:
        print(f"\nProfiling data: {profiling_data}")

    return result

result = asr_example()
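To transcribe several recordings in one session, you can load the model once and loop over the files. A minimal sketch reusing the same transcribe call as above; the audio_dir folder is a placeholder you would replace with your own:

```python
import os
from nexaai.asr import ASR, ASRConfig

audio_dir = r"path\to\audio_folder"  # placeholder folder of .wav files
asr = ASR.from_(name_or_path="NexaAI/parakeet-tdt-0.6b-v2", plugin_id="cpu_gpu")
config = ASRConfig(timestamps="segment", beam_size=5, stream=False)

for filename in sorted(os.listdir(audio_dir)):
    if not filename.lower().endswith(".wav"):
        continue  # skip non-audio files
    path = os.path.join(audio_dir, filename)
    result = asr.transcribe(audio_path=path, language="en", config=config)
    print(f"{filename}: {result.transcript}")
```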
5. Reranker Inference
Using CPU/GPU-accelerated reranking models for document reranking.
from nexaai.rerank import Reranker, RerankConfig

def reranker_example():
    """Reranker Inference example"""
    print("=== Reranker Inference Example ===")

    # Model configuration
    model_name = "NexaAI/jina-v2-rerank-mlx"
    plugin_id = "cpu_gpu"
    batch_size = 4

    print(f"Loading model: {model_name}")
    print(f"Using plugin: {plugin_id}")
    print(f"Batch size: {batch_size}")

    # Create reranker instance
    reranker = Reranker.from_(name_or_path=model_name, plugin_id=plugin_id)
    print('Reranker loaded successfully!')

    # Example queries and documents
    queries = [
        "Where is on-device AI?",
        "What is CPU/GPU acceleration?",
        "How does machine learning work?",
        "Tell me about computer vision"
    ]
    documents = [
        "On-device AI is a type of AI that is processed on the device itself, rather than in the cloud.",
        "CPU/GPU acceleration provides significant performance improvements for AI workloads on traditional hardware.",
        "Edge computing brings computation and data storage closer to the sources of data.",
        "A ragdoll is a breed of cat that is known for its long, flowing hair and gentle personality.",
        "The capital of France is Paris, a beautiful city known for its art and culture.",
        "Machine learning is a subset of artificial intelligence that enables computers to learn without being explicitly programmed.",
        "Computer vision is a field of artificial intelligence that trains computers to interpret and understand visual information.",
        "Deep learning uses neural networks with multiple layers to model and understand complex patterns in data."
    ]

    print(f"\n=== Document Reranking Test ===")
    print(f"Number of documents: {len(documents)}")

    # Rerank for each query
    for i, query in enumerate(queries, 1):
        print(f"\n--- Query {i} ---")
        print(f"Query: '{query}'")
        print("-" * 50)

        # Perform reranking
        scores = reranker.rerank(
            query=query,
            documents=documents,
            config=RerankConfig(batch_size=batch_size)
        )

        # Create (document, score) pairs and sort
        doc_scores = list(zip(documents, scores))
        doc_scores.sort(key=lambda x: x[1], reverse=True)

        # Display ranking results
        print("Reranking results:")
        for rank, (doc, score) in enumerate(doc_scores, 1):
            print(f"  {rank:2d}. [{score:.4f}] {doc}")

        # Display most relevant documents
        print(f"\nMost relevant documents (top 3):")
        for rank, (doc, score) in enumerate(doc_scores[:3], 1):
            print(f"  {rank}. {doc}")
        print("=" * 80)

    return reranker

reranker = reranker_example()
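In a retrieval pipeline, the embedder and reranker are often combined: the embedder does a fast first pass over many documents and the reranker rescores only the top candidates. The sketch below reuses the Embedder and Reranker APIs shown in sections 3 and 5; the documents, model names, and top_k value are illustrative:

```python
import numpy as np
from nexaai.embedder import Embedder, EmbeddingConfig
from nexaai.rerank import Reranker, RerankConfig

query = "What is on-device AI?"
documents = [
    "On-device AI runs models directly on local hardware instead of in the cloud.",
    "Paris is the capital of France.",
    "CPU/GPU acceleration speeds up AI workloads on consumer machines.",
]

# Stage 1: embed query and documents, keep the top_k by inner product
embedder = Embedder.from_(name_or_path="NexaAI/jina-v2-fp16-mlx", plugin_id="cpu_gpu")
doc_vecs = embedder.generate(texts=documents, config=EmbeddingConfig(batch_size=2))
query_vec = embedder.generate(texts=[query], config=EmbeddingConfig(batch_size=1))[0]
scores = [float(np.dot(np.array(query_vec), np.array(v))) for v in doc_vecs]
top_k = 2
candidates = [documents[i] for i in np.argsort(scores)[::-1][:top_k]]

# Stage 2: rerank only the retrieved candidates
reranker = Reranker.from_(name_or_path="NexaAI/jina-v2-rerank-mlx", plugin_id="cpu_gpu")
rerank_scores = reranker.rerank(query=query, documents=candidates, config=RerankConfig(batch_size=2))
for doc, score in sorted(zip(candidates, rerank_scores), key=lambda x: x[1], reverse=True):
    print(f"[{score:.4f}] {doc}")
```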
6. Computer Vision (CV) Inference
Using CPU/GPU-accelerated computer vision tasks (e.g., OCR/text recognition) on images.
import os
from nexaai.cv import CVCapabilities, CVModel, CVModelConfig

def cv_ocr_example():
    """CV OCR Inference example"""
    print("=== CV OCR Inference Example ===")

    # Model configuration
    model_name = "NexaAI/paddleocr-mlx"
    image_path = r"path\to\image.jpg"  # Replace with actual image file path

    # Check if image file exists
    if not os.path.exists(image_path):
        print(f"Error: Image file not found: {image_path}")
        print("Please provide a valid image file path to test CV functionality.")
        return None

    print(f"Loading model: {model_name}")
    config = CVModelConfig(capabilities=CVCapabilities.OCR)
    cv = CVModel.from_(name_or_path=model_name, config=config, plugin_id='cpu_gpu')
    print("OCR model loaded successfully!")

    print(f"\n=== Processing Image ===")
    results = cv.infer(image_path)

    print(f"\n=== OCR Results ===")
    print(f"Number of text regions detected: {results.result_count}")
    for i, result in enumerate(results.results, 1):
        print(f"{i}. [{result.confidence:.2f}] {result.text}")

cv_ocr_example()
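If you want to keep the recognized text, you can write the same fields the loop above prints (result.text and result.confidence) to a file. A minimal sketch; the helper name and output filename are arbitrary, and it expects the object returned by cv.infer():

```python
def save_ocr_results(results, output_path="ocr_results.txt"):
    """Write each detected text region and its confidence to a plain-text file."""
    with open(output_path, "w", encoding="utf-8") as f:
        for i, r in enumerate(results.results, 1):
            f.write(f"{i}\t{r.confidence:.2f}\t{r.text}\n")
    print(f"Saved {results.result_count} text regions to {output_path}")

# Usage: call cv.infer(image_path) as in the example, then pass its return value here.
# save_ocr_results(cv.infer(image_path))
```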
Next Steps
- Explore the API Reference for comprehensive documentation
- Check out the macOS Guide for Apple Silicon optimization
- Visit the Windows ARM64 Guide for NPU acceleration