NexaAI Windows x64 Setup Guide

This guide demonstrates how to use the NexaAI SDK for various AI inference tasks on Windows x64, including:
  • LLM (Large Language Model): Text generation and conversation
  • VLM (Vision Language Model): Multimodal understanding and generation
  • Embedder: Text vectorization and similarity computation
  • Reranker: Document reranking
  • ASR (Automatic Speech Recognition): Speech-to-text transcription
  • CV (Computer Vision): OCR/text recognition
  • TTS (Text-to-Speech): Speech synthesis from text
  • ImageGen: Image generation from text prompts
  • Diarize: Speaker diarization

Prerequisites

1. Install the correct Python version

NexaAI requires Python 3.10 on Windows x64. Verify the installation:
python -c "import sys, platform; print(f'Python version: {sys.version}')"
Your output should look like:
Python version: 3.10.11 (tags/v3.10.11:7d4cc5a, Apr 5 2023, 00:38:17) [MSC v.1929 64 bit (AMD64)]
The output must show version 3.10.x and architecture AMD64. If you still need to install Python 3.10, one suggested way is Anaconda:
conda create -n nexaai python=3.10
conda activate nexaai
After installation, you may need to invoke Python 3.10 explicitly, for example through the Windows py launcher:
py -3.10 --version

2. Create and activate a virtual environment

python -m venv nexaai-env
nexaai-env\Scripts\activate

3. Install the NexaAI SDK

pip install nexaai
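
To confirm the package was installed into the active environment, you can ask pip for its metadata:
pip show nexaai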

4. Verify Your Environment

Run the following code to ensure you have the right environment:
import sys
import platform

current_ver = sys.version_info
arch = platform.machine()

if current_ver.major != 3 or current_ver.minor != 10:
    print(f"❌ Error: Python {current_ver.major}.{current_ver.minor} detected")
    print("✅ Required: Python 3.10")
    print("Please install Python 3.10 and restart the kernel.")
    sys.exit(1)
elif arch != "AMD64":
    print(f"❌ Error: {arch} architecture detected")
    print("✅ Required: AMD64 (x64)")
    print("Please use a 64-bit Python build and restart the kernel.")
    sys.exit(1)
else:
    print("✅ Python 3.10 on AMD64, ready to proceed!")

Authentication Setup

Before running any examples, you need to set up your NexaAI authentication token from https://sdk.nexa.ai/.

Set Token in Environment

Replace "YOUR_NEXA_TOKEN_HERE" with your actual NexaAI token:
set NEXA_TOKEN=YOUR_NEXA_TOKEN_HERE

Verify the Token is Set

echo %NEXA_TOKEN%
Your NEXA_TOKEN must start with key/. If it doesn’t, please check your token from the dashboard.
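
You can also check the token from Python (for example, in a notebook cell). This is a minimal sketch using only the standard library:
import os

# Note: `set` only affects the current terminal session, so a Python process
# (or notebook kernel) started before setting the variable will not see it.
token = os.environ.get("NEXA_TOKEN", "")
if not token:
    print("NEXA_TOKEN is not set in this environment.")
elif not token.startswith("key/"):
    print("NEXA_TOKEN is set but does not start with 'key/'; re-check it in the dashboard.")
else:
    print("NEXA_TOKEN looks good.")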

1. LLM (Large Language Model) Inference

Using CPU/GPU-accelerated large language models for text generation and conversation.
import io
import logging
from nexaai import LLM, GenerationConfig, ModelConfig, LlmChatMessage, setup_logging

setup_logging(level=logging.DEBUG)


def llm_example():
    """LLM Inference example"""
    print("=== LLM Inference Example ===")

    # Model configuration
    model_name = "NexaAI/Qwen3-0.6B-GGUF"
    max_tokens = 128
    system_message = "You are a helpful assistant."

    print(f"Loading model: {model_name}")

    # Create model instance
    config = ModelConfig()
    llm = LLM.from_(model=model_name, config=config)

    # Create conversation history
    conversation = [LlmChatMessage(role="system", content=system_message)]

    # Example conversations
    test_prompts = [
        "What is artificial intelligence?",
        "Explain the benefits of on-device AI processing.",
        "How does CPU/GPU acceleration work?"
    ]

    for i, prompt in enumerate(test_prompts, 1):
        print(f"\n--- Conversation {i} ---")
        print(f"User: {prompt}")

        # Add user message
        conversation.append(LlmChatMessage(role="user", content=prompt))

        # Apply chat template
        formatted_prompt = llm.apply_chat_template(conversation)

        # Generate response
        print("Assistant: ", end="", flush=True)
        response_buffer = io.StringIO()

        gen = llm.generate_stream(formatted_prompt, GenerationConfig(max_tokens=max_tokens))
        result = None
        try:
            while True:
                token = next(gen)
                print(token, end="", flush=True)
                response_buffer.write(token)
        except StopIteration as e:
            result = e.value

        # Get profiling data
        if result and hasattr(result, 'profile_data') and result.profile_data:
            print(f"\n{result.profile_data}")

        # Add assistant response to conversation history
        conversation.append(LlmChatMessage(role="assistant", content=response_buffer.getvalue()))
        print("\n" + "=" * 50)


llm_example()
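
Since the same streaming loop runs for every prompt, it can be convenient to wrap it in a small helper that returns the full response text. This sketch reuses only the calls shown above (apply_chat_template, generate_stream, GenerationConfig); the helper name generate_reply is just for illustration:
import io
from nexaai import GenerationConfig


def generate_reply(llm, conversation, max_tokens=128):
    """Run one streamed generation, echo tokens, and return the full response text."""
    formatted_prompt = llm.apply_chat_template(conversation)
    buffer = io.StringIO()
    gen = llm.generate_stream(formatted_prompt, GenerationConfig(max_tokens=max_tokens))
    try:
        while True:
            token = next(gen)
            print(token, end="", flush=True)
            buffer.write(token)
    except StopIteration:
        pass
    return buffer.getvalue()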

2. VLM (Vision Language Model) Inference

Using CPU/GPU-accelerated vision language models for multimodal understanding and generation.
import os
import io
import logging
from nexaai import VLM, GenerationConfig, ModelConfig, VlmChatMessage, VlmContent, setup_logging

setup_logging(level=logging.DEBUG)


def vlm_example():
    """VLM Inference example"""
    print("=== VLM Inference Example ===")

    # Model configuration
    model_name = "~/.cache/nexa.ai/nexa_sdk/models/NexaAI/gemma-3n-E4B-it-4bit-MLX/model-00001-of-00002.safetensors"
    max_tokens = 128
    system_message = "You are a helpful assistant that can understand images and text."
    image_path = r'path\to\image.jpg'  # Replace with actual image path if available

    print(f"Loading model: {model_name}")

    # Check for image existence
    if not (image_path and os.path.exists(image_path)):
        print(f"WARNING: The specified image_path ('{image_path}') does not exist or was not provided. Multimodal prompts will not include image input.")

    # Create model instance
    config = ModelConfig()
    vlm = VLM.from_(model=model_name, config=config)

    # Create conversation history
    conversation = [VlmChatMessage(role="system",
                                   contents=[VlmContent(type="text", text=system_message)])]

    # Example multimodal conversations
    test_cases = [
        {
            "text": "What do you see in this image?",
            "image_path": image_path
        }
    ]

    for i, case in enumerate(test_cases, 1):
        print(f"\n--- Multimodal Conversation {i} ---")
        print(f"User: {case['text']}")

        # Build message content
        contents = [VlmContent(type="text", text=case['text'])]

        # Add image content if available
        if case['image_path'] and os.path.exists(case['image_path']):
            contents.append(VlmContent(type="image", text=case['image_path']))
            print(f"Including image: {case['image_path']}")

        # Add user message
        conversation.append(VlmChatMessage(role="user", contents=contents))

        # Apply chat template
        formatted_prompt = vlm.apply_chat_template(conversation)

        # Generate response
        print("Assistant: ", end="", flush=True)
        response_buffer = io.StringIO()

        # Prepare image and audio paths
        image_paths = [case['image_path']] if case['image_path'] and os.path.exists(case['image_path']) else None
        audio_paths = None

        gen = vlm.generate_stream(formatted_prompt,
                                  GenerationConfig(max_tokens=max_tokens, image_paths=image_paths, audio_paths=audio_paths))
        result = None
        try:
            while True:
                token = next(gen)
                print(token, end="", flush=True)
                response_buffer.write(token)
        except StopIteration as e:
            result = e.value

        # Get profiling data
        if result and hasattr(result, 'profile_data') and result.profile_data:
            print(f"\n{result.profile_data}")

        # Add assistant response to conversation history
        conversation.append(VlmChatMessage(role="assistant",
                                           contents=[VlmContent(type="text", text=response_buffer.getvalue())]))
        print("\n" + "=" * 50)


vlm_example()

3. Embedder Inference

Using CPU/GPU-accelerated embedding models for text vectorization and similarity computation.
import logging
from nexaai import Embedder, setup_logging

setup_logging(level=logging.DEBUG)


def embedder_example():
    """Embedder Inference example"""
    print("=== Embedder Inference Example ===")

    # Model configuration
    model_name = "~/.cache/nexa.ai/nexa_sdk/models/NexaAI/jina-v2-fp16-mlx/model.safetensors"
    batch_size = None  # Use default or len(texts)

    print(f"Loading model: {model_name}")

    # Create embedder instance
    embedder = Embedder.from_(model=model_name)
    print('Embedder loaded successfully!')

    # Get embedding dimension
    dim = embedder.embedding_dim()
    print(f"Embedding dimension: {dim}")

    # Example texts
    texts = [
        "On-device AI is a type of AI that is processed on the device itself, rather than in the cloud.",
        "Nexa AI allows you to run state-of-the-art AI models locally on CPU, GPU, or NPU — from instant use cases to production deployments.",
        "A ragdoll is a breed of cat that is known for its long, flowing hair and gentle personality.",
        "The capital of France is Paris.",
    ]

    query = "what is on device AI"

    print(f"\n=== Generating Embeddings ===")
    print(f"Processing {len(texts)} texts...")

    # Generate embeddings
    result = embedder.embed(texts=texts, batch_size=batch_size or len(texts))
    embeddings = result.embeddings

    print(f"Successfully generated {len(embeddings)} embeddings")

    # Display embedding information
    print(f"\n=== Embedding Details ===")
    for i, (text, embedding) in enumerate(zip(texts, embeddings)):
        print(f"\nText {i + 1}:")
        print(f"  Content: {text}")
        print(f"  Embedding shape: {len(embedding)} dimensions")
        print(f"  First 10 elements: {embedding[:10]}")
        print("-" * 70)

    # Query processing
    print(f"\n=== Query Processing ===")
    print(f"Query: '{query}'")

    query_result = embedder.embed(texts=[query], batch_size=1)
    query_embedding = query_result.embeddings[0]

    print(f"Query embedding shape: {len(query_embedding)} dimensions")

    # Similarity analysis
    print(f"\n=== Similarity Analysis (Inner Product) ===")
    similarities = []

    for i, (text, embedding) in enumerate(zip(texts, embeddings)):
        inner_product = sum(a * b for a, b in zip(query_embedding, embedding))
        similarities.append((i, text, inner_product))

        print(f"\nText {i + 1}:")
        print(f"  Content: {text}")
        print(f"  Inner product with query: {inner_product:.6f}")
        print("-" * 70)

    # Sort and display most similar texts
    similarities.sort(key=lambda x: x[2], reverse=True)

    print(f"\n=== Similarity Ranking Results ===")
    for rank, (idx, text, score) in enumerate(similarities, 1):
        print(f"Rank {rank}: [{score:.6f}] {text}")

    return embeddings, query_embedding, similarities


embeddings, query_emb, similarities = embedder_example()
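
The ranking above uses raw inner products; if the model does not return unit-normalized vectors, cosine similarity is a common alternative. A minimal sketch using NumPy (assuming numpy is installed) on the embeddings, query_emb, and similarities returned by the example above:
import numpy as np


def cosine_similarity(a, b):
    """Cosine similarity between two embedding vectors."""
    a, b = np.asarray(a), np.asarray(b)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))


# similarities holds (index, text, inner_product) tuples from the example above.
ranked = sorted(
    ((cosine_similarity(query_emb, embeddings[idx]), text) for idx, text, _ in similarities),
    reverse=True,
)
for score, text in ranked:
    print(f"[{score:.6f}] {text}")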

4. ASR (Automatic Speech Recognition) Inference

Using CPU/GPU-accelerated speech recognition models for speech-to-text transcription.
import os
import logging
from nexaai import ASR, setup_logging

setup_logging(level=logging.DEBUG)


def asr_example():
    """ASR Inference example"""
    print("=== ASR Inference Example ===")

    # Model configuration
    model_name = "NexaAI/parakeet-npu"
    audio_file = r"path\to\audio.wav"  # Replace with actual audio file path

    print(f"Loading model: {model_name}")

    # Check if audio file exists
    if not os.path.exists(audio_file):
        print(f"Error: Audio file not found: {audio_file}")
        print("Please provide a valid audio file path to test ASR functionality.")
        return None

    # Create ASR instance
    asr = ASR.from_(model=model_name)
    print('ASR model loaded successfully!')

    print(f"\n=== Starting Transcription ===")

    # Perform transcription
    result = asr.transcribe(
        audio_path=audio_file,
        language="en",
        timestamps="segment",
        beam_size=5
    )

    # Display results
    print(f"\n=== Transcription Results ===")
    print(result.transcript)

    return result


result = asr_example()
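
If you do not have a recording handy, you can generate a short test WAV with the standard library to smoke-test the pipeline (real speech is of course needed for a meaningful transcript). The file name below is just an example:
import math
import struct
import wave

sample_rate = 16000
duration_s = 2
path = "test_tone.wav"  # example file name; point audio_file at this path

# 440 Hz sine tone, 16-bit mono PCM.
frames = b"".join(
    struct.pack("<h", int(8000 * math.sin(2 * math.pi * 440 * i / sample_rate)))
    for i in range(sample_rate * duration_s)
)

with wave.open(path, "wb") as wf:
    wf.setnchannels(1)
    wf.setsampwidth(2)  # 16-bit samples
    wf.setframerate(sample_rate)
    wf.writeframes(frames)

print(f"Wrote {path}")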

5. Reranker Inference

Using CPU/GPU-accelerated reranking models for document reranking.
import logging
from nexaai import Reranker, setup_logging

setup_logging(level=logging.DEBUG)


def reranker_example():
    """Reranker Inference example"""
    print("=== Reranker Inference Example ===")

    # Model configuration
    model_name = "~/.cache/nexa.ai/nexa_sdk/models/NexaAI/jina-v2-rerank-mlx/jina-reranker-v2-base-multilingual-f16.safetensors"
    batch_size = None  # Use default or len(documents)

    print(f"Loading model: {model_name}")

    # Create reranker instance
    reranker = Reranker.from_(model=model_name)
    print('Reranker loaded successfully!')

    # Example query and documents
    query = "Where is on-device AI?"
    documents = [
        "On-device AI is a type of AI that is processed on the device itself, rather than in the cloud.",
        "edge computing",
        "A ragdoll is a breed of cat that is known for its long, flowing hair and gentle personality.",
        "The capital of France is Paris.",
    ]

    print(f"Query: {query}")
    print(f"Documents: {len(documents)} documents")
    print("-" * 50)

    # Perform reranking
    result = reranker.rerank(
        query=query,
        documents=documents,
        batch_size=batch_size or len(documents)
    )
    scores = result.scores

    # Display ranking results
    for i, score in enumerate(scores):
        print(f"[{score:.4f}] : {documents[i]}")

    return reranker


reranker = reranker_example()
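
The scores above are printed in document order. Inside reranker_example, right after scores is computed, you could also print the documents ranked by relevance (a small pure-Python sketch; ranked is just an illustrative name):
# Pair each document with its score and sort by score, highest first.
ranked = sorted(zip(scores, documents), key=lambda pair: pair[0], reverse=True)
for rank, (score, doc) in enumerate(ranked, 1):
    print(f"Rank {rank}: [{score:.4f}] {doc}")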

6. Computer Vision (CV) Inference

Using CPU/GPU-accelerated computer vision tasks (e.g., OCR/text recognition) on images.
import os
import logging
from nexaai import CV, setup_logging

setup_logging(level=logging.DEBUG)


def cv_ocr_example():
    """CV OCR Inference example"""
    print("=== CV OCR Inference Example ===")

    # Model configuration
    model_name = "NexaAI/paddleocr-npu"
    image_path = r"path\to\image.png"  # Replace with actual image file path

    # Check if image file exists
    if not os.path.exists(image_path):
        print(f"Error: Image file not found: {image_path}")
        print("Please provide a valid image file path to test CV functionality.")
        return None

    print(f"Loading model: {model_name}")
    cv = CV.from_(model=model_name, capabilities=0, plugin_id=None)  # 0=OCR
    print("OCR model loaded successfully!")
    
    print(f"\n=== Processing Image ===")
    results = cv.infer(image_path)

    print(f"\n=== OCR Results ===")
    print(f"Number of results: {len(results.results)}")
    for result in results.results:
        print(f"[{result.confidence:.2f}] {result.text}")


cv_ocr_example()

Next Steps