NexaAI Windows x64 Setup Guide

This guide demonstrates how to use the NexaAI SDK for various AI inference tasks on Windows x64, including:
  • LLM (Large Language Model): Text generation and conversation
  • VLM (Vision Language Model): Multimodal understanding and generation
  • Embedder: Text vectorization and similarity computation
  • Reranker: Document reranking
  • ASR (Automatic Speech Recognition): Speech-to-text transcription
  • CV (Computer Vision): OCR/text recognition

Prerequisites

1. Install the correct Python version

NexaAI requires Python 3.10 on Windows x64. Verify the installation:
python -c "import sys, platform; print(f'Python version: {sys.version}')"
Your output should look like:
Python version: 3.10.11 (tags/v3.10.11:7d4cc5a, Apr 5 2023, 00:38:17) [MSC v.1929 64 bit (AMD64)]
Expected output must contain version 3.10.x and architecture AMD64.

Here are suggested ways to install Python 3.10:

Using Anaconda
conda create -n nexaai python=3.10
conda activate nexaai
After installation, you may need to access Python 3.10 using python3.10:
python3.10 --version

2. Create and activate a virtual environment

python -m venv nexaai-env
nexaai-env\Scripts\activate
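If you are working in PowerShell instead of the Command Prompt, the activation script is slightly different:
.\nexaai-env\Scripts\Activate.ps1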

3. Install the NexaAI SDK

pip install nexaai
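To confirm the package installed correctly, you can check it with pip (the version shown depends on the release you installed):
pip show nexaai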

4. Verify your environment

Run the following code to ensure you have the right environment:
import sys
import platform

current_ver = sys.version_info
arch = platform.machine()

if current_ver.major != 3 or current_ver.minor != 10:
    print(f"❌ Error: Python {current_ver.major}.{current_ver.minor} detected")
    print("Required: Python 3.10")
    print("Please install Python 3.10 and restart the kernel.")
    sys.exit(1)
elif arch != "AMD64":
    print(f"❌ Error: {arch} architecture detected; 64-bit (AMD64) Python is required")
    sys.exit(1)
else:
    print("✅ Python 3.10 (AMD64) ready to proceed!")

Authentication Setup

Before running any examples, you need to set up your NexaAI authentication token from https://sdk.nexa.ai/.

Set Token in Environment

Replace "YOUR_NEXA_TOKEN_HERE" with your actual NexaAI token:
set NEXA_TOKEN=YOUR_NEXA_TOKEN_HERE
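Note that set only applies to the current Command Prompt session. To persist the token across sessions, you can use setx instead (it takes effect in newly opened terminals):
setx NEXA_TOKEN "YOUR_NEXA_TOKEN_HERE"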

Verify the Token is Set

echo %NEXA_TOKEN%
Your NEXA_TOKEN must start with key/. If it doesn't, re-check your token on the dashboard.
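If you prefer to verify from Python, this minimal check reads the variable from the environment and confirms the key/ prefix (it assumes the token was set in the same session that launched the interpreter):
import os

token = os.environ.get("NEXA_TOKEN", "")
if not token:
    print("NEXA_TOKEN is not set.")
elif not token.startswith("key/"):
    print("NEXA_TOKEN is set but does not start with 'key/'; please re-check it on the dashboard.")
else:
    print("✅ NEXA_TOKEN looks good.")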

1. LLM (Large Language Model) Inference

Using CPU/GPU-accelerated large language models for text generation and conversation.
import io
import os

from nexaai.common import GenerationConfig, ModelConfig, ChatMessage
from nexaai.llm import LLM


def llm_example():
    """LLM Inference example"""
    print("=== LLM Inference Example ===")

    # Model configuration
    model_name = "Qwen/Qwen3-1.7B-GGUF/Qwen3-1.7B-Q8_0.gguf"
    plugin_id = "cpu_gpu"
    max_tokens = 100
    system_message = "You are a helpful assistant."

    print(f"Loading model: {model_name}")
    print(f"Using plugin: {plugin_id}")

    # Create model instance
    m_cfg = ModelConfig()
    llm = LLM.from_(name_or_path=model_name, m_cfg=m_cfg, plugin_id=plugin_id)

    # Create conversation history
    conversation = [ChatMessage(role="system", content=system_message)]

    # Example conversations
    test_prompts = [
        "What is artificial intelligence?",
        "Explain the benefits of on-device AI processing.",
        "How does CPU/GPU acceleration work?"
    ]

    for i, prompt in enumerate(test_prompts, 1):
        print(f"\n--- Conversation {i} ---")
        print(f"User: {prompt}")

        # Add user message
        conversation.append(ChatMessage(role="user", content=prompt))

        # Apply chat template
        formatted_prompt = llm.apply_chat_template(conversation)

        # Generate response
        print("Assistant: ", end="", flush=True)
        response_buffer = io.StringIO()

        for token in llm.generate_stream(formatted_prompt, g_cfg=GenerationConfig(max_tokens=max_tokens)):
            print(token, end="", flush=True)
            response_buffer.write(token)

        # Get profiling data
        profiling_data = llm.get_profiling_data()
        if profiling_data:
            print(f"\nProfiling data: {profiling_data}")

        # Add assistant response to conversation history
        conversation.append(ChatMessage(role="assistant", content=response_buffer.getvalue()))
        print("\n" + "=" * 50)


llm_example()
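For longer sessions, the conversation list above grows with every turn and can eventually exceed the model's context window. A minimal mitigation sketch, keeping the system message plus only the most recent turns, is shown here (the window of 6 messages is an arbitrary illustration, not an SDK requirement):
def trim_history(conversation, max_messages=6):
    """Keep the system message plus the most recent messages."""
    system, rest = conversation[0], conversation[1:]
    return [system] + rest[-max_messages:]

# Call before applying the chat template, e.g.:
# conversation = trim_history(conversation)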

2. VLM (Vision Language Model) Inference

Using CPU/GPU-accelerated vision language models for multimodal understanding and generation.
import os
import io

from nexaai.vlm import VLM
from nexaai.common import GenerationConfig, ModelConfig, MultiModalMessage, MultiModalMessageContent


def vlm_example():
    """VLM Inference example"""
    print("=== VLM Inference Example ===")

    # Model configuration
    model_name = "ggml-org/gemma-3-4b-it-GGUF/gemma-3-4b-it-Q4_K_M.gguf"
    plugin_id = "cpu_gpu"
    max_tokens = 100
    system_message = "You are a helpful assistant that can understand images and text."
    image_path = r'path\to\image.jpg'  # Replace with actual image path if available

    print(f"Loading model: {model_name}")
    print(f"Using plugin: {plugin_id}")

    # Check for image existence
    if not (image_path and os.path.exists(image_path)):
        print(f"WARNING: The specified image_path ('{image_path}') does not exist or was not provided. Multimodal prompts will not include image input.")

    # Create model instance
    m_cfg = ModelConfig()
    vlm = VLM.from_(name_or_path=model_name, m_cfg=m_cfg, plugin_id=plugin_id)

    # Create conversation history
    conversation = [MultiModalMessage(role="system",
                                      content=[MultiModalMessageContent(type="text", text=system_message)])]

    # Example multimodal conversations
    test_cases = [
        {
            "text": "What do you see in this image?",
            "image_path": image_path
        }
    ]

    for i, case in enumerate(test_cases, 1):
        print(f"\n--- Multimodal Conversation {i} ---")
        print(f"User: {case['text']}")

        # Build message content
        contents = [MultiModalMessageContent(type="text", text=case['text'])]

        # Add image content if available
        if case['image_path'] and os.path.exists(case['image_path']):
            contents.append(MultiModalMessageContent(type="image", path=case['image_path']))
            print(f"Including image: {case['image_path']}")

        # Add user message
        conversation.append(MultiModalMessage(role="user", content=contents))

        # Apply chat template
        formatted_prompt = vlm.apply_chat_template(conversation)

        # Generate response
        print("Assistant: ", end="", flush=True)
        response_buffer = io.StringIO()

        # Prepare image and audio paths
        image_paths = [case['image_path']] if case['image_path'] and os.path.exists(case['image_path']) else None
        audio_paths = None

        for token in vlm.generate_stream(formatted_prompt,
                                         g_cfg=GenerationConfig(max_tokens=max_tokens,
                                                                image_paths=image_paths,
                                                                audio_paths=audio_paths)):
            print(token, end="", flush=True)
            response_buffer.write(token)

        # Get profiling data
        profiling_data = vlm.get_profiling_data()
        if profiling_data:
            print(f"\nProfiling data: {profiling_data}")

        # Add assistant response to conversation history
        conversation.append(MultiModalMessage(role="assistant",
                                              content=[MultiModalMessageContent(type="text", text=response_buffer.getvalue())]))
        print("\n" + "=" * 50)


vlm_example()

3. Embedder Inference

Using CPU/GPU-accelerated embedding models for text vectorization and similarity computation.
import numpy as np
from nexaai.embedder import Embedder, EmbeddingConfig


def embedder_example():
    """Embedder Inference example"""
    print("=== Embedder Inference Example ===")

    # Model configuration
    model_name = "NexaAI/jina-v2-fp16-mlx"
    plugin_id = "cpu_gpu"
    batch_size = 2

    print(f"Loading model: {model_name}")
    print(f"Using plugin: {plugin_id}")
    print(f"Batch size: {batch_size}")

    # Create embedder instance
    embedder = Embedder.from_(name_or_path=model_name, plugin_id=plugin_id)
    print('Embedder loaded successfully!')

    # Get embedding dimension
    dim = embedder.get_embedding_dim()
    print(f"Embedding dimension: {dim}")

    # Example texts
    texts = [
        "On-device AI is a type of AI that is processed on the device itself, rather than in the cloud.",
        "Nexa AI allows you to run state-of-the-art AI models locally on CPU, GPU, or NPU.",
        "A ragdoll is a breed of cat that is known for its long, flowing hair and gentle personality.",
        "The capital of France is Paris.",
        "CPU/GPU acceleration provides significant performance improvements for AI workloads."
    ]

    query = "what is on device AI"

    print(f"\n=== Generating Embeddings ===")
    print(f"Processing {len(texts)} texts...")

    # Generate embeddings
    embeddings = embedder.generate(
        texts=texts,
        config=EmbeddingConfig(batch_size=batch_size)
    )

    print(f"Successfully generated {len(embeddings)} embeddings")

    # Display embedding information
    print(f"\n=== Embedding Details ===")
    for i, (text, embedding) in enumerate(zip(texts, embeddings)):
        print(f"\nText {i + 1}:")
        print(f"  Content: {text}")
        print(f"  Embedding dimension: {len(embedding)}")
        print(f"  First 10 elements: {embedding[:10]}")
        print("-" * 70)

    # Query processing
    print(f"\n=== Query Processing ===")
    print(f"Query: '{query}'")

    query_embedding = embedder.generate(
        texts=[query],
        config=EmbeddingConfig(batch_size=1)
    )[0]

    print(f"Query embedding dimension: {len(query_embedding)}")

    # Similarity analysis
    print(f"\n=== Similarity Analysis (Inner Product) ===")
    similarities = []

    for i, (text, embedding) in enumerate(zip(texts, embeddings)):
        query_vec = np.array(query_embedding)
        text_vec = np.array(embedding)
        inner_product = np.dot(query_vec, text_vec)
        similarities.append((i, text, inner_product))

        print(f"\nText {i + 1}:")
        print(f"  Content: {text}")
        print(f"  Inner product with query: {inner_product:.6f}")
        print("-" * 70)

    # Sort and display most similar texts
    similarities.sort(key=lambda x: x[2], reverse=True)

    print(f"\n=== Similarity Ranking Results ===")
    for rank, (idx, text, score) in enumerate(similarities, 1):
        print(f"Rank {rank}: [{score:.6f}] {text}")

    return embeddings, query_embedding, similarities


embeddings, query_emb, similarities = embedder_example()
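The example above ranks texts by raw inner product, which equals cosine similarity only if the embeddings are unit-normalized. If you are unsure whether the model normalizes its outputs, a plain NumPy cosine-similarity sketch is a safe alternative:
def cosine_similarity(a, b):
    a, b = np.asarray(a), np.asarray(b)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# Example: score each text against the query
# scores = [cosine_similarity(query_emb, e) for e in embeddings]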

4. ASR (Automatic Speech Recognition) Inference

Using CPU/GPU-accelerated speech recognition models for speech-to-text transcription.
import os
import time

from nexaai.asr import ASR, ASRConfig


def asr_example():
    """ASR Inference example"""
    print("=== ASR Inference Example ===")

    # Model configuration
    model_name = "NexaAI/parakeet-tdt-0.6b-v2"
    plugin_id = "cpu_gpu"
    audio_file = r"path\to\audio.wav"  # Replace with actual audio file path

    print(f"Loading model: {model_name}")
    print(f"Using plugin: {plugin_id}")

    # Check if audio file exists
    if not os.path.exists(audio_file):
        print(f"Error: Audio file not found: {audio_file}")
        print("Please provide a valid audio file path to test ASR functionality.")
        return None

    # Create ASR instance
    asr = ASR.from_(name_or_path=model_name, plugin_id=plugin_id)
    print('ASR model loaded successfully!')

    # Basic ASR configuration
    config = ASRConfig(
        timestamps="segment",  # Get segment-level timestamps
        beam_size=5,
        stream=False
    )

    print(f"\n=== Starting Transcription ===")
    start_time = time.time()

    # Perform transcription
    result = asr.transcribe(audio_path=audio_file, language="en", config=config)

    end_time = time.time()
    transcription_time = end_time - start_time

    # Display results
    print(f"\n=== Transcription Results ===")
    print(f"Transcription: {result.transcript}")
    print(f"Processing time: {transcription_time:.2f} seconds")

    # Display segment information if available
    if hasattr(result, 'segments') and result.segments:
        print(f"\nSegments ({len(result.segments)}):")
        for i, segment in enumerate(result.segments[:3]):  # Show first 3 segments
            seg_start = segment.get('start')
            seg_end = segment.get('end')
            text = segment.get('text', '').strip()
            if seg_start is not None and seg_end is not None:
                print(f"  {i + 1}. [{seg_start:.2f}s - {seg_end:.2f}s] {text}")
            else:
                print(f"  {i + 1}. {text}")
        if len(result.segments) > 3:
            print(f"  ... and {len(result.segments) - 3} more segments")

    # Get profiling data
    profiling_data = asr.get_profiling_data()
    if profiling_data:
        print(f"\nProfiling data: {profiling_data}")

    return result


result = asr_example()
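If you need the transcript with timing in a standard subtitle format, the sketch below converts the segments from the example into SRT. It assumes each segment is a dict with start, end, and text keys, as in the snippet above; adjust it if your SDK version returns a different structure:
def to_srt(segments):
    """Convert [{'start': ..., 'end': ..., 'text': ...}, ...] into SRT text."""
    def fmt(seconds):
        ms = int(round(seconds * 1000))
        h, ms = divmod(ms, 3_600_000)
        m, ms = divmod(ms, 60_000)
        s, ms = divmod(ms, 1_000)
        return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

    blocks = []
    for i, seg in enumerate(segments, 1):
        blocks.append(f"{i}\n{fmt(seg['start'])} --> {fmt(seg['end'])}\n{seg['text'].strip()}\n")
    return "\n".join(blocks)

# Example:
# if result is not None and getattr(result, 'segments', None):
#     with open("transcript.srt", "w", encoding="utf-8") as f:
#         f.write(to_srt(result.segments))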

5. Reranker Inference

Using CPU/GPU-accelerated reranking models for document reranking.
from nexaai.rerank import Reranker, RerankConfig


def reranker_example():
    """Reranker Inference example"""
    print("=== Reranker Inference Example ===")

    # Model configuration
    model_name = "NexaAI/jina-v2-rerank-mlx"
    plugin_id = "cpu_gpu"
    batch_size = 4

    print(f"Loading model: {model_name}")
    print(f"Using plugin: {plugin_id}")
    print(f"Batch size: {batch_size}")

    # Create reranker instance
    reranker = Reranker.from_(name_or_path=model_name, plugin_id=plugin_id)
    print('Reranker loaded successfully!')

    # Example queries and documents
    queries = [
        "Where is on-device AI?",
        "What is CPU/GPU acceleration?",
        "How does machine learning work?",
        "Tell me about computer vision"
    ]

    documents = [
        "On-device AI is a type of AI that is processed on the device itself, rather than in the cloud.",
        "CPU/GPU acceleration provides significant performance improvements for AI workloads on traditional hardware.",
        "Edge computing brings computation and data storage closer to the sources of data.",
        "A ragdoll is a breed of cat that is known for its long, flowing hair and gentle personality.",
        "The capital of France is Paris, a beautiful city known for its art and culture.",
        "Machine learning is a subset of artificial intelligence that enables computers to learn without being explicitly programmed.",
        "Computer vision is a field of artificial intelligence that trains computers to interpret and understand visual information.",
        "Deep learning uses neural networks with multiple layers to model and understand complex patterns in data."
    ]

    print(f"\n=== Document Reranking Test ===")
    print(f"Number of documents: {len(documents)}")

    # Rerank for each query
    for i, query in enumerate(queries, 1):
        print(f"\n--- Query {i} ---")
        print(f"Query: '{query}'")
        print("-" * 50)

        # Perform reranking
        scores = reranker.rerank(
            query=query,
            documents=documents,
            config=RerankConfig(batch_size=batch_size)
        )

        # Create (document, score) pairs and sort
        doc_scores = list(zip(documents, scores))
        doc_scores.sort(key=lambda x: x[1], reverse=True)

        # Display ranking results
        print("Reranking results:")
        for rank, (doc, score) in enumerate(doc_scores, 1):
            print(f"  {rank:2d}. [{score:.4f}] {doc}")

        # Display most relevant documents
        print(f"\nMost relevant documents (top 3):")
        for rank, (doc, score) in enumerate(doc_scores[:3], 1):
            print(f"  {rank}. {doc}")

        print("=" * 80)

    return reranker


reranker = reranker_example()

6. Computer Vision (CV) Inference

Using CPU/GPU-accelerated computer vision models for tasks such as OCR/text recognition on images.
import os
from nexaai.cv import CVCapabilities, CVModel, CVModelConfig


def cv_ocr_example():
    """CV OCR Inference example"""
    print("=== CV OCR Inference Example ===")

    # Model configuration
    model_name = "NexaAI/paddleocr-mlx"
    image_path = r"path\to\image.jpg"  # Replace with actual image file path

    # Check if image file exists
    if not os.path.exists(image_path):
        print(f"Error: Image file not found: {image_path}")
        print("Please provide a valid image file path to test CV functionality.")
        return None

    print(f"Loading model: {model_name}")
    config = CVModelConfig(capabilities=CVCapabilities.OCR)
    cv = CVModel.from_(name_or_path=model_name, config=config, plugin_id='cpu_gpu')
    print("OCR model loaded successfully!")
    
    print(f"\n=== Processing Image ===")
    results = cv.infer(image_path)

    print(f"\n=== OCR Results ===")
    print(f"Number of text regions detected: {results.result_count}")
    for i, result in enumerate(results.results, 1):
        print(f"{i}. [{result.confidence:.2f}] {result.text}")


cv_ocr_example()

Next Steps