Model Name Mapping

All NPU models use an internal naming mapping; fill in the model name and plugin ID accordingly. GGUF format models run on CPU/GPU and do not need a model name.
Model Name     Plugin ID    Hugging Face repository name
omni-neural    npu          NexaAI/OmniNeural-4B-mobile
embed-gemma    npu          NexaAI/embeddinggemma-300m-npu-mobile
parakeet       npu          NexaAI/parakeet-tdt-0.6b-v3-npu-mobile
liquid-v2      npu          NexaAI/LFM2-1.2B-npu-mobile
parakeet       npu          NexaAI/parakeet-npu-mobile
jina-rerank    npu          NexaAI/jina-v2-rerank-npu-mobile
paddleocr      npu          NexaAI/paddleocr-npu-mobile
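
If you keep the repository-to-name mapping in code, the correct internal name can be looked up from the Hugging Face repository you downloaded. The map below is a hypothetical helper, not part of the SDK; the entries come directly from the table above.

// Hypothetical helper (not part of the SDK): look up the internal NPU model name
// from the Hugging Face repository a model was downloaded from.
val npuModelNames = mapOf(
    "NexaAI/OmniNeural-4B-mobile" to "omni-neural",
    "NexaAI/embeddinggemma-300m-npu-mobile" to "embed-gemma",
    "NexaAI/parakeet-tdt-0.6b-v3-npu-mobile" to "parakeet",
    "NexaAI/LFM2-1.2B-npu-mobile" to "liquid-v2",
    "NexaAI/parakeet-npu-mobile" to "parakeet",
    "NexaAI/jina-v2-rerank-npu-mobile" to "jina-rerank",
    "NexaAI/paddleocr-npu-mobile" to "paddleocr"
)

val modelName = npuModelNames["NexaAI/LFM2-1.2B-npu-mobile"]  // "liquid-v2"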

LLM Usage

Large Language Models for text generation and chat applications.

Streaming Conversation - NPU

We support NPU inference for NEXA format models.
LlmWrapper.builder()
    .llmCreateInput(
        LlmCreateInput(
            model_name = "liquid-v2",
            model_path = <your-model-folder-path>,
            config = ModelConfig(
                max_tokens = 2048
            ),
            plugin_id = "npu"
        )
    )
    .build()
    .onSuccess { llmWrapper = it }
    .onFailure { error ->
        println("Error: ${error.message}")
    }

val chatList = arrayListOf(ChatMessage("user", "What is AI?"))

llmWrapper.applyChatTemplate(chatList.toTypedArray(), null, false).onSuccess { template ->
    llmWrapper.generateStreamFlow(template.formattedText, GenerationConfig()).collect { result ->
        when (result) {
            is LlmStreamResult.Token -> println(result.text)
            is LlmStreamResult.Completed -> println("Done!")
            is LlmStreamResult.Error -> println("Error: ${result.throwable}")
        }
    }
}
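
If you need the complete reply as a single string, for example to display it after streaming finishes or to store it in the chat history, the streamed tokens can be accumulated while the flow is collected. A minimal sketch built only on the API shown above.

// Accumulate streamed tokens into the full assistant reply.
val reply = StringBuilder()

llmWrapper.applyChatTemplate(chatList.toTypedArray(), null, false).onSuccess { template ->
    llmWrapper.generateStreamFlow(template.formattedText, GenerationConfig()).collect { result ->
        when (result) {
            is LlmStreamResult.Token -> reply.append(result.text)      // collect each token
            is LlmStreamResult.Completed -> println("Full reply: $reply")
            is LlmStreamResult.Error -> println("Error: ${result.throwable}")
        }
    }
}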

Streaming Conversation - CPU

We support CPU inference for GGUF format models.
LlmWrapper.builder()
    .llmCreateInput(
        LlmCreateInput(
            model_path = <your-model-folder-path>,
            config = ModelConfig(nCtx = 4096, max_tokens = 2048),
            plugin_id = "cpu_gpu"
        )
    )
    .build()
    .onSuccess { llmWrapper = it }
    .onFailure { error -> 
        println("Error: ${error.message}")
    }

val chatList = arrayListOf(ChatMessage("user", "What is AI?"))

llmWrapper.applyChatTemplate(chatList.toTypedArray(), null, false).onSuccess { template ->
    llmWrapper.generateStreamFlow(template.formattedText, GenerationConfig()).collect { result ->
        when (result) {
            is LlmStreamResult.Token -> println(result.text)
            is LlmStreamResult.Completed -> println("Done!")
            is LlmStreamResult.Error -> println("Error: ${result.throwable}")
        }
    }
}
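
To continue the conversation, append the assistant's reply and the next user message to the chat history, then apply the chat template again. A minimal sketch; previousReply is a hypothetical variable holding the text accumulated from the previous stream, and the "assistant" role string follows the convention documented for VlmChatMessage below.

// Multi-turn follow-up: extend the chat history and regenerate.
chatList.add(ChatMessage("assistant", previousReply))   // previousReply: hypothetical, text of the last reply
chatList.add(ChatMessage("user", "Can you give an example?"))

llmWrapper.applyChatTemplate(chatList.toTypedArray(), null, false).onSuccess { template ->
    llmWrapper.generateStreamFlow(template.formattedText, GenerationConfig()).collect { result ->
        if (result is LlmStreamResult.Token) print(result.text)
    }
}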

Multimodal Usage

Vision-Language Models for image understanding and multimodal applications.

Streaming Conversation - NPU

We support NPU inference for NEXA format models.
VlmWrapper.builder()
    .vlmCreateInput(
        VlmCreateInput(
            model_name = "omni-neural",  // Model name for NPU plugin
            model_path = <your-model-folder-path>,
            config = ModelConfig(
                max_tokens = 2048,
                enable_thinking = false
            ),
            plugin_id = "npu"  // Use NPU backend
        )
    )
    .build()
    .onSuccess { vlmWrapper = it }
    .onFailure { error -> 
        println("Error: ${error.message}")
    }

// Use the loaded VLM with image and text
val contents = listOf(
    VlmContent("image", <your-image-path>),
    VlmContent("text", <your-text>)
)

val chatList = arrayListOf(VlmChatMessage("user", contents))

vlmWrapper.applyChatTemplate(chatList.toTypedArray(), null, false).onSuccess { template ->
    val config = vlmWrapper.injectMediaPathsToConfig(chatList.toTypedArray(), GenerationConfig())
    vlmWrapper.generateStreamFlow(template.formattedText, config).collect { result ->
        when (result) {
            is LlmStreamResult.Token -> println(result.text)
            is LlmStreamResult.Completed -> println("Done!")
            is LlmStreamResult.Error -> println("Error: ${result.throwable}")
        }
    }
}

Streaming Conversation - CPU

We support CPU inference for GGUF format models.
VlmWrapper.builder()
    .vlmCreateInput(
        VlmCreateInput(
            model_path = <your-model-folder-path>,
            mmproj_path = <your-mmproj-path>,  // vision projection weights
            config = ModelConfig(
                max_tokens = 2048,
                enable_thinking = false
            ),
            plugin_id = "cpu_gpu"
        )
    )
    .build()
    .onSuccess { vlmWrapper = it }
    .onFailure { error -> 
        println("Error: ${error.message}")
    }

// Use the loaded VLM with image and text
val contents = listOf(
    VlmContent("image", <your-image-path>),
    VlmContent("text", <your-text>)
)

val chatList = arrayListOf(VlmChatMessage("user", contents))

vlmWrapper.applyChatTemplate(chatList.toTypedArray(), null, false).onSuccess { template ->
    val config = vlmWrapper.injectMediaPathsToConfig(chatList.toTypedArray(), GenerationConfig())
    vlmWrapper.generateStreamFlow(template.formattedText, config).collect { result ->
        when (result) {
            is LlmStreamResult.Token -> println(result.text)
            is LlmStreamResult.Completed -> println("Done!")
            is LlmStreamResult.Error -> println("Error: ${result.throwable}")
        }
    }
}

API Reference

VlmCreateInput

data class VlmCreateInput(
    val model_name: String? = null,            // Model name (required for NPU)
    val model_path: String,                    // Path to VLM model
    val tokenizer_path: String? = null,        // Optional tokenizer path
    val mmproj_path: String? = null,           // Vision projection weights (for GGUF models)
    val config: ModelConfig,                   // Model configuration
    val plugin_id: String? = null              // "npu" for NPU, "cpu_gpu" for CPU/GPU, null defaults to CPU
)

VlmChatMessage

data class VlmChatMessage(
    val role: String,                          // "system", "user", "assistant"
    val contents: List<VlmContent>             // Mixed text/image/audio content
)

VlmContent

data class VlmContent(
    val type: String,                          // "text", "image", "audio"
    val content: String                        // Text or file path
)
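
Putting these data classes together, a request that combines a system prompt with a user turn containing an image and a question can be built as shown below. A minimal sketch; the image path is a placeholder.

// Build a mixed text/image conversation from the data classes above.
val messages = arrayOf(
    VlmChatMessage("system", listOf(VlmContent("text", "You are a concise visual assistant."))),
    VlmChatMessage(
        "user",
        listOf(
            VlmContent("image", <your-image-path>),
            VlmContent("text", "What objects are in this picture?")
        )
    )
)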

Embeddings Usage

Generate vector embeddings for semantic search and RAG applications.

Basic Usage

// Load embedder for NPU inference
EmbedderWrapper.builder()
    .embedderCreateInput(
        EmbedderCreateInput(
            model_name = "embed-gemma",  // Model name for NPU plugin
            model_path = <your-model-folder-path>,
            tokenizer_path = <your-tokenizer-path>,  // Optional
            config = ModelConfig(
                max_tokens = 2048
            ),
            plugin_id = "npu",  // Use NPU backend
            device_id = null    // Optional device ID
        )
    )
    .build()
    .onSuccess { embedderWrapper = it }
    .onFailure { error -> 
        println("Error: ${error.message}")
    }

// Generate embeddings for multiple texts
val texts = arrayOf(<your-text1>, <your-text2>, ...)

embedderWrapper.embed(texts, EmbeddingConfig()).onSuccess { embeddings ->
    val dimension = embeddings.size / texts.size
    println("Dimension: $dimension")
    println("First 5 values: ${embeddings.take(5)}")
}
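
For semantic search, a common next step is to compare embeddings with cosine similarity. The sketch below is a hypothetical helper, assuming the embeddings value returned above is a flat FloatArray holding one vector of length dimension per input text (consistent with the embeddings.size / texts.size computation).

// Hypothetical helper: cosine similarity between the i-th and j-th vectors
// stored back-to-back in the flat embedding array returned by embed().
fun cosineSimilarity(embeddings: FloatArray, dimension: Int, i: Int, j: Int): Float {
    var dot = 0f; var normA = 0f; var normB = 0f
    for (k in 0 until dimension) {
        val a = embeddings[i * dimension + k]
        val b = embeddings[j * dimension + k]
        dot += a * b; normA += a * a; normB += b * b
    }
    return dot / (kotlin.math.sqrt(normA) * kotlin.math.sqrt(normB))
}

// Inside the onSuccess block above:
// println("similarity(text1, text2) = ${cosineSimilarity(embeddings, dimension, 0, 1)}")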

API Reference

EmbedderCreateInput

data class EmbedderCreateInput(
    val model_name: String? = null,            // Model name for NPU
    val model_path: String,                    // Path to embedder model
    val tokenizer_path: String? = null,        // Path to tokenizer
    val config: ModelConfig,                   // Model configuration
    val plugin_id: String? = null,             // "npu" for NPU, null for CPU
    val device_id: String? = null              // Device ID (optional)
)

EmbeddingConfig

data class EmbeddingConfig(
    val normalize: Boolean = true              // Normalize embeddings (default: true)
)

ASR Usage

Automatic Speech Recognition for audio transcription.

Basic Usage

// Load ASR model for NPU inference
AsrWrapper.builder()
    .asrCreateInput(
        AsrCreateInput(
            model_name = "parakeet",  // Model name for NPU plugin
            model_path = <your-model-folder-path>,
            config = ModelConfig(
                max_tokens = 2048
            ),
            plugin_id = "npu"  // Use NPU backend
        )
    )
    .build()
    .onSuccess { asrWrapper = it }
    .onFailure { error -> 
        println("Error: ${error.message}")
    }

// Transcribe audio file
asrWrapper.transcribe(
    AsrTranscribeInput(
        audioPath = <your-audio-path>,  // Path to .wav, .mp3, etc.
        language = "en",                // Language code: "en", "zh", "es", etc.
        timestamps = null               // Optional timestamp format
    )
).onSuccess { result ->
    println("Transcription: ${result.result.transcript}")
}
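
The loaded ASR model can be reused across files. A minimal sketch transcribing several recordings in sequence with the asrWrapper created above; the file paths are placeholders.

// Transcribe several audio files with the same loaded model.
val audioFiles = listOf(<your-audio-path-1>, <your-audio-path-2>)

audioFiles.forEach { path ->
    asrWrapper.transcribe(
        AsrTranscribeInput(audioPath = path, language = "en", timestamps = null)
    ).onSuccess { result ->
        println("$path -> ${result.result.transcript}")
    }.onFailure { error ->
        println("Failed to transcribe $path: ${error.message}")
    }
}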

API Reference

AsrCreateInput

data class AsrCreateInput(
    val model_name: String? = null,            // Model name for NPU
    val model_path: String,                    // Path to ASR model
    val config: ModelConfig,                   // Model configuration
    val plugin_id: String? = null              // "npu" for NPU, null for CPU
)

AsrTranscribeInput

data class AsrTranscribeInput(
    val audioPath: String,                     // Path to audio file (.wav, .mp3, etc.)
    val language: String,                      // Language code: "en", "zh", "es", etc.
    val timestamps: String? = null             // Optional: timestamp format
)

AsrTranscriptionResult

data class AsrTranscriptionResult(
    val result: AsrResult,                     // Transcription result
    val profileData: String                    // Performance metrics
)

Rerank Usage

Improve search relevance by reranking documents based on query relevance.

Basic Usage

// Load reranker model for NPU inference
RerankerWrapper.builder()
    .rerankerCreateInput(
        RerankerCreateInput(
            model_name = "jina-rerank",  // Model name for NPU plugin
            model_path = <your-model-folder-path>,
            tokenizer_path = <your-tokenizer-path>,  // Optional
            config = ModelConfig(
                max_tokens = 2048
            ),
            plugin_id = "npu",  // Use NPU backend
            device_id = null    // Optional device ID
        )
    )
    .build()
    .onSuccess { rerankerWrapper = it }
    .onFailure { error -> 
        println("Error: ${error.message}")
    }

// Rerank documents based on query relevance
val query = "What is machine learning?"
val docs = arrayOf("ML is AI subset", "Weather forecast", "Deep learning tutorial")

rerankerWrapper.rerank(query, docs, RerankConfig()).onSuccess { result ->
    result.scores?.withIndex()?.sortedByDescending { it.value }?.forEach { (idx, score) ->
        println("Score: ${"%.4f".format(score)} - ${docs[idx]}")
    }
}
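
A common follow-up is to select just the best-matching document, for example to feed it into a RAG prompt. A minimal sketch built on the query, documents, and result shape shown above.

// Pick the single highest-scoring document for the query.
rerankerWrapper.rerank(query, docs, RerankConfig()).onSuccess { result ->
    val best = result.scores?.withIndex()?.maxByOrNull { it.value }
    best?.let { println("Best match: ${docs[it.index]} (score ${"%.4f".format(it.value)})") }
}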

API Reference

RerankerCreateInput

data class RerankerCreateInput(
    val model_name: String? = null,            // Model name for NPU
    val model_path: String,                    // Path to reranker model
    val tokenizer_path: String? = null,        // Path to tokenizer
    val config: ModelConfig,                   // Model configuration
    val plugin_id: String? = null,             // "npu" for NPU, null for CPU
    val device_id: String? = null              // Device ID (optional)
)

RerankConfig

data class RerankConfig(
    val topN: Int? = null                      // Return only top N results (null = all)
)

RerankerResult

data class RerankerResult(
    val scores: FloatArray,                    // Relevance scores (0.0 - 1.0)
    val scoreCount: Int,                       // Number of scores
    val profileData: String                    // Performance metrics
)

Methods

suspend fun rerank(query: String, documents: Array<String>, config: RerankConfig): Result<RerankerResult>
fun destroy(): Int
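
When the reranker is no longer needed, release its native resources with destroy(). A minimal sketch; the meaning of the returned status code is not documented here, so it is only printed.

// Release native resources once reranking is finished.
val status = rerankerWrapper.destroy()
println("destroy() returned $status")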

CV Usage

Computer Vision models for OCR, object detection, and image classification.

Basic Usage

// Load PaddleOCR model for NPU inference
CvWrapper.builder()
    .createInput(
        CVCreateInput(
            model_name = "paddleocr",  // Model name
            config = CVModelConfig(
                capabilities = CVCapability.OCR,
                det_model_path = <your-det-model-folder-path>,
                rec_model_path = <your-rec-model-path>,
                char_dict_path = <your-char-dict-path>,
                qnn_model_folder_path = <your-qnn-model-folder-path>,  // For NPU
                qnn_lib_folder_path = <your-qnn-lib-folder-path>       // For NPU
            ),
            plugin_id = "npu"  // Use NPU backend
        )
    )
    .build()
    .onSuccess { cvWrapper = it }
    .onFailure { error -> 
        println("Error: ${error.message}")
    }

// Perform OCR on image
cvWrapper.infer(<your-image-path>).onSuccess { results ->
    results.forEach { result ->
        println("Text: ${result.text}, Confidence: ${result.confidence}")
    }
}
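
OCR output often contains low-confidence fragments; filtering on CVResult.confidence (see the API reference below) keeps only the reliable lines. A minimal sketch using a hypothetical 0.5 threshold.

// Keep only confident OCR lines and join them into a single block of text.
cvWrapper.infer(<your-image-path>).onSuccess { results ->
    val text = results
        .filter { it.confidence >= 0.5f }   // hypothetical confidence threshold
        .joinToString("\n") { it.text }
    println(text)
}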

API Reference

CVCreateInput

data class CVCreateInput(
    val model_name: String,                    // Model name: "paddleocr", etc.
    val config: CVModelConfig,                 // CV model configuration
    val plugin_id: String? = null              // "npu" for NPU, "cpu_gpu" for CPU/GPU, null defaults to CPU
)

CVModelConfig

data class CVModelConfig(
    val capabilities: CVCapability,            // OCR, DETECTION, CLASSIFICATION
    val det_model_path: String? = null,        // Detection model path
    val rec_model_path: String? = null,        // Recognition model path
    val char_dict_path: String? = null,        // Character dictionary path
    val qnn_model_folder_path: String? = null, // QNN model folder (NPU)
    val qnn_lib_folder_path: String? = null    // QNN library path (NPU)
)

CVCapability

enum class CVCapability {
    OCR,                                       // Optical Character Recognition
    DETECTION,                                 // Object detection
    CLASSIFICATION                             // Image classification
}

CVResult

data class CVResult(
    val text: String,                          // Recognized text (OCR)
    val confidence: Float,                     // Confidence score (0.0 - 1.0)
    val boundingBox: BoundingBox?,             // Bounding box coordinates
    val label: String?,                        // Class label (classification)
    val score: Float                           // Detection/classification score
)

How to use CPU, GPU, NPU

Switch between different hardware acceleration modes.

CPU/GPU Mode

// CPU/GPU inference - uses GGUF format models
LlmWrapper.builder()
    .llmCreateInput(
        LlmCreateInput(
            model_path = <your-model-folder-path>,
            config = ModelConfig(max_tokens = 2048),
            plugin_id = "cpu_gpu"  // Use CPU/GPU backend
        )
    )
    .build()
    .onSuccess { llmWrapper = it }
    .onFailure { error -> 
        println("Error: ${error.message}")
    }

NPU Mode (Qualcomm)

// NPU inference - Qualcomm Hexagon NPU, uses NEXA format models
LlmWrapper.builder()
    .llmCreateInput(
        LlmCreateInput(
            model_name = "liquid-v2",  // Model name required for NPU
            model_path = <your-model-folder-path>,
            config = ModelConfig(
                max_tokens = 2048,
                npu_lib_folder_path = applicationInfo.nativeLibraryDir,
                npu_model_folder_path = <your-npu-model-folder-path>
            ),
            plugin_id = "npu"  // Use NPU backend
        )
    )
    .build()
    .onSuccess { llmWrapper = it }
    .onFailure { error -> 
        println("Error: ${error.message}")
    }
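
Because the only differences between the two modes are the plugin_id, the model_name, and the NPU paths in ModelConfig, the backend can be chosen at runtime from a single flag. A minimal sketch, assuming a hypothetical useNpu flag; the rest of the builder call is identical to the examples above.

// Hypothetical flag: pick the backend at runtime and build the matching create input.
val useNpu = true

val createInput = if (useNpu) {
    LlmCreateInput(
        model_name = "liquid-v2",                      // internal name required for NPU
        model_path = <your-model-folder-path>,
        config = ModelConfig(
            max_tokens = 2048,
            npu_lib_folder_path = applicationInfo.nativeLibraryDir,
            npu_model_folder_path = <your-npu-model-folder-path>
        ),
        plugin_id = "npu"
    )
} else {
    LlmCreateInput(
        model_path = <your-model-folder-path>,         // GGUF model for CPU/GPU
        config = ModelConfig(max_tokens = 2048),
        plugin_id = "cpu_gpu"
    )
}

LlmWrapper.builder()
    .llmCreateInput(createInput)
    .build()
    .onSuccess { llmWrapper = it }
    .onFailure { error -> println("Error: ${error.message}") }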

Need Help?

Join our community to get support, share your projects, and connect with other developers.